diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,62940 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999499081649691, + "eval_steps": 500, + "global_step": 8982, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003339455668725998, + "grad_norm": 18.08110072524588, + "learning_rate": 1.1123470522803115e-08, + "loss": 1.1965, + "step": 1 + }, + { + "epoch": 0.0006678911337451996, + "grad_norm": 19.552894943582405, + "learning_rate": 2.224694104560623e-08, + "loss": 1.2467, + "step": 2 + }, + { + "epoch": 0.0010018367006177993, + "grad_norm": 18.45573642932544, + "learning_rate": 3.337041156840935e-08, + "loss": 1.1939, + "step": 3 + }, + { + "epoch": 0.0013357822674903992, + "grad_norm": 19.51921878235958, + "learning_rate": 4.449388209121246e-08, + "loss": 1.2352, + "step": 4 + }, + { + "epoch": 0.0016697278343629988, + "grad_norm": 19.438335705136836, + "learning_rate": 5.561735261401558e-08, + "loss": 1.2511, + "step": 5 + }, + { + "epoch": 0.0020036734012355987, + "grad_norm": 19.874015492468935, + "learning_rate": 6.67408231368187e-08, + "loss": 1.2588, + "step": 6 + }, + { + "epoch": 0.0023376189681081983, + "grad_norm": 18.81666516443105, + "learning_rate": 7.78642936596218e-08, + "loss": 1.1735, + "step": 7 + }, + { + "epoch": 0.0026715645349807983, + "grad_norm": 18.796402872724705, + "learning_rate": 8.898776418242492e-08, + "loss": 1.1907, + "step": 8 + }, + { + "epoch": 0.003005510101853398, + "grad_norm": 18.406989033693506, + "learning_rate": 1.0011123470522804e-07, + "loss": 1.1884, + "step": 9 + }, + { + "epoch": 0.0033394556687259976, + "grad_norm": 17.666473890858295, + "learning_rate": 1.1123470522803116e-07, + "loss": 1.156, + "step": 10 + }, + { + "epoch": 0.0036734012355985972, + "grad_norm": 18.312626054013474, + "learning_rate": 1.2235817575083427e-07, + "loss": 1.1896, + "step": 11 + }, + { + "epoch": 0.004007346802471197, + "grad_norm": 19.21746711412166, + "learning_rate": 1.334816462736374e-07, + "loss": 1.2486, + "step": 12 + }, + { + "epoch": 0.004341292369343797, + "grad_norm": 18.645287861138588, + "learning_rate": 1.446051167964405e-07, + "loss": 1.2237, + "step": 13 + }, + { + "epoch": 0.0046752379362163966, + "grad_norm": 18.389360599838056, + "learning_rate": 1.557285873192436e-07, + "loss": 1.174, + "step": 14 + }, + { + "epoch": 0.005009183503088996, + "grad_norm": 18.094045001801177, + "learning_rate": 1.6685205784204674e-07, + "loss": 1.2052, + "step": 15 + }, + { + "epoch": 0.005343129069961597, + "grad_norm": 18.4374027292432, + "learning_rate": 1.7797552836484985e-07, + "loss": 1.2309, + "step": 16 + }, + { + "epoch": 0.005677074636834196, + "grad_norm": 17.289354366652493, + "learning_rate": 1.8909899888765295e-07, + "loss": 1.1751, + "step": 17 + }, + { + "epoch": 0.006011020203706796, + "grad_norm": 18.24833811594456, + "learning_rate": 2.0022246941045608e-07, + "loss": 1.2278, + "step": 18 + }, + { + "epoch": 0.006344965770579396, + "grad_norm": 17.053322563470108, + "learning_rate": 2.113459399332592e-07, + "loss": 1.1721, + "step": 19 + }, + { + "epoch": 0.006678911337451995, + "grad_norm": 17.129792113985143, + "learning_rate": 2.2246941045606232e-07, + "loss": 1.1354, + "step": 20 + }, + { + "epoch": 0.007012856904324595, + "grad_norm": 13.47458828617283, + "learning_rate": 2.3359288097886543e-07, + "loss": 1.0349, + "step": 21 + }, + { + "epoch": 0.0073468024711971945, + "grad_norm": 14.161487265468253, + "learning_rate": 2.4471635150166853e-07, + "loss": 1.12, + "step": 22 + }, + { + "epoch": 0.007680748038069795, + "grad_norm": 13.161488444055747, + "learning_rate": 2.5583982202447166e-07, + "loss": 1.0514, + "step": 23 + }, + { + "epoch": 0.008014693604942395, + "grad_norm": 14.570394994187327, + "learning_rate": 2.669632925472748e-07, + "loss": 1.123, + "step": 24 + }, + { + "epoch": 0.008348639171814994, + "grad_norm": 14.666246390111493, + "learning_rate": 2.780867630700779e-07, + "loss": 1.1715, + "step": 25 + }, + { + "epoch": 0.008682584738687594, + "grad_norm": 14.577788314814974, + "learning_rate": 2.89210233592881e-07, + "loss": 1.1681, + "step": 26 + }, + { + "epoch": 0.009016530305560193, + "grad_norm": 13.214866163706802, + "learning_rate": 3.003337041156841e-07, + "loss": 1.0505, + "step": 27 + }, + { + "epoch": 0.009350475872432793, + "grad_norm": 10.769628248339709, + "learning_rate": 3.114571746384872e-07, + "loss": 0.9765, + "step": 28 + }, + { + "epoch": 0.009684421439305393, + "grad_norm": 10.098307468296536, + "learning_rate": 3.2258064516129035e-07, + "loss": 0.9672, + "step": 29 + }, + { + "epoch": 0.010018367006177992, + "grad_norm": 10.418771316687504, + "learning_rate": 3.337041156840935e-07, + "loss": 0.9561, + "step": 30 + }, + { + "epoch": 0.010352312573050592, + "grad_norm": 8.978717627730614, + "learning_rate": 3.4482758620689656e-07, + "loss": 0.8463, + "step": 31 + }, + { + "epoch": 0.010686258139923193, + "grad_norm": 10.286653113951482, + "learning_rate": 3.559510567296997e-07, + "loss": 0.9665, + "step": 32 + }, + { + "epoch": 0.011020203706795793, + "grad_norm": 10.462353476788321, + "learning_rate": 3.670745272525028e-07, + "loss": 0.9434, + "step": 33 + }, + { + "epoch": 0.011354149273668393, + "grad_norm": 9.714490170332663, + "learning_rate": 3.781979977753059e-07, + "loss": 0.9324, + "step": 34 + }, + { + "epoch": 0.011688094840540992, + "grad_norm": 10.083825623996413, + "learning_rate": 3.8932146829810904e-07, + "loss": 0.9361, + "step": 35 + }, + { + "epoch": 0.012022040407413592, + "grad_norm": 9.694323120504304, + "learning_rate": 4.0044493882091217e-07, + "loss": 0.9045, + "step": 36 + }, + { + "epoch": 0.012355985974286192, + "grad_norm": 9.10401434831957, + "learning_rate": 4.115684093437153e-07, + "loss": 0.8305, + "step": 37 + }, + { + "epoch": 0.012689931541158791, + "grad_norm": 8.912869023969314, + "learning_rate": 4.226918798665184e-07, + "loss": 0.7016, + "step": 38 + }, + { + "epoch": 0.01302387710803139, + "grad_norm": 9.949234492145816, + "learning_rate": 4.338153503893215e-07, + "loss": 0.6717, + "step": 39 + }, + { + "epoch": 0.01335782267490399, + "grad_norm": 9.107965901272914, + "learning_rate": 4.4493882091212464e-07, + "loss": 0.6031, + "step": 40 + }, + { + "epoch": 0.01369176824177659, + "grad_norm": 8.999748095069597, + "learning_rate": 4.560622914349278e-07, + "loss": 0.6396, + "step": 41 + }, + { + "epoch": 0.01402571380864919, + "grad_norm": 8.795599856241033, + "learning_rate": 4.6718576195773085e-07, + "loss": 0.5788, + "step": 42 + }, + { + "epoch": 0.01435965937552179, + "grad_norm": 9.433146095223425, + "learning_rate": 4.783092324805339e-07, + "loss": 0.6102, + "step": 43 + }, + { + "epoch": 0.014693604942394389, + "grad_norm": 7.960008599611662, + "learning_rate": 4.894327030033371e-07, + "loss": 0.5373, + "step": 44 + }, + { + "epoch": 0.01502755050926699, + "grad_norm": 7.366530046034347, + "learning_rate": 5.005561735261402e-07, + "loss": 0.5052, + "step": 45 + }, + { + "epoch": 0.01536149607613959, + "grad_norm": 6.363036907340728, + "learning_rate": 5.116796440489433e-07, + "loss": 0.4649, + "step": 46 + }, + { + "epoch": 0.01569544164301219, + "grad_norm": 5.556475848189961, + "learning_rate": 5.228031145717465e-07, + "loss": 0.4856, + "step": 47 + }, + { + "epoch": 0.01602938720988479, + "grad_norm": 3.83682323248533, + "learning_rate": 5.339265850945496e-07, + "loss": 0.4527, + "step": 48 + }, + { + "epoch": 0.01636333277675739, + "grad_norm": 3.4969259770523546, + "learning_rate": 5.450500556173527e-07, + "loss": 0.4612, + "step": 49 + }, + { + "epoch": 0.01669727834362999, + "grad_norm": 2.9909282013080722, + "learning_rate": 5.561735261401558e-07, + "loss": 0.4505, + "step": 50 + }, + { + "epoch": 0.017031223910502588, + "grad_norm": 2.7014984810055025, + "learning_rate": 5.672969966629589e-07, + "loss": 0.4161, + "step": 51 + }, + { + "epoch": 0.017365169477375188, + "grad_norm": 2.6387827313567604, + "learning_rate": 5.78420467185762e-07, + "loss": 0.4301, + "step": 52 + }, + { + "epoch": 0.017699115044247787, + "grad_norm": 2.4645004633554577, + "learning_rate": 5.89543937708565e-07, + "loss": 0.4103, + "step": 53 + }, + { + "epoch": 0.018033060611120387, + "grad_norm": 2.148956503468746, + "learning_rate": 6.006674082313682e-07, + "loss": 0.4049, + "step": 54 + }, + { + "epoch": 0.018367006177992987, + "grad_norm": 2.0812867730472076, + "learning_rate": 6.117908787541713e-07, + "loss": 0.4195, + "step": 55 + }, + { + "epoch": 0.018700951744865586, + "grad_norm": 1.97350230594877, + "learning_rate": 6.229143492769744e-07, + "loss": 0.3871, + "step": 56 + }, + { + "epoch": 0.019034897311738186, + "grad_norm": 2.0261853192304873, + "learning_rate": 6.340378197997777e-07, + "loss": 0.3907, + "step": 57 + }, + { + "epoch": 0.019368842878610786, + "grad_norm": 1.7884536259702397, + "learning_rate": 6.451612903225807e-07, + "loss": 0.3725, + "step": 58 + }, + { + "epoch": 0.019702788445483385, + "grad_norm": 1.7578524964487043, + "learning_rate": 6.562847608453838e-07, + "loss": 0.364, + "step": 59 + }, + { + "epoch": 0.020036734012355985, + "grad_norm": 1.575642860109224, + "learning_rate": 6.67408231368187e-07, + "loss": 0.3917, + "step": 60 + }, + { + "epoch": 0.020370679579228584, + "grad_norm": 1.4422891613432185, + "learning_rate": 6.785317018909901e-07, + "loss": 0.3494, + "step": 61 + }, + { + "epoch": 0.020704625146101184, + "grad_norm": 1.522233247296773, + "learning_rate": 6.896551724137931e-07, + "loss": 0.3539, + "step": 62 + }, + { + "epoch": 0.021038570712973784, + "grad_norm": 1.6394568552150803, + "learning_rate": 7.007786429365964e-07, + "loss": 0.3686, + "step": 63 + }, + { + "epoch": 0.021372516279846387, + "grad_norm": 1.4992220967488337, + "learning_rate": 7.119021134593994e-07, + "loss": 0.3675, + "step": 64 + }, + { + "epoch": 0.021706461846718986, + "grad_norm": 1.7873771972580739, + "learning_rate": 7.230255839822026e-07, + "loss": 0.3792, + "step": 65 + }, + { + "epoch": 0.022040407413591586, + "grad_norm": 1.5301039424791039, + "learning_rate": 7.341490545050057e-07, + "loss": 0.3561, + "step": 66 + }, + { + "epoch": 0.022374352980464186, + "grad_norm": 1.462805387783217, + "learning_rate": 7.452725250278087e-07, + "loss": 0.363, + "step": 67 + }, + { + "epoch": 0.022708298547336785, + "grad_norm": 1.4879055388402056, + "learning_rate": 7.563959955506118e-07, + "loss": 0.3724, + "step": 68 + }, + { + "epoch": 0.023042244114209385, + "grad_norm": 1.4681875012051924, + "learning_rate": 7.675194660734149e-07, + "loss": 0.3651, + "step": 69 + }, + { + "epoch": 0.023376189681081985, + "grad_norm": 1.547854625804747, + "learning_rate": 7.786429365962181e-07, + "loss": 0.3816, + "step": 70 + }, + { + "epoch": 0.023710135247954584, + "grad_norm": 1.7263953674720272, + "learning_rate": 7.897664071190211e-07, + "loss": 0.3699, + "step": 71 + }, + { + "epoch": 0.024044080814827184, + "grad_norm": 1.2751997064298404, + "learning_rate": 8.008898776418243e-07, + "loss": 0.3418, + "step": 72 + }, + { + "epoch": 0.024378026381699783, + "grad_norm": 1.4359100529226376, + "learning_rate": 8.120133481646274e-07, + "loss": 0.3693, + "step": 73 + }, + { + "epoch": 0.024711971948572383, + "grad_norm": 1.2749760525328362, + "learning_rate": 8.231368186874306e-07, + "loss": 0.3323, + "step": 74 + }, + { + "epoch": 0.025045917515444983, + "grad_norm": 1.2855735849326455, + "learning_rate": 8.342602892102336e-07, + "loss": 0.3407, + "step": 75 + }, + { + "epoch": 0.025379863082317582, + "grad_norm": 1.2508196970444088, + "learning_rate": 8.453837597330368e-07, + "loss": 0.3518, + "step": 76 + }, + { + "epoch": 0.025713808649190182, + "grad_norm": 1.406994531333698, + "learning_rate": 8.565072302558399e-07, + "loss": 0.3381, + "step": 77 + }, + { + "epoch": 0.02604775421606278, + "grad_norm": 1.4630422438671775, + "learning_rate": 8.67630700778643e-07, + "loss": 0.3213, + "step": 78 + }, + { + "epoch": 0.02638169978293538, + "grad_norm": 1.4111611274728295, + "learning_rate": 8.78754171301446e-07, + "loss": 0.348, + "step": 79 + }, + { + "epoch": 0.02671564534980798, + "grad_norm": 1.2796063016816839, + "learning_rate": 8.898776418242493e-07, + "loss": 0.3248, + "step": 80 + }, + { + "epoch": 0.02704959091668058, + "grad_norm": 1.5090068302982718, + "learning_rate": 9.010011123470523e-07, + "loss": 0.3559, + "step": 81 + }, + { + "epoch": 0.02738353648355318, + "grad_norm": 1.2343790950745581, + "learning_rate": 9.121245828698556e-07, + "loss": 0.3082, + "step": 82 + }, + { + "epoch": 0.02771748205042578, + "grad_norm": 1.1891735311805842, + "learning_rate": 9.232480533926586e-07, + "loss": 0.3116, + "step": 83 + }, + { + "epoch": 0.02805142761729838, + "grad_norm": 1.3062993215992929, + "learning_rate": 9.343715239154617e-07, + "loss": 0.3292, + "step": 84 + }, + { + "epoch": 0.02838537318417098, + "grad_norm": 1.2819392779749308, + "learning_rate": 9.454949944382647e-07, + "loss": 0.3395, + "step": 85 + }, + { + "epoch": 0.02871931875104358, + "grad_norm": 1.4416842302356845, + "learning_rate": 9.566184649610679e-07, + "loss": 0.336, + "step": 86 + }, + { + "epoch": 0.02905326431791618, + "grad_norm": 1.3987710774933, + "learning_rate": 9.67741935483871e-07, + "loss": 0.3063, + "step": 87 + }, + { + "epoch": 0.029387209884788778, + "grad_norm": 1.4418014348260515, + "learning_rate": 9.788654060066741e-07, + "loss": 0.3607, + "step": 88 + }, + { + "epoch": 0.029721155451661378, + "grad_norm": 1.585329350183969, + "learning_rate": 9.899888765294773e-07, + "loss": 0.3271, + "step": 89 + }, + { + "epoch": 0.03005510101853398, + "grad_norm": 1.6528669573974764, + "learning_rate": 1.0011123470522804e-06, + "loss": 0.3312, + "step": 90 + }, + { + "epoch": 0.03038904658540658, + "grad_norm": 1.5673416929359354, + "learning_rate": 1.0122358175750835e-06, + "loss": 0.3362, + "step": 91 + }, + { + "epoch": 0.03072299215227918, + "grad_norm": 1.3538512444375355, + "learning_rate": 1.0233592880978867e-06, + "loss": 0.3103, + "step": 92 + }, + { + "epoch": 0.03105693771915178, + "grad_norm": 1.266414511944769, + "learning_rate": 1.0344827586206898e-06, + "loss": 0.3476, + "step": 93 + }, + { + "epoch": 0.03139088328602438, + "grad_norm": 1.2381813473063292, + "learning_rate": 1.045606229143493e-06, + "loss": 0.322, + "step": 94 + }, + { + "epoch": 0.03172482885289698, + "grad_norm": 1.3517795579781238, + "learning_rate": 1.056729699666296e-06, + "loss": 0.3428, + "step": 95 + }, + { + "epoch": 0.03205877441976958, + "grad_norm": 1.2503553987008789, + "learning_rate": 1.0678531701890992e-06, + "loss": 0.3324, + "step": 96 + }, + { + "epoch": 0.03239271998664218, + "grad_norm": 1.2235838797074612, + "learning_rate": 1.0789766407119021e-06, + "loss": 0.3077, + "step": 97 + }, + { + "epoch": 0.03272666555351478, + "grad_norm": 1.2342368229237566, + "learning_rate": 1.0901001112347055e-06, + "loss": 0.3123, + "step": 98 + }, + { + "epoch": 0.03306061112038738, + "grad_norm": 1.3993217559841238, + "learning_rate": 1.1012235817575084e-06, + "loss": 0.3157, + "step": 99 + }, + { + "epoch": 0.03339455668725998, + "grad_norm": 1.187047636224087, + "learning_rate": 1.1123470522803115e-06, + "loss": 0.3115, + "step": 100 + }, + { + "epoch": 0.03372850225413258, + "grad_norm": 1.1888949396982949, + "learning_rate": 1.1234705228031146e-06, + "loss": 0.314, + "step": 101 + }, + { + "epoch": 0.034062447821005176, + "grad_norm": 1.3282629053014763, + "learning_rate": 1.1345939933259178e-06, + "loss": 0.3134, + "step": 102 + }, + { + "epoch": 0.034396393387877776, + "grad_norm": 1.409494192960795, + "learning_rate": 1.145717463848721e-06, + "loss": 0.343, + "step": 103 + }, + { + "epoch": 0.034730338954750375, + "grad_norm": 1.3955053377524733, + "learning_rate": 1.156840934371524e-06, + "loss": 0.3294, + "step": 104 + }, + { + "epoch": 0.035064284521622975, + "grad_norm": 1.1921009474473023, + "learning_rate": 1.1679644048943272e-06, + "loss": 0.2927, + "step": 105 + }, + { + "epoch": 0.035398230088495575, + "grad_norm": 1.342026976228603, + "learning_rate": 1.17908787541713e-06, + "loss": 0.3005, + "step": 106 + }, + { + "epoch": 0.035732175655368174, + "grad_norm": 1.4229030988806093, + "learning_rate": 1.1902113459399334e-06, + "loss": 0.333, + "step": 107 + }, + { + "epoch": 0.036066121222240774, + "grad_norm": 1.1373972993813148, + "learning_rate": 1.2013348164627363e-06, + "loss": 0.3059, + "step": 108 + }, + { + "epoch": 0.036400066789113374, + "grad_norm": 1.3470188636614417, + "learning_rate": 1.2124582869855397e-06, + "loss": 0.3168, + "step": 109 + }, + { + "epoch": 0.03673401235598597, + "grad_norm": 1.2640906475959344, + "learning_rate": 1.2235817575083426e-06, + "loss": 0.3064, + "step": 110 + }, + { + "epoch": 0.03706795792285857, + "grad_norm": 1.244712541251493, + "learning_rate": 1.2347052280311457e-06, + "loss": 0.3063, + "step": 111 + }, + { + "epoch": 0.03740190348973117, + "grad_norm": 1.0580996310724993, + "learning_rate": 1.2458286985539489e-06, + "loss": 0.2849, + "step": 112 + }, + { + "epoch": 0.03773584905660377, + "grad_norm": 1.3180651427074217, + "learning_rate": 1.256952169076752e-06, + "loss": 0.3112, + "step": 113 + }, + { + "epoch": 0.03806979462347637, + "grad_norm": 1.285650915666917, + "learning_rate": 1.2680756395995554e-06, + "loss": 0.3156, + "step": 114 + }, + { + "epoch": 0.03840374019034897, + "grad_norm": 1.2921513400307592, + "learning_rate": 1.2791991101223583e-06, + "loss": 0.3102, + "step": 115 + }, + { + "epoch": 0.03873768575722157, + "grad_norm": 1.1878797730987274, + "learning_rate": 1.2903225806451614e-06, + "loss": 0.2986, + "step": 116 + }, + { + "epoch": 0.03907163132409417, + "grad_norm": 1.2560584793651444, + "learning_rate": 1.3014460511679643e-06, + "loss": 0.2989, + "step": 117 + }, + { + "epoch": 0.03940557689096677, + "grad_norm": 1.2643345435783993, + "learning_rate": 1.3125695216907677e-06, + "loss": 0.3233, + "step": 118 + }, + { + "epoch": 0.03973952245783937, + "grad_norm": 1.3041047613547743, + "learning_rate": 1.3236929922135708e-06, + "loss": 0.336, + "step": 119 + }, + { + "epoch": 0.04007346802471197, + "grad_norm": 1.2031369067707778, + "learning_rate": 1.334816462736374e-06, + "loss": 0.304, + "step": 120 + }, + { + "epoch": 0.04040741359158457, + "grad_norm": 1.1399223289092129, + "learning_rate": 1.3459399332591769e-06, + "loss": 0.2892, + "step": 121 + }, + { + "epoch": 0.04074135915845717, + "grad_norm": 1.303855873648094, + "learning_rate": 1.3570634037819802e-06, + "loss": 0.3095, + "step": 122 + }, + { + "epoch": 0.04107530472532977, + "grad_norm": 1.0649829493695344, + "learning_rate": 1.3681868743047833e-06, + "loss": 0.285, + "step": 123 + }, + { + "epoch": 0.04140925029220237, + "grad_norm": 1.415926257488088, + "learning_rate": 1.3793103448275862e-06, + "loss": 0.3394, + "step": 124 + }, + { + "epoch": 0.04174319585907497, + "grad_norm": 1.183128643867534, + "learning_rate": 1.3904338153503894e-06, + "loss": 0.3079, + "step": 125 + }, + { + "epoch": 0.04207714142594757, + "grad_norm": 1.3177462295230522, + "learning_rate": 1.4015572858731927e-06, + "loss": 0.3369, + "step": 126 + }, + { + "epoch": 0.04241108699282017, + "grad_norm": 1.3473486897101608, + "learning_rate": 1.4126807563959956e-06, + "loss": 0.3045, + "step": 127 + }, + { + "epoch": 0.042745032559692774, + "grad_norm": 1.2390438395244772, + "learning_rate": 1.4238042269187988e-06, + "loss": 0.2975, + "step": 128 + }, + { + "epoch": 0.04307897812656537, + "grad_norm": 1.2740946670644597, + "learning_rate": 1.434927697441602e-06, + "loss": 0.3059, + "step": 129 + }, + { + "epoch": 0.04341292369343797, + "grad_norm": 1.3012091566446777, + "learning_rate": 1.4460511679644053e-06, + "loss": 0.3079, + "step": 130 + }, + { + "epoch": 0.04374686926031057, + "grad_norm": 1.1947097037496526, + "learning_rate": 1.4571746384872082e-06, + "loss": 0.3052, + "step": 131 + }, + { + "epoch": 0.04408081482718317, + "grad_norm": 1.1432786230589478, + "learning_rate": 1.4682981090100113e-06, + "loss": 0.3139, + "step": 132 + }, + { + "epoch": 0.04441476039405577, + "grad_norm": 1.3142624785291002, + "learning_rate": 1.4794215795328142e-06, + "loss": 0.3004, + "step": 133 + }, + { + "epoch": 0.04474870596092837, + "grad_norm": 1.1091244974414516, + "learning_rate": 1.4905450500556174e-06, + "loss": 0.2872, + "step": 134 + }, + { + "epoch": 0.04508265152780097, + "grad_norm": 1.3022437706824233, + "learning_rate": 1.5016685205784207e-06, + "loss": 0.3114, + "step": 135 + }, + { + "epoch": 0.04541659709467357, + "grad_norm": 1.323275845400312, + "learning_rate": 1.5127919911012236e-06, + "loss": 0.3265, + "step": 136 + }, + { + "epoch": 0.04575054266154617, + "grad_norm": 1.24079589522644, + "learning_rate": 1.5239154616240268e-06, + "loss": 0.2973, + "step": 137 + }, + { + "epoch": 0.04608448822841877, + "grad_norm": 1.0994743200549912, + "learning_rate": 1.5350389321468299e-06, + "loss": 0.2917, + "step": 138 + }, + { + "epoch": 0.04641843379529137, + "grad_norm": 1.1757509816140939, + "learning_rate": 1.5461624026696332e-06, + "loss": 0.2896, + "step": 139 + }, + { + "epoch": 0.04675237936216397, + "grad_norm": 1.097534938340695, + "learning_rate": 1.5572858731924361e-06, + "loss": 0.281, + "step": 140 + }, + { + "epoch": 0.04708632492903657, + "grad_norm": 1.1159015324002537, + "learning_rate": 1.5684093437152393e-06, + "loss": 0.2902, + "step": 141 + }, + { + "epoch": 0.04742027049590917, + "grad_norm": 1.2320769642378695, + "learning_rate": 1.5795328142380422e-06, + "loss": 0.3015, + "step": 142 + }, + { + "epoch": 0.04775421606278177, + "grad_norm": 1.2268199663025805, + "learning_rate": 1.5906562847608455e-06, + "loss": 0.2986, + "step": 143 + }, + { + "epoch": 0.04808816162965437, + "grad_norm": 1.106082896796778, + "learning_rate": 1.6017797552836487e-06, + "loss": 0.2824, + "step": 144 + }, + { + "epoch": 0.04842210719652697, + "grad_norm": 1.0688508167162885, + "learning_rate": 1.6129032258064516e-06, + "loss": 0.2887, + "step": 145 + }, + { + "epoch": 0.04875605276339957, + "grad_norm": 1.2355684355753798, + "learning_rate": 1.6240266963292547e-06, + "loss": 0.3029, + "step": 146 + }, + { + "epoch": 0.049089998330272167, + "grad_norm": 1.2608745789622653, + "learning_rate": 1.635150166852058e-06, + "loss": 0.3053, + "step": 147 + }, + { + "epoch": 0.049423943897144766, + "grad_norm": 1.2087145826619097, + "learning_rate": 1.6462736373748612e-06, + "loss": 0.3074, + "step": 148 + }, + { + "epoch": 0.049757889464017366, + "grad_norm": 1.4191211752662398, + "learning_rate": 1.6573971078976641e-06, + "loss": 0.3018, + "step": 149 + }, + { + "epoch": 0.050091835030889965, + "grad_norm": 1.6005040312929983, + "learning_rate": 1.6685205784204673e-06, + "loss": 0.2992, + "step": 150 + }, + { + "epoch": 0.050425780597762565, + "grad_norm": 1.3177431441471539, + "learning_rate": 1.6796440489432706e-06, + "loss": 0.3013, + "step": 151 + }, + { + "epoch": 0.050759726164635165, + "grad_norm": 1.0667233844048953, + "learning_rate": 1.6907675194660735e-06, + "loss": 0.2853, + "step": 152 + }, + { + "epoch": 0.051093671731507764, + "grad_norm": 1.2197105719883872, + "learning_rate": 1.7018909899888767e-06, + "loss": 0.2957, + "step": 153 + }, + { + "epoch": 0.051427617298380364, + "grad_norm": 1.2749427439891368, + "learning_rate": 1.7130144605116798e-06, + "loss": 0.2826, + "step": 154 + }, + { + "epoch": 0.051761562865252964, + "grad_norm": 1.3919022833764747, + "learning_rate": 1.724137931034483e-06, + "loss": 0.2828, + "step": 155 + }, + { + "epoch": 0.05209550843212556, + "grad_norm": 1.1706667684484162, + "learning_rate": 1.735261401557286e-06, + "loss": 0.2834, + "step": 156 + }, + { + "epoch": 0.05242945399899816, + "grad_norm": 1.1385584136115947, + "learning_rate": 1.7463848720800892e-06, + "loss": 0.278, + "step": 157 + }, + { + "epoch": 0.05276339956587076, + "grad_norm": 1.2784468592463725, + "learning_rate": 1.757508342602892e-06, + "loss": 0.2878, + "step": 158 + }, + { + "epoch": 0.05309734513274336, + "grad_norm": 1.0828791709613537, + "learning_rate": 1.7686318131256954e-06, + "loss": 0.2843, + "step": 159 + }, + { + "epoch": 0.05343129069961596, + "grad_norm": 1.0858986938430426, + "learning_rate": 1.7797552836484986e-06, + "loss": 0.2803, + "step": 160 + }, + { + "epoch": 0.05376523626648856, + "grad_norm": 1.4160711682479377, + "learning_rate": 1.7908787541713015e-06, + "loss": 0.2681, + "step": 161 + }, + { + "epoch": 0.05409918183336116, + "grad_norm": 1.1622241308379426, + "learning_rate": 1.8020022246941046e-06, + "loss": 0.2942, + "step": 162 + }, + { + "epoch": 0.05443312740023376, + "grad_norm": 1.3244697401189178, + "learning_rate": 1.813125695216908e-06, + "loss": 0.3069, + "step": 163 + }, + { + "epoch": 0.05476707296710636, + "grad_norm": 1.1651688282873625, + "learning_rate": 1.824249165739711e-06, + "loss": 0.2942, + "step": 164 + }, + { + "epoch": 0.05510101853397896, + "grad_norm": 1.3401516990160998, + "learning_rate": 1.835372636262514e-06, + "loss": 0.2979, + "step": 165 + }, + { + "epoch": 0.05543496410085156, + "grad_norm": 1.1232106728216595, + "learning_rate": 1.8464961067853172e-06, + "loss": 0.2919, + "step": 166 + }, + { + "epoch": 0.05576890966772416, + "grad_norm": 0.9645342716426896, + "learning_rate": 1.85761957730812e-06, + "loss": 0.2833, + "step": 167 + }, + { + "epoch": 0.05610285523459676, + "grad_norm": 1.2735387795044881, + "learning_rate": 1.8687430478309234e-06, + "loss": 0.311, + "step": 168 + }, + { + "epoch": 0.05643680080146936, + "grad_norm": 1.1593324040382949, + "learning_rate": 1.8798665183537266e-06, + "loss": 0.3005, + "step": 169 + }, + { + "epoch": 0.05677074636834196, + "grad_norm": 1.0571733025122132, + "learning_rate": 1.8909899888765295e-06, + "loss": 0.2871, + "step": 170 + }, + { + "epoch": 0.05710469193521456, + "grad_norm": 1.1772637818135905, + "learning_rate": 1.9021134593993326e-06, + "loss": 0.2963, + "step": 171 + }, + { + "epoch": 0.05743863750208716, + "grad_norm": 1.2648890164784086, + "learning_rate": 1.9132369299221357e-06, + "loss": 0.284, + "step": 172 + }, + { + "epoch": 0.05777258306895976, + "grad_norm": 1.257741155911166, + "learning_rate": 1.924360400444939e-06, + "loss": 0.3123, + "step": 173 + }, + { + "epoch": 0.05810652863583236, + "grad_norm": 1.2076339575951947, + "learning_rate": 1.935483870967742e-06, + "loss": 0.2968, + "step": 174 + }, + { + "epoch": 0.058440474202704956, + "grad_norm": 1.3831077669462297, + "learning_rate": 1.946607341490545e-06, + "loss": 0.302, + "step": 175 + }, + { + "epoch": 0.058774419769577556, + "grad_norm": 1.2392310046049393, + "learning_rate": 1.9577308120133483e-06, + "loss": 0.3019, + "step": 176 + }, + { + "epoch": 0.059108365336450155, + "grad_norm": 1.0630699642754082, + "learning_rate": 1.9688542825361514e-06, + "loss": 0.2794, + "step": 177 + }, + { + "epoch": 0.059442310903322755, + "grad_norm": 1.2231288730901275, + "learning_rate": 1.9799777530589545e-06, + "loss": 0.3036, + "step": 178 + }, + { + "epoch": 0.059776256470195355, + "grad_norm": 1.054287375357212, + "learning_rate": 1.9911012235817577e-06, + "loss": 0.2776, + "step": 179 + }, + { + "epoch": 0.06011020203706796, + "grad_norm": 1.3813659231450461, + "learning_rate": 2.002224694104561e-06, + "loss": 0.312, + "step": 180 + }, + { + "epoch": 0.06044414760394056, + "grad_norm": 1.223098684904635, + "learning_rate": 2.013348164627364e-06, + "loss": 0.2986, + "step": 181 + }, + { + "epoch": 0.06077809317081316, + "grad_norm": 1.306919633091064, + "learning_rate": 2.024471635150167e-06, + "loss": 0.3263, + "step": 182 + }, + { + "epoch": 0.06111203873768576, + "grad_norm": 1.2007296717327645, + "learning_rate": 2.03559510567297e-06, + "loss": 0.2856, + "step": 183 + }, + { + "epoch": 0.06144598430455836, + "grad_norm": 1.2020773671581122, + "learning_rate": 2.0467185761957733e-06, + "loss": 0.2777, + "step": 184 + }, + { + "epoch": 0.06177992987143096, + "grad_norm": 1.196861175909905, + "learning_rate": 2.0578420467185764e-06, + "loss": 0.2833, + "step": 185 + }, + { + "epoch": 0.06211387543830356, + "grad_norm": 1.2558516865319087, + "learning_rate": 2.0689655172413796e-06, + "loss": 0.2763, + "step": 186 + }, + { + "epoch": 0.06244782100517616, + "grad_norm": 1.0444075488420261, + "learning_rate": 2.0800889877641823e-06, + "loss": 0.2759, + "step": 187 + }, + { + "epoch": 0.06278176657204876, + "grad_norm": 1.4420044463146366, + "learning_rate": 2.091212458286986e-06, + "loss": 0.3064, + "step": 188 + }, + { + "epoch": 0.06311571213892135, + "grad_norm": 1.2416958553865685, + "learning_rate": 2.102335928809789e-06, + "loss": 0.3063, + "step": 189 + }, + { + "epoch": 0.06344965770579396, + "grad_norm": 1.0748461696024694, + "learning_rate": 2.113459399332592e-06, + "loss": 0.2752, + "step": 190 + }, + { + "epoch": 0.06378360327266655, + "grad_norm": 1.1742498266635828, + "learning_rate": 2.124582869855395e-06, + "loss": 0.2744, + "step": 191 + }, + { + "epoch": 0.06411754883953916, + "grad_norm": 1.0861534931135144, + "learning_rate": 2.1357063403781984e-06, + "loss": 0.2776, + "step": 192 + }, + { + "epoch": 0.06445149440641175, + "grad_norm": 1.3347076544814778, + "learning_rate": 2.1468298109010015e-06, + "loss": 0.2735, + "step": 193 + }, + { + "epoch": 0.06478543997328436, + "grad_norm": 1.2383353005287574, + "learning_rate": 2.1579532814238042e-06, + "loss": 0.274, + "step": 194 + }, + { + "epoch": 0.06511938554015695, + "grad_norm": 1.054486003296379, + "learning_rate": 2.1690767519466073e-06, + "loss": 0.284, + "step": 195 + }, + { + "epoch": 0.06545333110702956, + "grad_norm": 1.2422895734951493, + "learning_rate": 2.180200222469411e-06, + "loss": 0.271, + "step": 196 + }, + { + "epoch": 0.06578727667390215, + "grad_norm": 1.2952773756126603, + "learning_rate": 2.1913236929922136e-06, + "loss": 0.2972, + "step": 197 + }, + { + "epoch": 0.06612122224077475, + "grad_norm": 1.2211536848947333, + "learning_rate": 2.2024471635150167e-06, + "loss": 0.2841, + "step": 198 + }, + { + "epoch": 0.06645516780764735, + "grad_norm": 1.197513650785667, + "learning_rate": 2.21357063403782e-06, + "loss": 0.3226, + "step": 199 + }, + { + "epoch": 0.06678911337451995, + "grad_norm": 1.2718959471586204, + "learning_rate": 2.224694104560623e-06, + "loss": 0.3052, + "step": 200 + }, + { + "epoch": 0.06712305894139255, + "grad_norm": 1.11207328849927, + "learning_rate": 2.235817575083426e-06, + "loss": 0.2752, + "step": 201 + }, + { + "epoch": 0.06745700450826515, + "grad_norm": 1.0206175010699121, + "learning_rate": 2.2469410456062293e-06, + "loss": 0.2919, + "step": 202 + }, + { + "epoch": 0.06779095007513775, + "grad_norm": 1.0839833033777182, + "learning_rate": 2.2580645161290324e-06, + "loss": 0.2942, + "step": 203 + }, + { + "epoch": 0.06812489564201035, + "grad_norm": 1.078908508453811, + "learning_rate": 2.2691879866518355e-06, + "loss": 0.2758, + "step": 204 + }, + { + "epoch": 0.06845884120888296, + "grad_norm": 1.4625898609451118, + "learning_rate": 2.2803114571746387e-06, + "loss": 0.2909, + "step": 205 + }, + { + "epoch": 0.06879278677575555, + "grad_norm": 1.1657831625637678, + "learning_rate": 2.291434927697442e-06, + "loss": 0.2901, + "step": 206 + }, + { + "epoch": 0.06912673234262816, + "grad_norm": 1.1163150602739882, + "learning_rate": 2.302558398220245e-06, + "loss": 0.2836, + "step": 207 + }, + { + "epoch": 0.06946067790950075, + "grad_norm": 1.0394061522813909, + "learning_rate": 2.313681868743048e-06, + "loss": 0.2928, + "step": 208 + }, + { + "epoch": 0.06979462347637336, + "grad_norm": 1.0257281218495777, + "learning_rate": 2.324805339265851e-06, + "loss": 0.282, + "step": 209 + }, + { + "epoch": 0.07012856904324595, + "grad_norm": 1.060343943193521, + "learning_rate": 2.3359288097886543e-06, + "loss": 0.2494, + "step": 210 + }, + { + "epoch": 0.07046251461011856, + "grad_norm": 1.1754632761919408, + "learning_rate": 2.3470522803114575e-06, + "loss": 0.2875, + "step": 211 + }, + { + "epoch": 0.07079646017699115, + "grad_norm": 1.3889723569823251, + "learning_rate": 2.35817575083426e-06, + "loss": 0.2776, + "step": 212 + }, + { + "epoch": 0.07113040574386376, + "grad_norm": 1.1026304990108282, + "learning_rate": 2.3692992213570637e-06, + "loss": 0.2813, + "step": 213 + }, + { + "epoch": 0.07146435131073635, + "grad_norm": 1.3540421203167579, + "learning_rate": 2.380422691879867e-06, + "loss": 0.3084, + "step": 214 + }, + { + "epoch": 0.07179829687760896, + "grad_norm": 0.9644640688215597, + "learning_rate": 2.39154616240267e-06, + "loss": 0.2526, + "step": 215 + }, + { + "epoch": 0.07213224244448155, + "grad_norm": 1.077328685424387, + "learning_rate": 2.4026696329254727e-06, + "loss": 0.2925, + "step": 216 + }, + { + "epoch": 0.07246618801135415, + "grad_norm": 1.1941534402934753, + "learning_rate": 2.4137931034482762e-06, + "loss": 0.2912, + "step": 217 + }, + { + "epoch": 0.07280013357822675, + "grad_norm": 1.2216996006544307, + "learning_rate": 2.4249165739710794e-06, + "loss": 0.2926, + "step": 218 + }, + { + "epoch": 0.07313407914509935, + "grad_norm": 1.0288345472230127, + "learning_rate": 2.436040044493882e-06, + "loss": 0.2699, + "step": 219 + }, + { + "epoch": 0.07346802471197195, + "grad_norm": 1.108118580799365, + "learning_rate": 2.4471635150166852e-06, + "loss": 0.2705, + "step": 220 + }, + { + "epoch": 0.07380197027884455, + "grad_norm": 1.0837312318999974, + "learning_rate": 2.4582869855394888e-06, + "loss": 0.2926, + "step": 221 + }, + { + "epoch": 0.07413591584571715, + "grad_norm": 1.078034579186348, + "learning_rate": 2.4694104560622915e-06, + "loss": 0.2625, + "step": 222 + }, + { + "epoch": 0.07446986141258975, + "grad_norm": 1.1387659740587366, + "learning_rate": 2.4805339265850946e-06, + "loss": 0.2878, + "step": 223 + }, + { + "epoch": 0.07480380697946235, + "grad_norm": 1.1289059064466833, + "learning_rate": 2.4916573971078977e-06, + "loss": 0.2725, + "step": 224 + }, + { + "epoch": 0.07513775254633495, + "grad_norm": 1.1154801276679143, + "learning_rate": 2.502780867630701e-06, + "loss": 0.2796, + "step": 225 + }, + { + "epoch": 0.07547169811320754, + "grad_norm": 1.1637231705683622, + "learning_rate": 2.513904338153504e-06, + "loss": 0.2988, + "step": 226 + }, + { + "epoch": 0.07580564368008015, + "grad_norm": 1.0890226181580156, + "learning_rate": 2.5250278086763076e-06, + "loss": 0.2884, + "step": 227 + }, + { + "epoch": 0.07613958924695274, + "grad_norm": 1.133532164208556, + "learning_rate": 2.5361512791991107e-06, + "loss": 0.2896, + "step": 228 + }, + { + "epoch": 0.07647353481382535, + "grad_norm": 1.0864431293764834, + "learning_rate": 2.5472747497219134e-06, + "loss": 0.2778, + "step": 229 + }, + { + "epoch": 0.07680748038069794, + "grad_norm": 1.146220316681448, + "learning_rate": 2.5583982202447165e-06, + "loss": 0.3008, + "step": 230 + }, + { + "epoch": 0.07714142594757055, + "grad_norm": 0.9719070473034946, + "learning_rate": 2.5695216907675197e-06, + "loss": 0.2865, + "step": 231 + }, + { + "epoch": 0.07747537151444314, + "grad_norm": 0.9482297604156307, + "learning_rate": 2.580645161290323e-06, + "loss": 0.2541, + "step": 232 + }, + { + "epoch": 0.07780931708131575, + "grad_norm": 1.154538692385191, + "learning_rate": 2.591768631813126e-06, + "loss": 0.2743, + "step": 233 + }, + { + "epoch": 0.07814326264818834, + "grad_norm": 1.228755098669564, + "learning_rate": 2.6028921023359286e-06, + "loss": 0.2903, + "step": 234 + }, + { + "epoch": 0.07847720821506095, + "grad_norm": 1.225373106150885, + "learning_rate": 2.6140155728587318e-06, + "loss": 0.2766, + "step": 235 + }, + { + "epoch": 0.07881115378193354, + "grad_norm": 1.1650718474694142, + "learning_rate": 2.6251390433815353e-06, + "loss": 0.2647, + "step": 236 + }, + { + "epoch": 0.07914509934880615, + "grad_norm": 0.9954249618414314, + "learning_rate": 2.6362625139043385e-06, + "loss": 0.263, + "step": 237 + }, + { + "epoch": 0.07947904491567874, + "grad_norm": 1.137248317039158, + "learning_rate": 2.6473859844271416e-06, + "loss": 0.2792, + "step": 238 + }, + { + "epoch": 0.07981299048255135, + "grad_norm": 1.0766424441389253, + "learning_rate": 2.6585094549499447e-06, + "loss": 0.295, + "step": 239 + }, + { + "epoch": 0.08014693604942394, + "grad_norm": 1.133299162391096, + "learning_rate": 2.669632925472748e-06, + "loss": 0.2874, + "step": 240 + }, + { + "epoch": 0.08048088161629655, + "grad_norm": 1.6190237814839594, + "learning_rate": 2.6807563959955506e-06, + "loss": 0.2775, + "step": 241 + }, + { + "epoch": 0.08081482718316914, + "grad_norm": 1.0781192090641312, + "learning_rate": 2.6918798665183537e-06, + "loss": 0.2782, + "step": 242 + }, + { + "epoch": 0.08114877275004174, + "grad_norm": 1.1272432543088633, + "learning_rate": 2.703003337041157e-06, + "loss": 0.2843, + "step": 243 + }, + { + "epoch": 0.08148271831691434, + "grad_norm": 1.2925306771786247, + "learning_rate": 2.7141268075639604e-06, + "loss": 0.2803, + "step": 244 + }, + { + "epoch": 0.08181666388378694, + "grad_norm": 1.016699529674434, + "learning_rate": 2.7252502780867635e-06, + "loss": 0.2605, + "step": 245 + }, + { + "epoch": 0.08215060945065954, + "grad_norm": 1.0899875102025414, + "learning_rate": 2.7363737486095667e-06, + "loss": 0.2841, + "step": 246 + }, + { + "epoch": 0.08248455501753214, + "grad_norm": 1.289631959826523, + "learning_rate": 2.7474972191323694e-06, + "loss": 0.2904, + "step": 247 + }, + { + "epoch": 0.08281850058440474, + "grad_norm": 1.2578086396698145, + "learning_rate": 2.7586206896551725e-06, + "loss": 0.2691, + "step": 248 + }, + { + "epoch": 0.08315244615127734, + "grad_norm": 0.9712514626265429, + "learning_rate": 2.7697441601779756e-06, + "loss": 0.2771, + "step": 249 + }, + { + "epoch": 0.08348639171814994, + "grad_norm": 1.2110417920398469, + "learning_rate": 2.7808676307007788e-06, + "loss": 0.2746, + "step": 250 + }, + { + "epoch": 0.08382033728502254, + "grad_norm": 1.1917994392155153, + "learning_rate": 2.791991101223582e-06, + "loss": 0.281, + "step": 251 + }, + { + "epoch": 0.08415428285189513, + "grad_norm": 1.0773885616519046, + "learning_rate": 2.8031145717463854e-06, + "loss": 0.2766, + "step": 252 + }, + { + "epoch": 0.08448822841876774, + "grad_norm": 1.1477058723357543, + "learning_rate": 2.8142380422691886e-06, + "loss": 0.2868, + "step": 253 + }, + { + "epoch": 0.08482217398564033, + "grad_norm": 1.1308647572547197, + "learning_rate": 2.8253615127919913e-06, + "loss": 0.2809, + "step": 254 + }, + { + "epoch": 0.08515611955251294, + "grad_norm": 1.1886012198694917, + "learning_rate": 2.8364849833147944e-06, + "loss": 0.2838, + "step": 255 + }, + { + "epoch": 0.08549006511938555, + "grad_norm": 1.0917302174368306, + "learning_rate": 2.8476084538375975e-06, + "loss": 0.2966, + "step": 256 + }, + { + "epoch": 0.08582401068625814, + "grad_norm": 1.0628706382181317, + "learning_rate": 2.8587319243604007e-06, + "loss": 0.2746, + "step": 257 + }, + { + "epoch": 0.08615795625313075, + "grad_norm": 0.9011209090285437, + "learning_rate": 2.869855394883204e-06, + "loss": 0.2663, + "step": 258 + }, + { + "epoch": 0.08649190182000334, + "grad_norm": 0.9699244069128968, + "learning_rate": 2.8809788654060065e-06, + "loss": 0.2732, + "step": 259 + }, + { + "epoch": 0.08682584738687595, + "grad_norm": 1.0368860734439413, + "learning_rate": 2.8921023359288105e-06, + "loss": 0.2703, + "step": 260 + }, + { + "epoch": 0.08715979295374854, + "grad_norm": 1.1688091750555711, + "learning_rate": 2.903225806451613e-06, + "loss": 0.2888, + "step": 261 + }, + { + "epoch": 0.08749373852062114, + "grad_norm": 1.004959730063203, + "learning_rate": 2.9143492769744163e-06, + "loss": 0.2576, + "step": 262 + }, + { + "epoch": 0.08782768408749374, + "grad_norm": 0.9991808356904461, + "learning_rate": 2.9254727474972195e-06, + "loss": 0.2625, + "step": 263 + }, + { + "epoch": 0.08816162965436634, + "grad_norm": 0.9825510412554146, + "learning_rate": 2.9365962180200226e-06, + "loss": 0.2632, + "step": 264 + }, + { + "epoch": 0.08849557522123894, + "grad_norm": 0.9821153217304937, + "learning_rate": 2.9477196885428257e-06, + "loss": 0.2713, + "step": 265 + }, + { + "epoch": 0.08882952078811154, + "grad_norm": 1.1614726347763442, + "learning_rate": 2.9588431590656284e-06, + "loss": 0.282, + "step": 266 + }, + { + "epoch": 0.08916346635498414, + "grad_norm": 1.7691717435465628, + "learning_rate": 2.9699666295884316e-06, + "loss": 0.2799, + "step": 267 + }, + { + "epoch": 0.08949741192185674, + "grad_norm": 0.9974536728011792, + "learning_rate": 2.9810901001112347e-06, + "loss": 0.2754, + "step": 268 + }, + { + "epoch": 0.08983135748872934, + "grad_norm": 1.1409552236974378, + "learning_rate": 2.9922135706340383e-06, + "loss": 0.2974, + "step": 269 + }, + { + "epoch": 0.09016530305560194, + "grad_norm": 0.938700704239528, + "learning_rate": 3.0033370411568414e-06, + "loss": 0.2595, + "step": 270 + }, + { + "epoch": 0.09049924862247453, + "grad_norm": 1.1094304035830367, + "learning_rate": 3.0144605116796445e-06, + "loss": 0.2783, + "step": 271 + }, + { + "epoch": 0.09083319418934714, + "grad_norm": 1.0187603452237721, + "learning_rate": 3.0255839822024472e-06, + "loss": 0.272, + "step": 272 + }, + { + "epoch": 0.09116713975621973, + "grad_norm": 0.9507482461915819, + "learning_rate": 3.0367074527252504e-06, + "loss": 0.2786, + "step": 273 + }, + { + "epoch": 0.09150108532309234, + "grad_norm": 0.9623017053402451, + "learning_rate": 3.0478309232480535e-06, + "loss": 0.2563, + "step": 274 + }, + { + "epoch": 0.09183503088996493, + "grad_norm": 0.9705519249362503, + "learning_rate": 3.0589543937708566e-06, + "loss": 0.2634, + "step": 275 + }, + { + "epoch": 0.09216897645683754, + "grad_norm": 1.1162146922113336, + "learning_rate": 3.0700778642936598e-06, + "loss": 0.2814, + "step": 276 + }, + { + "epoch": 0.09250292202371013, + "grad_norm": 1.0488911817978013, + "learning_rate": 3.0812013348164633e-06, + "loss": 0.2736, + "step": 277 + }, + { + "epoch": 0.09283686759058274, + "grad_norm": 1.143098412646465, + "learning_rate": 3.0923248053392665e-06, + "loss": 0.2961, + "step": 278 + }, + { + "epoch": 0.09317081315745533, + "grad_norm": 1.0543653338829788, + "learning_rate": 3.103448275862069e-06, + "loss": 0.2921, + "step": 279 + }, + { + "epoch": 0.09350475872432794, + "grad_norm": 0.9983554927496489, + "learning_rate": 3.1145717463848723e-06, + "loss": 0.2694, + "step": 280 + }, + { + "epoch": 0.09383870429120053, + "grad_norm": 1.0555480006962337, + "learning_rate": 3.1256952169076754e-06, + "loss": 0.2583, + "step": 281 + }, + { + "epoch": 0.09417264985807314, + "grad_norm": 0.986411867922012, + "learning_rate": 3.1368186874304786e-06, + "loss": 0.2814, + "step": 282 + }, + { + "epoch": 0.09450659542494573, + "grad_norm": 1.0115618448923618, + "learning_rate": 3.1479421579532817e-06, + "loss": 0.2602, + "step": 283 + }, + { + "epoch": 0.09484054099181834, + "grad_norm": 1.074066553143631, + "learning_rate": 3.1590656284760844e-06, + "loss": 0.2577, + "step": 284 + }, + { + "epoch": 0.09517448655869093, + "grad_norm": 0.978641957387922, + "learning_rate": 3.170189098998888e-06, + "loss": 0.278, + "step": 285 + }, + { + "epoch": 0.09550843212556354, + "grad_norm": 0.9937876447676249, + "learning_rate": 3.181312569521691e-06, + "loss": 0.2532, + "step": 286 + }, + { + "epoch": 0.09584237769243613, + "grad_norm": 1.0298119478855567, + "learning_rate": 3.1924360400444942e-06, + "loss": 0.2669, + "step": 287 + }, + { + "epoch": 0.09617632325930874, + "grad_norm": 1.0380635876797846, + "learning_rate": 3.2035595105672973e-06, + "loss": 0.2849, + "step": 288 + }, + { + "epoch": 0.09651026882618133, + "grad_norm": 0.8705028041214296, + "learning_rate": 3.2146829810901005e-06, + "loss": 0.2631, + "step": 289 + }, + { + "epoch": 0.09684421439305393, + "grad_norm": 0.9554719138367959, + "learning_rate": 3.225806451612903e-06, + "loss": 0.2699, + "step": 290 + }, + { + "epoch": 0.09717815995992653, + "grad_norm": 1.1070871658729817, + "learning_rate": 3.2369299221357063e-06, + "loss": 0.2699, + "step": 291 + }, + { + "epoch": 0.09751210552679913, + "grad_norm": 1.1569397616168657, + "learning_rate": 3.2480533926585095e-06, + "loss": 0.2901, + "step": 292 + }, + { + "epoch": 0.09784605109367173, + "grad_norm": 0.8906942173665268, + "learning_rate": 3.259176863181313e-06, + "loss": 0.2491, + "step": 293 + }, + { + "epoch": 0.09817999666054433, + "grad_norm": 0.9616142547096681, + "learning_rate": 3.270300333704116e-06, + "loss": 0.2479, + "step": 294 + }, + { + "epoch": 0.09851394222741693, + "grad_norm": 0.9424174400332765, + "learning_rate": 3.2814238042269193e-06, + "loss": 0.2555, + "step": 295 + }, + { + "epoch": 0.09884788779428953, + "grad_norm": 0.969602671488551, + "learning_rate": 3.2925472747497224e-06, + "loss": 0.2522, + "step": 296 + }, + { + "epoch": 0.09918183336116213, + "grad_norm": 1.0778054535009527, + "learning_rate": 3.303670745272525e-06, + "loss": 0.2916, + "step": 297 + }, + { + "epoch": 0.09951577892803473, + "grad_norm": 0.950681407817474, + "learning_rate": 3.3147942157953282e-06, + "loss": 0.2624, + "step": 298 + }, + { + "epoch": 0.09984972449490732, + "grad_norm": 1.0136126498415492, + "learning_rate": 3.3259176863181314e-06, + "loss": 0.2933, + "step": 299 + }, + { + "epoch": 0.10018367006177993, + "grad_norm": 1.1157879759767526, + "learning_rate": 3.3370411568409345e-06, + "loss": 0.2752, + "step": 300 + }, + { + "epoch": 0.10051761562865252, + "grad_norm": 0.9659786911496452, + "learning_rate": 3.3481646273637376e-06, + "loss": 0.2637, + "step": 301 + }, + { + "epoch": 0.10085156119552513, + "grad_norm": 1.0334037162956897, + "learning_rate": 3.359288097886541e-06, + "loss": 0.2826, + "step": 302 + }, + { + "epoch": 0.10118550676239772, + "grad_norm": 0.8974929775545794, + "learning_rate": 3.3704115684093443e-06, + "loss": 0.2592, + "step": 303 + }, + { + "epoch": 0.10151945232927033, + "grad_norm": 0.9908654321167929, + "learning_rate": 3.381535038932147e-06, + "loss": 0.2514, + "step": 304 + }, + { + "epoch": 0.10185339789614292, + "grad_norm": 1.013861682980457, + "learning_rate": 3.39265850945495e-06, + "loss": 0.2659, + "step": 305 + }, + { + "epoch": 0.10218734346301553, + "grad_norm": 1.0310009709599692, + "learning_rate": 3.4037819799777533e-06, + "loss": 0.2936, + "step": 306 + }, + { + "epoch": 0.10252128902988812, + "grad_norm": 0.8989162049291942, + "learning_rate": 3.4149054505005564e-06, + "loss": 0.2524, + "step": 307 + }, + { + "epoch": 0.10285523459676073, + "grad_norm": 1.0325604088157159, + "learning_rate": 3.4260289210233596e-06, + "loss": 0.2725, + "step": 308 + }, + { + "epoch": 0.10318918016363333, + "grad_norm": 0.9980252685246547, + "learning_rate": 3.4371523915461623e-06, + "loss": 0.2663, + "step": 309 + }, + { + "epoch": 0.10352312573050593, + "grad_norm": 0.9993480947882514, + "learning_rate": 3.448275862068966e-06, + "loss": 0.2619, + "step": 310 + }, + { + "epoch": 0.10385707129737853, + "grad_norm": 1.2466860459144706, + "learning_rate": 3.459399332591769e-06, + "loss": 0.2864, + "step": 311 + }, + { + "epoch": 0.10419101686425113, + "grad_norm": 1.2924240507601645, + "learning_rate": 3.470522803114572e-06, + "loss": 0.2759, + "step": 312 + }, + { + "epoch": 0.10452496243112373, + "grad_norm": 1.117231355808104, + "learning_rate": 3.4816462736373752e-06, + "loss": 0.2493, + "step": 313 + }, + { + "epoch": 0.10485890799799633, + "grad_norm": 1.0013319983489994, + "learning_rate": 3.4927697441601784e-06, + "loss": 0.2694, + "step": 314 + }, + { + "epoch": 0.10519285356486893, + "grad_norm": 0.8939567136330437, + "learning_rate": 3.503893214682981e-06, + "loss": 0.2555, + "step": 315 + }, + { + "epoch": 0.10552679913174152, + "grad_norm": 1.239242556867951, + "learning_rate": 3.515016685205784e-06, + "loss": 0.2728, + "step": 316 + }, + { + "epoch": 0.10586074469861413, + "grad_norm": 1.0565860100298228, + "learning_rate": 3.5261401557285873e-06, + "loss": 0.2663, + "step": 317 + }, + { + "epoch": 0.10619469026548672, + "grad_norm": 0.9597333088021871, + "learning_rate": 3.537263626251391e-06, + "loss": 0.2729, + "step": 318 + }, + { + "epoch": 0.10652863583235933, + "grad_norm": 0.8695321802277968, + "learning_rate": 3.548387096774194e-06, + "loss": 0.2586, + "step": 319 + }, + { + "epoch": 0.10686258139923192, + "grad_norm": 1.035738979892239, + "learning_rate": 3.559510567296997e-06, + "loss": 0.2671, + "step": 320 + }, + { + "epoch": 0.10719652696610453, + "grad_norm": 0.9575178575212548, + "learning_rate": 3.5706340378198003e-06, + "loss": 0.2828, + "step": 321 + }, + { + "epoch": 0.10753047253297712, + "grad_norm": 1.1300397562818, + "learning_rate": 3.581757508342603e-06, + "loss": 0.2949, + "step": 322 + }, + { + "epoch": 0.10786441809984973, + "grad_norm": 1.1537677216616982, + "learning_rate": 3.592880978865406e-06, + "loss": 0.2656, + "step": 323 + }, + { + "epoch": 0.10819836366672232, + "grad_norm": 0.9627144838033714, + "learning_rate": 3.6040044493882093e-06, + "loss": 0.2785, + "step": 324 + }, + { + "epoch": 0.10853230923359493, + "grad_norm": 0.9449257156505781, + "learning_rate": 3.6151279199110124e-06, + "loss": 0.2676, + "step": 325 + }, + { + "epoch": 0.10886625480046752, + "grad_norm": 1.0206213863568905, + "learning_rate": 3.626251390433816e-06, + "loss": 0.2643, + "step": 326 + }, + { + "epoch": 0.10920020036734013, + "grad_norm": 0.9815487853303332, + "learning_rate": 3.637374860956619e-06, + "loss": 0.2789, + "step": 327 + }, + { + "epoch": 0.10953414593421272, + "grad_norm": 1.010306702915483, + "learning_rate": 3.648498331479422e-06, + "loss": 0.2683, + "step": 328 + }, + { + "epoch": 0.10986809150108533, + "grad_norm": 1.0690064824290435, + "learning_rate": 3.659621802002225e-06, + "loss": 0.2819, + "step": 329 + }, + { + "epoch": 0.11020203706795792, + "grad_norm": 1.0546653253133171, + "learning_rate": 3.670745272525028e-06, + "loss": 0.2869, + "step": 330 + }, + { + "epoch": 0.11053598263483053, + "grad_norm": 1.0596495202097613, + "learning_rate": 3.681868743047831e-06, + "loss": 0.2692, + "step": 331 + }, + { + "epoch": 0.11086992820170312, + "grad_norm": 0.9257332414938142, + "learning_rate": 3.6929922135706343e-06, + "loss": 0.2468, + "step": 332 + }, + { + "epoch": 0.11120387376857573, + "grad_norm": 0.9835419475940986, + "learning_rate": 3.7041156840934374e-06, + "loss": 0.2793, + "step": 333 + }, + { + "epoch": 0.11153781933544832, + "grad_norm": 0.9771967275391085, + "learning_rate": 3.71523915461624e-06, + "loss": 0.2845, + "step": 334 + }, + { + "epoch": 0.11187176490232092, + "grad_norm": 0.9602977118557694, + "learning_rate": 3.7263626251390437e-06, + "loss": 0.2742, + "step": 335 + }, + { + "epoch": 0.11220571046919352, + "grad_norm": 1.0029205944461674, + "learning_rate": 3.737486095661847e-06, + "loss": 0.2687, + "step": 336 + }, + { + "epoch": 0.11253965603606612, + "grad_norm": 1.0201328545542836, + "learning_rate": 3.74860956618465e-06, + "loss": 0.2708, + "step": 337 + }, + { + "epoch": 0.11287360160293872, + "grad_norm": 0.9103855657952339, + "learning_rate": 3.759733036707453e-06, + "loss": 0.2792, + "step": 338 + }, + { + "epoch": 0.11320754716981132, + "grad_norm": 0.9601017467552823, + "learning_rate": 3.7708565072302562e-06, + "loss": 0.2678, + "step": 339 + }, + { + "epoch": 0.11354149273668392, + "grad_norm": 1.0089053411102482, + "learning_rate": 3.781979977753059e-06, + "loss": 0.2743, + "step": 340 + }, + { + "epoch": 0.11387543830355652, + "grad_norm": 0.9504969314518689, + "learning_rate": 3.793103448275862e-06, + "loss": 0.2725, + "step": 341 + }, + { + "epoch": 0.11420938387042912, + "grad_norm": 0.9049400864683504, + "learning_rate": 3.804226918798665e-06, + "loss": 0.2545, + "step": 342 + }, + { + "epoch": 0.11454332943730172, + "grad_norm": 0.9801080200173922, + "learning_rate": 3.815350389321469e-06, + "loss": 0.2602, + "step": 343 + }, + { + "epoch": 0.11487727500417431, + "grad_norm": 0.9494051183285684, + "learning_rate": 3.8264738598442715e-06, + "loss": 0.2718, + "step": 344 + }, + { + "epoch": 0.11521122057104692, + "grad_norm": 0.9624224416135067, + "learning_rate": 3.837597330367075e-06, + "loss": 0.273, + "step": 345 + }, + { + "epoch": 0.11554516613791951, + "grad_norm": 1.2213226228441383, + "learning_rate": 3.848720800889878e-06, + "loss": 0.2825, + "step": 346 + }, + { + "epoch": 0.11587911170479212, + "grad_norm": 0.9776624862349191, + "learning_rate": 3.859844271412681e-06, + "loss": 0.277, + "step": 347 + }, + { + "epoch": 0.11621305727166471, + "grad_norm": 0.9947846421904093, + "learning_rate": 3.870967741935484e-06, + "loss": 0.2629, + "step": 348 + }, + { + "epoch": 0.11654700283853732, + "grad_norm": 0.9240560147186092, + "learning_rate": 3.8820912124582876e-06, + "loss": 0.2546, + "step": 349 + }, + { + "epoch": 0.11688094840540991, + "grad_norm": 0.963768770591079, + "learning_rate": 3.89321468298109e-06, + "loss": 0.2765, + "step": 350 + }, + { + "epoch": 0.11721489397228252, + "grad_norm": 0.9589949517884051, + "learning_rate": 3.904338153503894e-06, + "loss": 0.2775, + "step": 351 + }, + { + "epoch": 0.11754883953915511, + "grad_norm": 0.9782433798778546, + "learning_rate": 3.9154616240266965e-06, + "loss": 0.2799, + "step": 352 + }, + { + "epoch": 0.11788278510602772, + "grad_norm": 0.9858329287017443, + "learning_rate": 3.9265850945495e-06, + "loss": 0.2711, + "step": 353 + }, + { + "epoch": 0.11821673067290031, + "grad_norm": 0.9039100597255357, + "learning_rate": 3.937708565072303e-06, + "loss": 0.2581, + "step": 354 + }, + { + "epoch": 0.11855067623977292, + "grad_norm": 1.1034122762806373, + "learning_rate": 3.948832035595106e-06, + "loss": 0.2783, + "step": 355 + }, + { + "epoch": 0.11888462180664551, + "grad_norm": 0.960339590267913, + "learning_rate": 3.959955506117909e-06, + "loss": 0.2892, + "step": 356 + }, + { + "epoch": 0.11921856737351812, + "grad_norm": 1.0002994472569884, + "learning_rate": 3.971078976640712e-06, + "loss": 0.2866, + "step": 357 + }, + { + "epoch": 0.11955251294039071, + "grad_norm": 1.0057324471749156, + "learning_rate": 3.982202447163515e-06, + "loss": 0.2577, + "step": 358 + }, + { + "epoch": 0.11988645850726332, + "grad_norm": 0.9933056307505863, + "learning_rate": 3.993325917686319e-06, + "loss": 0.2671, + "step": 359 + }, + { + "epoch": 0.12022040407413592, + "grad_norm": 1.0118000022039888, + "learning_rate": 4.004449388209122e-06, + "loss": 0.2782, + "step": 360 + }, + { + "epoch": 0.12055434964100852, + "grad_norm": 0.9104306956415678, + "learning_rate": 4.015572858731925e-06, + "loss": 0.2719, + "step": 361 + }, + { + "epoch": 0.12088829520788112, + "grad_norm": 1.0409646407223883, + "learning_rate": 4.026696329254728e-06, + "loss": 0.2824, + "step": 362 + }, + { + "epoch": 0.12122224077475371, + "grad_norm": 0.9353332418061575, + "learning_rate": 4.0378197997775306e-06, + "loss": 0.2779, + "step": 363 + }, + { + "epoch": 0.12155618634162632, + "grad_norm": 0.9117928694164866, + "learning_rate": 4.048943270300334e-06, + "loss": 0.2645, + "step": 364 + }, + { + "epoch": 0.12189013190849891, + "grad_norm": 0.9516320491269368, + "learning_rate": 4.060066740823137e-06, + "loss": 0.2763, + "step": 365 + }, + { + "epoch": 0.12222407747537152, + "grad_norm": 0.8980496133272654, + "learning_rate": 4.07119021134594e-06, + "loss": 0.2479, + "step": 366 + }, + { + "epoch": 0.12255802304224411, + "grad_norm": 0.8722081882192233, + "learning_rate": 4.082313681868743e-06, + "loss": 0.254, + "step": 367 + }, + { + "epoch": 0.12289196860911672, + "grad_norm": 0.9489762799911274, + "learning_rate": 4.093437152391547e-06, + "loss": 0.2794, + "step": 368 + }, + { + "epoch": 0.12322591417598931, + "grad_norm": 0.8260848475511359, + "learning_rate": 4.104560622914349e-06, + "loss": 0.2581, + "step": 369 + }, + { + "epoch": 0.12355985974286192, + "grad_norm": 1.012870852293905, + "learning_rate": 4.115684093437153e-06, + "loss": 0.2795, + "step": 370 + }, + { + "epoch": 0.12389380530973451, + "grad_norm": 0.9078068672567016, + "learning_rate": 4.126807563959956e-06, + "loss": 0.2724, + "step": 371 + }, + { + "epoch": 0.12422775087660712, + "grad_norm": 0.9367356712295183, + "learning_rate": 4.137931034482759e-06, + "loss": 0.2772, + "step": 372 + }, + { + "epoch": 0.12456169644347971, + "grad_norm": 0.9769142088201448, + "learning_rate": 4.149054505005562e-06, + "loss": 0.273, + "step": 373 + }, + { + "epoch": 0.12489564201035232, + "grad_norm": 0.8714374593023121, + "learning_rate": 4.160177975528365e-06, + "loss": 0.2523, + "step": 374 + }, + { + "epoch": 0.1252295875772249, + "grad_norm": 0.9713759181008551, + "learning_rate": 4.171301446051168e-06, + "loss": 0.2627, + "step": 375 + }, + { + "epoch": 0.12556353314409752, + "grad_norm": 1.0024543615178043, + "learning_rate": 4.182424916573972e-06, + "loss": 0.2736, + "step": 376 + }, + { + "epoch": 0.12589747871097012, + "grad_norm": 0.8841635902459595, + "learning_rate": 4.193548387096774e-06, + "loss": 0.266, + "step": 377 + }, + { + "epoch": 0.1262314242778427, + "grad_norm": 0.9422878272730257, + "learning_rate": 4.204671857619578e-06, + "loss": 0.2763, + "step": 378 + }, + { + "epoch": 0.1265653698447153, + "grad_norm": 0.9737880805971011, + "learning_rate": 4.215795328142381e-06, + "loss": 0.2604, + "step": 379 + }, + { + "epoch": 0.12689931541158792, + "grad_norm": 0.9311503938608767, + "learning_rate": 4.226918798665184e-06, + "loss": 0.2687, + "step": 380 + }, + { + "epoch": 0.12723326097846052, + "grad_norm": 0.9099240809857687, + "learning_rate": 4.238042269187987e-06, + "loss": 0.2615, + "step": 381 + }, + { + "epoch": 0.1275672065453331, + "grad_norm": 0.8644535162182144, + "learning_rate": 4.24916573971079e-06, + "loss": 0.2629, + "step": 382 + }, + { + "epoch": 0.1279011521122057, + "grad_norm": 1.08354660489156, + "learning_rate": 4.260289210233593e-06, + "loss": 0.2757, + "step": 383 + }, + { + "epoch": 0.1282350976790783, + "grad_norm": 0.9059885118793621, + "learning_rate": 4.271412680756397e-06, + "loss": 0.2741, + "step": 384 + }, + { + "epoch": 0.12856904324595092, + "grad_norm": 0.8214511989438817, + "learning_rate": 4.2825361512791995e-06, + "loss": 0.2588, + "step": 385 + }, + { + "epoch": 0.1289029888128235, + "grad_norm": 0.8920712731302586, + "learning_rate": 4.293659621802003e-06, + "loss": 0.2628, + "step": 386 + }, + { + "epoch": 0.1292369343796961, + "grad_norm": 1.02140475986345, + "learning_rate": 4.304783092324806e-06, + "loss": 0.2769, + "step": 387 + }, + { + "epoch": 0.1295708799465687, + "grad_norm": 1.1041718598040544, + "learning_rate": 4.3159065628476084e-06, + "loss": 0.2829, + "step": 388 + }, + { + "epoch": 0.12990482551344132, + "grad_norm": 0.9103286130796249, + "learning_rate": 4.327030033370412e-06, + "loss": 0.2616, + "step": 389 + }, + { + "epoch": 0.1302387710803139, + "grad_norm": 0.8952232727330045, + "learning_rate": 4.338153503893215e-06, + "loss": 0.2484, + "step": 390 + }, + { + "epoch": 0.1305727166471865, + "grad_norm": 0.7877032707324497, + "learning_rate": 4.349276974416018e-06, + "loss": 0.2615, + "step": 391 + }, + { + "epoch": 0.1309066622140591, + "grad_norm": 0.881238803534704, + "learning_rate": 4.360400444938822e-06, + "loss": 0.2606, + "step": 392 + }, + { + "epoch": 0.13124060778093172, + "grad_norm": 0.8156813680994354, + "learning_rate": 4.3715239154616245e-06, + "loss": 0.2483, + "step": 393 + }, + { + "epoch": 0.1315745533478043, + "grad_norm": 0.8460744085644406, + "learning_rate": 4.382647385984427e-06, + "loss": 0.266, + "step": 394 + }, + { + "epoch": 0.1319084989146769, + "grad_norm": 0.8909110880065707, + "learning_rate": 4.393770856507231e-06, + "loss": 0.2507, + "step": 395 + }, + { + "epoch": 0.1322424444815495, + "grad_norm": 1.0610350746398265, + "learning_rate": 4.4048943270300335e-06, + "loss": 0.2962, + "step": 396 + }, + { + "epoch": 0.13257639004842212, + "grad_norm": 1.2159344229127298, + "learning_rate": 4.416017797552837e-06, + "loss": 0.2628, + "step": 397 + }, + { + "epoch": 0.1329103356152947, + "grad_norm": 0.9343938441218402, + "learning_rate": 4.42714126807564e-06, + "loss": 0.2809, + "step": 398 + }, + { + "epoch": 0.1332442811821673, + "grad_norm": 0.9460286809085141, + "learning_rate": 4.4382647385984425e-06, + "loss": 0.2672, + "step": 399 + }, + { + "epoch": 0.1335782267490399, + "grad_norm": 0.9028504129341358, + "learning_rate": 4.449388209121246e-06, + "loss": 0.253, + "step": 400 + }, + { + "epoch": 0.13391217231591251, + "grad_norm": 0.777636732453647, + "learning_rate": 4.4605116796440496e-06, + "loss": 0.252, + "step": 401 + }, + { + "epoch": 0.1342461178827851, + "grad_norm": 0.8787403586719423, + "learning_rate": 4.471635150166852e-06, + "loss": 0.2681, + "step": 402 + }, + { + "epoch": 0.1345800634496577, + "grad_norm": 0.8623473953489393, + "learning_rate": 4.482758620689656e-06, + "loss": 0.2615, + "step": 403 + }, + { + "epoch": 0.1349140090165303, + "grad_norm": 0.8590329595872587, + "learning_rate": 4.4938820912124585e-06, + "loss": 0.2693, + "step": 404 + }, + { + "epoch": 0.1352479545834029, + "grad_norm": 1.0942605325821617, + "learning_rate": 4.505005561735262e-06, + "loss": 0.276, + "step": 405 + }, + { + "epoch": 0.1355819001502755, + "grad_norm": 0.8384871494712324, + "learning_rate": 4.516129032258065e-06, + "loss": 0.2539, + "step": 406 + }, + { + "epoch": 0.1359158457171481, + "grad_norm": 0.9713974468286866, + "learning_rate": 4.5272525027808675e-06, + "loss": 0.248, + "step": 407 + }, + { + "epoch": 0.1362497912840207, + "grad_norm": 0.8912405350992786, + "learning_rate": 4.538375973303671e-06, + "loss": 0.2606, + "step": 408 + }, + { + "epoch": 0.1365837368508933, + "grad_norm": 0.8504112799682982, + "learning_rate": 4.549499443826475e-06, + "loss": 0.2478, + "step": 409 + }, + { + "epoch": 0.13691768241776592, + "grad_norm": 0.8845213824948364, + "learning_rate": 4.560622914349277e-06, + "loss": 0.2549, + "step": 410 + }, + { + "epoch": 0.1372516279846385, + "grad_norm": 0.9170303075568879, + "learning_rate": 4.571746384872081e-06, + "loss": 0.2824, + "step": 411 + }, + { + "epoch": 0.1375855735515111, + "grad_norm": 0.9418395539373073, + "learning_rate": 4.582869855394884e-06, + "loss": 0.2512, + "step": 412 + }, + { + "epoch": 0.1379195191183837, + "grad_norm": 0.961468515239987, + "learning_rate": 4.593993325917686e-06, + "loss": 0.2798, + "step": 413 + }, + { + "epoch": 0.13825346468525632, + "grad_norm": 0.884383561019613, + "learning_rate": 4.60511679644049e-06, + "loss": 0.2693, + "step": 414 + }, + { + "epoch": 0.1385874102521289, + "grad_norm": 0.8938825429085708, + "learning_rate": 4.6162402669632926e-06, + "loss": 0.2686, + "step": 415 + }, + { + "epoch": 0.1389213558190015, + "grad_norm": 0.8354946158818547, + "learning_rate": 4.627363737486096e-06, + "loss": 0.245, + "step": 416 + }, + { + "epoch": 0.1392553013858741, + "grad_norm": 0.9373626614204349, + "learning_rate": 4.6384872080089e-06, + "loss": 0.2802, + "step": 417 + }, + { + "epoch": 0.13958924695274672, + "grad_norm": 0.7859209899855684, + "learning_rate": 4.649610678531702e-06, + "loss": 0.2445, + "step": 418 + }, + { + "epoch": 0.1399231925196193, + "grad_norm": 0.9807671068134438, + "learning_rate": 4.660734149054505e-06, + "loss": 0.2644, + "step": 419 + }, + { + "epoch": 0.1402571380864919, + "grad_norm": 0.851020616239925, + "learning_rate": 4.671857619577309e-06, + "loss": 0.2551, + "step": 420 + }, + { + "epoch": 0.1405910836533645, + "grad_norm": 0.9306635039098415, + "learning_rate": 4.682981090100111e-06, + "loss": 0.2559, + "step": 421 + }, + { + "epoch": 0.1409250292202371, + "grad_norm": 0.9028478012185194, + "learning_rate": 4.694104560622915e-06, + "loss": 0.2797, + "step": 422 + }, + { + "epoch": 0.1412589747871097, + "grad_norm": 0.8282408886330277, + "learning_rate": 4.705228031145718e-06, + "loss": 0.2634, + "step": 423 + }, + { + "epoch": 0.1415929203539823, + "grad_norm": 0.9583571910070512, + "learning_rate": 4.71635150166852e-06, + "loss": 0.2737, + "step": 424 + }, + { + "epoch": 0.1419268659208549, + "grad_norm": 0.8598845314007896, + "learning_rate": 4.727474972191325e-06, + "loss": 0.2734, + "step": 425 + }, + { + "epoch": 0.1422608114877275, + "grad_norm": 0.9823961957996282, + "learning_rate": 4.7385984427141274e-06, + "loss": 0.2756, + "step": 426 + }, + { + "epoch": 0.1425947570546001, + "grad_norm": 0.8402666268767217, + "learning_rate": 4.74972191323693e-06, + "loss": 0.2644, + "step": 427 + }, + { + "epoch": 0.1429287026214727, + "grad_norm": 0.9504374640973223, + "learning_rate": 4.760845383759734e-06, + "loss": 0.2802, + "step": 428 + }, + { + "epoch": 0.1432626481883453, + "grad_norm": 0.860942841549608, + "learning_rate": 4.771968854282536e-06, + "loss": 0.2642, + "step": 429 + }, + { + "epoch": 0.1435965937552179, + "grad_norm": 0.7976260693507284, + "learning_rate": 4.78309232480534e-06, + "loss": 0.2396, + "step": 430 + }, + { + "epoch": 0.1439305393220905, + "grad_norm": 0.9444299816616959, + "learning_rate": 4.794215795328143e-06, + "loss": 0.2647, + "step": 431 + }, + { + "epoch": 0.1442644848889631, + "grad_norm": 0.9810683931758797, + "learning_rate": 4.805339265850945e-06, + "loss": 0.2755, + "step": 432 + }, + { + "epoch": 0.1445984304558357, + "grad_norm": 0.8774017600524443, + "learning_rate": 4.816462736373749e-06, + "loss": 0.2516, + "step": 433 + }, + { + "epoch": 0.1449323760227083, + "grad_norm": 0.9482573960208934, + "learning_rate": 4.8275862068965525e-06, + "loss": 0.2862, + "step": 434 + }, + { + "epoch": 0.1452663215895809, + "grad_norm": 0.8175895495379112, + "learning_rate": 4.838709677419355e-06, + "loss": 0.2573, + "step": 435 + }, + { + "epoch": 0.1456002671564535, + "grad_norm": 0.9000739545892836, + "learning_rate": 4.849833147942159e-06, + "loss": 0.2596, + "step": 436 + }, + { + "epoch": 0.1459342127233261, + "grad_norm": 0.9000554099286614, + "learning_rate": 4.8609566184649615e-06, + "loss": 0.2525, + "step": 437 + }, + { + "epoch": 0.1462681582901987, + "grad_norm": 1.035965080184084, + "learning_rate": 4.872080088987764e-06, + "loss": 0.2746, + "step": 438 + }, + { + "epoch": 0.1466021038570713, + "grad_norm": 0.9495692424587069, + "learning_rate": 4.883203559510568e-06, + "loss": 0.2765, + "step": 439 + }, + { + "epoch": 0.1469360494239439, + "grad_norm": 0.8468127334478587, + "learning_rate": 4.8943270300333704e-06, + "loss": 0.2499, + "step": 440 + }, + { + "epoch": 0.1472699949908165, + "grad_norm": 0.9430177643905899, + "learning_rate": 4.905450500556174e-06, + "loss": 0.2596, + "step": 441 + }, + { + "epoch": 0.1476039405576891, + "grad_norm": 0.8564111776747324, + "learning_rate": 4.9165739710789776e-06, + "loss": 0.2742, + "step": 442 + }, + { + "epoch": 0.14793788612456168, + "grad_norm": 0.9034833859232495, + "learning_rate": 4.92769744160178e-06, + "loss": 0.2624, + "step": 443 + }, + { + "epoch": 0.1482718316914343, + "grad_norm": 1.0085778060835962, + "learning_rate": 4.938820912124583e-06, + "loss": 0.2788, + "step": 444 + }, + { + "epoch": 0.1486057772583069, + "grad_norm": 0.8486548720543253, + "learning_rate": 4.9499443826473865e-06, + "loss": 0.2499, + "step": 445 + }, + { + "epoch": 0.1489397228251795, + "grad_norm": 0.8647171380949372, + "learning_rate": 4.961067853170189e-06, + "loss": 0.2467, + "step": 446 + }, + { + "epoch": 0.14927366839205208, + "grad_norm": 0.9458194360965803, + "learning_rate": 4.972191323692993e-06, + "loss": 0.26, + "step": 447 + }, + { + "epoch": 0.1496076139589247, + "grad_norm": 0.98845534464712, + "learning_rate": 4.9833147942157955e-06, + "loss": 0.277, + "step": 448 + }, + { + "epoch": 0.1499415595257973, + "grad_norm": 1.0038333784339768, + "learning_rate": 4.994438264738598e-06, + "loss": 0.2607, + "step": 449 + }, + { + "epoch": 0.1502755050926699, + "grad_norm": 0.9882288469242262, + "learning_rate": 5.005561735261402e-06, + "loss": 0.2736, + "step": 450 + }, + { + "epoch": 0.15060945065954248, + "grad_norm": 0.9332132918013903, + "learning_rate": 5.016685205784205e-06, + "loss": 0.2746, + "step": 451 + }, + { + "epoch": 0.1509433962264151, + "grad_norm": 0.8514088935668446, + "learning_rate": 5.027808676307008e-06, + "loss": 0.2667, + "step": 452 + }, + { + "epoch": 0.1512773417932877, + "grad_norm": 0.8462811908982392, + "learning_rate": 5.038932146829812e-06, + "loss": 0.2675, + "step": 453 + }, + { + "epoch": 0.1516112873601603, + "grad_norm": 0.8691114025176568, + "learning_rate": 5.050055617352615e-06, + "loss": 0.289, + "step": 454 + }, + { + "epoch": 0.15194523292703288, + "grad_norm": 0.969925154555584, + "learning_rate": 5.061179087875418e-06, + "loss": 0.2658, + "step": 455 + }, + { + "epoch": 0.1522791784939055, + "grad_norm": 0.7612750857319712, + "learning_rate": 5.072302558398221e-06, + "loss": 0.246, + "step": 456 + }, + { + "epoch": 0.1526131240607781, + "grad_norm": 0.9012049865349984, + "learning_rate": 5.083426028921023e-06, + "loss": 0.2791, + "step": 457 + }, + { + "epoch": 0.1529470696276507, + "grad_norm": 0.8335869019920015, + "learning_rate": 5.094549499443827e-06, + "loss": 0.2542, + "step": 458 + }, + { + "epoch": 0.15328101519452328, + "grad_norm": 0.8982538630222419, + "learning_rate": 5.1056729699666295e-06, + "loss": 0.2693, + "step": 459 + }, + { + "epoch": 0.15361496076139589, + "grad_norm": 0.8430594873139436, + "learning_rate": 5.116796440489433e-06, + "loss": 0.2533, + "step": 460 + }, + { + "epoch": 0.1539489063282685, + "grad_norm": 0.8653242092400866, + "learning_rate": 5.127919911012236e-06, + "loss": 0.264, + "step": 461 + }, + { + "epoch": 0.1542828518951411, + "grad_norm": 0.8994436945219121, + "learning_rate": 5.139043381535039e-06, + "loss": 0.2615, + "step": 462 + }, + { + "epoch": 0.1546167974620137, + "grad_norm": 0.8101799725720696, + "learning_rate": 5.150166852057843e-06, + "loss": 0.2532, + "step": 463 + }, + { + "epoch": 0.15495074302888628, + "grad_norm": 0.8855034068759076, + "learning_rate": 5.161290322580646e-06, + "loss": 0.2732, + "step": 464 + }, + { + "epoch": 0.1552846885957589, + "grad_norm": 0.8043484789939199, + "learning_rate": 5.172413793103449e-06, + "loss": 0.2379, + "step": 465 + }, + { + "epoch": 0.1556186341626315, + "grad_norm": 0.8962988729164553, + "learning_rate": 5.183537263626252e-06, + "loss": 0.2799, + "step": 466 + }, + { + "epoch": 0.1559525797295041, + "grad_norm": 0.8232384777691226, + "learning_rate": 5.1946607341490554e-06, + "loss": 0.2586, + "step": 467 + }, + { + "epoch": 0.15628652529637668, + "grad_norm": 0.8472967462900588, + "learning_rate": 5.205784204671857e-06, + "loss": 0.2591, + "step": 468 + }, + { + "epoch": 0.1566204708632493, + "grad_norm": 0.7977382518203232, + "learning_rate": 5.216907675194661e-06, + "loss": 0.2517, + "step": 469 + }, + { + "epoch": 0.1569544164301219, + "grad_norm": 0.9780305545037385, + "learning_rate": 5.2280311457174636e-06, + "loss": 0.2772, + "step": 470 + }, + { + "epoch": 0.1572883619969945, + "grad_norm": 0.7622852005224084, + "learning_rate": 5.239154616240267e-06, + "loss": 0.2538, + "step": 471 + }, + { + "epoch": 0.15762230756386708, + "grad_norm": 0.8593964452116354, + "learning_rate": 5.250278086763071e-06, + "loss": 0.2568, + "step": 472 + }, + { + "epoch": 0.1579562531307397, + "grad_norm": 0.8100335857942087, + "learning_rate": 5.261401557285873e-06, + "loss": 0.2501, + "step": 473 + }, + { + "epoch": 0.1582901986976123, + "grad_norm": 0.8014373233797282, + "learning_rate": 5.272525027808677e-06, + "loss": 0.2535, + "step": 474 + }, + { + "epoch": 0.1586241442644849, + "grad_norm": 0.7975499651795742, + "learning_rate": 5.28364849833148e-06, + "loss": 0.2504, + "step": 475 + }, + { + "epoch": 0.15895808983135748, + "grad_norm": 0.8394801448362739, + "learning_rate": 5.294771968854283e-06, + "loss": 0.2707, + "step": 476 + }, + { + "epoch": 0.1592920353982301, + "grad_norm": 0.9828129623333244, + "learning_rate": 5.305895439377086e-06, + "loss": 0.2775, + "step": 477 + }, + { + "epoch": 0.1596259809651027, + "grad_norm": 0.732105226843958, + "learning_rate": 5.3170189098998895e-06, + "loss": 0.2289, + "step": 478 + }, + { + "epoch": 0.1599599265319753, + "grad_norm": 0.8965879378663164, + "learning_rate": 5.328142380422693e-06, + "loss": 0.2401, + "step": 479 + }, + { + "epoch": 0.16029387209884788, + "grad_norm": 0.9057075540665339, + "learning_rate": 5.339265850945496e-06, + "loss": 0.2681, + "step": 480 + }, + { + "epoch": 0.16062781766572048, + "grad_norm": 0.7990495639152133, + "learning_rate": 5.350389321468299e-06, + "loss": 0.2485, + "step": 481 + }, + { + "epoch": 0.1609617632325931, + "grad_norm": 1.015072564398485, + "learning_rate": 5.361512791991101e-06, + "loss": 0.2662, + "step": 482 + }, + { + "epoch": 0.1612957087994657, + "grad_norm": 0.8030254740021361, + "learning_rate": 5.372636262513905e-06, + "loss": 0.2585, + "step": 483 + }, + { + "epoch": 0.16162965436633828, + "grad_norm": 1.0859820471858048, + "learning_rate": 5.383759733036707e-06, + "loss": 0.2715, + "step": 484 + }, + { + "epoch": 0.16196359993321088, + "grad_norm": 0.8205452365480986, + "learning_rate": 5.394883203559511e-06, + "loss": 0.2596, + "step": 485 + }, + { + "epoch": 0.1622975455000835, + "grad_norm": 0.875783049907706, + "learning_rate": 5.406006674082314e-06, + "loss": 0.2605, + "step": 486 + }, + { + "epoch": 0.1626314910669561, + "grad_norm": 0.8802931215752893, + "learning_rate": 5.417130144605117e-06, + "loss": 0.2599, + "step": 487 + }, + { + "epoch": 0.16296543663382868, + "grad_norm": 0.8179528891483596, + "learning_rate": 5.428253615127921e-06, + "loss": 0.2527, + "step": 488 + }, + { + "epoch": 0.16329938220070128, + "grad_norm": 0.734992877721891, + "learning_rate": 5.4393770856507235e-06, + "loss": 0.241, + "step": 489 + }, + { + "epoch": 0.1636333277675739, + "grad_norm": 0.8158038849222073, + "learning_rate": 5.450500556173527e-06, + "loss": 0.2506, + "step": 490 + }, + { + "epoch": 0.1639672733344465, + "grad_norm": 0.835866048908736, + "learning_rate": 5.46162402669633e-06, + "loss": 0.2568, + "step": 491 + }, + { + "epoch": 0.16430121890131907, + "grad_norm": 0.895021641794215, + "learning_rate": 5.472747497219133e-06, + "loss": 0.2632, + "step": 492 + }, + { + "epoch": 0.16463516446819168, + "grad_norm": 0.8090105148370232, + "learning_rate": 5.483870967741935e-06, + "loss": 0.2473, + "step": 493 + }, + { + "epoch": 0.1649691100350643, + "grad_norm": 0.810000176014472, + "learning_rate": 5.494994438264739e-06, + "loss": 0.242, + "step": 494 + }, + { + "epoch": 0.1653030556019369, + "grad_norm": 0.8836235099488476, + "learning_rate": 5.506117908787543e-06, + "loss": 0.2455, + "step": 495 + }, + { + "epoch": 0.16563700116880947, + "grad_norm": 0.8254870164296962, + "learning_rate": 5.517241379310345e-06, + "loss": 0.2539, + "step": 496 + }, + { + "epoch": 0.16597094673568208, + "grad_norm": 0.9525772225779567, + "learning_rate": 5.5283648498331485e-06, + "loss": 0.261, + "step": 497 + }, + { + "epoch": 0.16630489230255469, + "grad_norm": 0.8377853082667718, + "learning_rate": 5.539488320355951e-06, + "loss": 0.253, + "step": 498 + }, + { + "epoch": 0.1666388378694273, + "grad_norm": 0.9616580429013377, + "learning_rate": 5.550611790878755e-06, + "loss": 0.2587, + "step": 499 + }, + { + "epoch": 0.16697278343629987, + "grad_norm": 0.7796060710868616, + "learning_rate": 5.5617352614015575e-06, + "loss": 0.2588, + "step": 500 + }, + { + "epoch": 0.16730672900317248, + "grad_norm": 0.9319120249495298, + "learning_rate": 5.572858731924361e-06, + "loss": 0.2451, + "step": 501 + }, + { + "epoch": 0.16764067457004508, + "grad_norm": 1.0596544782036998, + "learning_rate": 5.583982202447164e-06, + "loss": 0.2749, + "step": 502 + }, + { + "epoch": 0.1679746201369177, + "grad_norm": 0.8863515672067872, + "learning_rate": 5.595105672969967e-06, + "loss": 0.2511, + "step": 503 + }, + { + "epoch": 0.16830856570379027, + "grad_norm": 0.8225742978610687, + "learning_rate": 5.606229143492771e-06, + "loss": 0.2377, + "step": 504 + }, + { + "epoch": 0.16864251127066288, + "grad_norm": 0.7637996051027277, + "learning_rate": 5.617352614015574e-06, + "loss": 0.2335, + "step": 505 + }, + { + "epoch": 0.16897645683753548, + "grad_norm": 0.8249967207095058, + "learning_rate": 5.628476084538377e-06, + "loss": 0.2748, + "step": 506 + }, + { + "epoch": 0.1693104024044081, + "grad_norm": 0.8161862924539601, + "learning_rate": 5.639599555061179e-06, + "loss": 0.2649, + "step": 507 + }, + { + "epoch": 0.16964434797128067, + "grad_norm": 0.8749861083035826, + "learning_rate": 5.6507230255839826e-06, + "loss": 0.2641, + "step": 508 + }, + { + "epoch": 0.16997829353815327, + "grad_norm": 0.8934466811434186, + "learning_rate": 5.661846496106785e-06, + "loss": 0.2679, + "step": 509 + }, + { + "epoch": 0.17031223910502588, + "grad_norm": 0.8787600615518483, + "learning_rate": 5.672969966629589e-06, + "loss": 0.2735, + "step": 510 + }, + { + "epoch": 0.1706461846718985, + "grad_norm": 0.7797278565537913, + "learning_rate": 5.6840934371523915e-06, + "loss": 0.2652, + "step": 511 + }, + { + "epoch": 0.1709801302387711, + "grad_norm": 0.7628477421169882, + "learning_rate": 5.695216907675195e-06, + "loss": 0.2424, + "step": 512 + }, + { + "epoch": 0.17131407580564367, + "grad_norm": 0.840464455052028, + "learning_rate": 5.706340378197999e-06, + "loss": 0.2659, + "step": 513 + }, + { + "epoch": 0.17164802137251628, + "grad_norm": 0.985780992202247, + "learning_rate": 5.717463848720801e-06, + "loss": 0.2507, + "step": 514 + }, + { + "epoch": 0.1719819669393889, + "grad_norm": 1.4268260954113605, + "learning_rate": 5.728587319243605e-06, + "loss": 0.2718, + "step": 515 + }, + { + "epoch": 0.1723159125062615, + "grad_norm": 0.7437022173759871, + "learning_rate": 5.739710789766408e-06, + "loss": 0.2543, + "step": 516 + }, + { + "epoch": 0.17264985807313407, + "grad_norm": 0.7373193238546386, + "learning_rate": 5.750834260289211e-06, + "loss": 0.2451, + "step": 517 + }, + { + "epoch": 0.17298380364000668, + "grad_norm": 0.9049436389723264, + "learning_rate": 5.761957730812013e-06, + "loss": 0.2634, + "step": 518 + }, + { + "epoch": 0.17331774920687928, + "grad_norm": 0.8105545282139595, + "learning_rate": 5.773081201334817e-06, + "loss": 0.2588, + "step": 519 + }, + { + "epoch": 0.1736516947737519, + "grad_norm": 0.7844885492997241, + "learning_rate": 5.784204671857621e-06, + "loss": 0.2507, + "step": 520 + }, + { + "epoch": 0.17398564034062447, + "grad_norm": 1.5001443038134146, + "learning_rate": 5.795328142380423e-06, + "loss": 0.258, + "step": 521 + }, + { + "epoch": 0.17431958590749708, + "grad_norm": 0.9479471274537018, + "learning_rate": 5.806451612903226e-06, + "loss": 0.2547, + "step": 522 + }, + { + "epoch": 0.17465353147436968, + "grad_norm": 0.8165944158582878, + "learning_rate": 5.817575083426029e-06, + "loss": 0.2618, + "step": 523 + }, + { + "epoch": 0.1749874770412423, + "grad_norm": 0.793372998639345, + "learning_rate": 5.828698553948833e-06, + "loss": 0.2538, + "step": 524 + }, + { + "epoch": 0.17532142260811487, + "grad_norm": 0.7708291500668256, + "learning_rate": 5.839822024471635e-06, + "loss": 0.2561, + "step": 525 + }, + { + "epoch": 0.17565536817498748, + "grad_norm": 0.7448808234488508, + "learning_rate": 5.850945494994439e-06, + "loss": 0.2502, + "step": 526 + }, + { + "epoch": 0.17598931374186008, + "grad_norm": 0.8021357502415758, + "learning_rate": 5.862068965517242e-06, + "loss": 0.2783, + "step": 527 + }, + { + "epoch": 0.1763232593087327, + "grad_norm": 0.7495063590638561, + "learning_rate": 5.873192436040045e-06, + "loss": 0.2476, + "step": 528 + }, + { + "epoch": 0.17665720487560527, + "grad_norm": 0.7961292168845412, + "learning_rate": 5.884315906562849e-06, + "loss": 0.2726, + "step": 529 + }, + { + "epoch": 0.17699115044247787, + "grad_norm": 0.8074466377944476, + "learning_rate": 5.8954393770856515e-06, + "loss": 0.2582, + "step": 530 + }, + { + "epoch": 0.17732509600935048, + "grad_norm": 0.8345256528144968, + "learning_rate": 5.906562847608455e-06, + "loss": 0.245, + "step": 531 + }, + { + "epoch": 0.1776590415762231, + "grad_norm": 0.7869923386555632, + "learning_rate": 5.917686318131257e-06, + "loss": 0.2569, + "step": 532 + }, + { + "epoch": 0.17799298714309567, + "grad_norm": 0.8147246527010965, + "learning_rate": 5.9288097886540604e-06, + "loss": 0.2362, + "step": 533 + }, + { + "epoch": 0.17832693270996827, + "grad_norm": 0.8341283135652426, + "learning_rate": 5.939933259176863e-06, + "loss": 0.2764, + "step": 534 + }, + { + "epoch": 0.17866087827684088, + "grad_norm": 0.7045070493955722, + "learning_rate": 5.951056729699667e-06, + "loss": 0.2354, + "step": 535 + }, + { + "epoch": 0.17899482384371349, + "grad_norm": 0.7421340011554978, + "learning_rate": 5.962180200222469e-06, + "loss": 0.2386, + "step": 536 + }, + { + "epoch": 0.17932876941058606, + "grad_norm": 0.8226524022867789, + "learning_rate": 5.973303670745273e-06, + "loss": 0.2594, + "step": 537 + }, + { + "epoch": 0.17966271497745867, + "grad_norm": 0.9397477018343224, + "learning_rate": 5.9844271412680765e-06, + "loss": 0.2639, + "step": 538 + }, + { + "epoch": 0.17999666054433128, + "grad_norm": 0.9390525099666275, + "learning_rate": 5.995550611790879e-06, + "loss": 0.2477, + "step": 539 + }, + { + "epoch": 0.18033060611120388, + "grad_norm": 0.8300240065923898, + "learning_rate": 6.006674082313683e-06, + "loss": 0.2751, + "step": 540 + }, + { + "epoch": 0.18066455167807646, + "grad_norm": 0.813819268112457, + "learning_rate": 6.0177975528364855e-06, + "loss": 0.2617, + "step": 541 + }, + { + "epoch": 0.18099849724494907, + "grad_norm": 0.9523295641887335, + "learning_rate": 6.028921023359289e-06, + "loss": 0.2642, + "step": 542 + }, + { + "epoch": 0.18133244281182168, + "grad_norm": 0.8610172814649126, + "learning_rate": 6.040044493882091e-06, + "loss": 0.2738, + "step": 543 + }, + { + "epoch": 0.18166638837869428, + "grad_norm": 0.8247418133291398, + "learning_rate": 6.0511679644048945e-06, + "loss": 0.2794, + "step": 544 + }, + { + "epoch": 0.18200033394556686, + "grad_norm": 0.8563775168824261, + "learning_rate": 6.062291434927698e-06, + "loss": 0.255, + "step": 545 + }, + { + "epoch": 0.18233427951243947, + "grad_norm": 0.8968217199385246, + "learning_rate": 6.073414905450501e-06, + "loss": 0.2861, + "step": 546 + }, + { + "epoch": 0.18266822507931207, + "grad_norm": 0.8235810669629106, + "learning_rate": 6.084538375973304e-06, + "loss": 0.2663, + "step": 547 + }, + { + "epoch": 0.18300217064618468, + "grad_norm": 0.8348617646793106, + "learning_rate": 6.095661846496107e-06, + "loss": 0.2486, + "step": 548 + }, + { + "epoch": 0.18333611621305726, + "grad_norm": 0.8185061661036209, + "learning_rate": 6.1067853170189106e-06, + "loss": 0.2885, + "step": 549 + }, + { + "epoch": 0.18367006177992987, + "grad_norm": 0.8056111861107552, + "learning_rate": 6.117908787541713e-06, + "loss": 0.2651, + "step": 550 + }, + { + "epoch": 0.18400400734680247, + "grad_norm": 0.8746807107987543, + "learning_rate": 6.129032258064517e-06, + "loss": 0.2568, + "step": 551 + }, + { + "epoch": 0.18433795291367508, + "grad_norm": 0.8681694547363, + "learning_rate": 6.1401557285873195e-06, + "loss": 0.2638, + "step": 552 + }, + { + "epoch": 0.18467189848054766, + "grad_norm": 0.8221512767810651, + "learning_rate": 6.151279199110123e-06, + "loss": 0.2415, + "step": 553 + }, + { + "epoch": 0.18500584404742026, + "grad_norm": 0.698387186152687, + "learning_rate": 6.162402669632927e-06, + "loss": 0.245, + "step": 554 + }, + { + "epoch": 0.18533978961429287, + "grad_norm": 0.8433273577991696, + "learning_rate": 6.173526140155729e-06, + "loss": 0.2649, + "step": 555 + }, + { + "epoch": 0.18567373518116548, + "grad_norm": 0.8021748456512204, + "learning_rate": 6.184649610678533e-06, + "loss": 0.2469, + "step": 556 + }, + { + "epoch": 0.18600768074803806, + "grad_norm": 0.7364557546886381, + "learning_rate": 6.195773081201335e-06, + "loss": 0.255, + "step": 557 + }, + { + "epoch": 0.18634162631491066, + "grad_norm": 0.7762130750403377, + "learning_rate": 6.206896551724138e-06, + "loss": 0.2524, + "step": 558 + }, + { + "epoch": 0.18667557188178327, + "grad_norm": 0.8366629532210769, + "learning_rate": 6.218020022246941e-06, + "loss": 0.2746, + "step": 559 + }, + { + "epoch": 0.18700951744865588, + "grad_norm": 0.752997914423049, + "learning_rate": 6.229143492769745e-06, + "loss": 0.2523, + "step": 560 + }, + { + "epoch": 0.18734346301552846, + "grad_norm": 0.7260230957886581, + "learning_rate": 6.240266963292548e-06, + "loss": 0.2541, + "step": 561 + }, + { + "epoch": 0.18767740858240106, + "grad_norm": 0.8788917141496118, + "learning_rate": 6.251390433815351e-06, + "loss": 0.2715, + "step": 562 + }, + { + "epoch": 0.18801135414927367, + "grad_norm": 0.800907436768764, + "learning_rate": 6.262513904338154e-06, + "loss": 0.2645, + "step": 563 + }, + { + "epoch": 0.18834529971614627, + "grad_norm": 0.6844788488749838, + "learning_rate": 6.273637374860957e-06, + "loss": 0.2427, + "step": 564 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 0.8650404495590827, + "learning_rate": 6.284760845383761e-06, + "loss": 0.2517, + "step": 565 + }, + { + "epoch": 0.18901319084989146, + "grad_norm": 0.9007698022142087, + "learning_rate": 6.295884315906563e-06, + "loss": 0.2321, + "step": 566 + }, + { + "epoch": 0.18934713641676407, + "grad_norm": 0.7993098878472382, + "learning_rate": 6.307007786429367e-06, + "loss": 0.2431, + "step": 567 + }, + { + "epoch": 0.18968108198363667, + "grad_norm": 0.9721899662902379, + "learning_rate": 6.318131256952169e-06, + "loss": 0.2605, + "step": 568 + }, + { + "epoch": 0.19001502755050928, + "grad_norm": 0.8389462490316344, + "learning_rate": 6.329254727474972e-06, + "loss": 0.2601, + "step": 569 + }, + { + "epoch": 0.19034897311738186, + "grad_norm": 0.8414565668740682, + "learning_rate": 6.340378197997776e-06, + "loss": 0.2621, + "step": 570 + }, + { + "epoch": 0.19068291868425447, + "grad_norm": 0.8398750862803184, + "learning_rate": 6.351501668520579e-06, + "loss": 0.2695, + "step": 571 + }, + { + "epoch": 0.19101686425112707, + "grad_norm": 0.8866607655134973, + "learning_rate": 6.362625139043382e-06, + "loss": 0.2533, + "step": 572 + }, + { + "epoch": 0.19135080981799968, + "grad_norm": 0.9367172444562246, + "learning_rate": 6.373748609566185e-06, + "loss": 0.2586, + "step": 573 + }, + { + "epoch": 0.19168475538487226, + "grad_norm": 0.8225843599787163, + "learning_rate": 6.3848720800889884e-06, + "loss": 0.2565, + "step": 574 + }, + { + "epoch": 0.19201870095174486, + "grad_norm": 0.8444529094519592, + "learning_rate": 6.395995550611791e-06, + "loss": 0.2752, + "step": 575 + }, + { + "epoch": 0.19235264651861747, + "grad_norm": 0.8144999053793032, + "learning_rate": 6.407119021134595e-06, + "loss": 0.25, + "step": 576 + }, + { + "epoch": 0.19268659208549008, + "grad_norm": 0.9646835095492448, + "learning_rate": 6.418242491657397e-06, + "loss": 0.2797, + "step": 577 + }, + { + "epoch": 0.19302053765236266, + "grad_norm": 0.8030364087953927, + "learning_rate": 6.429365962180201e-06, + "loss": 0.2521, + "step": 578 + }, + { + "epoch": 0.19335448321923526, + "grad_norm": 0.7321772482455484, + "learning_rate": 6.4404894327030045e-06, + "loss": 0.2407, + "step": 579 + }, + { + "epoch": 0.19368842878610787, + "grad_norm": 0.8945349521153622, + "learning_rate": 6.451612903225806e-06, + "loss": 0.2606, + "step": 580 + }, + { + "epoch": 0.19402237435298048, + "grad_norm": 0.9583738923059661, + "learning_rate": 6.462736373748611e-06, + "loss": 0.2688, + "step": 581 + }, + { + "epoch": 0.19435631991985305, + "grad_norm": 0.7713487168779097, + "learning_rate": 6.473859844271413e-06, + "loss": 0.2557, + "step": 582 + }, + { + "epoch": 0.19469026548672566, + "grad_norm": 0.7917343133180527, + "learning_rate": 6.484983314794216e-06, + "loss": 0.2802, + "step": 583 + }, + { + "epoch": 0.19502421105359827, + "grad_norm": 0.8943120918526262, + "learning_rate": 6.496106785317019e-06, + "loss": 0.2501, + "step": 584 + }, + { + "epoch": 0.19535815662047087, + "grad_norm": 1.0548584285291802, + "learning_rate": 6.5072302558398225e-06, + "loss": 0.27, + "step": 585 + }, + { + "epoch": 0.19569210218734345, + "grad_norm": 0.8891494426082305, + "learning_rate": 6.518353726362626e-06, + "loss": 0.2762, + "step": 586 + }, + { + "epoch": 0.19602604775421606, + "grad_norm": 0.8248070845769653, + "learning_rate": 6.529477196885429e-06, + "loss": 0.2662, + "step": 587 + }, + { + "epoch": 0.19635999332108867, + "grad_norm": 0.7767078346910501, + "learning_rate": 6.540600667408232e-06, + "loss": 0.2714, + "step": 588 + }, + { + "epoch": 0.19669393888796127, + "grad_norm": 0.9437342483702248, + "learning_rate": 6.551724137931035e-06, + "loss": 0.2565, + "step": 589 + }, + { + "epoch": 0.19702788445483385, + "grad_norm": 0.7538036441233125, + "learning_rate": 6.5628476084538385e-06, + "loss": 0.2489, + "step": 590 + }, + { + "epoch": 0.19736183002170646, + "grad_norm": 1.3943431533296153, + "learning_rate": 6.573971078976641e-06, + "loss": 0.253, + "step": 591 + }, + { + "epoch": 0.19769577558857906, + "grad_norm": 0.8252629179756918, + "learning_rate": 6.585094549499445e-06, + "loss": 0.2514, + "step": 592 + }, + { + "epoch": 0.19802972115545167, + "grad_norm": 0.7701423212327584, + "learning_rate": 6.596218020022247e-06, + "loss": 0.2637, + "step": 593 + }, + { + "epoch": 0.19836366672232425, + "grad_norm": 0.9533433627482636, + "learning_rate": 6.60734149054505e-06, + "loss": 0.2611, + "step": 594 + }, + { + "epoch": 0.19869761228919686, + "grad_norm": 1.0300962240262892, + "learning_rate": 6.618464961067854e-06, + "loss": 0.2606, + "step": 595 + }, + { + "epoch": 0.19903155785606946, + "grad_norm": 0.7274771616034863, + "learning_rate": 6.6295884315906565e-06, + "loss": 0.247, + "step": 596 + }, + { + "epoch": 0.19936550342294207, + "grad_norm": 0.8521863744652877, + "learning_rate": 6.64071190211346e-06, + "loss": 0.2633, + "step": 597 + }, + { + "epoch": 0.19969944898981465, + "grad_norm": 0.857057035238085, + "learning_rate": 6.651835372636263e-06, + "loss": 0.2589, + "step": 598 + }, + { + "epoch": 0.20003339455668726, + "grad_norm": 0.7632237378212952, + "learning_rate": 6.662958843159066e-06, + "loss": 0.252, + "step": 599 + }, + { + "epoch": 0.20036734012355986, + "grad_norm": 0.7570323690891942, + "learning_rate": 6.674082313681869e-06, + "loss": 0.2493, + "step": 600 + }, + { + "epoch": 0.20070128569043247, + "grad_norm": 0.7808753856394541, + "learning_rate": 6.6852057842046726e-06, + "loss": 0.2525, + "step": 601 + }, + { + "epoch": 0.20103523125730505, + "grad_norm": 0.7516839563088439, + "learning_rate": 6.696329254727475e-06, + "loss": 0.2531, + "step": 602 + }, + { + "epoch": 0.20136917682417765, + "grad_norm": 0.9130303291108971, + "learning_rate": 6.707452725250279e-06, + "loss": 0.2602, + "step": 603 + }, + { + "epoch": 0.20170312239105026, + "grad_norm": 0.7675587493061325, + "learning_rate": 6.718576195773082e-06, + "loss": 0.2558, + "step": 604 + }, + { + "epoch": 0.20203706795792287, + "grad_norm": 0.8499898280803776, + "learning_rate": 6.729699666295884e-06, + "loss": 0.2521, + "step": 605 + }, + { + "epoch": 0.20237101352479545, + "grad_norm": 0.6792136287125394, + "learning_rate": 6.740823136818689e-06, + "loss": 0.241, + "step": 606 + }, + { + "epoch": 0.20270495909166805, + "grad_norm": 0.8162142022991196, + "learning_rate": 6.7519466073414905e-06, + "loss": 0.2527, + "step": 607 + }, + { + "epoch": 0.20303890465854066, + "grad_norm": 0.7296861143295201, + "learning_rate": 6.763070077864294e-06, + "loss": 0.2435, + "step": 608 + }, + { + "epoch": 0.20337285022541327, + "grad_norm": 0.7502154456382952, + "learning_rate": 6.774193548387097e-06, + "loss": 0.2614, + "step": 609 + }, + { + "epoch": 0.20370679579228584, + "grad_norm": 0.7661648899048032, + "learning_rate": 6.7853170189099e-06, + "loss": 0.2559, + "step": 610 + }, + { + "epoch": 0.20404074135915845, + "grad_norm": 0.7204037874440861, + "learning_rate": 6.796440489432704e-06, + "loss": 0.2414, + "step": 611 + }, + { + "epoch": 0.20437468692603106, + "grad_norm": 0.8284127688423369, + "learning_rate": 6.807563959955507e-06, + "loss": 0.2486, + "step": 612 + }, + { + "epoch": 0.20470863249290366, + "grad_norm": 0.7646957746452056, + "learning_rate": 6.81868743047831e-06, + "loss": 0.2482, + "step": 613 + }, + { + "epoch": 0.20504257805977624, + "grad_norm": 0.6666997972855363, + "learning_rate": 6.829810901001113e-06, + "loss": 0.2412, + "step": 614 + }, + { + "epoch": 0.20537652362664885, + "grad_norm": 0.8060448469342122, + "learning_rate": 6.840934371523916e-06, + "loss": 0.2592, + "step": 615 + }, + { + "epoch": 0.20571046919352146, + "grad_norm": 0.7430816705543792, + "learning_rate": 6.852057842046719e-06, + "loss": 0.2633, + "step": 616 + }, + { + "epoch": 0.20604441476039406, + "grad_norm": 0.705554520404781, + "learning_rate": 6.863181312569523e-06, + "loss": 0.2547, + "step": 617 + }, + { + "epoch": 0.20637836032726667, + "grad_norm": 0.7321042548247252, + "learning_rate": 6.8743047830923245e-06, + "loss": 0.2465, + "step": 618 + }, + { + "epoch": 0.20671230589413925, + "grad_norm": 0.7046105183839342, + "learning_rate": 6.885428253615128e-06, + "loss": 0.2422, + "step": 619 + }, + { + "epoch": 0.20704625146101185, + "grad_norm": 0.6631002359153175, + "learning_rate": 6.896551724137932e-06, + "loss": 0.2338, + "step": 620 + }, + { + "epoch": 0.20738019702788446, + "grad_norm": 0.820463308715899, + "learning_rate": 6.907675194660734e-06, + "loss": 0.2357, + "step": 621 + }, + { + "epoch": 0.20771414259475707, + "grad_norm": 0.8757054022253623, + "learning_rate": 6.918798665183538e-06, + "loss": 0.2576, + "step": 622 + }, + { + "epoch": 0.20804808816162965, + "grad_norm": 0.7617466716742917, + "learning_rate": 6.929922135706341e-06, + "loss": 0.2625, + "step": 623 + }, + { + "epoch": 0.20838203372850225, + "grad_norm": 0.7507182463449198, + "learning_rate": 6.941045606229144e-06, + "loss": 0.2484, + "step": 624 + }, + { + "epoch": 0.20871597929537486, + "grad_norm": 0.653627768081853, + "learning_rate": 6.952169076751947e-06, + "loss": 0.228, + "step": 625 + }, + { + "epoch": 0.20904992486224747, + "grad_norm": 0.7532509534249405, + "learning_rate": 6.9632925472747504e-06, + "loss": 0.2594, + "step": 626 + }, + { + "epoch": 0.20938387042912004, + "grad_norm": 0.6656450442528803, + "learning_rate": 6.974416017797554e-06, + "loss": 0.2434, + "step": 627 + }, + { + "epoch": 0.20971781599599265, + "grad_norm": 0.9213023326955349, + "learning_rate": 6.985539488320357e-06, + "loss": 0.2577, + "step": 628 + }, + { + "epoch": 0.21005176156286526, + "grad_norm": 0.8151666294986606, + "learning_rate": 6.99666295884316e-06, + "loss": 0.2582, + "step": 629 + }, + { + "epoch": 0.21038570712973786, + "grad_norm": 0.7631009288121973, + "learning_rate": 7.007786429365962e-06, + "loss": 0.2578, + "step": 630 + }, + { + "epoch": 0.21071965269661044, + "grad_norm": 0.8363790627747725, + "learning_rate": 7.0189098998887665e-06, + "loss": 0.2781, + "step": 631 + }, + { + "epoch": 0.21105359826348305, + "grad_norm": 0.839505394509319, + "learning_rate": 7.030033370411568e-06, + "loss": 0.2314, + "step": 632 + }, + { + "epoch": 0.21138754383035566, + "grad_norm": 1.0596615080003053, + "learning_rate": 7.041156840934372e-06, + "loss": 0.284, + "step": 633 + }, + { + "epoch": 0.21172148939722826, + "grad_norm": 0.7122109086577653, + "learning_rate": 7.052280311457175e-06, + "loss": 0.2304, + "step": 634 + }, + { + "epoch": 0.21205543496410084, + "grad_norm": 0.8054756743369217, + "learning_rate": 7.063403781979978e-06, + "loss": 0.2714, + "step": 635 + }, + { + "epoch": 0.21238938053097345, + "grad_norm": 0.7150051499547507, + "learning_rate": 7.074527252502782e-06, + "loss": 0.2456, + "step": 636 + }, + { + "epoch": 0.21272332609784605, + "grad_norm": 0.7974489943926535, + "learning_rate": 7.0856507230255845e-06, + "loss": 0.2571, + "step": 637 + }, + { + "epoch": 0.21305727166471866, + "grad_norm": 0.9680300225233516, + "learning_rate": 7.096774193548388e-06, + "loss": 0.2745, + "step": 638 + }, + { + "epoch": 0.21339121723159124, + "grad_norm": 0.7427730803660618, + "learning_rate": 7.107897664071191e-06, + "loss": 0.2518, + "step": 639 + }, + { + "epoch": 0.21372516279846385, + "grad_norm": 0.741805688846562, + "learning_rate": 7.119021134593994e-06, + "loss": 0.2444, + "step": 640 + }, + { + "epoch": 0.21405910836533645, + "grad_norm": 0.7821624254946435, + "learning_rate": 7.130144605116797e-06, + "loss": 0.2727, + "step": 641 + }, + { + "epoch": 0.21439305393220906, + "grad_norm": 0.6585665747997141, + "learning_rate": 7.1412680756396006e-06, + "loss": 0.2408, + "step": 642 + }, + { + "epoch": 0.21472699949908164, + "grad_norm": 0.6560687036372677, + "learning_rate": 7.152391546162402e-06, + "loss": 0.2396, + "step": 643 + }, + { + "epoch": 0.21506094506595425, + "grad_norm": 0.746510194063174, + "learning_rate": 7.163515016685206e-06, + "loss": 0.244, + "step": 644 + }, + { + "epoch": 0.21539489063282685, + "grad_norm": 0.7591594490569733, + "learning_rate": 7.1746384872080095e-06, + "loss": 0.2501, + "step": 645 + }, + { + "epoch": 0.21572883619969946, + "grad_norm": 0.9390381754063648, + "learning_rate": 7.185761957730812e-06, + "loss": 0.2597, + "step": 646 + }, + { + "epoch": 0.21606278176657204, + "grad_norm": 0.7677869272615818, + "learning_rate": 7.196885428253616e-06, + "loss": 0.2309, + "step": 647 + }, + { + "epoch": 0.21639672733344464, + "grad_norm": 0.7831044566641154, + "learning_rate": 7.2080088987764185e-06, + "loss": 0.2674, + "step": 648 + }, + { + "epoch": 0.21673067290031725, + "grad_norm": 0.7513498348722037, + "learning_rate": 7.219132369299222e-06, + "loss": 0.2608, + "step": 649 + }, + { + "epoch": 0.21706461846718986, + "grad_norm": 0.8318654606544825, + "learning_rate": 7.230255839822025e-06, + "loss": 0.277, + "step": 650 + }, + { + "epoch": 0.21739856403406244, + "grad_norm": 0.6802811622216082, + "learning_rate": 7.241379310344828e-06, + "loss": 0.2319, + "step": 651 + }, + { + "epoch": 0.21773250960093504, + "grad_norm": 1.106089563161147, + "learning_rate": 7.252502780867632e-06, + "loss": 0.2582, + "step": 652 + }, + { + "epoch": 0.21806645516780765, + "grad_norm": 0.7584213630649976, + "learning_rate": 7.263626251390435e-06, + "loss": 0.2495, + "step": 653 + }, + { + "epoch": 0.21840040073468026, + "grad_norm": 0.7854757276260729, + "learning_rate": 7.274749721913238e-06, + "loss": 0.2557, + "step": 654 + }, + { + "epoch": 0.21873434630155283, + "grad_norm": 0.7612163368733534, + "learning_rate": 7.28587319243604e-06, + "loss": 0.2451, + "step": 655 + }, + { + "epoch": 0.21906829186842544, + "grad_norm": 0.7748245742742014, + "learning_rate": 7.296996662958844e-06, + "loss": 0.2442, + "step": 656 + }, + { + "epoch": 0.21940223743529805, + "grad_norm": 0.6807686068804447, + "learning_rate": 7.308120133481646e-06, + "loss": 0.2408, + "step": 657 + }, + { + "epoch": 0.21973618300217065, + "grad_norm": 0.9239556234849106, + "learning_rate": 7.31924360400445e-06, + "loss": 0.2702, + "step": 658 + }, + { + "epoch": 0.22007012856904323, + "grad_norm": 0.7321576339149546, + "learning_rate": 7.3303670745272525e-06, + "loss": 0.2644, + "step": 659 + }, + { + "epoch": 0.22040407413591584, + "grad_norm": 0.7996011030861904, + "learning_rate": 7.341490545050056e-06, + "loss": 0.2489, + "step": 660 + }, + { + "epoch": 0.22073801970278845, + "grad_norm": 0.8764342155392295, + "learning_rate": 7.35261401557286e-06, + "loss": 0.2751, + "step": 661 + }, + { + "epoch": 0.22107196526966105, + "grad_norm": 0.8597656221387565, + "learning_rate": 7.363737486095662e-06, + "loss": 0.2786, + "step": 662 + }, + { + "epoch": 0.22140591083653363, + "grad_norm": 0.6958608408138943, + "learning_rate": 7.374860956618466e-06, + "loss": 0.2538, + "step": 663 + }, + { + "epoch": 0.22173985640340624, + "grad_norm": 0.7016554454611178, + "learning_rate": 7.385984427141269e-06, + "loss": 0.2467, + "step": 664 + }, + { + "epoch": 0.22207380197027884, + "grad_norm": 0.7640514477559327, + "learning_rate": 7.397107897664072e-06, + "loss": 0.2634, + "step": 665 + }, + { + "epoch": 0.22240774753715145, + "grad_norm": 0.7363554934776556, + "learning_rate": 7.408231368186875e-06, + "loss": 0.2298, + "step": 666 + }, + { + "epoch": 0.22274169310402406, + "grad_norm": 0.8144798385573445, + "learning_rate": 7.4193548387096784e-06, + "loss": 0.2443, + "step": 667 + }, + { + "epoch": 0.22307563867089664, + "grad_norm": 0.8918153080014487, + "learning_rate": 7.43047830923248e-06, + "loss": 0.2578, + "step": 668 + }, + { + "epoch": 0.22340958423776924, + "grad_norm": 0.7303457303558588, + "learning_rate": 7.441601779755284e-06, + "loss": 0.2674, + "step": 669 + }, + { + "epoch": 0.22374352980464185, + "grad_norm": 0.6891671510625942, + "learning_rate": 7.452725250278087e-06, + "loss": 0.2528, + "step": 670 + }, + { + "epoch": 0.22407747537151446, + "grad_norm": 0.7806126310729112, + "learning_rate": 7.46384872080089e-06, + "loss": 0.2571, + "step": 671 + }, + { + "epoch": 0.22441142093838704, + "grad_norm": 0.6843873330049303, + "learning_rate": 7.474972191323694e-06, + "loss": 0.2255, + "step": 672 + }, + { + "epoch": 0.22474536650525964, + "grad_norm": 0.8453798784973325, + "learning_rate": 7.486095661846496e-06, + "loss": 0.2625, + "step": 673 + }, + { + "epoch": 0.22507931207213225, + "grad_norm": 0.8434557324517794, + "learning_rate": 7.4972191323693e-06, + "loss": 0.2776, + "step": 674 + }, + { + "epoch": 0.22541325763900485, + "grad_norm": 0.7048585497108847, + "learning_rate": 7.508342602892103e-06, + "loss": 0.2585, + "step": 675 + }, + { + "epoch": 0.22574720320587743, + "grad_norm": 0.7659379836914064, + "learning_rate": 7.519466073414906e-06, + "loss": 0.2548, + "step": 676 + }, + { + "epoch": 0.22608114877275004, + "grad_norm": 0.6934227319885261, + "learning_rate": 7.53058954393771e-06, + "loss": 0.2415, + "step": 677 + }, + { + "epoch": 0.22641509433962265, + "grad_norm": 0.7811227987102337, + "learning_rate": 7.5417130144605125e-06, + "loss": 0.2517, + "step": 678 + }, + { + "epoch": 0.22674903990649525, + "grad_norm": 0.804992221647555, + "learning_rate": 7.552836484983316e-06, + "loss": 0.2741, + "step": 679 + }, + { + "epoch": 0.22708298547336783, + "grad_norm": 0.8687196799605786, + "learning_rate": 7.563959955506118e-06, + "loss": 0.2727, + "step": 680 + }, + { + "epoch": 0.22741693104024044, + "grad_norm": 0.7320254452663593, + "learning_rate": 7.575083426028922e-06, + "loss": 0.2593, + "step": 681 + }, + { + "epoch": 0.22775087660711305, + "grad_norm": 0.8031326712488748, + "learning_rate": 7.586206896551724e-06, + "loss": 0.2539, + "step": 682 + }, + { + "epoch": 0.22808482217398565, + "grad_norm": 0.7676874292506355, + "learning_rate": 7.597330367074528e-06, + "loss": 0.2327, + "step": 683 + }, + { + "epoch": 0.22841876774085823, + "grad_norm": 0.7010316070246935, + "learning_rate": 7.60845383759733e-06, + "loss": 0.2351, + "step": 684 + }, + { + "epoch": 0.22875271330773084, + "grad_norm": 0.8074417436579385, + "learning_rate": 7.619577308120134e-06, + "loss": 0.2521, + "step": 685 + }, + { + "epoch": 0.22908665887460344, + "grad_norm": 0.8071936582217873, + "learning_rate": 7.630700778642938e-06, + "loss": 0.244, + "step": 686 + }, + { + "epoch": 0.22942060444147605, + "grad_norm": 0.6359795454476252, + "learning_rate": 7.64182424916574e-06, + "loss": 0.2368, + "step": 687 + }, + { + "epoch": 0.22975455000834863, + "grad_norm": 0.7777196776901433, + "learning_rate": 7.652947719688543e-06, + "loss": 0.2414, + "step": 688 + }, + { + "epoch": 0.23008849557522124, + "grad_norm": 0.8592388607477667, + "learning_rate": 7.664071190211346e-06, + "loss": 0.2455, + "step": 689 + }, + { + "epoch": 0.23042244114209384, + "grad_norm": 0.7537444209652607, + "learning_rate": 7.67519466073415e-06, + "loss": 0.2568, + "step": 690 + }, + { + "epoch": 0.23075638670896645, + "grad_norm": 0.7514964866360881, + "learning_rate": 7.686318131256953e-06, + "loss": 0.2543, + "step": 691 + }, + { + "epoch": 0.23109033227583903, + "grad_norm": 0.7856637220526238, + "learning_rate": 7.697441601779755e-06, + "loss": 0.2396, + "step": 692 + }, + { + "epoch": 0.23142427784271163, + "grad_norm": 0.6766482208992999, + "learning_rate": 7.70856507230256e-06, + "loss": 0.2306, + "step": 693 + }, + { + "epoch": 0.23175822340958424, + "grad_norm": 0.7190582180915687, + "learning_rate": 7.719688542825363e-06, + "loss": 0.2332, + "step": 694 + }, + { + "epoch": 0.23209216897645685, + "grad_norm": 0.7365246732401293, + "learning_rate": 7.730812013348165e-06, + "loss": 0.2462, + "step": 695 + }, + { + "epoch": 0.23242611454332943, + "grad_norm": 0.7531487377575422, + "learning_rate": 7.741935483870968e-06, + "loss": 0.2516, + "step": 696 + }, + { + "epoch": 0.23276006011020203, + "grad_norm": 0.7856970157332456, + "learning_rate": 7.753058954393772e-06, + "loss": 0.2595, + "step": 697 + }, + { + "epoch": 0.23309400567707464, + "grad_norm": 0.7465491947839247, + "learning_rate": 7.764182424916575e-06, + "loss": 0.243, + "step": 698 + }, + { + "epoch": 0.23342795124394725, + "grad_norm": 0.78553071211408, + "learning_rate": 7.775305895439378e-06, + "loss": 0.2727, + "step": 699 + }, + { + "epoch": 0.23376189681081982, + "grad_norm": 0.6936051447843483, + "learning_rate": 7.78642936596218e-06, + "loss": 0.2406, + "step": 700 + }, + { + "epoch": 0.23409584237769243, + "grad_norm": 0.7284183041445422, + "learning_rate": 7.797552836484983e-06, + "loss": 0.2498, + "step": 701 + }, + { + "epoch": 0.23442978794456504, + "grad_norm": 0.7500119738086012, + "learning_rate": 7.808676307007788e-06, + "loss": 0.2353, + "step": 702 + }, + { + "epoch": 0.23476373351143764, + "grad_norm": 0.7405650072009493, + "learning_rate": 7.81979977753059e-06, + "loss": 0.2565, + "step": 703 + }, + { + "epoch": 0.23509767907831022, + "grad_norm": 0.7259042405639793, + "learning_rate": 7.830923248053393e-06, + "loss": 0.2448, + "step": 704 + }, + { + "epoch": 0.23543162464518283, + "grad_norm": 0.6356529795786438, + "learning_rate": 7.842046718576196e-06, + "loss": 0.2275, + "step": 705 + }, + { + "epoch": 0.23576557021205544, + "grad_norm": 0.7052503065077377, + "learning_rate": 7.853170189099e-06, + "loss": 0.2475, + "step": 706 + }, + { + "epoch": 0.23609951577892804, + "grad_norm": 0.7686445726114107, + "learning_rate": 7.864293659621803e-06, + "loss": 0.2644, + "step": 707 + }, + { + "epoch": 0.23643346134580062, + "grad_norm": 0.7484144633557975, + "learning_rate": 7.875417130144606e-06, + "loss": 0.2575, + "step": 708 + }, + { + "epoch": 0.23676740691267323, + "grad_norm": 0.7357228659660728, + "learning_rate": 7.886540600667408e-06, + "loss": 0.2372, + "step": 709 + }, + { + "epoch": 0.23710135247954583, + "grad_norm": 0.6982740541543035, + "learning_rate": 7.897664071190213e-06, + "loss": 0.2508, + "step": 710 + }, + { + "epoch": 0.23743529804641844, + "grad_norm": 0.7398245740088609, + "learning_rate": 7.908787541713015e-06, + "loss": 0.2381, + "step": 711 + }, + { + "epoch": 0.23776924361329102, + "grad_norm": 0.7895941667598881, + "learning_rate": 7.919911012235818e-06, + "loss": 0.2283, + "step": 712 + }, + { + "epoch": 0.23810318918016363, + "grad_norm": 0.779972681262703, + "learning_rate": 7.93103448275862e-06, + "loss": 0.2577, + "step": 713 + }, + { + "epoch": 0.23843713474703623, + "grad_norm": 0.7064211035558918, + "learning_rate": 7.942157953281424e-06, + "loss": 0.2391, + "step": 714 + }, + { + "epoch": 0.23877108031390884, + "grad_norm": 0.8977366266428429, + "learning_rate": 7.953281423804228e-06, + "loss": 0.2735, + "step": 715 + }, + { + "epoch": 0.23910502588078142, + "grad_norm": 0.8146695015510214, + "learning_rate": 7.96440489432703e-06, + "loss": 0.2553, + "step": 716 + }, + { + "epoch": 0.23943897144765403, + "grad_norm": 0.8075630691246333, + "learning_rate": 7.975528364849833e-06, + "loss": 0.24, + "step": 717 + }, + { + "epoch": 0.23977291701452663, + "grad_norm": 0.7417746252500675, + "learning_rate": 7.986651835372638e-06, + "loss": 0.2455, + "step": 718 + }, + { + "epoch": 0.24010686258139924, + "grad_norm": 0.7960193301121595, + "learning_rate": 7.99777530589544e-06, + "loss": 0.2537, + "step": 719 + }, + { + "epoch": 0.24044080814827185, + "grad_norm": 0.6581455395066982, + "learning_rate": 8.008898776418243e-06, + "loss": 0.2508, + "step": 720 + }, + { + "epoch": 0.24077475371514442, + "grad_norm": 0.762028236187238, + "learning_rate": 8.020022246941046e-06, + "loss": 0.2726, + "step": 721 + }, + { + "epoch": 0.24110869928201703, + "grad_norm": 0.7200877982327974, + "learning_rate": 8.03114571746385e-06, + "loss": 0.2551, + "step": 722 + }, + { + "epoch": 0.24144264484888964, + "grad_norm": 0.6356328284318833, + "learning_rate": 8.042269187986651e-06, + "loss": 0.2249, + "step": 723 + }, + { + "epoch": 0.24177659041576224, + "grad_norm": 0.7300735104858539, + "learning_rate": 8.053392658509456e-06, + "loss": 0.2535, + "step": 724 + }, + { + "epoch": 0.24211053598263482, + "grad_norm": 0.7236816257988127, + "learning_rate": 8.064516129032258e-06, + "loss": 0.2521, + "step": 725 + }, + { + "epoch": 0.24244448154950743, + "grad_norm": 0.6587602905727439, + "learning_rate": 8.075639599555061e-06, + "loss": 0.2425, + "step": 726 + }, + { + "epoch": 0.24277842711638004, + "grad_norm": 0.7780907894191817, + "learning_rate": 8.086763070077866e-06, + "loss": 0.264, + "step": 727 + }, + { + "epoch": 0.24311237268325264, + "grad_norm": 0.686670566205572, + "learning_rate": 8.097886540600668e-06, + "loss": 0.2509, + "step": 728 + }, + { + "epoch": 0.24344631825012522, + "grad_norm": 0.7534125635472662, + "learning_rate": 8.109010011123471e-06, + "loss": 0.2402, + "step": 729 + }, + { + "epoch": 0.24378026381699783, + "grad_norm": 0.607443586736483, + "learning_rate": 8.120133481646274e-06, + "loss": 0.2397, + "step": 730 + }, + { + "epoch": 0.24411420938387043, + "grad_norm": 1.3378541145711298, + "learning_rate": 8.131256952169078e-06, + "loss": 0.255, + "step": 731 + }, + { + "epoch": 0.24444815495074304, + "grad_norm": 0.7693531003770262, + "learning_rate": 8.14238042269188e-06, + "loss": 0.2469, + "step": 732 + }, + { + "epoch": 0.24478210051761562, + "grad_norm": 0.7630786614916105, + "learning_rate": 8.153503893214683e-06, + "loss": 0.2529, + "step": 733 + }, + { + "epoch": 0.24511604608448823, + "grad_norm": 0.6830755477799588, + "learning_rate": 8.164627363737486e-06, + "loss": 0.2593, + "step": 734 + }, + { + "epoch": 0.24544999165136083, + "grad_norm": 0.769470982645176, + "learning_rate": 8.17575083426029e-06, + "loss": 0.2635, + "step": 735 + }, + { + "epoch": 0.24578393721823344, + "grad_norm": 0.6411315581669028, + "learning_rate": 8.186874304783093e-06, + "loss": 0.2333, + "step": 736 + }, + { + "epoch": 0.24611788278510602, + "grad_norm": 0.7021751041939086, + "learning_rate": 8.197997775305896e-06, + "loss": 0.2614, + "step": 737 + }, + { + "epoch": 0.24645182835197862, + "grad_norm": 0.6973361840787954, + "learning_rate": 8.209121245828699e-06, + "loss": 0.2517, + "step": 738 + }, + { + "epoch": 0.24678577391885123, + "grad_norm": 0.7235562634782384, + "learning_rate": 8.220244716351501e-06, + "loss": 0.247, + "step": 739 + }, + { + "epoch": 0.24711971948572384, + "grad_norm": 0.7907039415300885, + "learning_rate": 8.231368186874306e-06, + "loss": 0.2747, + "step": 740 + }, + { + "epoch": 0.24745366505259642, + "grad_norm": 0.9920594690593687, + "learning_rate": 8.242491657397109e-06, + "loss": 0.2504, + "step": 741 + }, + { + "epoch": 0.24778761061946902, + "grad_norm": 0.6620117662895729, + "learning_rate": 8.253615127919911e-06, + "loss": 0.2539, + "step": 742 + }, + { + "epoch": 0.24812155618634163, + "grad_norm": 0.8115893003059279, + "learning_rate": 8.264738598442716e-06, + "loss": 0.2445, + "step": 743 + }, + { + "epoch": 0.24845550175321424, + "grad_norm": 0.7888030282533821, + "learning_rate": 8.275862068965518e-06, + "loss": 0.2447, + "step": 744 + }, + { + "epoch": 0.24878944732008682, + "grad_norm": 0.7396486295280933, + "learning_rate": 8.286985539488321e-06, + "loss": 0.2502, + "step": 745 + }, + { + "epoch": 0.24912339288695942, + "grad_norm": 0.7245100088509931, + "learning_rate": 8.298109010011124e-06, + "loss": 0.2484, + "step": 746 + }, + { + "epoch": 0.24945733845383203, + "grad_norm": 0.7768451542815072, + "learning_rate": 8.309232480533928e-06, + "loss": 0.2537, + "step": 747 + }, + { + "epoch": 0.24979128402070463, + "grad_norm": 0.7260972447956803, + "learning_rate": 8.32035595105673e-06, + "loss": 0.2664, + "step": 748 + }, + { + "epoch": 0.2501252295875772, + "grad_norm": 0.7727276441288169, + "learning_rate": 8.331479421579534e-06, + "loss": 0.2457, + "step": 749 + }, + { + "epoch": 0.2504591751544498, + "grad_norm": 0.7116614209879674, + "learning_rate": 8.342602892102336e-06, + "loss": 0.228, + "step": 750 + }, + { + "epoch": 0.2507931207213224, + "grad_norm": 0.9127998577319423, + "learning_rate": 8.353726362625139e-06, + "loss": 0.2362, + "step": 751 + }, + { + "epoch": 0.25112706628819503, + "grad_norm": 0.8377483636026994, + "learning_rate": 8.364849833147943e-06, + "loss": 0.2464, + "step": 752 + }, + { + "epoch": 0.25146101185506764, + "grad_norm": 0.6816783448014366, + "learning_rate": 8.375973303670746e-06, + "loss": 0.2369, + "step": 753 + }, + { + "epoch": 0.25179495742194025, + "grad_norm": 0.8758433268313643, + "learning_rate": 8.387096774193549e-06, + "loss": 0.2795, + "step": 754 + }, + { + "epoch": 0.2521289029888128, + "grad_norm": 0.9056051609541013, + "learning_rate": 8.398220244716352e-06, + "loss": 0.263, + "step": 755 + }, + { + "epoch": 0.2524628485556854, + "grad_norm": 0.8917755495799926, + "learning_rate": 8.409343715239156e-06, + "loss": 0.2599, + "step": 756 + }, + { + "epoch": 0.252796794122558, + "grad_norm": 0.6905229275325627, + "learning_rate": 8.420467185761959e-06, + "loss": 0.2328, + "step": 757 + }, + { + "epoch": 0.2531307396894306, + "grad_norm": 0.68920478639127, + "learning_rate": 8.431590656284761e-06, + "loss": 0.2477, + "step": 758 + }, + { + "epoch": 0.2534646852563032, + "grad_norm": 0.745543256443487, + "learning_rate": 8.442714126807566e-06, + "loss": 0.2405, + "step": 759 + }, + { + "epoch": 0.25379863082317583, + "grad_norm": 0.7884756984382175, + "learning_rate": 8.453837597330368e-06, + "loss": 0.2362, + "step": 760 + }, + { + "epoch": 0.25413257639004844, + "grad_norm": 0.7468278824055963, + "learning_rate": 8.464961067853171e-06, + "loss": 0.262, + "step": 761 + }, + { + "epoch": 0.25446652195692104, + "grad_norm": 0.8653261962193426, + "learning_rate": 8.476084538375974e-06, + "loss": 0.2586, + "step": 762 + }, + { + "epoch": 0.2548004675237936, + "grad_norm": 0.6837818557349334, + "learning_rate": 8.487208008898777e-06, + "loss": 0.2366, + "step": 763 + }, + { + "epoch": 0.2551344130906662, + "grad_norm": 0.7536532409242654, + "learning_rate": 8.49833147942158e-06, + "loss": 0.2591, + "step": 764 + }, + { + "epoch": 0.2554683586575388, + "grad_norm": 0.9083573270282176, + "learning_rate": 8.509454949944384e-06, + "loss": 0.2521, + "step": 765 + }, + { + "epoch": 0.2558023042244114, + "grad_norm": 0.767552861904764, + "learning_rate": 8.520578420467186e-06, + "loss": 0.2625, + "step": 766 + }, + { + "epoch": 0.256136249791284, + "grad_norm": 0.7629866167443453, + "learning_rate": 8.531701890989989e-06, + "loss": 0.251, + "step": 767 + }, + { + "epoch": 0.2564701953581566, + "grad_norm": 0.7377761813541851, + "learning_rate": 8.542825361512793e-06, + "loss": 0.2404, + "step": 768 + }, + { + "epoch": 0.25680414092502923, + "grad_norm": 0.7352772762034555, + "learning_rate": 8.553948832035596e-06, + "loss": 0.2509, + "step": 769 + }, + { + "epoch": 0.25713808649190184, + "grad_norm": 0.7212760539284759, + "learning_rate": 8.565072302558399e-06, + "loss": 0.2418, + "step": 770 + }, + { + "epoch": 0.25747203205877445, + "grad_norm": 0.6973041703150606, + "learning_rate": 8.576195773081202e-06, + "loss": 0.2378, + "step": 771 + }, + { + "epoch": 0.257805977625647, + "grad_norm": 0.8275720116284327, + "learning_rate": 8.587319243604006e-06, + "loss": 0.2453, + "step": 772 + }, + { + "epoch": 0.2581399231925196, + "grad_norm": 0.7089874078732612, + "learning_rate": 8.598442714126807e-06, + "loss": 0.2363, + "step": 773 + }, + { + "epoch": 0.2584738687593922, + "grad_norm": 0.6667510504135715, + "learning_rate": 8.609566184649611e-06, + "loss": 0.2404, + "step": 774 + }, + { + "epoch": 0.2588078143262648, + "grad_norm": 0.8122294668554957, + "learning_rate": 8.620689655172414e-06, + "loss": 0.2622, + "step": 775 + }, + { + "epoch": 0.2591417598931374, + "grad_norm": 0.823450700107754, + "learning_rate": 8.631813125695217e-06, + "loss": 0.2417, + "step": 776 + }, + { + "epoch": 0.25947570546001003, + "grad_norm": 0.7726092316119784, + "learning_rate": 8.642936596218021e-06, + "loss": 0.2474, + "step": 777 + }, + { + "epoch": 0.25980965102688264, + "grad_norm": 0.6746381988563429, + "learning_rate": 8.654060066740824e-06, + "loss": 0.2445, + "step": 778 + }, + { + "epoch": 0.26014359659375524, + "grad_norm": 0.7622637807060025, + "learning_rate": 8.665183537263627e-06, + "loss": 0.2593, + "step": 779 + }, + { + "epoch": 0.2604775421606278, + "grad_norm": 0.7660043938587235, + "learning_rate": 8.67630700778643e-06, + "loss": 0.2612, + "step": 780 + }, + { + "epoch": 0.2608114877275004, + "grad_norm": 0.733656969442999, + "learning_rate": 8.687430478309234e-06, + "loss": 0.2649, + "step": 781 + }, + { + "epoch": 0.261145433294373, + "grad_norm": 0.8112312167102349, + "learning_rate": 8.698553948832036e-06, + "loss": 0.2637, + "step": 782 + }, + { + "epoch": 0.2614793788612456, + "grad_norm": 0.7552777944668807, + "learning_rate": 8.70967741935484e-06, + "loss": 0.2511, + "step": 783 + }, + { + "epoch": 0.2618133244281182, + "grad_norm": 0.6705632857664593, + "learning_rate": 8.720800889877644e-06, + "loss": 0.2517, + "step": 784 + }, + { + "epoch": 0.26214726999499083, + "grad_norm": 0.7991435702186575, + "learning_rate": 8.731924360400446e-06, + "loss": 0.2798, + "step": 785 + }, + { + "epoch": 0.26248121556186343, + "grad_norm": 0.7428406935602541, + "learning_rate": 8.743047830923249e-06, + "loss": 0.2669, + "step": 786 + }, + { + "epoch": 0.26281516112873604, + "grad_norm": 0.7460196154653285, + "learning_rate": 8.754171301446052e-06, + "loss": 0.2314, + "step": 787 + }, + { + "epoch": 0.2631491066956086, + "grad_norm": 0.6813560165573437, + "learning_rate": 8.765294771968854e-06, + "loss": 0.2387, + "step": 788 + }, + { + "epoch": 0.2634830522624812, + "grad_norm": 1.1679094658128664, + "learning_rate": 8.776418242491657e-06, + "loss": 0.2642, + "step": 789 + }, + { + "epoch": 0.2638169978293538, + "grad_norm": 0.6998489054574449, + "learning_rate": 8.787541713014462e-06, + "loss": 0.2493, + "step": 790 + }, + { + "epoch": 0.2641509433962264, + "grad_norm": 0.7233905242204927, + "learning_rate": 8.798665183537264e-06, + "loss": 0.2765, + "step": 791 + }, + { + "epoch": 0.264484888963099, + "grad_norm": 0.8130295529317806, + "learning_rate": 8.809788654060067e-06, + "loss": 0.2572, + "step": 792 + }, + { + "epoch": 0.2648188345299716, + "grad_norm": 0.734643637678886, + "learning_rate": 8.820912124582871e-06, + "loss": 0.2764, + "step": 793 + }, + { + "epoch": 0.26515278009684423, + "grad_norm": 0.8129757879046563, + "learning_rate": 8.832035595105674e-06, + "loss": 0.2597, + "step": 794 + }, + { + "epoch": 0.26548672566371684, + "grad_norm": 0.7165953475826065, + "learning_rate": 8.843159065628477e-06, + "loss": 0.2338, + "step": 795 + }, + { + "epoch": 0.2658206712305894, + "grad_norm": 0.7259835911679116, + "learning_rate": 8.85428253615128e-06, + "loss": 0.2579, + "step": 796 + }, + { + "epoch": 0.266154616797462, + "grad_norm": 0.6619475009098943, + "learning_rate": 8.865406006674084e-06, + "loss": 0.2119, + "step": 797 + }, + { + "epoch": 0.2664885623643346, + "grad_norm": 0.7625539062454664, + "learning_rate": 8.876529477196885e-06, + "loss": 0.2281, + "step": 798 + }, + { + "epoch": 0.2668225079312072, + "grad_norm": 0.7790875438082882, + "learning_rate": 8.88765294771969e-06, + "loss": 0.264, + "step": 799 + }, + { + "epoch": 0.2671564534980798, + "grad_norm": 0.7937898709271423, + "learning_rate": 8.898776418242492e-06, + "loss": 0.2448, + "step": 800 + }, + { + "epoch": 0.2674903990649524, + "grad_norm": 0.7534922554712896, + "learning_rate": 8.909899888765295e-06, + "loss": 0.2601, + "step": 801 + }, + { + "epoch": 0.26782434463182503, + "grad_norm": 0.709328797539281, + "learning_rate": 8.921023359288099e-06, + "loss": 0.2378, + "step": 802 + }, + { + "epoch": 0.26815829019869764, + "grad_norm": 0.6700322828506419, + "learning_rate": 8.932146829810902e-06, + "loss": 0.2377, + "step": 803 + }, + { + "epoch": 0.2684922357655702, + "grad_norm": 0.7397483140494081, + "learning_rate": 8.943270300333705e-06, + "loss": 0.2504, + "step": 804 + }, + { + "epoch": 0.2688261813324428, + "grad_norm": 0.6236769417814907, + "learning_rate": 8.954393770856507e-06, + "loss": 0.2407, + "step": 805 + }, + { + "epoch": 0.2691601268993154, + "grad_norm": 0.6903960276905347, + "learning_rate": 8.965517241379312e-06, + "loss": 0.2449, + "step": 806 + }, + { + "epoch": 0.269494072466188, + "grad_norm": 0.6618661572934079, + "learning_rate": 8.976640711902114e-06, + "loss": 0.2322, + "step": 807 + }, + { + "epoch": 0.2698280180330606, + "grad_norm": 0.6450814858368458, + "learning_rate": 8.987764182424917e-06, + "loss": 0.2527, + "step": 808 + }, + { + "epoch": 0.2701619635999332, + "grad_norm": 0.6732302794328731, + "learning_rate": 8.998887652947721e-06, + "loss": 0.2492, + "step": 809 + }, + { + "epoch": 0.2704959091668058, + "grad_norm": 0.7193330108510266, + "learning_rate": 9.010011123470524e-06, + "loss": 0.2521, + "step": 810 + }, + { + "epoch": 0.27082985473367843, + "grad_norm": 0.6638055458342996, + "learning_rate": 9.021134593993327e-06, + "loss": 0.2558, + "step": 811 + }, + { + "epoch": 0.271163800300551, + "grad_norm": 0.6932796625861776, + "learning_rate": 9.03225806451613e-06, + "loss": 0.2685, + "step": 812 + }, + { + "epoch": 0.2714977458674236, + "grad_norm": 0.669882215821802, + "learning_rate": 9.043381535038932e-06, + "loss": 0.2511, + "step": 813 + }, + { + "epoch": 0.2718316914342962, + "grad_norm": 0.738016797517266, + "learning_rate": 9.054505005561735e-06, + "loss": 0.2508, + "step": 814 + }, + { + "epoch": 0.2721656370011688, + "grad_norm": 0.6579473564630967, + "learning_rate": 9.06562847608454e-06, + "loss": 0.2428, + "step": 815 + }, + { + "epoch": 0.2724995825680414, + "grad_norm": 0.6601416633554115, + "learning_rate": 9.076751946607342e-06, + "loss": 0.2334, + "step": 816 + }, + { + "epoch": 0.272833528134914, + "grad_norm": 0.8681212873817967, + "learning_rate": 9.087875417130145e-06, + "loss": 0.2649, + "step": 817 + }, + { + "epoch": 0.2731674737017866, + "grad_norm": 0.6871613484359503, + "learning_rate": 9.09899888765295e-06, + "loss": 0.2488, + "step": 818 + }, + { + "epoch": 0.27350141926865923, + "grad_norm": 0.8565684394856233, + "learning_rate": 9.110122358175752e-06, + "loss": 0.2566, + "step": 819 + }, + { + "epoch": 0.27383536483553184, + "grad_norm": 0.8255322571537959, + "learning_rate": 9.121245828698555e-06, + "loss": 0.2493, + "step": 820 + }, + { + "epoch": 0.2741693104024044, + "grad_norm": 0.8229306366015684, + "learning_rate": 9.132369299221357e-06, + "loss": 0.252, + "step": 821 + }, + { + "epoch": 0.274503255969277, + "grad_norm": 0.8463175261291924, + "learning_rate": 9.143492769744162e-06, + "loss": 0.2567, + "step": 822 + }, + { + "epoch": 0.2748372015361496, + "grad_norm": 0.6728403338031879, + "learning_rate": 9.154616240266963e-06, + "loss": 0.2441, + "step": 823 + }, + { + "epoch": 0.2751711471030222, + "grad_norm": 0.8767027013992857, + "learning_rate": 9.165739710789767e-06, + "loss": 0.2473, + "step": 824 + }, + { + "epoch": 0.2755050926698948, + "grad_norm": 0.8201243130499303, + "learning_rate": 9.176863181312572e-06, + "loss": 0.2563, + "step": 825 + }, + { + "epoch": 0.2758390382367674, + "grad_norm": 0.6872232291309764, + "learning_rate": 9.187986651835373e-06, + "loss": 0.2485, + "step": 826 + }, + { + "epoch": 0.27617298380364, + "grad_norm": 0.6930371900328928, + "learning_rate": 9.199110122358177e-06, + "loss": 0.236, + "step": 827 + }, + { + "epoch": 0.27650692937051263, + "grad_norm": 0.7768765519692938, + "learning_rate": 9.21023359288098e-06, + "loss": 0.2268, + "step": 828 + }, + { + "epoch": 0.2768408749373852, + "grad_norm": 0.641650918415613, + "learning_rate": 9.221357063403782e-06, + "loss": 0.2458, + "step": 829 + }, + { + "epoch": 0.2771748205042578, + "grad_norm": 0.7847607928855562, + "learning_rate": 9.232480533926585e-06, + "loss": 0.2561, + "step": 830 + }, + { + "epoch": 0.2775087660711304, + "grad_norm": 0.7522962047924147, + "learning_rate": 9.24360400444939e-06, + "loss": 0.2462, + "step": 831 + }, + { + "epoch": 0.277842711638003, + "grad_norm": 0.7856412167874145, + "learning_rate": 9.254727474972192e-06, + "loss": 0.2453, + "step": 832 + }, + { + "epoch": 0.2781766572048756, + "grad_norm": 0.6379368244033592, + "learning_rate": 9.265850945494995e-06, + "loss": 0.2319, + "step": 833 + }, + { + "epoch": 0.2785106027717482, + "grad_norm": 0.6923372665425509, + "learning_rate": 9.2769744160178e-06, + "loss": 0.2521, + "step": 834 + }, + { + "epoch": 0.2788445483386208, + "grad_norm": 0.6285185522634985, + "learning_rate": 9.288097886540602e-06, + "loss": 0.2414, + "step": 835 + }, + { + "epoch": 0.27917849390549343, + "grad_norm": 0.7659134854937873, + "learning_rate": 9.299221357063405e-06, + "loss": 0.2617, + "step": 836 + }, + { + "epoch": 0.279512439472366, + "grad_norm": 0.651210591819511, + "learning_rate": 9.310344827586207e-06, + "loss": 0.2412, + "step": 837 + }, + { + "epoch": 0.2798463850392386, + "grad_norm": 0.604891742779754, + "learning_rate": 9.32146829810901e-06, + "loss": 0.2361, + "step": 838 + }, + { + "epoch": 0.2801803306061112, + "grad_norm": 0.6371826775462133, + "learning_rate": 9.332591768631813e-06, + "loss": 0.2512, + "step": 839 + }, + { + "epoch": 0.2805142761729838, + "grad_norm": 0.6340690502327005, + "learning_rate": 9.343715239154617e-06, + "loss": 0.2416, + "step": 840 + }, + { + "epoch": 0.2808482217398564, + "grad_norm": 0.7342239300077404, + "learning_rate": 9.35483870967742e-06, + "loss": 0.2448, + "step": 841 + }, + { + "epoch": 0.281182167306729, + "grad_norm": 0.6549609966808125, + "learning_rate": 9.365962180200223e-06, + "loss": 0.2458, + "step": 842 + }, + { + "epoch": 0.2815161128736016, + "grad_norm": 0.7299241942323713, + "learning_rate": 9.377085650723027e-06, + "loss": 0.2318, + "step": 843 + }, + { + "epoch": 0.2818500584404742, + "grad_norm": 0.6845739065601146, + "learning_rate": 9.38820912124583e-06, + "loss": 0.2428, + "step": 844 + }, + { + "epoch": 0.2821840040073468, + "grad_norm": 0.677968498084514, + "learning_rate": 9.399332591768633e-06, + "loss": 0.2594, + "step": 845 + }, + { + "epoch": 0.2825179495742194, + "grad_norm": 0.682427111871113, + "learning_rate": 9.410456062291435e-06, + "loss": 0.2504, + "step": 846 + }, + { + "epoch": 0.282851895141092, + "grad_norm": 0.6035000880972591, + "learning_rate": 9.42157953281424e-06, + "loss": 0.2399, + "step": 847 + }, + { + "epoch": 0.2831858407079646, + "grad_norm": 0.7419050323765086, + "learning_rate": 9.43270300333704e-06, + "loss": 0.2724, + "step": 848 + }, + { + "epoch": 0.2835197862748372, + "grad_norm": 0.6133376838559707, + "learning_rate": 9.443826473859845e-06, + "loss": 0.2289, + "step": 849 + }, + { + "epoch": 0.2838537318417098, + "grad_norm": 0.687952283763828, + "learning_rate": 9.45494994438265e-06, + "loss": 0.2427, + "step": 850 + }, + { + "epoch": 0.2841876774085824, + "grad_norm": 0.7028605452057839, + "learning_rate": 9.46607341490545e-06, + "loss": 0.2476, + "step": 851 + }, + { + "epoch": 0.284521622975455, + "grad_norm": 0.6623344875228663, + "learning_rate": 9.477196885428255e-06, + "loss": 0.2452, + "step": 852 + }, + { + "epoch": 0.2848555685423276, + "grad_norm": 0.6332298537339767, + "learning_rate": 9.488320355951058e-06, + "loss": 0.2399, + "step": 853 + }, + { + "epoch": 0.2851895141092002, + "grad_norm": 0.6645743882820965, + "learning_rate": 9.49944382647386e-06, + "loss": 0.2447, + "step": 854 + }, + { + "epoch": 0.2855234596760728, + "grad_norm": 0.6662259006288558, + "learning_rate": 9.510567296996663e-06, + "loss": 0.2296, + "step": 855 + }, + { + "epoch": 0.2858574052429454, + "grad_norm": 0.6919974903102527, + "learning_rate": 9.521690767519467e-06, + "loss": 0.2537, + "step": 856 + }, + { + "epoch": 0.286191350809818, + "grad_norm": 0.6454163080434254, + "learning_rate": 9.53281423804227e-06, + "loss": 0.2514, + "step": 857 + }, + { + "epoch": 0.2865252963766906, + "grad_norm": 0.6480489173178399, + "learning_rate": 9.543937708565073e-06, + "loss": 0.2422, + "step": 858 + }, + { + "epoch": 0.2868592419435632, + "grad_norm": 0.7353022665236786, + "learning_rate": 9.555061179087877e-06, + "loss": 0.2351, + "step": 859 + }, + { + "epoch": 0.2871931875104358, + "grad_norm": 0.661065913724645, + "learning_rate": 9.56618464961068e-06, + "loss": 0.2479, + "step": 860 + }, + { + "epoch": 0.28752713307730837, + "grad_norm": 0.6763453703327771, + "learning_rate": 9.577308120133483e-06, + "loss": 0.2226, + "step": 861 + }, + { + "epoch": 0.287861078644181, + "grad_norm": 0.6727786879376527, + "learning_rate": 9.588431590656285e-06, + "loss": 0.2548, + "step": 862 + }, + { + "epoch": 0.2881950242110536, + "grad_norm": 0.9197408709347626, + "learning_rate": 9.599555061179088e-06, + "loss": 0.2489, + "step": 863 + }, + { + "epoch": 0.2885289697779262, + "grad_norm": 0.6646129153644628, + "learning_rate": 9.61067853170189e-06, + "loss": 0.2541, + "step": 864 + }, + { + "epoch": 0.2888629153447988, + "grad_norm": 0.7687613388007959, + "learning_rate": 9.621802002224695e-06, + "loss": 0.2549, + "step": 865 + }, + { + "epoch": 0.2891968609116714, + "grad_norm": 0.663099474914373, + "learning_rate": 9.632925472747498e-06, + "loss": 0.255, + "step": 866 + }, + { + "epoch": 0.289530806478544, + "grad_norm": 0.6373838203654694, + "learning_rate": 9.6440489432703e-06, + "loss": 0.2274, + "step": 867 + }, + { + "epoch": 0.2898647520454166, + "grad_norm": 0.8958296860718516, + "learning_rate": 9.655172413793105e-06, + "loss": 0.2744, + "step": 868 + }, + { + "epoch": 0.29019869761228917, + "grad_norm": 0.6689521121922187, + "learning_rate": 9.666295884315908e-06, + "loss": 0.2486, + "step": 869 + }, + { + "epoch": 0.2905326431791618, + "grad_norm": 0.7051581520556306, + "learning_rate": 9.67741935483871e-06, + "loss": 0.2325, + "step": 870 + }, + { + "epoch": 0.2908665887460344, + "grad_norm": 0.6723499452278224, + "learning_rate": 9.688542825361513e-06, + "loss": 0.2494, + "step": 871 + }, + { + "epoch": 0.291200534312907, + "grad_norm": 0.7127882744746751, + "learning_rate": 9.699666295884318e-06, + "loss": 0.2492, + "step": 872 + }, + { + "epoch": 0.2915344798797796, + "grad_norm": 0.6707104286364339, + "learning_rate": 9.710789766407119e-06, + "loss": 0.251, + "step": 873 + }, + { + "epoch": 0.2918684254466522, + "grad_norm": 0.781136986166393, + "learning_rate": 9.721913236929923e-06, + "loss": 0.2484, + "step": 874 + }, + { + "epoch": 0.2922023710135248, + "grad_norm": 0.6755229208947209, + "learning_rate": 9.733036707452727e-06, + "loss": 0.2545, + "step": 875 + }, + { + "epoch": 0.2925363165803974, + "grad_norm": 0.663707156055788, + "learning_rate": 9.744160177975528e-06, + "loss": 0.235, + "step": 876 + }, + { + "epoch": 0.29287026214727, + "grad_norm": 0.6679337491435877, + "learning_rate": 9.755283648498333e-06, + "loss": 0.2409, + "step": 877 + }, + { + "epoch": 0.2932042077141426, + "grad_norm": 0.8073669301897611, + "learning_rate": 9.766407119021135e-06, + "loss": 0.2554, + "step": 878 + }, + { + "epoch": 0.2935381532810152, + "grad_norm": 0.7198972500299666, + "learning_rate": 9.777530589543938e-06, + "loss": 0.2601, + "step": 879 + }, + { + "epoch": 0.2938720988478878, + "grad_norm": 0.7486411039843818, + "learning_rate": 9.788654060066741e-06, + "loss": 0.2717, + "step": 880 + }, + { + "epoch": 0.2942060444147604, + "grad_norm": 0.7554055472380966, + "learning_rate": 9.799777530589545e-06, + "loss": 0.2536, + "step": 881 + }, + { + "epoch": 0.294539989981633, + "grad_norm": 0.6769414341242086, + "learning_rate": 9.810901001112348e-06, + "loss": 0.2459, + "step": 882 + }, + { + "epoch": 0.2948739355485056, + "grad_norm": 0.6873429312658788, + "learning_rate": 9.82202447163515e-06, + "loss": 0.2353, + "step": 883 + }, + { + "epoch": 0.2952078811153782, + "grad_norm": 0.7752597910074831, + "learning_rate": 9.833147942157955e-06, + "loss": 0.2535, + "step": 884 + }, + { + "epoch": 0.2955418266822508, + "grad_norm": 0.6943618227859522, + "learning_rate": 9.844271412680758e-06, + "loss": 0.2689, + "step": 885 + }, + { + "epoch": 0.29587577224912337, + "grad_norm": 0.6801838220701963, + "learning_rate": 9.85539488320356e-06, + "loss": 0.2509, + "step": 886 + }, + { + "epoch": 0.296209717815996, + "grad_norm": 0.7339744312012771, + "learning_rate": 9.866518353726363e-06, + "loss": 0.2506, + "step": 887 + }, + { + "epoch": 0.2965436633828686, + "grad_norm": 0.7467491221803867, + "learning_rate": 9.877641824249166e-06, + "loss": 0.2566, + "step": 888 + }, + { + "epoch": 0.2968776089497412, + "grad_norm": 0.6412331225573681, + "learning_rate": 9.888765294771969e-06, + "loss": 0.2474, + "step": 889 + }, + { + "epoch": 0.2972115545166138, + "grad_norm": 0.7697158729899859, + "learning_rate": 9.899888765294773e-06, + "loss": 0.247, + "step": 890 + }, + { + "epoch": 0.2975455000834864, + "grad_norm": 0.7524145415460668, + "learning_rate": 9.911012235817576e-06, + "loss": 0.2567, + "step": 891 + }, + { + "epoch": 0.297879445650359, + "grad_norm": 0.6734182049066624, + "learning_rate": 9.922135706340378e-06, + "loss": 0.2549, + "step": 892 + }, + { + "epoch": 0.2982133912172316, + "grad_norm": 0.7218829257352565, + "learning_rate": 9.933259176863183e-06, + "loss": 0.2406, + "step": 893 + }, + { + "epoch": 0.29854733678410417, + "grad_norm": 0.6704757464416423, + "learning_rate": 9.944382647385986e-06, + "loss": 0.2441, + "step": 894 + }, + { + "epoch": 0.2988812823509768, + "grad_norm": 0.7404615308151165, + "learning_rate": 9.955506117908788e-06, + "loss": 0.2708, + "step": 895 + }, + { + "epoch": 0.2992152279178494, + "grad_norm": 0.7370776669079746, + "learning_rate": 9.966629588431591e-06, + "loss": 0.2554, + "step": 896 + }, + { + "epoch": 0.299549173484722, + "grad_norm": 0.7453068037060375, + "learning_rate": 9.977753058954395e-06, + "loss": 0.2522, + "step": 897 + }, + { + "epoch": 0.2998831190515946, + "grad_norm": 0.7537765886662524, + "learning_rate": 9.988876529477196e-06, + "loss": 0.2532, + "step": 898 + }, + { + "epoch": 0.3002170646184672, + "grad_norm": 0.7151572032080934, + "learning_rate": 1e-05, + "loss": 0.2534, + "step": 899 + }, + { + "epoch": 0.3005510101853398, + "grad_norm": 0.714477700987338, + "learning_rate": 9.999999622345564e-06, + "loss": 0.2404, + "step": 900 + }, + { + "epoch": 0.3008849557522124, + "grad_norm": 0.77324691413999, + "learning_rate": 9.999998489382312e-06, + "loss": 0.2426, + "step": 901 + }, + { + "epoch": 0.30121890131908496, + "grad_norm": 0.6128610447991886, + "learning_rate": 9.999996601110414e-06, + "loss": 0.2368, + "step": 902 + }, + { + "epoch": 0.30155284688595757, + "grad_norm": 0.8103821519751393, + "learning_rate": 9.999993957530157e-06, + "loss": 0.2486, + "step": 903 + }, + { + "epoch": 0.3018867924528302, + "grad_norm": 0.7509864144087558, + "learning_rate": 9.999990558641939e-06, + "loss": 0.2538, + "step": 904 + }, + { + "epoch": 0.3022207380197028, + "grad_norm": 0.6455618999464463, + "learning_rate": 9.999986404446276e-06, + "loss": 0.2622, + "step": 905 + }, + { + "epoch": 0.3025546835865754, + "grad_norm": 0.7642363495501507, + "learning_rate": 9.999981494943791e-06, + "loss": 0.2579, + "step": 906 + }, + { + "epoch": 0.302888629153448, + "grad_norm": 0.7516753216991904, + "learning_rate": 9.99997583013523e-06, + "loss": 0.2634, + "step": 907 + }, + { + "epoch": 0.3032225747203206, + "grad_norm": 0.5702441441339939, + "learning_rate": 9.999969410021447e-06, + "loss": 0.2212, + "step": 908 + }, + { + "epoch": 0.3035565202871932, + "grad_norm": 0.7489930131914292, + "learning_rate": 9.999962234603412e-06, + "loss": 0.245, + "step": 909 + }, + { + "epoch": 0.30389046585406576, + "grad_norm": 0.6082830635429192, + "learning_rate": 9.99995430388221e-06, + "loss": 0.2325, + "step": 910 + }, + { + "epoch": 0.30422441142093837, + "grad_norm": 0.620352857608966, + "learning_rate": 9.999945617859034e-06, + "loss": 0.2396, + "step": 911 + }, + { + "epoch": 0.304558356987811, + "grad_norm": 0.7221173476692679, + "learning_rate": 9.999936176535203e-06, + "loss": 0.2305, + "step": 912 + }, + { + "epoch": 0.3048923025546836, + "grad_norm": 0.8190610193603814, + "learning_rate": 9.99992597991214e-06, + "loss": 0.2528, + "step": 913 + }, + { + "epoch": 0.3052262481215562, + "grad_norm": 0.6278371889811301, + "learning_rate": 9.999915027991384e-06, + "loss": 0.227, + "step": 914 + }, + { + "epoch": 0.3055601936884288, + "grad_norm": 0.6759595522501595, + "learning_rate": 9.999903320774593e-06, + "loss": 0.2519, + "step": 915 + }, + { + "epoch": 0.3058941392553014, + "grad_norm": 0.6733098529721205, + "learning_rate": 9.999890858263532e-06, + "loss": 0.2299, + "step": 916 + }, + { + "epoch": 0.306228084822174, + "grad_norm": 0.6706970692688264, + "learning_rate": 9.999877640460085e-06, + "loss": 0.248, + "step": 917 + }, + { + "epoch": 0.30656203038904656, + "grad_norm": 0.7033323944048314, + "learning_rate": 9.999863667366249e-06, + "loss": 0.2528, + "step": 918 + }, + { + "epoch": 0.30689597595591916, + "grad_norm": 0.7882000097369829, + "learning_rate": 9.999848938984135e-06, + "loss": 0.2462, + "step": 919 + }, + { + "epoch": 0.30722992152279177, + "grad_norm": 0.663692017284394, + "learning_rate": 9.999833455315966e-06, + "loss": 0.2401, + "step": 920 + }, + { + "epoch": 0.3075638670896644, + "grad_norm": 0.7406267967602063, + "learning_rate": 9.999817216364085e-06, + "loss": 0.2426, + "step": 921 + }, + { + "epoch": 0.307897812656537, + "grad_norm": 0.719538796129936, + "learning_rate": 9.99980022213094e-06, + "loss": 0.259, + "step": 922 + }, + { + "epoch": 0.3082317582234096, + "grad_norm": 0.7198059211802099, + "learning_rate": 9.999782472619102e-06, + "loss": 0.2356, + "step": 923 + }, + { + "epoch": 0.3085657037902822, + "grad_norm": 0.7748449854954533, + "learning_rate": 9.99976396783125e-06, + "loss": 0.2512, + "step": 924 + }, + { + "epoch": 0.3088996493571548, + "grad_norm": 0.6743289212320538, + "learning_rate": 9.999744707770182e-06, + "loss": 0.2295, + "step": 925 + }, + { + "epoch": 0.3092335949240274, + "grad_norm": 0.8585415752543676, + "learning_rate": 9.999724692438805e-06, + "loss": 0.2855, + "step": 926 + }, + { + "epoch": 0.30956754049089996, + "grad_norm": 0.6469196243149617, + "learning_rate": 9.999703921840143e-06, + "loss": 0.2527, + "step": 927 + }, + { + "epoch": 0.30990148605777257, + "grad_norm": 0.6798611050791047, + "learning_rate": 9.999682395977334e-06, + "loss": 0.2469, + "step": 928 + }, + { + "epoch": 0.3102354316246452, + "grad_norm": 0.7112677389326599, + "learning_rate": 9.999660114853631e-06, + "loss": 0.243, + "step": 929 + }, + { + "epoch": 0.3105693771915178, + "grad_norm": 0.5521283993542538, + "learning_rate": 9.999637078472398e-06, + "loss": 0.2228, + "step": 930 + }, + { + "epoch": 0.3109033227583904, + "grad_norm": 0.6084209475663657, + "learning_rate": 9.999613286837115e-06, + "loss": 0.2429, + "step": 931 + }, + { + "epoch": 0.311237268325263, + "grad_norm": 0.6229978352267757, + "learning_rate": 9.999588739951376e-06, + "loss": 0.2676, + "step": 932 + }, + { + "epoch": 0.3115712138921356, + "grad_norm": 0.6220393849687876, + "learning_rate": 9.99956343781889e-06, + "loss": 0.2396, + "step": 933 + }, + { + "epoch": 0.3119051594590082, + "grad_norm": 0.6661414030839978, + "learning_rate": 9.999537380443479e-06, + "loss": 0.2464, + "step": 934 + }, + { + "epoch": 0.31223910502588076, + "grad_norm": 0.657272661420604, + "learning_rate": 9.999510567829079e-06, + "loss": 0.2318, + "step": 935 + }, + { + "epoch": 0.31257305059275337, + "grad_norm": 0.6663711093096606, + "learning_rate": 9.999482999979739e-06, + "loss": 0.2738, + "step": 936 + }, + { + "epoch": 0.31290699615962597, + "grad_norm": 1.3839502383145643, + "learning_rate": 9.999454676899628e-06, + "loss": 0.2663, + "step": 937 + }, + { + "epoch": 0.3132409417264986, + "grad_norm": 0.7046860437125939, + "learning_rate": 9.999425598593018e-06, + "loss": 0.2601, + "step": 938 + }, + { + "epoch": 0.3135748872933712, + "grad_norm": 0.7271393767781734, + "learning_rate": 9.999395765064308e-06, + "loss": 0.2669, + "step": 939 + }, + { + "epoch": 0.3139088328602438, + "grad_norm": 0.6277865590931718, + "learning_rate": 9.999365176318e-06, + "loss": 0.24, + "step": 940 + }, + { + "epoch": 0.3142427784271164, + "grad_norm": 0.7623469016255324, + "learning_rate": 9.999333832358716e-06, + "loss": 0.2474, + "step": 941 + }, + { + "epoch": 0.314576723993989, + "grad_norm": 0.6244292989040466, + "learning_rate": 9.999301733191193e-06, + "loss": 0.2476, + "step": 942 + }, + { + "epoch": 0.31491066956086156, + "grad_norm": 0.6679513544214297, + "learning_rate": 9.999268878820278e-06, + "loss": 0.2566, + "step": 943 + }, + { + "epoch": 0.31524461512773416, + "grad_norm": 0.9118400082357937, + "learning_rate": 9.999235269250933e-06, + "loss": 0.2472, + "step": 944 + }, + { + "epoch": 0.31557856069460677, + "grad_norm": 0.5506843638558178, + "learning_rate": 9.999200904488238e-06, + "loss": 0.2239, + "step": 945 + }, + { + "epoch": 0.3159125062614794, + "grad_norm": 0.6977183679119197, + "learning_rate": 9.999165784537381e-06, + "loss": 0.2522, + "step": 946 + }, + { + "epoch": 0.316246451828352, + "grad_norm": 0.6229267335231975, + "learning_rate": 9.999129909403671e-06, + "loss": 0.2385, + "step": 947 + }, + { + "epoch": 0.3165803973952246, + "grad_norm": 0.6346107999834576, + "learning_rate": 9.999093279092524e-06, + "loss": 0.2464, + "step": 948 + }, + { + "epoch": 0.3169143429620972, + "grad_norm": 0.7147362148938331, + "learning_rate": 9.999055893609475e-06, + "loss": 0.2519, + "step": 949 + }, + { + "epoch": 0.3172482885289698, + "grad_norm": 0.681378538809813, + "learning_rate": 9.999017752960172e-06, + "loss": 0.2497, + "step": 950 + }, + { + "epoch": 0.31758223409584235, + "grad_norm": 0.7257313818749964, + "learning_rate": 9.998978857150375e-06, + "loss": 0.2586, + "step": 951 + }, + { + "epoch": 0.31791617966271496, + "grad_norm": 0.6552438164802247, + "learning_rate": 9.99893920618596e-06, + "loss": 0.2486, + "step": 952 + }, + { + "epoch": 0.31825012522958757, + "grad_norm": 0.7499319030931288, + "learning_rate": 9.998898800072919e-06, + "loss": 0.2519, + "step": 953 + }, + { + "epoch": 0.3185840707964602, + "grad_norm": 0.623600497254021, + "learning_rate": 9.998857638817354e-06, + "loss": 0.2435, + "step": 954 + }, + { + "epoch": 0.3189180163633328, + "grad_norm": 0.7248155860321855, + "learning_rate": 9.99881572242548e-06, + "loss": 0.2376, + "step": 955 + }, + { + "epoch": 0.3192519619302054, + "grad_norm": 0.5991586377037108, + "learning_rate": 9.998773050903637e-06, + "loss": 0.2376, + "step": 956 + }, + { + "epoch": 0.319585907497078, + "grad_norm": 0.7346806017257687, + "learning_rate": 9.998729624258262e-06, + "loss": 0.2633, + "step": 957 + }, + { + "epoch": 0.3199198530639506, + "grad_norm": 0.563428634742872, + "learning_rate": 9.998685442495921e-06, + "loss": 0.2382, + "step": 958 + }, + { + "epoch": 0.32025379863082315, + "grad_norm": 0.6736707339352784, + "learning_rate": 9.998640505623284e-06, + "loss": 0.2523, + "step": 959 + }, + { + "epoch": 0.32058774419769576, + "grad_norm": 0.6334538052099089, + "learning_rate": 9.998594813647145e-06, + "loss": 0.2382, + "step": 960 + }, + { + "epoch": 0.32092168976456836, + "grad_norm": 0.6355713277943253, + "learning_rate": 9.998548366574401e-06, + "loss": 0.245, + "step": 961 + }, + { + "epoch": 0.32125563533144097, + "grad_norm": 0.5993057051640902, + "learning_rate": 9.99850116441207e-06, + "loss": 0.2418, + "step": 962 + }, + { + "epoch": 0.3215895808983136, + "grad_norm": 0.5920496868805775, + "learning_rate": 9.998453207167282e-06, + "loss": 0.2565, + "step": 963 + }, + { + "epoch": 0.3219235264651862, + "grad_norm": 0.61198413975063, + "learning_rate": 9.998404494847285e-06, + "loss": 0.2585, + "step": 964 + }, + { + "epoch": 0.3222574720320588, + "grad_norm": 0.6369440770274445, + "learning_rate": 9.998355027459432e-06, + "loss": 0.2706, + "step": 965 + }, + { + "epoch": 0.3225914175989314, + "grad_norm": 0.5872473237923148, + "learning_rate": 9.998304805011199e-06, + "loss": 0.2486, + "step": 966 + }, + { + "epoch": 0.32292536316580395, + "grad_norm": 0.620263995911841, + "learning_rate": 9.998253827510173e-06, + "loss": 0.2547, + "step": 967 + }, + { + "epoch": 0.32325930873267655, + "grad_norm": 0.7511215575831635, + "learning_rate": 9.998202094964053e-06, + "loss": 0.2556, + "step": 968 + }, + { + "epoch": 0.32359325429954916, + "grad_norm": 0.6305888434992816, + "learning_rate": 9.998149607380654e-06, + "loss": 0.2356, + "step": 969 + }, + { + "epoch": 0.32392719986642177, + "grad_norm": 0.6474250636445457, + "learning_rate": 9.998096364767906e-06, + "loss": 0.2409, + "step": 970 + }, + { + "epoch": 0.3242611454332944, + "grad_norm": 0.6404737722389929, + "learning_rate": 9.998042367133854e-06, + "loss": 0.2418, + "step": 971 + }, + { + "epoch": 0.324595091000167, + "grad_norm": 0.6255070104433448, + "learning_rate": 9.997987614486648e-06, + "loss": 0.2421, + "step": 972 + }, + { + "epoch": 0.3249290365670396, + "grad_norm": 0.5688015945545797, + "learning_rate": 9.997932106834567e-06, + "loss": 0.2349, + "step": 973 + }, + { + "epoch": 0.3252629821339122, + "grad_norm": 0.6829656370856075, + "learning_rate": 9.997875844185991e-06, + "loss": 0.2501, + "step": 974 + }, + { + "epoch": 0.3255969277007848, + "grad_norm": 0.6365251336709236, + "learning_rate": 9.99781882654942e-06, + "loss": 0.2408, + "step": 975 + }, + { + "epoch": 0.32593087326765735, + "grad_norm": 0.6327216943145679, + "learning_rate": 9.997761053933469e-06, + "loss": 0.2532, + "step": 976 + }, + { + "epoch": 0.32626481883452996, + "grad_norm": 0.5741706691051712, + "learning_rate": 9.997702526346864e-06, + "loss": 0.2517, + "step": 977 + }, + { + "epoch": 0.32659876440140256, + "grad_norm": 0.634532616958001, + "learning_rate": 9.997643243798446e-06, + "loss": 0.2561, + "step": 978 + }, + { + "epoch": 0.32693270996827517, + "grad_norm": 0.6627572201424764, + "learning_rate": 9.99758320629717e-06, + "loss": 0.2212, + "step": 979 + }, + { + "epoch": 0.3272666555351478, + "grad_norm": 0.6562948613513276, + "learning_rate": 9.997522413852108e-06, + "loss": 0.2503, + "step": 980 + }, + { + "epoch": 0.3276006011020204, + "grad_norm": 0.7777578566787204, + "learning_rate": 9.997460866472439e-06, + "loss": 0.2431, + "step": 981 + }, + { + "epoch": 0.327934546668893, + "grad_norm": 0.7143909349751347, + "learning_rate": 9.997398564167465e-06, + "loss": 0.261, + "step": 982 + }, + { + "epoch": 0.3282684922357656, + "grad_norm": 0.6511654646356778, + "learning_rate": 9.997335506946596e-06, + "loss": 0.2605, + "step": 983 + }, + { + "epoch": 0.32860243780263815, + "grad_norm": 0.8594408758077905, + "learning_rate": 9.997271694819354e-06, + "loss": 0.2563, + "step": 984 + }, + { + "epoch": 0.32893638336951075, + "grad_norm": 0.630895825476141, + "learning_rate": 9.997207127795383e-06, + "loss": 0.2357, + "step": 985 + }, + { + "epoch": 0.32927032893638336, + "grad_norm": 0.6193761330050703, + "learning_rate": 9.997141805884436e-06, + "loss": 0.2395, + "step": 986 + }, + { + "epoch": 0.32960427450325597, + "grad_norm": 0.611311370706388, + "learning_rate": 9.997075729096379e-06, + "loss": 0.2506, + "step": 987 + }, + { + "epoch": 0.3299382200701286, + "grad_norm": 0.7033542886635047, + "learning_rate": 9.997008897441194e-06, + "loss": 0.2526, + "step": 988 + }, + { + "epoch": 0.3302721656370012, + "grad_norm": 0.6230286822204218, + "learning_rate": 9.996941310928978e-06, + "loss": 0.2395, + "step": 989 + }, + { + "epoch": 0.3306061112038738, + "grad_norm": 0.6362704854163252, + "learning_rate": 9.99687296956994e-06, + "loss": 0.2378, + "step": 990 + }, + { + "epoch": 0.3309400567707464, + "grad_norm": 0.5800914414032446, + "learning_rate": 9.996803873374402e-06, + "loss": 0.2254, + "step": 991 + }, + { + "epoch": 0.33127400233761894, + "grad_norm": 0.7204698593726746, + "learning_rate": 9.996734022352805e-06, + "loss": 0.2527, + "step": 992 + }, + { + "epoch": 0.33160794790449155, + "grad_norm": 0.634627877169416, + "learning_rate": 9.9966634165157e-06, + "loss": 0.2449, + "step": 993 + }, + { + "epoch": 0.33194189347136416, + "grad_norm": 0.6111123727859595, + "learning_rate": 9.99659205587375e-06, + "loss": 0.2474, + "step": 994 + }, + { + "epoch": 0.33227583903823676, + "grad_norm": 0.6755862324184183, + "learning_rate": 9.996519940437737e-06, + "loss": 0.2505, + "step": 995 + }, + { + "epoch": 0.33260978460510937, + "grad_norm": 0.6460412308879759, + "learning_rate": 9.996447070218557e-06, + "loss": 0.2473, + "step": 996 + }, + { + "epoch": 0.332943730171982, + "grad_norm": 0.6346454236316136, + "learning_rate": 9.996373445227215e-06, + "loss": 0.239, + "step": 997 + }, + { + "epoch": 0.3332776757388546, + "grad_norm": 0.6415377886338729, + "learning_rate": 9.996299065474832e-06, + "loss": 0.2445, + "step": 998 + }, + { + "epoch": 0.3336116213057272, + "grad_norm": 0.6555194675483528, + "learning_rate": 9.996223930972649e-06, + "loss": 0.2401, + "step": 999 + }, + { + "epoch": 0.33394556687259974, + "grad_norm": 0.5713457684437109, + "learning_rate": 9.99614804173201e-06, + "loss": 0.2244, + "step": 1000 + }, + { + "epoch": 0.33427951243947235, + "grad_norm": 0.5893092973518516, + "learning_rate": 9.996071397764381e-06, + "loss": 0.2602, + "step": 1001 + }, + { + "epoch": 0.33461345800634495, + "grad_norm": 0.5906004788656943, + "learning_rate": 9.995993999081343e-06, + "loss": 0.2298, + "step": 1002 + }, + { + "epoch": 0.33494740357321756, + "grad_norm": 0.6462480077770343, + "learning_rate": 9.995915845694584e-06, + "loss": 0.2573, + "step": 1003 + }, + { + "epoch": 0.33528134914009017, + "grad_norm": 0.9424314763458541, + "learning_rate": 9.995836937615913e-06, + "loss": 0.2275, + "step": 1004 + }, + { + "epoch": 0.3356152947069628, + "grad_norm": 0.5975148522005419, + "learning_rate": 9.995757274857246e-06, + "loss": 0.252, + "step": 1005 + }, + { + "epoch": 0.3359492402738354, + "grad_norm": 0.6746114929233912, + "learning_rate": 9.995676857430621e-06, + "loss": 0.2296, + "step": 1006 + }, + { + "epoch": 0.336283185840708, + "grad_norm": 0.6043959986784809, + "learning_rate": 9.995595685348186e-06, + "loss": 0.2418, + "step": 1007 + }, + { + "epoch": 0.33661713140758054, + "grad_norm": 0.6561996117107064, + "learning_rate": 9.995513758622198e-06, + "loss": 0.2411, + "step": 1008 + }, + { + "epoch": 0.33695107697445315, + "grad_norm": 0.5768734030397459, + "learning_rate": 9.995431077265038e-06, + "loss": 0.2397, + "step": 1009 + }, + { + "epoch": 0.33728502254132575, + "grad_norm": 0.5644019192557386, + "learning_rate": 9.995347641289194e-06, + "loss": 0.2308, + "step": 1010 + }, + { + "epoch": 0.33761896810819836, + "grad_norm": 0.7155920321425914, + "learning_rate": 9.995263450707273e-06, + "loss": 0.2483, + "step": 1011 + }, + { + "epoch": 0.33795291367507097, + "grad_norm": 0.5820377404778101, + "learning_rate": 9.995178505531989e-06, + "loss": 0.2361, + "step": 1012 + }, + { + "epoch": 0.33828685924194357, + "grad_norm": 0.6074649479328393, + "learning_rate": 9.995092805776175e-06, + "loss": 0.2501, + "step": 1013 + }, + { + "epoch": 0.3386208048088162, + "grad_norm": 0.6808463616772454, + "learning_rate": 9.995006351452775e-06, + "loss": 0.2514, + "step": 1014 + }, + { + "epoch": 0.3389547503756888, + "grad_norm": 0.5357571508833467, + "learning_rate": 9.994919142574854e-06, + "loss": 0.2361, + "step": 1015 + }, + { + "epoch": 0.33928869594256134, + "grad_norm": 0.6020487192909576, + "learning_rate": 9.994831179155584e-06, + "loss": 0.2419, + "step": 1016 + }, + { + "epoch": 0.33962264150943394, + "grad_norm": 0.7283194863543118, + "learning_rate": 9.994742461208251e-06, + "loss": 0.2607, + "step": 1017 + }, + { + "epoch": 0.33995658707630655, + "grad_norm": 0.5782631325584849, + "learning_rate": 9.994652988746258e-06, + "loss": 0.238, + "step": 1018 + }, + { + "epoch": 0.34029053264317916, + "grad_norm": 0.7864013734784921, + "learning_rate": 9.994562761783122e-06, + "loss": 0.2497, + "step": 1019 + }, + { + "epoch": 0.34062447821005176, + "grad_norm": 0.6293240854325619, + "learning_rate": 9.99447178033247e-06, + "loss": 0.2423, + "step": 1020 + }, + { + "epoch": 0.34095842377692437, + "grad_norm": 0.6532327987330085, + "learning_rate": 9.99438004440805e-06, + "loss": 0.2511, + "step": 1021 + }, + { + "epoch": 0.341292369343797, + "grad_norm": 0.5993467036575328, + "learning_rate": 9.994287554023717e-06, + "loss": 0.2241, + "step": 1022 + }, + { + "epoch": 0.3416263149106696, + "grad_norm": 0.6048208268550589, + "learning_rate": 9.994194309193442e-06, + "loss": 0.2263, + "step": 1023 + }, + { + "epoch": 0.3419602604775422, + "grad_norm": 0.620928708879154, + "learning_rate": 9.99410030993131e-06, + "loss": 0.2333, + "step": 1024 + }, + { + "epoch": 0.34229420604441474, + "grad_norm": 0.5811126029360281, + "learning_rate": 9.994005556251525e-06, + "loss": 0.2369, + "step": 1025 + }, + { + "epoch": 0.34262815161128735, + "grad_norm": 0.6238825923972041, + "learning_rate": 9.993910048168399e-06, + "loss": 0.2273, + "step": 1026 + }, + { + "epoch": 0.34296209717815995, + "grad_norm": 0.6765552725580103, + "learning_rate": 9.993813785696355e-06, + "loss": 0.2254, + "step": 1027 + }, + { + "epoch": 0.34329604274503256, + "grad_norm": 0.6187685905117865, + "learning_rate": 9.993716768849942e-06, + "loss": 0.2408, + "step": 1028 + }, + { + "epoch": 0.34362998831190517, + "grad_norm": 0.6097073883952683, + "learning_rate": 9.99361899764381e-06, + "loss": 0.25, + "step": 1029 + }, + { + "epoch": 0.3439639338787778, + "grad_norm": 0.6545748380846143, + "learning_rate": 9.993520472092732e-06, + "loss": 0.2434, + "step": 1030 + }, + { + "epoch": 0.3442978794456504, + "grad_norm": 0.5477963426981712, + "learning_rate": 9.99342119221159e-06, + "loss": 0.2346, + "step": 1031 + }, + { + "epoch": 0.344631825012523, + "grad_norm": 0.5510391995222212, + "learning_rate": 9.993321158015379e-06, + "loss": 0.2305, + "step": 1032 + }, + { + "epoch": 0.34496577057939554, + "grad_norm": 0.6324909697234063, + "learning_rate": 9.993220369519215e-06, + "loss": 0.2467, + "step": 1033 + }, + { + "epoch": 0.34529971614626814, + "grad_norm": 0.6142047956027233, + "learning_rate": 9.99311882673832e-06, + "loss": 0.2333, + "step": 1034 + }, + { + "epoch": 0.34563366171314075, + "grad_norm": 0.6555126781240093, + "learning_rate": 9.993016529688033e-06, + "loss": 0.2383, + "step": 1035 + }, + { + "epoch": 0.34596760728001336, + "grad_norm": 0.6360668156570694, + "learning_rate": 9.99291347838381e-06, + "loss": 0.2418, + "step": 1036 + }, + { + "epoch": 0.34630155284688596, + "grad_norm": 0.6241515101912609, + "learning_rate": 9.992809672841218e-06, + "loss": 0.2362, + "step": 1037 + }, + { + "epoch": 0.34663549841375857, + "grad_norm": 0.6300889467982095, + "learning_rate": 9.992705113075933e-06, + "loss": 0.237, + "step": 1038 + }, + { + "epoch": 0.3469694439806312, + "grad_norm": 0.564235622274707, + "learning_rate": 9.992599799103754e-06, + "loss": 0.2354, + "step": 1039 + }, + { + "epoch": 0.3473033895475038, + "grad_norm": 0.6842460803144252, + "learning_rate": 9.99249373094059e-06, + "loss": 0.2416, + "step": 1040 + }, + { + "epoch": 0.34763733511437633, + "grad_norm": 0.7151896495749245, + "learning_rate": 9.992386908602466e-06, + "loss": 0.2572, + "step": 1041 + }, + { + "epoch": 0.34797128068124894, + "grad_norm": 0.5719922134860118, + "learning_rate": 9.992279332105512e-06, + "loss": 0.2325, + "step": 1042 + }, + { + "epoch": 0.34830522624812155, + "grad_norm": 0.7231166488003365, + "learning_rate": 9.992171001465985e-06, + "loss": 0.2549, + "step": 1043 + }, + { + "epoch": 0.34863917181499415, + "grad_norm": 0.592629753373637, + "learning_rate": 9.992061916700247e-06, + "loss": 0.235, + "step": 1044 + }, + { + "epoch": 0.34897311738186676, + "grad_norm": 0.6764193734512775, + "learning_rate": 9.991952077824776e-06, + "loss": 0.2413, + "step": 1045 + }, + { + "epoch": 0.34930706294873937, + "grad_norm": 0.7115915690879145, + "learning_rate": 9.991841484856166e-06, + "loss": 0.2521, + "step": 1046 + }, + { + "epoch": 0.349641008515612, + "grad_norm": 0.5797754264927091, + "learning_rate": 9.991730137811122e-06, + "loss": 0.2476, + "step": 1047 + }, + { + "epoch": 0.3499749540824846, + "grad_norm": 0.6078645313605627, + "learning_rate": 9.991618036706464e-06, + "loss": 0.2429, + "step": 1048 + }, + { + "epoch": 0.35030889964935713, + "grad_norm": 0.810806694264766, + "learning_rate": 9.99150518155913e-06, + "loss": 0.2475, + "step": 1049 + }, + { + "epoch": 0.35064284521622974, + "grad_norm": 0.7023437648349548, + "learning_rate": 9.991391572386162e-06, + "loss": 0.2565, + "step": 1050 + }, + { + "epoch": 0.35097679078310234, + "grad_norm": 0.7282652300633451, + "learning_rate": 9.991277209204728e-06, + "loss": 0.244, + "step": 1051 + }, + { + "epoch": 0.35131073634997495, + "grad_norm": 0.6241402490643777, + "learning_rate": 9.991162092032101e-06, + "loss": 0.2614, + "step": 1052 + }, + { + "epoch": 0.35164468191684756, + "grad_norm": 0.6258752658980473, + "learning_rate": 9.99104622088567e-06, + "loss": 0.2461, + "step": 1053 + }, + { + "epoch": 0.35197862748372016, + "grad_norm": 0.6658730176732505, + "learning_rate": 9.990929595782938e-06, + "loss": 0.2402, + "step": 1054 + }, + { + "epoch": 0.35231257305059277, + "grad_norm": 0.5923285109590797, + "learning_rate": 9.990812216741529e-06, + "loss": 0.2275, + "step": 1055 + }, + { + "epoch": 0.3526465186174654, + "grad_norm": 0.6257275221441978, + "learning_rate": 9.990694083779166e-06, + "loss": 0.2396, + "step": 1056 + }, + { + "epoch": 0.3529804641843379, + "grad_norm": 0.6557840979079784, + "learning_rate": 9.990575196913699e-06, + "loss": 0.2337, + "step": 1057 + }, + { + "epoch": 0.35331440975121053, + "grad_norm": 0.7708982420567367, + "learning_rate": 9.990455556163086e-06, + "loss": 0.251, + "step": 1058 + }, + { + "epoch": 0.35364835531808314, + "grad_norm": 0.7583771899024517, + "learning_rate": 9.990335161545401e-06, + "loss": 0.2584, + "step": 1059 + }, + { + "epoch": 0.35398230088495575, + "grad_norm": 0.6318386182926147, + "learning_rate": 9.99021401307883e-06, + "loss": 0.2519, + "step": 1060 + }, + { + "epoch": 0.35431624645182835, + "grad_norm": 0.769122863891848, + "learning_rate": 9.990092110781675e-06, + "loss": 0.2404, + "step": 1061 + }, + { + "epoch": 0.35465019201870096, + "grad_norm": 0.6258454284206358, + "learning_rate": 9.98996945467235e-06, + "loss": 0.2406, + "step": 1062 + }, + { + "epoch": 0.35498413758557357, + "grad_norm": 0.5571971449891453, + "learning_rate": 9.989846044769384e-06, + "loss": 0.2342, + "step": 1063 + }, + { + "epoch": 0.3553180831524462, + "grad_norm": 0.6702068647047834, + "learning_rate": 9.98972188109142e-06, + "loss": 0.2532, + "step": 1064 + }, + { + "epoch": 0.3556520287193187, + "grad_norm": 0.709957113892022, + "learning_rate": 9.989596963657213e-06, + "loss": 0.2501, + "step": 1065 + }, + { + "epoch": 0.35598597428619133, + "grad_norm": 0.7180875116372593, + "learning_rate": 9.989471292485636e-06, + "loss": 0.2767, + "step": 1066 + }, + { + "epoch": 0.35631991985306394, + "grad_norm": 0.6068909083103505, + "learning_rate": 9.989344867595668e-06, + "loss": 0.2425, + "step": 1067 + }, + { + "epoch": 0.35665386541993654, + "grad_norm": 0.7137465058277483, + "learning_rate": 9.989217689006412e-06, + "loss": 0.2531, + "step": 1068 + }, + { + "epoch": 0.35698781098680915, + "grad_norm": 0.6022049439905831, + "learning_rate": 9.989089756737077e-06, + "loss": 0.2427, + "step": 1069 + }, + { + "epoch": 0.35732175655368176, + "grad_norm": 0.6083150772695914, + "learning_rate": 9.988961070806991e-06, + "loss": 0.248, + "step": 1070 + }, + { + "epoch": 0.35765570212055436, + "grad_norm": 0.6471745928158773, + "learning_rate": 9.988831631235591e-06, + "loss": 0.238, + "step": 1071 + }, + { + "epoch": 0.35798964768742697, + "grad_norm": 0.5615061566854802, + "learning_rate": 9.98870143804243e-06, + "loss": 0.2149, + "step": 1072 + }, + { + "epoch": 0.3583235932542995, + "grad_norm": 0.6199663578980865, + "learning_rate": 9.988570491247179e-06, + "loss": 0.2493, + "step": 1073 + }, + { + "epoch": 0.35865753882117213, + "grad_norm": 0.6728065943159997, + "learning_rate": 9.988438790869616e-06, + "loss": 0.2547, + "step": 1074 + }, + { + "epoch": 0.35899148438804473, + "grad_norm": 0.7194963403788357, + "learning_rate": 9.988306336929637e-06, + "loss": 0.2482, + "step": 1075 + }, + { + "epoch": 0.35932542995491734, + "grad_norm": 0.6633897906991787, + "learning_rate": 9.988173129447251e-06, + "loss": 0.257, + "step": 1076 + }, + { + "epoch": 0.35965937552178995, + "grad_norm": 0.6374583029418619, + "learning_rate": 9.98803916844258e-06, + "loss": 0.2498, + "step": 1077 + }, + { + "epoch": 0.35999332108866255, + "grad_norm": 0.5644829103003917, + "learning_rate": 9.98790445393586e-06, + "loss": 0.2219, + "step": 1078 + }, + { + "epoch": 0.36032726665553516, + "grad_norm": 0.7465815493447656, + "learning_rate": 9.98776898594744e-06, + "loss": 0.2516, + "step": 1079 + }, + { + "epoch": 0.36066121222240777, + "grad_norm": 0.6137141055718743, + "learning_rate": 9.987632764497787e-06, + "loss": 0.2307, + "step": 1080 + }, + { + "epoch": 0.3609951577892804, + "grad_norm": 0.6606373030819324, + "learning_rate": 9.987495789607478e-06, + "loss": 0.2372, + "step": 1081 + }, + { + "epoch": 0.3613291033561529, + "grad_norm": 0.6144078594991942, + "learning_rate": 9.987358061297203e-06, + "loss": 0.2552, + "step": 1082 + }, + { + "epoch": 0.36166304892302553, + "grad_norm": 0.6034573546156843, + "learning_rate": 9.987219579587768e-06, + "loss": 0.2271, + "step": 1083 + }, + { + "epoch": 0.36199699448989814, + "grad_norm": 0.6365200936320912, + "learning_rate": 9.987080344500094e-06, + "loss": 0.2305, + "step": 1084 + }, + { + "epoch": 0.36233094005677075, + "grad_norm": 0.6381373323927708, + "learning_rate": 9.986940356055212e-06, + "loss": 0.2371, + "step": 1085 + }, + { + "epoch": 0.36266488562364335, + "grad_norm": 0.612493000628613, + "learning_rate": 9.986799614274271e-06, + "loss": 0.238, + "step": 1086 + }, + { + "epoch": 0.36299883119051596, + "grad_norm": 0.6157534327216453, + "learning_rate": 9.986658119178532e-06, + "loss": 0.2597, + "step": 1087 + }, + { + "epoch": 0.36333277675738856, + "grad_norm": 0.6729028218692379, + "learning_rate": 9.986515870789366e-06, + "loss": 0.2415, + "step": 1088 + }, + { + "epoch": 0.36366672232426117, + "grad_norm": 0.6283943072436228, + "learning_rate": 9.986372869128264e-06, + "loss": 0.2563, + "step": 1089 + }, + { + "epoch": 0.3640006678911337, + "grad_norm": 0.6576049042218921, + "learning_rate": 9.986229114216828e-06, + "loss": 0.2427, + "step": 1090 + }, + { + "epoch": 0.36433461345800633, + "grad_norm": 0.5841024132825844, + "learning_rate": 9.986084606076772e-06, + "loss": 0.242, + "step": 1091 + }, + { + "epoch": 0.36466855902487894, + "grad_norm": 0.5877810096206579, + "learning_rate": 9.985939344729926e-06, + "loss": 0.2214, + "step": 1092 + }, + { + "epoch": 0.36500250459175154, + "grad_norm": 0.590923371403508, + "learning_rate": 9.985793330198237e-06, + "loss": 0.2376, + "step": 1093 + }, + { + "epoch": 0.36533645015862415, + "grad_norm": 0.5462432642092312, + "learning_rate": 9.98564656250376e-06, + "loss": 0.2239, + "step": 1094 + }, + { + "epoch": 0.36567039572549676, + "grad_norm": 0.649410508856008, + "learning_rate": 9.985499041668664e-06, + "loss": 0.2589, + "step": 1095 + }, + { + "epoch": 0.36600434129236936, + "grad_norm": 0.5597787325842369, + "learning_rate": 9.985350767715236e-06, + "loss": 0.2265, + "step": 1096 + }, + { + "epoch": 0.36633828685924197, + "grad_norm": 0.5506515525737525, + "learning_rate": 9.985201740665873e-06, + "loss": 0.2254, + "step": 1097 + }, + { + "epoch": 0.3666722324261145, + "grad_norm": 0.599236470388504, + "learning_rate": 9.98505196054309e-06, + "loss": 0.2164, + "step": 1098 + }, + { + "epoch": 0.3670061779929871, + "grad_norm": 0.6214018418575783, + "learning_rate": 9.98490142736951e-06, + "loss": 0.258, + "step": 1099 + }, + { + "epoch": 0.36734012355985973, + "grad_norm": 0.6032134090942005, + "learning_rate": 9.984750141167874e-06, + "loss": 0.2412, + "step": 1100 + }, + { + "epoch": 0.36767406912673234, + "grad_norm": 0.6180438639636011, + "learning_rate": 9.984598101961036e-06, + "loss": 0.2426, + "step": 1101 + }, + { + "epoch": 0.36800801469360495, + "grad_norm": 0.9315847932687742, + "learning_rate": 9.984445309771963e-06, + "loss": 0.2613, + "step": 1102 + }, + { + "epoch": 0.36834196026047755, + "grad_norm": 0.5753009194815258, + "learning_rate": 9.984291764623735e-06, + "loss": 0.2298, + "step": 1103 + }, + { + "epoch": 0.36867590582735016, + "grad_norm": 0.7200885040091543, + "learning_rate": 9.98413746653955e-06, + "loss": 0.2487, + "step": 1104 + }, + { + "epoch": 0.36900985139422277, + "grad_norm": 0.6844842815753607, + "learning_rate": 9.983982415542713e-06, + "loss": 0.2229, + "step": 1105 + }, + { + "epoch": 0.3693437969610953, + "grad_norm": 0.610318931691899, + "learning_rate": 9.983826611656649e-06, + "loss": 0.2417, + "step": 1106 + }, + { + "epoch": 0.3696777425279679, + "grad_norm": 0.6735146550314316, + "learning_rate": 9.983670054904891e-06, + "loss": 0.2455, + "step": 1107 + }, + { + "epoch": 0.37001168809484053, + "grad_norm": 0.6404413794547794, + "learning_rate": 9.98351274531109e-06, + "loss": 0.2393, + "step": 1108 + }, + { + "epoch": 0.37034563366171314, + "grad_norm": 0.590966335203484, + "learning_rate": 9.983354682899012e-06, + "loss": 0.2316, + "step": 1109 + }, + { + "epoch": 0.37067957922858574, + "grad_norm": 0.6989245308286045, + "learning_rate": 9.98319586769253e-06, + "loss": 0.2408, + "step": 1110 + }, + { + "epoch": 0.37101352479545835, + "grad_norm": 0.641463921195796, + "learning_rate": 9.983036299715637e-06, + "loss": 0.2358, + "step": 1111 + }, + { + "epoch": 0.37134747036233096, + "grad_norm": 0.7275199738126983, + "learning_rate": 9.98287597899244e-06, + "loss": 0.2494, + "step": 1112 + }, + { + "epoch": 0.37168141592920356, + "grad_norm": 0.6809370125852048, + "learning_rate": 9.982714905547152e-06, + "loss": 0.2322, + "step": 1113 + }, + { + "epoch": 0.3720153614960761, + "grad_norm": 0.6312324387169239, + "learning_rate": 9.982553079404109e-06, + "loss": 0.2458, + "step": 1114 + }, + { + "epoch": 0.3723493070629487, + "grad_norm": 0.5856517142062221, + "learning_rate": 9.982390500587755e-06, + "loss": 0.2344, + "step": 1115 + }, + { + "epoch": 0.3726832526298213, + "grad_norm": 0.6576879620189332, + "learning_rate": 9.982227169122652e-06, + "loss": 0.2406, + "step": 1116 + }, + { + "epoch": 0.37301719819669393, + "grad_norm": 0.6109211179006137, + "learning_rate": 9.98206308503347e-06, + "loss": 0.2407, + "step": 1117 + }, + { + "epoch": 0.37335114376356654, + "grad_norm": 0.5360857677658837, + "learning_rate": 9.981898248344996e-06, + "loss": 0.2291, + "step": 1118 + }, + { + "epoch": 0.37368508933043915, + "grad_norm": 0.6350541721035942, + "learning_rate": 9.981732659082136e-06, + "loss": 0.2351, + "step": 1119 + }, + { + "epoch": 0.37401903489731175, + "grad_norm": 0.5564505418095035, + "learning_rate": 9.981566317269895e-06, + "loss": 0.2199, + "step": 1120 + }, + { + "epoch": 0.37435298046418436, + "grad_norm": 0.5869314057324901, + "learning_rate": 9.981399222933408e-06, + "loss": 0.2459, + "step": 1121 + }, + { + "epoch": 0.3746869260310569, + "grad_norm": 0.7657771256449448, + "learning_rate": 9.981231376097914e-06, + "loss": 0.2407, + "step": 1122 + }, + { + "epoch": 0.3750208715979295, + "grad_norm": 0.5795690536586283, + "learning_rate": 9.981062776788769e-06, + "loss": 0.2347, + "step": 1123 + }, + { + "epoch": 0.3753548171648021, + "grad_norm": 0.5891710443110899, + "learning_rate": 9.98089342503144e-06, + "loss": 0.2382, + "step": 1124 + }, + { + "epoch": 0.37568876273167473, + "grad_norm": 0.621199984594368, + "learning_rate": 9.980723320851512e-06, + "loss": 0.2347, + "step": 1125 + }, + { + "epoch": 0.37602270829854734, + "grad_norm": 0.6517502567095506, + "learning_rate": 9.98055246427468e-06, + "loss": 0.2462, + "step": 1126 + }, + { + "epoch": 0.37635665386541994, + "grad_norm": 0.6881884763822493, + "learning_rate": 9.980380855326754e-06, + "loss": 0.2504, + "step": 1127 + }, + { + "epoch": 0.37669059943229255, + "grad_norm": 0.5745203456313879, + "learning_rate": 9.980208494033659e-06, + "loss": 0.2404, + "step": 1128 + }, + { + "epoch": 0.37702454499916516, + "grad_norm": 0.6050555371100264, + "learning_rate": 9.98003538042143e-06, + "loss": 0.2531, + "step": 1129 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 0.9254994222499924, + "learning_rate": 9.979861514516217e-06, + "loss": 0.2427, + "step": 1130 + }, + { + "epoch": 0.3776924361329103, + "grad_norm": 0.6004163946203853, + "learning_rate": 9.979686896344289e-06, + "loss": 0.2199, + "step": 1131 + }, + { + "epoch": 0.3780263816997829, + "grad_norm": 0.6102051176431205, + "learning_rate": 9.97951152593202e-06, + "loss": 0.2514, + "step": 1132 + }, + { + "epoch": 0.3783603272666555, + "grad_norm": 0.6680363132138403, + "learning_rate": 9.979335403305904e-06, + "loss": 0.2491, + "step": 1133 + }, + { + "epoch": 0.37869427283352813, + "grad_norm": 0.6372805133933099, + "learning_rate": 9.979158528492546e-06, + "loss": 0.2553, + "step": 1134 + }, + { + "epoch": 0.37902821840040074, + "grad_norm": 0.7266606127441678, + "learning_rate": 9.978980901518663e-06, + "loss": 0.2435, + "step": 1135 + }, + { + "epoch": 0.37936216396727335, + "grad_norm": 0.5668609743092723, + "learning_rate": 9.978802522411091e-06, + "loss": 0.2361, + "step": 1136 + }, + { + "epoch": 0.37969610953414595, + "grad_norm": 0.6140023762446175, + "learning_rate": 9.978623391196774e-06, + "loss": 0.2581, + "step": 1137 + }, + { + "epoch": 0.38003005510101856, + "grad_norm": 0.6739052795637623, + "learning_rate": 9.978443507902772e-06, + "loss": 0.2446, + "step": 1138 + }, + { + "epoch": 0.3803640006678911, + "grad_norm": 0.5882555204439429, + "learning_rate": 9.978262872556257e-06, + "loss": 0.2407, + "step": 1139 + }, + { + "epoch": 0.3806979462347637, + "grad_norm": 0.7887496587693288, + "learning_rate": 9.97808148518452e-06, + "loss": 0.2564, + "step": 1140 + }, + { + "epoch": 0.3810318918016363, + "grad_norm": 0.7154651655569723, + "learning_rate": 9.977899345814959e-06, + "loss": 0.2677, + "step": 1141 + }, + { + "epoch": 0.38136583736850893, + "grad_norm": 0.5884603812414391, + "learning_rate": 9.977716454475089e-06, + "loss": 0.243, + "step": 1142 + }, + { + "epoch": 0.38169978293538154, + "grad_norm": 0.5670137381925265, + "learning_rate": 9.977532811192539e-06, + "loss": 0.2321, + "step": 1143 + }, + { + "epoch": 0.38203372850225414, + "grad_norm": 0.6540687344399836, + "learning_rate": 9.977348415995048e-06, + "loss": 0.2382, + "step": 1144 + }, + { + "epoch": 0.38236767406912675, + "grad_norm": 0.6062895960141702, + "learning_rate": 9.977163268910472e-06, + "loss": 0.2467, + "step": 1145 + }, + { + "epoch": 0.38270161963599936, + "grad_norm": 0.6354013496136728, + "learning_rate": 9.976977369966781e-06, + "loss": 0.2568, + "step": 1146 + }, + { + "epoch": 0.3830355652028719, + "grad_norm": 0.6291640800274567, + "learning_rate": 9.976790719192055e-06, + "loss": 0.239, + "step": 1147 + }, + { + "epoch": 0.3833695107697445, + "grad_norm": 0.6401219097828764, + "learning_rate": 9.976603316614492e-06, + "loss": 0.2528, + "step": 1148 + }, + { + "epoch": 0.3837034563366171, + "grad_norm": 0.6010786738863266, + "learning_rate": 9.976415162262401e-06, + "loss": 0.2328, + "step": 1149 + }, + { + "epoch": 0.38403740190348973, + "grad_norm": 0.606105846014851, + "learning_rate": 9.976226256164204e-06, + "loss": 0.249, + "step": 1150 + }, + { + "epoch": 0.38437134747036233, + "grad_norm": 0.6021670703179877, + "learning_rate": 9.976036598348437e-06, + "loss": 0.2464, + "step": 1151 + }, + { + "epoch": 0.38470529303723494, + "grad_norm": 0.5488912740197421, + "learning_rate": 9.975846188843754e-06, + "loss": 0.2343, + "step": 1152 + }, + { + "epoch": 0.38503923860410755, + "grad_norm": 0.6489026795753373, + "learning_rate": 9.975655027678913e-06, + "loss": 0.2383, + "step": 1153 + }, + { + "epoch": 0.38537318417098015, + "grad_norm": 0.5352509454624307, + "learning_rate": 9.975463114882792e-06, + "loss": 0.2385, + "step": 1154 + }, + { + "epoch": 0.3857071297378527, + "grad_norm": 0.5557378389098141, + "learning_rate": 9.975270450484385e-06, + "loss": 0.2331, + "step": 1155 + }, + { + "epoch": 0.3860410753047253, + "grad_norm": 0.6592063946537283, + "learning_rate": 9.975077034512795e-06, + "loss": 0.228, + "step": 1156 + }, + { + "epoch": 0.3863750208715979, + "grad_norm": 0.6561305895606129, + "learning_rate": 9.97488286699724e-06, + "loss": 0.2507, + "step": 1157 + }, + { + "epoch": 0.3867089664384705, + "grad_norm": 0.5344971723442756, + "learning_rate": 9.974687947967047e-06, + "loss": 0.2347, + "step": 1158 + }, + { + "epoch": 0.38704291200534313, + "grad_norm": 0.5740634729457469, + "learning_rate": 9.974492277451668e-06, + "loss": 0.2402, + "step": 1159 + }, + { + "epoch": 0.38737685757221574, + "grad_norm": 0.6784228325087824, + "learning_rate": 9.974295855480658e-06, + "loss": 0.2468, + "step": 1160 + }, + { + "epoch": 0.38771080313908834, + "grad_norm": 0.5351257365785597, + "learning_rate": 9.974098682083687e-06, + "loss": 0.236, + "step": 1161 + }, + { + "epoch": 0.38804474870596095, + "grad_norm": 0.5153552023195942, + "learning_rate": 9.973900757290541e-06, + "loss": 0.2096, + "step": 1162 + }, + { + "epoch": 0.3883786942728335, + "grad_norm": 0.5687511104588763, + "learning_rate": 9.97370208113112e-06, + "loss": 0.2368, + "step": 1163 + }, + { + "epoch": 0.3887126398397061, + "grad_norm": 0.6394062683395763, + "learning_rate": 9.973502653635438e-06, + "loss": 0.2521, + "step": 1164 + }, + { + "epoch": 0.3890465854065787, + "grad_norm": 0.6513149391308338, + "learning_rate": 9.97330247483362e-06, + "loss": 0.2421, + "step": 1165 + }, + { + "epoch": 0.3893805309734513, + "grad_norm": 0.5407808834505893, + "learning_rate": 9.973101544755901e-06, + "loss": 0.2295, + "step": 1166 + }, + { + "epoch": 0.38971447654032393, + "grad_norm": 0.6398170225716239, + "learning_rate": 9.97289986343264e-06, + "loss": 0.2346, + "step": 1167 + }, + { + "epoch": 0.39004842210719654, + "grad_norm": 0.6043234167832073, + "learning_rate": 9.972697430894299e-06, + "loss": 0.2338, + "step": 1168 + }, + { + "epoch": 0.39038236767406914, + "grad_norm": 0.6333600734362572, + "learning_rate": 9.97249424717146e-06, + "loss": 0.2472, + "step": 1169 + }, + { + "epoch": 0.39071631324094175, + "grad_norm": 0.6017998882821272, + "learning_rate": 9.972290312294816e-06, + "loss": 0.2488, + "step": 1170 + }, + { + "epoch": 0.3910502588078143, + "grad_norm": 0.6311509148718895, + "learning_rate": 9.972085626295173e-06, + "loss": 0.2426, + "step": 1171 + }, + { + "epoch": 0.3913842043746869, + "grad_norm": 0.6141756863914073, + "learning_rate": 9.971880189203452e-06, + "loss": 0.2537, + "step": 1172 + }, + { + "epoch": 0.3917181499415595, + "grad_norm": 0.5793351876901642, + "learning_rate": 9.971674001050687e-06, + "loss": 0.2432, + "step": 1173 + }, + { + "epoch": 0.3920520955084321, + "grad_norm": 0.7048398913255026, + "learning_rate": 9.971467061868022e-06, + "loss": 0.2774, + "step": 1174 + }, + { + "epoch": 0.3923860410753047, + "grad_norm": 0.5894804335467615, + "learning_rate": 9.971259371686724e-06, + "loss": 0.2264, + "step": 1175 + }, + { + "epoch": 0.39271998664217733, + "grad_norm": 0.6305881343504768, + "learning_rate": 9.971050930538161e-06, + "loss": 0.2515, + "step": 1176 + }, + { + "epoch": 0.39305393220904994, + "grad_norm": 0.6169670939348186, + "learning_rate": 9.970841738453823e-06, + "loss": 0.2474, + "step": 1177 + }, + { + "epoch": 0.39338787777592255, + "grad_norm": 0.6203195420477706, + "learning_rate": 9.970631795465311e-06, + "loss": 0.2446, + "step": 1178 + }, + { + "epoch": 0.39372182334279515, + "grad_norm": 0.5285532423016363, + "learning_rate": 9.970421101604339e-06, + "loss": 0.2168, + "step": 1179 + }, + { + "epoch": 0.3940557689096677, + "grad_norm": 0.6175748683107137, + "learning_rate": 9.970209656902734e-06, + "loss": 0.2421, + "step": 1180 + }, + { + "epoch": 0.3943897144765403, + "grad_norm": 0.5780266651964711, + "learning_rate": 9.969997461392439e-06, + "loss": 0.2334, + "step": 1181 + }, + { + "epoch": 0.3947236600434129, + "grad_norm": 0.7028863790206683, + "learning_rate": 9.969784515105508e-06, + "loss": 0.2619, + "step": 1182 + }, + { + "epoch": 0.3950576056102855, + "grad_norm": 0.5744613731332193, + "learning_rate": 9.969570818074109e-06, + "loss": 0.2273, + "step": 1183 + }, + { + "epoch": 0.39539155117715813, + "grad_norm": 0.7222715318962293, + "learning_rate": 9.96935637033052e-06, + "loss": 0.2282, + "step": 1184 + }, + { + "epoch": 0.39572549674403074, + "grad_norm": 0.6124101572650357, + "learning_rate": 9.969141171907142e-06, + "loss": 0.2348, + "step": 1185 + }, + { + "epoch": 0.39605944231090334, + "grad_norm": 0.5960876102003673, + "learning_rate": 9.968925222836478e-06, + "loss": 0.235, + "step": 1186 + }, + { + "epoch": 0.39639338787777595, + "grad_norm": 0.574113404593996, + "learning_rate": 9.968708523151154e-06, + "loss": 0.2494, + "step": 1187 + }, + { + "epoch": 0.3967273334446485, + "grad_norm": 0.660656988924486, + "learning_rate": 9.968491072883902e-06, + "loss": 0.2437, + "step": 1188 + }, + { + "epoch": 0.3970612790115211, + "grad_norm": 0.6305428349461573, + "learning_rate": 9.968272872067571e-06, + "loss": 0.2477, + "step": 1189 + }, + { + "epoch": 0.3973952245783937, + "grad_norm": 0.6256118019755806, + "learning_rate": 9.968053920735124e-06, + "loss": 0.2616, + "step": 1190 + }, + { + "epoch": 0.3977291701452663, + "grad_norm": 0.5724515150428571, + "learning_rate": 9.967834218919634e-06, + "loss": 0.2399, + "step": 1191 + }, + { + "epoch": 0.3980631157121389, + "grad_norm": 0.7058668997385279, + "learning_rate": 9.967613766654293e-06, + "loss": 0.2488, + "step": 1192 + }, + { + "epoch": 0.39839706127901153, + "grad_norm": 0.5837497840997278, + "learning_rate": 9.967392563972399e-06, + "loss": 0.2404, + "step": 1193 + }, + { + "epoch": 0.39873100684588414, + "grad_norm": 0.5877505859460367, + "learning_rate": 9.96717061090737e-06, + "loss": 0.2427, + "step": 1194 + }, + { + "epoch": 0.39906495241275675, + "grad_norm": 0.6767793531128838, + "learning_rate": 9.966947907492734e-06, + "loss": 0.2423, + "step": 1195 + }, + { + "epoch": 0.3993988979796293, + "grad_norm": 0.7876764633998403, + "learning_rate": 9.966724453762131e-06, + "loss": 0.2593, + "step": 1196 + }, + { + "epoch": 0.3997328435465019, + "grad_norm": 0.664535756576125, + "learning_rate": 9.96650024974932e-06, + "loss": 0.2668, + "step": 1197 + }, + { + "epoch": 0.4000667891133745, + "grad_norm": 0.6748545214684532, + "learning_rate": 9.966275295488165e-06, + "loss": 0.2307, + "step": 1198 + }, + { + "epoch": 0.4004007346802471, + "grad_norm": 0.5847120767540386, + "learning_rate": 9.966049591012651e-06, + "loss": 0.25, + "step": 1199 + }, + { + "epoch": 0.4007346802471197, + "grad_norm": 0.5593088473361937, + "learning_rate": 9.965823136356877e-06, + "loss": 0.2512, + "step": 1200 + }, + { + "epoch": 0.40106862581399233, + "grad_norm": 0.5979036171420881, + "learning_rate": 9.965595931555043e-06, + "loss": 0.2344, + "step": 1201 + }, + { + "epoch": 0.40140257138086494, + "grad_norm": 0.5610390171317386, + "learning_rate": 9.965367976641478e-06, + "loss": 0.2297, + "step": 1202 + }, + { + "epoch": 0.40173651694773754, + "grad_norm": 0.6833684414411917, + "learning_rate": 9.965139271650614e-06, + "loss": 0.2459, + "step": 1203 + }, + { + "epoch": 0.4020704625146101, + "grad_norm": 0.728364535501137, + "learning_rate": 9.964909816617002e-06, + "loss": 0.2544, + "step": 1204 + }, + { + "epoch": 0.4024044080814827, + "grad_norm": 0.5693930955364505, + "learning_rate": 9.964679611575298e-06, + "loss": 0.2378, + "step": 1205 + }, + { + "epoch": 0.4027383536483553, + "grad_norm": 0.6133102401066323, + "learning_rate": 9.964448656560286e-06, + "loss": 0.2279, + "step": 1206 + }, + { + "epoch": 0.4030722992152279, + "grad_norm": 0.563392679821793, + "learning_rate": 9.964216951606848e-06, + "loss": 0.2397, + "step": 1207 + }, + { + "epoch": 0.4034062447821005, + "grad_norm": 0.5672909007940137, + "learning_rate": 9.963984496749988e-06, + "loss": 0.2267, + "step": 1208 + }, + { + "epoch": 0.4037401903489731, + "grad_norm": 0.6488344228564302, + "learning_rate": 9.96375129202482e-06, + "loss": 0.2494, + "step": 1209 + }, + { + "epoch": 0.40407413591584573, + "grad_norm": 0.5139669587828634, + "learning_rate": 9.963517337466575e-06, + "loss": 0.207, + "step": 1210 + }, + { + "epoch": 0.40440808148271834, + "grad_norm": 0.5287676891678358, + "learning_rate": 9.963282633110591e-06, + "loss": 0.2303, + "step": 1211 + }, + { + "epoch": 0.4047420270495909, + "grad_norm": 0.5058680387315015, + "learning_rate": 9.963047178992324e-06, + "loss": 0.22, + "step": 1212 + }, + { + "epoch": 0.4050759726164635, + "grad_norm": 0.5445840438834135, + "learning_rate": 9.962810975147344e-06, + "loss": 0.2397, + "step": 1213 + }, + { + "epoch": 0.4054099181833361, + "grad_norm": 0.6382046509955335, + "learning_rate": 9.96257402161133e-06, + "loss": 0.2715, + "step": 1214 + }, + { + "epoch": 0.4057438637502087, + "grad_norm": 0.5413823853664697, + "learning_rate": 9.962336318420078e-06, + "loss": 0.2261, + "step": 1215 + }, + { + "epoch": 0.4060778093170813, + "grad_norm": 0.6202996949474051, + "learning_rate": 9.962097865609495e-06, + "loss": 0.2283, + "step": 1216 + }, + { + "epoch": 0.4064117548839539, + "grad_norm": 0.60696612940717, + "learning_rate": 9.961858663215604e-06, + "loss": 0.2401, + "step": 1217 + }, + { + "epoch": 0.40674570045082653, + "grad_norm": 0.649264221755663, + "learning_rate": 9.961618711274537e-06, + "loss": 0.2472, + "step": 1218 + }, + { + "epoch": 0.40707964601769914, + "grad_norm": 0.5501090472861203, + "learning_rate": 9.961378009822542e-06, + "loss": 0.2407, + "step": 1219 + }, + { + "epoch": 0.4074135915845717, + "grad_norm": 0.5345854297532595, + "learning_rate": 9.961136558895981e-06, + "loss": 0.228, + "step": 1220 + }, + { + "epoch": 0.4077475371514443, + "grad_norm": 0.5125851264368648, + "learning_rate": 9.960894358531328e-06, + "loss": 0.2167, + "step": 1221 + }, + { + "epoch": 0.4080814827183169, + "grad_norm": 0.5920584268375877, + "learning_rate": 9.960651408765168e-06, + "loss": 0.2367, + "step": 1222 + }, + { + "epoch": 0.4084154282851895, + "grad_norm": 0.716453784192867, + "learning_rate": 9.960407709634203e-06, + "loss": 0.2477, + "step": 1223 + }, + { + "epoch": 0.4087493738520621, + "grad_norm": 0.7562990534172225, + "learning_rate": 9.960163261175247e-06, + "loss": 0.2332, + "step": 1224 + }, + { + "epoch": 0.4090833194189347, + "grad_norm": 0.5231312350431248, + "learning_rate": 9.959918063425228e-06, + "loss": 0.2259, + "step": 1225 + }, + { + "epoch": 0.4094172649858073, + "grad_norm": 0.5921963859667128, + "learning_rate": 9.959672116421181e-06, + "loss": 0.2263, + "step": 1226 + }, + { + "epoch": 0.40975121055267993, + "grad_norm": 0.6028966599262293, + "learning_rate": 9.959425420200267e-06, + "loss": 0.2489, + "step": 1227 + }, + { + "epoch": 0.4100851561195525, + "grad_norm": 0.686672288403263, + "learning_rate": 9.959177974799742e-06, + "loss": 0.2508, + "step": 1228 + }, + { + "epoch": 0.4104191016864251, + "grad_norm": 0.5673389018187519, + "learning_rate": 9.958929780256996e-06, + "loss": 0.2207, + "step": 1229 + }, + { + "epoch": 0.4107530472532977, + "grad_norm": 0.5753969316911537, + "learning_rate": 9.958680836609516e-06, + "loss": 0.239, + "step": 1230 + }, + { + "epoch": 0.4110869928201703, + "grad_norm": 0.676883776899676, + "learning_rate": 9.95843114389491e-06, + "loss": 0.2225, + "step": 1231 + }, + { + "epoch": 0.4114209383870429, + "grad_norm": 0.6238678967525173, + "learning_rate": 9.958180702150895e-06, + "loss": 0.2278, + "step": 1232 + }, + { + "epoch": 0.4117548839539155, + "grad_norm": 0.6316307542043934, + "learning_rate": 9.957929511415304e-06, + "loss": 0.2485, + "step": 1233 + }, + { + "epoch": 0.4120888295207881, + "grad_norm": 0.5602900296016604, + "learning_rate": 9.957677571726084e-06, + "loss": 0.2288, + "step": 1234 + }, + { + "epoch": 0.41242277508766073, + "grad_norm": 0.6275850836013913, + "learning_rate": 9.95742488312129e-06, + "loss": 0.2393, + "step": 1235 + }, + { + "epoch": 0.41275672065453334, + "grad_norm": 0.6298629000187451, + "learning_rate": 9.957171445639096e-06, + "loss": 0.2456, + "step": 1236 + }, + { + "epoch": 0.4130906662214059, + "grad_norm": 0.5989836124208895, + "learning_rate": 9.956917259317788e-06, + "loss": 0.2349, + "step": 1237 + }, + { + "epoch": 0.4134246117882785, + "grad_norm": 0.6067541086981411, + "learning_rate": 9.95666232419576e-06, + "loss": 0.2377, + "step": 1238 + }, + { + "epoch": 0.4137585573551511, + "grad_norm": 0.5754404112239477, + "learning_rate": 9.956406640311527e-06, + "loss": 0.2555, + "step": 1239 + }, + { + "epoch": 0.4140925029220237, + "grad_norm": 0.5621508892107175, + "learning_rate": 9.956150207703712e-06, + "loss": 0.2386, + "step": 1240 + }, + { + "epoch": 0.4144264484888963, + "grad_norm": 0.5959704813881286, + "learning_rate": 9.955893026411048e-06, + "loss": 0.2469, + "step": 1241 + }, + { + "epoch": 0.4147603940557689, + "grad_norm": 0.6373037754433803, + "learning_rate": 9.955635096472391e-06, + "loss": 0.2326, + "step": 1242 + }, + { + "epoch": 0.41509433962264153, + "grad_norm": 0.8712757922100017, + "learning_rate": 9.9553764179267e-06, + "loss": 0.2408, + "step": 1243 + }, + { + "epoch": 0.41542828518951413, + "grad_norm": 0.5794440921487206, + "learning_rate": 9.955116990813056e-06, + "loss": 0.2515, + "step": 1244 + }, + { + "epoch": 0.4157622307563867, + "grad_norm": 0.5417617912815341, + "learning_rate": 9.954856815170644e-06, + "loss": 0.2257, + "step": 1245 + }, + { + "epoch": 0.4160961763232593, + "grad_norm": 0.608261531031807, + "learning_rate": 9.95459589103877e-06, + "loss": 0.2422, + "step": 1246 + }, + { + "epoch": 0.4164301218901319, + "grad_norm": 0.5882276462833476, + "learning_rate": 9.954334218456846e-06, + "loss": 0.237, + "step": 1247 + }, + { + "epoch": 0.4167640674570045, + "grad_norm": 0.9285636581320588, + "learning_rate": 9.954071797464405e-06, + "loss": 0.249, + "step": 1248 + }, + { + "epoch": 0.4170980130238771, + "grad_norm": 0.6268984023382586, + "learning_rate": 9.953808628101086e-06, + "loss": 0.2415, + "step": 1249 + }, + { + "epoch": 0.4174319585907497, + "grad_norm": 0.6069222808256296, + "learning_rate": 9.953544710406646e-06, + "loss": 0.2545, + "step": 1250 + }, + { + "epoch": 0.4177659041576223, + "grad_norm": 0.5778817193778154, + "learning_rate": 9.95328004442095e-06, + "loss": 0.241, + "step": 1251 + }, + { + "epoch": 0.41809984972449493, + "grad_norm": 0.61922600341147, + "learning_rate": 9.953014630183979e-06, + "loss": 0.2244, + "step": 1252 + }, + { + "epoch": 0.4184337952913675, + "grad_norm": 0.6687820682983995, + "learning_rate": 9.95274846773583e-06, + "loss": 0.2337, + "step": 1253 + }, + { + "epoch": 0.4187677408582401, + "grad_norm": 0.5042432866684735, + "learning_rate": 9.952481557116708e-06, + "loss": 0.2169, + "step": 1254 + }, + { + "epoch": 0.4191016864251127, + "grad_norm": 0.6258367926926472, + "learning_rate": 9.952213898366932e-06, + "loss": 0.249, + "step": 1255 + }, + { + "epoch": 0.4194356319919853, + "grad_norm": 0.5585811866612296, + "learning_rate": 9.951945491526938e-06, + "loss": 0.226, + "step": 1256 + }, + { + "epoch": 0.4197695775588579, + "grad_norm": 0.6218677388521249, + "learning_rate": 9.951676336637267e-06, + "loss": 0.2318, + "step": 1257 + }, + { + "epoch": 0.4201035231257305, + "grad_norm": 0.5375786545701925, + "learning_rate": 9.951406433738587e-06, + "loss": 0.2295, + "step": 1258 + }, + { + "epoch": 0.4204374686926031, + "grad_norm": 0.5316795089983514, + "learning_rate": 9.95113578287166e-06, + "loss": 0.2278, + "step": 1259 + }, + { + "epoch": 0.42077141425947573, + "grad_norm": 0.6387497516438277, + "learning_rate": 9.950864384077376e-06, + "loss": 0.2513, + "step": 1260 + }, + { + "epoch": 0.4211053598263483, + "grad_norm": 0.7334294291090919, + "learning_rate": 9.950592237396732e-06, + "loss": 0.2376, + "step": 1261 + }, + { + "epoch": 0.4214393053932209, + "grad_norm": 0.47638791811568987, + "learning_rate": 9.95031934287084e-06, + "loss": 0.2203, + "step": 1262 + }, + { + "epoch": 0.4217732509600935, + "grad_norm": 0.4970617194044711, + "learning_rate": 9.950045700540923e-06, + "loss": 0.2322, + "step": 1263 + }, + { + "epoch": 0.4221071965269661, + "grad_norm": 0.5481044534689685, + "learning_rate": 9.949771310448317e-06, + "loss": 0.2348, + "step": 1264 + }, + { + "epoch": 0.4224411420938387, + "grad_norm": 0.5249211204705201, + "learning_rate": 9.949496172634474e-06, + "loss": 0.2237, + "step": 1265 + }, + { + "epoch": 0.4227750876607113, + "grad_norm": 0.6158684467073959, + "learning_rate": 9.949220287140955e-06, + "loss": 0.2528, + "step": 1266 + }, + { + "epoch": 0.4231090332275839, + "grad_norm": 0.5751869229573902, + "learning_rate": 9.948943654009438e-06, + "loss": 0.2437, + "step": 1267 + }, + { + "epoch": 0.4234429787944565, + "grad_norm": 0.5278534477612402, + "learning_rate": 9.948666273281708e-06, + "loss": 0.2281, + "step": 1268 + }, + { + "epoch": 0.4237769243613291, + "grad_norm": 0.6021004635115851, + "learning_rate": 9.94838814499967e-06, + "loss": 0.2273, + "step": 1269 + }, + { + "epoch": 0.4241108699282017, + "grad_norm": 0.5556933684548845, + "learning_rate": 9.948109269205338e-06, + "loss": 0.2458, + "step": 1270 + }, + { + "epoch": 0.4244448154950743, + "grad_norm": 0.581592239372877, + "learning_rate": 9.947829645940836e-06, + "loss": 0.2316, + "step": 1271 + }, + { + "epoch": 0.4247787610619469, + "grad_norm": 0.5489943931236657, + "learning_rate": 9.94754927524841e-06, + "loss": 0.2313, + "step": 1272 + }, + { + "epoch": 0.4251127066288195, + "grad_norm": 0.6603804956451033, + "learning_rate": 9.947268157170409e-06, + "loss": 0.2503, + "step": 1273 + }, + { + "epoch": 0.4254466521956921, + "grad_norm": 0.5408603570247881, + "learning_rate": 9.9469862917493e-06, + "loss": 0.2385, + "step": 1274 + }, + { + "epoch": 0.4257805977625647, + "grad_norm": 0.506151945318377, + "learning_rate": 9.946703679027664e-06, + "loss": 0.2129, + "step": 1275 + }, + { + "epoch": 0.4261145433294373, + "grad_norm": 0.6388174252564417, + "learning_rate": 9.946420319048192e-06, + "loss": 0.2522, + "step": 1276 + }, + { + "epoch": 0.4264484888963099, + "grad_norm": 0.6479722260293415, + "learning_rate": 9.946136211853689e-06, + "loss": 0.2502, + "step": 1277 + }, + { + "epoch": 0.4267824344631825, + "grad_norm": 0.5423191804013884, + "learning_rate": 9.94585135748707e-06, + "loss": 0.2243, + "step": 1278 + }, + { + "epoch": 0.4271163800300551, + "grad_norm": 0.6023773989376456, + "learning_rate": 9.94556575599137e-06, + "loss": 0.2388, + "step": 1279 + }, + { + "epoch": 0.4274503255969277, + "grad_norm": 0.6403445586636296, + "learning_rate": 9.94527940740973e-06, + "loss": 0.253, + "step": 1280 + }, + { + "epoch": 0.4277842711638003, + "grad_norm": 0.5765601105471051, + "learning_rate": 9.944992311785406e-06, + "loss": 0.2351, + "step": 1281 + }, + { + "epoch": 0.4281182167306729, + "grad_norm": 0.5943838786454438, + "learning_rate": 9.94470446916177e-06, + "loss": 0.2477, + "step": 1282 + }, + { + "epoch": 0.4284521622975455, + "grad_norm": 0.5967603952647004, + "learning_rate": 9.9444158795823e-06, + "loss": 0.2286, + "step": 1283 + }, + { + "epoch": 0.4287861078644181, + "grad_norm": 0.5889203354692991, + "learning_rate": 9.944126543090593e-06, + "loss": 0.2318, + "step": 1284 + }, + { + "epoch": 0.4291200534312907, + "grad_norm": 0.5778644128403275, + "learning_rate": 9.943836459730356e-06, + "loss": 0.2385, + "step": 1285 + }, + { + "epoch": 0.4294539989981633, + "grad_norm": 0.829101133307546, + "learning_rate": 9.943545629545412e-06, + "loss": 0.2493, + "step": 1286 + }, + { + "epoch": 0.4297879445650359, + "grad_norm": 0.5682692079123255, + "learning_rate": 9.94325405257969e-06, + "loss": 0.2594, + "step": 1287 + }, + { + "epoch": 0.4301218901319085, + "grad_norm": 0.5616769757489756, + "learning_rate": 9.94296172887724e-06, + "loss": 0.2407, + "step": 1288 + }, + { + "epoch": 0.4304558356987811, + "grad_norm": 0.5082241382193762, + "learning_rate": 9.942668658482219e-06, + "loss": 0.2247, + "step": 1289 + }, + { + "epoch": 0.4307897812656537, + "grad_norm": 0.5720871139751971, + "learning_rate": 9.942374841438898e-06, + "loss": 0.2412, + "step": 1290 + }, + { + "epoch": 0.4311237268325263, + "grad_norm": 0.5571165598994844, + "learning_rate": 9.942080277791663e-06, + "loss": 0.2318, + "step": 1291 + }, + { + "epoch": 0.4314576723993989, + "grad_norm": 0.6092865735409773, + "learning_rate": 9.941784967585012e-06, + "loss": 0.2472, + "step": 1292 + }, + { + "epoch": 0.4317916179662715, + "grad_norm": 0.47701447633800187, + "learning_rate": 9.941488910863553e-06, + "loss": 0.2055, + "step": 1293 + }, + { + "epoch": 0.4321255635331441, + "grad_norm": 0.5174504669044425, + "learning_rate": 9.941192107672011e-06, + "loss": 0.2353, + "step": 1294 + }, + { + "epoch": 0.4324595091000167, + "grad_norm": 0.5998296572054426, + "learning_rate": 9.940894558055218e-06, + "loss": 0.2497, + "step": 1295 + }, + { + "epoch": 0.4327934546668893, + "grad_norm": 0.5960708917312382, + "learning_rate": 9.940596262058128e-06, + "loss": 0.2369, + "step": 1296 + }, + { + "epoch": 0.4331274002337619, + "grad_norm": 0.5601394501830662, + "learning_rate": 9.940297219725797e-06, + "loss": 0.2363, + "step": 1297 + }, + { + "epoch": 0.4334613458006345, + "grad_norm": 0.5181219900429102, + "learning_rate": 9.939997431103402e-06, + "loss": 0.2327, + "step": 1298 + }, + { + "epoch": 0.4337952913675071, + "grad_norm": 0.5893205151841744, + "learning_rate": 9.939696896236229e-06, + "loss": 0.2463, + "step": 1299 + }, + { + "epoch": 0.4341292369343797, + "grad_norm": 0.5527429152911172, + "learning_rate": 9.939395615169673e-06, + "loss": 0.2401, + "step": 1300 + }, + { + "epoch": 0.4344631825012523, + "grad_norm": 0.5109471022299947, + "learning_rate": 9.939093587949254e-06, + "loss": 0.2243, + "step": 1301 + }, + { + "epoch": 0.43479712806812487, + "grad_norm": 0.5656116960222147, + "learning_rate": 9.938790814620591e-06, + "loss": 0.2325, + "step": 1302 + }, + { + "epoch": 0.4351310736349975, + "grad_norm": 0.7838636481919538, + "learning_rate": 9.938487295229423e-06, + "loss": 0.2528, + "step": 1303 + }, + { + "epoch": 0.4354650192018701, + "grad_norm": 0.5427979756356961, + "learning_rate": 9.9381830298216e-06, + "loss": 0.2288, + "step": 1304 + }, + { + "epoch": 0.4357989647687427, + "grad_norm": 0.5688915141922664, + "learning_rate": 9.937878018443085e-06, + "loss": 0.2196, + "step": 1305 + }, + { + "epoch": 0.4361329103356153, + "grad_norm": 0.49786316106132467, + "learning_rate": 9.937572261139956e-06, + "loss": 0.2376, + "step": 1306 + }, + { + "epoch": 0.4364668559024879, + "grad_norm": 0.6767510753934641, + "learning_rate": 9.937265757958397e-06, + "loss": 0.2521, + "step": 1307 + }, + { + "epoch": 0.4368008014693605, + "grad_norm": 0.542449196038807, + "learning_rate": 9.93695850894471e-06, + "loss": 0.2305, + "step": 1308 + }, + { + "epoch": 0.4371347470362331, + "grad_norm": 0.5329012111090545, + "learning_rate": 9.93665051414531e-06, + "loss": 0.2241, + "step": 1309 + }, + { + "epoch": 0.43746869260310567, + "grad_norm": 0.5644124677867142, + "learning_rate": 9.936341773606723e-06, + "loss": 0.2418, + "step": 1310 + }, + { + "epoch": 0.4378026381699783, + "grad_norm": 0.8167083316266492, + "learning_rate": 9.936032287375587e-06, + "loss": 0.2533, + "step": 1311 + }, + { + "epoch": 0.4381365837368509, + "grad_norm": 0.5474885438610664, + "learning_rate": 9.935722055498655e-06, + "loss": 0.2368, + "step": 1312 + }, + { + "epoch": 0.4384705293037235, + "grad_norm": 0.6208297848511186, + "learning_rate": 9.935411078022791e-06, + "loss": 0.2388, + "step": 1313 + }, + { + "epoch": 0.4388044748705961, + "grad_norm": 0.6282302035769642, + "learning_rate": 9.93509935499497e-06, + "loss": 0.2493, + "step": 1314 + }, + { + "epoch": 0.4391384204374687, + "grad_norm": 0.5820840951158709, + "learning_rate": 9.934786886462282e-06, + "loss": 0.2263, + "step": 1315 + }, + { + "epoch": 0.4394723660043413, + "grad_norm": 0.582239493200357, + "learning_rate": 9.934473672471931e-06, + "loss": 0.2248, + "step": 1316 + }, + { + "epoch": 0.4398063115712139, + "grad_norm": 0.5473573028008037, + "learning_rate": 9.934159713071229e-06, + "loss": 0.2353, + "step": 1317 + }, + { + "epoch": 0.44014025713808647, + "grad_norm": 0.59739267317348, + "learning_rate": 9.933845008307605e-06, + "loss": 0.2398, + "step": 1318 + }, + { + "epoch": 0.4404742027049591, + "grad_norm": 0.6216076077529502, + "learning_rate": 9.933529558228599e-06, + "loss": 0.2263, + "step": 1319 + }, + { + "epoch": 0.4408081482718317, + "grad_norm": 0.5900409644511551, + "learning_rate": 9.933213362881861e-06, + "loss": 0.235, + "step": 1320 + }, + { + "epoch": 0.4411420938387043, + "grad_norm": 0.5318503294685462, + "learning_rate": 9.932896422315159e-06, + "loss": 0.2183, + "step": 1321 + }, + { + "epoch": 0.4414760394055769, + "grad_norm": 0.5992427234414971, + "learning_rate": 9.93257873657637e-06, + "loss": 0.2416, + "step": 1322 + }, + { + "epoch": 0.4418099849724495, + "grad_norm": 0.6067601416763263, + "learning_rate": 9.932260305713481e-06, + "loss": 0.2391, + "step": 1323 + }, + { + "epoch": 0.4421439305393221, + "grad_norm": 0.5311726881325887, + "learning_rate": 9.9319411297746e-06, + "loss": 0.2463, + "step": 1324 + }, + { + "epoch": 0.4424778761061947, + "grad_norm": 0.6287448890448014, + "learning_rate": 9.931621208807939e-06, + "loss": 0.2494, + "step": 1325 + }, + { + "epoch": 0.44281182167306726, + "grad_norm": 0.6170893282731494, + "learning_rate": 9.931300542861826e-06, + "loss": 0.2418, + "step": 1326 + }, + { + "epoch": 0.44314576723993987, + "grad_norm": 0.5783810354471404, + "learning_rate": 9.930979131984702e-06, + "loss": 0.2418, + "step": 1327 + }, + { + "epoch": 0.4434797128068125, + "grad_norm": 0.5475904806111974, + "learning_rate": 9.93065697622512e-06, + "loss": 0.2315, + "step": 1328 + }, + { + "epoch": 0.4438136583736851, + "grad_norm": 0.5659886586228791, + "learning_rate": 9.930334075631745e-06, + "loss": 0.223, + "step": 1329 + }, + { + "epoch": 0.4441476039405577, + "grad_norm": 0.5751459012303536, + "learning_rate": 9.930010430253356e-06, + "loss": 0.2215, + "step": 1330 + }, + { + "epoch": 0.4444815495074303, + "grad_norm": 0.6063177897359333, + "learning_rate": 9.92968604013884e-06, + "loss": 0.2563, + "step": 1331 + }, + { + "epoch": 0.4448154950743029, + "grad_norm": 0.7243143461095843, + "learning_rate": 9.929360905337204e-06, + "loss": 0.2327, + "step": 1332 + }, + { + "epoch": 0.4451494406411755, + "grad_norm": 0.570687190416024, + "learning_rate": 9.929035025897561e-06, + "loss": 0.2226, + "step": 1333 + }, + { + "epoch": 0.4454833862080481, + "grad_norm": 0.5940808216197645, + "learning_rate": 9.928708401869143e-06, + "loss": 0.2221, + "step": 1334 + }, + { + "epoch": 0.44581733177492067, + "grad_norm": 0.6811536558542676, + "learning_rate": 9.928381033301284e-06, + "loss": 0.2463, + "step": 1335 + }, + { + "epoch": 0.4461512773417933, + "grad_norm": 0.6824905822313252, + "learning_rate": 9.928052920243443e-06, + "loss": 0.2545, + "step": 1336 + }, + { + "epoch": 0.4464852229086659, + "grad_norm": 0.6637186339127504, + "learning_rate": 9.927724062745179e-06, + "loss": 0.2662, + "step": 1337 + }, + { + "epoch": 0.4468191684755385, + "grad_norm": 0.5809812855523707, + "learning_rate": 9.927394460856174e-06, + "loss": 0.2335, + "step": 1338 + }, + { + "epoch": 0.4471531140424111, + "grad_norm": 0.5700895995361696, + "learning_rate": 9.92706411462622e-06, + "loss": 0.2137, + "step": 1339 + }, + { + "epoch": 0.4474870596092837, + "grad_norm": 0.5735937174321404, + "learning_rate": 9.926733024105216e-06, + "loss": 0.2401, + "step": 1340 + }, + { + "epoch": 0.4478210051761563, + "grad_norm": 0.5128813606990313, + "learning_rate": 9.926401189343177e-06, + "loss": 0.2335, + "step": 1341 + }, + { + "epoch": 0.4481549507430289, + "grad_norm": 0.5468742678506229, + "learning_rate": 9.926068610390231e-06, + "loss": 0.2232, + "step": 1342 + }, + { + "epoch": 0.44848889630990146, + "grad_norm": 0.5713584153616417, + "learning_rate": 9.925735287296621e-06, + "loss": 0.2331, + "step": 1343 + }, + { + "epoch": 0.44882284187677407, + "grad_norm": 0.6096003519856071, + "learning_rate": 9.925401220112698e-06, + "loss": 0.2303, + "step": 1344 + }, + { + "epoch": 0.4491567874436467, + "grad_norm": 0.58393370045869, + "learning_rate": 9.925066408888924e-06, + "loss": 0.2401, + "step": 1345 + }, + { + "epoch": 0.4494907330105193, + "grad_norm": 0.5503546449887823, + "learning_rate": 9.92473085367588e-06, + "loss": 0.2422, + "step": 1346 + }, + { + "epoch": 0.4498246785773919, + "grad_norm": 0.5633846354206201, + "learning_rate": 9.924394554524252e-06, + "loss": 0.245, + "step": 1347 + }, + { + "epoch": 0.4501586241442645, + "grad_norm": 0.5534055213955522, + "learning_rate": 9.924057511484844e-06, + "loss": 0.2315, + "step": 1348 + }, + { + "epoch": 0.4504925697111371, + "grad_norm": 0.6078240341950537, + "learning_rate": 9.92371972460857e-06, + "loss": 0.2341, + "step": 1349 + }, + { + "epoch": 0.4508265152780097, + "grad_norm": 0.7758411012157077, + "learning_rate": 9.923381193946457e-06, + "loss": 0.2513, + "step": 1350 + }, + { + "epoch": 0.45116046084488226, + "grad_norm": 0.590993603668986, + "learning_rate": 9.923041919549644e-06, + "loss": 0.2259, + "step": 1351 + }, + { + "epoch": 0.45149440641175487, + "grad_norm": 0.5771541634320295, + "learning_rate": 9.92270190146938e-06, + "loss": 0.237, + "step": 1352 + }, + { + "epoch": 0.4518283519786275, + "grad_norm": 0.5921642270486791, + "learning_rate": 9.922361139757033e-06, + "loss": 0.2391, + "step": 1353 + }, + { + "epoch": 0.4521622975455001, + "grad_norm": 0.5803176539415612, + "learning_rate": 9.922019634464077e-06, + "loss": 0.237, + "step": 1354 + }, + { + "epoch": 0.4524962431123727, + "grad_norm": 0.5689236619706025, + "learning_rate": 9.9216773856421e-06, + "loss": 0.2329, + "step": 1355 + }, + { + "epoch": 0.4528301886792453, + "grad_norm": 0.4940682334608807, + "learning_rate": 9.921334393342803e-06, + "loss": 0.2341, + "step": 1356 + }, + { + "epoch": 0.4531641342461179, + "grad_norm": 0.5752961824097249, + "learning_rate": 9.920990657617998e-06, + "loss": 0.254, + "step": 1357 + }, + { + "epoch": 0.4534980798129905, + "grad_norm": 0.5364989105256935, + "learning_rate": 9.920646178519612e-06, + "loss": 0.2203, + "step": 1358 + }, + { + "epoch": 0.45383202537986306, + "grad_norm": 0.5052033735914636, + "learning_rate": 9.920300956099682e-06, + "loss": 0.2328, + "step": 1359 + }, + { + "epoch": 0.45416597094673566, + "grad_norm": 0.47346441786960736, + "learning_rate": 9.919954990410359e-06, + "loss": 0.2226, + "step": 1360 + }, + { + "epoch": 0.45449991651360827, + "grad_norm": 0.4934375959384355, + "learning_rate": 9.919608281503903e-06, + "loss": 0.2348, + "step": 1361 + }, + { + "epoch": 0.4548338620804809, + "grad_norm": 0.511312257771701, + "learning_rate": 9.91926082943269e-06, + "loss": 0.2153, + "step": 1362 + }, + { + "epoch": 0.4551678076473535, + "grad_norm": 0.5543164186583251, + "learning_rate": 9.918912634249206e-06, + "loss": 0.2454, + "step": 1363 + }, + { + "epoch": 0.4555017532142261, + "grad_norm": 0.5212508239813047, + "learning_rate": 9.91856369600605e-06, + "loss": 0.2232, + "step": 1364 + }, + { + "epoch": 0.4558356987810987, + "grad_norm": 0.527429149205606, + "learning_rate": 9.918214014755935e-06, + "loss": 0.2342, + "step": 1365 + }, + { + "epoch": 0.4561696443479713, + "grad_norm": 0.5562734152529801, + "learning_rate": 9.917863590551682e-06, + "loss": 0.2185, + "step": 1366 + }, + { + "epoch": 0.45650358991484385, + "grad_norm": 0.5093451096437928, + "learning_rate": 9.917512423446226e-06, + "loss": 0.2252, + "step": 1367 + }, + { + "epoch": 0.45683753548171646, + "grad_norm": 0.5376789383435711, + "learning_rate": 9.917160513492619e-06, + "loss": 0.2326, + "step": 1368 + }, + { + "epoch": 0.45717148104858907, + "grad_norm": 0.5160005226617597, + "learning_rate": 9.916807860744017e-06, + "loss": 0.2458, + "step": 1369 + }, + { + "epoch": 0.4575054266154617, + "grad_norm": 0.5243392738518032, + "learning_rate": 9.916454465253695e-06, + "loss": 0.2208, + "step": 1370 + }, + { + "epoch": 0.4578393721823343, + "grad_norm": 0.5454097822750845, + "learning_rate": 9.916100327075038e-06, + "loss": 0.2388, + "step": 1371 + }, + { + "epoch": 0.4581733177492069, + "grad_norm": 0.5311013984757362, + "learning_rate": 9.91574544626154e-06, + "loss": 0.2374, + "step": 1372 + }, + { + "epoch": 0.4585072633160795, + "grad_norm": 0.5081063384150541, + "learning_rate": 9.915389822866811e-06, + "loss": 0.2221, + "step": 1373 + }, + { + "epoch": 0.4588412088829521, + "grad_norm": 0.529213615144114, + "learning_rate": 9.915033456944572e-06, + "loss": 0.2264, + "step": 1374 + }, + { + "epoch": 0.45917515444982465, + "grad_norm": 0.5216114427240847, + "learning_rate": 9.914676348548658e-06, + "loss": 0.2343, + "step": 1375 + }, + { + "epoch": 0.45950910001669726, + "grad_norm": 0.5098198767095327, + "learning_rate": 9.914318497733013e-06, + "loss": 0.2199, + "step": 1376 + }, + { + "epoch": 0.45984304558356986, + "grad_norm": 0.5772594585549337, + "learning_rate": 9.913959904551695e-06, + "loss": 0.2444, + "step": 1377 + }, + { + "epoch": 0.46017699115044247, + "grad_norm": 0.5692684244936813, + "learning_rate": 9.913600569058871e-06, + "loss": 0.2325, + "step": 1378 + }, + { + "epoch": 0.4605109367173151, + "grad_norm": 0.5528985886777825, + "learning_rate": 9.913240491308828e-06, + "loss": 0.2268, + "step": 1379 + }, + { + "epoch": 0.4608448822841877, + "grad_norm": 0.5417272692609387, + "learning_rate": 9.912879671355956e-06, + "loss": 0.2171, + "step": 1380 + }, + { + "epoch": 0.4611788278510603, + "grad_norm": 0.5302491893848811, + "learning_rate": 9.912518109254763e-06, + "loss": 0.2215, + "step": 1381 + }, + { + "epoch": 0.4615127734179329, + "grad_norm": 0.5350805468842509, + "learning_rate": 9.912155805059866e-06, + "loss": 0.2351, + "step": 1382 + }, + { + "epoch": 0.4618467189848055, + "grad_norm": 0.5606286743790297, + "learning_rate": 9.911792758825996e-06, + "loss": 0.2267, + "step": 1383 + }, + { + "epoch": 0.46218066455167806, + "grad_norm": 0.5599706835878967, + "learning_rate": 9.911428970607995e-06, + "loss": 0.2361, + "step": 1384 + }, + { + "epoch": 0.46251461011855066, + "grad_norm": 0.615424046565781, + "learning_rate": 9.911064440460818e-06, + "loss": 0.2298, + "step": 1385 + }, + { + "epoch": 0.46284855568542327, + "grad_norm": 0.5649976414245927, + "learning_rate": 9.91069916843953e-06, + "loss": 0.232, + "step": 1386 + }, + { + "epoch": 0.4631825012522959, + "grad_norm": 0.5508577983404023, + "learning_rate": 9.910333154599314e-06, + "loss": 0.2436, + "step": 1387 + }, + { + "epoch": 0.4635164468191685, + "grad_norm": 0.5205854960323886, + "learning_rate": 9.909966398995456e-06, + "loss": 0.2362, + "step": 1388 + }, + { + "epoch": 0.4638503923860411, + "grad_norm": 0.5757688708297488, + "learning_rate": 9.909598901683361e-06, + "loss": 0.2491, + "step": 1389 + }, + { + "epoch": 0.4641843379529137, + "grad_norm": 0.5869979821978962, + "learning_rate": 9.909230662718543e-06, + "loss": 0.2324, + "step": 1390 + }, + { + "epoch": 0.4645182835197863, + "grad_norm": 0.5625640594323424, + "learning_rate": 9.908861682156628e-06, + "loss": 0.236, + "step": 1391 + }, + { + "epoch": 0.46485222908665885, + "grad_norm": 0.7152995547677362, + "learning_rate": 9.908491960053357e-06, + "loss": 0.2418, + "step": 1392 + }, + { + "epoch": 0.46518617465353146, + "grad_norm": 0.4895348021939775, + "learning_rate": 9.90812149646458e-06, + "loss": 0.2246, + "step": 1393 + }, + { + "epoch": 0.46552012022040407, + "grad_norm": 0.5909081784606736, + "learning_rate": 9.907750291446258e-06, + "loss": 0.2381, + "step": 1394 + }, + { + "epoch": 0.46585406578727667, + "grad_norm": 0.5724895641003475, + "learning_rate": 9.907378345054471e-06, + "loss": 0.2385, + "step": 1395 + }, + { + "epoch": 0.4661880113541493, + "grad_norm": 0.4977242293200873, + "learning_rate": 9.9070056573454e-06, + "loss": 0.2221, + "step": 1396 + }, + { + "epoch": 0.4665219569210219, + "grad_norm": 0.7104851590746769, + "learning_rate": 9.906632228375346e-06, + "loss": 0.2439, + "step": 1397 + }, + { + "epoch": 0.4668559024878945, + "grad_norm": 0.5366132433315185, + "learning_rate": 9.906258058200722e-06, + "loss": 0.2334, + "step": 1398 + }, + { + "epoch": 0.4671898480547671, + "grad_norm": 0.603863208324995, + "learning_rate": 9.905883146878049e-06, + "loss": 0.2469, + "step": 1399 + }, + { + "epoch": 0.46752379362163965, + "grad_norm": 0.6712903315298903, + "learning_rate": 9.90550749446396e-06, + "loss": 0.2525, + "step": 1400 + }, + { + "epoch": 0.46785773918851226, + "grad_norm": 0.5599933585459058, + "learning_rate": 9.905131101015204e-06, + "loss": 0.2298, + "step": 1401 + }, + { + "epoch": 0.46819168475538486, + "grad_norm": 0.6742593776670335, + "learning_rate": 9.904753966588638e-06, + "loss": 0.2249, + "step": 1402 + }, + { + "epoch": 0.46852563032225747, + "grad_norm": 0.5966659500589226, + "learning_rate": 9.904376091241236e-06, + "loss": 0.2406, + "step": 1403 + }, + { + "epoch": 0.4688595758891301, + "grad_norm": 0.5086157470594107, + "learning_rate": 9.903997475030077e-06, + "loss": 0.2239, + "step": 1404 + }, + { + "epoch": 0.4691935214560027, + "grad_norm": 0.5688250377303993, + "learning_rate": 9.903618118012358e-06, + "loss": 0.2499, + "step": 1405 + }, + { + "epoch": 0.4695274670228753, + "grad_norm": 0.5001870005825596, + "learning_rate": 9.903238020245383e-06, + "loss": 0.228, + "step": 1406 + }, + { + "epoch": 0.4698614125897479, + "grad_norm": 0.6111840293711042, + "learning_rate": 9.902857181786571e-06, + "loss": 0.2388, + "step": 1407 + }, + { + "epoch": 0.47019535815662045, + "grad_norm": 0.620871773232944, + "learning_rate": 9.902475602693451e-06, + "loss": 0.2176, + "step": 1408 + }, + { + "epoch": 0.47052930372349305, + "grad_norm": 0.4920698441065501, + "learning_rate": 9.90209328302367e-06, + "loss": 0.2375, + "step": 1409 + }, + { + "epoch": 0.47086324929036566, + "grad_norm": 0.5528573354567796, + "learning_rate": 9.901710222834976e-06, + "loss": 0.2424, + "step": 1410 + }, + { + "epoch": 0.47119719485723827, + "grad_norm": 0.5208706668107904, + "learning_rate": 9.901326422185238e-06, + "loss": 0.2298, + "step": 1411 + }, + { + "epoch": 0.4715311404241109, + "grad_norm": 0.5579864808318521, + "learning_rate": 9.900941881132431e-06, + "loss": 0.2303, + "step": 1412 + }, + { + "epoch": 0.4718650859909835, + "grad_norm": 0.7348092396729472, + "learning_rate": 9.900556599734647e-06, + "loss": 0.2589, + "step": 1413 + }, + { + "epoch": 0.4721990315578561, + "grad_norm": 0.5613570516628159, + "learning_rate": 9.900170578050088e-06, + "loss": 0.2541, + "step": 1414 + }, + { + "epoch": 0.4725329771247287, + "grad_norm": 0.5573029926646937, + "learning_rate": 9.899783816137065e-06, + "loss": 0.2491, + "step": 1415 + }, + { + "epoch": 0.47286692269160124, + "grad_norm": 0.5737576301390739, + "learning_rate": 9.899396314054002e-06, + "loss": 0.2165, + "step": 1416 + }, + { + "epoch": 0.47320086825847385, + "grad_norm": 0.49834479477003774, + "learning_rate": 9.89900807185944e-06, + "loss": 0.2188, + "step": 1417 + }, + { + "epoch": 0.47353481382534646, + "grad_norm": 0.5684392075108803, + "learning_rate": 9.89861908961202e-06, + "loss": 0.2361, + "step": 1418 + }, + { + "epoch": 0.47386875939221906, + "grad_norm": 0.5847270713464249, + "learning_rate": 9.89822936737051e-06, + "loss": 0.2402, + "step": 1419 + }, + { + "epoch": 0.47420270495909167, + "grad_norm": 0.5831201748397754, + "learning_rate": 9.897838905193781e-06, + "loss": 0.2264, + "step": 1420 + }, + { + "epoch": 0.4745366505259643, + "grad_norm": 0.6718561947154861, + "learning_rate": 9.897447703140813e-06, + "loss": 0.2483, + "step": 1421 + }, + { + "epoch": 0.4748705960928369, + "grad_norm": 0.5212579182437543, + "learning_rate": 9.897055761270705e-06, + "loss": 0.2264, + "step": 1422 + }, + { + "epoch": 0.4752045416597095, + "grad_norm": 0.5544409675912124, + "learning_rate": 9.896663079642663e-06, + "loss": 0.2447, + "step": 1423 + }, + { + "epoch": 0.47553848722658204, + "grad_norm": 0.5946504988614014, + "learning_rate": 9.896269658316006e-06, + "loss": 0.2322, + "step": 1424 + }, + { + "epoch": 0.47587243279345465, + "grad_norm": 0.5325243574651551, + "learning_rate": 9.895875497350165e-06, + "loss": 0.2138, + "step": 1425 + }, + { + "epoch": 0.47620637836032725, + "grad_norm": 0.7211936693494748, + "learning_rate": 9.895480596804684e-06, + "loss": 0.2187, + "step": 1426 + }, + { + "epoch": 0.47654032392719986, + "grad_norm": 0.5542697823075144, + "learning_rate": 9.895084956739215e-06, + "loss": 0.2251, + "step": 1427 + }, + { + "epoch": 0.47687426949407247, + "grad_norm": 0.5701293308719834, + "learning_rate": 9.894688577213527e-06, + "loss": 0.2416, + "step": 1428 + }, + { + "epoch": 0.4772082150609451, + "grad_norm": 0.5347630793631123, + "learning_rate": 9.894291458287496e-06, + "loss": 0.2292, + "step": 1429 + }, + { + "epoch": 0.4775421606278177, + "grad_norm": 0.5973098372961372, + "learning_rate": 9.893893600021112e-06, + "loss": 0.2455, + "step": 1430 + }, + { + "epoch": 0.4778761061946903, + "grad_norm": 0.5641760379191003, + "learning_rate": 9.893495002474475e-06, + "loss": 0.231, + "step": 1431 + }, + { + "epoch": 0.47821005176156284, + "grad_norm": 0.5541774904431316, + "learning_rate": 9.893095665707801e-06, + "loss": 0.2501, + "step": 1432 + }, + { + "epoch": 0.47854399732843544, + "grad_norm": 0.5136825487190733, + "learning_rate": 9.89269558978141e-06, + "loss": 0.2241, + "step": 1433 + }, + { + "epoch": 0.47887794289530805, + "grad_norm": 0.6526585039569316, + "learning_rate": 9.892294774755741e-06, + "loss": 0.25, + "step": 1434 + }, + { + "epoch": 0.47921188846218066, + "grad_norm": 0.5924925924061507, + "learning_rate": 9.891893220691343e-06, + "loss": 0.245, + "step": 1435 + }, + { + "epoch": 0.47954583402905326, + "grad_norm": 0.605221439278321, + "learning_rate": 9.891490927648872e-06, + "loss": 0.2365, + "step": 1436 + }, + { + "epoch": 0.47987977959592587, + "grad_norm": 0.5890164596674092, + "learning_rate": 9.891087895689102e-06, + "loss": 0.2298, + "step": 1437 + }, + { + "epoch": 0.4802137251627985, + "grad_norm": 0.5321381950410381, + "learning_rate": 9.890684124872914e-06, + "loss": 0.2202, + "step": 1438 + }, + { + "epoch": 0.4805476707296711, + "grad_norm": 0.7746352115111984, + "learning_rate": 9.890279615261302e-06, + "loss": 0.2241, + "step": 1439 + }, + { + "epoch": 0.4808816162965437, + "grad_norm": 0.5854514402548051, + "learning_rate": 9.889874366915374e-06, + "loss": 0.2265, + "step": 1440 + }, + { + "epoch": 0.48121556186341624, + "grad_norm": 0.5842152055041235, + "learning_rate": 9.889468379896347e-06, + "loss": 0.2398, + "step": 1441 + }, + { + "epoch": 0.48154950743028885, + "grad_norm": 0.5692166615610584, + "learning_rate": 9.88906165426555e-06, + "loss": 0.248, + "step": 1442 + }, + { + "epoch": 0.48188345299716145, + "grad_norm": 0.459765580131553, + "learning_rate": 9.888654190084422e-06, + "loss": 0.2071, + "step": 1443 + }, + { + "epoch": 0.48221739856403406, + "grad_norm": 0.6471093852235853, + "learning_rate": 9.888245987414517e-06, + "loss": 0.2368, + "step": 1444 + }, + { + "epoch": 0.48255134413090667, + "grad_norm": 0.5765295704173624, + "learning_rate": 9.8878370463175e-06, + "loss": 0.238, + "step": 1445 + }, + { + "epoch": 0.4828852896977793, + "grad_norm": 0.5371085420371692, + "learning_rate": 9.887427366855142e-06, + "loss": 0.2234, + "step": 1446 + }, + { + "epoch": 0.4832192352646519, + "grad_norm": 0.49845898630714686, + "learning_rate": 9.887016949089334e-06, + "loss": 0.2377, + "step": 1447 + }, + { + "epoch": 0.4835531808315245, + "grad_norm": 0.47885124555831643, + "learning_rate": 9.886605793082073e-06, + "loss": 0.2262, + "step": 1448 + }, + { + "epoch": 0.48388712639839704, + "grad_norm": 0.5622417184126103, + "learning_rate": 9.886193898895468e-06, + "loss": 0.2388, + "step": 1449 + }, + { + "epoch": 0.48422107196526964, + "grad_norm": 0.6126625007922746, + "learning_rate": 9.885781266591742e-06, + "loss": 0.2374, + "step": 1450 + }, + { + "epoch": 0.48455501753214225, + "grad_norm": 0.5754346810486999, + "learning_rate": 9.885367896233229e-06, + "loss": 0.2495, + "step": 1451 + }, + { + "epoch": 0.48488896309901486, + "grad_norm": 0.5679227654667943, + "learning_rate": 9.88495378788237e-06, + "loss": 0.232, + "step": 1452 + }, + { + "epoch": 0.48522290866588746, + "grad_norm": 0.523471620727347, + "learning_rate": 9.884538941601725e-06, + "loss": 0.2433, + "step": 1453 + }, + { + "epoch": 0.48555685423276007, + "grad_norm": 0.527665998819472, + "learning_rate": 9.884123357453959e-06, + "loss": 0.2264, + "step": 1454 + }, + { + "epoch": 0.4858907997996327, + "grad_norm": 0.507908698813618, + "learning_rate": 9.883707035501849e-06, + "loss": 0.2211, + "step": 1455 + }, + { + "epoch": 0.4862247453665053, + "grad_norm": 0.5790040972393914, + "learning_rate": 9.883289975808288e-06, + "loss": 0.2475, + "step": 1456 + }, + { + "epoch": 0.48655869093337784, + "grad_norm": 0.6259861327751014, + "learning_rate": 9.882872178436277e-06, + "loss": 0.2122, + "step": 1457 + }, + { + "epoch": 0.48689263650025044, + "grad_norm": 0.5921509956180295, + "learning_rate": 9.882453643448933e-06, + "loss": 0.2431, + "step": 1458 + }, + { + "epoch": 0.48722658206712305, + "grad_norm": 0.6237750475844797, + "learning_rate": 9.882034370909474e-06, + "loss": 0.2505, + "step": 1459 + }, + { + "epoch": 0.48756052763399566, + "grad_norm": 0.5668101845599637, + "learning_rate": 9.88161436088124e-06, + "loss": 0.2474, + "step": 1460 + }, + { + "epoch": 0.48789447320086826, + "grad_norm": 0.5512787179899852, + "learning_rate": 9.881193613427676e-06, + "loss": 0.2422, + "step": 1461 + }, + { + "epoch": 0.48822841876774087, + "grad_norm": 0.5569486602988732, + "learning_rate": 9.880772128612345e-06, + "loss": 0.2286, + "step": 1462 + }, + { + "epoch": 0.4885623643346135, + "grad_norm": 0.666951567922003, + "learning_rate": 9.880349906498914e-06, + "loss": 0.2386, + "step": 1463 + }, + { + "epoch": 0.4888963099014861, + "grad_norm": 0.6330185692498038, + "learning_rate": 9.879926947151164e-06, + "loss": 0.2461, + "step": 1464 + }, + { + "epoch": 0.48923025546835863, + "grad_norm": 0.5316986356744938, + "learning_rate": 9.879503250632991e-06, + "loss": 0.2424, + "step": 1465 + }, + { + "epoch": 0.48956420103523124, + "grad_norm": 0.8086554466946749, + "learning_rate": 9.879078817008395e-06, + "loss": 0.2381, + "step": 1466 + }, + { + "epoch": 0.48989814660210385, + "grad_norm": 0.6065660752029407, + "learning_rate": 9.878653646341498e-06, + "loss": 0.2331, + "step": 1467 + }, + { + "epoch": 0.49023209216897645, + "grad_norm": 0.557985159542511, + "learning_rate": 9.878227738696522e-06, + "loss": 0.2237, + "step": 1468 + }, + { + "epoch": 0.49056603773584906, + "grad_norm": 0.48904785077843377, + "learning_rate": 9.877801094137807e-06, + "loss": 0.2117, + "step": 1469 + }, + { + "epoch": 0.49089998330272167, + "grad_norm": 0.5750681520541658, + "learning_rate": 9.877373712729803e-06, + "loss": 0.2321, + "step": 1470 + }, + { + "epoch": 0.49123392886959427, + "grad_norm": 0.6187608854168688, + "learning_rate": 9.876945594537069e-06, + "loss": 0.2323, + "step": 1471 + }, + { + "epoch": 0.4915678744364669, + "grad_norm": 0.5914515580006867, + "learning_rate": 9.876516739624279e-06, + "loss": 0.2343, + "step": 1472 + }, + { + "epoch": 0.49190182000333943, + "grad_norm": 0.6132148683275411, + "learning_rate": 9.876087148056217e-06, + "loss": 0.2206, + "step": 1473 + }, + { + "epoch": 0.49223576557021204, + "grad_norm": 0.5779312466388966, + "learning_rate": 9.875656819897776e-06, + "loss": 0.2217, + "step": 1474 + }, + { + "epoch": 0.49256971113708464, + "grad_norm": 0.5934721585739572, + "learning_rate": 9.875225755213966e-06, + "loss": 0.2242, + "step": 1475 + }, + { + "epoch": 0.49290365670395725, + "grad_norm": 0.6090421834628303, + "learning_rate": 9.874793954069899e-06, + "loss": 0.2488, + "step": 1476 + }, + { + "epoch": 0.49323760227082986, + "grad_norm": 0.5063010044760622, + "learning_rate": 9.874361416530808e-06, + "loss": 0.2394, + "step": 1477 + }, + { + "epoch": 0.49357154783770246, + "grad_norm": 0.5439710083823438, + "learning_rate": 9.873928142662031e-06, + "loss": 0.2375, + "step": 1478 + }, + { + "epoch": 0.49390549340457507, + "grad_norm": 0.48631313293144357, + "learning_rate": 9.873494132529018e-06, + "loss": 0.2314, + "step": 1479 + }, + { + "epoch": 0.4942394389714477, + "grad_norm": 0.6393500940969762, + "learning_rate": 9.873059386197335e-06, + "loss": 0.2618, + "step": 1480 + }, + { + "epoch": 0.4945733845383202, + "grad_norm": 0.5827650304612245, + "learning_rate": 9.872623903732652e-06, + "loss": 0.2174, + "step": 1481 + }, + { + "epoch": 0.49490733010519283, + "grad_norm": 0.48147858509311164, + "learning_rate": 9.872187685200756e-06, + "loss": 0.2177, + "step": 1482 + }, + { + "epoch": 0.49524127567206544, + "grad_norm": 0.5515351930407174, + "learning_rate": 9.87175073066754e-06, + "loss": 0.24, + "step": 1483 + }, + { + "epoch": 0.49557522123893805, + "grad_norm": 0.565292330492309, + "learning_rate": 9.871313040199015e-06, + "loss": 0.2315, + "step": 1484 + }, + { + "epoch": 0.49590916680581065, + "grad_norm": 0.5179304120767915, + "learning_rate": 9.870874613861297e-06, + "loss": 0.2157, + "step": 1485 + }, + { + "epoch": 0.49624311237268326, + "grad_norm": 0.5509004443246928, + "learning_rate": 9.870435451720614e-06, + "loss": 0.2276, + "step": 1486 + }, + { + "epoch": 0.49657705793955587, + "grad_norm": 0.4892998005954115, + "learning_rate": 9.869995553843313e-06, + "loss": 0.2128, + "step": 1487 + }, + { + "epoch": 0.4969110035064285, + "grad_norm": 0.5037397253995877, + "learning_rate": 9.869554920295836e-06, + "loss": 0.2251, + "step": 1488 + }, + { + "epoch": 0.4972449490733011, + "grad_norm": 0.5227998261948305, + "learning_rate": 9.869113551144754e-06, + "loss": 0.2175, + "step": 1489 + }, + { + "epoch": 0.49757889464017363, + "grad_norm": 0.5995114670501726, + "learning_rate": 9.86867144645674e-06, + "loss": 0.2606, + "step": 1490 + }, + { + "epoch": 0.49791284020704624, + "grad_norm": 0.4795393090914556, + "learning_rate": 9.868228606298574e-06, + "loss": 0.2269, + "step": 1491 + }, + { + "epoch": 0.49824678577391884, + "grad_norm": 0.5497097986938829, + "learning_rate": 9.867785030737157e-06, + "loss": 0.2422, + "step": 1492 + }, + { + "epoch": 0.49858073134079145, + "grad_norm": 0.5503594416571509, + "learning_rate": 9.867340719839494e-06, + "loss": 0.241, + "step": 1493 + }, + { + "epoch": 0.49891467690766406, + "grad_norm": 0.7026689122560772, + "learning_rate": 9.866895673672704e-06, + "loss": 0.2523, + "step": 1494 + }, + { + "epoch": 0.49924862247453666, + "grad_norm": 0.5290404576726735, + "learning_rate": 9.866449892304017e-06, + "loss": 0.2233, + "step": 1495 + }, + { + "epoch": 0.49958256804140927, + "grad_norm": 0.4789461584020222, + "learning_rate": 9.866003375800773e-06, + "loss": 0.2275, + "step": 1496 + }, + { + "epoch": 0.4999165136082819, + "grad_norm": 0.5496686648273017, + "learning_rate": 9.865556124230425e-06, + "loss": 0.2269, + "step": 1497 + }, + { + "epoch": 0.5002504591751544, + "grad_norm": 0.5178066142061346, + "learning_rate": 9.865108137660533e-06, + "loss": 0.2352, + "step": 1498 + }, + { + "epoch": 0.500584404742027, + "grad_norm": 0.5198841589298615, + "learning_rate": 9.864659416158773e-06, + "loss": 0.2312, + "step": 1499 + }, + { + "epoch": 0.5009183503088996, + "grad_norm": 0.5388111513331779, + "learning_rate": 9.864209959792927e-06, + "loss": 0.2384, + "step": 1500 + }, + { + "epoch": 0.5012522958757722, + "grad_norm": 0.6553768022329063, + "learning_rate": 9.863759768630893e-06, + "loss": 0.2506, + "step": 1501 + }, + { + "epoch": 0.5015862414426449, + "grad_norm": 0.5610454814274192, + "learning_rate": 9.863308842740678e-06, + "loss": 0.2247, + "step": 1502 + }, + { + "epoch": 0.5019201870095175, + "grad_norm": 0.5502060297653726, + "learning_rate": 9.862857182190398e-06, + "loss": 0.2342, + "step": 1503 + }, + { + "epoch": 0.5022541325763901, + "grad_norm": 0.4949388903158392, + "learning_rate": 9.862404787048283e-06, + "loss": 0.2201, + "step": 1504 + }, + { + "epoch": 0.5025880781432627, + "grad_norm": 0.5240359024269955, + "learning_rate": 9.861951657382671e-06, + "loss": 0.2426, + "step": 1505 + }, + { + "epoch": 0.5029220237101353, + "grad_norm": 0.5132026830298406, + "learning_rate": 9.861497793262014e-06, + "loss": 0.2245, + "step": 1506 + }, + { + "epoch": 0.5032559692770079, + "grad_norm": 0.5198774578302745, + "learning_rate": 9.861043194754874e-06, + "loss": 0.2403, + "step": 1507 + }, + { + "epoch": 0.5035899148438805, + "grad_norm": 0.5039524096753569, + "learning_rate": 9.860587861929922e-06, + "loss": 0.2159, + "step": 1508 + }, + { + "epoch": 0.5039238604107531, + "grad_norm": 0.5448793630792352, + "learning_rate": 9.86013179485594e-06, + "loss": 0.2316, + "step": 1509 + }, + { + "epoch": 0.5042578059776256, + "grad_norm": 0.48213974652147173, + "learning_rate": 9.859674993601826e-06, + "loss": 0.2242, + "step": 1510 + }, + { + "epoch": 0.5045917515444982, + "grad_norm": 0.590315746105308, + "learning_rate": 9.859217458236583e-06, + "loss": 0.2384, + "step": 1511 + }, + { + "epoch": 0.5049256971113708, + "grad_norm": 0.5104158501610313, + "learning_rate": 9.858759188829328e-06, + "loss": 0.2426, + "step": 1512 + }, + { + "epoch": 0.5052596426782434, + "grad_norm": 0.582674382044229, + "learning_rate": 9.858300185449287e-06, + "loss": 0.2401, + "step": 1513 + }, + { + "epoch": 0.505593588245116, + "grad_norm": 0.5203261108265982, + "learning_rate": 9.857840448165798e-06, + "loss": 0.2391, + "step": 1514 + }, + { + "epoch": 0.5059275338119886, + "grad_norm": 0.48683141373358735, + "learning_rate": 9.857379977048311e-06, + "loss": 0.2185, + "step": 1515 + }, + { + "epoch": 0.5062614793788612, + "grad_norm": 0.5561097986159383, + "learning_rate": 9.856918772166385e-06, + "loss": 0.2403, + "step": 1516 + }, + { + "epoch": 0.5065954249457338, + "grad_norm": 0.6235558405935004, + "learning_rate": 9.856456833589688e-06, + "loss": 0.2183, + "step": 1517 + }, + { + "epoch": 0.5069293705126064, + "grad_norm": 0.6446845091009377, + "learning_rate": 9.855994161388005e-06, + "loss": 0.2333, + "step": 1518 + }, + { + "epoch": 0.507263316079479, + "grad_norm": 0.5687674110037545, + "learning_rate": 9.855530755631226e-06, + "loss": 0.2428, + "step": 1519 + }, + { + "epoch": 0.5075972616463517, + "grad_norm": 0.5587656168664682, + "learning_rate": 9.855066616389356e-06, + "loss": 0.2369, + "step": 1520 + }, + { + "epoch": 0.5079312072132243, + "grad_norm": 0.7359238587828697, + "learning_rate": 9.854601743732504e-06, + "loss": 0.2317, + "step": 1521 + }, + { + "epoch": 0.5082651527800969, + "grad_norm": 0.5066229049650518, + "learning_rate": 9.854136137730899e-06, + "loss": 0.2337, + "step": 1522 + }, + { + "epoch": 0.5085990983469695, + "grad_norm": 0.49537075038118694, + "learning_rate": 9.853669798454875e-06, + "loss": 0.2243, + "step": 1523 + }, + { + "epoch": 0.5089330439138421, + "grad_norm": 0.6022772076474969, + "learning_rate": 9.853202725974878e-06, + "loss": 0.2415, + "step": 1524 + }, + { + "epoch": 0.5092669894807147, + "grad_norm": 0.608749379967523, + "learning_rate": 9.852734920361465e-06, + "loss": 0.2345, + "step": 1525 + }, + { + "epoch": 0.5096009350475872, + "grad_norm": 0.6715781075920919, + "learning_rate": 9.8522663816853e-06, + "loss": 0.2641, + "step": 1526 + }, + { + "epoch": 0.5099348806144598, + "grad_norm": 0.5101715269421332, + "learning_rate": 9.851797110017167e-06, + "loss": 0.2196, + "step": 1527 + }, + { + "epoch": 0.5102688261813324, + "grad_norm": 0.511537007454956, + "learning_rate": 9.851327105427952e-06, + "loss": 0.2133, + "step": 1528 + }, + { + "epoch": 0.510602771748205, + "grad_norm": 0.5178737587676562, + "learning_rate": 9.850856367988657e-06, + "loss": 0.2185, + "step": 1529 + }, + { + "epoch": 0.5109367173150776, + "grad_norm": 0.5942221037087895, + "learning_rate": 9.850384897770388e-06, + "loss": 0.245, + "step": 1530 + }, + { + "epoch": 0.5112706628819502, + "grad_norm": 0.5444327388738013, + "learning_rate": 9.84991269484437e-06, + "loss": 0.2412, + "step": 1531 + }, + { + "epoch": 0.5116046084488228, + "grad_norm": 0.5352070717510221, + "learning_rate": 9.849439759281934e-06, + "loss": 0.2304, + "step": 1532 + }, + { + "epoch": 0.5119385540156954, + "grad_norm": 0.5602568244179051, + "learning_rate": 9.848966091154522e-06, + "loss": 0.238, + "step": 1533 + }, + { + "epoch": 0.512272499582568, + "grad_norm": 0.5211858362853864, + "learning_rate": 9.848491690533686e-06, + "loss": 0.2376, + "step": 1534 + }, + { + "epoch": 0.5126064451494406, + "grad_norm": 0.5783858287499946, + "learning_rate": 9.848016557491092e-06, + "loss": 0.2582, + "step": 1535 + }, + { + "epoch": 0.5129403907163133, + "grad_norm": 0.5915548412171439, + "learning_rate": 9.847540692098513e-06, + "loss": 0.2462, + "step": 1536 + }, + { + "epoch": 0.5132743362831859, + "grad_norm": 0.5380120667594149, + "learning_rate": 9.847064094427835e-06, + "loss": 0.244, + "step": 1537 + }, + { + "epoch": 0.5136082818500585, + "grad_norm": 0.5931387539258556, + "learning_rate": 9.846586764551054e-06, + "loss": 0.241, + "step": 1538 + }, + { + "epoch": 0.5139422274169311, + "grad_norm": 0.6732547388174122, + "learning_rate": 9.846108702540274e-06, + "loss": 0.2453, + "step": 1539 + }, + { + "epoch": 0.5142761729838037, + "grad_norm": 0.5708516751959783, + "learning_rate": 9.845629908467714e-06, + "loss": 0.2253, + "step": 1540 + }, + { + "epoch": 0.5146101185506763, + "grad_norm": 0.5964930644824759, + "learning_rate": 9.8451503824057e-06, + "loss": 0.205, + "step": 1541 + }, + { + "epoch": 0.5149440641175489, + "grad_norm": 0.5433778021010894, + "learning_rate": 9.844670124426672e-06, + "loss": 0.2151, + "step": 1542 + }, + { + "epoch": 0.5152780096844214, + "grad_norm": 0.5567634491740848, + "learning_rate": 9.844189134603178e-06, + "loss": 0.2154, + "step": 1543 + }, + { + "epoch": 0.515611955251294, + "grad_norm": 0.4903197026217307, + "learning_rate": 9.843707413007874e-06, + "loss": 0.2137, + "step": 1544 + }, + { + "epoch": 0.5159459008181666, + "grad_norm": 0.5960697433572092, + "learning_rate": 9.843224959713535e-06, + "loss": 0.2527, + "step": 1545 + }, + { + "epoch": 0.5162798463850392, + "grad_norm": 0.4944143034523633, + "learning_rate": 9.842741774793038e-06, + "loss": 0.2307, + "step": 1546 + }, + { + "epoch": 0.5166137919519118, + "grad_norm": 0.49358314195918296, + "learning_rate": 9.842257858319375e-06, + "loss": 0.2179, + "step": 1547 + }, + { + "epoch": 0.5169477375187844, + "grad_norm": 0.5796108269031679, + "learning_rate": 9.841773210365646e-06, + "loss": 0.2519, + "step": 1548 + }, + { + "epoch": 0.517281683085657, + "grad_norm": 0.5138397804293885, + "learning_rate": 9.841287831005064e-06, + "loss": 0.2305, + "step": 1549 + }, + { + "epoch": 0.5176156286525296, + "grad_norm": 0.5250071600094292, + "learning_rate": 9.84080172031095e-06, + "loss": 0.2311, + "step": 1550 + }, + { + "epoch": 0.5179495742194022, + "grad_norm": 0.5574764925023964, + "learning_rate": 9.840314878356739e-06, + "loss": 0.2323, + "step": 1551 + }, + { + "epoch": 0.5182835197862748, + "grad_norm": 0.6140408300146574, + "learning_rate": 9.839827305215972e-06, + "loss": 0.2533, + "step": 1552 + }, + { + "epoch": 0.5186174653531475, + "grad_norm": 0.7516633182539431, + "learning_rate": 9.839339000962305e-06, + "loss": 0.226, + "step": 1553 + }, + { + "epoch": 0.5189514109200201, + "grad_norm": 0.5458694547214934, + "learning_rate": 9.838849965669499e-06, + "loss": 0.2325, + "step": 1554 + }, + { + "epoch": 0.5192853564868927, + "grad_norm": 0.5064986538497875, + "learning_rate": 9.83836019941143e-06, + "loss": 0.2114, + "step": 1555 + }, + { + "epoch": 0.5196193020537653, + "grad_norm": 0.61319519408008, + "learning_rate": 9.837869702262082e-06, + "loss": 0.2473, + "step": 1556 + }, + { + "epoch": 0.5199532476206379, + "grad_norm": 0.5565828724070242, + "learning_rate": 9.837378474295553e-06, + "loss": 0.2332, + "step": 1557 + }, + { + "epoch": 0.5202871931875105, + "grad_norm": 0.5064088876577069, + "learning_rate": 9.836886515586045e-06, + "loss": 0.217, + "step": 1558 + }, + { + "epoch": 0.520621138754383, + "grad_norm": 0.536537012097158, + "learning_rate": 9.83639382620788e-06, + "loss": 0.2276, + "step": 1559 + }, + { + "epoch": 0.5209550843212556, + "grad_norm": 0.518315034919932, + "learning_rate": 9.835900406235479e-06, + "loss": 0.2247, + "step": 1560 + }, + { + "epoch": 0.5212890298881282, + "grad_norm": 0.5401437965180717, + "learning_rate": 9.835406255743381e-06, + "loss": 0.229, + "step": 1561 + }, + { + "epoch": 0.5216229754550008, + "grad_norm": 0.5105619541123207, + "learning_rate": 9.834911374806231e-06, + "loss": 0.2335, + "step": 1562 + }, + { + "epoch": 0.5219569210218734, + "grad_norm": 0.519164367586733, + "learning_rate": 9.83441576349879e-06, + "loss": 0.2251, + "step": 1563 + }, + { + "epoch": 0.522290866588746, + "grad_norm": 0.569853562629724, + "learning_rate": 9.833919421895926e-06, + "loss": 0.2537, + "step": 1564 + }, + { + "epoch": 0.5226248121556186, + "grad_norm": 0.516625707086611, + "learning_rate": 9.833422350072615e-06, + "loss": 0.2161, + "step": 1565 + }, + { + "epoch": 0.5229587577224912, + "grad_norm": 0.6535880193463475, + "learning_rate": 9.832924548103945e-06, + "loss": 0.209, + "step": 1566 + }, + { + "epoch": 0.5232927032893638, + "grad_norm": 0.4916515442829188, + "learning_rate": 9.832426016065117e-06, + "loss": 0.2185, + "step": 1567 + }, + { + "epoch": 0.5236266488562364, + "grad_norm": 0.5814664612841597, + "learning_rate": 9.83192675403144e-06, + "loss": 0.2266, + "step": 1568 + }, + { + "epoch": 0.523960594423109, + "grad_norm": 0.5884057167346974, + "learning_rate": 9.831426762078331e-06, + "loss": 0.2244, + "step": 1569 + }, + { + "epoch": 0.5242945399899817, + "grad_norm": 0.6183375261868602, + "learning_rate": 9.830926040281321e-06, + "loss": 0.2445, + "step": 1570 + }, + { + "epoch": 0.5246284855568543, + "grad_norm": 0.5200985380140108, + "learning_rate": 9.830424588716053e-06, + "loss": 0.2248, + "step": 1571 + }, + { + "epoch": 0.5249624311237269, + "grad_norm": 0.5420915104887184, + "learning_rate": 9.829922407458273e-06, + "loss": 0.2327, + "step": 1572 + }, + { + "epoch": 0.5252963766905995, + "grad_norm": 0.5270598323458139, + "learning_rate": 9.829419496583843e-06, + "loss": 0.2256, + "step": 1573 + }, + { + "epoch": 0.5256303222574721, + "grad_norm": 0.5239074446729693, + "learning_rate": 9.828915856168734e-06, + "loss": 0.2284, + "step": 1574 + }, + { + "epoch": 0.5259642678243446, + "grad_norm": 0.5489528638296536, + "learning_rate": 9.828411486289026e-06, + "loss": 0.2346, + "step": 1575 + }, + { + "epoch": 0.5262982133912172, + "grad_norm": 0.5306622558740911, + "learning_rate": 9.82790638702091e-06, + "loss": 0.2334, + "step": 1576 + }, + { + "epoch": 0.5266321589580898, + "grad_norm": 0.5020797546266, + "learning_rate": 9.827400558440687e-06, + "loss": 0.2193, + "step": 1577 + }, + { + "epoch": 0.5269661045249624, + "grad_norm": 0.4958858002471795, + "learning_rate": 9.826894000624769e-06, + "loss": 0.2214, + "step": 1578 + }, + { + "epoch": 0.527300050091835, + "grad_norm": 0.5495730199464439, + "learning_rate": 9.826386713649678e-06, + "loss": 0.2391, + "step": 1579 + }, + { + "epoch": 0.5276339956587076, + "grad_norm": 0.5337213938676789, + "learning_rate": 9.825878697592046e-06, + "loss": 0.241, + "step": 1580 + }, + { + "epoch": 0.5279679412255802, + "grad_norm": 0.5243139931047547, + "learning_rate": 9.825369952528611e-06, + "loss": 0.2347, + "step": 1581 + }, + { + "epoch": 0.5283018867924528, + "grad_norm": 0.46951090232168313, + "learning_rate": 9.824860478536231e-06, + "loss": 0.2174, + "step": 1582 + }, + { + "epoch": 0.5286358323593254, + "grad_norm": 0.5591143330075115, + "learning_rate": 9.824350275691864e-06, + "loss": 0.2315, + "step": 1583 + }, + { + "epoch": 0.528969777926198, + "grad_norm": 0.5741634648736227, + "learning_rate": 9.823839344072582e-06, + "loss": 0.2392, + "step": 1584 + }, + { + "epoch": 0.5293037234930706, + "grad_norm": 0.5477207037243178, + "learning_rate": 9.823327683755566e-06, + "loss": 0.2479, + "step": 1585 + }, + { + "epoch": 0.5296376690599433, + "grad_norm": 0.5814226357210571, + "learning_rate": 9.822815294818113e-06, + "loss": 0.2503, + "step": 1586 + }, + { + "epoch": 0.5299716146268159, + "grad_norm": 0.5959177044049923, + "learning_rate": 9.822302177337624e-06, + "loss": 0.2414, + "step": 1587 + }, + { + "epoch": 0.5303055601936885, + "grad_norm": 0.5034065372193987, + "learning_rate": 9.821788331391609e-06, + "loss": 0.226, + "step": 1588 + }, + { + "epoch": 0.5306395057605611, + "grad_norm": 0.5444917509379016, + "learning_rate": 9.821273757057692e-06, + "loss": 0.2208, + "step": 1589 + }, + { + "epoch": 0.5309734513274337, + "grad_norm": 0.6340433755540438, + "learning_rate": 9.820758454413606e-06, + "loss": 0.2341, + "step": 1590 + }, + { + "epoch": 0.5313073968943063, + "grad_norm": 0.5068735932793473, + "learning_rate": 9.820242423537192e-06, + "loss": 0.2279, + "step": 1591 + }, + { + "epoch": 0.5316413424611788, + "grad_norm": 0.5514835902750657, + "learning_rate": 9.819725664506404e-06, + "loss": 0.2397, + "step": 1592 + }, + { + "epoch": 0.5319752880280514, + "grad_norm": 0.540033557370152, + "learning_rate": 9.819208177399303e-06, + "loss": 0.2239, + "step": 1593 + }, + { + "epoch": 0.532309233594924, + "grad_norm": 0.592977095077727, + "learning_rate": 9.818689962294063e-06, + "loss": 0.2339, + "step": 1594 + }, + { + "epoch": 0.5326431791617966, + "grad_norm": 0.5871207671435377, + "learning_rate": 9.818171019268965e-06, + "loss": 0.2293, + "step": 1595 + }, + { + "epoch": 0.5329771247286692, + "grad_norm": 0.5947688900274528, + "learning_rate": 9.817651348402403e-06, + "loss": 0.2514, + "step": 1596 + }, + { + "epoch": 0.5333110702955418, + "grad_norm": 0.5812926758957463, + "learning_rate": 9.81713094977288e-06, + "loss": 0.2248, + "step": 1597 + }, + { + "epoch": 0.5336450158624144, + "grad_norm": 0.513970380059725, + "learning_rate": 9.816609823459007e-06, + "loss": 0.2375, + "step": 1598 + }, + { + "epoch": 0.533978961429287, + "grad_norm": 0.5034512417703262, + "learning_rate": 9.816087969539506e-06, + "loss": 0.2287, + "step": 1599 + }, + { + "epoch": 0.5343129069961596, + "grad_norm": 0.5550546878199349, + "learning_rate": 9.815565388093209e-06, + "loss": 0.2396, + "step": 1600 + }, + { + "epoch": 0.5346468525630322, + "grad_norm": 0.5324993847600048, + "learning_rate": 9.81504207919906e-06, + "loss": 0.2223, + "step": 1601 + }, + { + "epoch": 0.5349807981299048, + "grad_norm": 0.526275829556474, + "learning_rate": 9.814518042936107e-06, + "loss": 0.2431, + "step": 1602 + }, + { + "epoch": 0.5353147436967775, + "grad_norm": 0.5494997861868455, + "learning_rate": 9.813993279383518e-06, + "loss": 0.2323, + "step": 1603 + }, + { + "epoch": 0.5356486892636501, + "grad_norm": 0.5171193572022826, + "learning_rate": 9.813467788620559e-06, + "loss": 0.2416, + "step": 1604 + }, + { + "epoch": 0.5359826348305227, + "grad_norm": 0.5027893595121139, + "learning_rate": 9.812941570726615e-06, + "loss": 0.218, + "step": 1605 + }, + { + "epoch": 0.5363165803973953, + "grad_norm": 0.5506740906758446, + "learning_rate": 9.812414625781175e-06, + "loss": 0.2296, + "step": 1606 + }, + { + "epoch": 0.5366505259642679, + "grad_norm": 0.5183511110941164, + "learning_rate": 9.811886953863841e-06, + "loss": 0.2082, + "step": 1607 + }, + { + "epoch": 0.5369844715311404, + "grad_norm": 0.6302353066399081, + "learning_rate": 9.811358555054326e-06, + "loss": 0.2228, + "step": 1608 + }, + { + "epoch": 0.537318417098013, + "grad_norm": 0.5341795817076737, + "learning_rate": 9.810829429432449e-06, + "loss": 0.2293, + "step": 1609 + }, + { + "epoch": 0.5376523626648856, + "grad_norm": 0.5763059453482908, + "learning_rate": 9.81029957707814e-06, + "loss": 0.2558, + "step": 1610 + }, + { + "epoch": 0.5379863082317582, + "grad_norm": 0.6041426482444534, + "learning_rate": 9.809768998071442e-06, + "loss": 0.2293, + "step": 1611 + }, + { + "epoch": 0.5383202537986308, + "grad_norm": 0.5132151326478286, + "learning_rate": 9.809237692492503e-06, + "loss": 0.2185, + "step": 1612 + }, + { + "epoch": 0.5386541993655034, + "grad_norm": 0.5391826703922983, + "learning_rate": 9.808705660421582e-06, + "loss": 0.2323, + "step": 1613 + }, + { + "epoch": 0.538988144932376, + "grad_norm": 0.6461704927245718, + "learning_rate": 9.808172901939053e-06, + "loss": 0.2125, + "step": 1614 + }, + { + "epoch": 0.5393220904992486, + "grad_norm": 0.5270375186381666, + "learning_rate": 9.807639417125392e-06, + "loss": 0.2176, + "step": 1615 + }, + { + "epoch": 0.5396560360661212, + "grad_norm": 0.5445635894652113, + "learning_rate": 9.807105206061186e-06, + "loss": 0.2274, + "step": 1616 + }, + { + "epoch": 0.5399899816329938, + "grad_norm": 0.5317828887600755, + "learning_rate": 9.80657026882714e-06, + "loss": 0.2369, + "step": 1617 + }, + { + "epoch": 0.5403239271998664, + "grad_norm": 0.5686035453100612, + "learning_rate": 9.80603460550406e-06, + "loss": 0.2321, + "step": 1618 + }, + { + "epoch": 0.540657872766739, + "grad_norm": 0.527823856857519, + "learning_rate": 9.805498216172861e-06, + "loss": 0.2326, + "step": 1619 + }, + { + "epoch": 0.5409918183336117, + "grad_norm": 0.6946489211645083, + "learning_rate": 9.804961100914575e-06, + "loss": 0.2275, + "step": 1620 + }, + { + "epoch": 0.5413257639004843, + "grad_norm": 0.5021269916482256, + "learning_rate": 9.804423259810338e-06, + "loss": 0.217, + "step": 1621 + }, + { + "epoch": 0.5416597094673569, + "grad_norm": 0.5280955200768908, + "learning_rate": 9.803884692941397e-06, + "loss": 0.232, + "step": 1622 + }, + { + "epoch": 0.5419936550342295, + "grad_norm": 0.524502353180037, + "learning_rate": 9.803345400389111e-06, + "loss": 0.2295, + "step": 1623 + }, + { + "epoch": 0.542327600601102, + "grad_norm": 0.5588116006844829, + "learning_rate": 9.802805382234941e-06, + "loss": 0.2533, + "step": 1624 + }, + { + "epoch": 0.5426615461679746, + "grad_norm": 0.500852907399858, + "learning_rate": 9.80226463856047e-06, + "loss": 0.2134, + "step": 1625 + }, + { + "epoch": 0.5429954917348472, + "grad_norm": 0.6862541833794148, + "learning_rate": 9.801723169447378e-06, + "loss": 0.2506, + "step": 1626 + }, + { + "epoch": 0.5433294373017198, + "grad_norm": 0.5040386542518999, + "learning_rate": 9.801180974977466e-06, + "loss": 0.227, + "step": 1627 + }, + { + "epoch": 0.5436633828685924, + "grad_norm": 0.5474990228218318, + "learning_rate": 9.800638055232635e-06, + "loss": 0.2166, + "step": 1628 + }, + { + "epoch": 0.543997328435465, + "grad_norm": 0.6388226847856388, + "learning_rate": 9.800094410294897e-06, + "loss": 0.2487, + "step": 1629 + }, + { + "epoch": 0.5443312740023376, + "grad_norm": 0.4765248139449764, + "learning_rate": 9.799550040246381e-06, + "loss": 0.2129, + "step": 1630 + }, + { + "epoch": 0.5446652195692102, + "grad_norm": 0.5019451317873559, + "learning_rate": 9.799004945169319e-06, + "loss": 0.2038, + "step": 1631 + }, + { + "epoch": 0.5449991651360828, + "grad_norm": 0.48693504267132603, + "learning_rate": 9.798459125146054e-06, + "loss": 0.2379, + "step": 1632 + }, + { + "epoch": 0.5453331107029554, + "grad_norm": 0.5011793654105599, + "learning_rate": 9.797912580259037e-06, + "loss": 0.2278, + "step": 1633 + }, + { + "epoch": 0.545667056269828, + "grad_norm": 0.5627274706249237, + "learning_rate": 9.797365310590832e-06, + "loss": 0.2399, + "step": 1634 + }, + { + "epoch": 0.5460010018367006, + "grad_norm": 0.5317296411359453, + "learning_rate": 9.796817316224107e-06, + "loss": 0.2313, + "step": 1635 + }, + { + "epoch": 0.5463349474035732, + "grad_norm": 0.5256731560457311, + "learning_rate": 9.79626859724165e-06, + "loss": 0.2274, + "step": 1636 + }, + { + "epoch": 0.5466688929704459, + "grad_norm": 0.5295939011926792, + "learning_rate": 9.795719153726345e-06, + "loss": 0.2306, + "step": 1637 + }, + { + "epoch": 0.5470028385373185, + "grad_norm": 0.49722225435126244, + "learning_rate": 9.795168985761192e-06, + "loss": 0.2424, + "step": 1638 + }, + { + "epoch": 0.5473367841041911, + "grad_norm": 0.4782717883596217, + "learning_rate": 9.794618093429305e-06, + "loss": 0.2337, + "step": 1639 + }, + { + "epoch": 0.5476707296710637, + "grad_norm": 0.48131945296813267, + "learning_rate": 9.794066476813901e-06, + "loss": 0.2223, + "step": 1640 + }, + { + "epoch": 0.5480046752379362, + "grad_norm": 0.5320451343577421, + "learning_rate": 9.793514135998306e-06, + "loss": 0.2194, + "step": 1641 + }, + { + "epoch": 0.5483386208048088, + "grad_norm": 0.5662644769319405, + "learning_rate": 9.792961071065958e-06, + "loss": 0.2404, + "step": 1642 + }, + { + "epoch": 0.5486725663716814, + "grad_norm": 0.4708050063434642, + "learning_rate": 9.792407282100407e-06, + "loss": 0.2125, + "step": 1643 + }, + { + "epoch": 0.549006511938554, + "grad_norm": 0.5331092111658083, + "learning_rate": 9.791852769185306e-06, + "loss": 0.2317, + "step": 1644 + }, + { + "epoch": 0.5493404575054266, + "grad_norm": 0.46285348701283613, + "learning_rate": 9.791297532404422e-06, + "loss": 0.2188, + "step": 1645 + }, + { + "epoch": 0.5496744030722992, + "grad_norm": 0.6966380998201791, + "learning_rate": 9.790741571841629e-06, + "loss": 0.2434, + "step": 1646 + }, + { + "epoch": 0.5500083486391718, + "grad_norm": 0.6705149653085449, + "learning_rate": 9.790184887580914e-06, + "loss": 0.2545, + "step": 1647 + }, + { + "epoch": 0.5503422942060444, + "grad_norm": 0.5488413845730032, + "learning_rate": 9.78962747970637e-06, + "loss": 0.2192, + "step": 1648 + }, + { + "epoch": 0.550676239772917, + "grad_norm": 0.508399250930862, + "learning_rate": 9.789069348302197e-06, + "loss": 0.2147, + "step": 1649 + }, + { + "epoch": 0.5510101853397896, + "grad_norm": 0.5308431015969063, + "learning_rate": 9.78851049345271e-06, + "loss": 0.2451, + "step": 1650 + }, + { + "epoch": 0.5513441309066622, + "grad_norm": 0.5254441033154762, + "learning_rate": 9.78795091524233e-06, + "loss": 0.2307, + "step": 1651 + }, + { + "epoch": 0.5516780764735348, + "grad_norm": 0.49287285493077243, + "learning_rate": 9.78739061375559e-06, + "loss": 0.2279, + "step": 1652 + }, + { + "epoch": 0.5520120220404074, + "grad_norm": 0.6250540163648396, + "learning_rate": 9.786829589077125e-06, + "loss": 0.2568, + "step": 1653 + }, + { + "epoch": 0.55234596760728, + "grad_norm": 0.5757068773628706, + "learning_rate": 9.78626784129169e-06, + "loss": 0.2388, + "step": 1654 + }, + { + "epoch": 0.5526799131741527, + "grad_norm": 0.5131405555742891, + "learning_rate": 9.78570537048414e-06, + "loss": 0.2195, + "step": 1655 + }, + { + "epoch": 0.5530138587410253, + "grad_norm": 0.5273437954722322, + "learning_rate": 9.785142176739444e-06, + "loss": 0.234, + "step": 1656 + }, + { + "epoch": 0.5533478043078978, + "grad_norm": 0.5715667449885279, + "learning_rate": 9.784578260142679e-06, + "loss": 0.2364, + "step": 1657 + }, + { + "epoch": 0.5536817498747704, + "grad_norm": 0.4662686706134208, + "learning_rate": 9.784013620779031e-06, + "loss": 0.2174, + "step": 1658 + }, + { + "epoch": 0.554015695441643, + "grad_norm": 0.5573487695151922, + "learning_rate": 9.783448258733795e-06, + "loss": 0.2393, + "step": 1659 + }, + { + "epoch": 0.5543496410085156, + "grad_norm": 0.6004739211376822, + "learning_rate": 9.782882174092377e-06, + "loss": 0.2366, + "step": 1660 + }, + { + "epoch": 0.5546835865753882, + "grad_norm": 0.5130868339214583, + "learning_rate": 9.78231536694029e-06, + "loss": 0.2317, + "step": 1661 + }, + { + "epoch": 0.5550175321422608, + "grad_norm": 0.5081615667438504, + "learning_rate": 9.781747837363158e-06, + "loss": 0.2211, + "step": 1662 + }, + { + "epoch": 0.5553514777091334, + "grad_norm": 0.5243058551452536, + "learning_rate": 9.781179585446711e-06, + "loss": 0.2321, + "step": 1663 + }, + { + "epoch": 0.555685423276006, + "grad_norm": 0.5552330996626126, + "learning_rate": 9.780610611276791e-06, + "loss": 0.2263, + "step": 1664 + }, + { + "epoch": 0.5560193688428786, + "grad_norm": 0.5084354233773488, + "learning_rate": 9.780040914939349e-06, + "loss": 0.2209, + "step": 1665 + }, + { + "epoch": 0.5563533144097512, + "grad_norm": 0.8052644359842761, + "learning_rate": 9.779470496520442e-06, + "loss": 0.2786, + "step": 1666 + }, + { + "epoch": 0.5566872599766238, + "grad_norm": 0.5320336558276018, + "learning_rate": 9.77889935610624e-06, + "loss": 0.2182, + "step": 1667 + }, + { + "epoch": 0.5570212055434964, + "grad_norm": 0.5365736296465224, + "learning_rate": 9.778327493783022e-06, + "loss": 0.2251, + "step": 1668 + }, + { + "epoch": 0.557355151110369, + "grad_norm": 0.4630060106256875, + "learning_rate": 9.777754909637173e-06, + "loss": 0.2072, + "step": 1669 + }, + { + "epoch": 0.5576890966772416, + "grad_norm": 0.48881473250954816, + "learning_rate": 9.777181603755188e-06, + "loss": 0.2251, + "step": 1670 + }, + { + "epoch": 0.5580230422441143, + "grad_norm": 0.5416086012245569, + "learning_rate": 9.776607576223673e-06, + "loss": 0.2241, + "step": 1671 + }, + { + "epoch": 0.5583569878109869, + "grad_norm": 0.5213227615067991, + "learning_rate": 9.776032827129338e-06, + "loss": 0.221, + "step": 1672 + }, + { + "epoch": 0.5586909333778594, + "grad_norm": 0.5756279722517523, + "learning_rate": 9.775457356559013e-06, + "loss": 0.2274, + "step": 1673 + }, + { + "epoch": 0.559024878944732, + "grad_norm": 0.7857019129301895, + "learning_rate": 9.774881164599621e-06, + "loss": 0.2443, + "step": 1674 + }, + { + "epoch": 0.5593588245116046, + "grad_norm": 0.5730959574701345, + "learning_rate": 9.77430425133821e-06, + "loss": 0.2275, + "step": 1675 + }, + { + "epoch": 0.5596927700784772, + "grad_norm": 0.5424650987984629, + "learning_rate": 9.773726616861926e-06, + "loss": 0.2386, + "step": 1676 + }, + { + "epoch": 0.5600267156453498, + "grad_norm": 0.527455714283128, + "learning_rate": 9.773148261258025e-06, + "loss": 0.2316, + "step": 1677 + }, + { + "epoch": 0.5603606612122224, + "grad_norm": 0.6038936532104409, + "learning_rate": 9.772569184613879e-06, + "loss": 0.2251, + "step": 1678 + }, + { + "epoch": 0.560694606779095, + "grad_norm": 0.500342452052048, + "learning_rate": 9.771989387016962e-06, + "loss": 0.2301, + "step": 1679 + }, + { + "epoch": 0.5610285523459676, + "grad_norm": 0.5596628524290133, + "learning_rate": 9.77140886855486e-06, + "loss": 0.2347, + "step": 1680 + }, + { + "epoch": 0.5613624979128402, + "grad_norm": 0.513227216859956, + "learning_rate": 9.770827629315266e-06, + "loss": 0.2316, + "step": 1681 + }, + { + "epoch": 0.5616964434797128, + "grad_norm": 0.5571244273244604, + "learning_rate": 9.770245669385984e-06, + "loss": 0.2246, + "step": 1682 + }, + { + "epoch": 0.5620303890465854, + "grad_norm": 0.5315880903884372, + "learning_rate": 9.76966298885493e-06, + "loss": 0.2215, + "step": 1683 + }, + { + "epoch": 0.562364334613458, + "grad_norm": 0.49598745769160424, + "learning_rate": 9.769079587810115e-06, + "loss": 0.2309, + "step": 1684 + }, + { + "epoch": 0.5626982801803306, + "grad_norm": 0.5013001295478747, + "learning_rate": 9.768495466339675e-06, + "loss": 0.2294, + "step": 1685 + }, + { + "epoch": 0.5630322257472032, + "grad_norm": 0.5009756499143495, + "learning_rate": 9.767910624531852e-06, + "loss": 0.2178, + "step": 1686 + }, + { + "epoch": 0.5633661713140758, + "grad_norm": 0.5041983827071985, + "learning_rate": 9.767325062474984e-06, + "loss": 0.2264, + "step": 1687 + }, + { + "epoch": 0.5637001168809485, + "grad_norm": 0.5388073393930474, + "learning_rate": 9.766738780257535e-06, + "loss": 0.2443, + "step": 1688 + }, + { + "epoch": 0.564034062447821, + "grad_norm": 0.4849109322874211, + "learning_rate": 9.766151777968063e-06, + "loss": 0.2238, + "step": 1689 + }, + { + "epoch": 0.5643680080146936, + "grad_norm": 0.5165836599020902, + "learning_rate": 9.765564055695249e-06, + "loss": 0.2492, + "step": 1690 + }, + { + "epoch": 0.5647019535815662, + "grad_norm": 0.5901306226307081, + "learning_rate": 9.76497561352787e-06, + "loss": 0.2369, + "step": 1691 + }, + { + "epoch": 0.5650358991484388, + "grad_norm": 0.529734231185276, + "learning_rate": 9.764386451554819e-06, + "loss": 0.2289, + "step": 1692 + }, + { + "epoch": 0.5653698447153114, + "grad_norm": 0.5307573390534124, + "learning_rate": 9.763796569865095e-06, + "loss": 0.2312, + "step": 1693 + }, + { + "epoch": 0.565703790282184, + "grad_norm": 0.5486503617208062, + "learning_rate": 9.763205968547808e-06, + "loss": 0.2162, + "step": 1694 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 0.5649817924362338, + "learning_rate": 9.762614647692175e-06, + "loss": 0.2191, + "step": 1695 + }, + { + "epoch": 0.5663716814159292, + "grad_norm": 0.533218563853688, + "learning_rate": 9.762022607387522e-06, + "loss": 0.2277, + "step": 1696 + }, + { + "epoch": 0.5667056269828018, + "grad_norm": 0.5396005837210106, + "learning_rate": 9.761429847723281e-06, + "loss": 0.251, + "step": 1697 + }, + { + "epoch": 0.5670395725496744, + "grad_norm": 0.6021708734947762, + "learning_rate": 9.760836368788999e-06, + "loss": 0.2298, + "step": 1698 + }, + { + "epoch": 0.567373518116547, + "grad_norm": 0.5571831043526276, + "learning_rate": 9.760242170674325e-06, + "loss": 0.221, + "step": 1699 + }, + { + "epoch": 0.5677074636834196, + "grad_norm": 0.6496040372874637, + "learning_rate": 9.759647253469023e-06, + "loss": 0.2518, + "step": 1700 + }, + { + "epoch": 0.5680414092502922, + "grad_norm": 0.5260316723253985, + "learning_rate": 9.75905161726296e-06, + "loss": 0.2489, + "step": 1701 + }, + { + "epoch": 0.5683753548171648, + "grad_norm": 0.522847565423199, + "learning_rate": 9.758455262146114e-06, + "loss": 0.2379, + "step": 1702 + }, + { + "epoch": 0.5687093003840374, + "grad_norm": 0.5215908948559349, + "learning_rate": 9.757858188208571e-06, + "loss": 0.2267, + "step": 1703 + }, + { + "epoch": 0.56904324595091, + "grad_norm": 0.4937749913698861, + "learning_rate": 9.757260395540527e-06, + "loss": 0.2167, + "step": 1704 + }, + { + "epoch": 0.5693771915177827, + "grad_norm": 0.5231134607360919, + "learning_rate": 9.756661884232286e-06, + "loss": 0.2288, + "step": 1705 + }, + { + "epoch": 0.5697111370846552, + "grad_norm": 0.48578184135466346, + "learning_rate": 9.756062654374259e-06, + "loss": 0.2254, + "step": 1706 + }, + { + "epoch": 0.5700450826515278, + "grad_norm": 0.4979412299848651, + "learning_rate": 9.755462706056966e-06, + "loss": 0.2266, + "step": 1707 + }, + { + "epoch": 0.5703790282184004, + "grad_norm": 0.5441091075782964, + "learning_rate": 9.75486203937104e-06, + "loss": 0.2193, + "step": 1708 + }, + { + "epoch": 0.570712973785273, + "grad_norm": 0.5216793063707312, + "learning_rate": 9.754260654407214e-06, + "loss": 0.2231, + "step": 1709 + }, + { + "epoch": 0.5710469193521456, + "grad_norm": 0.5004552915915513, + "learning_rate": 9.753658551256338e-06, + "loss": 0.2399, + "step": 1710 + }, + { + "epoch": 0.5713808649190182, + "grad_norm": 0.4875294674870155, + "learning_rate": 9.753055730009364e-06, + "loss": 0.2258, + "step": 1711 + }, + { + "epoch": 0.5717148104858908, + "grad_norm": 0.6035412634336383, + "learning_rate": 9.752452190757358e-06, + "loss": 0.2284, + "step": 1712 + }, + { + "epoch": 0.5720487560527634, + "grad_norm": 0.4665053445860015, + "learning_rate": 9.751847933591489e-06, + "loss": 0.2112, + "step": 1713 + }, + { + "epoch": 0.572382701619636, + "grad_norm": 0.4892405263993013, + "learning_rate": 9.75124295860304e-06, + "loss": 0.228, + "step": 1714 + }, + { + "epoch": 0.5727166471865086, + "grad_norm": 0.47689612627770583, + "learning_rate": 9.750637265883395e-06, + "loss": 0.2217, + "step": 1715 + }, + { + "epoch": 0.5730505927533812, + "grad_norm": 0.5851060256324795, + "learning_rate": 9.750030855524058e-06, + "loss": 0.2308, + "step": 1716 + }, + { + "epoch": 0.5733845383202538, + "grad_norm": 0.4739718971007316, + "learning_rate": 9.749423727616628e-06, + "loss": 0.2276, + "step": 1717 + }, + { + "epoch": 0.5737184838871264, + "grad_norm": 0.5130343462285478, + "learning_rate": 9.748815882252823e-06, + "loss": 0.2379, + "step": 1718 + }, + { + "epoch": 0.574052429453999, + "grad_norm": 0.5085916890670312, + "learning_rate": 9.748207319524462e-06, + "loss": 0.2238, + "step": 1719 + }, + { + "epoch": 0.5743863750208716, + "grad_norm": 0.5028286185116214, + "learning_rate": 9.747598039523476e-06, + "loss": 0.2195, + "step": 1720 + }, + { + "epoch": 0.5747203205877442, + "grad_norm": 0.5672833426547657, + "learning_rate": 9.746988042341907e-06, + "loss": 0.2429, + "step": 1721 + }, + { + "epoch": 0.5750542661546167, + "grad_norm": 0.5485257563867748, + "learning_rate": 9.746377328071899e-06, + "loss": 0.2126, + "step": 1722 + }, + { + "epoch": 0.5753882117214894, + "grad_norm": 0.5472794541807945, + "learning_rate": 9.74576589680571e-06, + "loss": 0.2257, + "step": 1723 + }, + { + "epoch": 0.575722157288362, + "grad_norm": 0.6005489214038497, + "learning_rate": 9.745153748635702e-06, + "loss": 0.2297, + "step": 1724 + }, + { + "epoch": 0.5760561028552346, + "grad_norm": 0.5331745130852713, + "learning_rate": 9.744540883654348e-06, + "loss": 0.2299, + "step": 1725 + }, + { + "epoch": 0.5763900484221072, + "grad_norm": 0.5467918769216435, + "learning_rate": 9.743927301954229e-06, + "loss": 0.2434, + "step": 1726 + }, + { + "epoch": 0.5767239939889798, + "grad_norm": 0.47631200906324167, + "learning_rate": 9.743313003628033e-06, + "loss": 0.2294, + "step": 1727 + }, + { + "epoch": 0.5770579395558524, + "grad_norm": 1.028042272396889, + "learning_rate": 9.742697988768557e-06, + "loss": 0.2286, + "step": 1728 + }, + { + "epoch": 0.577391885122725, + "grad_norm": 0.5723656104484236, + "learning_rate": 9.742082257468705e-06, + "loss": 0.2328, + "step": 1729 + }, + { + "epoch": 0.5777258306895976, + "grad_norm": 0.5658357107734303, + "learning_rate": 9.741465809821493e-06, + "loss": 0.2206, + "step": 1730 + }, + { + "epoch": 0.5780597762564702, + "grad_norm": 0.5303347638147783, + "learning_rate": 9.74084864592004e-06, + "loss": 0.2382, + "step": 1731 + }, + { + "epoch": 0.5783937218233428, + "grad_norm": 0.5591531368062704, + "learning_rate": 9.74023076585758e-06, + "loss": 0.2216, + "step": 1732 + }, + { + "epoch": 0.5787276673902154, + "grad_norm": 0.5152285449211208, + "learning_rate": 9.739612169727446e-06, + "loss": 0.2016, + "step": 1733 + }, + { + "epoch": 0.579061612957088, + "grad_norm": 0.5240518968639969, + "learning_rate": 9.73899285762309e-06, + "loss": 0.236, + "step": 1734 + }, + { + "epoch": 0.5793955585239606, + "grad_norm": 0.5941452749632281, + "learning_rate": 9.738372829638058e-06, + "loss": 0.2208, + "step": 1735 + }, + { + "epoch": 0.5797295040908332, + "grad_norm": 0.5383582980347633, + "learning_rate": 9.73775208586602e-06, + "loss": 0.2231, + "step": 1736 + }, + { + "epoch": 0.5800634496577058, + "grad_norm": 0.5097873416009885, + "learning_rate": 9.737130626400745e-06, + "loss": 0.2214, + "step": 1737 + }, + { + "epoch": 0.5803973952245783, + "grad_norm": 0.6409001150404144, + "learning_rate": 9.736508451336111e-06, + "loss": 0.2565, + "step": 1738 + }, + { + "epoch": 0.580731340791451, + "grad_norm": 0.5171820969817694, + "learning_rate": 9.735885560766104e-06, + "loss": 0.2247, + "step": 1739 + }, + { + "epoch": 0.5810652863583236, + "grad_norm": 0.565262498850217, + "learning_rate": 9.73526195478482e-06, + "loss": 0.2525, + "step": 1740 + }, + { + "epoch": 0.5813992319251962, + "grad_norm": 0.5191663542448296, + "learning_rate": 9.73463763348646e-06, + "loss": 0.2269, + "step": 1741 + }, + { + "epoch": 0.5817331774920688, + "grad_norm": 0.5008213494455472, + "learning_rate": 9.734012596965341e-06, + "loss": 0.2292, + "step": 1742 + }, + { + "epoch": 0.5820671230589414, + "grad_norm": 0.4985166781819941, + "learning_rate": 9.733386845315875e-06, + "loss": 0.2251, + "step": 1743 + }, + { + "epoch": 0.582401068625814, + "grad_norm": 0.7068726658287037, + "learning_rate": 9.732760378632592e-06, + "loss": 0.2575, + "step": 1744 + }, + { + "epoch": 0.5827350141926866, + "grad_norm": 0.532950090400461, + "learning_rate": 9.73213319701013e-06, + "loss": 0.2203, + "step": 1745 + }, + { + "epoch": 0.5830689597595592, + "grad_norm": 0.4870910187814131, + "learning_rate": 9.731505300543228e-06, + "loss": 0.2235, + "step": 1746 + }, + { + "epoch": 0.5834029053264318, + "grad_norm": 0.5183483007891654, + "learning_rate": 9.730876689326739e-06, + "loss": 0.2391, + "step": 1747 + }, + { + "epoch": 0.5837368508933044, + "grad_norm": 0.5822886754220148, + "learning_rate": 9.730247363455621e-06, + "loss": 0.246, + "step": 1748 + }, + { + "epoch": 0.584070796460177, + "grad_norm": 0.5303604700191408, + "learning_rate": 9.729617323024943e-06, + "loss": 0.2161, + "step": 1749 + }, + { + "epoch": 0.5844047420270496, + "grad_norm": 0.5402294666419608, + "learning_rate": 9.728986568129876e-06, + "loss": 0.2165, + "step": 1750 + }, + { + "epoch": 0.5847386875939222, + "grad_norm": 0.5229291505268835, + "learning_rate": 9.72835509886571e-06, + "loss": 0.2487, + "step": 1751 + }, + { + "epoch": 0.5850726331607948, + "grad_norm": 1.005248107058838, + "learning_rate": 9.727722915327828e-06, + "loss": 0.2344, + "step": 1752 + }, + { + "epoch": 0.5854065787276674, + "grad_norm": 0.4984992426091078, + "learning_rate": 9.727090017611736e-06, + "loss": 0.196, + "step": 1753 + }, + { + "epoch": 0.58574052429454, + "grad_norm": 0.5333276983734322, + "learning_rate": 9.726456405813033e-06, + "loss": 0.2264, + "step": 1754 + }, + { + "epoch": 0.5860744698614125, + "grad_norm": 0.45641453158909184, + "learning_rate": 9.725822080027442e-06, + "loss": 0.2239, + "step": 1755 + }, + { + "epoch": 0.5864084154282851, + "grad_norm": 0.48841035707321817, + "learning_rate": 9.725187040350778e-06, + "loss": 0.222, + "step": 1756 + }, + { + "epoch": 0.5867423609951578, + "grad_norm": 0.4741064605363138, + "learning_rate": 9.724551286878976e-06, + "loss": 0.2108, + "step": 1757 + }, + { + "epoch": 0.5870763065620304, + "grad_norm": 0.502475937380326, + "learning_rate": 9.723914819708073e-06, + "loss": 0.2253, + "step": 1758 + }, + { + "epoch": 0.587410252128903, + "grad_norm": 0.5194244729847486, + "learning_rate": 9.723277638934212e-06, + "loss": 0.2341, + "step": 1759 + }, + { + "epoch": 0.5877441976957756, + "grad_norm": 0.4916149636167516, + "learning_rate": 9.72263974465365e-06, + "loss": 0.2306, + "step": 1760 + }, + { + "epoch": 0.5880781432626482, + "grad_norm": 0.5023458268143842, + "learning_rate": 9.722001136962746e-06, + "loss": 0.2227, + "step": 1761 + }, + { + "epoch": 0.5884120888295208, + "grad_norm": 0.5608383767110363, + "learning_rate": 9.721361815957973e-06, + "loss": 0.2491, + "step": 1762 + }, + { + "epoch": 0.5887460343963934, + "grad_norm": 0.5144301238589608, + "learning_rate": 9.720721781735905e-06, + "loss": 0.2222, + "step": 1763 + }, + { + "epoch": 0.589079979963266, + "grad_norm": 0.5018433935726713, + "learning_rate": 9.720081034393226e-06, + "loss": 0.2258, + "step": 1764 + }, + { + "epoch": 0.5894139255301386, + "grad_norm": 0.49790563062526416, + "learning_rate": 9.71943957402673e-06, + "loss": 0.2285, + "step": 1765 + }, + { + "epoch": 0.5897478710970112, + "grad_norm": 0.4748861509241753, + "learning_rate": 9.718797400733314e-06, + "loss": 0.218, + "step": 1766 + }, + { + "epoch": 0.5900818166638838, + "grad_norm": 0.5390774391581642, + "learning_rate": 9.718154514609992e-06, + "loss": 0.2286, + "step": 1767 + }, + { + "epoch": 0.5904157622307564, + "grad_norm": 0.5536961578875585, + "learning_rate": 9.717510915753876e-06, + "loss": 0.2402, + "step": 1768 + }, + { + "epoch": 0.590749707797629, + "grad_norm": 0.6695127415098429, + "learning_rate": 9.716866604262189e-06, + "loss": 0.2235, + "step": 1769 + }, + { + "epoch": 0.5910836533645016, + "grad_norm": 0.5069772545716459, + "learning_rate": 9.716221580232261e-06, + "loss": 0.2201, + "step": 1770 + }, + { + "epoch": 0.5914175989313741, + "grad_norm": 0.8037833620387766, + "learning_rate": 9.715575843761534e-06, + "loss": 0.231, + "step": 1771 + }, + { + "epoch": 0.5917515444982467, + "grad_norm": 0.5068062069274792, + "learning_rate": 9.714929394947548e-06, + "loss": 0.2211, + "step": 1772 + }, + { + "epoch": 0.5920854900651193, + "grad_norm": 0.5549550762490882, + "learning_rate": 9.714282233887962e-06, + "loss": 0.2393, + "step": 1773 + }, + { + "epoch": 0.592419435631992, + "grad_norm": 0.5395976419004379, + "learning_rate": 9.713634360680537e-06, + "loss": 0.2252, + "step": 1774 + }, + { + "epoch": 0.5927533811988646, + "grad_norm": 0.4892732346046282, + "learning_rate": 9.712985775423141e-06, + "loss": 0.2193, + "step": 1775 + }, + { + "epoch": 0.5930873267657372, + "grad_norm": 0.4711153870554155, + "learning_rate": 9.712336478213747e-06, + "loss": 0.2315, + "step": 1776 + }, + { + "epoch": 0.5934212723326098, + "grad_norm": 0.4762551984595192, + "learning_rate": 9.711686469150444e-06, + "loss": 0.2355, + "step": 1777 + }, + { + "epoch": 0.5937552178994824, + "grad_norm": 0.539286037756309, + "learning_rate": 9.711035748331421e-06, + "loss": 0.2305, + "step": 1778 + }, + { + "epoch": 0.594089163466355, + "grad_norm": 0.44306089667586035, + "learning_rate": 9.710384315854977e-06, + "loss": 0.2119, + "step": 1779 + }, + { + "epoch": 0.5944231090332276, + "grad_norm": 0.54924454544785, + "learning_rate": 9.70973217181952e-06, + "loss": 0.2452, + "step": 1780 + }, + { + "epoch": 0.5947570546001002, + "grad_norm": 0.6928470758099178, + "learning_rate": 9.709079316323564e-06, + "loss": 0.2344, + "step": 1781 + }, + { + "epoch": 0.5950910001669728, + "grad_norm": 0.482232968127417, + "learning_rate": 9.70842574946573e-06, + "loss": 0.2207, + "step": 1782 + }, + { + "epoch": 0.5954249457338454, + "grad_norm": 0.49136177006387877, + "learning_rate": 9.707771471344744e-06, + "loss": 0.2261, + "step": 1783 + }, + { + "epoch": 0.595758891300718, + "grad_norm": 0.46768352694125126, + "learning_rate": 9.707116482059447e-06, + "loss": 0.2336, + "step": 1784 + }, + { + "epoch": 0.5960928368675906, + "grad_norm": 0.5029578869306203, + "learning_rate": 9.70646078170878e-06, + "loss": 0.2299, + "step": 1785 + }, + { + "epoch": 0.5964267824344632, + "grad_norm": 0.6031061149189456, + "learning_rate": 9.705804370391794e-06, + "loss": 0.2338, + "step": 1786 + }, + { + "epoch": 0.5967607280013357, + "grad_norm": 0.5619990384851195, + "learning_rate": 9.705147248207652e-06, + "loss": 0.2458, + "step": 1787 + }, + { + "epoch": 0.5970946735682083, + "grad_norm": 0.481566798104976, + "learning_rate": 9.704489415255614e-06, + "loss": 0.2272, + "step": 1788 + }, + { + "epoch": 0.5974286191350809, + "grad_norm": 0.5094230596305139, + "learning_rate": 9.703830871635057e-06, + "loss": 0.2366, + "step": 1789 + }, + { + "epoch": 0.5977625647019535, + "grad_norm": 0.5032186806617618, + "learning_rate": 9.703171617445461e-06, + "loss": 0.2378, + "step": 1790 + }, + { + "epoch": 0.5980965102688262, + "grad_norm": 0.5174151813237029, + "learning_rate": 9.702511652786414e-06, + "loss": 0.2194, + "step": 1791 + }, + { + "epoch": 0.5984304558356988, + "grad_norm": 0.5280668787057868, + "learning_rate": 9.701850977757611e-06, + "loss": 0.2236, + "step": 1792 + }, + { + "epoch": 0.5987644014025714, + "grad_norm": 0.5381616037918245, + "learning_rate": 9.701189592458858e-06, + "loss": 0.2243, + "step": 1793 + }, + { + "epoch": 0.599098346969444, + "grad_norm": 0.5404391079985676, + "learning_rate": 9.70052749699006e-06, + "loss": 0.2281, + "step": 1794 + }, + { + "epoch": 0.5994322925363166, + "grad_norm": 0.5628113540045915, + "learning_rate": 9.699864691451236e-06, + "loss": 0.23, + "step": 1795 + }, + { + "epoch": 0.5997662381031892, + "grad_norm": 0.5329453587759123, + "learning_rate": 9.699201175942514e-06, + "loss": 0.215, + "step": 1796 + }, + { + "epoch": 0.6001001836700618, + "grad_norm": 0.4718836619269112, + "learning_rate": 9.698536950564121e-06, + "loss": 0.2123, + "step": 1797 + }, + { + "epoch": 0.6004341292369344, + "grad_norm": 0.5539291679398612, + "learning_rate": 9.6978720154164e-06, + "loss": 0.2108, + "step": 1798 + }, + { + "epoch": 0.600768074803807, + "grad_norm": 0.5144961691465181, + "learning_rate": 9.697206370599793e-06, + "loss": 0.2322, + "step": 1799 + }, + { + "epoch": 0.6011020203706796, + "grad_norm": 0.5731553026791534, + "learning_rate": 9.696540016214857e-06, + "loss": 0.2286, + "step": 1800 + }, + { + "epoch": 0.6014359659375522, + "grad_norm": 0.5188370628082019, + "learning_rate": 9.695872952362253e-06, + "loss": 0.2284, + "step": 1801 + }, + { + "epoch": 0.6017699115044248, + "grad_norm": 0.5671772412519566, + "learning_rate": 9.695205179142746e-06, + "loss": 0.2338, + "step": 1802 + }, + { + "epoch": 0.6021038570712974, + "grad_norm": 0.5086216193765247, + "learning_rate": 9.694536696657213e-06, + "loss": 0.2197, + "step": 1803 + }, + { + "epoch": 0.6024378026381699, + "grad_norm": 0.5238150472595622, + "learning_rate": 9.693867505006634e-06, + "loss": 0.2209, + "step": 1804 + }, + { + "epoch": 0.6027717482050425, + "grad_norm": 0.49537223803612296, + "learning_rate": 9.693197604292101e-06, + "loss": 0.2314, + "step": 1805 + }, + { + "epoch": 0.6031056937719151, + "grad_norm": 0.537065895743497, + "learning_rate": 9.69252699461481e-06, + "loss": 0.2411, + "step": 1806 + }, + { + "epoch": 0.6034396393387877, + "grad_norm": 0.5685097570680073, + "learning_rate": 9.691855676076064e-06, + "loss": 0.24, + "step": 1807 + }, + { + "epoch": 0.6037735849056604, + "grad_norm": 0.48659737235918643, + "learning_rate": 9.691183648777271e-06, + "loss": 0.217, + "step": 1808 + }, + { + "epoch": 0.604107530472533, + "grad_norm": 0.48629641393278755, + "learning_rate": 9.690510912819952e-06, + "loss": 0.2199, + "step": 1809 + }, + { + "epoch": 0.6044414760394056, + "grad_norm": 0.547728598895119, + "learning_rate": 9.689837468305732e-06, + "loss": 0.2299, + "step": 1810 + }, + { + "epoch": 0.6047754216062782, + "grad_norm": 0.5394439412865157, + "learning_rate": 9.689163315336339e-06, + "loss": 0.243, + "step": 1811 + }, + { + "epoch": 0.6051093671731508, + "grad_norm": 0.5103261884061013, + "learning_rate": 9.688488454013616e-06, + "loss": 0.2388, + "step": 1812 + }, + { + "epoch": 0.6054433127400234, + "grad_norm": 0.5099890466296649, + "learning_rate": 9.687812884439506e-06, + "loss": 0.2344, + "step": 1813 + }, + { + "epoch": 0.605777258306896, + "grad_norm": 0.46788777418989, + "learning_rate": 9.687136606716064e-06, + "loss": 0.2187, + "step": 1814 + }, + { + "epoch": 0.6061112038737686, + "grad_norm": 0.5249164624914014, + "learning_rate": 9.686459620945445e-06, + "loss": 0.2227, + "step": 1815 + }, + { + "epoch": 0.6064451494406412, + "grad_norm": 0.5331457391545483, + "learning_rate": 9.685781927229923e-06, + "loss": 0.2443, + "step": 1816 + }, + { + "epoch": 0.6067790950075138, + "grad_norm": 0.5886242608442575, + "learning_rate": 9.685103525671864e-06, + "loss": 0.2531, + "step": 1817 + }, + { + "epoch": 0.6071130405743864, + "grad_norm": 0.49860326082795825, + "learning_rate": 9.684424416373754e-06, + "loss": 0.2254, + "step": 1818 + }, + { + "epoch": 0.607446986141259, + "grad_norm": 0.5325971197360724, + "learning_rate": 9.683744599438178e-06, + "loss": 0.2294, + "step": 1819 + }, + { + "epoch": 0.6077809317081315, + "grad_norm": 0.5084348336824585, + "learning_rate": 9.683064074967832e-06, + "loss": 0.2375, + "step": 1820 + }, + { + "epoch": 0.6081148772750041, + "grad_norm": 0.46827119088454616, + "learning_rate": 9.682382843065516e-06, + "loss": 0.2146, + "step": 1821 + }, + { + "epoch": 0.6084488228418767, + "grad_norm": 0.5179404560268646, + "learning_rate": 9.681700903834137e-06, + "loss": 0.2324, + "step": 1822 + }, + { + "epoch": 0.6087827684087493, + "grad_norm": 0.5482754379272196, + "learning_rate": 9.681018257376713e-06, + "loss": 0.2266, + "step": 1823 + }, + { + "epoch": 0.609116713975622, + "grad_norm": 0.5402115399595885, + "learning_rate": 9.680334903796363e-06, + "loss": 0.2437, + "step": 1824 + }, + { + "epoch": 0.6094506595424946, + "grad_norm": 0.485938659979222, + "learning_rate": 9.679650843196318e-06, + "loss": 0.2379, + "step": 1825 + }, + { + "epoch": 0.6097846051093672, + "grad_norm": 0.47740857211098514, + "learning_rate": 9.678966075679909e-06, + "loss": 0.2294, + "step": 1826 + }, + { + "epoch": 0.6101185506762398, + "grad_norm": 0.5105546022851197, + "learning_rate": 9.678280601350584e-06, + "loss": 0.2342, + "step": 1827 + }, + { + "epoch": 0.6104524962431124, + "grad_norm": 0.5084617379607869, + "learning_rate": 9.67759442031189e-06, + "loss": 0.2333, + "step": 1828 + }, + { + "epoch": 0.610786441809985, + "grad_norm": 0.4877401739921855, + "learning_rate": 9.676907532667478e-06, + "loss": 0.2286, + "step": 1829 + }, + { + "epoch": 0.6111203873768576, + "grad_norm": 0.48925434897179987, + "learning_rate": 9.676219938521116e-06, + "loss": 0.2182, + "step": 1830 + }, + { + "epoch": 0.6114543329437302, + "grad_norm": 0.4795913747536622, + "learning_rate": 9.675531637976673e-06, + "loss": 0.2183, + "step": 1831 + }, + { + "epoch": 0.6117882785106028, + "grad_norm": 0.49260466676010023, + "learning_rate": 9.674842631138121e-06, + "loss": 0.2165, + "step": 1832 + }, + { + "epoch": 0.6121222240774754, + "grad_norm": 0.48704610155668626, + "learning_rate": 9.674152918109547e-06, + "loss": 0.2306, + "step": 1833 + }, + { + "epoch": 0.612456169644348, + "grad_norm": 0.5738129836410425, + "learning_rate": 9.673462498995138e-06, + "loss": 0.225, + "step": 1834 + }, + { + "epoch": 0.6127901152112206, + "grad_norm": 0.5935737806472424, + "learning_rate": 9.672771373899192e-06, + "loss": 0.2263, + "step": 1835 + }, + { + "epoch": 0.6131240607780931, + "grad_norm": 0.4805492485102887, + "learning_rate": 9.672079542926108e-06, + "loss": 0.2326, + "step": 1836 + }, + { + "epoch": 0.6134580063449657, + "grad_norm": 0.5717231219637876, + "learning_rate": 9.671387006180398e-06, + "loss": 0.2354, + "step": 1837 + }, + { + "epoch": 0.6137919519118383, + "grad_norm": 0.5224215947069372, + "learning_rate": 9.670693763766674e-06, + "loss": 0.2151, + "step": 1838 + }, + { + "epoch": 0.6141258974787109, + "grad_norm": 0.526357591657413, + "learning_rate": 9.669999815789664e-06, + "loss": 0.233, + "step": 1839 + }, + { + "epoch": 0.6144598430455835, + "grad_norm": 0.453213851588365, + "learning_rate": 9.669305162354194e-06, + "loss": 0.2039, + "step": 1840 + }, + { + "epoch": 0.6147937886124561, + "grad_norm": 0.5265771089532885, + "learning_rate": 9.6686098035652e-06, + "loss": 0.226, + "step": 1841 + }, + { + "epoch": 0.6151277341793288, + "grad_norm": 0.4822531246116318, + "learning_rate": 9.667913739527724e-06, + "loss": 0.2266, + "step": 1842 + }, + { + "epoch": 0.6154616797462014, + "grad_norm": 0.5092579744932829, + "learning_rate": 9.667216970346916e-06, + "loss": 0.2227, + "step": 1843 + }, + { + "epoch": 0.615795625313074, + "grad_norm": 0.5705960019212173, + "learning_rate": 9.666519496128027e-06, + "loss": 0.2179, + "step": 1844 + }, + { + "epoch": 0.6161295708799466, + "grad_norm": 0.5870450982878944, + "learning_rate": 9.665821316976423e-06, + "loss": 0.2309, + "step": 1845 + }, + { + "epoch": 0.6164635164468192, + "grad_norm": 0.48323049309301813, + "learning_rate": 9.665122432997571e-06, + "loss": 0.2228, + "step": 1846 + }, + { + "epoch": 0.6167974620136918, + "grad_norm": 0.43522897761120205, + "learning_rate": 9.664422844297045e-06, + "loss": 0.2168, + "step": 1847 + }, + { + "epoch": 0.6171314075805644, + "grad_norm": 0.4797401271036264, + "learning_rate": 9.663722550980528e-06, + "loss": 0.2166, + "step": 1848 + }, + { + "epoch": 0.617465353147437, + "grad_norm": 0.5404918091345311, + "learning_rate": 9.663021553153805e-06, + "loss": 0.2155, + "step": 1849 + }, + { + "epoch": 0.6177992987143096, + "grad_norm": 0.5019775079244504, + "learning_rate": 9.66231985092277e-06, + "loss": 0.205, + "step": 1850 + }, + { + "epoch": 0.6181332442811822, + "grad_norm": 0.5126171358341591, + "learning_rate": 9.661617444393427e-06, + "loss": 0.2342, + "step": 1851 + }, + { + "epoch": 0.6184671898480548, + "grad_norm": 0.5298697261126594, + "learning_rate": 9.660914333671878e-06, + "loss": 0.2189, + "step": 1852 + }, + { + "epoch": 0.6188011354149273, + "grad_norm": 0.6977397151332118, + "learning_rate": 9.66021051886434e-06, + "loss": 0.242, + "step": 1853 + }, + { + "epoch": 0.6191350809817999, + "grad_norm": 0.4420208100094779, + "learning_rate": 9.65950600007713e-06, + "loss": 0.1995, + "step": 1854 + }, + { + "epoch": 0.6194690265486725, + "grad_norm": 0.49759113853149267, + "learning_rate": 9.658800777416676e-06, + "loss": 0.2142, + "step": 1855 + }, + { + "epoch": 0.6198029721155451, + "grad_norm": 0.4644683178989199, + "learning_rate": 9.658094850989508e-06, + "loss": 0.206, + "step": 1856 + }, + { + "epoch": 0.6201369176824177, + "grad_norm": 0.7389862240060607, + "learning_rate": 9.657388220902265e-06, + "loss": 0.2231, + "step": 1857 + }, + { + "epoch": 0.6204708632492903, + "grad_norm": 0.4606947820212059, + "learning_rate": 9.656680887261693e-06, + "loss": 0.2115, + "step": 1858 + }, + { + "epoch": 0.620804808816163, + "grad_norm": 0.5020543120959755, + "learning_rate": 9.655972850174642e-06, + "loss": 0.2274, + "step": 1859 + }, + { + "epoch": 0.6211387543830356, + "grad_norm": 0.44475121899664427, + "learning_rate": 9.65526410974807e-06, + "loss": 0.2113, + "step": 1860 + }, + { + "epoch": 0.6214726999499082, + "grad_norm": 0.47941219176464955, + "learning_rate": 9.65455466608904e-06, + "loss": 0.2232, + "step": 1861 + }, + { + "epoch": 0.6218066455167808, + "grad_norm": 0.4662479753839037, + "learning_rate": 9.653844519304722e-06, + "loss": 0.2238, + "step": 1862 + }, + { + "epoch": 0.6221405910836534, + "grad_norm": 0.5892373131541172, + "learning_rate": 9.653133669502393e-06, + "loss": 0.2307, + "step": 1863 + }, + { + "epoch": 0.622474536650526, + "grad_norm": 0.46292878818156435, + "learning_rate": 9.652422116789432e-06, + "loss": 0.221, + "step": 1864 + }, + { + "epoch": 0.6228084822173986, + "grad_norm": 0.5683790013976829, + "learning_rate": 9.651709861273334e-06, + "loss": 0.2267, + "step": 1865 + }, + { + "epoch": 0.6231424277842712, + "grad_norm": 0.4718926218027168, + "learning_rate": 9.650996903061685e-06, + "loss": 0.2138, + "step": 1866 + }, + { + "epoch": 0.6234763733511438, + "grad_norm": 0.496502433614263, + "learning_rate": 9.650283242262192e-06, + "loss": 0.2246, + "step": 1867 + }, + { + "epoch": 0.6238103189180164, + "grad_norm": 0.4876162037690022, + "learning_rate": 9.64956887898266e-06, + "loss": 0.2303, + "step": 1868 + }, + { + "epoch": 0.6241442644848889, + "grad_norm": 0.5145615842272623, + "learning_rate": 9.648853813331e-06, + "loss": 0.2278, + "step": 1869 + }, + { + "epoch": 0.6244782100517615, + "grad_norm": 0.527130326304707, + "learning_rate": 9.648138045415236e-06, + "loss": 0.2301, + "step": 1870 + }, + { + "epoch": 0.6248121556186341, + "grad_norm": 0.49041057430030827, + "learning_rate": 9.647421575343488e-06, + "loss": 0.2271, + "step": 1871 + }, + { + "epoch": 0.6251461011855067, + "grad_norm": 0.4861327887722697, + "learning_rate": 9.646704403223991e-06, + "loss": 0.2153, + "step": 1872 + }, + { + "epoch": 0.6254800467523793, + "grad_norm": 0.4859486191134469, + "learning_rate": 9.64598652916508e-06, + "loss": 0.2159, + "step": 1873 + }, + { + "epoch": 0.6258139923192519, + "grad_norm": 0.46779863060566784, + "learning_rate": 9.6452679532752e-06, + "loss": 0.2145, + "step": 1874 + }, + { + "epoch": 0.6261479378861246, + "grad_norm": 0.5215440387248278, + "learning_rate": 9.644548675662897e-06, + "loss": 0.2309, + "step": 1875 + }, + { + "epoch": 0.6264818834529972, + "grad_norm": 0.5113295193583, + "learning_rate": 9.64382869643683e-06, + "loss": 0.2393, + "step": 1876 + }, + { + "epoch": 0.6268158290198698, + "grad_norm": 0.6629216513399953, + "learning_rate": 9.64310801570576e-06, + "loss": 0.243, + "step": 1877 + }, + { + "epoch": 0.6271497745867424, + "grad_norm": 0.5805733643515224, + "learning_rate": 9.642386633578553e-06, + "loss": 0.236, + "step": 1878 + }, + { + "epoch": 0.627483720153615, + "grad_norm": 0.5211638075348191, + "learning_rate": 9.641664550164182e-06, + "loss": 0.2262, + "step": 1879 + }, + { + "epoch": 0.6278176657204876, + "grad_norm": 0.44593296545663935, + "learning_rate": 9.640941765571727e-06, + "loss": 0.231, + "step": 1880 + }, + { + "epoch": 0.6281516112873602, + "grad_norm": 0.47615138615510033, + "learning_rate": 9.640218279910374e-06, + "loss": 0.2146, + "step": 1881 + }, + { + "epoch": 0.6284855568542328, + "grad_norm": 0.5022339248493972, + "learning_rate": 9.639494093289412e-06, + "loss": 0.2291, + "step": 1882 + }, + { + "epoch": 0.6288195024211054, + "grad_norm": 0.46430936600624695, + "learning_rate": 9.638769205818239e-06, + "loss": 0.2071, + "step": 1883 + }, + { + "epoch": 0.629153447987978, + "grad_norm": 0.5226803610865401, + "learning_rate": 9.638043617606358e-06, + "loss": 0.2414, + "step": 1884 + }, + { + "epoch": 0.6294873935548505, + "grad_norm": 0.48540480199653707, + "learning_rate": 9.637317328763378e-06, + "loss": 0.2233, + "step": 1885 + }, + { + "epoch": 0.6298213391217231, + "grad_norm": 0.44530932798385103, + "learning_rate": 9.636590339399012e-06, + "loss": 0.2044, + "step": 1886 + }, + { + "epoch": 0.6301552846885957, + "grad_norm": 0.46347505096860814, + "learning_rate": 9.63586264962308e-06, + "loss": 0.226, + "step": 1887 + }, + { + "epoch": 0.6304892302554683, + "grad_norm": 0.5058077241719129, + "learning_rate": 9.635134259545511e-06, + "loss": 0.2334, + "step": 1888 + }, + { + "epoch": 0.6308231758223409, + "grad_norm": 0.5197637248708888, + "learning_rate": 9.634405169276335e-06, + "loss": 0.2325, + "step": 1889 + }, + { + "epoch": 0.6311571213892135, + "grad_norm": 0.5179888030499751, + "learning_rate": 9.63367537892569e-06, + "loss": 0.2226, + "step": 1890 + }, + { + "epoch": 0.6314910669560861, + "grad_norm": 0.6237768819180914, + "learning_rate": 9.63294488860382e-06, + "loss": 0.2429, + "step": 1891 + }, + { + "epoch": 0.6318250125229588, + "grad_norm": 0.4783353974434491, + "learning_rate": 9.63221369842107e-06, + "loss": 0.2161, + "step": 1892 + }, + { + "epoch": 0.6321589580898314, + "grad_norm": 0.4625966356502894, + "learning_rate": 9.631481808487902e-06, + "loss": 0.219, + "step": 1893 + }, + { + "epoch": 0.632492903656704, + "grad_norm": 0.5729936471207139, + "learning_rate": 9.63074921891487e-06, + "loss": 0.2275, + "step": 1894 + }, + { + "epoch": 0.6328268492235766, + "grad_norm": 0.4584520008958071, + "learning_rate": 9.630015929812646e-06, + "loss": 0.2271, + "step": 1895 + }, + { + "epoch": 0.6331607947904492, + "grad_norm": 0.5542075963679276, + "learning_rate": 9.629281941291998e-06, + "loss": 0.2412, + "step": 1896 + }, + { + "epoch": 0.6334947403573218, + "grad_norm": 0.47050880416056196, + "learning_rate": 9.628547253463804e-06, + "loss": 0.2254, + "step": 1897 + }, + { + "epoch": 0.6338286859241944, + "grad_norm": 0.5287224363148264, + "learning_rate": 9.627811866439048e-06, + "loss": 0.226, + "step": 1898 + }, + { + "epoch": 0.634162631491067, + "grad_norm": 0.5349630430590014, + "learning_rate": 9.627075780328818e-06, + "loss": 0.2457, + "step": 1899 + }, + { + "epoch": 0.6344965770579396, + "grad_norm": 0.48848827404295203, + "learning_rate": 9.626338995244313e-06, + "loss": 0.2239, + "step": 1900 + }, + { + "epoch": 0.6348305226248122, + "grad_norm": 0.5392885299832741, + "learning_rate": 9.625601511296826e-06, + "loss": 0.2292, + "step": 1901 + }, + { + "epoch": 0.6351644681916847, + "grad_norm": 0.51626967141371, + "learning_rate": 9.624863328597767e-06, + "loss": 0.2411, + "step": 1902 + }, + { + "epoch": 0.6354984137585573, + "grad_norm": 0.48472493185557913, + "learning_rate": 9.624124447258647e-06, + "loss": 0.2177, + "step": 1903 + }, + { + "epoch": 0.6358323593254299, + "grad_norm": 0.6068590044334997, + "learning_rate": 9.62338486739108e-06, + "loss": 0.2214, + "step": 1904 + }, + { + "epoch": 0.6361663048923025, + "grad_norm": 0.5569455920656661, + "learning_rate": 9.62264458910679e-06, + "loss": 0.2227, + "step": 1905 + }, + { + "epoch": 0.6365002504591751, + "grad_norm": 0.4943658711020671, + "learning_rate": 9.621903612517608e-06, + "loss": 0.22, + "step": 1906 + }, + { + "epoch": 0.6368341960260477, + "grad_norm": 0.6738439864635068, + "learning_rate": 9.621161937735463e-06, + "loss": 0.2215, + "step": 1907 + }, + { + "epoch": 0.6371681415929203, + "grad_norm": 0.5125207288216446, + "learning_rate": 9.620419564872394e-06, + "loss": 0.2311, + "step": 1908 + }, + { + "epoch": 0.637502087159793, + "grad_norm": 0.5477693989903016, + "learning_rate": 9.619676494040547e-06, + "loss": 0.2447, + "step": 1909 + }, + { + "epoch": 0.6378360327266656, + "grad_norm": 0.618827790959411, + "learning_rate": 9.61893272535217e-06, + "loss": 0.2269, + "step": 1910 + }, + { + "epoch": 0.6381699782935382, + "grad_norm": 0.5831020385440705, + "learning_rate": 9.618188258919618e-06, + "loss": 0.2504, + "step": 1911 + }, + { + "epoch": 0.6385039238604108, + "grad_norm": 0.4575021594118544, + "learning_rate": 9.617443094855354e-06, + "loss": 0.2212, + "step": 1912 + }, + { + "epoch": 0.6388378694272834, + "grad_norm": 0.474232930993678, + "learning_rate": 9.61669723327194e-06, + "loss": 0.2352, + "step": 1913 + }, + { + "epoch": 0.639171814994156, + "grad_norm": 0.5374800460315998, + "learning_rate": 9.615950674282049e-06, + "loss": 0.2396, + "step": 1914 + }, + { + "epoch": 0.6395057605610286, + "grad_norm": 0.5287587226953798, + "learning_rate": 9.61520341799846e-06, + "loss": 0.2302, + "step": 1915 + }, + { + "epoch": 0.6398397061279012, + "grad_norm": 0.5237854216155173, + "learning_rate": 9.614455464534049e-06, + "loss": 0.2453, + "step": 1916 + }, + { + "epoch": 0.6401736516947738, + "grad_norm": 0.44508260080269146, + "learning_rate": 9.613706814001809e-06, + "loss": 0.2066, + "step": 1917 + }, + { + "epoch": 0.6405075972616463, + "grad_norm": 0.5211596090749453, + "learning_rate": 9.612957466514829e-06, + "loss": 0.2279, + "step": 1918 + }, + { + "epoch": 0.6408415428285189, + "grad_norm": 0.4862656406723413, + "learning_rate": 9.61220742218631e-06, + "loss": 0.2304, + "step": 1919 + }, + { + "epoch": 0.6411754883953915, + "grad_norm": 0.4472518622732128, + "learning_rate": 9.61145668112955e-06, + "loss": 0.2296, + "step": 1920 + }, + { + "epoch": 0.6415094339622641, + "grad_norm": 0.5348057312329286, + "learning_rate": 9.610705243457962e-06, + "loss": 0.2421, + "step": 1921 + }, + { + "epoch": 0.6418433795291367, + "grad_norm": 0.539916851096655, + "learning_rate": 9.609953109285057e-06, + "loss": 0.2225, + "step": 1922 + }, + { + "epoch": 0.6421773250960093, + "grad_norm": 0.486698830537002, + "learning_rate": 9.609200278724456e-06, + "loss": 0.2303, + "step": 1923 + }, + { + "epoch": 0.6425112706628819, + "grad_norm": 0.4853767234392697, + "learning_rate": 9.60844675188988e-06, + "loss": 0.2358, + "step": 1924 + }, + { + "epoch": 0.6428452162297545, + "grad_norm": 0.5783948282032476, + "learning_rate": 9.60769252889516e-06, + "loss": 0.2549, + "step": 1925 + }, + { + "epoch": 0.6431791617966272, + "grad_norm": 0.4677106105333194, + "learning_rate": 9.606937609854227e-06, + "loss": 0.2311, + "step": 1926 + }, + { + "epoch": 0.6435131073634998, + "grad_norm": 0.7242966644398318, + "learning_rate": 9.606181994881124e-06, + "loss": 0.2273, + "step": 1927 + }, + { + "epoch": 0.6438470529303724, + "grad_norm": 0.5186721278224474, + "learning_rate": 9.605425684089998e-06, + "loss": 0.246, + "step": 1928 + }, + { + "epoch": 0.644180998497245, + "grad_norm": 0.6545504763262779, + "learning_rate": 9.604668677595093e-06, + "loss": 0.2339, + "step": 1929 + }, + { + "epoch": 0.6445149440641176, + "grad_norm": 0.4496430333660451, + "learning_rate": 9.603910975510764e-06, + "loss": 0.2205, + "step": 1930 + }, + { + "epoch": 0.6448488896309902, + "grad_norm": 0.5824478060538069, + "learning_rate": 9.603152577951476e-06, + "loss": 0.2292, + "step": 1931 + }, + { + "epoch": 0.6451828351978628, + "grad_norm": 0.4683634543655863, + "learning_rate": 9.60239348503179e-06, + "loss": 0.2103, + "step": 1932 + }, + { + "epoch": 0.6455167807647354, + "grad_norm": 0.4654095627434126, + "learning_rate": 9.601633696866376e-06, + "loss": 0.2247, + "step": 1933 + }, + { + "epoch": 0.6458507263316079, + "grad_norm": 0.503168780861353, + "learning_rate": 9.60087321357001e-06, + "loss": 0.2255, + "step": 1934 + }, + { + "epoch": 0.6461846718984805, + "grad_norm": 0.5396694281989363, + "learning_rate": 9.600112035257571e-06, + "loss": 0.2222, + "step": 1935 + }, + { + "epoch": 0.6465186174653531, + "grad_norm": 0.4407844703562947, + "learning_rate": 9.599350162044045e-06, + "loss": 0.2215, + "step": 1936 + }, + { + "epoch": 0.6468525630322257, + "grad_norm": 0.4504311612267907, + "learning_rate": 9.598587594044522e-06, + "loss": 0.2241, + "step": 1937 + }, + { + "epoch": 0.6471865085990983, + "grad_norm": 0.5078973609706072, + "learning_rate": 9.597824331374196e-06, + "loss": 0.2147, + "step": 1938 + }, + { + "epoch": 0.6475204541659709, + "grad_norm": 0.5105786340988594, + "learning_rate": 9.597060374148365e-06, + "loss": 0.222, + "step": 1939 + }, + { + "epoch": 0.6478543997328435, + "grad_norm": 0.4459757628273001, + "learning_rate": 9.596295722482439e-06, + "loss": 0.2175, + "step": 1940 + }, + { + "epoch": 0.6481883452997161, + "grad_norm": 0.5146618161191947, + "learning_rate": 9.595530376491924e-06, + "loss": 0.2367, + "step": 1941 + }, + { + "epoch": 0.6485222908665887, + "grad_norm": 0.4924035886938882, + "learning_rate": 9.594764336292432e-06, + "loss": 0.227, + "step": 1942 + }, + { + "epoch": 0.6488562364334614, + "grad_norm": 0.4836297523018866, + "learning_rate": 9.593997601999689e-06, + "loss": 0.2184, + "step": 1943 + }, + { + "epoch": 0.649190182000334, + "grad_norm": 0.4954472502507133, + "learning_rate": 9.593230173729514e-06, + "loss": 0.2289, + "step": 1944 + }, + { + "epoch": 0.6495241275672066, + "grad_norm": 0.46892208039162586, + "learning_rate": 9.592462051597838e-06, + "loss": 0.2333, + "step": 1945 + }, + { + "epoch": 0.6498580731340792, + "grad_norm": 0.4707733396458665, + "learning_rate": 9.591693235720695e-06, + "loss": 0.2179, + "step": 1946 + }, + { + "epoch": 0.6501920187009518, + "grad_norm": 0.5021563108304271, + "learning_rate": 9.590923726214224e-06, + "loss": 0.2312, + "step": 1947 + }, + { + "epoch": 0.6505259642678244, + "grad_norm": 0.5216409946552976, + "learning_rate": 9.590153523194665e-06, + "loss": 0.236, + "step": 1948 + }, + { + "epoch": 0.650859909834697, + "grad_norm": 0.445367026551766, + "learning_rate": 9.589382626778371e-06, + "loss": 0.2039, + "step": 1949 + }, + { + "epoch": 0.6511938554015696, + "grad_norm": 0.4719313967503157, + "learning_rate": 9.588611037081793e-06, + "loss": 0.2291, + "step": 1950 + }, + { + "epoch": 0.6515278009684421, + "grad_norm": 0.5041618490835817, + "learning_rate": 9.587838754221488e-06, + "loss": 0.2186, + "step": 1951 + }, + { + "epoch": 0.6518617465353147, + "grad_norm": 0.5656846698351666, + "learning_rate": 9.587065778314119e-06, + "loss": 0.2292, + "step": 1952 + }, + { + "epoch": 0.6521956921021873, + "grad_norm": 0.48863066421780066, + "learning_rate": 9.586292109476454e-06, + "loss": 0.2107, + "step": 1953 + }, + { + "epoch": 0.6525296376690599, + "grad_norm": 0.7874907012437945, + "learning_rate": 9.585517747825363e-06, + "loss": 0.2378, + "step": 1954 + }, + { + "epoch": 0.6528635832359325, + "grad_norm": 0.5095487818003742, + "learning_rate": 9.584742693477825e-06, + "loss": 0.2279, + "step": 1955 + }, + { + "epoch": 0.6531975288028051, + "grad_norm": 0.5643641986886433, + "learning_rate": 9.58396694655092e-06, + "loss": 0.2581, + "step": 1956 + }, + { + "epoch": 0.6535314743696777, + "grad_norm": 0.4898371660965593, + "learning_rate": 9.583190507161832e-06, + "loss": 0.2203, + "step": 1957 + }, + { + "epoch": 0.6538654199365503, + "grad_norm": 0.5921772486152574, + "learning_rate": 9.582413375427852e-06, + "loss": 0.2268, + "step": 1958 + }, + { + "epoch": 0.654199365503423, + "grad_norm": 0.45116730167892133, + "learning_rate": 9.581635551466376e-06, + "loss": 0.2195, + "step": 1959 + }, + { + "epoch": 0.6545333110702956, + "grad_norm": 0.47843623987223366, + "learning_rate": 9.580857035394904e-06, + "loss": 0.225, + "step": 1960 + }, + { + "epoch": 0.6548672566371682, + "grad_norm": 0.5197754147716475, + "learning_rate": 9.580077827331038e-06, + "loss": 0.2368, + "step": 1961 + }, + { + "epoch": 0.6552012022040408, + "grad_norm": 0.4170784963842771, + "learning_rate": 9.579297927392488e-06, + "loss": 0.2065, + "step": 1962 + }, + { + "epoch": 0.6555351477709134, + "grad_norm": 0.5337746138339877, + "learning_rate": 9.578517335697065e-06, + "loss": 0.2313, + "step": 1963 + }, + { + "epoch": 0.655869093337786, + "grad_norm": 0.49279896240854576, + "learning_rate": 9.577736052362689e-06, + "loss": 0.2268, + "step": 1964 + }, + { + "epoch": 0.6562030389046586, + "grad_norm": 0.4302932121647986, + "learning_rate": 9.576954077507381e-06, + "loss": 0.2049, + "step": 1965 + }, + { + "epoch": 0.6565369844715312, + "grad_norm": 0.5127555964144664, + "learning_rate": 9.576171411249269e-06, + "loss": 0.23, + "step": 1966 + }, + { + "epoch": 0.6568709300384037, + "grad_norm": 0.5080459538592854, + "learning_rate": 9.575388053706582e-06, + "loss": 0.2317, + "step": 1967 + }, + { + "epoch": 0.6572048756052763, + "grad_norm": 0.4745192813209085, + "learning_rate": 9.574604004997654e-06, + "loss": 0.2154, + "step": 1968 + }, + { + "epoch": 0.6575388211721489, + "grad_norm": 0.5389965358570025, + "learning_rate": 9.57381926524093e-06, + "loss": 0.2433, + "step": 1969 + }, + { + "epoch": 0.6578727667390215, + "grad_norm": 0.4956040255633844, + "learning_rate": 9.57303383455495e-06, + "loss": 0.2234, + "step": 1970 + }, + { + "epoch": 0.6582067123058941, + "grad_norm": 0.48821226341668883, + "learning_rate": 9.572247713058362e-06, + "loss": 0.2225, + "step": 1971 + }, + { + "epoch": 0.6585406578727667, + "grad_norm": 0.48960509199150504, + "learning_rate": 9.571460900869923e-06, + "loss": 0.2243, + "step": 1972 + }, + { + "epoch": 0.6588746034396393, + "grad_norm": 0.5388856840457162, + "learning_rate": 9.570673398108485e-06, + "loss": 0.2191, + "step": 1973 + }, + { + "epoch": 0.6592085490065119, + "grad_norm": 0.5247531776272966, + "learning_rate": 9.569885204893015e-06, + "loss": 0.2438, + "step": 1974 + }, + { + "epoch": 0.6595424945733845, + "grad_norm": 0.4636309459992003, + "learning_rate": 9.569096321342574e-06, + "loss": 0.2205, + "step": 1975 + }, + { + "epoch": 0.6598764401402571, + "grad_norm": 0.5647518684704627, + "learning_rate": 9.568306747576335e-06, + "loss": 0.2424, + "step": 1976 + }, + { + "epoch": 0.6602103857071298, + "grad_norm": 0.4632908967911912, + "learning_rate": 9.567516483713572e-06, + "loss": 0.2183, + "step": 1977 + }, + { + "epoch": 0.6605443312740024, + "grad_norm": 0.4806235309010455, + "learning_rate": 9.566725529873664e-06, + "loss": 0.2129, + "step": 1978 + }, + { + "epoch": 0.660878276840875, + "grad_norm": 0.47812898299179163, + "learning_rate": 9.565933886176093e-06, + "loss": 0.2226, + "step": 1979 + }, + { + "epoch": 0.6612122224077476, + "grad_norm": 0.4790050168212765, + "learning_rate": 9.565141552740445e-06, + "loss": 0.2127, + "step": 1980 + }, + { + "epoch": 0.6615461679746202, + "grad_norm": 0.47301797232701465, + "learning_rate": 9.564348529686413e-06, + "loss": 0.2276, + "step": 1981 + }, + { + "epoch": 0.6618801135414928, + "grad_norm": 0.47800269002423895, + "learning_rate": 9.563554817133794e-06, + "loss": 0.2276, + "step": 1982 + }, + { + "epoch": 0.6622140591083653, + "grad_norm": 0.47697326231641335, + "learning_rate": 9.562760415202483e-06, + "loss": 0.2075, + "step": 1983 + }, + { + "epoch": 0.6625480046752379, + "grad_norm": 0.45114047498178445, + "learning_rate": 9.56196532401249e-06, + "loss": 0.1993, + "step": 1984 + }, + { + "epoch": 0.6628819502421105, + "grad_norm": 0.6767131550163791, + "learning_rate": 9.561169543683917e-06, + "loss": 0.2323, + "step": 1985 + }, + { + "epoch": 0.6632158958089831, + "grad_norm": 0.46779536182015385, + "learning_rate": 9.560373074336977e-06, + "loss": 0.2219, + "step": 1986 + }, + { + "epoch": 0.6635498413758557, + "grad_norm": 0.5689668029517431, + "learning_rate": 9.55957591609199e-06, + "loss": 0.2294, + "step": 1987 + }, + { + "epoch": 0.6638837869427283, + "grad_norm": 0.5473530266696686, + "learning_rate": 9.558778069069373e-06, + "loss": 0.2187, + "step": 1988 + }, + { + "epoch": 0.6642177325096009, + "grad_norm": 0.49049488313910705, + "learning_rate": 9.55797953338965e-06, + "loss": 0.2159, + "step": 1989 + }, + { + "epoch": 0.6645516780764735, + "grad_norm": 0.6069581055887437, + "learning_rate": 9.55718030917345e-06, + "loss": 0.2333, + "step": 1990 + }, + { + "epoch": 0.6648856236433461, + "grad_norm": 0.5169479730953396, + "learning_rate": 9.556380396541507e-06, + "loss": 0.2312, + "step": 1991 + }, + { + "epoch": 0.6652195692102187, + "grad_norm": 0.5700537116713167, + "learning_rate": 9.555579795614654e-06, + "loss": 0.247, + "step": 1992 + }, + { + "epoch": 0.6655535147770913, + "grad_norm": 0.6320463038320528, + "learning_rate": 9.554778506513834e-06, + "loss": 0.2578, + "step": 1993 + }, + { + "epoch": 0.665887460343964, + "grad_norm": 0.6242767829904925, + "learning_rate": 9.553976529360087e-06, + "loss": 0.2453, + "step": 1994 + }, + { + "epoch": 0.6662214059108366, + "grad_norm": 0.4657124319169294, + "learning_rate": 9.553173864274567e-06, + "loss": 0.2049, + "step": 1995 + }, + { + "epoch": 0.6665553514777092, + "grad_norm": 0.6911587103761502, + "learning_rate": 9.552370511378522e-06, + "loss": 0.2255, + "step": 1996 + }, + { + "epoch": 0.6668892970445818, + "grad_norm": 0.5680114058138519, + "learning_rate": 9.551566470793308e-06, + "loss": 0.2209, + "step": 1997 + }, + { + "epoch": 0.6672232426114544, + "grad_norm": 0.4878976742541729, + "learning_rate": 9.550761742640387e-06, + "loss": 0.2254, + "step": 1998 + }, + { + "epoch": 0.667557188178327, + "grad_norm": 0.5997891836824637, + "learning_rate": 9.549956327041318e-06, + "loss": 0.2323, + "step": 1999 + }, + { + "epoch": 0.6678911337451995, + "grad_norm": 0.4561949621578282, + "learning_rate": 9.549150224117776e-06, + "loss": 0.2078, + "step": 2000 + }, + { + "epoch": 0.6682250793120721, + "grad_norm": 0.4842209833350863, + "learning_rate": 9.548343433991524e-06, + "loss": 0.2096, + "step": 2001 + }, + { + "epoch": 0.6685590248789447, + "grad_norm": 0.5321439301519651, + "learning_rate": 9.547535956784445e-06, + "loss": 0.2498, + "step": 2002 + }, + { + "epoch": 0.6688929704458173, + "grad_norm": 0.5345226511901285, + "learning_rate": 9.546727792618512e-06, + "loss": 0.2149, + "step": 2003 + }, + { + "epoch": 0.6692269160126899, + "grad_norm": 0.5948754984549891, + "learning_rate": 9.545918941615811e-06, + "loss": 0.2268, + "step": 2004 + }, + { + "epoch": 0.6695608615795625, + "grad_norm": 0.5249741131297062, + "learning_rate": 9.545109403898527e-06, + "loss": 0.2332, + "step": 2005 + }, + { + "epoch": 0.6698948071464351, + "grad_norm": 0.448439782885968, + "learning_rate": 9.544299179588952e-06, + "loss": 0.2197, + "step": 2006 + }, + { + "epoch": 0.6702287527133077, + "grad_norm": 0.5320016792358742, + "learning_rate": 9.543488268809478e-06, + "loss": 0.2217, + "step": 2007 + }, + { + "epoch": 0.6705626982801803, + "grad_norm": 0.5033575128331692, + "learning_rate": 9.542676671682601e-06, + "loss": 0.2294, + "step": 2008 + }, + { + "epoch": 0.6708966438470529, + "grad_norm": 0.5338603551058775, + "learning_rate": 9.541864388330926e-06, + "loss": 0.2359, + "step": 2009 + }, + { + "epoch": 0.6712305894139255, + "grad_norm": 0.4960343876895645, + "learning_rate": 9.541051418877156e-06, + "loss": 0.23, + "step": 2010 + }, + { + "epoch": 0.6715645349807982, + "grad_norm": 0.5019233736697615, + "learning_rate": 9.5402377634441e-06, + "loss": 0.2184, + "step": 2011 + }, + { + "epoch": 0.6718984805476708, + "grad_norm": 0.5116242184595458, + "learning_rate": 9.539423422154672e-06, + "loss": 0.2057, + "step": 2012 + }, + { + "epoch": 0.6722324261145434, + "grad_norm": 0.4581151499736462, + "learning_rate": 9.538608395131884e-06, + "loss": 0.219, + "step": 2013 + }, + { + "epoch": 0.672566371681416, + "grad_norm": 0.5263573724429383, + "learning_rate": 9.537792682498859e-06, + "loss": 0.2345, + "step": 2014 + }, + { + "epoch": 0.6729003172482886, + "grad_norm": 0.5486956203361241, + "learning_rate": 9.536976284378818e-06, + "loss": 0.2365, + "step": 2015 + }, + { + "epoch": 0.6732342628151611, + "grad_norm": 0.48552662228072885, + "learning_rate": 9.536159200895088e-06, + "loss": 0.2472, + "step": 2016 + }, + { + "epoch": 0.6735682083820337, + "grad_norm": 0.4807730164835203, + "learning_rate": 9.535341432171098e-06, + "loss": 0.2158, + "step": 2017 + }, + { + "epoch": 0.6739021539489063, + "grad_norm": 0.5082241482128, + "learning_rate": 9.534522978330384e-06, + "loss": 0.2273, + "step": 2018 + }, + { + "epoch": 0.6742360995157789, + "grad_norm": 0.5247052847237837, + "learning_rate": 9.533703839496581e-06, + "loss": 0.2122, + "step": 2019 + }, + { + "epoch": 0.6745700450826515, + "grad_norm": 0.4996722623100839, + "learning_rate": 9.532884015793432e-06, + "loss": 0.2144, + "step": 2020 + }, + { + "epoch": 0.6749039906495241, + "grad_norm": 0.4864919237009365, + "learning_rate": 9.532063507344777e-06, + "loss": 0.2246, + "step": 2021 + }, + { + "epoch": 0.6752379362163967, + "grad_norm": 0.5055062622420079, + "learning_rate": 9.53124231427457e-06, + "loss": 0.2329, + "step": 2022 + }, + { + "epoch": 0.6755718817832693, + "grad_norm": 0.5127546113587057, + "learning_rate": 9.530420436706853e-06, + "loss": 0.225, + "step": 2023 + }, + { + "epoch": 0.6759058273501419, + "grad_norm": 0.5199847442243274, + "learning_rate": 9.529597874765788e-06, + "loss": 0.2224, + "step": 2024 + }, + { + "epoch": 0.6762397729170145, + "grad_norm": 0.5520828526591032, + "learning_rate": 9.528774628575628e-06, + "loss": 0.2329, + "step": 2025 + }, + { + "epoch": 0.6765737184838871, + "grad_norm": 0.4690377580828556, + "learning_rate": 9.527950698260737e-06, + "loss": 0.2162, + "step": 2026 + }, + { + "epoch": 0.6769076640507597, + "grad_norm": 0.7257663653006456, + "learning_rate": 9.527126083945578e-06, + "loss": 0.2423, + "step": 2027 + }, + { + "epoch": 0.6772416096176324, + "grad_norm": 0.592781750687045, + "learning_rate": 9.526300785754719e-06, + "loss": 0.2328, + "step": 2028 + }, + { + "epoch": 0.677575555184505, + "grad_norm": 0.6355902972521256, + "learning_rate": 9.525474803812831e-06, + "loss": 0.2281, + "step": 2029 + }, + { + "epoch": 0.6779095007513776, + "grad_norm": 0.5201719503575588, + "learning_rate": 9.524648138244688e-06, + "loss": 0.2376, + "step": 2030 + }, + { + "epoch": 0.6782434463182502, + "grad_norm": 0.5216385214418539, + "learning_rate": 9.523820789175167e-06, + "loss": 0.2329, + "step": 2031 + }, + { + "epoch": 0.6785773918851227, + "grad_norm": 0.4860538015687635, + "learning_rate": 9.52299275672925e-06, + "loss": 0.2239, + "step": 2032 + }, + { + "epoch": 0.6789113374519953, + "grad_norm": 0.5065750638589931, + "learning_rate": 9.52216404103202e-06, + "loss": 0.2204, + "step": 2033 + }, + { + "epoch": 0.6792452830188679, + "grad_norm": 0.5465674778090972, + "learning_rate": 9.521334642208666e-06, + "loss": 0.2149, + "step": 2034 + }, + { + "epoch": 0.6795792285857405, + "grad_norm": 0.530148424285913, + "learning_rate": 9.520504560384476e-06, + "loss": 0.2291, + "step": 2035 + }, + { + "epoch": 0.6799131741526131, + "grad_norm": 0.439419070166103, + "learning_rate": 9.519673795684845e-06, + "loss": 0.2246, + "step": 2036 + }, + { + "epoch": 0.6802471197194857, + "grad_norm": 0.5110159600833178, + "learning_rate": 9.518842348235271e-06, + "loss": 0.2156, + "step": 2037 + }, + { + "epoch": 0.6805810652863583, + "grad_norm": 0.5080312664365709, + "learning_rate": 9.51801021816135e-06, + "loss": 0.2282, + "step": 2038 + }, + { + "epoch": 0.6809150108532309, + "grad_norm": 0.4405972025652121, + "learning_rate": 9.51717740558879e-06, + "loss": 0.2287, + "step": 2039 + }, + { + "epoch": 0.6812489564201035, + "grad_norm": 0.4846966375847971, + "learning_rate": 9.516343910643395e-06, + "loss": 0.2304, + "step": 2040 + }, + { + "epoch": 0.6815829019869761, + "grad_norm": 0.4458254730841546, + "learning_rate": 9.515509733451074e-06, + "loss": 0.2266, + "step": 2041 + }, + { + "epoch": 0.6819168475538487, + "grad_norm": 0.5081979817054557, + "learning_rate": 9.514674874137838e-06, + "loss": 0.2212, + "step": 2042 + }, + { + "epoch": 0.6822507931207213, + "grad_norm": 0.44506787175264934, + "learning_rate": 9.513839332829806e-06, + "loss": 0.217, + "step": 2043 + }, + { + "epoch": 0.682584738687594, + "grad_norm": 0.4711214808235664, + "learning_rate": 9.513003109653192e-06, + "loss": 0.214, + "step": 2044 + }, + { + "epoch": 0.6829186842544666, + "grad_norm": 0.4690699744568473, + "learning_rate": 9.512166204734322e-06, + "loss": 0.2265, + "step": 2045 + }, + { + "epoch": 0.6832526298213392, + "grad_norm": 0.47218778632793423, + "learning_rate": 9.511328618199614e-06, + "loss": 0.222, + "step": 2046 + }, + { + "epoch": 0.6835865753882118, + "grad_norm": 0.5360979263013558, + "learning_rate": 9.510490350175602e-06, + "loss": 0.2571, + "step": 2047 + }, + { + "epoch": 0.6839205209550844, + "grad_norm": 0.4708991844847955, + "learning_rate": 9.50965140078891e-06, + "loss": 0.2347, + "step": 2048 + }, + { + "epoch": 0.6842544665219569, + "grad_norm": 0.4819450196925043, + "learning_rate": 9.508811770166277e-06, + "loss": 0.2322, + "step": 2049 + }, + { + "epoch": 0.6845884120888295, + "grad_norm": 0.45770077260472547, + "learning_rate": 9.507971458434538e-06, + "loss": 0.2217, + "step": 2050 + }, + { + "epoch": 0.6849223576557021, + "grad_norm": 0.4682650978661971, + "learning_rate": 9.507130465720628e-06, + "loss": 0.2258, + "step": 2051 + }, + { + "epoch": 0.6852563032225747, + "grad_norm": 0.4225589686793817, + "learning_rate": 9.506288792151592e-06, + "loss": 0.2082, + "step": 2052 + }, + { + "epoch": 0.6855902487894473, + "grad_norm": 0.43243099858850814, + "learning_rate": 9.505446437854574e-06, + "loss": 0.2192, + "step": 2053 + }, + { + "epoch": 0.6859241943563199, + "grad_norm": 0.48545888804912246, + "learning_rate": 9.504603402956823e-06, + "loss": 0.232, + "step": 2054 + }, + { + "epoch": 0.6862581399231925, + "grad_norm": 0.4784145024105088, + "learning_rate": 9.503759687585686e-06, + "loss": 0.2257, + "step": 2055 + }, + { + "epoch": 0.6865920854900651, + "grad_norm": 0.47809150718111393, + "learning_rate": 9.50291529186862e-06, + "loss": 0.2264, + "step": 2056 + }, + { + "epoch": 0.6869260310569377, + "grad_norm": 0.45443749767645863, + "learning_rate": 9.502070215933177e-06, + "loss": 0.2247, + "step": 2057 + }, + { + "epoch": 0.6872599766238103, + "grad_norm": 0.5001191908128457, + "learning_rate": 9.501224459907019e-06, + "loss": 0.229, + "step": 2058 + }, + { + "epoch": 0.6875939221906829, + "grad_norm": 0.4682122413633237, + "learning_rate": 9.500378023917906e-06, + "loss": 0.2206, + "step": 2059 + }, + { + "epoch": 0.6879278677575555, + "grad_norm": 0.48110266017390535, + "learning_rate": 9.499530908093702e-06, + "loss": 0.2237, + "step": 2060 + }, + { + "epoch": 0.6882618133244282, + "grad_norm": 0.4758868687878399, + "learning_rate": 9.498683112562374e-06, + "loss": 0.2292, + "step": 2061 + }, + { + "epoch": 0.6885957588913008, + "grad_norm": 0.4398159757318568, + "learning_rate": 9.497834637451992e-06, + "loss": 0.2194, + "step": 2062 + }, + { + "epoch": 0.6889297044581734, + "grad_norm": 0.4806638031099439, + "learning_rate": 9.496985482890728e-06, + "loss": 0.2204, + "step": 2063 + }, + { + "epoch": 0.689263650025046, + "grad_norm": 0.7573298106478715, + "learning_rate": 9.496135649006857e-06, + "loss": 0.2451, + "step": 2064 + }, + { + "epoch": 0.6895975955919185, + "grad_norm": 0.4398493907855471, + "learning_rate": 9.495285135928755e-06, + "loss": 0.2357, + "step": 2065 + }, + { + "epoch": 0.6899315411587911, + "grad_norm": 0.4722521081195024, + "learning_rate": 9.494433943784901e-06, + "loss": 0.2284, + "step": 2066 + }, + { + "epoch": 0.6902654867256637, + "grad_norm": 0.44358844114458357, + "learning_rate": 9.493582072703883e-06, + "loss": 0.2094, + "step": 2067 + }, + { + "epoch": 0.6905994322925363, + "grad_norm": 0.5388979510188426, + "learning_rate": 9.49272952281438e-06, + "loss": 0.2498, + "step": 2068 + }, + { + "epoch": 0.6909333778594089, + "grad_norm": 0.4401449144477547, + "learning_rate": 9.491876294245184e-06, + "loss": 0.2288, + "step": 2069 + }, + { + "epoch": 0.6912673234262815, + "grad_norm": 0.5258022574749537, + "learning_rate": 9.491022387125183e-06, + "loss": 0.2227, + "step": 2070 + }, + { + "epoch": 0.6916012689931541, + "grad_norm": 0.484024323700997, + "learning_rate": 9.490167801583373e-06, + "loss": 0.2139, + "step": 2071 + }, + { + "epoch": 0.6919352145600267, + "grad_norm": 0.506106087923025, + "learning_rate": 9.489312537748843e-06, + "loss": 0.2431, + "step": 2072 + }, + { + "epoch": 0.6922691601268993, + "grad_norm": 0.4585978711248216, + "learning_rate": 9.488456595750795e-06, + "loss": 0.2339, + "step": 2073 + }, + { + "epoch": 0.6926031056937719, + "grad_norm": 0.6031424608851983, + "learning_rate": 9.487599975718529e-06, + "loss": 0.2433, + "step": 2074 + }, + { + "epoch": 0.6929370512606445, + "grad_norm": 0.4556413453192436, + "learning_rate": 9.486742677781446e-06, + "loss": 0.2189, + "step": 2075 + }, + { + "epoch": 0.6932709968275171, + "grad_norm": 0.687156099033192, + "learning_rate": 9.485884702069053e-06, + "loss": 0.2343, + "step": 2076 + }, + { + "epoch": 0.6936049423943897, + "grad_norm": 0.4659994646594298, + "learning_rate": 9.485026048710957e-06, + "loss": 0.2295, + "step": 2077 + }, + { + "epoch": 0.6939388879612624, + "grad_norm": 0.5014688431544414, + "learning_rate": 9.484166717836865e-06, + "loss": 0.234, + "step": 2078 + }, + { + "epoch": 0.694272833528135, + "grad_norm": 0.5334695156505862, + "learning_rate": 9.48330670957659e-06, + "loss": 0.2192, + "step": 2079 + }, + { + "epoch": 0.6946067790950076, + "grad_norm": 0.5223706657750462, + "learning_rate": 9.48244602406005e-06, + "loss": 0.2254, + "step": 2080 + }, + { + "epoch": 0.6949407246618801, + "grad_norm": 0.5463595012524518, + "learning_rate": 9.481584661417258e-06, + "loss": 0.2254, + "step": 2081 + }, + { + "epoch": 0.6952746702287527, + "grad_norm": 0.48561380690299644, + "learning_rate": 9.480722621778334e-06, + "loss": 0.2277, + "step": 2082 + }, + { + "epoch": 0.6956086157956253, + "grad_norm": 0.4492219269705461, + "learning_rate": 9.479859905273498e-06, + "loss": 0.2236, + "step": 2083 + }, + { + "epoch": 0.6959425613624979, + "grad_norm": 0.5504216916036948, + "learning_rate": 9.478996512033074e-06, + "loss": 0.2467, + "step": 2084 + }, + { + "epoch": 0.6962765069293705, + "grad_norm": 0.5111070325627567, + "learning_rate": 9.478132442187491e-06, + "loss": 0.2112, + "step": 2085 + }, + { + "epoch": 0.6966104524962431, + "grad_norm": 0.47186041559580094, + "learning_rate": 9.477267695867275e-06, + "loss": 0.2229, + "step": 2086 + }, + { + "epoch": 0.6969443980631157, + "grad_norm": 0.4488571924469654, + "learning_rate": 9.476402273203052e-06, + "loss": 0.2225, + "step": 2087 + }, + { + "epoch": 0.6972783436299883, + "grad_norm": 0.5787574166594714, + "learning_rate": 9.47553617432556e-06, + "loss": 0.2335, + "step": 2088 + }, + { + "epoch": 0.6976122891968609, + "grad_norm": 0.49866311539288444, + "learning_rate": 9.47466939936563e-06, + "loss": 0.2237, + "step": 2089 + }, + { + "epoch": 0.6979462347637335, + "grad_norm": 0.7548546423555135, + "learning_rate": 9.473801948454199e-06, + "loss": 0.23, + "step": 2090 + }, + { + "epoch": 0.6982801803306061, + "grad_norm": 0.49007999916257805, + "learning_rate": 9.472933821722307e-06, + "loss": 0.2166, + "step": 2091 + }, + { + "epoch": 0.6986141258974787, + "grad_norm": 0.47535667454395897, + "learning_rate": 9.472065019301095e-06, + "loss": 0.2184, + "step": 2092 + }, + { + "epoch": 0.6989480714643513, + "grad_norm": 0.5211167728844137, + "learning_rate": 9.471195541321805e-06, + "loss": 0.2176, + "step": 2093 + }, + { + "epoch": 0.699282017031224, + "grad_norm": 0.5003555535014772, + "learning_rate": 9.470325387915782e-06, + "loss": 0.2353, + "step": 2094 + }, + { + "epoch": 0.6996159625980966, + "grad_norm": 0.48728850394662204, + "learning_rate": 9.469454559214473e-06, + "loss": 0.2278, + "step": 2095 + }, + { + "epoch": 0.6999499081649692, + "grad_norm": 0.5088758728966357, + "learning_rate": 9.468583055349425e-06, + "loss": 0.22, + "step": 2096 + }, + { + "epoch": 0.7002838537318417, + "grad_norm": 0.4368236017414324, + "learning_rate": 9.467710876452292e-06, + "loss": 0.2233, + "step": 2097 + }, + { + "epoch": 0.7006177992987143, + "grad_norm": 0.5077021290184639, + "learning_rate": 9.466838022654826e-06, + "loss": 0.2352, + "step": 2098 + }, + { + "epoch": 0.7009517448655869, + "grad_norm": 0.5513449938345952, + "learning_rate": 9.465964494088879e-06, + "loss": 0.2327, + "step": 2099 + }, + { + "epoch": 0.7012856904324595, + "grad_norm": 0.4771101109092869, + "learning_rate": 9.465090290886411e-06, + "loss": 0.2305, + "step": 2100 + }, + { + "epoch": 0.7016196359993321, + "grad_norm": 0.5415328425413375, + "learning_rate": 9.464215413179483e-06, + "loss": 0.2424, + "step": 2101 + }, + { + "epoch": 0.7019535815662047, + "grad_norm": 0.49226725525209947, + "learning_rate": 9.46333986110025e-06, + "loss": 0.2095, + "step": 2102 + }, + { + "epoch": 0.7022875271330773, + "grad_norm": 0.47931978869160313, + "learning_rate": 9.462463634780977e-06, + "loss": 0.2221, + "step": 2103 + }, + { + "epoch": 0.7026214726999499, + "grad_norm": 0.4655415271549972, + "learning_rate": 9.461586734354027e-06, + "loss": 0.2201, + "step": 2104 + }, + { + "epoch": 0.7029554182668225, + "grad_norm": 0.5668783713703968, + "learning_rate": 9.460709159951867e-06, + "loss": 0.2385, + "step": 2105 + }, + { + "epoch": 0.7032893638336951, + "grad_norm": 0.4442504668426384, + "learning_rate": 9.459830911707066e-06, + "loss": 0.2174, + "step": 2106 + }, + { + "epoch": 0.7036233094005677, + "grad_norm": 0.5148802971653738, + "learning_rate": 9.458951989752295e-06, + "loss": 0.2298, + "step": 2107 + }, + { + "epoch": 0.7039572549674403, + "grad_norm": 0.4961289354471199, + "learning_rate": 9.458072394220321e-06, + "loss": 0.2365, + "step": 2108 + }, + { + "epoch": 0.7042912005343129, + "grad_norm": 0.39859284746010964, + "learning_rate": 9.457192125244021e-06, + "loss": 0.2026, + "step": 2109 + }, + { + "epoch": 0.7046251461011855, + "grad_norm": 0.43207299712712405, + "learning_rate": 9.456311182956368e-06, + "loss": 0.2191, + "step": 2110 + }, + { + "epoch": 0.7049590916680581, + "grad_norm": 0.4752148941522198, + "learning_rate": 9.45542956749044e-06, + "loss": 0.2325, + "step": 2111 + }, + { + "epoch": 0.7052930372349308, + "grad_norm": 0.5010644723261528, + "learning_rate": 9.454547278979415e-06, + "loss": 0.2191, + "step": 2112 + }, + { + "epoch": 0.7056269828018034, + "grad_norm": 0.40752873703821485, + "learning_rate": 9.453664317556572e-06, + "loss": 0.2106, + "step": 2113 + }, + { + "epoch": 0.7059609283686759, + "grad_norm": 0.5576876661906841, + "learning_rate": 9.452780683355295e-06, + "loss": 0.222, + "step": 2114 + }, + { + "epoch": 0.7062948739355485, + "grad_norm": 0.432810708164213, + "learning_rate": 9.451896376509065e-06, + "loss": 0.2113, + "step": 2115 + }, + { + "epoch": 0.7066288195024211, + "grad_norm": 0.4482096027677475, + "learning_rate": 9.451011397151469e-06, + "loss": 0.2209, + "step": 2116 + }, + { + "epoch": 0.7069627650692937, + "grad_norm": 0.4366216644067192, + "learning_rate": 9.450125745416191e-06, + "loss": 0.2107, + "step": 2117 + }, + { + "epoch": 0.7072967106361663, + "grad_norm": 0.4948001961193805, + "learning_rate": 9.44923942143702e-06, + "loss": 0.2346, + "step": 2118 + }, + { + "epoch": 0.7076306562030389, + "grad_norm": 0.8894874375807659, + "learning_rate": 9.448352425347848e-06, + "loss": 0.2458, + "step": 2119 + }, + { + "epoch": 0.7079646017699115, + "grad_norm": 0.5299843319037437, + "learning_rate": 9.447464757282665e-06, + "loss": 0.2367, + "step": 2120 + }, + { + "epoch": 0.7082985473367841, + "grad_norm": 0.4727096807635542, + "learning_rate": 9.44657641737556e-06, + "loss": 0.2327, + "step": 2121 + }, + { + "epoch": 0.7086324929036567, + "grad_norm": 0.4600930804291522, + "learning_rate": 9.445687405760735e-06, + "loss": 0.2226, + "step": 2122 + }, + { + "epoch": 0.7089664384705293, + "grad_norm": 0.4332929425486284, + "learning_rate": 9.444797722572479e-06, + "loss": 0.2107, + "step": 2123 + }, + { + "epoch": 0.7093003840374019, + "grad_norm": 0.731912468149352, + "learning_rate": 9.44390736794519e-06, + "loss": 0.238, + "step": 2124 + }, + { + "epoch": 0.7096343296042745, + "grad_norm": 0.5081533596669653, + "learning_rate": 9.443016342013369e-06, + "loss": 0.2351, + "step": 2125 + }, + { + "epoch": 0.7099682751711471, + "grad_norm": 0.5148768327746467, + "learning_rate": 9.442124644911614e-06, + "loss": 0.2228, + "step": 2126 + }, + { + "epoch": 0.7103022207380197, + "grad_norm": 0.4889377936506926, + "learning_rate": 9.441232276774629e-06, + "loss": 0.2185, + "step": 2127 + }, + { + "epoch": 0.7106361663048923, + "grad_norm": 0.48641943614807776, + "learning_rate": 9.440339237737213e-06, + "loss": 0.2201, + "step": 2128 + }, + { + "epoch": 0.710970111871765, + "grad_norm": 0.5242633602293272, + "learning_rate": 9.439445527934272e-06, + "loss": 0.2176, + "step": 2129 + }, + { + "epoch": 0.7113040574386374, + "grad_norm": 0.5762365105646677, + "learning_rate": 9.438551147500812e-06, + "loss": 0.2338, + "step": 2130 + }, + { + "epoch": 0.7116380030055101, + "grad_norm": 0.48698836807254486, + "learning_rate": 9.437656096571938e-06, + "loss": 0.2287, + "step": 2131 + }, + { + "epoch": 0.7119719485723827, + "grad_norm": 0.5525740892373583, + "learning_rate": 9.436760375282858e-06, + "loss": 0.2395, + "step": 2132 + }, + { + "epoch": 0.7123058941392553, + "grad_norm": 0.6121087031250623, + "learning_rate": 9.435863983768884e-06, + "loss": 0.2327, + "step": 2133 + }, + { + "epoch": 0.7126398397061279, + "grad_norm": 0.5393868118238357, + "learning_rate": 9.434966922165424e-06, + "loss": 0.242, + "step": 2134 + }, + { + "epoch": 0.7129737852730005, + "grad_norm": 0.5338938325124174, + "learning_rate": 9.43406919060799e-06, + "loss": 0.2094, + "step": 2135 + }, + { + "epoch": 0.7133077308398731, + "grad_norm": 0.4834589668720248, + "learning_rate": 9.433170789232196e-06, + "loss": 0.207, + "step": 2136 + }, + { + "epoch": 0.7136416764067457, + "grad_norm": 0.636163814834721, + "learning_rate": 9.432271718173756e-06, + "loss": 0.2373, + "step": 2137 + }, + { + "epoch": 0.7139756219736183, + "grad_norm": 0.5423717816603074, + "learning_rate": 9.431371977568483e-06, + "loss": 0.2398, + "step": 2138 + }, + { + "epoch": 0.7143095675404909, + "grad_norm": 0.6024062328191949, + "learning_rate": 9.430471567552295e-06, + "loss": 0.2215, + "step": 2139 + }, + { + "epoch": 0.7146435131073635, + "grad_norm": 0.5675673827510521, + "learning_rate": 9.42957048826121e-06, + "loss": 0.2296, + "step": 2140 + }, + { + "epoch": 0.7149774586742361, + "grad_norm": 0.47911769123994113, + "learning_rate": 9.428668739831349e-06, + "loss": 0.2133, + "step": 2141 + }, + { + "epoch": 0.7153114042411087, + "grad_norm": 0.5032255216977924, + "learning_rate": 9.427766322398926e-06, + "loss": 0.228, + "step": 2142 + }, + { + "epoch": 0.7156453498079813, + "grad_norm": 0.4892738328424283, + "learning_rate": 9.426863236100266e-06, + "loss": 0.2315, + "step": 2143 + }, + { + "epoch": 0.7159792953748539, + "grad_norm": 0.5488139396113335, + "learning_rate": 9.425959481071787e-06, + "loss": 0.2283, + "step": 2144 + }, + { + "epoch": 0.7163132409417265, + "grad_norm": 0.46181208491623815, + "learning_rate": 9.425055057450017e-06, + "loss": 0.2199, + "step": 2145 + }, + { + "epoch": 0.716647186508599, + "grad_norm": 0.47958048701662137, + "learning_rate": 9.424149965371576e-06, + "loss": 0.2377, + "step": 2146 + }, + { + "epoch": 0.7169811320754716, + "grad_norm": 0.533708758771216, + "learning_rate": 9.423244204973191e-06, + "loss": 0.2263, + "step": 2147 + }, + { + "epoch": 0.7173150776423443, + "grad_norm": 0.47032392487456925, + "learning_rate": 9.422337776391686e-06, + "loss": 0.2139, + "step": 2148 + }, + { + "epoch": 0.7176490232092169, + "grad_norm": 0.4681306879448958, + "learning_rate": 9.421430679763989e-06, + "loss": 0.2217, + "step": 2149 + }, + { + "epoch": 0.7179829687760895, + "grad_norm": 0.4611059292110921, + "learning_rate": 9.420522915227129e-06, + "loss": 0.2262, + "step": 2150 + }, + { + "epoch": 0.7183169143429621, + "grad_norm": 0.4649061259338331, + "learning_rate": 9.419614482918229e-06, + "loss": 0.2194, + "step": 2151 + }, + { + "epoch": 0.7186508599098347, + "grad_norm": 0.49767814980354536, + "learning_rate": 9.418705382974524e-06, + "loss": 0.2165, + "step": 2152 + }, + { + "epoch": 0.7189848054767073, + "grad_norm": 0.4876893264124481, + "learning_rate": 9.417795615533343e-06, + "loss": 0.2306, + "step": 2153 + }, + { + "epoch": 0.7193187510435799, + "grad_norm": 0.5573835824314209, + "learning_rate": 9.416885180732115e-06, + "loss": 0.241, + "step": 2154 + }, + { + "epoch": 0.7196526966104525, + "grad_norm": 0.42980907750226544, + "learning_rate": 9.415974078708375e-06, + "loss": 0.2142, + "step": 2155 + }, + { + "epoch": 0.7199866421773251, + "grad_norm": 0.4581280120482188, + "learning_rate": 9.415062309599751e-06, + "loss": 0.225, + "step": 2156 + }, + { + "epoch": 0.7203205877441977, + "grad_norm": 0.4793267456929462, + "learning_rate": 9.414149873543983e-06, + "loss": 0.2216, + "step": 2157 + }, + { + "epoch": 0.7206545333110703, + "grad_norm": 0.5360592011567692, + "learning_rate": 9.4132367706789e-06, + "loss": 0.207, + "step": 2158 + }, + { + "epoch": 0.7209884788779429, + "grad_norm": 0.422519463252475, + "learning_rate": 9.412323001142438e-06, + "loss": 0.2048, + "step": 2159 + }, + { + "epoch": 0.7213224244448155, + "grad_norm": 0.47418431358564955, + "learning_rate": 9.411408565072635e-06, + "loss": 0.2353, + "step": 2160 + }, + { + "epoch": 0.7216563700116881, + "grad_norm": 0.43728181066282573, + "learning_rate": 9.410493462607623e-06, + "loss": 0.2095, + "step": 2161 + }, + { + "epoch": 0.7219903155785607, + "grad_norm": 0.5017096018356688, + "learning_rate": 9.409577693885642e-06, + "loss": 0.2326, + "step": 2162 + }, + { + "epoch": 0.7223242611454332, + "grad_norm": 0.45202500297613973, + "learning_rate": 9.408661259045032e-06, + "loss": 0.215, + "step": 2163 + }, + { + "epoch": 0.7226582067123059, + "grad_norm": 0.49163901550448014, + "learning_rate": 9.407744158224227e-06, + "loss": 0.2149, + "step": 2164 + }, + { + "epoch": 0.7229921522791785, + "grad_norm": 0.49708508765871556, + "learning_rate": 9.406826391561767e-06, + "loss": 0.228, + "step": 2165 + }, + { + "epoch": 0.7233260978460511, + "grad_norm": 0.5530402316677213, + "learning_rate": 9.405907959196293e-06, + "loss": 0.2534, + "step": 2166 + }, + { + "epoch": 0.7236600434129237, + "grad_norm": 0.4590331097109294, + "learning_rate": 9.404988861266543e-06, + "loss": 0.2127, + "step": 2167 + }, + { + "epoch": 0.7239939889797963, + "grad_norm": 0.5061377928038171, + "learning_rate": 9.404069097911358e-06, + "loss": 0.2181, + "step": 2168 + }, + { + "epoch": 0.7243279345466689, + "grad_norm": 0.4485152013068261, + "learning_rate": 9.40314866926968e-06, + "loss": 0.2113, + "step": 2169 + }, + { + "epoch": 0.7246618801135415, + "grad_norm": 0.4916280062892611, + "learning_rate": 9.402227575480549e-06, + "loss": 0.2363, + "step": 2170 + }, + { + "epoch": 0.7249958256804141, + "grad_norm": 0.4369101484429515, + "learning_rate": 9.401305816683111e-06, + "loss": 0.2022, + "step": 2171 + }, + { + "epoch": 0.7253297712472867, + "grad_norm": 0.4506814138302595, + "learning_rate": 9.400383393016604e-06, + "loss": 0.2273, + "step": 2172 + }, + { + "epoch": 0.7256637168141593, + "grad_norm": 0.4383138721900733, + "learning_rate": 9.39946030462037e-06, + "loss": 0.2058, + "step": 2173 + }, + { + "epoch": 0.7259976623810319, + "grad_norm": 0.4733775066633754, + "learning_rate": 9.39853655163386e-06, + "loss": 0.2033, + "step": 2174 + }, + { + "epoch": 0.7263316079479045, + "grad_norm": 0.44987005012925274, + "learning_rate": 9.39761213419661e-06, + "loss": 0.2093, + "step": 2175 + }, + { + "epoch": 0.7266655535147771, + "grad_norm": 0.44081375935995787, + "learning_rate": 9.396687052448267e-06, + "loss": 0.2151, + "step": 2176 + }, + { + "epoch": 0.7269994990816497, + "grad_norm": 0.4668970976585449, + "learning_rate": 9.395761306528576e-06, + "loss": 0.2111, + "step": 2177 + }, + { + "epoch": 0.7273334446485223, + "grad_norm": 0.4762785397500771, + "learning_rate": 9.39483489657738e-06, + "loss": 0.212, + "step": 2178 + }, + { + "epoch": 0.7276673902153948, + "grad_norm": 0.447677116890413, + "learning_rate": 9.393907822734627e-06, + "loss": 0.22, + "step": 2179 + }, + { + "epoch": 0.7280013357822674, + "grad_norm": 0.535442818777305, + "learning_rate": 9.39298008514036e-06, + "loss": 0.2372, + "step": 2180 + }, + { + "epoch": 0.72833528134914, + "grad_norm": 0.5188386933388953, + "learning_rate": 9.392051683934726e-06, + "loss": 0.2268, + "step": 2181 + }, + { + "epoch": 0.7286692269160127, + "grad_norm": 0.4727820750476286, + "learning_rate": 9.39112261925797e-06, + "loss": 0.2179, + "step": 2182 + }, + { + "epoch": 0.7290031724828853, + "grad_norm": 0.4854734035330415, + "learning_rate": 9.390192891250439e-06, + "loss": 0.2406, + "step": 2183 + }, + { + "epoch": 0.7293371180497579, + "grad_norm": 1.4538057381250347, + "learning_rate": 9.389262500052578e-06, + "loss": 0.226, + "step": 2184 + }, + { + "epoch": 0.7296710636166305, + "grad_norm": 0.511345482598057, + "learning_rate": 9.388331445804935e-06, + "loss": 0.2238, + "step": 2185 + }, + { + "epoch": 0.7300050091835031, + "grad_norm": 0.49329394070154936, + "learning_rate": 9.387399728648156e-06, + "loss": 0.2164, + "step": 2186 + }, + { + "epoch": 0.7303389547503757, + "grad_norm": 0.47261545953275097, + "learning_rate": 9.386467348722989e-06, + "loss": 0.2369, + "step": 2187 + }, + { + "epoch": 0.7306729003172483, + "grad_norm": 0.4970024372138262, + "learning_rate": 9.385534306170279e-06, + "loss": 0.2337, + "step": 2188 + }, + { + "epoch": 0.7310068458841209, + "grad_norm": 0.4407037790976595, + "learning_rate": 9.384600601130973e-06, + "loss": 0.2181, + "step": 2189 + }, + { + "epoch": 0.7313407914509935, + "grad_norm": 0.4612359667383596, + "learning_rate": 9.383666233746121e-06, + "loss": 0.213, + "step": 2190 + }, + { + "epoch": 0.7316747370178661, + "grad_norm": 0.5683151991367129, + "learning_rate": 9.382731204156869e-06, + "loss": 0.2403, + "step": 2191 + }, + { + "epoch": 0.7320086825847387, + "grad_norm": 0.6075346410739748, + "learning_rate": 9.381795512504461e-06, + "loss": 0.2251, + "step": 2192 + }, + { + "epoch": 0.7323426281516113, + "grad_norm": 0.7259845103570428, + "learning_rate": 9.380859158930249e-06, + "loss": 0.2379, + "step": 2193 + }, + { + "epoch": 0.7326765737184839, + "grad_norm": 0.47741648545629145, + "learning_rate": 9.379922143575678e-06, + "loss": 0.2191, + "step": 2194 + }, + { + "epoch": 0.7330105192853564, + "grad_norm": 0.45871484382317196, + "learning_rate": 9.378984466582294e-06, + "loss": 0.207, + "step": 2195 + }, + { + "epoch": 0.733344464852229, + "grad_norm": 0.4698346774760905, + "learning_rate": 9.378046128091748e-06, + "loss": 0.2156, + "step": 2196 + }, + { + "epoch": 0.7336784104191016, + "grad_norm": 0.517374409193277, + "learning_rate": 9.377107128245782e-06, + "loss": 0.2168, + "step": 2197 + }, + { + "epoch": 0.7340123559859743, + "grad_norm": 0.4960261055253891, + "learning_rate": 9.376167467186246e-06, + "loss": 0.2302, + "step": 2198 + }, + { + "epoch": 0.7343463015528469, + "grad_norm": 0.5756540347478746, + "learning_rate": 9.375227145055085e-06, + "loss": 0.2132, + "step": 2199 + }, + { + "epoch": 0.7346802471197195, + "grad_norm": 0.5091682769372302, + "learning_rate": 9.374286161994351e-06, + "loss": 0.2282, + "step": 2200 + }, + { + "epoch": 0.7350141926865921, + "grad_norm": 0.5067877827964193, + "learning_rate": 9.373344518146184e-06, + "loss": 0.2359, + "step": 2201 + }, + { + "epoch": 0.7353481382534647, + "grad_norm": 0.5423740124921783, + "learning_rate": 9.372402213652833e-06, + "loss": 0.2315, + "step": 2202 + }, + { + "epoch": 0.7356820838203373, + "grad_norm": 0.453426255813076, + "learning_rate": 9.371459248656645e-06, + "loss": 0.2123, + "step": 2203 + }, + { + "epoch": 0.7360160293872099, + "grad_norm": 0.5465347054793437, + "learning_rate": 9.370515623300066e-06, + "loss": 0.2203, + "step": 2204 + }, + { + "epoch": 0.7363499749540825, + "grad_norm": 0.5653821901957052, + "learning_rate": 9.369571337725638e-06, + "loss": 0.2308, + "step": 2205 + }, + { + "epoch": 0.7366839205209551, + "grad_norm": 0.5246706912753366, + "learning_rate": 9.368626392076013e-06, + "loss": 0.2288, + "step": 2206 + }, + { + "epoch": 0.7370178660878277, + "grad_norm": 0.42226917655168017, + "learning_rate": 9.367680786493929e-06, + "loss": 0.2245, + "step": 2207 + }, + { + "epoch": 0.7373518116547003, + "grad_norm": 0.5148652520612048, + "learning_rate": 9.366734521122236e-06, + "loss": 0.2121, + "step": 2208 + }, + { + "epoch": 0.7376857572215729, + "grad_norm": 0.49636706629870225, + "learning_rate": 9.365787596103877e-06, + "loss": 0.2422, + "step": 2209 + }, + { + "epoch": 0.7380197027884455, + "grad_norm": 0.5272484506883985, + "learning_rate": 9.364840011581896e-06, + "loss": 0.2361, + "step": 2210 + }, + { + "epoch": 0.7383536483553181, + "grad_norm": 0.48013110860491137, + "learning_rate": 9.363891767699437e-06, + "loss": 0.2289, + "step": 2211 + }, + { + "epoch": 0.7386875939221906, + "grad_norm": 0.4464703818992004, + "learning_rate": 9.362942864599746e-06, + "loss": 0.2189, + "step": 2212 + }, + { + "epoch": 0.7390215394890632, + "grad_norm": 0.49475924194998067, + "learning_rate": 9.36199330242616e-06, + "loss": 0.2301, + "step": 2213 + }, + { + "epoch": 0.7393554850559358, + "grad_norm": 0.46063739425578615, + "learning_rate": 9.361043081322125e-06, + "loss": 0.2268, + "step": 2214 + }, + { + "epoch": 0.7396894306228085, + "grad_norm": 0.4741690736245642, + "learning_rate": 9.360092201431186e-06, + "loss": 0.2333, + "step": 2215 + }, + { + "epoch": 0.7400233761896811, + "grad_norm": 0.5013668916419637, + "learning_rate": 9.359140662896978e-06, + "loss": 0.2202, + "step": 2216 + }, + { + "epoch": 0.7403573217565537, + "grad_norm": 0.4308948496107637, + "learning_rate": 9.358188465863247e-06, + "loss": 0.2086, + "step": 2217 + }, + { + "epoch": 0.7406912673234263, + "grad_norm": 0.4175975145931895, + "learning_rate": 9.357235610473833e-06, + "loss": 0.2091, + "step": 2218 + }, + { + "epoch": 0.7410252128902989, + "grad_norm": 0.4655604424669319, + "learning_rate": 9.356282096872673e-06, + "loss": 0.23, + "step": 2219 + }, + { + "epoch": 0.7413591584571715, + "grad_norm": 0.4740711739925384, + "learning_rate": 9.355327925203811e-06, + "loss": 0.2176, + "step": 2220 + }, + { + "epoch": 0.7416931040240441, + "grad_norm": 0.4748639281311392, + "learning_rate": 9.354373095611383e-06, + "loss": 0.2359, + "step": 2221 + }, + { + "epoch": 0.7420270495909167, + "grad_norm": 0.4368036977894943, + "learning_rate": 9.353417608239627e-06, + "loss": 0.2171, + "step": 2222 + }, + { + "epoch": 0.7423609951577893, + "grad_norm": 0.4666842076480527, + "learning_rate": 9.352461463232882e-06, + "loss": 0.2324, + "step": 2223 + }, + { + "epoch": 0.7426949407246619, + "grad_norm": 0.48937622343211834, + "learning_rate": 9.351504660735583e-06, + "loss": 0.2247, + "step": 2224 + }, + { + "epoch": 0.7430288862915345, + "grad_norm": 0.4711175442547071, + "learning_rate": 9.350547200892271e-06, + "loss": 0.2225, + "step": 2225 + }, + { + "epoch": 0.7433628318584071, + "grad_norm": 0.47189505898784784, + "learning_rate": 9.349589083847577e-06, + "loss": 0.2336, + "step": 2226 + }, + { + "epoch": 0.7436967774252797, + "grad_norm": 0.49998037587574656, + "learning_rate": 9.348630309746236e-06, + "loss": 0.242, + "step": 2227 + }, + { + "epoch": 0.7440307229921522, + "grad_norm": 0.4389783954854127, + "learning_rate": 9.347670878733084e-06, + "loss": 0.2179, + "step": 2228 + }, + { + "epoch": 0.7443646685590248, + "grad_norm": 0.46169101406158664, + "learning_rate": 9.346710790953053e-06, + "loss": 0.2259, + "step": 2229 + }, + { + "epoch": 0.7446986141258974, + "grad_norm": 0.43050129747366644, + "learning_rate": 9.345750046551177e-06, + "loss": 0.211, + "step": 2230 + }, + { + "epoch": 0.74503255969277, + "grad_norm": 0.4296786556030227, + "learning_rate": 9.344788645672585e-06, + "loss": 0.2237, + "step": 2231 + }, + { + "epoch": 0.7453665052596427, + "grad_norm": 0.5686148097924658, + "learning_rate": 9.343826588462513e-06, + "loss": 0.2344, + "step": 2232 + }, + { + "epoch": 0.7457004508265153, + "grad_norm": 0.4233257630785193, + "learning_rate": 9.342863875066284e-06, + "loss": 0.2123, + "step": 2233 + }, + { + "epoch": 0.7460343963933879, + "grad_norm": 0.437653238005159, + "learning_rate": 9.341900505629333e-06, + "loss": 0.2154, + "step": 2234 + }, + { + "epoch": 0.7463683419602605, + "grad_norm": 0.45698025308598117, + "learning_rate": 9.340936480297187e-06, + "loss": 0.2295, + "step": 2235 + }, + { + "epoch": 0.7467022875271331, + "grad_norm": 0.4688987813684971, + "learning_rate": 9.339971799215472e-06, + "loss": 0.226, + "step": 2236 + }, + { + "epoch": 0.7470362330940057, + "grad_norm": 0.4760421598172396, + "learning_rate": 9.339006462529916e-06, + "loss": 0.2334, + "step": 2237 + }, + { + "epoch": 0.7473701786608783, + "grad_norm": 0.4192485301306118, + "learning_rate": 9.338040470386344e-06, + "loss": 0.2122, + "step": 2238 + }, + { + "epoch": 0.7477041242277509, + "grad_norm": 0.45176630982507726, + "learning_rate": 9.337073822930681e-06, + "loss": 0.2387, + "step": 2239 + }, + { + "epoch": 0.7480380697946235, + "grad_norm": 0.4486924188323239, + "learning_rate": 9.336106520308948e-06, + "loss": 0.1943, + "step": 2240 + }, + { + "epoch": 0.7483720153614961, + "grad_norm": 0.4646813003017934, + "learning_rate": 9.335138562667267e-06, + "loss": 0.2308, + "step": 2241 + }, + { + "epoch": 0.7487059609283687, + "grad_norm": 0.46477838126704796, + "learning_rate": 9.334169950151866e-06, + "loss": 0.2234, + "step": 2242 + }, + { + "epoch": 0.7490399064952413, + "grad_norm": 0.5008667979340363, + "learning_rate": 9.333200682909059e-06, + "loss": 0.2236, + "step": 2243 + }, + { + "epoch": 0.7493738520621138, + "grad_norm": 0.48975993721972527, + "learning_rate": 9.332230761085265e-06, + "loss": 0.2269, + "step": 2244 + }, + { + "epoch": 0.7497077976289864, + "grad_norm": 0.43081722939141126, + "learning_rate": 9.331260184827006e-06, + "loss": 0.2189, + "step": 2245 + }, + { + "epoch": 0.750041743195859, + "grad_norm": 0.4627663439303731, + "learning_rate": 9.330288954280898e-06, + "loss": 0.2105, + "step": 2246 + }, + { + "epoch": 0.7503756887627316, + "grad_norm": 0.5521700146824564, + "learning_rate": 9.329317069593654e-06, + "loss": 0.2273, + "step": 2247 + }, + { + "epoch": 0.7507096343296042, + "grad_norm": 0.518812087453654, + "learning_rate": 9.328344530912093e-06, + "loss": 0.2518, + "step": 2248 + }, + { + "epoch": 0.7510435798964769, + "grad_norm": 0.5221575866002434, + "learning_rate": 9.327371338383124e-06, + "loss": 0.2458, + "step": 2249 + }, + { + "epoch": 0.7513775254633495, + "grad_norm": 0.47799468782489984, + "learning_rate": 9.326397492153762e-06, + "loss": 0.2268, + "step": 2250 + }, + { + "epoch": 0.7517114710302221, + "grad_norm": 0.8073887593343466, + "learning_rate": 9.325422992371117e-06, + "loss": 0.2399, + "step": 2251 + }, + { + "epoch": 0.7520454165970947, + "grad_norm": 0.5105733251703505, + "learning_rate": 9.324447839182397e-06, + "loss": 0.2248, + "step": 2252 + }, + { + "epoch": 0.7523793621639673, + "grad_norm": 0.5480491125068885, + "learning_rate": 9.323472032734915e-06, + "loss": 0.2464, + "step": 2253 + }, + { + "epoch": 0.7527133077308399, + "grad_norm": 0.48116501479820156, + "learning_rate": 9.322495573176073e-06, + "loss": 0.226, + "step": 2254 + }, + { + "epoch": 0.7530472532977125, + "grad_norm": 0.4210936718940355, + "learning_rate": 9.321518460653381e-06, + "loss": 0.2125, + "step": 2255 + }, + { + "epoch": 0.7533811988645851, + "grad_norm": 0.48326083685217036, + "learning_rate": 9.32054069531444e-06, + "loss": 0.2262, + "step": 2256 + }, + { + "epoch": 0.7537151444314577, + "grad_norm": 0.9265740345041341, + "learning_rate": 9.319562277306955e-06, + "loss": 0.2241, + "step": 2257 + }, + { + "epoch": 0.7540490899983303, + "grad_norm": 0.45048145099511155, + "learning_rate": 9.318583206778726e-06, + "loss": 0.216, + "step": 2258 + }, + { + "epoch": 0.7543830355652029, + "grad_norm": 0.578061264472536, + "learning_rate": 9.317603483877654e-06, + "loss": 0.2202, + "step": 2259 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 0.4307754856617888, + "learning_rate": 9.316623108751739e-06, + "loss": 0.2132, + "step": 2260 + }, + { + "epoch": 0.755050926698948, + "grad_norm": 0.486997614653093, + "learning_rate": 9.315642081549074e-06, + "loss": 0.2318, + "step": 2261 + }, + { + "epoch": 0.7553848722658206, + "grad_norm": 0.5251137171531048, + "learning_rate": 9.31466040241786e-06, + "loss": 0.2263, + "step": 2262 + }, + { + "epoch": 0.7557188178326932, + "grad_norm": 0.4664038869726607, + "learning_rate": 9.313678071506388e-06, + "loss": 0.2252, + "step": 2263 + }, + { + "epoch": 0.7560527633995658, + "grad_norm": 0.4349295324939173, + "learning_rate": 9.31269508896305e-06, + "loss": 0.2191, + "step": 2264 + }, + { + "epoch": 0.7563867089664384, + "grad_norm": 0.5160541914139507, + "learning_rate": 9.31171145493634e-06, + "loss": 0.2304, + "step": 2265 + }, + { + "epoch": 0.756720654533311, + "grad_norm": 0.48044683366908697, + "learning_rate": 9.310727169574847e-06, + "loss": 0.2313, + "step": 2266 + }, + { + "epoch": 0.7570546001001837, + "grad_norm": 0.44340511774161134, + "learning_rate": 9.309742233027258e-06, + "loss": 0.2178, + "step": 2267 + }, + { + "epoch": 0.7573885456670563, + "grad_norm": 0.4401405973485924, + "learning_rate": 9.308756645442356e-06, + "loss": 0.225, + "step": 2268 + }, + { + "epoch": 0.7577224912339289, + "grad_norm": 0.47966482086980605, + "learning_rate": 9.307770406969032e-06, + "loss": 0.23, + "step": 2269 + }, + { + "epoch": 0.7580564368008015, + "grad_norm": 0.43564783263314855, + "learning_rate": 9.306783517756264e-06, + "loss": 0.2131, + "step": 2270 + }, + { + "epoch": 0.7583903823676741, + "grad_norm": 0.45364809207582024, + "learning_rate": 9.305795977953134e-06, + "loss": 0.2276, + "step": 2271 + }, + { + "epoch": 0.7587243279345467, + "grad_norm": 0.5198386100281404, + "learning_rate": 9.304807787708825e-06, + "loss": 0.2259, + "step": 2272 + }, + { + "epoch": 0.7590582735014193, + "grad_norm": 0.46342370795905347, + "learning_rate": 9.303818947172611e-06, + "loss": 0.2179, + "step": 2273 + }, + { + "epoch": 0.7593922190682919, + "grad_norm": 0.4731835740602336, + "learning_rate": 9.302829456493868e-06, + "loss": 0.2333, + "step": 2274 + }, + { + "epoch": 0.7597261646351645, + "grad_norm": 0.4216435732249339, + "learning_rate": 9.301839315822072e-06, + "loss": 0.2196, + "step": 2275 + }, + { + "epoch": 0.7600601102020371, + "grad_norm": 0.4499998322284951, + "learning_rate": 9.300848525306797e-06, + "loss": 0.2208, + "step": 2276 + }, + { + "epoch": 0.7603940557689096, + "grad_norm": 0.4504516858457847, + "learning_rate": 9.299857085097708e-06, + "loss": 0.2258, + "step": 2277 + }, + { + "epoch": 0.7607280013357822, + "grad_norm": 0.5090935883272614, + "learning_rate": 9.298864995344579e-06, + "loss": 0.2151, + "step": 2278 + }, + { + "epoch": 0.7610619469026548, + "grad_norm": 0.485130607846563, + "learning_rate": 9.297872256197276e-06, + "loss": 0.231, + "step": 2279 + }, + { + "epoch": 0.7613958924695274, + "grad_norm": 0.3951173987462743, + "learning_rate": 9.296878867805762e-06, + "loss": 0.2119, + "step": 2280 + }, + { + "epoch": 0.7617298380364, + "grad_norm": 0.42718739210129064, + "learning_rate": 9.2958848303201e-06, + "loss": 0.2259, + "step": 2281 + }, + { + "epoch": 0.7620637836032726, + "grad_norm": 0.40877281318732694, + "learning_rate": 9.294890143890451e-06, + "loss": 0.217, + "step": 2282 + }, + { + "epoch": 0.7623977291701453, + "grad_norm": 0.44623672624981436, + "learning_rate": 9.293894808667077e-06, + "loss": 0.2216, + "step": 2283 + }, + { + "epoch": 0.7627316747370179, + "grad_norm": 0.5236867845873939, + "learning_rate": 9.292898824800333e-06, + "loss": 0.2561, + "step": 2284 + }, + { + "epoch": 0.7630656203038905, + "grad_norm": 0.46275656758815503, + "learning_rate": 9.291902192440673e-06, + "loss": 0.2191, + "step": 2285 + }, + { + "epoch": 0.7633995658707631, + "grad_norm": 0.44986183401594326, + "learning_rate": 9.290904911738653e-06, + "loss": 0.2196, + "step": 2286 + }, + { + "epoch": 0.7637335114376357, + "grad_norm": 0.47312386838896714, + "learning_rate": 9.289906982844923e-06, + "loss": 0.2311, + "step": 2287 + }, + { + "epoch": 0.7640674570045083, + "grad_norm": 0.4955825415082542, + "learning_rate": 9.288908405910228e-06, + "loss": 0.2187, + "step": 2288 + }, + { + "epoch": 0.7644014025713809, + "grad_norm": 0.4529969289979694, + "learning_rate": 9.287909181085421e-06, + "loss": 0.2211, + "step": 2289 + }, + { + "epoch": 0.7647353481382535, + "grad_norm": 0.4576506572434161, + "learning_rate": 9.286909308521443e-06, + "loss": 0.2095, + "step": 2290 + }, + { + "epoch": 0.7650692937051261, + "grad_norm": 0.4508998678478078, + "learning_rate": 9.285908788369336e-06, + "loss": 0.2287, + "step": 2291 + }, + { + "epoch": 0.7654032392719987, + "grad_norm": 0.45715461051048567, + "learning_rate": 9.284907620780244e-06, + "loss": 0.2042, + "step": 2292 + }, + { + "epoch": 0.7657371848388712, + "grad_norm": 0.4180093603700725, + "learning_rate": 9.2839058059054e-06, + "loss": 0.2125, + "step": 2293 + }, + { + "epoch": 0.7660711304057438, + "grad_norm": 0.43093767255958393, + "learning_rate": 9.282903343896144e-06, + "loss": 0.2238, + "step": 2294 + }, + { + "epoch": 0.7664050759726164, + "grad_norm": 0.5483814770622409, + "learning_rate": 9.281900234903908e-06, + "loss": 0.2316, + "step": 2295 + }, + { + "epoch": 0.766739021539489, + "grad_norm": 0.43771157015292517, + "learning_rate": 9.280896479080224e-06, + "loss": 0.2138, + "step": 2296 + }, + { + "epoch": 0.7670729671063616, + "grad_norm": 0.4574659911330641, + "learning_rate": 9.27989207657672e-06, + "loss": 0.226, + "step": 2297 + }, + { + "epoch": 0.7674069126732342, + "grad_norm": 0.42221388094567286, + "learning_rate": 9.278887027545125e-06, + "loss": 0.2209, + "step": 2298 + }, + { + "epoch": 0.7677408582401068, + "grad_norm": 0.5150950388506087, + "learning_rate": 9.277881332137261e-06, + "loss": 0.2349, + "step": 2299 + }, + { + "epoch": 0.7680748038069795, + "grad_norm": 0.48245297149523503, + "learning_rate": 9.276874990505053e-06, + "loss": 0.23, + "step": 2300 + }, + { + "epoch": 0.7684087493738521, + "grad_norm": 0.548683322704859, + "learning_rate": 9.27586800280052e-06, + "loss": 0.2135, + "step": 2301 + }, + { + "epoch": 0.7687426949407247, + "grad_norm": 0.4994150747196339, + "learning_rate": 9.274860369175775e-06, + "loss": 0.2303, + "step": 2302 + }, + { + "epoch": 0.7690766405075973, + "grad_norm": 0.5378040458428275, + "learning_rate": 9.27385208978304e-06, + "loss": 0.2461, + "step": 2303 + }, + { + "epoch": 0.7694105860744699, + "grad_norm": 0.45033554985182755, + "learning_rate": 9.272843164774622e-06, + "loss": 0.2131, + "step": 2304 + }, + { + "epoch": 0.7697445316413425, + "grad_norm": 0.46757095199070065, + "learning_rate": 9.27183359430293e-06, + "loss": 0.2232, + "step": 2305 + }, + { + "epoch": 0.7700784772082151, + "grad_norm": 0.39788086669142664, + "learning_rate": 9.270823378520478e-06, + "loss": 0.1993, + "step": 2306 + }, + { + "epoch": 0.7704124227750877, + "grad_norm": 0.5432380637027175, + "learning_rate": 9.269812517579867e-06, + "loss": 0.2409, + "step": 2307 + }, + { + "epoch": 0.7707463683419603, + "grad_norm": 0.5044905567763108, + "learning_rate": 9.268801011633799e-06, + "loss": 0.2263, + "step": 2308 + }, + { + "epoch": 0.7710803139088329, + "grad_norm": 0.4430694585924386, + "learning_rate": 9.267788860835076e-06, + "loss": 0.214, + "step": 2309 + }, + { + "epoch": 0.7714142594757054, + "grad_norm": 0.5203303582541469, + "learning_rate": 9.266776065336593e-06, + "loss": 0.246, + "step": 2310 + }, + { + "epoch": 0.771748205042578, + "grad_norm": 0.42657503652675616, + "learning_rate": 9.265762625291346e-06, + "loss": 0.2232, + "step": 2311 + }, + { + "epoch": 0.7720821506094506, + "grad_norm": 0.4594336261365237, + "learning_rate": 9.264748540852427e-06, + "loss": 0.2359, + "step": 2312 + }, + { + "epoch": 0.7724160961763232, + "grad_norm": 0.46882401712288807, + "learning_rate": 9.263733812173023e-06, + "loss": 0.2083, + "step": 2313 + }, + { + "epoch": 0.7727500417431958, + "grad_norm": 0.5028204129392393, + "learning_rate": 9.262718439406425e-06, + "loss": 0.2256, + "step": 2314 + }, + { + "epoch": 0.7730839873100684, + "grad_norm": 0.49213998524386604, + "learning_rate": 9.261702422706014e-06, + "loss": 0.2436, + "step": 2315 + }, + { + "epoch": 0.773417932876941, + "grad_norm": 0.43231620796153136, + "learning_rate": 9.260685762225273e-06, + "loss": 0.233, + "step": 2316 + }, + { + "epoch": 0.7737518784438137, + "grad_norm": 0.4434494887987508, + "learning_rate": 9.25966845811778e-06, + "loss": 0.2301, + "step": 2317 + }, + { + "epoch": 0.7740858240106863, + "grad_norm": 0.4370287988017438, + "learning_rate": 9.258650510537208e-06, + "loss": 0.1969, + "step": 2318 + }, + { + "epoch": 0.7744197695775589, + "grad_norm": 0.47263084992349913, + "learning_rate": 9.257631919637333e-06, + "loss": 0.2157, + "step": 2319 + }, + { + "epoch": 0.7747537151444315, + "grad_norm": 0.48339688653267926, + "learning_rate": 9.256612685572027e-06, + "loss": 0.2201, + "step": 2320 + }, + { + "epoch": 0.7750876607113041, + "grad_norm": 0.5110737778673129, + "learning_rate": 9.255592808495254e-06, + "loss": 0.2405, + "step": 2321 + }, + { + "epoch": 0.7754216062781767, + "grad_norm": 0.45441898821062743, + "learning_rate": 9.254572288561077e-06, + "loss": 0.2124, + "step": 2322 + }, + { + "epoch": 0.7757555518450493, + "grad_norm": 0.4588311491582138, + "learning_rate": 9.253551125923662e-06, + "loss": 0.2201, + "step": 2323 + }, + { + "epoch": 0.7760894974119219, + "grad_norm": 0.5177735740891841, + "learning_rate": 9.252529320737265e-06, + "loss": 0.2429, + "step": 2324 + }, + { + "epoch": 0.7764234429787945, + "grad_norm": 0.5022245852427304, + "learning_rate": 9.251506873156242e-06, + "loss": 0.2188, + "step": 2325 + }, + { + "epoch": 0.776757388545667, + "grad_norm": 0.4537744070437574, + "learning_rate": 9.250483783335046e-06, + "loss": 0.2106, + "step": 2326 + }, + { + "epoch": 0.7770913341125396, + "grad_norm": 0.6192354840406287, + "learning_rate": 9.249460051428226e-06, + "loss": 0.222, + "step": 2327 + }, + { + "epoch": 0.7774252796794122, + "grad_norm": 0.44129467986532117, + "learning_rate": 9.24843567759043e-06, + "loss": 0.206, + "step": 2328 + }, + { + "epoch": 0.7777592252462848, + "grad_norm": 0.4770974165507659, + "learning_rate": 9.247410661976402e-06, + "loss": 0.2309, + "step": 2329 + }, + { + "epoch": 0.7780931708131574, + "grad_norm": 0.48716858018195214, + "learning_rate": 9.246385004740981e-06, + "loss": 0.2411, + "step": 2330 + }, + { + "epoch": 0.77842711638003, + "grad_norm": 0.5243113960671338, + "learning_rate": 9.245358706039105e-06, + "loss": 0.234, + "step": 2331 + }, + { + "epoch": 0.7787610619469026, + "grad_norm": 0.41801045846912316, + "learning_rate": 9.244331766025812e-06, + "loss": 0.2239, + "step": 2332 + }, + { + "epoch": 0.7790950075137753, + "grad_norm": 0.46394404487526614, + "learning_rate": 9.243304184856226e-06, + "loss": 0.2148, + "step": 2333 + }, + { + "epoch": 0.7794289530806479, + "grad_norm": 0.4277415195601737, + "learning_rate": 9.242275962685584e-06, + "loss": 0.212, + "step": 2334 + }, + { + "epoch": 0.7797628986475205, + "grad_norm": 0.5185901645423077, + "learning_rate": 9.241247099669202e-06, + "loss": 0.245, + "step": 2335 + }, + { + "epoch": 0.7800968442143931, + "grad_norm": 0.44824621316470326, + "learning_rate": 9.24021759596251e-06, + "loss": 0.2255, + "step": 2336 + }, + { + "epoch": 0.7804307897812657, + "grad_norm": 0.4812361814852991, + "learning_rate": 9.239187451721021e-06, + "loss": 0.2352, + "step": 2337 + }, + { + "epoch": 0.7807647353481383, + "grad_norm": 0.4830179394244599, + "learning_rate": 9.238156667100354e-06, + "loss": 0.2291, + "step": 2338 + }, + { + "epoch": 0.7810986809150109, + "grad_norm": 0.44869641118909687, + "learning_rate": 9.237125242256219e-06, + "loss": 0.2121, + "step": 2339 + }, + { + "epoch": 0.7814326264818835, + "grad_norm": 0.4183573673022089, + "learning_rate": 9.236093177344427e-06, + "loss": 0.2165, + "step": 2340 + }, + { + "epoch": 0.7817665720487561, + "grad_norm": 0.44158451252351555, + "learning_rate": 9.23506047252088e-06, + "loss": 0.2193, + "step": 2341 + }, + { + "epoch": 0.7821005176156286, + "grad_norm": 0.4988366633490991, + "learning_rate": 9.234027127941585e-06, + "loss": 0.23, + "step": 2342 + }, + { + "epoch": 0.7824344631825012, + "grad_norm": 0.47542697794018896, + "learning_rate": 9.232993143762637e-06, + "loss": 0.2205, + "step": 2343 + }, + { + "epoch": 0.7827684087493738, + "grad_norm": 0.4307912169479183, + "learning_rate": 9.231958520140232e-06, + "loss": 0.2193, + "step": 2344 + }, + { + "epoch": 0.7831023543162464, + "grad_norm": 0.4686420725641428, + "learning_rate": 9.230923257230663e-06, + "loss": 0.2207, + "step": 2345 + }, + { + "epoch": 0.783436299883119, + "grad_norm": 0.7093627249782076, + "learning_rate": 9.22988735519032e-06, + "loss": 0.211, + "step": 2346 + }, + { + "epoch": 0.7837702454499916, + "grad_norm": 0.4449712797327876, + "learning_rate": 9.228850814175684e-06, + "loss": 0.2203, + "step": 2347 + }, + { + "epoch": 0.7841041910168642, + "grad_norm": 0.4224110582979993, + "learning_rate": 9.22781363434334e-06, + "loss": 0.2164, + "step": 2348 + }, + { + "epoch": 0.7844381365837368, + "grad_norm": 0.4453644666775102, + "learning_rate": 9.226775815849969e-06, + "loss": 0.2184, + "step": 2349 + }, + { + "epoch": 0.7847720821506095, + "grad_norm": 0.47882639139667954, + "learning_rate": 9.225737358852339e-06, + "loss": 0.2259, + "step": 2350 + }, + { + "epoch": 0.7851060277174821, + "grad_norm": 0.4614230944404661, + "learning_rate": 9.224698263507326e-06, + "loss": 0.2437, + "step": 2351 + }, + { + "epoch": 0.7854399732843547, + "grad_norm": 0.46240361961793186, + "learning_rate": 9.223658529971896e-06, + "loss": 0.2159, + "step": 2352 + }, + { + "epoch": 0.7857739188512273, + "grad_norm": 0.4326362693521754, + "learning_rate": 9.222618158403111e-06, + "loss": 0.2111, + "step": 2353 + }, + { + "epoch": 0.7861078644180999, + "grad_norm": 0.4294245191748408, + "learning_rate": 9.221577148958137e-06, + "loss": 0.2179, + "step": 2354 + }, + { + "epoch": 0.7864418099849725, + "grad_norm": 0.47430465166835356, + "learning_rate": 9.220535501794224e-06, + "loss": 0.2148, + "step": 2355 + }, + { + "epoch": 0.7867757555518451, + "grad_norm": 0.40903100342304954, + "learning_rate": 9.21949321706873e-06, + "loss": 0.2111, + "step": 2356 + }, + { + "epoch": 0.7871097011187177, + "grad_norm": 0.4871994961911349, + "learning_rate": 9.218450294939103e-06, + "loss": 0.2329, + "step": 2357 + }, + { + "epoch": 0.7874436466855903, + "grad_norm": 0.4802859353134735, + "learning_rate": 9.217406735562887e-06, + "loss": 0.2148, + "step": 2358 + }, + { + "epoch": 0.7877775922524628, + "grad_norm": 0.4829538927703034, + "learning_rate": 9.216362539097726e-06, + "loss": 0.2109, + "step": 2359 + }, + { + "epoch": 0.7881115378193354, + "grad_norm": 0.46957086759196354, + "learning_rate": 9.215317705701356e-06, + "loss": 0.2158, + "step": 2360 + }, + { + "epoch": 0.788445483386208, + "grad_norm": 0.4415507689008121, + "learning_rate": 9.214272235531615e-06, + "loss": 0.22, + "step": 2361 + }, + { + "epoch": 0.7887794289530806, + "grad_norm": 0.4784953125828448, + "learning_rate": 9.213226128746431e-06, + "loss": 0.2112, + "step": 2362 + }, + { + "epoch": 0.7891133745199532, + "grad_norm": 0.4748308372416173, + "learning_rate": 9.21217938550383e-06, + "loss": 0.2267, + "step": 2363 + }, + { + "epoch": 0.7894473200868258, + "grad_norm": 0.42786072729815783, + "learning_rate": 9.211132005961936e-06, + "loss": 0.2182, + "step": 2364 + }, + { + "epoch": 0.7897812656536984, + "grad_norm": 0.46379390215134986, + "learning_rate": 9.210083990278968e-06, + "loss": 0.2159, + "step": 2365 + }, + { + "epoch": 0.790115211220571, + "grad_norm": 0.434672422878081, + "learning_rate": 9.209035338613242e-06, + "loss": 0.2176, + "step": 2366 + }, + { + "epoch": 0.7904491567874437, + "grad_norm": 0.4712397323015266, + "learning_rate": 9.207986051123167e-06, + "loss": 0.2169, + "step": 2367 + }, + { + "epoch": 0.7907831023543163, + "grad_norm": 0.4643471410872069, + "learning_rate": 9.206936127967254e-06, + "loss": 0.2231, + "step": 2368 + }, + { + "epoch": 0.7911170479211889, + "grad_norm": 0.4832625761947329, + "learning_rate": 9.205885569304103e-06, + "loss": 0.2346, + "step": 2369 + }, + { + "epoch": 0.7914509934880615, + "grad_norm": 0.5120653341402449, + "learning_rate": 9.204834375292413e-06, + "loss": 0.2248, + "step": 2370 + }, + { + "epoch": 0.7917849390549341, + "grad_norm": 0.4595895504768996, + "learning_rate": 9.20378254609098e-06, + "loss": 0.212, + "step": 2371 + }, + { + "epoch": 0.7921188846218067, + "grad_norm": 0.4464850753967196, + "learning_rate": 9.202730081858697e-06, + "loss": 0.2216, + "step": 2372 + }, + { + "epoch": 0.7924528301886793, + "grad_norm": 0.4465507871460174, + "learning_rate": 9.201676982754549e-06, + "loss": 0.2175, + "step": 2373 + }, + { + "epoch": 0.7927867757555519, + "grad_norm": 0.7186089123657715, + "learning_rate": 9.200623248937619e-06, + "loss": 0.2217, + "step": 2374 + }, + { + "epoch": 0.7931207213224244, + "grad_norm": 0.48065369107795447, + "learning_rate": 9.199568880567085e-06, + "loss": 0.2346, + "step": 2375 + }, + { + "epoch": 0.793454666889297, + "grad_norm": 0.46022889936302425, + "learning_rate": 9.198513877802226e-06, + "loss": 0.2185, + "step": 2376 + }, + { + "epoch": 0.7937886124561696, + "grad_norm": 0.48578689235218536, + "learning_rate": 9.19745824080241e-06, + "loss": 0.2325, + "step": 2377 + }, + { + "epoch": 0.7941225580230422, + "grad_norm": 0.4201772420752296, + "learning_rate": 9.196401969727101e-06, + "loss": 0.2219, + "step": 2378 + }, + { + "epoch": 0.7944565035899148, + "grad_norm": 0.5148645151535987, + "learning_rate": 9.195345064735865e-06, + "loss": 0.2187, + "step": 2379 + }, + { + "epoch": 0.7947904491567874, + "grad_norm": 0.4793318370535307, + "learning_rate": 9.194287525988358e-06, + "loss": 0.2206, + "step": 2380 + }, + { + "epoch": 0.79512439472366, + "grad_norm": 0.44812143862309217, + "learning_rate": 9.193229353644336e-06, + "loss": 0.2066, + "step": 2381 + }, + { + "epoch": 0.7954583402905326, + "grad_norm": 0.48309758118279134, + "learning_rate": 9.192170547863644e-06, + "loss": 0.2272, + "step": 2382 + }, + { + "epoch": 0.7957922858574052, + "grad_norm": 0.4955575076751468, + "learning_rate": 9.191111108806228e-06, + "loss": 0.2152, + "step": 2383 + }, + { + "epoch": 0.7961262314242779, + "grad_norm": 0.5483835825302927, + "learning_rate": 9.190051036632133e-06, + "loss": 0.2155, + "step": 2384 + }, + { + "epoch": 0.7964601769911505, + "grad_norm": 0.43256677860584014, + "learning_rate": 9.188990331501493e-06, + "loss": 0.2172, + "step": 2385 + }, + { + "epoch": 0.7967941225580231, + "grad_norm": 0.47777695666620773, + "learning_rate": 9.187928993574537e-06, + "loss": 0.218, + "step": 2386 + }, + { + "epoch": 0.7971280681248957, + "grad_norm": 0.4857124920043452, + "learning_rate": 9.186867023011598e-06, + "loss": 0.2269, + "step": 2387 + }, + { + "epoch": 0.7974620136917683, + "grad_norm": 0.46097558597069427, + "learning_rate": 9.185804419973096e-06, + "loss": 0.2262, + "step": 2388 + }, + { + "epoch": 0.7977959592586409, + "grad_norm": 0.45451151055369127, + "learning_rate": 9.18474118461955e-06, + "loss": 0.2283, + "step": 2389 + }, + { + "epoch": 0.7981299048255135, + "grad_norm": 0.45002263688863814, + "learning_rate": 9.183677317111572e-06, + "loss": 0.2264, + "step": 2390 + }, + { + "epoch": 0.798463850392386, + "grad_norm": 0.4351929657301828, + "learning_rate": 9.182612817609877e-06, + "loss": 0.2162, + "step": 2391 + }, + { + "epoch": 0.7987977959592586, + "grad_norm": 0.4829125840864148, + "learning_rate": 9.181547686275266e-06, + "loss": 0.2209, + "step": 2392 + }, + { + "epoch": 0.7991317415261312, + "grad_norm": 0.4617414472574589, + "learning_rate": 9.180481923268641e-06, + "loss": 0.2317, + "step": 2393 + }, + { + "epoch": 0.7994656870930038, + "grad_norm": 0.4715677223409943, + "learning_rate": 9.179415528750998e-06, + "loss": 0.2324, + "step": 2394 + }, + { + "epoch": 0.7997996326598764, + "grad_norm": 0.445753677713308, + "learning_rate": 9.178348502883428e-06, + "loss": 0.2086, + "step": 2395 + }, + { + "epoch": 0.800133578226749, + "grad_norm": 0.4739163161049927, + "learning_rate": 9.17728084582712e-06, + "loss": 0.2299, + "step": 2396 + }, + { + "epoch": 0.8004675237936216, + "grad_norm": 0.4444338702698356, + "learning_rate": 9.176212557743352e-06, + "loss": 0.2179, + "step": 2397 + }, + { + "epoch": 0.8008014693604942, + "grad_norm": 0.5521009272187888, + "learning_rate": 9.175143638793504e-06, + "loss": 0.2332, + "step": 2398 + }, + { + "epoch": 0.8011354149273668, + "grad_norm": 0.4529565320102755, + "learning_rate": 9.174074089139048e-06, + "loss": 0.2188, + "step": 2399 + }, + { + "epoch": 0.8014693604942394, + "grad_norm": 0.4655241748444513, + "learning_rate": 9.173003908941555e-06, + "loss": 0.231, + "step": 2400 + }, + { + "epoch": 0.801803306061112, + "grad_norm": 0.46546430337015227, + "learning_rate": 9.171933098362685e-06, + "loss": 0.2245, + "step": 2401 + }, + { + "epoch": 0.8021372516279847, + "grad_norm": 0.4291232236423889, + "learning_rate": 9.170861657564197e-06, + "loss": 0.1983, + "step": 2402 + }, + { + "epoch": 0.8024711971948573, + "grad_norm": 0.47959703405449, + "learning_rate": 9.169789586707947e-06, + "loss": 0.2225, + "step": 2403 + }, + { + "epoch": 0.8028051427617299, + "grad_norm": 0.4705338430027695, + "learning_rate": 9.16871688595588e-06, + "loss": 0.2174, + "step": 2404 + }, + { + "epoch": 0.8031390883286025, + "grad_norm": 0.4542925186202159, + "learning_rate": 9.167643555470044e-06, + "loss": 0.221, + "step": 2405 + }, + { + "epoch": 0.8034730338954751, + "grad_norm": 0.5053346494758011, + "learning_rate": 9.166569595412576e-06, + "loss": 0.2214, + "step": 2406 + }, + { + "epoch": 0.8038069794623477, + "grad_norm": 0.518802140242786, + "learning_rate": 9.16549500594571e-06, + "loss": 0.203, + "step": 2407 + }, + { + "epoch": 0.8041409250292202, + "grad_norm": 0.4830630204730151, + "learning_rate": 9.164419787231778e-06, + "loss": 0.2334, + "step": 2408 + }, + { + "epoch": 0.8044748705960928, + "grad_norm": 0.5058947808268289, + "learning_rate": 9.163343939433202e-06, + "loss": 0.2303, + "step": 2409 + }, + { + "epoch": 0.8048088161629654, + "grad_norm": 0.5481350305947928, + "learning_rate": 9.162267462712502e-06, + "loss": 0.227, + "step": 2410 + }, + { + "epoch": 0.805142761729838, + "grad_norm": 0.40346017951021734, + "learning_rate": 9.161190357232292e-06, + "loss": 0.2078, + "step": 2411 + }, + { + "epoch": 0.8054767072967106, + "grad_norm": 0.4012932942269892, + "learning_rate": 9.160112623155282e-06, + "loss": 0.2099, + "step": 2412 + }, + { + "epoch": 0.8058106528635832, + "grad_norm": 0.41692385024259015, + "learning_rate": 9.159034260644277e-06, + "loss": 0.2084, + "step": 2413 + }, + { + "epoch": 0.8061445984304558, + "grad_norm": 0.5463744356480256, + "learning_rate": 9.157955269862176e-06, + "loss": 0.2286, + "step": 2414 + }, + { + "epoch": 0.8064785439973284, + "grad_norm": 0.49138767972709996, + "learning_rate": 9.156875650971974e-06, + "loss": 0.2223, + "step": 2415 + }, + { + "epoch": 0.806812489564201, + "grad_norm": 0.46139748034889444, + "learning_rate": 9.155795404136757e-06, + "loss": 0.2143, + "step": 2416 + }, + { + "epoch": 0.8071464351310736, + "grad_norm": 0.41387074512700345, + "learning_rate": 9.154714529519715e-06, + "loss": 0.2094, + "step": 2417 + }, + { + "epoch": 0.8074803806979463, + "grad_norm": 0.4711974425999777, + "learning_rate": 9.15363302728412e-06, + "loss": 0.2252, + "step": 2418 + }, + { + "epoch": 0.8078143262648189, + "grad_norm": 0.43803798636984687, + "learning_rate": 9.15255089759335e-06, + "loss": 0.2097, + "step": 2419 + }, + { + "epoch": 0.8081482718316915, + "grad_norm": 0.4315283690951102, + "learning_rate": 9.151468140610872e-06, + "loss": 0.2029, + "step": 2420 + }, + { + "epoch": 0.8084822173985641, + "grad_norm": 0.43314565060866766, + "learning_rate": 9.150384756500249e-06, + "loss": 0.2129, + "step": 2421 + }, + { + "epoch": 0.8088161629654367, + "grad_norm": 0.4992267726729122, + "learning_rate": 9.14930074542514e-06, + "loss": 0.2276, + "step": 2422 + }, + { + "epoch": 0.8091501085323093, + "grad_norm": 0.8734400531790079, + "learning_rate": 9.148216107549297e-06, + "loss": 0.2276, + "step": 2423 + }, + { + "epoch": 0.8094840540991818, + "grad_norm": 0.4618645547769956, + "learning_rate": 9.147130843036567e-06, + "loss": 0.2216, + "step": 2424 + }, + { + "epoch": 0.8098179996660544, + "grad_norm": 0.503229948469187, + "learning_rate": 9.146044952050891e-06, + "loss": 0.2266, + "step": 2425 + }, + { + "epoch": 0.810151945232927, + "grad_norm": 0.48234269217820075, + "learning_rate": 9.144958434756308e-06, + "loss": 0.246, + "step": 2426 + }, + { + "epoch": 0.8104858907997996, + "grad_norm": 0.45799158973946075, + "learning_rate": 9.14387129131695e-06, + "loss": 0.2241, + "step": 2427 + }, + { + "epoch": 0.8108198363666722, + "grad_norm": 0.47473856709077045, + "learning_rate": 9.142783521897038e-06, + "loss": 0.2315, + "step": 2428 + }, + { + "epoch": 0.8111537819335448, + "grad_norm": 0.46232253841106646, + "learning_rate": 9.141695126660896e-06, + "loss": 0.2146, + "step": 2429 + }, + { + "epoch": 0.8114877275004174, + "grad_norm": 0.45675763317544865, + "learning_rate": 9.14060610577294e-06, + "loss": 0.2196, + "step": 2430 + }, + { + "epoch": 0.81182167306729, + "grad_norm": 0.45024530888195646, + "learning_rate": 9.139516459397675e-06, + "loss": 0.2087, + "step": 2431 + }, + { + "epoch": 0.8121556186341626, + "grad_norm": 0.5389654482824952, + "learning_rate": 9.13842618769971e-06, + "loss": 0.231, + "step": 2432 + }, + { + "epoch": 0.8124895642010352, + "grad_norm": 0.48436355868941494, + "learning_rate": 9.13733529084374e-06, + "loss": 0.2434, + "step": 2433 + }, + { + "epoch": 0.8128235097679078, + "grad_norm": 0.46078224853375155, + "learning_rate": 9.13624376899456e-06, + "loss": 0.2338, + "step": 2434 + }, + { + "epoch": 0.8131574553347805, + "grad_norm": 0.41677648160432945, + "learning_rate": 9.135151622317054e-06, + "loss": 0.224, + "step": 2435 + }, + { + "epoch": 0.8134914009016531, + "grad_norm": 0.4755376042102644, + "learning_rate": 9.134058850976205e-06, + "loss": 0.2247, + "step": 2436 + }, + { + "epoch": 0.8138253464685257, + "grad_norm": 0.436828747543142, + "learning_rate": 9.132965455137092e-06, + "loss": 0.2056, + "step": 2437 + }, + { + "epoch": 0.8141592920353983, + "grad_norm": 0.5091572548534862, + "learning_rate": 9.13187143496488e-06, + "loss": 0.2249, + "step": 2438 + }, + { + "epoch": 0.8144932376022709, + "grad_norm": 0.4430402243350094, + "learning_rate": 9.13077679062484e-06, + "loss": 0.2222, + "step": 2439 + }, + { + "epoch": 0.8148271831691434, + "grad_norm": 0.47005559131312463, + "learning_rate": 9.129681522282326e-06, + "loss": 0.2224, + "step": 2440 + }, + { + "epoch": 0.815161128736016, + "grad_norm": 0.46240395621992303, + "learning_rate": 9.128585630102793e-06, + "loss": 0.218, + "step": 2441 + }, + { + "epoch": 0.8154950743028886, + "grad_norm": 0.4229967980030704, + "learning_rate": 9.127489114251787e-06, + "loss": 0.2088, + "step": 2442 + }, + { + "epoch": 0.8158290198697612, + "grad_norm": 0.4359587185147543, + "learning_rate": 9.12639197489495e-06, + "loss": 0.2226, + "step": 2443 + }, + { + "epoch": 0.8161629654366338, + "grad_norm": 0.47401382528724656, + "learning_rate": 9.125294212198022e-06, + "loss": 0.2334, + "step": 2444 + }, + { + "epoch": 0.8164969110035064, + "grad_norm": 0.4838938684515212, + "learning_rate": 9.124195826326827e-06, + "loss": 0.2328, + "step": 2445 + }, + { + "epoch": 0.816830856570379, + "grad_norm": 0.5430695760268881, + "learning_rate": 9.12309681744729e-06, + "loss": 0.2175, + "step": 2446 + }, + { + "epoch": 0.8171648021372516, + "grad_norm": 0.5297232531969129, + "learning_rate": 9.121997185725433e-06, + "loss": 0.2239, + "step": 2447 + }, + { + "epoch": 0.8174987477041242, + "grad_norm": 0.4606818496718444, + "learning_rate": 9.120896931327366e-06, + "loss": 0.2322, + "step": 2448 + }, + { + "epoch": 0.8178326932709968, + "grad_norm": 0.43326888876665187, + "learning_rate": 9.119796054419295e-06, + "loss": 0.2352, + "step": 2449 + }, + { + "epoch": 0.8181666388378694, + "grad_norm": 0.45191071938809313, + "learning_rate": 9.118694555167521e-06, + "loss": 0.2334, + "step": 2450 + }, + { + "epoch": 0.818500584404742, + "grad_norm": 0.4137197400642333, + "learning_rate": 9.117592433738439e-06, + "loss": 0.2209, + "step": 2451 + }, + { + "epoch": 0.8188345299716147, + "grad_norm": 0.4726434520191131, + "learning_rate": 9.116489690298536e-06, + "loss": 0.2307, + "step": 2452 + }, + { + "epoch": 0.8191684755384873, + "grad_norm": 0.45335955621754376, + "learning_rate": 9.115386325014396e-06, + "loss": 0.2149, + "step": 2453 + }, + { + "epoch": 0.8195024211053599, + "grad_norm": 0.5966832339899147, + "learning_rate": 9.114282338052695e-06, + "loss": 0.2494, + "step": 2454 + }, + { + "epoch": 0.8198363666722325, + "grad_norm": 0.507628757751821, + "learning_rate": 9.113177729580203e-06, + "loss": 0.224, + "step": 2455 + }, + { + "epoch": 0.820170312239105, + "grad_norm": 0.41348330095186164, + "learning_rate": 9.112072499763783e-06, + "loss": 0.2139, + "step": 2456 + }, + { + "epoch": 0.8205042578059776, + "grad_norm": 0.4445659097139337, + "learning_rate": 9.110966648770392e-06, + "loss": 0.2098, + "step": 2457 + }, + { + "epoch": 0.8208382033728502, + "grad_norm": 0.49639772274003185, + "learning_rate": 9.109860176767085e-06, + "loss": 0.2242, + "step": 2458 + }, + { + "epoch": 0.8211721489397228, + "grad_norm": 0.4692548380202328, + "learning_rate": 9.108753083921007e-06, + "loss": 0.2223, + "step": 2459 + }, + { + "epoch": 0.8215060945065954, + "grad_norm": 0.4857245701306937, + "learning_rate": 9.107645370399395e-06, + "loss": 0.2282, + "step": 2460 + }, + { + "epoch": 0.821840040073468, + "grad_norm": 0.4350617485594562, + "learning_rate": 9.106537036369587e-06, + "loss": 0.2057, + "step": 2461 + }, + { + "epoch": 0.8221739856403406, + "grad_norm": 0.45938249982968987, + "learning_rate": 9.105428081999004e-06, + "loss": 0.2297, + "step": 2462 + }, + { + "epoch": 0.8225079312072132, + "grad_norm": 0.4086313343960967, + "learning_rate": 9.10431850745517e-06, + "loss": 0.2036, + "step": 2463 + }, + { + "epoch": 0.8228418767740858, + "grad_norm": 0.44187113165377345, + "learning_rate": 9.103208312905698e-06, + "loss": 0.2149, + "step": 2464 + }, + { + "epoch": 0.8231758223409584, + "grad_norm": 0.4456715017463197, + "learning_rate": 9.102097498518299e-06, + "loss": 0.2157, + "step": 2465 + }, + { + "epoch": 0.823509767907831, + "grad_norm": 0.3795807225141392, + "learning_rate": 9.100986064460769e-06, + "loss": 0.1994, + "step": 2466 + }, + { + "epoch": 0.8238437134747036, + "grad_norm": 0.5015498235452791, + "learning_rate": 9.099874010901009e-06, + "loss": 0.2067, + "step": 2467 + }, + { + "epoch": 0.8241776590415762, + "grad_norm": 0.46873528314568835, + "learning_rate": 9.098761338007003e-06, + "loss": 0.2342, + "step": 2468 + }, + { + "epoch": 0.8245116046084489, + "grad_norm": 0.4802429384639126, + "learning_rate": 9.097648045946837e-06, + "loss": 0.2077, + "step": 2469 + }, + { + "epoch": 0.8248455501753215, + "grad_norm": 0.47674620695731734, + "learning_rate": 9.096534134888685e-06, + "loss": 0.2311, + "step": 2470 + }, + { + "epoch": 0.8251794957421941, + "grad_norm": 0.4741718011929554, + "learning_rate": 9.095419605000817e-06, + "loss": 0.2223, + "step": 2471 + }, + { + "epoch": 0.8255134413090667, + "grad_norm": 0.4614907153038838, + "learning_rate": 9.094304456451596e-06, + "loss": 0.2301, + "step": 2472 + }, + { + "epoch": 0.8258473868759392, + "grad_norm": 0.4517767126346064, + "learning_rate": 9.093188689409477e-06, + "loss": 0.2222, + "step": 2473 + }, + { + "epoch": 0.8261813324428118, + "grad_norm": 0.652118941163761, + "learning_rate": 9.09207230404301e-06, + "loss": 0.2331, + "step": 2474 + }, + { + "epoch": 0.8265152780096844, + "grad_norm": 0.45586670064349893, + "learning_rate": 9.090955300520842e-06, + "loss": 0.2111, + "step": 2475 + }, + { + "epoch": 0.826849223576557, + "grad_norm": 0.4899208936605748, + "learning_rate": 9.089837679011704e-06, + "loss": 0.2282, + "step": 2476 + }, + { + "epoch": 0.8271831691434296, + "grad_norm": 0.4964827628532168, + "learning_rate": 9.08871943968443e-06, + "loss": 0.2269, + "step": 2477 + }, + { + "epoch": 0.8275171147103022, + "grad_norm": 0.4552900965289199, + "learning_rate": 9.08760058270794e-06, + "loss": 0.2356, + "step": 2478 + }, + { + "epoch": 0.8278510602771748, + "grad_norm": 0.4918378116260361, + "learning_rate": 9.086481108251253e-06, + "loss": 0.237, + "step": 2479 + }, + { + "epoch": 0.8281850058440474, + "grad_norm": 0.4607573617794218, + "learning_rate": 9.085361016483477e-06, + "loss": 0.2197, + "step": 2480 + }, + { + "epoch": 0.82851895141092, + "grad_norm": 0.5491934418016965, + "learning_rate": 9.084240307573816e-06, + "loss": 0.2145, + "step": 2481 + }, + { + "epoch": 0.8288528969777926, + "grad_norm": 0.48107467425045347, + "learning_rate": 9.083118981691567e-06, + "loss": 0.2198, + "step": 2482 + }, + { + "epoch": 0.8291868425446652, + "grad_norm": 0.48450480401984153, + "learning_rate": 9.081997039006117e-06, + "loss": 0.2226, + "step": 2483 + }, + { + "epoch": 0.8295207881115378, + "grad_norm": 0.4454234793986314, + "learning_rate": 9.080874479686952e-06, + "loss": 0.2091, + "step": 2484 + }, + { + "epoch": 0.8298547336784105, + "grad_norm": 0.5397372637338255, + "learning_rate": 9.079751303903646e-06, + "loss": 0.2302, + "step": 2485 + }, + { + "epoch": 0.8301886792452831, + "grad_norm": 0.49058908175007737, + "learning_rate": 9.078627511825866e-06, + "loss": 0.2339, + "step": 2486 + }, + { + "epoch": 0.8305226248121557, + "grad_norm": 0.49048769393241404, + "learning_rate": 9.077503103623379e-06, + "loss": 0.2263, + "step": 2487 + }, + { + "epoch": 0.8308565703790283, + "grad_norm": 0.4704762203253689, + "learning_rate": 9.076378079466036e-06, + "loss": 0.2132, + "step": 2488 + }, + { + "epoch": 0.8311905159459008, + "grad_norm": 0.4149169942291882, + "learning_rate": 9.075252439523785e-06, + "loss": 0.2185, + "step": 2489 + }, + { + "epoch": 0.8315244615127734, + "grad_norm": 0.4750547835779951, + "learning_rate": 9.074126183966669e-06, + "loss": 0.2136, + "step": 2490 + }, + { + "epoch": 0.831858407079646, + "grad_norm": 0.4757858563906054, + "learning_rate": 9.072999312964823e-06, + "loss": 0.2256, + "step": 2491 + }, + { + "epoch": 0.8321923526465186, + "grad_norm": 0.4588296740580707, + "learning_rate": 9.071871826688472e-06, + "loss": 0.2182, + "step": 2492 + }, + { + "epoch": 0.8325262982133912, + "grad_norm": 0.4501152802206452, + "learning_rate": 9.070743725307937e-06, + "loss": 0.2126, + "step": 2493 + }, + { + "epoch": 0.8328602437802638, + "grad_norm": 0.5107092439533395, + "learning_rate": 9.06961500899363e-06, + "loss": 0.2385, + "step": 2494 + }, + { + "epoch": 0.8331941893471364, + "grad_norm": 0.46172068053365517, + "learning_rate": 9.068485677916059e-06, + "loss": 0.2233, + "step": 2495 + }, + { + "epoch": 0.833528134914009, + "grad_norm": 0.4559896293320187, + "learning_rate": 9.06735573224582e-06, + "loss": 0.2156, + "step": 2496 + }, + { + "epoch": 0.8338620804808816, + "grad_norm": 0.48841992918673477, + "learning_rate": 9.066225172153607e-06, + "loss": 0.2509, + "step": 2497 + }, + { + "epoch": 0.8341960260477542, + "grad_norm": 0.48807810736665075, + "learning_rate": 9.065093997810204e-06, + "loss": 0.2297, + "step": 2498 + }, + { + "epoch": 0.8345299716146268, + "grad_norm": 0.45944899131100364, + "learning_rate": 9.063962209386485e-06, + "loss": 0.234, + "step": 2499 + }, + { + "epoch": 0.8348639171814994, + "grad_norm": 0.4787228349593665, + "learning_rate": 9.062829807053426e-06, + "loss": 0.2188, + "step": 2500 + }, + { + "epoch": 0.835197862748372, + "grad_norm": 0.5036077547144578, + "learning_rate": 9.061696790982086e-06, + "loss": 0.2366, + "step": 2501 + }, + { + "epoch": 0.8355318083152447, + "grad_norm": 0.544975155082703, + "learning_rate": 9.060563161343618e-06, + "loss": 0.214, + "step": 2502 + }, + { + "epoch": 0.8358657538821173, + "grad_norm": 0.4434329818632235, + "learning_rate": 9.059428918309276e-06, + "loss": 0.2141, + "step": 2503 + }, + { + "epoch": 0.8361996994489899, + "grad_norm": 0.4498540488388689, + "learning_rate": 9.058294062050396e-06, + "loss": 0.2178, + "step": 2504 + }, + { + "epoch": 0.8365336450158624, + "grad_norm": 0.42897565416363814, + "learning_rate": 9.057158592738414e-06, + "loss": 0.2147, + "step": 2505 + }, + { + "epoch": 0.836867590582735, + "grad_norm": 0.42112860630504073, + "learning_rate": 9.056022510544855e-06, + "loss": 0.2132, + "step": 2506 + }, + { + "epoch": 0.8372015361496076, + "grad_norm": 0.4408094729636434, + "learning_rate": 9.054885815641336e-06, + "loss": 0.2215, + "step": 2507 + }, + { + "epoch": 0.8375354817164802, + "grad_norm": 0.45258060889972634, + "learning_rate": 9.05374850819957e-06, + "loss": 0.2132, + "step": 2508 + }, + { + "epoch": 0.8378694272833528, + "grad_norm": 0.47132515883640624, + "learning_rate": 9.052610588391363e-06, + "loss": 0.2233, + "step": 2509 + }, + { + "epoch": 0.8382033728502254, + "grad_norm": 0.4767783733828431, + "learning_rate": 9.051472056388606e-06, + "loss": 0.2418, + "step": 2510 + }, + { + "epoch": 0.838537318417098, + "grad_norm": 0.46137862596916196, + "learning_rate": 9.050332912363292e-06, + "loss": 0.237, + "step": 2511 + }, + { + "epoch": 0.8388712639839706, + "grad_norm": 0.49409067455142913, + "learning_rate": 9.049193156487501e-06, + "loss": 0.2046, + "step": 2512 + }, + { + "epoch": 0.8392052095508432, + "grad_norm": 0.49205788033890396, + "learning_rate": 9.048052788933405e-06, + "loss": 0.2206, + "step": 2513 + }, + { + "epoch": 0.8395391551177158, + "grad_norm": 0.4667335941199234, + "learning_rate": 9.046911809873271e-06, + "loss": 0.2158, + "step": 2514 + }, + { + "epoch": 0.8398731006845884, + "grad_norm": 0.4121890688466787, + "learning_rate": 9.045770219479457e-06, + "loss": 0.2021, + "step": 2515 + }, + { + "epoch": 0.840207046251461, + "grad_norm": 0.42556930558028944, + "learning_rate": 9.044628017924415e-06, + "loss": 0.2226, + "step": 2516 + }, + { + "epoch": 0.8405409918183336, + "grad_norm": 0.5288410131785884, + "learning_rate": 9.043485205380685e-06, + "loss": 0.214, + "step": 2517 + }, + { + "epoch": 0.8408749373852062, + "grad_norm": 0.435346841429691, + "learning_rate": 9.042341782020906e-06, + "loss": 0.2163, + "step": 2518 + }, + { + "epoch": 0.8412088829520789, + "grad_norm": 0.4680554061197634, + "learning_rate": 9.041197748017802e-06, + "loss": 0.2162, + "step": 2519 + }, + { + "epoch": 0.8415428285189515, + "grad_norm": 0.6331729472174026, + "learning_rate": 9.040053103544196e-06, + "loss": 0.2171, + "step": 2520 + }, + { + "epoch": 0.8418767740858241, + "grad_norm": 0.46302434709386225, + "learning_rate": 9.038907848772999e-06, + "loss": 0.2313, + "step": 2521 + }, + { + "epoch": 0.8422107196526966, + "grad_norm": 0.5418674061177113, + "learning_rate": 9.037761983877214e-06, + "loss": 0.2314, + "step": 2522 + }, + { + "epoch": 0.8425446652195692, + "grad_norm": 0.422599498634045, + "learning_rate": 9.036615509029939e-06, + "loss": 0.2148, + "step": 2523 + }, + { + "epoch": 0.8428786107864418, + "grad_norm": 0.47415409143410847, + "learning_rate": 9.035468424404362e-06, + "loss": 0.2271, + "step": 2524 + }, + { + "epoch": 0.8432125563533144, + "grad_norm": 0.43769467339845197, + "learning_rate": 9.034320730173762e-06, + "loss": 0.2078, + "step": 2525 + }, + { + "epoch": 0.843546501920187, + "grad_norm": 0.4579042598091712, + "learning_rate": 9.033172426511515e-06, + "loss": 0.2275, + "step": 2526 + }, + { + "epoch": 0.8438804474870596, + "grad_norm": 0.472853158936843, + "learning_rate": 9.032023513591081e-06, + "loss": 0.2163, + "step": 2527 + }, + { + "epoch": 0.8442143930539322, + "grad_norm": 0.47883958606918925, + "learning_rate": 9.030873991586021e-06, + "loss": 0.2362, + "step": 2528 + }, + { + "epoch": 0.8445483386208048, + "grad_norm": 0.4244077632216657, + "learning_rate": 9.029723860669983e-06, + "loss": 0.2135, + "step": 2529 + }, + { + "epoch": 0.8448822841876774, + "grad_norm": 0.5703843399860384, + "learning_rate": 9.028573121016707e-06, + "loss": 0.2595, + "step": 2530 + }, + { + "epoch": 0.84521622975455, + "grad_norm": 0.5540172877400666, + "learning_rate": 9.027421772800027e-06, + "loss": 0.2268, + "step": 2531 + }, + { + "epoch": 0.8455501753214226, + "grad_norm": 0.46860925608211035, + "learning_rate": 9.026269816193867e-06, + "loss": 0.2231, + "step": 2532 + }, + { + "epoch": 0.8458841208882952, + "grad_norm": 0.4536040554309427, + "learning_rate": 9.025117251372242e-06, + "loss": 0.2264, + "step": 2533 + }, + { + "epoch": 0.8462180664551678, + "grad_norm": 0.48291163229730805, + "learning_rate": 9.023964078509263e-06, + "loss": 0.2268, + "step": 2534 + }, + { + "epoch": 0.8465520120220404, + "grad_norm": 0.47777184837544556, + "learning_rate": 9.022810297779129e-06, + "loss": 0.2208, + "step": 2535 + }, + { + "epoch": 0.846885957588913, + "grad_norm": 0.46448916819982217, + "learning_rate": 9.021655909356133e-06, + "loss": 0.2424, + "step": 2536 + }, + { + "epoch": 0.8472199031557857, + "grad_norm": 0.47316820975138824, + "learning_rate": 9.020500913414658e-06, + "loss": 0.2248, + "step": 2537 + }, + { + "epoch": 0.8475538487226582, + "grad_norm": 0.4767805328678842, + "learning_rate": 9.019345310129179e-06, + "loss": 0.2338, + "step": 2538 + }, + { + "epoch": 0.8478877942895308, + "grad_norm": 0.4397760724625773, + "learning_rate": 9.018189099674266e-06, + "loss": 0.2208, + "step": 2539 + }, + { + "epoch": 0.8482217398564034, + "grad_norm": 0.4862443013166264, + "learning_rate": 9.017032282224577e-06, + "loss": 0.2299, + "step": 2540 + }, + { + "epoch": 0.848555685423276, + "grad_norm": 0.44379757315440904, + "learning_rate": 9.015874857954863e-06, + "loss": 0.2192, + "step": 2541 + }, + { + "epoch": 0.8488896309901486, + "grad_norm": 0.42679243339558226, + "learning_rate": 9.014716827039965e-06, + "loss": 0.2222, + "step": 2542 + }, + { + "epoch": 0.8492235765570212, + "grad_norm": 0.41430931012260835, + "learning_rate": 9.013558189654819e-06, + "loss": 0.2161, + "step": 2543 + }, + { + "epoch": 0.8495575221238938, + "grad_norm": 0.48620083773360206, + "learning_rate": 9.01239894597445e-06, + "loss": 0.2191, + "step": 2544 + }, + { + "epoch": 0.8498914676907664, + "grad_norm": 0.4914872230641687, + "learning_rate": 9.011239096173977e-06, + "loss": 0.2205, + "step": 2545 + }, + { + "epoch": 0.850225413257639, + "grad_norm": 0.4726040998570721, + "learning_rate": 9.010078640428606e-06, + "loss": 0.2288, + "step": 2546 + }, + { + "epoch": 0.8505593588245116, + "grad_norm": 0.46491513024652575, + "learning_rate": 9.00891757891364e-06, + "loss": 0.2277, + "step": 2547 + }, + { + "epoch": 0.8508933043913842, + "grad_norm": 0.45611845460077005, + "learning_rate": 9.007755911804471e-06, + "loss": 0.2251, + "step": 2548 + }, + { + "epoch": 0.8512272499582568, + "grad_norm": 0.5926560572888228, + "learning_rate": 9.006593639276582e-06, + "loss": 0.2243, + "step": 2549 + }, + { + "epoch": 0.8515611955251294, + "grad_norm": 0.496523833587818, + "learning_rate": 9.005430761505548e-06, + "loss": 0.2403, + "step": 2550 + }, + { + "epoch": 0.851895141092002, + "grad_norm": 0.4714784682154149, + "learning_rate": 9.004267278667032e-06, + "loss": 0.2149, + "step": 2551 + }, + { + "epoch": 0.8522290866588746, + "grad_norm": 0.4313656521047808, + "learning_rate": 9.003103190936797e-06, + "loss": 0.2187, + "step": 2552 + }, + { + "epoch": 0.8525630322257473, + "grad_norm": 0.46928566508359176, + "learning_rate": 9.00193849849069e-06, + "loss": 0.2208, + "step": 2553 + }, + { + "epoch": 0.8528969777926197, + "grad_norm": 0.4283171636804329, + "learning_rate": 9.00077320150465e-06, + "loss": 0.2112, + "step": 2554 + }, + { + "epoch": 0.8532309233594924, + "grad_norm": 0.5000190368184554, + "learning_rate": 8.999607300154712e-06, + "loss": 0.2192, + "step": 2555 + }, + { + "epoch": 0.853564868926365, + "grad_norm": 0.47355749853117557, + "learning_rate": 8.998440794616998e-06, + "loss": 0.2213, + "step": 2556 + }, + { + "epoch": 0.8538988144932376, + "grad_norm": 0.43176568872005033, + "learning_rate": 8.99727368506772e-06, + "loss": 0.2098, + "step": 2557 + }, + { + "epoch": 0.8542327600601102, + "grad_norm": 0.46982069301090845, + "learning_rate": 8.996105971683187e-06, + "loss": 0.2177, + "step": 2558 + }, + { + "epoch": 0.8545667056269828, + "grad_norm": 0.47512821563209395, + "learning_rate": 8.994937654639793e-06, + "loss": 0.2278, + "step": 2559 + }, + { + "epoch": 0.8549006511938554, + "grad_norm": 0.4000965489856469, + "learning_rate": 8.993768734114029e-06, + "loss": 0.2023, + "step": 2560 + }, + { + "epoch": 0.855234596760728, + "grad_norm": 0.43268062516172295, + "learning_rate": 8.992599210282471e-06, + "loss": 0.2202, + "step": 2561 + }, + { + "epoch": 0.8555685423276006, + "grad_norm": 0.42305087012502596, + "learning_rate": 8.991429083321792e-06, + "loss": 0.2189, + "step": 2562 + }, + { + "epoch": 0.8559024878944732, + "grad_norm": 0.4708842682056169, + "learning_rate": 8.990258353408754e-06, + "loss": 0.2295, + "step": 2563 + }, + { + "epoch": 0.8562364334613458, + "grad_norm": 0.4600816021897864, + "learning_rate": 8.989087020720204e-06, + "loss": 0.2193, + "step": 2564 + }, + { + "epoch": 0.8565703790282184, + "grad_norm": 0.4198834665228543, + "learning_rate": 8.987915085433092e-06, + "loss": 0.2195, + "step": 2565 + }, + { + "epoch": 0.856904324595091, + "grad_norm": 0.4489653916885646, + "learning_rate": 8.98674254772445e-06, + "loss": 0.2126, + "step": 2566 + }, + { + "epoch": 0.8572382701619636, + "grad_norm": 0.40367098282319236, + "learning_rate": 8.985569407771404e-06, + "loss": 0.1956, + "step": 2567 + }, + { + "epoch": 0.8575722157288362, + "grad_norm": 0.4732011179689396, + "learning_rate": 8.984395665751169e-06, + "loss": 0.2296, + "step": 2568 + }, + { + "epoch": 0.8579061612957088, + "grad_norm": 0.47840121907574584, + "learning_rate": 8.983221321841056e-06, + "loss": 0.2299, + "step": 2569 + }, + { + "epoch": 0.8582401068625815, + "grad_norm": 0.4465691457596304, + "learning_rate": 8.98204637621846e-06, + "loss": 0.2143, + "step": 2570 + }, + { + "epoch": 0.858574052429454, + "grad_norm": 0.47090014081524445, + "learning_rate": 8.980870829060872e-06, + "loss": 0.2255, + "step": 2571 + }, + { + "epoch": 0.8589079979963266, + "grad_norm": 0.46190459557837005, + "learning_rate": 8.979694680545872e-06, + "loss": 0.2164, + "step": 2572 + }, + { + "epoch": 0.8592419435631992, + "grad_norm": 0.3984766673387074, + "learning_rate": 8.978517930851132e-06, + "loss": 0.2025, + "step": 2573 + }, + { + "epoch": 0.8595758891300718, + "grad_norm": 0.4881647227380028, + "learning_rate": 8.977340580154411e-06, + "loss": 0.2222, + "step": 2574 + }, + { + "epoch": 0.8599098346969444, + "grad_norm": 0.4577249839436491, + "learning_rate": 8.976162628633565e-06, + "loss": 0.2284, + "step": 2575 + }, + { + "epoch": 0.860243780263817, + "grad_norm": 0.42843210433451573, + "learning_rate": 8.974984076466537e-06, + "loss": 0.228, + "step": 2576 + }, + { + "epoch": 0.8605777258306896, + "grad_norm": 0.46579319758112697, + "learning_rate": 8.97380492383136e-06, + "loss": 0.2117, + "step": 2577 + }, + { + "epoch": 0.8609116713975622, + "grad_norm": 0.4552220225147081, + "learning_rate": 8.972625170906157e-06, + "loss": 0.2316, + "step": 2578 + }, + { + "epoch": 0.8612456169644348, + "grad_norm": 0.42726184700574793, + "learning_rate": 8.971444817869148e-06, + "loss": 0.2198, + "step": 2579 + }, + { + "epoch": 0.8615795625313074, + "grad_norm": 0.45374200275965654, + "learning_rate": 8.970263864898636e-06, + "loss": 0.2261, + "step": 2580 + }, + { + "epoch": 0.86191350809818, + "grad_norm": 0.5381715281038784, + "learning_rate": 8.969082312173021e-06, + "loss": 0.2344, + "step": 2581 + }, + { + "epoch": 0.8622474536650526, + "grad_norm": 0.4754027335097857, + "learning_rate": 8.967900159870787e-06, + "loss": 0.2293, + "step": 2582 + }, + { + "epoch": 0.8625813992319252, + "grad_norm": 0.4460243245841365, + "learning_rate": 8.966717408170512e-06, + "loss": 0.2129, + "step": 2583 + }, + { + "epoch": 0.8629153447987978, + "grad_norm": 0.471562541587052, + "learning_rate": 8.965534057250866e-06, + "loss": 0.232, + "step": 2584 + }, + { + "epoch": 0.8632492903656704, + "grad_norm": 0.4426034976537733, + "learning_rate": 8.964350107290609e-06, + "loss": 0.2099, + "step": 2585 + }, + { + "epoch": 0.863583235932543, + "grad_norm": 0.4588021928348187, + "learning_rate": 8.96316555846859e-06, + "loss": 0.2205, + "step": 2586 + }, + { + "epoch": 0.8639171814994155, + "grad_norm": 0.42630016568299534, + "learning_rate": 8.961980410963749e-06, + "loss": 0.2135, + "step": 2587 + }, + { + "epoch": 0.8642511270662881, + "grad_norm": 0.45810183824218453, + "learning_rate": 8.960794664955115e-06, + "loss": 0.2229, + "step": 2588 + }, + { + "epoch": 0.8645850726331608, + "grad_norm": 0.42559767116343933, + "learning_rate": 8.95960832062181e-06, + "loss": 0.2156, + "step": 2589 + }, + { + "epoch": 0.8649190182000334, + "grad_norm": 0.4245683101087079, + "learning_rate": 8.958421378143046e-06, + "loss": 0.22, + "step": 2590 + }, + { + "epoch": 0.865252963766906, + "grad_norm": 0.5033515035437922, + "learning_rate": 8.957233837698122e-06, + "loss": 0.2298, + "step": 2591 + }, + { + "epoch": 0.8655869093337786, + "grad_norm": 0.46700010068824005, + "learning_rate": 8.956045699466433e-06, + "loss": 0.2344, + "step": 2592 + }, + { + "epoch": 0.8659208549006512, + "grad_norm": 0.4434776982903602, + "learning_rate": 8.95485696362746e-06, + "loss": 0.2143, + "step": 2593 + }, + { + "epoch": 0.8662548004675238, + "grad_norm": 0.432504424021459, + "learning_rate": 8.953667630360778e-06, + "loss": 0.2135, + "step": 2594 + }, + { + "epoch": 0.8665887460343964, + "grad_norm": 0.4761778540131483, + "learning_rate": 8.952477699846044e-06, + "loss": 0.2311, + "step": 2595 + }, + { + "epoch": 0.866922691601269, + "grad_norm": 0.41699684183795516, + "learning_rate": 8.951287172263018e-06, + "loss": 0.2181, + "step": 2596 + }, + { + "epoch": 0.8672566371681416, + "grad_norm": 0.4546797953736065, + "learning_rate": 8.950096047791539e-06, + "loss": 0.2311, + "step": 2597 + }, + { + "epoch": 0.8675905827350142, + "grad_norm": 0.4793027535037006, + "learning_rate": 8.94890432661154e-06, + "loss": 0.2416, + "step": 2598 + }, + { + "epoch": 0.8679245283018868, + "grad_norm": 0.4259729344924639, + "learning_rate": 8.947712008903045e-06, + "loss": 0.2164, + "step": 2599 + }, + { + "epoch": 0.8682584738687594, + "grad_norm": 0.45367870809652466, + "learning_rate": 8.946519094846169e-06, + "loss": 0.2168, + "step": 2600 + }, + { + "epoch": 0.868592419435632, + "grad_norm": 0.4283930933516754, + "learning_rate": 8.945325584621116e-06, + "loss": 0.2048, + "step": 2601 + }, + { + "epoch": 0.8689263650025046, + "grad_norm": 0.47811696623877187, + "learning_rate": 8.944131478408177e-06, + "loss": 0.2256, + "step": 2602 + }, + { + "epoch": 0.8692603105693771, + "grad_norm": 0.474177095776076, + "learning_rate": 8.942936776387739e-06, + "loss": 0.2199, + "step": 2603 + }, + { + "epoch": 0.8695942561362497, + "grad_norm": 0.449120897392552, + "learning_rate": 8.941741478740272e-06, + "loss": 0.2251, + "step": 2604 + }, + { + "epoch": 0.8699282017031224, + "grad_norm": 0.3831811961730255, + "learning_rate": 8.940545585646344e-06, + "loss": 0.1982, + "step": 2605 + }, + { + "epoch": 0.870262147269995, + "grad_norm": 0.46919762256557285, + "learning_rate": 8.939349097286608e-06, + "loss": 0.237, + "step": 2606 + }, + { + "epoch": 0.8705960928368676, + "grad_norm": 0.4595681283636294, + "learning_rate": 8.938152013841803e-06, + "loss": 0.2359, + "step": 2607 + }, + { + "epoch": 0.8709300384037402, + "grad_norm": 0.46244221082714243, + "learning_rate": 8.93695433549277e-06, + "loss": 0.2172, + "step": 2608 + }, + { + "epoch": 0.8712639839706128, + "grad_norm": 0.44144435300671014, + "learning_rate": 8.935756062420426e-06, + "loss": 0.2259, + "step": 2609 + }, + { + "epoch": 0.8715979295374854, + "grad_norm": 0.4735100176435913, + "learning_rate": 8.934557194805787e-06, + "loss": 0.2221, + "step": 2610 + }, + { + "epoch": 0.871931875104358, + "grad_norm": 0.44284418340652504, + "learning_rate": 8.933357732829957e-06, + "loss": 0.2265, + "step": 2611 + }, + { + "epoch": 0.8722658206712306, + "grad_norm": 0.4659415166073965, + "learning_rate": 8.932157676674126e-06, + "loss": 0.2276, + "step": 2612 + }, + { + "epoch": 0.8725997662381032, + "grad_norm": 0.41233515606202303, + "learning_rate": 8.93095702651958e-06, + "loss": 0.2074, + "step": 2613 + }, + { + "epoch": 0.8729337118049758, + "grad_norm": 0.4877780049978787, + "learning_rate": 8.92975578254769e-06, + "loss": 0.2313, + "step": 2614 + }, + { + "epoch": 0.8732676573718484, + "grad_norm": 0.5150208995785533, + "learning_rate": 8.928553944939915e-06, + "loss": 0.2156, + "step": 2615 + }, + { + "epoch": 0.873601602938721, + "grad_norm": 0.4168813001248241, + "learning_rate": 8.92735151387781e-06, + "loss": 0.2222, + "step": 2616 + }, + { + "epoch": 0.8739355485055936, + "grad_norm": 0.4098574760494883, + "learning_rate": 8.926148489543018e-06, + "loss": 0.2175, + "step": 2617 + }, + { + "epoch": 0.8742694940724662, + "grad_norm": 0.4412115939247315, + "learning_rate": 8.924944872117264e-06, + "loss": 0.2195, + "step": 2618 + }, + { + "epoch": 0.8746034396393388, + "grad_norm": 0.4630857578664083, + "learning_rate": 8.923740661782376e-06, + "loss": 0.2195, + "step": 2619 + }, + { + "epoch": 0.8749373852062113, + "grad_norm": 0.48566643237103263, + "learning_rate": 8.92253585872026e-06, + "loss": 0.2351, + "step": 2620 + }, + { + "epoch": 0.8752713307730839, + "grad_norm": 0.43311552083576144, + "learning_rate": 8.921330463112915e-06, + "loss": 0.2288, + "step": 2621 + }, + { + "epoch": 0.8756052763399566, + "grad_norm": 0.39507171452378326, + "learning_rate": 8.92012447514243e-06, + "loss": 0.207, + "step": 2622 + }, + { + "epoch": 0.8759392219068292, + "grad_norm": 0.4452755906788476, + "learning_rate": 8.918917894990989e-06, + "loss": 0.1985, + "step": 2623 + }, + { + "epoch": 0.8762731674737018, + "grad_norm": 0.3910814697165605, + "learning_rate": 8.917710722840853e-06, + "loss": 0.2066, + "step": 2624 + }, + { + "epoch": 0.8766071130405744, + "grad_norm": 0.4239629843736424, + "learning_rate": 8.916502958874385e-06, + "loss": 0.2274, + "step": 2625 + }, + { + "epoch": 0.876941058607447, + "grad_norm": 0.45349007585563317, + "learning_rate": 8.915294603274027e-06, + "loss": 0.2205, + "step": 2626 + }, + { + "epoch": 0.8772750041743196, + "grad_norm": 0.42342259610566313, + "learning_rate": 8.91408565622232e-06, + "loss": 0.2065, + "step": 2627 + }, + { + "epoch": 0.8776089497411922, + "grad_norm": 0.44535591900172594, + "learning_rate": 8.912876117901887e-06, + "loss": 0.2203, + "step": 2628 + }, + { + "epoch": 0.8779428953080648, + "grad_norm": 0.46320851664376284, + "learning_rate": 8.911665988495446e-06, + "loss": 0.2432, + "step": 2629 + }, + { + "epoch": 0.8782768408749374, + "grad_norm": 0.4367734120566955, + "learning_rate": 8.910455268185795e-06, + "loss": 0.2166, + "step": 2630 + }, + { + "epoch": 0.87861078644181, + "grad_norm": 0.43971258610914693, + "learning_rate": 8.909243957155835e-06, + "loss": 0.2313, + "step": 2631 + }, + { + "epoch": 0.8789447320086826, + "grad_norm": 0.408475957094058, + "learning_rate": 8.908032055588544e-06, + "loss": 0.2282, + "step": 2632 + }, + { + "epoch": 0.8792786775755552, + "grad_norm": 0.4413318202987162, + "learning_rate": 8.906819563666997e-06, + "loss": 0.2325, + "step": 2633 + }, + { + "epoch": 0.8796126231424278, + "grad_norm": 0.4204393958934717, + "learning_rate": 8.905606481574351e-06, + "loss": 0.2212, + "step": 2634 + }, + { + "epoch": 0.8799465687093004, + "grad_norm": 0.43071251695265306, + "learning_rate": 8.90439280949386e-06, + "loss": 0.2153, + "step": 2635 + }, + { + "epoch": 0.8802805142761729, + "grad_norm": 0.44850361365375907, + "learning_rate": 8.903178547608863e-06, + "loss": 0.2221, + "step": 2636 + }, + { + "epoch": 0.8806144598430455, + "grad_norm": 0.42781213117927985, + "learning_rate": 8.901963696102788e-06, + "loss": 0.2306, + "step": 2637 + }, + { + "epoch": 0.8809484054099181, + "grad_norm": 0.42113339754508183, + "learning_rate": 8.900748255159152e-06, + "loss": 0.2111, + "step": 2638 + }, + { + "epoch": 0.8812823509767908, + "grad_norm": 0.47605309867063816, + "learning_rate": 8.899532224961562e-06, + "loss": 0.229, + "step": 2639 + }, + { + "epoch": 0.8816162965436634, + "grad_norm": 0.5527061292754034, + "learning_rate": 8.898315605693715e-06, + "loss": 0.217, + "step": 2640 + }, + { + "epoch": 0.881950242110536, + "grad_norm": 0.43906596005312143, + "learning_rate": 8.897098397539394e-06, + "loss": 0.2164, + "step": 2641 + }, + { + "epoch": 0.8822841876774086, + "grad_norm": 0.5067302891421397, + "learning_rate": 8.895880600682472e-06, + "loss": 0.2403, + "step": 2642 + }, + { + "epoch": 0.8826181332442812, + "grad_norm": 0.48526179484087406, + "learning_rate": 8.894662215306913e-06, + "loss": 0.216, + "step": 2643 + }, + { + "epoch": 0.8829520788111538, + "grad_norm": 0.5004998831458609, + "learning_rate": 8.89344324159677e-06, + "loss": 0.2182, + "step": 2644 + }, + { + "epoch": 0.8832860243780264, + "grad_norm": 0.4141565409924, + "learning_rate": 8.89222367973618e-06, + "loss": 0.2173, + "step": 2645 + }, + { + "epoch": 0.883619969944899, + "grad_norm": 0.451587274844467, + "learning_rate": 8.891003529909375e-06, + "loss": 0.2258, + "step": 2646 + }, + { + "epoch": 0.8839539155117716, + "grad_norm": 0.4357097873514739, + "learning_rate": 8.889782792300672e-06, + "loss": 0.2176, + "step": 2647 + }, + { + "epoch": 0.8842878610786442, + "grad_norm": 0.4424588073002416, + "learning_rate": 8.888561467094476e-06, + "loss": 0.2173, + "step": 2648 + }, + { + "epoch": 0.8846218066455168, + "grad_norm": 0.47374105210343537, + "learning_rate": 8.887339554475284e-06, + "loss": 0.2167, + "step": 2649 + }, + { + "epoch": 0.8849557522123894, + "grad_norm": 0.4314904279729543, + "learning_rate": 8.886117054627682e-06, + "loss": 0.2114, + "step": 2650 + }, + { + "epoch": 0.885289697779262, + "grad_norm": 0.48980056249973974, + "learning_rate": 8.88489396773634e-06, + "loss": 0.2344, + "step": 2651 + }, + { + "epoch": 0.8856236433461345, + "grad_norm": 0.38805445786960013, + "learning_rate": 8.883670293986019e-06, + "loss": 0.2056, + "step": 2652 + }, + { + "epoch": 0.8859575889130071, + "grad_norm": 0.4178809237058471, + "learning_rate": 8.882446033561576e-06, + "loss": 0.2242, + "step": 2653 + }, + { + "epoch": 0.8862915344798797, + "grad_norm": 0.4936359585377921, + "learning_rate": 8.881221186647941e-06, + "loss": 0.2183, + "step": 2654 + }, + { + "epoch": 0.8866254800467523, + "grad_norm": 0.4859800139169095, + "learning_rate": 8.879995753430148e-06, + "loss": 0.2178, + "step": 2655 + }, + { + "epoch": 0.886959425613625, + "grad_norm": 0.48946455778130216, + "learning_rate": 8.878769734093312e-06, + "loss": 0.2203, + "step": 2656 + }, + { + "epoch": 0.8872933711804976, + "grad_norm": 0.41725432475224355, + "learning_rate": 8.877543128822634e-06, + "loss": 0.2265, + "step": 2657 + }, + { + "epoch": 0.8876273167473702, + "grad_norm": 0.4663793748591142, + "learning_rate": 8.876315937803413e-06, + "loss": 0.2262, + "step": 2658 + }, + { + "epoch": 0.8879612623142428, + "grad_norm": 0.3916146858993435, + "learning_rate": 8.875088161221025e-06, + "loss": 0.2072, + "step": 2659 + }, + { + "epoch": 0.8882952078811154, + "grad_norm": 0.47716488660190576, + "learning_rate": 8.873859799260944e-06, + "loss": 0.2245, + "step": 2660 + }, + { + "epoch": 0.888629153447988, + "grad_norm": 0.42641030028705323, + "learning_rate": 8.872630852108725e-06, + "loss": 0.2301, + "step": 2661 + }, + { + "epoch": 0.8889630990148606, + "grad_norm": 0.4117929648291072, + "learning_rate": 8.87140131995002e-06, + "loss": 0.2303, + "step": 2662 + }, + { + "epoch": 0.8892970445817332, + "grad_norm": 0.4161638007577847, + "learning_rate": 8.870171202970559e-06, + "loss": 0.2121, + "step": 2663 + }, + { + "epoch": 0.8896309901486058, + "grad_norm": 0.4530085404187551, + "learning_rate": 8.868940501356169e-06, + "loss": 0.2248, + "step": 2664 + }, + { + "epoch": 0.8899649357154784, + "grad_norm": 0.40243303655245055, + "learning_rate": 8.86770921529276e-06, + "loss": 0.2135, + "step": 2665 + }, + { + "epoch": 0.890298881282351, + "grad_norm": 0.4473350481589006, + "learning_rate": 8.866477344966334e-06, + "loss": 0.216, + "step": 2666 + }, + { + "epoch": 0.8906328268492236, + "grad_norm": 0.41627719547498515, + "learning_rate": 8.865244890562978e-06, + "loss": 0.2174, + "step": 2667 + }, + { + "epoch": 0.8909667724160962, + "grad_norm": 0.5442020904005506, + "learning_rate": 8.864011852268872e-06, + "loss": 0.2261, + "step": 2668 + }, + { + "epoch": 0.8913007179829687, + "grad_norm": 0.4366841475309793, + "learning_rate": 8.862778230270276e-06, + "loss": 0.2166, + "step": 2669 + }, + { + "epoch": 0.8916346635498413, + "grad_norm": 0.5358299377296973, + "learning_rate": 8.861544024753545e-06, + "loss": 0.256, + "step": 2670 + }, + { + "epoch": 0.8919686091167139, + "grad_norm": 0.7520222436984267, + "learning_rate": 8.860309235905122e-06, + "loss": 0.2221, + "step": 2671 + }, + { + "epoch": 0.8923025546835865, + "grad_norm": 0.456864799154424, + "learning_rate": 8.859073863911536e-06, + "loss": 0.2284, + "step": 2672 + }, + { + "epoch": 0.8926365002504592, + "grad_norm": 0.4443603685723179, + "learning_rate": 8.857837908959404e-06, + "loss": 0.2192, + "step": 2673 + }, + { + "epoch": 0.8929704458173318, + "grad_norm": 0.44649147782698073, + "learning_rate": 8.856601371235429e-06, + "loss": 0.2153, + "step": 2674 + }, + { + "epoch": 0.8933043913842044, + "grad_norm": 0.5127058305862575, + "learning_rate": 8.855364250926409e-06, + "loss": 0.2328, + "step": 2675 + }, + { + "epoch": 0.893638336951077, + "grad_norm": 0.42241283375166006, + "learning_rate": 8.854126548219222e-06, + "loss": 0.2144, + "step": 2676 + }, + { + "epoch": 0.8939722825179496, + "grad_norm": 0.44645580116752953, + "learning_rate": 8.85288826330084e-06, + "loss": 0.2217, + "step": 2677 + }, + { + "epoch": 0.8943062280848222, + "grad_norm": 0.4843402606886858, + "learning_rate": 8.85164939635832e-06, + "loss": 0.2058, + "step": 2678 + }, + { + "epoch": 0.8946401736516948, + "grad_norm": 0.4401205110768855, + "learning_rate": 8.850409947578806e-06, + "loss": 0.2255, + "step": 2679 + }, + { + "epoch": 0.8949741192185674, + "grad_norm": 0.46319114729765054, + "learning_rate": 8.849169917149532e-06, + "loss": 0.2152, + "step": 2680 + }, + { + "epoch": 0.89530806478544, + "grad_norm": 0.42318735608832414, + "learning_rate": 8.847929305257821e-06, + "loss": 0.2072, + "step": 2681 + }, + { + "epoch": 0.8956420103523126, + "grad_norm": 0.4774593411729633, + "learning_rate": 8.846688112091078e-06, + "loss": 0.2164, + "step": 2682 + }, + { + "epoch": 0.8959759559191852, + "grad_norm": 0.5172363111160776, + "learning_rate": 8.845446337836805e-06, + "loss": 0.2199, + "step": 2683 + }, + { + "epoch": 0.8963099014860578, + "grad_norm": 0.47524030222731717, + "learning_rate": 8.844203982682583e-06, + "loss": 0.2118, + "step": 2684 + }, + { + "epoch": 0.8966438470529303, + "grad_norm": 0.48935917439328974, + "learning_rate": 8.842961046816085e-06, + "loss": 0.2246, + "step": 2685 + }, + { + "epoch": 0.8969777926198029, + "grad_norm": 0.42309667094002784, + "learning_rate": 8.841717530425071e-06, + "loss": 0.2059, + "step": 2686 + }, + { + "epoch": 0.8973117381866755, + "grad_norm": 0.43840157623404685, + "learning_rate": 8.84047343369739e-06, + "loss": 0.2208, + "step": 2687 + }, + { + "epoch": 0.8976456837535481, + "grad_norm": 0.4595498272411674, + "learning_rate": 8.839228756820977e-06, + "loss": 0.2178, + "step": 2688 + }, + { + "epoch": 0.8979796293204207, + "grad_norm": 0.5061728834639878, + "learning_rate": 8.837983499983856e-06, + "loss": 0.2409, + "step": 2689 + }, + { + "epoch": 0.8983135748872934, + "grad_norm": 0.4681121712319394, + "learning_rate": 8.836737663374135e-06, + "loss": 0.2213, + "step": 2690 + }, + { + "epoch": 0.898647520454166, + "grad_norm": 0.47022403674358887, + "learning_rate": 8.835491247180012e-06, + "loss": 0.2266, + "step": 2691 + }, + { + "epoch": 0.8989814660210386, + "grad_norm": 0.46339165662113857, + "learning_rate": 8.834244251589778e-06, + "loss": 0.2293, + "step": 2692 + }, + { + "epoch": 0.8993154115879112, + "grad_norm": 0.5508066598437971, + "learning_rate": 8.832996676791802e-06, + "loss": 0.2296, + "step": 2693 + }, + { + "epoch": 0.8996493571547838, + "grad_norm": 0.3974241538911554, + "learning_rate": 8.831748522974545e-06, + "loss": 0.2016, + "step": 2694 + }, + { + "epoch": 0.8999833027216564, + "grad_norm": 0.46507520916790673, + "learning_rate": 8.830499790326556e-06, + "loss": 0.2204, + "step": 2695 + }, + { + "epoch": 0.900317248288529, + "grad_norm": 0.4080434870525821, + "learning_rate": 8.829250479036473e-06, + "loss": 0.2098, + "step": 2696 + }, + { + "epoch": 0.9006511938554016, + "grad_norm": 0.4703823065746979, + "learning_rate": 8.828000589293016e-06, + "loss": 0.2096, + "step": 2697 + }, + { + "epoch": 0.9009851394222742, + "grad_norm": 0.42236956688946864, + "learning_rate": 8.826750121284998e-06, + "loss": 0.2083, + "step": 2698 + }, + { + "epoch": 0.9013190849891468, + "grad_norm": 0.44662974396124566, + "learning_rate": 8.825499075201314e-06, + "loss": 0.2245, + "step": 2699 + }, + { + "epoch": 0.9016530305560194, + "grad_norm": 0.43992141849148725, + "learning_rate": 8.824247451230949e-06, + "loss": 0.2171, + "step": 2700 + }, + { + "epoch": 0.9019869761228919, + "grad_norm": 0.5054985915126862, + "learning_rate": 8.82299524956298e-06, + "loss": 0.2374, + "step": 2701 + }, + { + "epoch": 0.9023209216897645, + "grad_norm": 0.384565909847055, + "learning_rate": 8.821742470386565e-06, + "loss": 0.1991, + "step": 2702 + }, + { + "epoch": 0.9026548672566371, + "grad_norm": 0.4712108137917473, + "learning_rate": 8.820489113890949e-06, + "loss": 0.227, + "step": 2703 + }, + { + "epoch": 0.9029888128235097, + "grad_norm": 0.4162460345682535, + "learning_rate": 8.819235180265468e-06, + "loss": 0.209, + "step": 2704 + }, + { + "epoch": 0.9033227583903823, + "grad_norm": 0.3951344465539853, + "learning_rate": 8.817980669699544e-06, + "loss": 0.1994, + "step": 2705 + }, + { + "epoch": 0.903656703957255, + "grad_norm": 0.49667700484753613, + "learning_rate": 8.816725582382681e-06, + "loss": 0.2251, + "step": 2706 + }, + { + "epoch": 0.9039906495241276, + "grad_norm": 0.444646491895196, + "learning_rate": 8.815469918504482e-06, + "loss": 0.2253, + "step": 2707 + }, + { + "epoch": 0.9043245950910002, + "grad_norm": 0.47116995697716324, + "learning_rate": 8.814213678254624e-06, + "loss": 0.2323, + "step": 2708 + }, + { + "epoch": 0.9046585406578728, + "grad_norm": 0.4294065077553617, + "learning_rate": 8.81295686182288e-06, + "loss": 0.2155, + "step": 2709 + }, + { + "epoch": 0.9049924862247454, + "grad_norm": 0.4643290288473964, + "learning_rate": 8.811699469399106e-06, + "loss": 0.2257, + "step": 2710 + }, + { + "epoch": 0.905326431791618, + "grad_norm": 0.5106255418157994, + "learning_rate": 8.810441501173245e-06, + "loss": 0.2198, + "step": 2711 + }, + { + "epoch": 0.9056603773584906, + "grad_norm": 0.47059697891525737, + "learning_rate": 8.809182957335329e-06, + "loss": 0.2193, + "step": 2712 + }, + { + "epoch": 0.9059943229253632, + "grad_norm": 0.4121796907160016, + "learning_rate": 8.807923838075476e-06, + "loss": 0.2255, + "step": 2713 + }, + { + "epoch": 0.9063282684922358, + "grad_norm": 0.4914052442580082, + "learning_rate": 8.80666414358389e-06, + "loss": 0.2386, + "step": 2714 + }, + { + "epoch": 0.9066622140591084, + "grad_norm": 0.5300383794010388, + "learning_rate": 8.805403874050864e-06, + "loss": 0.2275, + "step": 2715 + }, + { + "epoch": 0.906996159625981, + "grad_norm": 0.4420574700910713, + "learning_rate": 8.804143029666775e-06, + "loss": 0.2207, + "step": 2716 + }, + { + "epoch": 0.9073301051928536, + "grad_norm": 0.4569049390299293, + "learning_rate": 8.802881610622089e-06, + "loss": 0.2102, + "step": 2717 + }, + { + "epoch": 0.9076640507597261, + "grad_norm": 0.4138994917550044, + "learning_rate": 8.801619617107359e-06, + "loss": 0.2229, + "step": 2718 + }, + { + "epoch": 0.9079979963265987, + "grad_norm": 0.4794838835897948, + "learning_rate": 8.800357049313222e-06, + "loss": 0.2354, + "step": 2719 + }, + { + "epoch": 0.9083319418934713, + "grad_norm": 0.4589463845784919, + "learning_rate": 8.799093907430406e-06, + "loss": 0.2195, + "step": 2720 + }, + { + "epoch": 0.9086658874603439, + "grad_norm": 0.40781923211396, + "learning_rate": 8.797830191649721e-06, + "loss": 0.2192, + "step": 2721 + }, + { + "epoch": 0.9089998330272165, + "grad_norm": 0.4177098084042151, + "learning_rate": 8.796565902162069e-06, + "loss": 0.2096, + "step": 2722 + }, + { + "epoch": 0.9093337785940891, + "grad_norm": 0.4323255277508984, + "learning_rate": 8.795301039158433e-06, + "loss": 0.2233, + "step": 2723 + }, + { + "epoch": 0.9096677241609618, + "grad_norm": 0.4587198185489595, + "learning_rate": 8.794035602829887e-06, + "loss": 0.2419, + "step": 2724 + }, + { + "epoch": 0.9100016697278344, + "grad_norm": 0.4542463004544109, + "learning_rate": 8.792769593367591e-06, + "loss": 0.2179, + "step": 2725 + }, + { + "epoch": 0.910335615294707, + "grad_norm": 0.4990418712075424, + "learning_rate": 8.79150301096279e-06, + "loss": 0.2238, + "step": 2726 + }, + { + "epoch": 0.9106695608615796, + "grad_norm": 0.41162778959199753, + "learning_rate": 8.790235855806814e-06, + "loss": 0.2127, + "step": 2727 + }, + { + "epoch": 0.9110035064284522, + "grad_norm": 0.4377098192081709, + "learning_rate": 8.788968128091084e-06, + "loss": 0.2171, + "step": 2728 + }, + { + "epoch": 0.9113374519953248, + "grad_norm": 0.4511181347047479, + "learning_rate": 8.787699828007104e-06, + "loss": 0.2264, + "step": 2729 + }, + { + "epoch": 0.9116713975621974, + "grad_norm": 0.44193992386166336, + "learning_rate": 8.786430955746468e-06, + "loss": 0.2142, + "step": 2730 + }, + { + "epoch": 0.91200534312907, + "grad_norm": 0.5164407737903689, + "learning_rate": 8.78516151150085e-06, + "loss": 0.2258, + "step": 2731 + }, + { + "epoch": 0.9123392886959426, + "grad_norm": 0.45102390120947355, + "learning_rate": 8.783891495462018e-06, + "loss": 0.2182, + "step": 2732 + }, + { + "epoch": 0.9126732342628152, + "grad_norm": 0.4717630713459086, + "learning_rate": 8.782620907821823e-06, + "loss": 0.216, + "step": 2733 + }, + { + "epoch": 0.9130071798296877, + "grad_norm": 0.4192665748513033, + "learning_rate": 8.781349748772198e-06, + "loss": 0.2054, + "step": 2734 + }, + { + "epoch": 0.9133411253965603, + "grad_norm": 0.5708074422120173, + "learning_rate": 8.780078018505172e-06, + "loss": 0.227, + "step": 2735 + }, + { + "epoch": 0.9136750709634329, + "grad_norm": 0.5610844903221539, + "learning_rate": 8.778805717212853e-06, + "loss": 0.2202, + "step": 2736 + }, + { + "epoch": 0.9140090165303055, + "grad_norm": 0.44429928394757645, + "learning_rate": 8.777532845087434e-06, + "loss": 0.2224, + "step": 2737 + }, + { + "epoch": 0.9143429620971781, + "grad_norm": 0.5344338868572158, + "learning_rate": 8.776259402321201e-06, + "loss": 0.235, + "step": 2738 + }, + { + "epoch": 0.9146769076640507, + "grad_norm": 0.5063152059042075, + "learning_rate": 8.774985389106521e-06, + "loss": 0.2272, + "step": 2739 + }, + { + "epoch": 0.9150108532309233, + "grad_norm": 0.5051461254697261, + "learning_rate": 8.77371080563585e-06, + "loss": 0.2251, + "step": 2740 + }, + { + "epoch": 0.915344798797796, + "grad_norm": 0.41201433720845737, + "learning_rate": 8.772435652101726e-06, + "loss": 0.2084, + "step": 2741 + }, + { + "epoch": 0.9156787443646686, + "grad_norm": 0.42455946149967577, + "learning_rate": 8.771159928696779e-06, + "loss": 0.2091, + "step": 2742 + }, + { + "epoch": 0.9160126899315412, + "grad_norm": 0.4605627342858186, + "learning_rate": 8.76988363561372e-06, + "loss": 0.2197, + "step": 2743 + }, + { + "epoch": 0.9163466354984138, + "grad_norm": 0.482110449293083, + "learning_rate": 8.76860677304535e-06, + "loss": 0.216, + "step": 2744 + }, + { + "epoch": 0.9166805810652864, + "grad_norm": 0.46195236737899087, + "learning_rate": 8.767329341184552e-06, + "loss": 0.2331, + "step": 2745 + }, + { + "epoch": 0.917014526632159, + "grad_norm": 0.4413188092360609, + "learning_rate": 8.766051340224297e-06, + "loss": 0.2171, + "step": 2746 + }, + { + "epoch": 0.9173484721990316, + "grad_norm": 0.48280257812503025, + "learning_rate": 8.764772770357646e-06, + "loss": 0.2182, + "step": 2747 + }, + { + "epoch": 0.9176824177659042, + "grad_norm": 0.4210285641166794, + "learning_rate": 8.763493631777738e-06, + "loss": 0.2044, + "step": 2748 + }, + { + "epoch": 0.9180163633327768, + "grad_norm": 0.4496036936278124, + "learning_rate": 8.762213924677802e-06, + "loss": 0.2224, + "step": 2749 + }, + { + "epoch": 0.9183503088996493, + "grad_norm": 0.4402813014352695, + "learning_rate": 8.760933649251155e-06, + "loss": 0.2157, + "step": 2750 + }, + { + "epoch": 0.9186842544665219, + "grad_norm": 0.43235518297213915, + "learning_rate": 8.759652805691197e-06, + "loss": 0.221, + "step": 2751 + }, + { + "epoch": 0.9190182000333945, + "grad_norm": 0.4424479020307994, + "learning_rate": 8.758371394191415e-06, + "loss": 0.22, + "step": 2752 + }, + { + "epoch": 0.9193521456002671, + "grad_norm": 0.4281531056346567, + "learning_rate": 8.75708941494538e-06, + "loss": 0.2127, + "step": 2753 + }, + { + "epoch": 0.9196860911671397, + "grad_norm": 0.48598727252634466, + "learning_rate": 8.75580686814675e-06, + "loss": 0.2201, + "step": 2754 + }, + { + "epoch": 0.9200200367340123, + "grad_norm": 0.4900282146647412, + "learning_rate": 8.75452375398927e-06, + "loss": 0.2202, + "step": 2755 + }, + { + "epoch": 0.9203539823008849, + "grad_norm": 0.41487723575509605, + "learning_rate": 8.753240072666769e-06, + "loss": 0.2172, + "step": 2756 + }, + { + "epoch": 0.9206879278677575, + "grad_norm": 0.46744752038473614, + "learning_rate": 8.751955824373161e-06, + "loss": 0.2299, + "step": 2757 + }, + { + "epoch": 0.9210218734346302, + "grad_norm": 0.3913659048757199, + "learning_rate": 8.750671009302448e-06, + "loss": 0.2142, + "step": 2758 + }, + { + "epoch": 0.9213558190015028, + "grad_norm": 0.3858840470760089, + "learning_rate": 8.749385627648717e-06, + "loss": 0.1974, + "step": 2759 + }, + { + "epoch": 0.9216897645683754, + "grad_norm": 0.46012895388328423, + "learning_rate": 8.748099679606139e-06, + "loss": 0.2263, + "step": 2760 + }, + { + "epoch": 0.922023710135248, + "grad_norm": 0.5056054844807831, + "learning_rate": 8.746813165368973e-06, + "loss": 0.2164, + "step": 2761 + }, + { + "epoch": 0.9223576557021206, + "grad_norm": 0.4099051015952098, + "learning_rate": 8.745526085131559e-06, + "loss": 0.2087, + "step": 2762 + }, + { + "epoch": 0.9226916012689932, + "grad_norm": 0.44267862491947235, + "learning_rate": 8.744238439088328e-06, + "loss": 0.2143, + "step": 2763 + }, + { + "epoch": 0.9230255468358658, + "grad_norm": 0.5002499141140306, + "learning_rate": 8.742950227433795e-06, + "loss": 0.227, + "step": 2764 + }, + { + "epoch": 0.9233594924027384, + "grad_norm": 0.4609375168550936, + "learning_rate": 8.741661450362559e-06, + "loss": 0.222, + "step": 2765 + }, + { + "epoch": 0.923693437969611, + "grad_norm": 0.40652027628765436, + "learning_rate": 8.740372108069304e-06, + "loss": 0.2063, + "step": 2766 + }, + { + "epoch": 0.9240273835364835, + "grad_norm": 0.46845775301622733, + "learning_rate": 8.739082200748799e-06, + "loss": 0.2056, + "step": 2767 + }, + { + "epoch": 0.9243613291033561, + "grad_norm": 0.43583945959938114, + "learning_rate": 8.737791728595903e-06, + "loss": 0.221, + "step": 2768 + }, + { + "epoch": 0.9246952746702287, + "grad_norm": 0.4211490574278276, + "learning_rate": 8.736500691805554e-06, + "loss": 0.2249, + "step": 2769 + }, + { + "epoch": 0.9250292202371013, + "grad_norm": 0.4928325266258507, + "learning_rate": 8.73520909057278e-06, + "loss": 0.2268, + "step": 2770 + }, + { + "epoch": 0.9253631658039739, + "grad_norm": 0.47059734345795795, + "learning_rate": 8.733916925092691e-06, + "loss": 0.231, + "step": 2771 + }, + { + "epoch": 0.9256971113708465, + "grad_norm": 0.45417002632161574, + "learning_rate": 8.732624195560487e-06, + "loss": 0.2389, + "step": 2772 + }, + { + "epoch": 0.9260310569377191, + "grad_norm": 0.3945646807349342, + "learning_rate": 8.731330902171447e-06, + "loss": 0.2168, + "step": 2773 + }, + { + "epoch": 0.9263650025045918, + "grad_norm": 0.44248145973303443, + "learning_rate": 8.730037045120941e-06, + "loss": 0.2238, + "step": 2774 + }, + { + "epoch": 0.9266989480714644, + "grad_norm": 0.4674819994664436, + "learning_rate": 8.728742624604418e-06, + "loss": 0.2341, + "step": 2775 + }, + { + "epoch": 0.927032893638337, + "grad_norm": 0.64231489409082, + "learning_rate": 8.727447640817417e-06, + "loss": 0.2133, + "step": 2776 + }, + { + "epoch": 0.9273668392052096, + "grad_norm": 0.44146865573432903, + "learning_rate": 8.726152093955561e-06, + "loss": 0.2104, + "step": 2777 + }, + { + "epoch": 0.9277007847720822, + "grad_norm": 0.43933661617710457, + "learning_rate": 8.724855984214558e-06, + "loss": 0.2321, + "step": 2778 + }, + { + "epoch": 0.9280347303389548, + "grad_norm": 0.46754943764880913, + "learning_rate": 8.723559311790197e-06, + "loss": 0.227, + "step": 2779 + }, + { + "epoch": 0.9283686759058274, + "grad_norm": 0.41188188249517227, + "learning_rate": 8.722262076878361e-06, + "loss": 0.2002, + "step": 2780 + }, + { + "epoch": 0.9287026214727, + "grad_norm": 0.4430354887476343, + "learning_rate": 8.720964279675009e-06, + "loss": 0.2115, + "step": 2781 + }, + { + "epoch": 0.9290365670395726, + "grad_norm": 0.4164609143001292, + "learning_rate": 8.71966592037619e-06, + "loss": 0.2121, + "step": 2782 + }, + { + "epoch": 0.9293705126064451, + "grad_norm": 0.42842924540161714, + "learning_rate": 8.718366999178037e-06, + "loss": 0.2219, + "step": 2783 + }, + { + "epoch": 0.9297044581733177, + "grad_norm": 0.46668660218796776, + "learning_rate": 8.717067516276764e-06, + "loss": 0.2293, + "step": 2784 + }, + { + "epoch": 0.9300384037401903, + "grad_norm": 0.4467610399518897, + "learning_rate": 8.715767471868679e-06, + "loss": 0.2166, + "step": 2785 + }, + { + "epoch": 0.9303723493070629, + "grad_norm": 0.5306058118803246, + "learning_rate": 8.714466866150162e-06, + "loss": 0.2515, + "step": 2786 + }, + { + "epoch": 0.9307062948739355, + "grad_norm": 0.4538833004379203, + "learning_rate": 8.71316569931769e-06, + "loss": 0.2245, + "step": 2787 + }, + { + "epoch": 0.9310402404408081, + "grad_norm": 0.44120044003214, + "learning_rate": 8.71186397156782e-06, + "loss": 0.2191, + "step": 2788 + }, + { + "epoch": 0.9313741860076807, + "grad_norm": 0.4610836615076066, + "learning_rate": 8.710561683097189e-06, + "loss": 0.2178, + "step": 2789 + }, + { + "epoch": 0.9317081315745533, + "grad_norm": 0.4363676340228982, + "learning_rate": 8.709258834102525e-06, + "loss": 0.2242, + "step": 2790 + }, + { + "epoch": 0.932042077141426, + "grad_norm": 0.6756307435936366, + "learning_rate": 8.70795542478064e-06, + "loss": 0.231, + "step": 2791 + }, + { + "epoch": 0.9323760227082986, + "grad_norm": 0.5041996579273476, + "learning_rate": 8.706651455328427e-06, + "loss": 0.2163, + "step": 2792 + }, + { + "epoch": 0.9327099682751712, + "grad_norm": 0.4698362067299536, + "learning_rate": 8.70534692594287e-06, + "loss": 0.2216, + "step": 2793 + }, + { + "epoch": 0.9330439138420438, + "grad_norm": 0.4820217295468483, + "learning_rate": 8.704041836821029e-06, + "loss": 0.2287, + "step": 2794 + }, + { + "epoch": 0.9333778594089164, + "grad_norm": 0.4577816146622304, + "learning_rate": 8.702736188160055e-06, + "loss": 0.2085, + "step": 2795 + }, + { + "epoch": 0.933711804975789, + "grad_norm": 0.4804066031180158, + "learning_rate": 8.70142998015718e-06, + "loss": 0.2128, + "step": 2796 + }, + { + "epoch": 0.9340457505426616, + "grad_norm": 0.48646393253865444, + "learning_rate": 8.700123213009726e-06, + "loss": 0.2315, + "step": 2797 + }, + { + "epoch": 0.9343796961095342, + "grad_norm": 0.4392885717258637, + "learning_rate": 8.698815886915094e-06, + "loss": 0.2372, + "step": 2798 + }, + { + "epoch": 0.9347136416764067, + "grad_norm": 0.4573456253338256, + "learning_rate": 8.697508002070766e-06, + "loss": 0.2237, + "step": 2799 + }, + { + "epoch": 0.9350475872432793, + "grad_norm": 1.0167289235959724, + "learning_rate": 8.696199558674321e-06, + "loss": 0.2282, + "step": 2800 + }, + { + "epoch": 0.9353815328101519, + "grad_norm": 0.49862618680445076, + "learning_rate": 8.69489055692341e-06, + "loss": 0.2191, + "step": 2801 + }, + { + "epoch": 0.9357154783770245, + "grad_norm": 0.4071314628849528, + "learning_rate": 8.693580997015775e-06, + "loss": 0.2145, + "step": 2802 + }, + { + "epoch": 0.9360494239438971, + "grad_norm": 0.48212789403115414, + "learning_rate": 8.692270879149241e-06, + "loss": 0.2264, + "step": 2803 + }, + { + "epoch": 0.9363833695107697, + "grad_norm": 0.46241253052454173, + "learning_rate": 8.690960203521713e-06, + "loss": 0.2084, + "step": 2804 + }, + { + "epoch": 0.9367173150776423, + "grad_norm": 0.5117131305242669, + "learning_rate": 8.689648970331188e-06, + "loss": 0.2498, + "step": 2805 + }, + { + "epoch": 0.9370512606445149, + "grad_norm": 0.488625905932195, + "learning_rate": 8.68833717977574e-06, + "loss": 0.205, + "step": 2806 + }, + { + "epoch": 0.9373852062113875, + "grad_norm": 0.46934650535365496, + "learning_rate": 8.687024832053534e-06, + "loss": 0.209, + "step": 2807 + }, + { + "epoch": 0.9377191517782602, + "grad_norm": 0.4876878801995502, + "learning_rate": 8.685711927362815e-06, + "loss": 0.2166, + "step": 2808 + }, + { + "epoch": 0.9380530973451328, + "grad_norm": 0.47415720928263255, + "learning_rate": 8.68439846590191e-06, + "loss": 0.2019, + "step": 2809 + }, + { + "epoch": 0.9383870429120054, + "grad_norm": 0.5071405923836091, + "learning_rate": 8.683084447869234e-06, + "loss": 0.2193, + "step": 2810 + }, + { + "epoch": 0.938720988478878, + "grad_norm": 0.5342801311116051, + "learning_rate": 8.681769873463286e-06, + "loss": 0.2238, + "step": 2811 + }, + { + "epoch": 0.9390549340457506, + "grad_norm": 0.4967761833386368, + "learning_rate": 8.680454742882647e-06, + "loss": 0.2319, + "step": 2812 + }, + { + "epoch": 0.9393888796126232, + "grad_norm": 0.45077129590302223, + "learning_rate": 8.679139056325983e-06, + "loss": 0.2295, + "step": 2813 + }, + { + "epoch": 0.9397228251794958, + "grad_norm": 0.4979759329992662, + "learning_rate": 8.677822813992046e-06, + "loss": 0.2261, + "step": 2814 + }, + { + "epoch": 0.9400567707463683, + "grad_norm": 0.4950847343176679, + "learning_rate": 8.676506016079664e-06, + "loss": 0.2238, + "step": 2815 + }, + { + "epoch": 0.9403907163132409, + "grad_norm": 0.40708194153685967, + "learning_rate": 8.675188662787762e-06, + "loss": 0.2187, + "step": 2816 + }, + { + "epoch": 0.9407246618801135, + "grad_norm": 0.5521110478916056, + "learning_rate": 8.673870754315336e-06, + "loss": 0.2053, + "step": 2817 + }, + { + "epoch": 0.9410586074469861, + "grad_norm": 0.4442733755427678, + "learning_rate": 8.672552290861478e-06, + "loss": 0.2164, + "step": 2818 + }, + { + "epoch": 0.9413925530138587, + "grad_norm": 0.4553102853972935, + "learning_rate": 8.67123327262535e-06, + "loss": 0.2169, + "step": 2819 + }, + { + "epoch": 0.9417264985807313, + "grad_norm": 0.4306856509162607, + "learning_rate": 8.669913699806209e-06, + "loss": 0.2096, + "step": 2820 + }, + { + "epoch": 0.9420604441476039, + "grad_norm": 0.5401958009078948, + "learning_rate": 8.668593572603394e-06, + "loss": 0.2282, + "step": 2821 + }, + { + "epoch": 0.9423943897144765, + "grad_norm": 0.5107141764908304, + "learning_rate": 8.667272891216323e-06, + "loss": 0.2359, + "step": 2822 + }, + { + "epoch": 0.9427283352813491, + "grad_norm": 0.45683098172915115, + "learning_rate": 8.6659516558445e-06, + "loss": 0.2207, + "step": 2823 + }, + { + "epoch": 0.9430622808482217, + "grad_norm": 0.5311817625144637, + "learning_rate": 8.664629866687514e-06, + "loss": 0.2377, + "step": 2824 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 0.441871608821732, + "learning_rate": 8.663307523945038e-06, + "loss": 0.2185, + "step": 2825 + }, + { + "epoch": 0.943730171981967, + "grad_norm": 0.4506561928586293, + "learning_rate": 8.661984627816827e-06, + "loss": 0.2136, + "step": 2826 + }, + { + "epoch": 0.9440641175488396, + "grad_norm": 0.40555871123613024, + "learning_rate": 8.660661178502719e-06, + "loss": 0.2094, + "step": 2827 + }, + { + "epoch": 0.9443980631157122, + "grad_norm": 0.48122612843516865, + "learning_rate": 8.659337176202636e-06, + "loss": 0.2177, + "step": 2828 + }, + { + "epoch": 0.9447320086825848, + "grad_norm": 0.493730557455204, + "learning_rate": 8.658012621116585e-06, + "loss": 0.2033, + "step": 2829 + }, + { + "epoch": 0.9450659542494574, + "grad_norm": 0.39913730987982116, + "learning_rate": 8.656687513444656e-06, + "loss": 0.1957, + "step": 2830 + }, + { + "epoch": 0.94539989981633, + "grad_norm": 0.43231231966574224, + "learning_rate": 8.655361853387024e-06, + "loss": 0.2212, + "step": 2831 + }, + { + "epoch": 0.9457338453832025, + "grad_norm": 0.5034429330788713, + "learning_rate": 8.654035641143944e-06, + "loss": 0.2266, + "step": 2832 + }, + { + "epoch": 0.9460677909500751, + "grad_norm": 0.5248570196879081, + "learning_rate": 8.652708876915752e-06, + "loss": 0.2183, + "step": 2833 + }, + { + "epoch": 0.9464017365169477, + "grad_norm": 0.47638029426551504, + "learning_rate": 8.651381560902876e-06, + "loss": 0.2257, + "step": 2834 + }, + { + "epoch": 0.9467356820838203, + "grad_norm": 0.4872575546717003, + "learning_rate": 8.650053693305824e-06, + "loss": 0.2371, + "step": 2835 + }, + { + "epoch": 0.9470696276506929, + "grad_norm": 0.535075069115959, + "learning_rate": 8.648725274325182e-06, + "loss": 0.2206, + "step": 2836 + }, + { + "epoch": 0.9474035732175655, + "grad_norm": 0.48011848827739007, + "learning_rate": 8.647396304161625e-06, + "loss": 0.2204, + "step": 2837 + }, + { + "epoch": 0.9477375187844381, + "grad_norm": 0.43107044927809024, + "learning_rate": 8.64606678301591e-06, + "loss": 0.2143, + "step": 2838 + }, + { + "epoch": 0.9480714643513107, + "grad_norm": 0.48263476741763794, + "learning_rate": 8.644736711088874e-06, + "loss": 0.2128, + "step": 2839 + }, + { + "epoch": 0.9484054099181833, + "grad_norm": 0.4227729549713639, + "learning_rate": 8.643406088581446e-06, + "loss": 0.2181, + "step": 2840 + }, + { + "epoch": 0.948739355485056, + "grad_norm": 0.4423800504922258, + "learning_rate": 8.642074915694626e-06, + "loss": 0.2181, + "step": 2841 + }, + { + "epoch": 0.9490733010519286, + "grad_norm": 0.49729486477624335, + "learning_rate": 8.640743192629507e-06, + "loss": 0.2196, + "step": 2842 + }, + { + "epoch": 0.9494072466188012, + "grad_norm": 0.48595434515596525, + "learning_rate": 8.63941091958726e-06, + "loss": 0.2181, + "step": 2843 + }, + { + "epoch": 0.9497411921856738, + "grad_norm": 0.5001238407946756, + "learning_rate": 8.638078096769141e-06, + "loss": 0.2209, + "step": 2844 + }, + { + "epoch": 0.9500751377525464, + "grad_norm": 0.4720041136115112, + "learning_rate": 8.636744724376488e-06, + "loss": 0.2214, + "step": 2845 + }, + { + "epoch": 0.950409083319419, + "grad_norm": 0.48399406369761006, + "learning_rate": 8.635410802610724e-06, + "loss": 0.2292, + "step": 2846 + }, + { + "epoch": 0.9507430288862916, + "grad_norm": 0.4332190126394425, + "learning_rate": 8.634076331673354e-06, + "loss": 0.2182, + "step": 2847 + }, + { + "epoch": 0.9510769744531641, + "grad_norm": 0.47377715160398876, + "learning_rate": 8.632741311765962e-06, + "loss": 0.2217, + "step": 2848 + }, + { + "epoch": 0.9514109200200367, + "grad_norm": 0.4803537971441186, + "learning_rate": 8.631405743090223e-06, + "loss": 0.2224, + "step": 2849 + }, + { + "epoch": 0.9517448655869093, + "grad_norm": 0.44465165141275165, + "learning_rate": 8.630069625847885e-06, + "loss": 0.212, + "step": 2850 + }, + { + "epoch": 0.9520788111537819, + "grad_norm": 0.4189509028928389, + "learning_rate": 8.628732960240788e-06, + "loss": 0.2201, + "step": 2851 + }, + { + "epoch": 0.9524127567206545, + "grad_norm": 0.4072448497585389, + "learning_rate": 8.627395746470852e-06, + "loss": 0.1999, + "step": 2852 + }, + { + "epoch": 0.9527467022875271, + "grad_norm": 0.5302490534823262, + "learning_rate": 8.626057984740077e-06, + "loss": 0.2235, + "step": 2853 + }, + { + "epoch": 0.9530806478543997, + "grad_norm": 0.4770176390794186, + "learning_rate": 8.624719675250547e-06, + "loss": 0.2092, + "step": 2854 + }, + { + "epoch": 0.9534145934212723, + "grad_norm": 0.4649548910468379, + "learning_rate": 8.623380818204431e-06, + "loss": 0.2206, + "step": 2855 + }, + { + "epoch": 0.9537485389881449, + "grad_norm": 0.41794317440850726, + "learning_rate": 8.622041413803979e-06, + "loss": 0.2299, + "step": 2856 + }, + { + "epoch": 0.9540824845550175, + "grad_norm": 0.4270595046761071, + "learning_rate": 8.620701462251522e-06, + "loss": 0.2274, + "step": 2857 + }, + { + "epoch": 0.9544164301218901, + "grad_norm": 0.5605112243213927, + "learning_rate": 8.619360963749478e-06, + "loss": 0.2366, + "step": 2858 + }, + { + "epoch": 0.9547503756887628, + "grad_norm": 0.4315029266321909, + "learning_rate": 8.618019918500342e-06, + "loss": 0.2155, + "step": 2859 + }, + { + "epoch": 0.9550843212556354, + "grad_norm": 0.41819570091246855, + "learning_rate": 8.616678326706698e-06, + "loss": 0.2032, + "step": 2860 + }, + { + "epoch": 0.955418266822508, + "grad_norm": 0.4720800653986111, + "learning_rate": 8.615336188571208e-06, + "loss": 0.2106, + "step": 2861 + }, + { + "epoch": 0.9557522123893806, + "grad_norm": 0.4336533842521673, + "learning_rate": 8.613993504296617e-06, + "loss": 0.2106, + "step": 2862 + }, + { + "epoch": 0.9560861579562532, + "grad_norm": 0.4842098122111017, + "learning_rate": 8.612650274085755e-06, + "loss": 0.2153, + "step": 2863 + }, + { + "epoch": 0.9564201035231257, + "grad_norm": 0.46478104433885903, + "learning_rate": 8.61130649814153e-06, + "loss": 0.2172, + "step": 2864 + }, + { + "epoch": 0.9567540490899983, + "grad_norm": 0.3995110999710848, + "learning_rate": 8.609962176666936e-06, + "loss": 0.2008, + "step": 2865 + }, + { + "epoch": 0.9570879946568709, + "grad_norm": 0.4619066958368434, + "learning_rate": 8.608617309865051e-06, + "loss": 0.2247, + "step": 2866 + }, + { + "epoch": 0.9574219402237435, + "grad_norm": 0.4772269876791035, + "learning_rate": 8.60727189793903e-06, + "loss": 0.2238, + "step": 2867 + }, + { + "epoch": 0.9577558857906161, + "grad_norm": 0.45025507277873233, + "learning_rate": 8.605925941092114e-06, + "loss": 0.2321, + "step": 2868 + }, + { + "epoch": 0.9580898313574887, + "grad_norm": 0.4240250807538226, + "learning_rate": 8.604579439527627e-06, + "loss": 0.2221, + "step": 2869 + }, + { + "epoch": 0.9584237769243613, + "grad_norm": 0.4467352359560645, + "learning_rate": 8.603232393448974e-06, + "loss": 0.2115, + "step": 2870 + }, + { + "epoch": 0.9587577224912339, + "grad_norm": 0.43281845584862416, + "learning_rate": 8.601884803059641e-06, + "loss": 0.2084, + "step": 2871 + }, + { + "epoch": 0.9590916680581065, + "grad_norm": 0.43146109431192975, + "learning_rate": 8.600536668563197e-06, + "loss": 0.2185, + "step": 2872 + }, + { + "epoch": 0.9594256136249791, + "grad_norm": 0.47249516324982266, + "learning_rate": 8.599187990163296e-06, + "loss": 0.2221, + "step": 2873 + }, + { + "epoch": 0.9597595591918517, + "grad_norm": 0.4698332572735109, + "learning_rate": 8.597838768063667e-06, + "loss": 0.23, + "step": 2874 + }, + { + "epoch": 0.9600935047587243, + "grad_norm": 0.5412304324762713, + "learning_rate": 8.596489002468132e-06, + "loss": 0.2245, + "step": 2875 + }, + { + "epoch": 0.960427450325597, + "grad_norm": 0.44637589874465433, + "learning_rate": 8.595138693580583e-06, + "loss": 0.2233, + "step": 2876 + }, + { + "epoch": 0.9607613958924696, + "grad_norm": 0.43336083402117387, + "learning_rate": 8.593787841605004e-06, + "loss": 0.2048, + "step": 2877 + }, + { + "epoch": 0.9610953414593422, + "grad_norm": 0.4507857608921773, + "learning_rate": 8.592436446745457e-06, + "loss": 0.2187, + "step": 2878 + }, + { + "epoch": 0.9614292870262148, + "grad_norm": 0.4137810248936209, + "learning_rate": 8.591084509206085e-06, + "loss": 0.2288, + "step": 2879 + }, + { + "epoch": 0.9617632325930874, + "grad_norm": 0.4381991589239642, + "learning_rate": 8.589732029191113e-06, + "loss": 0.2156, + "step": 2880 + }, + { + "epoch": 0.9620971781599599, + "grad_norm": 0.46254816718933667, + "learning_rate": 8.588379006904852e-06, + "loss": 0.2345, + "step": 2881 + }, + { + "epoch": 0.9624311237268325, + "grad_norm": 0.447052657136721, + "learning_rate": 8.587025442551689e-06, + "loss": 0.211, + "step": 2882 + }, + { + "epoch": 0.9627650692937051, + "grad_norm": 0.4220565998823512, + "learning_rate": 8.585671336336096e-06, + "loss": 0.2142, + "step": 2883 + }, + { + "epoch": 0.9630990148605777, + "grad_norm": 0.46402238237283716, + "learning_rate": 8.58431668846263e-06, + "loss": 0.2124, + "step": 2884 + }, + { + "epoch": 0.9634329604274503, + "grad_norm": 0.8148299624701707, + "learning_rate": 8.582961499135925e-06, + "loss": 0.2138, + "step": 2885 + }, + { + "epoch": 0.9637669059943229, + "grad_norm": 0.4218423758746241, + "learning_rate": 8.581605768560694e-06, + "loss": 0.2237, + "step": 2886 + }, + { + "epoch": 0.9641008515611955, + "grad_norm": 0.42468992711883435, + "learning_rate": 8.580249496941742e-06, + "loss": 0.2222, + "step": 2887 + }, + { + "epoch": 0.9644347971280681, + "grad_norm": 0.45231316231310115, + "learning_rate": 8.578892684483947e-06, + "loss": 0.2304, + "step": 2888 + }, + { + "epoch": 0.9647687426949407, + "grad_norm": 0.39665581763729213, + "learning_rate": 8.577535331392272e-06, + "loss": 0.2061, + "step": 2889 + }, + { + "epoch": 0.9651026882618133, + "grad_norm": 0.42218622196372374, + "learning_rate": 8.57617743787176e-06, + "loss": 0.214, + "step": 2890 + }, + { + "epoch": 0.9654366338286859, + "grad_norm": 0.4327387439898741, + "learning_rate": 8.574819004127539e-06, + "loss": 0.2165, + "step": 2891 + }, + { + "epoch": 0.9657705793955585, + "grad_norm": 0.4368794278929328, + "learning_rate": 8.573460030364816e-06, + "loss": 0.2057, + "step": 2892 + }, + { + "epoch": 0.9661045249624312, + "grad_norm": 0.46821955032272594, + "learning_rate": 8.572100516788878e-06, + "loss": 0.2122, + "step": 2893 + }, + { + "epoch": 0.9664384705293038, + "grad_norm": 0.5024454235492738, + "learning_rate": 8.570740463605096e-06, + "loss": 0.2301, + "step": 2894 + }, + { + "epoch": 0.9667724160961764, + "grad_norm": 0.42826897322512725, + "learning_rate": 8.569379871018925e-06, + "loss": 0.2359, + "step": 2895 + }, + { + "epoch": 0.967106361663049, + "grad_norm": 0.447750666872138, + "learning_rate": 8.568018739235895e-06, + "loss": 0.1986, + "step": 2896 + }, + { + "epoch": 0.9674403072299215, + "grad_norm": 0.45508421449926406, + "learning_rate": 8.566657068461624e-06, + "loss": 0.2089, + "step": 2897 + }, + { + "epoch": 0.9677742527967941, + "grad_norm": 0.4895779825553509, + "learning_rate": 8.565294858901804e-06, + "loss": 0.2276, + "step": 2898 + }, + { + "epoch": 0.9681081983636667, + "grad_norm": 0.4114005627874931, + "learning_rate": 8.563932110762218e-06, + "loss": 0.2125, + "step": 2899 + }, + { + "epoch": 0.9684421439305393, + "grad_norm": 0.44863986840084646, + "learning_rate": 8.562568824248722e-06, + "loss": 0.2294, + "step": 2900 + }, + { + "epoch": 0.9687760894974119, + "grad_norm": 0.45066913390824037, + "learning_rate": 8.561204999567258e-06, + "loss": 0.2125, + "step": 2901 + }, + { + "epoch": 0.9691100350642845, + "grad_norm": 0.7353204759248914, + "learning_rate": 8.559840636923845e-06, + "loss": 0.2262, + "step": 2902 + }, + { + "epoch": 0.9694439806311571, + "grad_norm": 0.4509345787133013, + "learning_rate": 8.55847573652459e-06, + "loss": 0.2242, + "step": 2903 + }, + { + "epoch": 0.9697779261980297, + "grad_norm": 0.41027552893332203, + "learning_rate": 8.557110298575674e-06, + "loss": 0.1956, + "step": 2904 + }, + { + "epoch": 0.9701118717649023, + "grad_norm": 0.3886765860123989, + "learning_rate": 8.555744323283364e-06, + "loss": 0.2062, + "step": 2905 + }, + { + "epoch": 0.9704458173317749, + "grad_norm": 0.4099609507610317, + "learning_rate": 8.554377810854006e-06, + "loss": 0.2229, + "step": 2906 + }, + { + "epoch": 0.9707797628986475, + "grad_norm": 0.4514135763925288, + "learning_rate": 8.553010761494029e-06, + "loss": 0.205, + "step": 2907 + }, + { + "epoch": 0.9711137084655201, + "grad_norm": 0.6008850226168184, + "learning_rate": 8.551643175409941e-06, + "loss": 0.2156, + "step": 2908 + }, + { + "epoch": 0.9714476540323927, + "grad_norm": 0.4142063360961322, + "learning_rate": 8.550275052808332e-06, + "loss": 0.2333, + "step": 2909 + }, + { + "epoch": 0.9717815995992654, + "grad_norm": 0.3950140050648195, + "learning_rate": 8.548906393895876e-06, + "loss": 0.202, + "step": 2910 + }, + { + "epoch": 0.972115545166138, + "grad_norm": 0.4092389771262237, + "learning_rate": 8.547537198879318e-06, + "loss": 0.221, + "step": 2911 + }, + { + "epoch": 0.9724494907330106, + "grad_norm": 0.4256297246287956, + "learning_rate": 8.546167467965496e-06, + "loss": 0.2134, + "step": 2912 + }, + { + "epoch": 0.9727834362998831, + "grad_norm": 0.4442048031336666, + "learning_rate": 8.544797201361324e-06, + "loss": 0.2213, + "step": 2913 + }, + { + "epoch": 0.9731173818667557, + "grad_norm": 0.7029604309389493, + "learning_rate": 8.543426399273796e-06, + "loss": 0.2621, + "step": 2914 + }, + { + "epoch": 0.9734513274336283, + "grad_norm": 0.45508518355245203, + "learning_rate": 8.542055061909988e-06, + "loss": 0.2271, + "step": 2915 + }, + { + "epoch": 0.9737852730005009, + "grad_norm": 0.4386269165717581, + "learning_rate": 8.540683189477057e-06, + "loss": 0.223, + "step": 2916 + }, + { + "epoch": 0.9741192185673735, + "grad_norm": 0.392448221999816, + "learning_rate": 8.539310782182238e-06, + "loss": 0.2021, + "step": 2917 + }, + { + "epoch": 0.9744531641342461, + "grad_norm": 0.42094965559359726, + "learning_rate": 8.537937840232853e-06, + "loss": 0.2052, + "step": 2918 + }, + { + "epoch": 0.9747871097011187, + "grad_norm": 0.454216033457181, + "learning_rate": 8.5365643638363e-06, + "loss": 0.2272, + "step": 2919 + }, + { + "epoch": 0.9751210552679913, + "grad_norm": 0.4225256473955746, + "learning_rate": 8.535190353200056e-06, + "loss": 0.2199, + "step": 2920 + }, + { + "epoch": 0.9754550008348639, + "grad_norm": 0.39002247751830826, + "learning_rate": 8.533815808531685e-06, + "loss": 0.2065, + "step": 2921 + }, + { + "epoch": 0.9757889464017365, + "grad_norm": 0.4266599827086228, + "learning_rate": 8.532440730038826e-06, + "loss": 0.2187, + "step": 2922 + }, + { + "epoch": 0.9761228919686091, + "grad_norm": 0.4205083010157622, + "learning_rate": 8.531065117929202e-06, + "loss": 0.2215, + "step": 2923 + }, + { + "epoch": 0.9764568375354817, + "grad_norm": 0.39958558170677844, + "learning_rate": 8.529688972410616e-06, + "loss": 0.2024, + "step": 2924 + }, + { + "epoch": 0.9767907831023543, + "grad_norm": 0.4258603303840793, + "learning_rate": 8.52831229369095e-06, + "loss": 0.2219, + "step": 2925 + }, + { + "epoch": 0.977124728669227, + "grad_norm": 0.46196673066836863, + "learning_rate": 8.526935081978166e-06, + "loss": 0.2247, + "step": 2926 + }, + { + "epoch": 0.9774586742360996, + "grad_norm": 0.39273292233721263, + "learning_rate": 8.52555733748031e-06, + "loss": 0.2047, + "step": 2927 + }, + { + "epoch": 0.9777926198029722, + "grad_norm": 0.43992124078925776, + "learning_rate": 8.524179060405507e-06, + "loss": 0.2215, + "step": 2928 + }, + { + "epoch": 0.9781265653698448, + "grad_norm": 0.43852567181358093, + "learning_rate": 8.52280025096196e-06, + "loss": 0.2194, + "step": 2929 + }, + { + "epoch": 0.9784605109367173, + "grad_norm": 0.3978127024829244, + "learning_rate": 8.521420909357956e-06, + "loss": 0.2048, + "step": 2930 + }, + { + "epoch": 0.9787944565035899, + "grad_norm": 0.5547762340317196, + "learning_rate": 8.52004103580186e-06, + "loss": 0.2273, + "step": 2931 + }, + { + "epoch": 0.9791284020704625, + "grad_norm": 0.43347864645126716, + "learning_rate": 8.51866063050212e-06, + "loss": 0.2086, + "step": 2932 + }, + { + "epoch": 0.9794623476373351, + "grad_norm": 0.4228219103235732, + "learning_rate": 8.51727969366726e-06, + "loss": 0.2133, + "step": 2933 + }, + { + "epoch": 0.9797962932042077, + "grad_norm": 0.44917306834090986, + "learning_rate": 8.515898225505885e-06, + "loss": 0.2042, + "step": 2934 + }, + { + "epoch": 0.9801302387710803, + "grad_norm": 0.46303289407329606, + "learning_rate": 8.514516226226688e-06, + "loss": 0.2111, + "step": 2935 + }, + { + "epoch": 0.9804641843379529, + "grad_norm": 0.4636036916535876, + "learning_rate": 8.513133696038432e-06, + "loss": 0.232, + "step": 2936 + }, + { + "epoch": 0.9807981299048255, + "grad_norm": 0.3664746812558084, + "learning_rate": 8.511750635149965e-06, + "loss": 0.1995, + "step": 2937 + }, + { + "epoch": 0.9811320754716981, + "grad_norm": 0.3944802889752811, + "learning_rate": 8.510367043770213e-06, + "loss": 0.1984, + "step": 2938 + }, + { + "epoch": 0.9814660210385707, + "grad_norm": 0.444656078405521, + "learning_rate": 8.508982922108188e-06, + "loss": 0.2209, + "step": 2939 + }, + { + "epoch": 0.9817999666054433, + "grad_norm": 0.4562655623442214, + "learning_rate": 8.507598270372977e-06, + "loss": 0.2312, + "step": 2940 + }, + { + "epoch": 0.9821339121723159, + "grad_norm": 0.41457463444511544, + "learning_rate": 8.506213088773744e-06, + "loss": 0.2142, + "step": 2941 + }, + { + "epoch": 0.9824678577391885, + "grad_norm": 0.39229806393694383, + "learning_rate": 8.504827377519743e-06, + "loss": 0.2077, + "step": 2942 + }, + { + "epoch": 0.9828018033060612, + "grad_norm": 0.45229518000097885, + "learning_rate": 8.503441136820296e-06, + "loss": 0.2343, + "step": 2943 + }, + { + "epoch": 0.9831357488729338, + "grad_norm": 0.4069598220008855, + "learning_rate": 8.502054366884813e-06, + "loss": 0.2062, + "step": 2944 + }, + { + "epoch": 0.9834696944398064, + "grad_norm": 0.5008800220194679, + "learning_rate": 8.500667067922784e-06, + "loss": 0.2183, + "step": 2945 + }, + { + "epoch": 0.9838036400066789, + "grad_norm": 0.4430084078638324, + "learning_rate": 8.499279240143776e-06, + "loss": 0.2272, + "step": 2946 + }, + { + "epoch": 0.9841375855735515, + "grad_norm": 0.4201159436753723, + "learning_rate": 8.497890883757434e-06, + "loss": 0.2145, + "step": 2947 + }, + { + "epoch": 0.9844715311404241, + "grad_norm": 0.4087927730039263, + "learning_rate": 8.496501998973489e-06, + "loss": 0.2124, + "step": 2948 + }, + { + "epoch": 0.9848054767072967, + "grad_norm": 0.4203559208003164, + "learning_rate": 8.495112586001747e-06, + "loss": 0.2237, + "step": 2949 + }, + { + "epoch": 0.9851394222741693, + "grad_norm": 0.464817002877942, + "learning_rate": 8.493722645052093e-06, + "loss": 0.2126, + "step": 2950 + }, + { + "epoch": 0.9854733678410419, + "grad_norm": 0.46608580721174564, + "learning_rate": 8.4923321763345e-06, + "loss": 0.2061, + "step": 2951 + }, + { + "epoch": 0.9858073134079145, + "grad_norm": 0.4967627432556802, + "learning_rate": 8.490941180059009e-06, + "loss": 0.2227, + "step": 2952 + }, + { + "epoch": 0.9861412589747871, + "grad_norm": 0.37913666532255536, + "learning_rate": 8.489549656435748e-06, + "loss": 0.2015, + "step": 2953 + }, + { + "epoch": 0.9864752045416597, + "grad_norm": 0.4623920401155906, + "learning_rate": 8.488157605674924e-06, + "loss": 0.2315, + "step": 2954 + }, + { + "epoch": 0.9868091501085323, + "grad_norm": 0.4250267254101623, + "learning_rate": 8.486765027986821e-06, + "loss": 0.2106, + "step": 2955 + }, + { + "epoch": 0.9871430956754049, + "grad_norm": 0.4378715394618819, + "learning_rate": 8.485371923581807e-06, + "loss": 0.2234, + "step": 2956 + }, + { + "epoch": 0.9874770412422775, + "grad_norm": 0.39969124237667886, + "learning_rate": 8.483978292670324e-06, + "loss": 0.2013, + "step": 2957 + }, + { + "epoch": 0.9878109868091501, + "grad_norm": 0.4052628643444376, + "learning_rate": 8.482584135462896e-06, + "loss": 0.2049, + "step": 2958 + }, + { + "epoch": 0.9881449323760227, + "grad_norm": 0.5345263629856472, + "learning_rate": 8.48118945217013e-06, + "loss": 0.2483, + "step": 2959 + }, + { + "epoch": 0.9884788779428954, + "grad_norm": 0.4411343257938832, + "learning_rate": 8.479794243002707e-06, + "loss": 0.2185, + "step": 2960 + }, + { + "epoch": 0.988812823509768, + "grad_norm": 0.41540826133254866, + "learning_rate": 8.47839850817139e-06, + "loss": 0.2125, + "step": 2961 + }, + { + "epoch": 0.9891467690766405, + "grad_norm": 0.44157585681599704, + "learning_rate": 8.477002247887024e-06, + "loss": 0.1991, + "step": 2962 + }, + { + "epoch": 0.9894807146435131, + "grad_norm": 0.4391424439864253, + "learning_rate": 8.475605462360525e-06, + "loss": 0.2279, + "step": 2963 + }, + { + "epoch": 0.9898146602103857, + "grad_norm": 0.5046416290709523, + "learning_rate": 8.474208151802898e-06, + "loss": 0.2396, + "step": 2964 + }, + { + "epoch": 0.9901486057772583, + "grad_norm": 0.4567927616624758, + "learning_rate": 8.472810316425223e-06, + "loss": 0.2277, + "step": 2965 + }, + { + "epoch": 0.9904825513441309, + "grad_norm": 0.4022340376300813, + "learning_rate": 8.471411956438657e-06, + "loss": 0.2108, + "step": 2966 + }, + { + "epoch": 0.9908164969110035, + "grad_norm": 0.4798602783262579, + "learning_rate": 8.470013072054442e-06, + "loss": 0.2264, + "step": 2967 + }, + { + "epoch": 0.9911504424778761, + "grad_norm": 0.42605908508496787, + "learning_rate": 8.468613663483894e-06, + "loss": 0.2163, + "step": 2968 + }, + { + "epoch": 0.9914843880447487, + "grad_norm": 0.7310766075725814, + "learning_rate": 8.467213730938408e-06, + "loss": 0.2194, + "step": 2969 + }, + { + "epoch": 0.9918183336116213, + "grad_norm": 0.40667359914165013, + "learning_rate": 8.465813274629466e-06, + "loss": 0.214, + "step": 2970 + }, + { + "epoch": 0.9921522791784939, + "grad_norm": 0.43679647168307384, + "learning_rate": 8.46441229476862e-06, + "loss": 0.2407, + "step": 2971 + }, + { + "epoch": 0.9924862247453665, + "grad_norm": 0.41879553528039526, + "learning_rate": 8.463010791567503e-06, + "loss": 0.2183, + "step": 2972 + }, + { + "epoch": 0.9928201703122391, + "grad_norm": 0.43420445883979225, + "learning_rate": 8.461608765237832e-06, + "loss": 0.2221, + "step": 2973 + }, + { + "epoch": 0.9931541158791117, + "grad_norm": 0.42725936711171186, + "learning_rate": 8.460206215991398e-06, + "loss": 0.2236, + "step": 2974 + }, + { + "epoch": 0.9934880614459843, + "grad_norm": 0.45881095769887553, + "learning_rate": 8.458803144040071e-06, + "loss": 0.2374, + "step": 2975 + }, + { + "epoch": 0.993822007012857, + "grad_norm": 0.4298372232535485, + "learning_rate": 8.457399549595803e-06, + "loss": 0.2045, + "step": 2976 + }, + { + "epoch": 0.9941559525797296, + "grad_norm": 0.4446130630825048, + "learning_rate": 8.455995432870626e-06, + "loss": 0.2287, + "step": 2977 + }, + { + "epoch": 0.9944898981466022, + "grad_norm": 0.4227750823184624, + "learning_rate": 8.454590794076642e-06, + "loss": 0.2199, + "step": 2978 + }, + { + "epoch": 0.9948238437134747, + "grad_norm": 0.8347772521786165, + "learning_rate": 8.453185633426044e-06, + "loss": 0.2208, + "step": 2979 + }, + { + "epoch": 0.9951577892803473, + "grad_norm": 0.38705711704080775, + "learning_rate": 8.451779951131096e-06, + "loss": 0.2048, + "step": 2980 + }, + { + "epoch": 0.9954917348472199, + "grad_norm": 0.53915343201022, + "learning_rate": 8.450373747404143e-06, + "loss": 0.2052, + "step": 2981 + }, + { + "epoch": 0.9958256804140925, + "grad_norm": 0.42343951131201196, + "learning_rate": 8.448967022457611e-06, + "loss": 0.2123, + "step": 2982 + }, + { + "epoch": 0.9961596259809651, + "grad_norm": 0.39481374145182524, + "learning_rate": 8.447559776503998e-06, + "loss": 0.2132, + "step": 2983 + }, + { + "epoch": 0.9964935715478377, + "grad_norm": 0.42340444262938, + "learning_rate": 8.446152009755886e-06, + "loss": 0.2093, + "step": 2984 + }, + { + "epoch": 0.9968275171147103, + "grad_norm": 0.5923161067138794, + "learning_rate": 8.444743722425937e-06, + "loss": 0.2359, + "step": 2985 + }, + { + "epoch": 0.9971614626815829, + "grad_norm": 0.537393246167669, + "learning_rate": 8.443334914726886e-06, + "loss": 0.2035, + "step": 2986 + }, + { + "epoch": 0.9974954082484555, + "grad_norm": 0.47016253843865485, + "learning_rate": 8.441925586871556e-06, + "loss": 0.2039, + "step": 2987 + }, + { + "epoch": 0.9978293538153281, + "grad_norm": 0.3945282745483418, + "learning_rate": 8.440515739072836e-06, + "loss": 0.2017, + "step": 2988 + }, + { + "epoch": 0.9981632993822007, + "grad_norm": 0.42985799339322056, + "learning_rate": 8.439105371543703e-06, + "loss": 0.2134, + "step": 2989 + }, + { + "epoch": 0.9984972449490733, + "grad_norm": 0.43566435703151096, + "learning_rate": 8.43769448449721e-06, + "loss": 0.2139, + "step": 2990 + }, + { + "epoch": 0.9988311905159459, + "grad_norm": 0.43132891430456666, + "learning_rate": 8.436283078146488e-06, + "loss": 0.2223, + "step": 2991 + }, + { + "epoch": 0.9991651360828185, + "grad_norm": 0.546060681351875, + "learning_rate": 8.434871152704745e-06, + "loss": 0.219, + "step": 2992 + }, + { + "epoch": 0.9994990816496911, + "grad_norm": 0.45277563875621096, + "learning_rate": 8.433458708385272e-06, + "loss": 0.2125, + "step": 2993 + }, + { + "epoch": 0.9998330272165638, + "grad_norm": 0.4072238139150075, + "learning_rate": 8.432045745401431e-06, + "loss": 0.2231, + "step": 2994 + }, + { + "epoch": 0.9998330272165638, + "eval_loss": 0.21826396882534027, + "eval_runtime": 187.3402, + "eval_samples_per_second": 107.681, + "eval_steps_per_second": 1.687, + "step": 2994 + }, + { + "epoch": 1.0001669727834364, + "grad_norm": 0.37766855711266734, + "learning_rate": 8.430632263966672e-06, + "loss": 0.2026, + "step": 2995 + }, + { + "epoch": 1.0005009183503089, + "grad_norm": 0.42746862033576705, + "learning_rate": 8.429218264294512e-06, + "loss": 0.1963, + "step": 2996 + }, + { + "epoch": 1.0008348639171816, + "grad_norm": 0.42313117231942904, + "learning_rate": 8.427803746598557e-06, + "loss": 0.1963, + "step": 2997 + }, + { + "epoch": 1.001168809484054, + "grad_norm": 0.40410590824339193, + "learning_rate": 8.426388711092486e-06, + "loss": 0.1937, + "step": 2998 + }, + { + "epoch": 1.0015027550509268, + "grad_norm": 0.3865466171900227, + "learning_rate": 8.424973157990053e-06, + "loss": 0.1889, + "step": 2999 + }, + { + "epoch": 1.0018367006177993, + "grad_norm": 0.42882739122705976, + "learning_rate": 8.4235570875051e-06, + "loss": 0.2107, + "step": 3000 + }, + { + "epoch": 1.002170646184672, + "grad_norm": 0.45285004117271893, + "learning_rate": 8.422140499851536e-06, + "loss": 0.2176, + "step": 3001 + }, + { + "epoch": 1.0025045917515445, + "grad_norm": 0.411068092342574, + "learning_rate": 8.420723395243356e-06, + "loss": 0.1989, + "step": 3002 + }, + { + "epoch": 1.002838537318417, + "grad_norm": 0.4226831623463052, + "learning_rate": 8.419305773894628e-06, + "loss": 0.1881, + "step": 3003 + }, + { + "epoch": 1.0031724828852897, + "grad_norm": 0.4373537624300133, + "learning_rate": 8.417887636019504e-06, + "loss": 0.1967, + "step": 3004 + }, + { + "epoch": 1.0035064284521622, + "grad_norm": 0.4359816928757857, + "learning_rate": 8.416468981832207e-06, + "loss": 0.1959, + "step": 3005 + }, + { + "epoch": 1.003840374019035, + "grad_norm": 0.4397588855669126, + "learning_rate": 8.415049811547043e-06, + "loss": 0.1953, + "step": 3006 + }, + { + "epoch": 1.0041743195859074, + "grad_norm": 0.4361235547061362, + "learning_rate": 8.413630125378393e-06, + "loss": 0.1965, + "step": 3007 + }, + { + "epoch": 1.0045082651527801, + "grad_norm": 0.44202989818976113, + "learning_rate": 8.412209923540719e-06, + "loss": 0.2008, + "step": 3008 + }, + { + "epoch": 1.0048422107196526, + "grad_norm": 0.5020608356876495, + "learning_rate": 8.41078920624856e-06, + "loss": 0.2118, + "step": 3009 + }, + { + "epoch": 1.0051761562865253, + "grad_norm": 0.427867506708943, + "learning_rate": 8.409367973716527e-06, + "loss": 0.2082, + "step": 3010 + }, + { + "epoch": 1.0055101018533978, + "grad_norm": 0.3950096858664292, + "learning_rate": 8.40794622615932e-06, + "loss": 0.1906, + "step": 3011 + }, + { + "epoch": 1.0058440474202706, + "grad_norm": 0.46858017830302484, + "learning_rate": 8.406523963791709e-06, + "loss": 0.2002, + "step": 3012 + }, + { + "epoch": 1.006177992987143, + "grad_norm": 0.46112765843275033, + "learning_rate": 8.405101186828542e-06, + "loss": 0.2069, + "step": 3013 + }, + { + "epoch": 1.0065119385540158, + "grad_norm": 0.7898365532056968, + "learning_rate": 8.403677895484746e-06, + "loss": 0.2108, + "step": 3014 + }, + { + "epoch": 1.0068458841208883, + "grad_norm": 0.37587310991070744, + "learning_rate": 8.402254089975328e-06, + "loss": 0.188, + "step": 3015 + }, + { + "epoch": 1.007179829687761, + "grad_norm": 0.4706264177833519, + "learning_rate": 8.400829770515369e-06, + "loss": 0.1941, + "step": 3016 + }, + { + "epoch": 1.0075137752546335, + "grad_norm": 0.4025345426782933, + "learning_rate": 8.399404937320031e-06, + "loss": 0.187, + "step": 3017 + }, + { + "epoch": 1.0078477208215062, + "grad_norm": 0.4425795571729669, + "learning_rate": 8.397979590604548e-06, + "loss": 0.2139, + "step": 3018 + }, + { + "epoch": 1.0081816663883787, + "grad_norm": 0.41370338504034, + "learning_rate": 8.39655373058424e-06, + "loss": 0.1948, + "step": 3019 + }, + { + "epoch": 1.0085156119552512, + "grad_norm": 0.49065974447042454, + "learning_rate": 8.395127357474498e-06, + "loss": 0.2014, + "step": 3020 + }, + { + "epoch": 1.008849557522124, + "grad_norm": 0.42889156437278847, + "learning_rate": 8.39370047149079e-06, + "loss": 0.1955, + "step": 3021 + }, + { + "epoch": 1.0091835030889964, + "grad_norm": 0.4188098849683885, + "learning_rate": 8.39227307284867e-06, + "loss": 0.1857, + "step": 3022 + }, + { + "epoch": 1.0095174486558691, + "grad_norm": 0.4496220697484712, + "learning_rate": 8.390845161763756e-06, + "loss": 0.2073, + "step": 3023 + }, + { + "epoch": 1.0098513942227416, + "grad_norm": 0.4471112381315624, + "learning_rate": 8.389416738451755e-06, + "loss": 0.1859, + "step": 3024 + }, + { + "epoch": 1.0101853397896143, + "grad_norm": 0.5110693539703756, + "learning_rate": 8.387987803128447e-06, + "loss": 0.2178, + "step": 3025 + }, + { + "epoch": 1.0105192853564868, + "grad_norm": 0.5629538582211671, + "learning_rate": 8.386558356009691e-06, + "loss": 0.208, + "step": 3026 + }, + { + "epoch": 1.0108532309233595, + "grad_norm": 0.43042048600723654, + "learning_rate": 8.385128397311418e-06, + "loss": 0.191, + "step": 3027 + }, + { + "epoch": 1.011187176490232, + "grad_norm": 0.443595870073697, + "learning_rate": 8.383697927249641e-06, + "loss": 0.1974, + "step": 3028 + }, + { + "epoch": 1.0115211220571048, + "grad_norm": 0.5034619295424242, + "learning_rate": 8.382266946040453e-06, + "loss": 0.2031, + "step": 3029 + }, + { + "epoch": 1.0118550676239773, + "grad_norm": 0.48727180978971396, + "learning_rate": 8.380835453900017e-06, + "loss": 0.2093, + "step": 3030 + }, + { + "epoch": 1.01218901319085, + "grad_norm": 0.4396347345023138, + "learning_rate": 8.379403451044576e-06, + "loss": 0.197, + "step": 3031 + }, + { + "epoch": 1.0125229587577225, + "grad_norm": 0.44433522767753325, + "learning_rate": 8.377970937690455e-06, + "loss": 0.2064, + "step": 3032 + }, + { + "epoch": 1.0128569043245952, + "grad_norm": 0.42254899723175227, + "learning_rate": 8.376537914054048e-06, + "loss": 0.1891, + "step": 3033 + }, + { + "epoch": 1.0131908498914677, + "grad_norm": 0.4347839679518912, + "learning_rate": 8.37510438035183e-06, + "loss": 0.193, + "step": 3034 + }, + { + "epoch": 1.0135247954583404, + "grad_norm": 0.4350862330519345, + "learning_rate": 8.373670336800358e-06, + "loss": 0.2063, + "step": 3035 + }, + { + "epoch": 1.013858741025213, + "grad_norm": 0.5486004506039731, + "learning_rate": 8.372235783616258e-06, + "loss": 0.2109, + "step": 3036 + }, + { + "epoch": 1.0141926865920854, + "grad_norm": 0.41129617581244454, + "learning_rate": 8.370800721016232e-06, + "loss": 0.1962, + "step": 3037 + }, + { + "epoch": 1.014526632158958, + "grad_norm": 0.4257282696992318, + "learning_rate": 8.369365149217072e-06, + "loss": 0.1982, + "step": 3038 + }, + { + "epoch": 1.0148605777258306, + "grad_norm": 0.46728603791520196, + "learning_rate": 8.36792906843563e-06, + "loss": 0.1908, + "step": 3039 + }, + { + "epoch": 1.0151945232927033, + "grad_norm": 0.4749390533604057, + "learning_rate": 8.366492478888849e-06, + "loss": 0.2052, + "step": 3040 + }, + { + "epoch": 1.0155284688595758, + "grad_norm": 0.46810287444765153, + "learning_rate": 8.365055380793737e-06, + "loss": 0.2041, + "step": 3041 + }, + { + "epoch": 1.0158624144264485, + "grad_norm": 0.42069357580760686, + "learning_rate": 8.363617774367389e-06, + "loss": 0.2063, + "step": 3042 + }, + { + "epoch": 1.016196359993321, + "grad_norm": 0.38632263608832246, + "learning_rate": 8.36217965982697e-06, + "loss": 0.1904, + "step": 3043 + }, + { + "epoch": 1.0165303055601937, + "grad_norm": 0.43057927948036445, + "learning_rate": 8.360741037389727e-06, + "loss": 0.216, + "step": 3044 + }, + { + "epoch": 1.0168642511270662, + "grad_norm": 0.4222757576848028, + "learning_rate": 8.359301907272976e-06, + "loss": 0.1915, + "step": 3045 + }, + { + "epoch": 1.017198196693939, + "grad_norm": 0.3786692087465665, + "learning_rate": 8.35786226969412e-06, + "loss": 0.1788, + "step": 3046 + }, + { + "epoch": 1.0175321422608115, + "grad_norm": 0.45325514122512583, + "learning_rate": 8.356422124870629e-06, + "loss": 0.2057, + "step": 3047 + }, + { + "epoch": 1.0178660878276842, + "grad_norm": 0.40238918835240045, + "learning_rate": 8.354981473020056e-06, + "loss": 0.1918, + "step": 3048 + }, + { + "epoch": 1.0182000333945567, + "grad_norm": 0.4393213758556735, + "learning_rate": 8.353540314360027e-06, + "loss": 0.1956, + "step": 3049 + }, + { + "epoch": 1.0185339789614294, + "grad_norm": 0.41890810895805786, + "learning_rate": 8.352098649108246e-06, + "loss": 0.1966, + "step": 3050 + }, + { + "epoch": 1.0188679245283019, + "grad_norm": 0.4322860515599835, + "learning_rate": 8.350656477482497e-06, + "loss": 0.1988, + "step": 3051 + }, + { + "epoch": 1.0192018700951744, + "grad_norm": 0.4546480187544326, + "learning_rate": 8.349213799700635e-06, + "loss": 0.2049, + "step": 3052 + }, + { + "epoch": 1.019535815662047, + "grad_norm": 0.4717472602597745, + "learning_rate": 8.34777061598059e-06, + "loss": 0.2074, + "step": 3053 + }, + { + "epoch": 1.0198697612289196, + "grad_norm": 0.4401320901117122, + "learning_rate": 8.346326926540377e-06, + "loss": 0.2015, + "step": 3054 + }, + { + "epoch": 1.0202037067957923, + "grad_norm": 0.43023737821846486, + "learning_rate": 8.344882731598079e-06, + "loss": 0.2026, + "step": 3055 + }, + { + "epoch": 1.0205376523626648, + "grad_norm": 0.41879723678450614, + "learning_rate": 8.343438031371858e-06, + "loss": 0.1928, + "step": 3056 + }, + { + "epoch": 1.0208715979295375, + "grad_norm": 0.4906147418309719, + "learning_rate": 8.341992826079956e-06, + "loss": 0.2218, + "step": 3057 + }, + { + "epoch": 1.02120554349641, + "grad_norm": 0.475504726884439, + "learning_rate": 8.340547115940688e-06, + "loss": 0.1776, + "step": 3058 + }, + { + "epoch": 1.0215394890632827, + "grad_norm": 0.49917711995665354, + "learning_rate": 8.339100901172443e-06, + "loss": 0.2136, + "step": 3059 + }, + { + "epoch": 1.0218734346301552, + "grad_norm": 0.6420536276666094, + "learning_rate": 8.337654181993691e-06, + "loss": 0.1996, + "step": 3060 + }, + { + "epoch": 1.022207380197028, + "grad_norm": 0.586797523605855, + "learning_rate": 8.336206958622975e-06, + "loss": 0.2052, + "step": 3061 + }, + { + "epoch": 1.0225413257639004, + "grad_norm": 0.42846778032094296, + "learning_rate": 8.334759231278915e-06, + "loss": 0.1913, + "step": 3062 + }, + { + "epoch": 1.0228752713307732, + "grad_norm": 0.43006716318612825, + "learning_rate": 8.333311000180208e-06, + "loss": 0.1964, + "step": 3063 + }, + { + "epoch": 1.0232092168976457, + "grad_norm": 0.415041357651441, + "learning_rate": 8.331862265545627e-06, + "loss": 0.1903, + "step": 3064 + }, + { + "epoch": 1.0235431624645184, + "grad_norm": 0.4809451997950013, + "learning_rate": 8.330413027594019e-06, + "loss": 0.2193, + "step": 3065 + }, + { + "epoch": 1.0238771080313909, + "grad_norm": 0.4849800241783216, + "learning_rate": 8.328963286544309e-06, + "loss": 0.2022, + "step": 3066 + }, + { + "epoch": 1.0242110535982636, + "grad_norm": 0.4358252609774767, + "learning_rate": 8.327513042615496e-06, + "loss": 0.2153, + "step": 3067 + }, + { + "epoch": 1.024544999165136, + "grad_norm": 0.42611601243396807, + "learning_rate": 8.326062296026657e-06, + "loss": 0.1973, + "step": 3068 + }, + { + "epoch": 1.0248789447320086, + "grad_norm": 0.5112017276974804, + "learning_rate": 8.324611046996947e-06, + "loss": 0.219, + "step": 3069 + }, + { + "epoch": 1.0252128902988813, + "grad_norm": 0.4373035623997926, + "learning_rate": 8.32315929574559e-06, + "loss": 0.1959, + "step": 3070 + }, + { + "epoch": 1.0255468358657538, + "grad_norm": 0.3837563091069947, + "learning_rate": 8.321707042491895e-06, + "loss": 0.1844, + "step": 3071 + }, + { + "epoch": 1.0258807814326265, + "grad_norm": 0.45421219166684185, + "learning_rate": 8.320254287455238e-06, + "loss": 0.2056, + "step": 3072 + }, + { + "epoch": 1.026214726999499, + "grad_norm": 0.40781482071872827, + "learning_rate": 8.318801030855078e-06, + "loss": 0.1864, + "step": 3073 + }, + { + "epoch": 1.0265486725663717, + "grad_norm": 0.4706705010349072, + "learning_rate": 8.317347272910944e-06, + "loss": 0.2033, + "step": 3074 + }, + { + "epoch": 1.0268826181332442, + "grad_norm": 0.45550347872407415, + "learning_rate": 8.315893013842441e-06, + "loss": 0.2192, + "step": 3075 + }, + { + "epoch": 1.027216563700117, + "grad_norm": 0.4563220551157251, + "learning_rate": 8.31443825386926e-06, + "loss": 0.2164, + "step": 3076 + }, + { + "epoch": 1.0275505092669894, + "grad_norm": 0.4866578429632529, + "learning_rate": 8.312982993211151e-06, + "loss": 0.2222, + "step": 3077 + }, + { + "epoch": 1.0278844548338621, + "grad_norm": 0.4244291790269176, + "learning_rate": 8.311527232087951e-06, + "loss": 0.2031, + "step": 3078 + }, + { + "epoch": 1.0282184004007346, + "grad_norm": 0.4243539342498891, + "learning_rate": 8.310070970719573e-06, + "loss": 0.2022, + "step": 3079 + }, + { + "epoch": 1.0285523459676074, + "grad_norm": 0.4218336141097441, + "learning_rate": 8.308614209325997e-06, + "loss": 0.2044, + "step": 3080 + }, + { + "epoch": 1.0288862915344799, + "grad_norm": 0.4661210673858716, + "learning_rate": 8.30715694812729e-06, + "loss": 0.202, + "step": 3081 + }, + { + "epoch": 1.0292202371013526, + "grad_norm": 0.4768217692487075, + "learning_rate": 8.305699187343586e-06, + "loss": 0.2128, + "step": 3082 + }, + { + "epoch": 1.029554182668225, + "grad_norm": 0.46166941719548166, + "learning_rate": 8.304240927195094e-06, + "loss": 0.2108, + "step": 3083 + }, + { + "epoch": 1.0298881282350978, + "grad_norm": 0.43191760215598024, + "learning_rate": 8.302782167902103e-06, + "loss": 0.2062, + "step": 3084 + }, + { + "epoch": 1.0302220738019703, + "grad_norm": 0.41257731682620635, + "learning_rate": 8.30132290968498e-06, + "loss": 0.1818, + "step": 3085 + }, + { + "epoch": 1.0305560193688428, + "grad_norm": 0.43840781003063306, + "learning_rate": 8.299863152764158e-06, + "loss": 0.2066, + "step": 3086 + }, + { + "epoch": 1.0308899649357155, + "grad_norm": 0.4919766494174308, + "learning_rate": 8.298402897360152e-06, + "loss": 0.1962, + "step": 3087 + }, + { + "epoch": 1.031223910502588, + "grad_norm": 0.4370771922590659, + "learning_rate": 8.29694214369355e-06, + "loss": 0.1989, + "step": 3088 + }, + { + "epoch": 1.0315578560694607, + "grad_norm": 0.4522687494366065, + "learning_rate": 8.295480891985019e-06, + "loss": 0.197, + "step": 3089 + }, + { + "epoch": 1.0318918016363332, + "grad_norm": 0.459698970205727, + "learning_rate": 8.294019142455295e-06, + "loss": 0.215, + "step": 3090 + }, + { + "epoch": 1.032225747203206, + "grad_norm": 0.45768357278304855, + "learning_rate": 8.292556895325195e-06, + "loss": 0.1834, + "step": 3091 + }, + { + "epoch": 1.0325596927700784, + "grad_norm": 0.4779953772645454, + "learning_rate": 8.291094150815607e-06, + "loss": 0.2046, + "step": 3092 + }, + { + "epoch": 1.0328936383369511, + "grad_norm": 0.4110970900415633, + "learning_rate": 8.289630909147494e-06, + "loss": 0.1926, + "step": 3093 + }, + { + "epoch": 1.0332275839038236, + "grad_norm": 0.4263110732845005, + "learning_rate": 8.2881671705419e-06, + "loss": 0.1961, + "step": 3094 + }, + { + "epoch": 1.0335615294706963, + "grad_norm": 0.42351189564424013, + "learning_rate": 8.286702935219936e-06, + "loss": 0.1924, + "step": 3095 + }, + { + "epoch": 1.0338954750375688, + "grad_norm": 0.5111127526263297, + "learning_rate": 8.285238203402796e-06, + "loss": 0.2099, + "step": 3096 + }, + { + "epoch": 1.0342294206044416, + "grad_norm": 0.4395888062180706, + "learning_rate": 8.283772975311742e-06, + "loss": 0.2091, + "step": 3097 + }, + { + "epoch": 1.034563366171314, + "grad_norm": 0.40597673314935695, + "learning_rate": 8.282307251168116e-06, + "loss": 0.1948, + "step": 3098 + }, + { + "epoch": 1.0348973117381868, + "grad_norm": 0.41337566836535816, + "learning_rate": 8.28084103119333e-06, + "loss": 0.1956, + "step": 3099 + }, + { + "epoch": 1.0352312573050593, + "grad_norm": 0.4440489461102574, + "learning_rate": 8.279374315608877e-06, + "loss": 0.1849, + "step": 3100 + }, + { + "epoch": 1.0355652028719318, + "grad_norm": 0.43649748059427285, + "learning_rate": 8.27790710463632e-06, + "loss": 0.202, + "step": 3101 + }, + { + "epoch": 1.0358991484388045, + "grad_norm": 0.45527974348537775, + "learning_rate": 8.276439398497298e-06, + "loss": 0.2069, + "step": 3102 + }, + { + "epoch": 1.036233094005677, + "grad_norm": 0.4411849130395665, + "learning_rate": 8.274971197413527e-06, + "loss": 0.2018, + "step": 3103 + }, + { + "epoch": 1.0365670395725497, + "grad_norm": 0.432927432928707, + "learning_rate": 8.273502501606794e-06, + "loss": 0.1921, + "step": 3104 + }, + { + "epoch": 1.0369009851394222, + "grad_norm": 0.48645160430520945, + "learning_rate": 8.272033311298965e-06, + "loss": 0.2062, + "step": 3105 + }, + { + "epoch": 1.037234930706295, + "grad_norm": 0.4723298015246345, + "learning_rate": 8.270563626711979e-06, + "loss": 0.2191, + "step": 3106 + }, + { + "epoch": 1.0375688762731674, + "grad_norm": 0.4286934741750141, + "learning_rate": 8.269093448067845e-06, + "loss": 0.1974, + "step": 3107 + }, + { + "epoch": 1.0379028218400401, + "grad_norm": 0.4898091980655584, + "learning_rate": 8.267622775588653e-06, + "loss": 0.2071, + "step": 3108 + }, + { + "epoch": 1.0382367674069126, + "grad_norm": 0.42263697887339474, + "learning_rate": 8.266151609496567e-06, + "loss": 0.1943, + "step": 3109 + }, + { + "epoch": 1.0385707129737853, + "grad_norm": 0.4409916018139914, + "learning_rate": 8.26467995001382e-06, + "loss": 0.1845, + "step": 3110 + }, + { + "epoch": 1.0389046585406578, + "grad_norm": 0.46867210638599427, + "learning_rate": 8.26320779736273e-06, + "loss": 0.2111, + "step": 3111 + }, + { + "epoch": 1.0392386041075306, + "grad_norm": 1.1833877800166233, + "learning_rate": 8.261735151765678e-06, + "loss": 0.2006, + "step": 3112 + }, + { + "epoch": 1.039572549674403, + "grad_norm": 0.4392933903047106, + "learning_rate": 8.260262013445126e-06, + "loss": 0.1959, + "step": 3113 + }, + { + "epoch": 1.0399064952412758, + "grad_norm": 0.43251596850546686, + "learning_rate": 8.258788382623607e-06, + "loss": 0.1921, + "step": 3114 + }, + { + "epoch": 1.0402404408081483, + "grad_norm": 0.5317197119185111, + "learning_rate": 8.257314259523732e-06, + "loss": 0.2015, + "step": 3115 + }, + { + "epoch": 1.040574386375021, + "grad_norm": 0.4656601804952947, + "learning_rate": 8.255839644368185e-06, + "loss": 0.2033, + "step": 3116 + }, + { + "epoch": 1.0409083319418935, + "grad_norm": 0.48325787940709314, + "learning_rate": 8.254364537379725e-06, + "loss": 0.1973, + "step": 3117 + }, + { + "epoch": 1.041242277508766, + "grad_norm": 0.4615708346164602, + "learning_rate": 8.25288893878118e-06, + "loss": 0.2173, + "step": 3118 + }, + { + "epoch": 1.0415762230756387, + "grad_norm": 0.4910469321775321, + "learning_rate": 8.251412848795462e-06, + "loss": 0.2112, + "step": 3119 + }, + { + "epoch": 1.0419101686425112, + "grad_norm": 0.42906476123756765, + "learning_rate": 8.249936267645546e-06, + "loss": 0.2085, + "step": 3120 + }, + { + "epoch": 1.042244114209384, + "grad_norm": 0.4468814505175119, + "learning_rate": 8.248459195554492e-06, + "loss": 0.2027, + "step": 3121 + }, + { + "epoch": 1.0425780597762564, + "grad_norm": 0.4510701292444035, + "learning_rate": 8.246981632745428e-06, + "loss": 0.216, + "step": 3122 + }, + { + "epoch": 1.0429120053431291, + "grad_norm": 0.5012622614358289, + "learning_rate": 8.245503579441554e-06, + "loss": 0.2041, + "step": 3123 + }, + { + "epoch": 1.0432459509100016, + "grad_norm": 0.44604941183333585, + "learning_rate": 8.244025035866151e-06, + "loss": 0.2107, + "step": 3124 + }, + { + "epoch": 1.0435798964768743, + "grad_norm": 0.43182075677881904, + "learning_rate": 8.242546002242569e-06, + "loss": 0.2012, + "step": 3125 + }, + { + "epoch": 1.0439138420437468, + "grad_norm": 0.7065204162623006, + "learning_rate": 8.241066478794233e-06, + "loss": 0.2149, + "step": 3126 + }, + { + "epoch": 1.0442477876106195, + "grad_norm": 0.43548371799456587, + "learning_rate": 8.239586465744644e-06, + "loss": 0.1964, + "step": 3127 + }, + { + "epoch": 1.044581733177492, + "grad_norm": 0.45976736574310023, + "learning_rate": 8.238105963317376e-06, + "loss": 0.1977, + "step": 3128 + }, + { + "epoch": 1.0449156787443648, + "grad_norm": 0.45610032095662145, + "learning_rate": 8.236624971736071e-06, + "loss": 0.2006, + "step": 3129 + }, + { + "epoch": 1.0452496243112372, + "grad_norm": 0.4178927483884343, + "learning_rate": 8.235143491224458e-06, + "loss": 0.1941, + "step": 3130 + }, + { + "epoch": 1.04558356987811, + "grad_norm": 0.4450014435903185, + "learning_rate": 8.233661522006324e-06, + "loss": 0.2011, + "step": 3131 + }, + { + "epoch": 1.0459175154449825, + "grad_norm": 0.4678822230492103, + "learning_rate": 8.232179064305545e-06, + "loss": 0.2141, + "step": 3132 + }, + { + "epoch": 1.0462514610118552, + "grad_norm": 0.4659113518770199, + "learning_rate": 8.230696118346059e-06, + "loss": 0.2029, + "step": 3133 + }, + { + "epoch": 1.0465854065787277, + "grad_norm": 0.42572466204955733, + "learning_rate": 8.229212684351886e-06, + "loss": 0.2092, + "step": 3134 + }, + { + "epoch": 1.0469193521456002, + "grad_norm": 0.42976487899817956, + "learning_rate": 8.227728762547112e-06, + "loss": 0.2049, + "step": 3135 + }, + { + "epoch": 1.0472532977124729, + "grad_norm": 0.4285031948928849, + "learning_rate": 8.226244353155906e-06, + "loss": 0.1895, + "step": 3136 + }, + { + "epoch": 1.0475872432793454, + "grad_norm": 0.41985602222315516, + "learning_rate": 8.2247594564025e-06, + "loss": 0.1976, + "step": 3137 + }, + { + "epoch": 1.047921188846218, + "grad_norm": 0.44464462060650317, + "learning_rate": 8.22327407251121e-06, + "loss": 0.1917, + "step": 3138 + }, + { + "epoch": 1.0482551344130906, + "grad_norm": 0.4273282426042788, + "learning_rate": 8.221788201706416e-06, + "loss": 0.202, + "step": 3139 + }, + { + "epoch": 1.0485890799799633, + "grad_norm": 0.41653410454050144, + "learning_rate": 8.22030184421258e-06, + "loss": 0.2017, + "step": 3140 + }, + { + "epoch": 1.0489230255468358, + "grad_norm": 0.43858551157838355, + "learning_rate": 8.218815000254233e-06, + "loss": 0.2022, + "step": 3141 + }, + { + "epoch": 1.0492569711137085, + "grad_norm": 0.4908298917701791, + "learning_rate": 8.21732767005598e-06, + "loss": 0.2132, + "step": 3142 + }, + { + "epoch": 1.049590916680581, + "grad_norm": 0.4749605912490831, + "learning_rate": 8.215839853842498e-06, + "loss": 0.2158, + "step": 3143 + }, + { + "epoch": 1.0499248622474537, + "grad_norm": 0.4483578369643801, + "learning_rate": 8.214351551838541e-06, + "loss": 0.2043, + "step": 3144 + }, + { + "epoch": 1.0502588078143262, + "grad_norm": 0.42665670967430674, + "learning_rate": 8.212862764268936e-06, + "loss": 0.1972, + "step": 3145 + }, + { + "epoch": 1.050592753381199, + "grad_norm": 0.44815585260638, + "learning_rate": 8.21137349135858e-06, + "loss": 0.2022, + "step": 3146 + }, + { + "epoch": 1.0509266989480714, + "grad_norm": 0.42978041785750576, + "learning_rate": 8.209883733332444e-06, + "loss": 0.193, + "step": 3147 + }, + { + "epoch": 1.0512606445149442, + "grad_norm": 0.40783792703270516, + "learning_rate": 8.208393490415576e-06, + "loss": 0.2001, + "step": 3148 + }, + { + "epoch": 1.0515945900818167, + "grad_norm": 0.5034157622557892, + "learning_rate": 8.206902762833095e-06, + "loss": 0.1908, + "step": 3149 + }, + { + "epoch": 1.0519285356486892, + "grad_norm": 0.41596953468049164, + "learning_rate": 8.205411550810189e-06, + "loss": 0.2026, + "step": 3150 + }, + { + "epoch": 1.0522624812155619, + "grad_norm": 0.4387109004909959, + "learning_rate": 8.203919854572126e-06, + "loss": 0.1955, + "step": 3151 + }, + { + "epoch": 1.0525964267824344, + "grad_norm": 0.4400267808923557, + "learning_rate": 8.202427674344246e-06, + "loss": 0.1956, + "step": 3152 + }, + { + "epoch": 1.052930372349307, + "grad_norm": 0.42088665905677247, + "learning_rate": 8.200935010351958e-06, + "loss": 0.2036, + "step": 3153 + }, + { + "epoch": 1.0532643179161796, + "grad_norm": 0.42093977058051263, + "learning_rate": 8.199441862820746e-06, + "loss": 0.2094, + "step": 3154 + }, + { + "epoch": 1.0535982634830523, + "grad_norm": 0.41817125248997694, + "learning_rate": 8.197948231976169e-06, + "loss": 0.1905, + "step": 3155 + }, + { + "epoch": 1.0539322090499248, + "grad_norm": 0.42802200051982714, + "learning_rate": 8.196454118043856e-06, + "loss": 0.2004, + "step": 3156 + }, + { + "epoch": 1.0542661546167975, + "grad_norm": 0.4750486331903779, + "learning_rate": 8.194959521249512e-06, + "loss": 0.2019, + "step": 3157 + }, + { + "epoch": 1.05460010018367, + "grad_norm": 0.5160559948271373, + "learning_rate": 8.193464441818913e-06, + "loss": 0.209, + "step": 3158 + }, + { + "epoch": 1.0549340457505427, + "grad_norm": 0.420087588687286, + "learning_rate": 8.191968879977907e-06, + "loss": 0.2003, + "step": 3159 + }, + { + "epoch": 1.0552679913174152, + "grad_norm": 0.4037748789668453, + "learning_rate": 8.190472835952419e-06, + "loss": 0.1872, + "step": 3160 + }, + { + "epoch": 1.055601936884288, + "grad_norm": 0.38711125361133414, + "learning_rate": 8.188976309968443e-06, + "loss": 0.1858, + "step": 3161 + }, + { + "epoch": 1.0559358824511604, + "grad_norm": 0.47560444912511207, + "learning_rate": 8.187479302252045e-06, + "loss": 0.2034, + "step": 3162 + }, + { + "epoch": 1.0562698280180332, + "grad_norm": 0.3964934919375712, + "learning_rate": 8.185981813029368e-06, + "loss": 0.1892, + "step": 3163 + }, + { + "epoch": 1.0566037735849056, + "grad_norm": 0.45616318954926843, + "learning_rate": 8.184483842526623e-06, + "loss": 0.1934, + "step": 3164 + }, + { + "epoch": 1.0569377191517784, + "grad_norm": 0.4221881863168421, + "learning_rate": 8.1829853909701e-06, + "loss": 0.1882, + "step": 3165 + }, + { + "epoch": 1.0572716647186509, + "grad_norm": 0.4946404078548695, + "learning_rate": 8.181486458586153e-06, + "loss": 0.2197, + "step": 3166 + }, + { + "epoch": 1.0576056102855234, + "grad_norm": 0.41759418433544915, + "learning_rate": 8.179987045601217e-06, + "loss": 0.1905, + "step": 3167 + }, + { + "epoch": 1.057939555852396, + "grad_norm": 0.4680014004671277, + "learning_rate": 8.178487152241795e-06, + "loss": 0.2116, + "step": 3168 + }, + { + "epoch": 1.0582735014192686, + "grad_norm": 0.4679411328019381, + "learning_rate": 8.17698677873446e-06, + "loss": 0.2281, + "step": 3169 + }, + { + "epoch": 1.0586074469861413, + "grad_norm": 0.4306592891134695, + "learning_rate": 8.175485925305867e-06, + "loss": 0.1981, + "step": 3170 + }, + { + "epoch": 1.0589413925530138, + "grad_norm": 0.4279511404583339, + "learning_rate": 8.173984592182736e-06, + "loss": 0.2152, + "step": 3171 + }, + { + "epoch": 1.0592753381198865, + "grad_norm": 0.4256581923170128, + "learning_rate": 8.172482779591858e-06, + "loss": 0.1965, + "step": 3172 + }, + { + "epoch": 1.059609283686759, + "grad_norm": 0.42259600236442063, + "learning_rate": 8.170980487760101e-06, + "loss": 0.1959, + "step": 3173 + }, + { + "epoch": 1.0599432292536317, + "grad_norm": 0.3919759523982466, + "learning_rate": 8.169477716914405e-06, + "loss": 0.1835, + "step": 3174 + }, + { + "epoch": 1.0602771748205042, + "grad_norm": 0.3965257390740146, + "learning_rate": 8.16797446728178e-06, + "loss": 0.192, + "step": 3175 + }, + { + "epoch": 1.060611120387377, + "grad_norm": 0.44522593096004287, + "learning_rate": 8.16647073908931e-06, + "loss": 0.2098, + "step": 3176 + }, + { + "epoch": 1.0609450659542494, + "grad_norm": 0.4581897421406975, + "learning_rate": 8.164966532564152e-06, + "loss": 0.2049, + "step": 3177 + }, + { + "epoch": 1.0612790115211221, + "grad_norm": 0.4995506758881796, + "learning_rate": 8.163461847933532e-06, + "loss": 0.1989, + "step": 3178 + }, + { + "epoch": 1.0616129570879946, + "grad_norm": 0.40476982409428536, + "learning_rate": 8.161956685424752e-06, + "loss": 0.1843, + "step": 3179 + }, + { + "epoch": 1.0619469026548674, + "grad_norm": 0.42655470604742796, + "learning_rate": 8.160451045265183e-06, + "loss": 0.2032, + "step": 3180 + }, + { + "epoch": 1.0622808482217398, + "grad_norm": 0.42018746091677206, + "learning_rate": 8.158944927682269e-06, + "loss": 0.1965, + "step": 3181 + }, + { + "epoch": 1.0626147937886126, + "grad_norm": 0.4268157415568209, + "learning_rate": 8.157438332903531e-06, + "loss": 0.2006, + "step": 3182 + }, + { + "epoch": 1.062948739355485, + "grad_norm": 0.4033846341010823, + "learning_rate": 8.155931261156555e-06, + "loss": 0.1924, + "step": 3183 + }, + { + "epoch": 1.0632826849223576, + "grad_norm": 0.43889640040181027, + "learning_rate": 8.154423712669003e-06, + "loss": 0.1986, + "step": 3184 + }, + { + "epoch": 1.0636166304892303, + "grad_norm": 0.4001342148930981, + "learning_rate": 8.152915687668603e-06, + "loss": 0.1862, + "step": 3185 + }, + { + "epoch": 1.0639505760561028, + "grad_norm": 0.4105761999473694, + "learning_rate": 8.151407186383166e-06, + "loss": 0.191, + "step": 3186 + }, + { + "epoch": 1.0642845216229755, + "grad_norm": 0.4319067425333874, + "learning_rate": 8.149898209040568e-06, + "loss": 0.1997, + "step": 3187 + }, + { + "epoch": 1.064618467189848, + "grad_norm": 0.4560783659343897, + "learning_rate": 8.148388755868757e-06, + "loss": 0.2077, + "step": 3188 + }, + { + "epoch": 1.0649524127567207, + "grad_norm": 0.43798081875672745, + "learning_rate": 8.146878827095751e-06, + "loss": 0.1913, + "step": 3189 + }, + { + "epoch": 1.0652863583235932, + "grad_norm": 0.4372702184821335, + "learning_rate": 8.145368422949647e-06, + "loss": 0.2034, + "step": 3190 + }, + { + "epoch": 1.065620303890466, + "grad_norm": 0.41986516394757967, + "learning_rate": 8.143857543658606e-06, + "loss": 0.199, + "step": 3191 + }, + { + "epoch": 1.0659542494573384, + "grad_norm": 0.41907063437667164, + "learning_rate": 8.142346189450866e-06, + "loss": 0.2159, + "step": 3192 + }, + { + "epoch": 1.0662881950242111, + "grad_norm": 0.4094081046487141, + "learning_rate": 8.140834360554734e-06, + "loss": 0.1948, + "step": 3193 + }, + { + "epoch": 1.0666221405910836, + "grad_norm": 0.4371273342597166, + "learning_rate": 8.13932205719859e-06, + "loss": 0.2082, + "step": 3194 + }, + { + "epoch": 1.0669560861579563, + "grad_norm": 0.4135820465008076, + "learning_rate": 8.137809279610885e-06, + "loss": 0.1929, + "step": 3195 + }, + { + "epoch": 1.0672900317248288, + "grad_norm": 0.4399567478630838, + "learning_rate": 8.13629602802014e-06, + "loss": 0.1995, + "step": 3196 + }, + { + "epoch": 1.0676239772917016, + "grad_norm": 0.5092407448174172, + "learning_rate": 8.134782302654953e-06, + "loss": 0.2134, + "step": 3197 + }, + { + "epoch": 1.067957922858574, + "grad_norm": 0.5597000261870062, + "learning_rate": 8.133268103743989e-06, + "loss": 0.2111, + "step": 3198 + }, + { + "epoch": 1.0682918684254465, + "grad_norm": 0.4341592549068151, + "learning_rate": 8.131753431515984e-06, + "loss": 0.2052, + "step": 3199 + }, + { + "epoch": 1.0686258139923193, + "grad_norm": 0.5131722560356433, + "learning_rate": 8.130238286199747e-06, + "loss": 0.1998, + "step": 3200 + }, + { + "epoch": 1.0689597595591918, + "grad_norm": 0.4204116895958866, + "learning_rate": 8.128722668024161e-06, + "loss": 0.1961, + "step": 3201 + }, + { + "epoch": 1.0692937051260645, + "grad_norm": 0.46505953263469596, + "learning_rate": 8.127206577218177e-06, + "loss": 0.2106, + "step": 3202 + }, + { + "epoch": 1.069627650692937, + "grad_norm": 0.44604847632939687, + "learning_rate": 8.125690014010814e-06, + "loss": 0.2052, + "step": 3203 + }, + { + "epoch": 1.0699615962598097, + "grad_norm": 0.43756933565755846, + "learning_rate": 8.124172978631173e-06, + "loss": 0.1994, + "step": 3204 + }, + { + "epoch": 1.0702955418266822, + "grad_norm": 0.461832138118902, + "learning_rate": 8.12265547130842e-06, + "loss": 0.2158, + "step": 3205 + }, + { + "epoch": 1.070629487393555, + "grad_norm": 0.5180640624879851, + "learning_rate": 8.121137492271787e-06, + "loss": 0.206, + "step": 3206 + }, + { + "epoch": 1.0709634329604274, + "grad_norm": 0.44492318961314764, + "learning_rate": 8.119619041750586e-06, + "loss": 0.2123, + "step": 3207 + }, + { + "epoch": 1.0712973785273001, + "grad_norm": 0.44935278367080866, + "learning_rate": 8.118100119974197e-06, + "loss": 0.1938, + "step": 3208 + }, + { + "epoch": 1.0716313240941726, + "grad_norm": 0.5594811829388091, + "learning_rate": 8.116580727172071e-06, + "loss": 0.1879, + "step": 3209 + }, + { + "epoch": 1.0719652696610453, + "grad_norm": 0.4261468140314612, + "learning_rate": 8.115060863573729e-06, + "loss": 0.194, + "step": 3210 + }, + { + "epoch": 1.0722992152279178, + "grad_norm": 0.5033805368157565, + "learning_rate": 8.113540529408766e-06, + "loss": 0.201, + "step": 3211 + }, + { + "epoch": 1.0726331607947905, + "grad_norm": 0.41711740898266775, + "learning_rate": 8.112019724906844e-06, + "loss": 0.1898, + "step": 3212 + }, + { + "epoch": 1.072967106361663, + "grad_norm": 0.4474697888666802, + "learning_rate": 8.1104984502977e-06, + "loss": 0.2019, + "step": 3213 + }, + { + "epoch": 1.0733010519285355, + "grad_norm": 0.4425877508940847, + "learning_rate": 8.108976705811138e-06, + "loss": 0.204, + "step": 3214 + }, + { + "epoch": 1.0736349974954082, + "grad_norm": 0.40031817179384527, + "learning_rate": 8.107454491677041e-06, + "loss": 0.191, + "step": 3215 + }, + { + "epoch": 1.0739689430622807, + "grad_norm": 0.4374992487231899, + "learning_rate": 8.10593180812535e-06, + "loss": 0.1825, + "step": 3216 + }, + { + "epoch": 1.0743028886291535, + "grad_norm": 0.4304869205108331, + "learning_rate": 8.104408655386092e-06, + "loss": 0.1964, + "step": 3217 + }, + { + "epoch": 1.074636834196026, + "grad_norm": 0.4273397458580168, + "learning_rate": 8.102885033689352e-06, + "loss": 0.1987, + "step": 3218 + }, + { + "epoch": 1.0749707797628987, + "grad_norm": 0.44715648855381623, + "learning_rate": 8.101360943265293e-06, + "loss": 0.2094, + "step": 3219 + }, + { + "epoch": 1.0753047253297712, + "grad_norm": 0.433584901382971, + "learning_rate": 8.099836384344146e-06, + "loss": 0.192, + "step": 3220 + }, + { + "epoch": 1.075638670896644, + "grad_norm": 0.45246811449875446, + "learning_rate": 8.098311357156213e-06, + "loss": 0.2063, + "step": 3221 + }, + { + "epoch": 1.0759726164635164, + "grad_norm": 0.3831669548869065, + "learning_rate": 8.096785861931868e-06, + "loss": 0.1873, + "step": 3222 + }, + { + "epoch": 1.076306562030389, + "grad_norm": 0.39589825338369394, + "learning_rate": 8.095259898901557e-06, + "loss": 0.1884, + "step": 3223 + }, + { + "epoch": 1.0766405075972616, + "grad_norm": 0.42988110937230956, + "learning_rate": 8.09373346829579e-06, + "loss": 0.1989, + "step": 3224 + }, + { + "epoch": 1.0769744531641343, + "grad_norm": 0.3861062119239817, + "learning_rate": 8.092206570345158e-06, + "loss": 0.1876, + "step": 3225 + }, + { + "epoch": 1.0773083987310068, + "grad_norm": 0.39497963164084615, + "learning_rate": 8.090679205280311e-06, + "loss": 0.1875, + "step": 3226 + }, + { + "epoch": 1.0776423442978795, + "grad_norm": 0.4204241046414486, + "learning_rate": 8.08915137333198e-06, + "loss": 0.1985, + "step": 3227 + }, + { + "epoch": 1.077976289864752, + "grad_norm": 0.512738178352209, + "learning_rate": 8.08762307473096e-06, + "loss": 0.2269, + "step": 3228 + }, + { + "epoch": 1.0783102354316247, + "grad_norm": 0.44715999802900963, + "learning_rate": 8.08609430970812e-06, + "loss": 0.2031, + "step": 3229 + }, + { + "epoch": 1.0786441809984972, + "grad_norm": 0.3995392929976696, + "learning_rate": 8.084565078494396e-06, + "loss": 0.1908, + "step": 3230 + }, + { + "epoch": 1.07897812656537, + "grad_norm": 0.4132071042034365, + "learning_rate": 8.083035381320798e-06, + "loss": 0.1984, + "step": 3231 + }, + { + "epoch": 1.0793120721322425, + "grad_norm": 0.4507397205246835, + "learning_rate": 8.081505218418403e-06, + "loss": 0.2041, + "step": 3232 + }, + { + "epoch": 1.079646017699115, + "grad_norm": 0.39389622349185976, + "learning_rate": 8.079974590018363e-06, + "loss": 0.1941, + "step": 3233 + }, + { + "epoch": 1.0799799632659877, + "grad_norm": 0.418373210132237, + "learning_rate": 8.078443496351893e-06, + "loss": 0.1896, + "step": 3234 + }, + { + "epoch": 1.0803139088328602, + "grad_norm": 0.44554108624356625, + "learning_rate": 8.076911937650288e-06, + "loss": 0.1959, + "step": 3235 + }, + { + "epoch": 1.0806478543997329, + "grad_norm": 0.3895348321590768, + "learning_rate": 8.075379914144902e-06, + "loss": 0.1834, + "step": 3236 + }, + { + "epoch": 1.0809817999666054, + "grad_norm": 0.41066865301421246, + "learning_rate": 8.073847426067172e-06, + "loss": 0.1926, + "step": 3237 + }, + { + "epoch": 1.081315745533478, + "grad_norm": 0.4979728510699529, + "learning_rate": 8.072314473648595e-06, + "loss": 0.2136, + "step": 3238 + }, + { + "epoch": 1.0816496911003506, + "grad_norm": 0.4147497389444579, + "learning_rate": 8.07078105712074e-06, + "loss": 0.2014, + "step": 3239 + }, + { + "epoch": 1.0819836366672233, + "grad_norm": 0.44728845985893656, + "learning_rate": 8.06924717671525e-06, + "loss": 0.1971, + "step": 3240 + }, + { + "epoch": 1.0823175822340958, + "grad_norm": 0.4179130657469007, + "learning_rate": 8.067712832663831e-06, + "loss": 0.1949, + "step": 3241 + }, + { + "epoch": 1.0826515278009685, + "grad_norm": 0.4107644564937398, + "learning_rate": 8.066178025198272e-06, + "loss": 0.1877, + "step": 3242 + }, + { + "epoch": 1.082985473367841, + "grad_norm": 0.46230738921006503, + "learning_rate": 8.064642754550418e-06, + "loss": 0.2038, + "step": 3243 + }, + { + "epoch": 1.0833194189347137, + "grad_norm": 0.44196147805322944, + "learning_rate": 8.06310702095219e-06, + "loss": 0.2034, + "step": 3244 + }, + { + "epoch": 1.0836533645015862, + "grad_norm": 0.4247083802737399, + "learning_rate": 8.06157082463558e-06, + "loss": 0.2016, + "step": 3245 + }, + { + "epoch": 1.083987310068459, + "grad_norm": 0.4871767192301708, + "learning_rate": 8.060034165832648e-06, + "loss": 0.2036, + "step": 3246 + }, + { + "epoch": 1.0843212556353314, + "grad_norm": 0.44229334769971135, + "learning_rate": 8.058497044775526e-06, + "loss": 0.2002, + "step": 3247 + }, + { + "epoch": 1.084655201202204, + "grad_norm": 0.47425800895867326, + "learning_rate": 8.05695946169641e-06, + "loss": 0.2031, + "step": 3248 + }, + { + "epoch": 1.0849891467690767, + "grad_norm": 0.4145195665483283, + "learning_rate": 8.055421416827575e-06, + "loss": 0.1965, + "step": 3249 + }, + { + "epoch": 1.0853230923359491, + "grad_norm": 0.5037905106010119, + "learning_rate": 8.053882910401359e-06, + "loss": 0.2002, + "step": 3250 + }, + { + "epoch": 1.0856570379028219, + "grad_norm": 0.47633932245924165, + "learning_rate": 8.052343942650168e-06, + "loss": 0.2055, + "step": 3251 + }, + { + "epoch": 1.0859909834696944, + "grad_norm": 0.4534882323600842, + "learning_rate": 8.050804513806488e-06, + "loss": 0.2039, + "step": 3252 + }, + { + "epoch": 1.086324929036567, + "grad_norm": 0.4399351104535666, + "learning_rate": 8.049264624102862e-06, + "loss": 0.1916, + "step": 3253 + }, + { + "epoch": 1.0866588746034396, + "grad_norm": 0.42972776831463017, + "learning_rate": 8.047724273771909e-06, + "loss": 0.2051, + "step": 3254 + }, + { + "epoch": 1.0869928201703123, + "grad_norm": 0.4387648183372129, + "learning_rate": 8.046183463046322e-06, + "loss": 0.2131, + "step": 3255 + }, + { + "epoch": 1.0873267657371848, + "grad_norm": 0.4531319223093814, + "learning_rate": 8.044642192158854e-06, + "loss": 0.2045, + "step": 3256 + }, + { + "epoch": 1.0876607113040575, + "grad_norm": 0.43051407362446054, + "learning_rate": 8.043100461342332e-06, + "loss": 0.2063, + "step": 3257 + }, + { + "epoch": 1.08799465687093, + "grad_norm": 0.4543744781210135, + "learning_rate": 8.041558270829655e-06, + "loss": 0.1944, + "step": 3258 + }, + { + "epoch": 1.0883286024378027, + "grad_norm": 0.5087181521948336, + "learning_rate": 8.04001562085379e-06, + "loss": 0.2223, + "step": 3259 + }, + { + "epoch": 1.0886625480046752, + "grad_norm": 0.4310436393030242, + "learning_rate": 8.038472511647768e-06, + "loss": 0.1884, + "step": 3260 + }, + { + "epoch": 1.088996493571548, + "grad_norm": 0.4297632017698525, + "learning_rate": 8.036928943444698e-06, + "loss": 0.2041, + "step": 3261 + }, + { + "epoch": 1.0893304391384204, + "grad_norm": 0.4827595445998538, + "learning_rate": 8.03538491647775e-06, + "loss": 0.2051, + "step": 3262 + }, + { + "epoch": 1.089664384705293, + "grad_norm": 0.46301803588539736, + "learning_rate": 8.03384043098017e-06, + "loss": 0.1946, + "step": 3263 + }, + { + "epoch": 1.0899983302721656, + "grad_norm": 0.5171554254848715, + "learning_rate": 8.032295487185273e-06, + "loss": 0.2109, + "step": 3264 + }, + { + "epoch": 1.0903322758390381, + "grad_norm": 0.38795308444933635, + "learning_rate": 8.030750085326438e-06, + "loss": 0.1782, + "step": 3265 + }, + { + "epoch": 1.0906662214059109, + "grad_norm": 0.47435267419979654, + "learning_rate": 8.029204225637114e-06, + "loss": 0.2044, + "step": 3266 + }, + { + "epoch": 1.0910001669727833, + "grad_norm": 0.4123685515803338, + "learning_rate": 8.027657908350826e-06, + "loss": 0.1808, + "step": 3267 + }, + { + "epoch": 1.091334112539656, + "grad_norm": 0.4947662241675235, + "learning_rate": 8.026111133701162e-06, + "loss": 0.2165, + "step": 3268 + }, + { + "epoch": 1.0916680581065286, + "grad_norm": 0.5047171583821233, + "learning_rate": 8.02456390192178e-06, + "loss": 0.2062, + "step": 3269 + }, + { + "epoch": 1.0920020036734013, + "grad_norm": 0.41658647064399756, + "learning_rate": 8.023016213246406e-06, + "loss": 0.1888, + "step": 3270 + }, + { + "epoch": 1.0923359492402738, + "grad_norm": 0.44506089965581364, + "learning_rate": 8.021468067908839e-06, + "loss": 0.2019, + "step": 3271 + }, + { + "epoch": 1.0926698948071465, + "grad_norm": 0.4885924662885089, + "learning_rate": 8.019919466142945e-06, + "loss": 0.1967, + "step": 3272 + }, + { + "epoch": 1.093003840374019, + "grad_norm": 0.4729823871926014, + "learning_rate": 8.018370408182655e-06, + "loss": 0.1982, + "step": 3273 + }, + { + "epoch": 1.0933377859408917, + "grad_norm": 0.4531774083310163, + "learning_rate": 8.016820894261975e-06, + "loss": 0.2072, + "step": 3274 + }, + { + "epoch": 1.0936717315077642, + "grad_norm": 0.43365265276494597, + "learning_rate": 8.015270924614977e-06, + "loss": 0.2068, + "step": 3275 + }, + { + "epoch": 1.094005677074637, + "grad_norm": 0.48378575313322314, + "learning_rate": 8.013720499475804e-06, + "loss": 0.2127, + "step": 3276 + }, + { + "epoch": 1.0943396226415094, + "grad_norm": 0.4462044392760605, + "learning_rate": 8.012169619078662e-06, + "loss": 0.205, + "step": 3277 + }, + { + "epoch": 1.0946735682083821, + "grad_norm": 0.4410639604021698, + "learning_rate": 8.010618283657834e-06, + "loss": 0.2022, + "step": 3278 + }, + { + "epoch": 1.0950075137752546, + "grad_norm": 0.41187747484604836, + "learning_rate": 8.009066493447664e-06, + "loss": 0.1917, + "step": 3279 + }, + { + "epoch": 1.0953414593421273, + "grad_norm": 0.4505023943246841, + "learning_rate": 8.00751424868257e-06, + "loss": 0.2128, + "step": 3280 + }, + { + "epoch": 1.0956754049089998, + "grad_norm": 0.44407650340933, + "learning_rate": 8.005961549597037e-06, + "loss": 0.2023, + "step": 3281 + }, + { + "epoch": 1.0960093504758723, + "grad_norm": 0.49103707615481035, + "learning_rate": 8.004408396425617e-06, + "loss": 0.1892, + "step": 3282 + }, + { + "epoch": 1.096343296042745, + "grad_norm": 0.41128125303211827, + "learning_rate": 8.002854789402931e-06, + "loss": 0.1902, + "step": 3283 + }, + { + "epoch": 1.0966772416096175, + "grad_norm": 0.42074924001477537, + "learning_rate": 8.001300728763674e-06, + "loss": 0.1842, + "step": 3284 + }, + { + "epoch": 1.0970111871764903, + "grad_norm": 0.44415785033200067, + "learning_rate": 7.999746214742603e-06, + "loss": 0.2089, + "step": 3285 + }, + { + "epoch": 1.0973451327433628, + "grad_norm": 0.3976674538896059, + "learning_rate": 7.998191247574545e-06, + "loss": 0.1876, + "step": 3286 + }, + { + "epoch": 1.0976790783102355, + "grad_norm": 0.4602221499860677, + "learning_rate": 7.996635827494397e-06, + "loss": 0.1924, + "step": 3287 + }, + { + "epoch": 1.098013023877108, + "grad_norm": 0.4569109408695937, + "learning_rate": 7.995079954737122e-06, + "loss": 0.2004, + "step": 3288 + }, + { + "epoch": 1.0983469694439807, + "grad_norm": 0.46440228075117673, + "learning_rate": 7.993523629537753e-06, + "loss": 0.201, + "step": 3289 + }, + { + "epoch": 1.0986809150108532, + "grad_norm": 0.4210453426591052, + "learning_rate": 7.991966852131394e-06, + "loss": 0.2009, + "step": 3290 + }, + { + "epoch": 1.099014860577726, + "grad_norm": 0.4197516901784847, + "learning_rate": 7.990409622753212e-06, + "loss": 0.1993, + "step": 3291 + }, + { + "epoch": 1.0993488061445984, + "grad_norm": 0.40831891802316234, + "learning_rate": 7.988851941638445e-06, + "loss": 0.1941, + "step": 3292 + }, + { + "epoch": 1.0996827517114711, + "grad_norm": 0.4286089460485976, + "learning_rate": 7.987293809022401e-06, + "loss": 0.2052, + "step": 3293 + }, + { + "epoch": 1.1000166972783436, + "grad_norm": 0.4800654204970883, + "learning_rate": 7.985735225140452e-06, + "loss": 0.2093, + "step": 3294 + }, + { + "epoch": 1.1003506428452163, + "grad_norm": 0.4753445481388503, + "learning_rate": 7.984176190228042e-06, + "loss": 0.2095, + "step": 3295 + }, + { + "epoch": 1.1006845884120888, + "grad_norm": 0.4414696105218096, + "learning_rate": 7.98261670452068e-06, + "loss": 0.2049, + "step": 3296 + }, + { + "epoch": 1.1010185339789613, + "grad_norm": 0.46108006143762986, + "learning_rate": 7.981056768253945e-06, + "loss": 0.2192, + "step": 3297 + }, + { + "epoch": 1.101352479545834, + "grad_norm": 0.40832219511245, + "learning_rate": 7.979496381663486e-06, + "loss": 0.1899, + "step": 3298 + }, + { + "epoch": 1.1016864251127065, + "grad_norm": 0.43698727164514994, + "learning_rate": 7.977935544985016e-06, + "loss": 0.2032, + "step": 3299 + }, + { + "epoch": 1.1020203706795793, + "grad_norm": 0.4212410386989296, + "learning_rate": 7.976374258454317e-06, + "loss": 0.1885, + "step": 3300 + }, + { + "epoch": 1.1023543162464517, + "grad_norm": 0.41194340216926234, + "learning_rate": 7.97481252230724e-06, + "loss": 0.1988, + "step": 3301 + }, + { + "epoch": 1.1026882618133245, + "grad_norm": 0.4276995668094239, + "learning_rate": 7.973250336779705e-06, + "loss": 0.2041, + "step": 3302 + }, + { + "epoch": 1.103022207380197, + "grad_norm": 0.4857427454118398, + "learning_rate": 7.971687702107698e-06, + "loss": 0.194, + "step": 3303 + }, + { + "epoch": 1.1033561529470697, + "grad_norm": 0.4220206495507896, + "learning_rate": 7.970124618527274e-06, + "loss": 0.1857, + "step": 3304 + }, + { + "epoch": 1.1036900985139422, + "grad_norm": 0.4270813488896286, + "learning_rate": 7.968561086274553e-06, + "loss": 0.1964, + "step": 3305 + }, + { + "epoch": 1.104024044080815, + "grad_norm": 0.4252133936392048, + "learning_rate": 7.966997105585727e-06, + "loss": 0.2049, + "step": 3306 + }, + { + "epoch": 1.1043579896476874, + "grad_norm": 0.43526141115105044, + "learning_rate": 7.965432676697052e-06, + "loss": 0.2081, + "step": 3307 + }, + { + "epoch": 1.10469193521456, + "grad_norm": 0.39251872349305744, + "learning_rate": 7.963867799844855e-06, + "loss": 0.1874, + "step": 3308 + }, + { + "epoch": 1.1050258807814326, + "grad_norm": 0.3944749883657532, + "learning_rate": 7.962302475265527e-06, + "loss": 0.1925, + "step": 3309 + }, + { + "epoch": 1.1053598263483053, + "grad_norm": 0.42273055292457995, + "learning_rate": 7.960736703195533e-06, + "loss": 0.1909, + "step": 3310 + }, + { + "epoch": 1.1056937719151778, + "grad_norm": 0.4328304273348919, + "learning_rate": 7.959170483871398e-06, + "loss": 0.2037, + "step": 3311 + }, + { + "epoch": 1.1060277174820503, + "grad_norm": 0.4745377240310588, + "learning_rate": 7.957603817529715e-06, + "loss": 0.2082, + "step": 3312 + }, + { + "epoch": 1.106361663048923, + "grad_norm": 0.45131928288623807, + "learning_rate": 7.956036704407153e-06, + "loss": 0.1966, + "step": 3313 + }, + { + "epoch": 1.1066956086157955, + "grad_norm": 0.41211914551956946, + "learning_rate": 7.954469144740441e-06, + "loss": 0.194, + "step": 3314 + }, + { + "epoch": 1.1070295541826682, + "grad_norm": 0.45784621244927204, + "learning_rate": 7.952901138766376e-06, + "loss": 0.2016, + "step": 3315 + }, + { + "epoch": 1.1073634997495407, + "grad_norm": 0.4387965133352464, + "learning_rate": 7.951332686721825e-06, + "loss": 0.2084, + "step": 3316 + }, + { + "epoch": 1.1076974453164135, + "grad_norm": 0.41301207819634184, + "learning_rate": 7.94976378884372e-06, + "loss": 0.208, + "step": 3317 + }, + { + "epoch": 1.108031390883286, + "grad_norm": 0.4522258424463022, + "learning_rate": 7.948194445369065e-06, + "loss": 0.2003, + "step": 3318 + }, + { + "epoch": 1.1083653364501587, + "grad_norm": 0.45104153739353287, + "learning_rate": 7.946624656534922e-06, + "loss": 0.2051, + "step": 3319 + }, + { + "epoch": 1.1086992820170312, + "grad_norm": 0.47403483171073474, + "learning_rate": 7.945054422578432e-06, + "loss": 0.2062, + "step": 3320 + }, + { + "epoch": 1.1090332275839039, + "grad_norm": 0.44902598801173543, + "learning_rate": 7.943483743736793e-06, + "loss": 0.2075, + "step": 3321 + }, + { + "epoch": 1.1093671731507764, + "grad_norm": 0.4303479990019636, + "learning_rate": 7.941912620247276e-06, + "loss": 0.1969, + "step": 3322 + }, + { + "epoch": 1.109701118717649, + "grad_norm": 0.4960675003356271, + "learning_rate": 7.940341052347219e-06, + "loss": 0.2157, + "step": 3323 + }, + { + "epoch": 1.1100350642845216, + "grad_norm": 0.42395659697291693, + "learning_rate": 7.938769040274022e-06, + "loss": 0.2068, + "step": 3324 + }, + { + "epoch": 1.1103690098513943, + "grad_norm": 0.4340552153546476, + "learning_rate": 7.937196584265161e-06, + "loss": 0.2016, + "step": 3325 + }, + { + "epoch": 1.1107029554182668, + "grad_norm": 0.4117355175562917, + "learning_rate": 7.93562368455817e-06, + "loss": 0.2022, + "step": 3326 + }, + { + "epoch": 1.1110369009851395, + "grad_norm": 0.5466416870551023, + "learning_rate": 7.934050341390659e-06, + "loss": 0.1812, + "step": 3327 + }, + { + "epoch": 1.111370846552012, + "grad_norm": 0.4095963376065435, + "learning_rate": 7.932476555000294e-06, + "loss": 0.1928, + "step": 3328 + }, + { + "epoch": 1.1117047921188847, + "grad_norm": 0.45614123456720185, + "learning_rate": 7.930902325624816e-06, + "loss": 0.1947, + "step": 3329 + }, + { + "epoch": 1.1120387376857572, + "grad_norm": 0.43679448876694665, + "learning_rate": 7.929327653502032e-06, + "loss": 0.1928, + "step": 3330 + }, + { + "epoch": 1.1123726832526297, + "grad_norm": 0.4824447177607817, + "learning_rate": 7.927752538869816e-06, + "loss": 0.208, + "step": 3331 + }, + { + "epoch": 1.1127066288195024, + "grad_norm": 0.4681733368568868, + "learning_rate": 7.926176981966102e-06, + "loss": 0.2127, + "step": 3332 + }, + { + "epoch": 1.113040574386375, + "grad_norm": 0.5039341589751777, + "learning_rate": 7.924600983028903e-06, + "loss": 0.2128, + "step": 3333 + }, + { + "epoch": 1.1133745199532477, + "grad_norm": 0.41043359902653814, + "learning_rate": 7.92302454229629e-06, + "loss": 0.1901, + "step": 3334 + }, + { + "epoch": 1.1137084655201201, + "grad_norm": 0.42173077749314025, + "learning_rate": 7.9214476600064e-06, + "loss": 0.1946, + "step": 3335 + }, + { + "epoch": 1.1140424110869929, + "grad_norm": 0.5686198735094352, + "learning_rate": 7.919870336397444e-06, + "loss": 0.2045, + "step": 3336 + }, + { + "epoch": 1.1143763566538654, + "grad_norm": 0.4263229791855886, + "learning_rate": 7.918292571707693e-06, + "loss": 0.1998, + "step": 3337 + }, + { + "epoch": 1.114710302220738, + "grad_norm": 0.4538265435685427, + "learning_rate": 7.916714366175487e-06, + "loss": 0.1977, + "step": 3338 + }, + { + "epoch": 1.1150442477876106, + "grad_norm": 0.4145724982137613, + "learning_rate": 7.915135720039233e-06, + "loss": 0.1908, + "step": 3339 + }, + { + "epoch": 1.1153781933544833, + "grad_norm": 0.4018593488275227, + "learning_rate": 7.913556633537403e-06, + "loss": 0.2041, + "step": 3340 + }, + { + "epoch": 1.1157121389213558, + "grad_norm": 0.4354152187879273, + "learning_rate": 7.91197710690854e-06, + "loss": 0.2153, + "step": 3341 + }, + { + "epoch": 1.1160460844882285, + "grad_norm": 0.43252317066527857, + "learning_rate": 7.910397140391244e-06, + "loss": 0.2014, + "step": 3342 + }, + { + "epoch": 1.116380030055101, + "grad_norm": 0.4474161800654269, + "learning_rate": 7.908816734224195e-06, + "loss": 0.2027, + "step": 3343 + }, + { + "epoch": 1.1167139756219737, + "grad_norm": 0.4972888829593969, + "learning_rate": 7.907235888646126e-06, + "loss": 0.22, + "step": 3344 + }, + { + "epoch": 1.1170479211888462, + "grad_norm": 0.4505839982249951, + "learning_rate": 7.905654603895843e-06, + "loss": 0.1939, + "step": 3345 + }, + { + "epoch": 1.1173818667557187, + "grad_norm": 0.4374753952465327, + "learning_rate": 7.90407288021222e-06, + "loss": 0.2152, + "step": 3346 + }, + { + "epoch": 1.1177158123225914, + "grad_norm": 0.47875310572864327, + "learning_rate": 7.902490717834196e-06, + "loss": 0.2035, + "step": 3347 + }, + { + "epoch": 1.118049757889464, + "grad_norm": 0.4382931827428161, + "learning_rate": 7.90090811700077e-06, + "loss": 0.2026, + "step": 3348 + }, + { + "epoch": 1.1183837034563366, + "grad_norm": 0.4385411786075883, + "learning_rate": 7.899325077951018e-06, + "loss": 0.196, + "step": 3349 + }, + { + "epoch": 1.1187176490232091, + "grad_norm": 0.38871833531460476, + "learning_rate": 7.897741600924073e-06, + "loss": 0.1769, + "step": 3350 + }, + { + "epoch": 1.1190515945900819, + "grad_norm": 0.4519038045952756, + "learning_rate": 7.896157686159142e-06, + "loss": 0.2071, + "step": 3351 + }, + { + "epoch": 1.1193855401569544, + "grad_norm": 0.47224347629867824, + "learning_rate": 7.89457333389549e-06, + "loss": 0.2029, + "step": 3352 + }, + { + "epoch": 1.119719485723827, + "grad_norm": 0.4201454685987697, + "learning_rate": 7.892988544372454e-06, + "loss": 0.1991, + "step": 3353 + }, + { + "epoch": 1.1200534312906996, + "grad_norm": 0.4289082154934355, + "learning_rate": 7.891403317829434e-06, + "loss": 0.1932, + "step": 3354 + }, + { + "epoch": 1.1203873768575723, + "grad_norm": 0.4758235863184793, + "learning_rate": 7.889817654505897e-06, + "loss": 0.2088, + "step": 3355 + }, + { + "epoch": 1.1207213224244448, + "grad_norm": 0.4198002688938867, + "learning_rate": 7.888231554641377e-06, + "loss": 0.1923, + "step": 3356 + }, + { + "epoch": 1.1210552679913175, + "grad_norm": 0.43029326862973805, + "learning_rate": 7.886645018475474e-06, + "loss": 0.1915, + "step": 3357 + }, + { + "epoch": 1.12138921355819, + "grad_norm": 0.6278196412133785, + "learning_rate": 7.885058046247852e-06, + "loss": 0.2142, + "step": 3358 + }, + { + "epoch": 1.1217231591250627, + "grad_norm": 0.44070049869120914, + "learning_rate": 7.88347063819824e-06, + "loss": 0.2005, + "step": 3359 + }, + { + "epoch": 1.1220571046919352, + "grad_norm": 0.4247481458151347, + "learning_rate": 7.881882794566438e-06, + "loss": 0.1995, + "step": 3360 + }, + { + "epoch": 1.1223910502588077, + "grad_norm": 0.4145120275554937, + "learning_rate": 7.880294515592304e-06, + "loss": 0.1997, + "step": 3361 + }, + { + "epoch": 1.1227249958256804, + "grad_norm": 0.42952334403896586, + "learning_rate": 7.878705801515772e-06, + "loss": 0.1919, + "step": 3362 + }, + { + "epoch": 1.123058941392553, + "grad_norm": 0.43518221391978096, + "learning_rate": 7.877116652576832e-06, + "loss": 0.1963, + "step": 3363 + }, + { + "epoch": 1.1233928869594256, + "grad_norm": 0.4414778182611209, + "learning_rate": 7.875527069015545e-06, + "loss": 0.2023, + "step": 3364 + }, + { + "epoch": 1.1237268325262981, + "grad_norm": 0.48255357811517446, + "learning_rate": 7.873937051072037e-06, + "loss": 0.1912, + "step": 3365 + }, + { + "epoch": 1.1240607780931708, + "grad_norm": 0.4400050750711768, + "learning_rate": 7.872346598986496e-06, + "loss": 0.1995, + "step": 3366 + }, + { + "epoch": 1.1243947236600433, + "grad_norm": 0.4700279659952803, + "learning_rate": 7.87075571299918e-06, + "loss": 0.2191, + "step": 3367 + }, + { + "epoch": 1.124728669226916, + "grad_norm": 0.4577800514967404, + "learning_rate": 7.869164393350412e-06, + "loss": 0.1821, + "step": 3368 + }, + { + "epoch": 1.1250626147937886, + "grad_norm": 0.4447548135499851, + "learning_rate": 7.86757264028058e-06, + "loss": 0.1968, + "step": 3369 + }, + { + "epoch": 1.1253965603606613, + "grad_norm": 0.4270542850533781, + "learning_rate": 7.865980454030135e-06, + "loss": 0.199, + "step": 3370 + }, + { + "epoch": 1.1257305059275338, + "grad_norm": 0.3999805646644059, + "learning_rate": 7.864387834839598e-06, + "loss": 0.1932, + "step": 3371 + }, + { + "epoch": 1.1260644514944065, + "grad_norm": 0.4350361424348412, + "learning_rate": 7.86279478294955e-06, + "loss": 0.1992, + "step": 3372 + }, + { + "epoch": 1.126398397061279, + "grad_norm": 0.4781455786710474, + "learning_rate": 7.861201298600642e-06, + "loss": 0.2009, + "step": 3373 + }, + { + "epoch": 1.1267323426281517, + "grad_norm": 0.4167397575968442, + "learning_rate": 7.85960738203359e-06, + "loss": 0.1914, + "step": 3374 + }, + { + "epoch": 1.1270662881950242, + "grad_norm": 0.5743493123189489, + "learning_rate": 7.858013033489171e-06, + "loss": 0.2252, + "step": 3375 + }, + { + "epoch": 1.1274002337618967, + "grad_norm": 0.4175739966743874, + "learning_rate": 7.856418253208232e-06, + "loss": 0.2073, + "step": 3376 + }, + { + "epoch": 1.1277341793287694, + "grad_norm": 0.4040680795350153, + "learning_rate": 7.85482304143168e-06, + "loss": 0.1941, + "step": 3377 + }, + { + "epoch": 1.1280681248956421, + "grad_norm": 0.42600690189169366, + "learning_rate": 7.853227398400495e-06, + "loss": 0.1942, + "step": 3378 + }, + { + "epoch": 1.1284020704625146, + "grad_norm": 0.3704468500759086, + "learning_rate": 7.851631324355717e-06, + "loss": 0.1838, + "step": 3379 + }, + { + "epoch": 1.1287360160293871, + "grad_norm": 0.4178851626097371, + "learning_rate": 7.850034819538448e-06, + "loss": 0.2019, + "step": 3380 + }, + { + "epoch": 1.1290699615962598, + "grad_norm": 0.41850958354938694, + "learning_rate": 7.848437884189864e-06, + "loss": 0.1908, + "step": 3381 + }, + { + "epoch": 1.1294039071631323, + "grad_norm": 0.46566524574700385, + "learning_rate": 7.846840518551197e-06, + "loss": 0.2139, + "step": 3382 + }, + { + "epoch": 1.129737852730005, + "grad_norm": 0.43426626236451754, + "learning_rate": 7.845242722863749e-06, + "loss": 0.2025, + "step": 3383 + }, + { + "epoch": 1.1300717982968775, + "grad_norm": 0.43143461237116987, + "learning_rate": 7.843644497368886e-06, + "loss": 0.2002, + "step": 3384 + }, + { + "epoch": 1.1304057438637503, + "grad_norm": 0.5135500919403986, + "learning_rate": 7.842045842308038e-06, + "loss": 0.2073, + "step": 3385 + }, + { + "epoch": 1.1307396894306228, + "grad_norm": 0.47400554849908777, + "learning_rate": 7.840446757922704e-06, + "loss": 0.2046, + "step": 3386 + }, + { + "epoch": 1.1310736349974955, + "grad_norm": 0.43000385084533, + "learning_rate": 7.838847244454441e-06, + "loss": 0.2004, + "step": 3387 + }, + { + "epoch": 1.131407580564368, + "grad_norm": 0.44538414882916644, + "learning_rate": 7.837247302144874e-06, + "loss": 0.1996, + "step": 3388 + }, + { + "epoch": 1.1317415261312407, + "grad_norm": 0.4611482843216874, + "learning_rate": 7.835646931235697e-06, + "loss": 0.2063, + "step": 3389 + }, + { + "epoch": 1.1320754716981132, + "grad_norm": 0.4155829516189207, + "learning_rate": 7.83404613196866e-06, + "loss": 0.1903, + "step": 3390 + }, + { + "epoch": 1.132409417264986, + "grad_norm": 0.4401772944803297, + "learning_rate": 7.832444904585587e-06, + "loss": 0.1983, + "step": 3391 + }, + { + "epoch": 1.1327433628318584, + "grad_norm": 0.550134028287982, + "learning_rate": 7.83084324932836e-06, + "loss": 0.2153, + "step": 3392 + }, + { + "epoch": 1.133077308398731, + "grad_norm": 0.438562739435005, + "learning_rate": 7.829241166438925e-06, + "loss": 0.1966, + "step": 3393 + }, + { + "epoch": 1.1334112539656036, + "grad_norm": 0.4134789296728246, + "learning_rate": 7.827638656159302e-06, + "loss": 0.195, + "step": 3394 + }, + { + "epoch": 1.133745199532476, + "grad_norm": 0.4230251736651399, + "learning_rate": 7.826035718731564e-06, + "loss": 0.2065, + "step": 3395 + }, + { + "epoch": 1.1340791450993488, + "grad_norm": 0.4080363331592151, + "learning_rate": 7.824432354397857e-06, + "loss": 0.1896, + "step": 3396 + }, + { + "epoch": 1.1344130906662213, + "grad_norm": 0.5090023027343824, + "learning_rate": 7.822828563400384e-06, + "loss": 0.1999, + "step": 3397 + }, + { + "epoch": 1.134747036233094, + "grad_norm": 0.4361556768676404, + "learning_rate": 7.82122434598142e-06, + "loss": 0.2119, + "step": 3398 + }, + { + "epoch": 1.1350809817999665, + "grad_norm": 0.42843947744351496, + "learning_rate": 7.819619702383299e-06, + "loss": 0.2032, + "step": 3399 + }, + { + "epoch": 1.1354149273668392, + "grad_norm": 0.4308637813200796, + "learning_rate": 7.818014632848422e-06, + "loss": 0.2194, + "step": 3400 + }, + { + "epoch": 1.1357488729337117, + "grad_norm": 0.41419066838076307, + "learning_rate": 7.816409137619254e-06, + "loss": 0.2013, + "step": 3401 + }, + { + "epoch": 1.1360828185005845, + "grad_norm": 0.4978926371404705, + "learning_rate": 7.814803216938324e-06, + "loss": 0.2085, + "step": 3402 + }, + { + "epoch": 1.136416764067457, + "grad_norm": 0.4466119651320874, + "learning_rate": 7.813196871048226e-06, + "loss": 0.1905, + "step": 3403 + }, + { + "epoch": 1.1367507096343297, + "grad_norm": 0.46519231603409, + "learning_rate": 7.811590100191613e-06, + "loss": 0.2014, + "step": 3404 + }, + { + "epoch": 1.1370846552012022, + "grad_norm": 0.45057828935363947, + "learning_rate": 7.809982904611213e-06, + "loss": 0.2068, + "step": 3405 + }, + { + "epoch": 1.1374186007680749, + "grad_norm": 0.4500947824438429, + "learning_rate": 7.808375284549807e-06, + "loss": 0.2019, + "step": 3406 + }, + { + "epoch": 1.1377525463349474, + "grad_norm": 0.4538311873250359, + "learning_rate": 7.806767240250248e-06, + "loss": 0.2124, + "step": 3407 + }, + { + "epoch": 1.13808649190182, + "grad_norm": 0.49784144787499535, + "learning_rate": 7.805158771955448e-06, + "loss": 0.2027, + "step": 3408 + }, + { + "epoch": 1.1384204374686926, + "grad_norm": 0.4619216519515817, + "learning_rate": 7.803549879908385e-06, + "loss": 0.2047, + "step": 3409 + }, + { + "epoch": 1.138754383035565, + "grad_norm": 0.4107807403289291, + "learning_rate": 7.801940564352103e-06, + "loss": 0.1956, + "step": 3410 + }, + { + "epoch": 1.1390883286024378, + "grad_norm": 0.42471264599679803, + "learning_rate": 7.800330825529707e-06, + "loss": 0.1964, + "step": 3411 + }, + { + "epoch": 1.1394222741693105, + "grad_norm": 0.40572323274981525, + "learning_rate": 7.798720663684367e-06, + "loss": 0.1928, + "step": 3412 + }, + { + "epoch": 1.139756219736183, + "grad_norm": 0.4101254365791967, + "learning_rate": 7.797110079059315e-06, + "loss": 0.2079, + "step": 3413 + }, + { + "epoch": 1.1400901653030555, + "grad_norm": 0.4411300893319862, + "learning_rate": 7.795499071897855e-06, + "loss": 0.2028, + "step": 3414 + }, + { + "epoch": 1.1404241108699282, + "grad_norm": 0.4332006753538404, + "learning_rate": 7.79388764244334e-06, + "loss": 0.2156, + "step": 3415 + }, + { + "epoch": 1.1407580564368007, + "grad_norm": 0.41551192652535196, + "learning_rate": 7.792275790939202e-06, + "loss": 0.2108, + "step": 3416 + }, + { + "epoch": 1.1410920020036734, + "grad_norm": 0.4225371412103681, + "learning_rate": 7.790663517628927e-06, + "loss": 0.1969, + "step": 3417 + }, + { + "epoch": 1.141425947570546, + "grad_norm": 0.4391114729864135, + "learning_rate": 7.789050822756068e-06, + "loss": 0.2064, + "step": 3418 + }, + { + "epoch": 1.1417598931374187, + "grad_norm": 0.40251758639113394, + "learning_rate": 7.787437706564243e-06, + "loss": 0.1882, + "step": 3419 + }, + { + "epoch": 1.1420938387042912, + "grad_norm": 0.41418603101346974, + "learning_rate": 7.78582416929713e-06, + "loss": 0.1919, + "step": 3420 + }, + { + "epoch": 1.1424277842711639, + "grad_norm": 0.41423945325619144, + "learning_rate": 7.784210211198475e-06, + "loss": 0.2046, + "step": 3421 + }, + { + "epoch": 1.1427617298380364, + "grad_norm": 0.45911999518986674, + "learning_rate": 7.782595832512086e-06, + "loss": 0.2102, + "step": 3422 + }, + { + "epoch": 1.143095675404909, + "grad_norm": 0.446295856392653, + "learning_rate": 7.780981033481832e-06, + "loss": 0.2115, + "step": 3423 + }, + { + "epoch": 1.1434296209717816, + "grad_norm": 0.47779947300607567, + "learning_rate": 7.779365814351648e-06, + "loss": 0.212, + "step": 3424 + }, + { + "epoch": 1.143763566538654, + "grad_norm": 0.5041795675046281, + "learning_rate": 7.77775017536553e-06, + "loss": 0.2201, + "step": 3425 + }, + { + "epoch": 1.1440975121055268, + "grad_norm": 0.4626546155813629, + "learning_rate": 7.776134116767544e-06, + "loss": 0.2161, + "step": 3426 + }, + { + "epoch": 1.1444314576723995, + "grad_norm": 0.43618214464201066, + "learning_rate": 7.774517638801808e-06, + "loss": 0.2068, + "step": 3427 + }, + { + "epoch": 1.144765403239272, + "grad_norm": 0.4254363039124097, + "learning_rate": 7.772900741712516e-06, + "loss": 0.2032, + "step": 3428 + }, + { + "epoch": 1.1450993488061445, + "grad_norm": 0.45525908259895725, + "learning_rate": 7.771283425743916e-06, + "loss": 0.2043, + "step": 3429 + }, + { + "epoch": 1.1454332943730172, + "grad_norm": 0.41941227927769686, + "learning_rate": 7.769665691140325e-06, + "loss": 0.2057, + "step": 3430 + }, + { + "epoch": 1.1457672399398897, + "grad_norm": 0.4616850221911409, + "learning_rate": 7.76804753814612e-06, + "loss": 0.1962, + "step": 3431 + }, + { + "epoch": 1.1461011855067624, + "grad_norm": 0.44619639348887946, + "learning_rate": 7.76642896700574e-06, + "loss": 0.1999, + "step": 3432 + }, + { + "epoch": 1.146435131073635, + "grad_norm": 0.42643033930129975, + "learning_rate": 7.764809977963692e-06, + "loss": 0.1966, + "step": 3433 + }, + { + "epoch": 1.1467690766405076, + "grad_norm": 0.41159559705421556, + "learning_rate": 7.763190571264542e-06, + "loss": 0.1919, + "step": 3434 + }, + { + "epoch": 1.1471030222073801, + "grad_norm": 0.4303534281592218, + "learning_rate": 7.761570747152923e-06, + "loss": 0.1985, + "step": 3435 + }, + { + "epoch": 1.1474369677742529, + "grad_norm": 0.42350721038806954, + "learning_rate": 7.759950505873523e-06, + "loss": 0.193, + "step": 3436 + }, + { + "epoch": 1.1477709133411254, + "grad_norm": 0.4596329657408379, + "learning_rate": 7.758329847671103e-06, + "loss": 0.2052, + "step": 3437 + }, + { + "epoch": 1.148104858907998, + "grad_norm": 0.4489699249368635, + "learning_rate": 7.75670877279048e-06, + "loss": 0.2104, + "step": 3438 + }, + { + "epoch": 1.1484388044748706, + "grad_norm": 0.43084296877947265, + "learning_rate": 7.755087281476539e-06, + "loss": 0.198, + "step": 3439 + }, + { + "epoch": 1.1487727500417433, + "grad_norm": 0.4057179440867843, + "learning_rate": 7.753465373974223e-06, + "loss": 0.1999, + "step": 3440 + }, + { + "epoch": 1.1491066956086158, + "grad_norm": 0.46218899540177816, + "learning_rate": 7.751843050528543e-06, + "loss": 0.199, + "step": 3441 + }, + { + "epoch": 1.1494406411754885, + "grad_norm": 0.4471325368000403, + "learning_rate": 7.750220311384567e-06, + "loss": 0.2127, + "step": 3442 + }, + { + "epoch": 1.149774586742361, + "grad_norm": 0.4508117782583261, + "learning_rate": 7.748597156787429e-06, + "loss": 0.2021, + "step": 3443 + }, + { + "epoch": 1.1501085323092335, + "grad_norm": 0.42529589186934397, + "learning_rate": 7.746973586982328e-06, + "loss": 0.2038, + "step": 3444 + }, + { + "epoch": 1.1504424778761062, + "grad_norm": 0.4501523886838213, + "learning_rate": 7.745349602214522e-06, + "loss": 0.2114, + "step": 3445 + }, + { + "epoch": 1.1507764234429787, + "grad_norm": 0.44047520372496385, + "learning_rate": 7.743725202729335e-06, + "loss": 0.1958, + "step": 3446 + }, + { + "epoch": 1.1511103690098514, + "grad_norm": 0.4644912732615579, + "learning_rate": 7.742100388772148e-06, + "loss": 0.2024, + "step": 3447 + }, + { + "epoch": 1.151444314576724, + "grad_norm": 0.41485035644284507, + "learning_rate": 7.74047516058841e-06, + "loss": 0.19, + "step": 3448 + }, + { + "epoch": 1.1517782601435966, + "grad_norm": 0.44397761503160493, + "learning_rate": 7.73884951842363e-06, + "loss": 0.2103, + "step": 3449 + }, + { + "epoch": 1.1521122057104691, + "grad_norm": 0.4329722118379211, + "learning_rate": 7.737223462523383e-06, + "loss": 0.2093, + "step": 3450 + }, + { + "epoch": 1.1524461512773418, + "grad_norm": 0.4536966006310295, + "learning_rate": 7.735596993133303e-06, + "loss": 0.2016, + "step": 3451 + }, + { + "epoch": 1.1527800968442143, + "grad_norm": 0.450365844932539, + "learning_rate": 7.733970110499086e-06, + "loss": 0.2061, + "step": 3452 + }, + { + "epoch": 1.153114042411087, + "grad_norm": 0.4556419046795417, + "learning_rate": 7.732342814866489e-06, + "loss": 0.2023, + "step": 3453 + }, + { + "epoch": 1.1534479879779596, + "grad_norm": 0.49031666549692304, + "learning_rate": 7.730715106481342e-06, + "loss": 0.2149, + "step": 3454 + }, + { + "epoch": 1.1537819335448323, + "grad_norm": 0.3880997191032563, + "learning_rate": 7.729086985589523e-06, + "loss": 0.1861, + "step": 3455 + }, + { + "epoch": 1.1541158791117048, + "grad_norm": 0.37937141823654447, + "learning_rate": 7.72745845243698e-06, + "loss": 0.1863, + "step": 3456 + }, + { + "epoch": 1.1544498246785775, + "grad_norm": 0.49966955106951333, + "learning_rate": 7.725829507269723e-06, + "loss": 0.1955, + "step": 3457 + }, + { + "epoch": 1.15478377024545, + "grad_norm": 0.4960476776887278, + "learning_rate": 7.724200150333826e-06, + "loss": 0.2025, + "step": 3458 + }, + { + "epoch": 1.1551177158123225, + "grad_norm": 0.4037417613399222, + "learning_rate": 7.722570381875418e-06, + "loss": 0.1939, + "step": 3459 + }, + { + "epoch": 1.1554516613791952, + "grad_norm": 0.4646572771220113, + "learning_rate": 7.720940202140698e-06, + "loss": 0.2096, + "step": 3460 + }, + { + "epoch": 1.155785606946068, + "grad_norm": 0.40519117416885664, + "learning_rate": 7.71930961137592e-06, + "loss": 0.1901, + "step": 3461 + }, + { + "epoch": 1.1561195525129404, + "grad_norm": 0.45808904281917306, + "learning_rate": 7.717678609827409e-06, + "loss": 0.2198, + "step": 3462 + }, + { + "epoch": 1.156453498079813, + "grad_norm": 0.49491562964628316, + "learning_rate": 7.716047197741543e-06, + "loss": 0.21, + "step": 3463 + }, + { + "epoch": 1.1567874436466856, + "grad_norm": 0.4560900514019919, + "learning_rate": 7.714415375364768e-06, + "loss": 0.2124, + "step": 3464 + }, + { + "epoch": 1.1571213892135581, + "grad_norm": 0.4567473518841226, + "learning_rate": 7.712783142943588e-06, + "loss": 0.197, + "step": 3465 + }, + { + "epoch": 1.1574553347804308, + "grad_norm": 0.45021820733197226, + "learning_rate": 7.711150500724574e-06, + "loss": 0.2032, + "step": 3466 + }, + { + "epoch": 1.1577892803473033, + "grad_norm": 0.43204683278363776, + "learning_rate": 7.709517448954353e-06, + "loss": 0.1972, + "step": 3467 + }, + { + "epoch": 1.158123225914176, + "grad_norm": 0.43096997145471116, + "learning_rate": 7.707883987879617e-06, + "loss": 0.1837, + "step": 3468 + }, + { + "epoch": 1.1584571714810485, + "grad_norm": 0.4107749143716421, + "learning_rate": 7.70625011774712e-06, + "loss": 0.193, + "step": 3469 + }, + { + "epoch": 1.1587911170479213, + "grad_norm": 0.42555377932051347, + "learning_rate": 7.70461583880368e-06, + "loss": 0.2027, + "step": 3470 + }, + { + "epoch": 1.1591250626147938, + "grad_norm": 0.537801799999701, + "learning_rate": 7.70298115129617e-06, + "loss": 0.2128, + "step": 3471 + }, + { + "epoch": 1.1594590081816665, + "grad_norm": 0.4701917219067799, + "learning_rate": 7.701346055471533e-06, + "loss": 0.1973, + "step": 3472 + }, + { + "epoch": 1.159792953748539, + "grad_norm": 0.4336001405965153, + "learning_rate": 7.699710551576763e-06, + "loss": 0.1959, + "step": 3473 + }, + { + "epoch": 1.1601268993154115, + "grad_norm": 0.5109622071728623, + "learning_rate": 7.69807463985893e-06, + "loss": 0.2068, + "step": 3474 + }, + { + "epoch": 1.1604608448822842, + "grad_norm": 0.4399649168868142, + "learning_rate": 7.696438320565152e-06, + "loss": 0.2018, + "step": 3475 + }, + { + "epoch": 1.160794790449157, + "grad_norm": 0.434807916850808, + "learning_rate": 7.694801593942615e-06, + "loss": 0.1922, + "step": 3476 + }, + { + "epoch": 1.1611287360160294, + "grad_norm": 0.4307111475753005, + "learning_rate": 7.69316446023857e-06, + "loss": 0.1963, + "step": 3477 + }, + { + "epoch": 1.161462681582902, + "grad_norm": 0.39136888781514806, + "learning_rate": 7.691526919700319e-06, + "loss": 0.1892, + "step": 3478 + }, + { + "epoch": 1.1617966271497746, + "grad_norm": 0.5072994571524962, + "learning_rate": 7.689888972575237e-06, + "loss": 0.2129, + "step": 3479 + }, + { + "epoch": 1.162130572716647, + "grad_norm": 0.45165856698956547, + "learning_rate": 7.688250619110752e-06, + "loss": 0.199, + "step": 3480 + }, + { + "epoch": 1.1624645182835198, + "grad_norm": 0.47159071816202563, + "learning_rate": 7.686611859554361e-06, + "loss": 0.2176, + "step": 3481 + }, + { + "epoch": 1.1627984638503923, + "grad_norm": 0.42425403478938783, + "learning_rate": 7.684972694153612e-06, + "loss": 0.2011, + "step": 3482 + }, + { + "epoch": 1.163132409417265, + "grad_norm": 0.46430876367122836, + "learning_rate": 7.683333123156122e-06, + "loss": 0.2024, + "step": 3483 + }, + { + "epoch": 1.1634663549841375, + "grad_norm": 0.4252770376389329, + "learning_rate": 7.681693146809572e-06, + "loss": 0.1989, + "step": 3484 + }, + { + "epoch": 1.1638003005510102, + "grad_norm": 0.38138227868488445, + "learning_rate": 7.680052765361693e-06, + "loss": 0.1877, + "step": 3485 + }, + { + "epoch": 1.1641342461178827, + "grad_norm": 0.4621436050450307, + "learning_rate": 7.678411979060289e-06, + "loss": 0.2138, + "step": 3486 + }, + { + "epoch": 1.1644681916847555, + "grad_norm": 0.4910521500257586, + "learning_rate": 7.676770788153218e-06, + "loss": 0.2029, + "step": 3487 + }, + { + "epoch": 1.164802137251628, + "grad_norm": 0.45326505140278767, + "learning_rate": 7.6751291928884e-06, + "loss": 0.2071, + "step": 3488 + }, + { + "epoch": 1.1651360828185007, + "grad_norm": 0.43879315421992415, + "learning_rate": 7.673487193513821e-06, + "loss": 0.2008, + "step": 3489 + }, + { + "epoch": 1.1654700283853732, + "grad_norm": 0.4220316459234618, + "learning_rate": 7.671844790277522e-06, + "loss": 0.201, + "step": 3490 + }, + { + "epoch": 1.1658039739522459, + "grad_norm": 0.4440932041786498, + "learning_rate": 7.670201983427606e-06, + "loss": 0.2145, + "step": 3491 + }, + { + "epoch": 1.1661379195191184, + "grad_norm": 0.42691236313301645, + "learning_rate": 7.66855877321224e-06, + "loss": 0.2015, + "step": 3492 + }, + { + "epoch": 1.1664718650859909, + "grad_norm": 0.501936815427682, + "learning_rate": 7.666915159879651e-06, + "loss": 0.2119, + "step": 3493 + }, + { + "epoch": 1.1668058106528636, + "grad_norm": 0.44911542211817373, + "learning_rate": 7.665271143678125e-06, + "loss": 0.2143, + "step": 3494 + }, + { + "epoch": 1.167139756219736, + "grad_norm": 0.4624678777626138, + "learning_rate": 7.66362672485601e-06, + "loss": 0.2079, + "step": 3495 + }, + { + "epoch": 1.1674737017866088, + "grad_norm": 0.40846524379588184, + "learning_rate": 7.661981903661715e-06, + "loss": 0.1976, + "step": 3496 + }, + { + "epoch": 1.1678076473534813, + "grad_norm": 0.4103460531002227, + "learning_rate": 7.66033668034371e-06, + "loss": 0.1968, + "step": 3497 + }, + { + "epoch": 1.168141592920354, + "grad_norm": 0.4630302942431863, + "learning_rate": 7.658691055150524e-06, + "loss": 0.214, + "step": 3498 + }, + { + "epoch": 1.1684755384872265, + "grad_norm": 0.4578551544166337, + "learning_rate": 7.65704502833075e-06, + "loss": 0.2043, + "step": 3499 + }, + { + "epoch": 1.1688094840540992, + "grad_norm": 0.4524932747516926, + "learning_rate": 7.655398600133037e-06, + "loss": 0.2279, + "step": 3500 + }, + { + "epoch": 1.1691434296209717, + "grad_norm": 0.881428383101394, + "learning_rate": 7.653751770806101e-06, + "loss": 0.1923, + "step": 3501 + }, + { + "epoch": 1.1694773751878444, + "grad_norm": 0.41607736093404346, + "learning_rate": 7.652104540598712e-06, + "loss": 0.2049, + "step": 3502 + }, + { + "epoch": 1.169811320754717, + "grad_norm": 0.40214250677631747, + "learning_rate": 7.650456909759707e-06, + "loss": 0.1925, + "step": 3503 + }, + { + "epoch": 1.1701452663215897, + "grad_norm": 0.4504111125904756, + "learning_rate": 7.648808878537976e-06, + "loss": 0.192, + "step": 3504 + }, + { + "epoch": 1.1704792118884622, + "grad_norm": 0.4602724838722225, + "learning_rate": 7.647160447182475e-06, + "loss": 0.2087, + "step": 3505 + }, + { + "epoch": 1.1708131574553349, + "grad_norm": 0.4104405110762258, + "learning_rate": 7.645511615942218e-06, + "loss": 0.192, + "step": 3506 + }, + { + "epoch": 1.1711471030222074, + "grad_norm": 0.36937151858707484, + "learning_rate": 7.643862385066285e-06, + "loss": 0.1778, + "step": 3507 + }, + { + "epoch": 1.1714810485890799, + "grad_norm": 0.4572947626174961, + "learning_rate": 7.642212754803804e-06, + "loss": 0.1994, + "step": 3508 + }, + { + "epoch": 1.1718149941559526, + "grad_norm": 0.4816706066033821, + "learning_rate": 7.640562725403978e-06, + "loss": 0.2051, + "step": 3509 + }, + { + "epoch": 1.1721489397228253, + "grad_norm": 0.4382076984794106, + "learning_rate": 7.638912297116061e-06, + "loss": 0.2049, + "step": 3510 + }, + { + "epoch": 1.1724828852896978, + "grad_norm": 0.40178546966678536, + "learning_rate": 7.637261470189369e-06, + "loss": 0.1978, + "step": 3511 + }, + { + "epoch": 1.1728168308565703, + "grad_norm": 0.39280539287687943, + "learning_rate": 7.635610244873277e-06, + "loss": 0.1912, + "step": 3512 + }, + { + "epoch": 1.173150776423443, + "grad_norm": 0.46472165624676787, + "learning_rate": 7.633958621417226e-06, + "loss": 0.215, + "step": 3513 + }, + { + "epoch": 1.1734847219903155, + "grad_norm": 0.4297956230414589, + "learning_rate": 7.632306600070711e-06, + "loss": 0.1893, + "step": 3514 + }, + { + "epoch": 1.1738186675571882, + "grad_norm": 0.43762007892015997, + "learning_rate": 7.63065418108329e-06, + "loss": 0.1916, + "step": 3515 + }, + { + "epoch": 1.1741526131240607, + "grad_norm": 0.4897234537598655, + "learning_rate": 7.62900136470458e-06, + "loss": 0.1981, + "step": 3516 + }, + { + "epoch": 1.1744865586909334, + "grad_norm": 0.4051411254394592, + "learning_rate": 7.627348151184257e-06, + "loss": 0.1918, + "step": 3517 + }, + { + "epoch": 1.174820504257806, + "grad_norm": 0.4902705727555018, + "learning_rate": 7.625694540772062e-06, + "loss": 0.215, + "step": 3518 + }, + { + "epoch": 1.1751544498246786, + "grad_norm": 0.459335122777554, + "learning_rate": 7.624040533717789e-06, + "loss": 0.1931, + "step": 3519 + }, + { + "epoch": 1.1754883953915511, + "grad_norm": 0.43076363942331786, + "learning_rate": 7.622386130271296e-06, + "loss": 0.1907, + "step": 3520 + }, + { + "epoch": 1.1758223409584239, + "grad_norm": 0.4783616479824569, + "learning_rate": 7.620731330682501e-06, + "loss": 0.212, + "step": 3521 + }, + { + "epoch": 1.1761562865252964, + "grad_norm": 0.4356060638682757, + "learning_rate": 7.6190761352013795e-06, + "loss": 0.1951, + "step": 3522 + }, + { + "epoch": 1.1764902320921689, + "grad_norm": 0.4594433502156529, + "learning_rate": 7.61742054407797e-06, + "loss": 0.1982, + "step": 3523 + }, + { + "epoch": 1.1768241776590416, + "grad_norm": 0.45490492503768754, + "learning_rate": 7.615764557562368e-06, + "loss": 0.198, + "step": 3524 + }, + { + "epoch": 1.1771581232259143, + "grad_norm": 0.46105614813698614, + "learning_rate": 7.6141081759047305e-06, + "loss": 0.2014, + "step": 3525 + }, + { + "epoch": 1.1774920687927868, + "grad_norm": 0.46026783313250486, + "learning_rate": 7.612451399355273e-06, + "loss": 0.2189, + "step": 3526 + }, + { + "epoch": 1.1778260143596593, + "grad_norm": 0.4356200212691802, + "learning_rate": 7.610794228164271e-06, + "loss": 0.2029, + "step": 3527 + }, + { + "epoch": 1.178159959926532, + "grad_norm": 0.4297337094056724, + "learning_rate": 7.60913666258206e-06, + "loss": 0.2142, + "step": 3528 + }, + { + "epoch": 1.1784939054934045, + "grad_norm": 0.40876834016172153, + "learning_rate": 7.6074787028590325e-06, + "loss": 0.1937, + "step": 3529 + }, + { + "epoch": 1.1788278510602772, + "grad_norm": 0.3885624115347344, + "learning_rate": 7.605820349245645e-06, + "loss": 0.1974, + "step": 3530 + }, + { + "epoch": 1.1791617966271497, + "grad_norm": 0.4338874777406309, + "learning_rate": 7.6041616019924125e-06, + "loss": 0.1979, + "step": 3531 + }, + { + "epoch": 1.1794957421940224, + "grad_norm": 0.4408923442251734, + "learning_rate": 7.602502461349907e-06, + "loss": 0.2044, + "step": 3532 + }, + { + "epoch": 1.179829687760895, + "grad_norm": 0.41283196570865327, + "learning_rate": 7.600842927568761e-06, + "loss": 0.208, + "step": 3533 + }, + { + "epoch": 1.1801636333277676, + "grad_norm": 0.42555424705496336, + "learning_rate": 7.599183000899667e-06, + "loss": 0.2055, + "step": 3534 + }, + { + "epoch": 1.1804975788946401, + "grad_norm": 0.43912943678038574, + "learning_rate": 7.597522681593375e-06, + "loss": 0.2036, + "step": 3535 + }, + { + "epoch": 1.1808315244615128, + "grad_norm": 0.45635857674949853, + "learning_rate": 7.595861969900698e-06, + "loss": 0.2137, + "step": 3536 + }, + { + "epoch": 1.1811654700283853, + "grad_norm": 0.4757314323116078, + "learning_rate": 7.5942008660725065e-06, + "loss": 0.2046, + "step": 3537 + }, + { + "epoch": 1.181499415595258, + "grad_norm": 0.3847032911424942, + "learning_rate": 7.5925393703597265e-06, + "loss": 0.1854, + "step": 3538 + }, + { + "epoch": 1.1818333611621306, + "grad_norm": 0.40804861098671225, + "learning_rate": 7.59087748301335e-06, + "loss": 0.1961, + "step": 3539 + }, + { + "epoch": 1.1821673067290033, + "grad_norm": 0.4048208000344706, + "learning_rate": 7.5892152042844224e-06, + "loss": 0.201, + "step": 3540 + }, + { + "epoch": 1.1825012522958758, + "grad_norm": 0.41873793093657335, + "learning_rate": 7.58755253442405e-06, + "loss": 0.1955, + "step": 3541 + }, + { + "epoch": 1.1828351978627483, + "grad_norm": 0.45334205386003373, + "learning_rate": 7.585889473683401e-06, + "loss": 0.2101, + "step": 3542 + }, + { + "epoch": 1.183169143429621, + "grad_norm": 0.40478766876394096, + "learning_rate": 7.5842260223137e-06, + "loss": 0.1894, + "step": 3543 + }, + { + "epoch": 1.1835030889964935, + "grad_norm": 0.4299142249734497, + "learning_rate": 7.5825621805662285e-06, + "loss": 0.1943, + "step": 3544 + }, + { + "epoch": 1.1838370345633662, + "grad_norm": 0.3919396554368549, + "learning_rate": 7.580897948692332e-06, + "loss": 0.176, + "step": 3545 + }, + { + "epoch": 1.1841709801302387, + "grad_norm": 0.4507907583061385, + "learning_rate": 7.579233326943412e-06, + "loss": 0.2061, + "step": 3546 + }, + { + "epoch": 1.1845049256971114, + "grad_norm": 0.43486901958401214, + "learning_rate": 7.577568315570925e-06, + "loss": 0.2101, + "step": 3547 + }, + { + "epoch": 1.184838871263984, + "grad_norm": 0.611895109414638, + "learning_rate": 7.5759029148263975e-06, + "loss": 0.2105, + "step": 3548 + }, + { + "epoch": 1.1851728168308566, + "grad_norm": 0.46180151479561693, + "learning_rate": 7.574237124961403e-06, + "loss": 0.1991, + "step": 3549 + }, + { + "epoch": 1.1855067623977291, + "grad_norm": 0.40920033333286243, + "learning_rate": 7.572570946227582e-06, + "loss": 0.1985, + "step": 3550 + }, + { + "epoch": 1.1858407079646018, + "grad_norm": 0.5521228822921554, + "learning_rate": 7.570904378876627e-06, + "loss": 0.2064, + "step": 3551 + }, + { + "epoch": 1.1861746535314743, + "grad_norm": 0.4360368544101082, + "learning_rate": 7.569237423160294e-06, + "loss": 0.1977, + "step": 3552 + }, + { + "epoch": 1.186508599098347, + "grad_norm": 0.5055283028151625, + "learning_rate": 7.567570079330395e-06, + "loss": 0.2105, + "step": 3553 + }, + { + "epoch": 1.1868425446652195, + "grad_norm": 0.47535148943957384, + "learning_rate": 7.565902347638806e-06, + "loss": 0.2198, + "step": 3554 + }, + { + "epoch": 1.1871764902320923, + "grad_norm": 0.4567763936303909, + "learning_rate": 7.564234228337452e-06, + "loss": 0.2059, + "step": 3555 + }, + { + "epoch": 1.1875104357989648, + "grad_norm": 0.43216007765801734, + "learning_rate": 7.5625657216783276e-06, + "loss": 0.2042, + "step": 3556 + }, + { + "epoch": 1.1878443813658373, + "grad_norm": 0.4437876561197619, + "learning_rate": 7.560896827913478e-06, + "loss": 0.2085, + "step": 3557 + }, + { + "epoch": 1.18817832693271, + "grad_norm": 0.4136617092910776, + "learning_rate": 7.559227547295007e-06, + "loss": 0.1918, + "step": 3558 + }, + { + "epoch": 1.1885122724995827, + "grad_norm": 0.3950033730804212, + "learning_rate": 7.557557880075082e-06, + "loss": 0.191, + "step": 3559 + }, + { + "epoch": 1.1888462180664552, + "grad_norm": 0.44611070160298427, + "learning_rate": 7.555887826505926e-06, + "loss": 0.1998, + "step": 3560 + }, + { + "epoch": 1.1891801636333277, + "grad_norm": 0.49252383091679314, + "learning_rate": 7.554217386839817e-06, + "loss": 0.2101, + "step": 3561 + }, + { + "epoch": 1.1895141092002004, + "grad_norm": 0.43600290528031405, + "learning_rate": 7.552546561329097e-06, + "loss": 0.2078, + "step": 3562 + }, + { + "epoch": 1.189848054767073, + "grad_norm": 0.4641562946328928, + "learning_rate": 7.550875350226166e-06, + "loss": 0.2018, + "step": 3563 + }, + { + "epoch": 1.1901820003339456, + "grad_norm": 0.4107029251971812, + "learning_rate": 7.549203753783475e-06, + "loss": 0.1873, + "step": 3564 + }, + { + "epoch": 1.190515945900818, + "grad_norm": 0.8638079712764937, + "learning_rate": 7.547531772253542e-06, + "loss": 0.2183, + "step": 3565 + }, + { + "epoch": 1.1908498914676908, + "grad_norm": 0.46739237740310874, + "learning_rate": 7.54585940588894e-06, + "loss": 0.2061, + "step": 3566 + }, + { + "epoch": 1.1911838370345633, + "grad_norm": 0.44207366884569477, + "learning_rate": 7.544186654942296e-06, + "loss": 0.1888, + "step": 3567 + }, + { + "epoch": 1.191517782601436, + "grad_norm": 0.40029726288340856, + "learning_rate": 7.542513519666302e-06, + "loss": 0.1842, + "step": 3568 + }, + { + "epoch": 1.1918517281683085, + "grad_norm": 0.45850744724415704, + "learning_rate": 7.540840000313705e-06, + "loss": 0.2071, + "step": 3569 + }, + { + "epoch": 1.1921856737351813, + "grad_norm": 0.4217447661377353, + "learning_rate": 7.539166097137306e-06, + "loss": 0.1989, + "step": 3570 + }, + { + "epoch": 1.1925196193020537, + "grad_norm": 0.47099508009727703, + "learning_rate": 7.537491810389972e-06, + "loss": 0.2064, + "step": 3571 + }, + { + "epoch": 1.1928535648689262, + "grad_norm": 0.49006517879961325, + "learning_rate": 7.535817140324622e-06, + "loss": 0.2089, + "step": 3572 + }, + { + "epoch": 1.193187510435799, + "grad_norm": 0.45108690927294987, + "learning_rate": 7.534142087194234e-06, + "loss": 0.2034, + "step": 3573 + }, + { + "epoch": 1.1935214560026717, + "grad_norm": 0.40647564236000283, + "learning_rate": 7.532466651251846e-06, + "loss": 0.1976, + "step": 3574 + }, + { + "epoch": 1.1938554015695442, + "grad_norm": 0.4361047207843498, + "learning_rate": 7.5307908327505506e-06, + "loss": 0.186, + "step": 3575 + }, + { + "epoch": 1.1941893471364167, + "grad_norm": 0.41159680206823224, + "learning_rate": 7.529114631943501e-06, + "loss": 0.18, + "step": 3576 + }, + { + "epoch": 1.1945232927032894, + "grad_norm": 0.5154574701726614, + "learning_rate": 7.527438049083908e-06, + "loss": 0.2203, + "step": 3577 + }, + { + "epoch": 1.1948572382701619, + "grad_norm": 0.48966209480112627, + "learning_rate": 7.5257610844250385e-06, + "loss": 0.2164, + "step": 3578 + }, + { + "epoch": 1.1951911838370346, + "grad_norm": 0.484286723912471, + "learning_rate": 7.524083738220214e-06, + "loss": 0.2147, + "step": 3579 + }, + { + "epoch": 1.195525129403907, + "grad_norm": 0.4568042601993156, + "learning_rate": 7.522406010722824e-06, + "loss": 0.2115, + "step": 3580 + }, + { + "epoch": 1.1958590749707798, + "grad_norm": 0.4637639130204421, + "learning_rate": 7.5207279021863045e-06, + "loss": 0.2011, + "step": 3581 + }, + { + "epoch": 1.1961930205376523, + "grad_norm": 0.4503469517122692, + "learning_rate": 7.5190494128641545e-06, + "loss": 0.2017, + "step": 3582 + }, + { + "epoch": 1.196526966104525, + "grad_norm": 0.4115714466848455, + "learning_rate": 7.5173705430099295e-06, + "loss": 0.1894, + "step": 3583 + }, + { + "epoch": 1.1968609116713975, + "grad_norm": 0.43313860284242234, + "learning_rate": 7.515691292877243e-06, + "loss": 0.2049, + "step": 3584 + }, + { + "epoch": 1.1971948572382702, + "grad_norm": 0.44242325330952814, + "learning_rate": 7.514011662719766e-06, + "loss": 0.2134, + "step": 3585 + }, + { + "epoch": 1.1975288028051427, + "grad_norm": 0.49641971012087477, + "learning_rate": 7.512331652791226e-06, + "loss": 0.2194, + "step": 3586 + }, + { + "epoch": 1.1978627483720155, + "grad_norm": 0.4505528169848719, + "learning_rate": 7.510651263345408e-06, + "loss": 0.2128, + "step": 3587 + }, + { + "epoch": 1.198196693938888, + "grad_norm": 0.4207298979058369, + "learning_rate": 7.508970494636154e-06, + "loss": 0.203, + "step": 3588 + }, + { + "epoch": 1.1985306395057607, + "grad_norm": 0.41439317908897016, + "learning_rate": 7.507289346917366e-06, + "loss": 0.2028, + "step": 3589 + }, + { + "epoch": 1.1988645850726332, + "grad_norm": 0.3898376423037865, + "learning_rate": 7.505607820442997e-06, + "loss": 0.1838, + "step": 3590 + }, + { + "epoch": 1.1991985306395057, + "grad_norm": 0.44797911557528114, + "learning_rate": 7.503925915467066e-06, + "loss": 0.2067, + "step": 3591 + }, + { + "epoch": 1.1995324762063784, + "grad_norm": 0.4517517097789117, + "learning_rate": 7.502243632243645e-06, + "loss": 0.2047, + "step": 3592 + }, + { + "epoch": 1.1998664217732509, + "grad_norm": 0.43869540600955803, + "learning_rate": 7.500560971026856e-06, + "loss": 0.2081, + "step": 3593 + }, + { + "epoch": 1.2002003673401236, + "grad_norm": 0.46044286409999025, + "learning_rate": 7.498877932070892e-06, + "loss": 0.2008, + "step": 3594 + }, + { + "epoch": 1.200534312906996, + "grad_norm": 0.46702326825878965, + "learning_rate": 7.497194515629992e-06, + "loss": 0.1912, + "step": 3595 + }, + { + "epoch": 1.2008682584738688, + "grad_norm": 0.42290587241087874, + "learning_rate": 7.4955107219584575e-06, + "loss": 0.1955, + "step": 3596 + }, + { + "epoch": 1.2012022040407413, + "grad_norm": 0.42810888590783125, + "learning_rate": 7.493826551310645e-06, + "loss": 0.2, + "step": 3597 + }, + { + "epoch": 1.201536149607614, + "grad_norm": 0.44645503536762926, + "learning_rate": 7.492142003940966e-06, + "loss": 0.2086, + "step": 3598 + }, + { + "epoch": 1.2018700951744865, + "grad_norm": 0.3919758646583704, + "learning_rate": 7.490457080103895e-06, + "loss": 0.1833, + "step": 3599 + }, + { + "epoch": 1.2022040407413592, + "grad_norm": 0.440006706453243, + "learning_rate": 7.4887717800539584e-06, + "loss": 0.2122, + "step": 3600 + }, + { + "epoch": 1.2025379863082317, + "grad_norm": 0.41535578129544654, + "learning_rate": 7.48708610404574e-06, + "loss": 0.1889, + "step": 3601 + }, + { + "epoch": 1.2028719318751044, + "grad_norm": 0.4473257740232535, + "learning_rate": 7.48540005233388e-06, + "loss": 0.1918, + "step": 3602 + }, + { + "epoch": 1.203205877441977, + "grad_norm": 0.4341216912769296, + "learning_rate": 7.483713625173078e-06, + "loss": 0.2096, + "step": 3603 + }, + { + "epoch": 1.2035398230088497, + "grad_norm": 0.40777171525617034, + "learning_rate": 7.482026822818088e-06, + "loss": 0.1919, + "step": 3604 + }, + { + "epoch": 1.2038737685757221, + "grad_norm": 0.48363102294243154, + "learning_rate": 7.480339645523721e-06, + "loss": 0.2269, + "step": 3605 + }, + { + "epoch": 1.2042077141425946, + "grad_norm": 0.40851845309559937, + "learning_rate": 7.478652093544846e-06, + "loss": 0.1787, + "step": 3606 + }, + { + "epoch": 1.2045416597094674, + "grad_norm": 0.3933853695150573, + "learning_rate": 7.476964167136388e-06, + "loss": 0.1943, + "step": 3607 + }, + { + "epoch": 1.20487560527634, + "grad_norm": 0.4244304751713145, + "learning_rate": 7.475275866553326e-06, + "loss": 0.2014, + "step": 3608 + }, + { + "epoch": 1.2052095508432126, + "grad_norm": 0.6049087445094025, + "learning_rate": 7.473587192050698e-06, + "loss": 0.2119, + "step": 3609 + }, + { + "epoch": 1.205543496410085, + "grad_norm": 0.4302000569946228, + "learning_rate": 7.471898143883601e-06, + "loss": 0.1866, + "step": 3610 + }, + { + "epoch": 1.2058774419769578, + "grad_norm": 0.4329034497056259, + "learning_rate": 7.470208722307183e-06, + "loss": 0.2069, + "step": 3611 + }, + { + "epoch": 1.2062113875438303, + "grad_norm": 0.4175942295887134, + "learning_rate": 7.468518927576653e-06, + "loss": 0.2029, + "step": 3612 + }, + { + "epoch": 1.206545333110703, + "grad_norm": 0.40484000172415735, + "learning_rate": 7.466828759947271e-06, + "loss": 0.1942, + "step": 3613 + }, + { + "epoch": 1.2068792786775755, + "grad_norm": 0.412447371421282, + "learning_rate": 7.465138219674359e-06, + "loss": 0.2098, + "step": 3614 + }, + { + "epoch": 1.2072132242444482, + "grad_norm": 0.4136319006653538, + "learning_rate": 7.463447307013294e-06, + "loss": 0.1944, + "step": 3615 + }, + { + "epoch": 1.2075471698113207, + "grad_norm": 0.4652498470308249, + "learning_rate": 7.461756022219507e-06, + "loss": 0.2017, + "step": 3616 + }, + { + "epoch": 1.2078811153781934, + "grad_norm": 0.451017605411755, + "learning_rate": 7.460064365548486e-06, + "loss": 0.1925, + "step": 3617 + }, + { + "epoch": 1.208215060945066, + "grad_norm": 0.44487372326102165, + "learning_rate": 7.458372337255777e-06, + "loss": 0.2075, + "step": 3618 + }, + { + "epoch": 1.2085490065119386, + "grad_norm": 0.4495806275102939, + "learning_rate": 7.45667993759698e-06, + "loss": 0.2081, + "step": 3619 + }, + { + "epoch": 1.2088829520788111, + "grad_norm": 0.4430843200910173, + "learning_rate": 7.454987166827751e-06, + "loss": 0.206, + "step": 3620 + }, + { + "epoch": 1.2092168976456836, + "grad_norm": 0.4233773457511084, + "learning_rate": 7.4532940252038055e-06, + "loss": 0.2099, + "step": 3621 + }, + { + "epoch": 1.2095508432125563, + "grad_norm": 0.4321928280916543, + "learning_rate": 7.45160051298091e-06, + "loss": 0.1958, + "step": 3622 + }, + { + "epoch": 1.209884788779429, + "grad_norm": 0.5607630429277455, + "learning_rate": 7.4499066304148904e-06, + "loss": 0.2037, + "step": 3623 + }, + { + "epoch": 1.2102187343463016, + "grad_norm": 0.4184204477068433, + "learning_rate": 7.448212377761628e-06, + "loss": 0.2049, + "step": 3624 + }, + { + "epoch": 1.210552679913174, + "grad_norm": 0.4286820509175345, + "learning_rate": 7.4465177552770585e-06, + "loss": 0.2065, + "step": 3625 + }, + { + "epoch": 1.2108866254800468, + "grad_norm": 0.4204690969276894, + "learning_rate": 7.444822763217174e-06, + "loss": 0.1969, + "step": 3626 + }, + { + "epoch": 1.2112205710469193, + "grad_norm": 0.45133162737457627, + "learning_rate": 7.443127401838026e-06, + "loss": 0.2121, + "step": 3627 + }, + { + "epoch": 1.211554516613792, + "grad_norm": 0.6038373572960923, + "learning_rate": 7.441431671395717e-06, + "loss": 0.2025, + "step": 3628 + }, + { + "epoch": 1.2118884621806645, + "grad_norm": 0.46725093116741995, + "learning_rate": 7.439735572146407e-06, + "loss": 0.2076, + "step": 3629 + }, + { + "epoch": 1.2122224077475372, + "grad_norm": 0.4469655909074122, + "learning_rate": 7.438039104346312e-06, + "loss": 0.1991, + "step": 3630 + }, + { + "epoch": 1.2125563533144097, + "grad_norm": 0.4339059574842912, + "learning_rate": 7.436342268251702e-06, + "loss": 0.1954, + "step": 3631 + }, + { + "epoch": 1.2128902988812824, + "grad_norm": 0.47586137866658257, + "learning_rate": 7.434645064118906e-06, + "loss": 0.2117, + "step": 3632 + }, + { + "epoch": 1.213224244448155, + "grad_norm": 0.39511341244546483, + "learning_rate": 7.432947492204308e-06, + "loss": 0.1867, + "step": 3633 + }, + { + "epoch": 1.2135581900150276, + "grad_norm": 0.42677190119791664, + "learning_rate": 7.431249552764342e-06, + "loss": 0.2021, + "step": 3634 + }, + { + "epoch": 1.2138921355819001, + "grad_norm": 0.4669699424010275, + "learning_rate": 7.429551246055504e-06, + "loss": 0.1998, + "step": 3635 + }, + { + "epoch": 1.2142260811487728, + "grad_norm": 0.4174572295361658, + "learning_rate": 7.427852572334344e-06, + "loss": 0.2047, + "step": 3636 + }, + { + "epoch": 1.2145600267156453, + "grad_norm": 0.49130052281682524, + "learning_rate": 7.426153531857466e-06, + "loss": 0.2126, + "step": 3637 + }, + { + "epoch": 1.214893972282518, + "grad_norm": 0.39466608599092157, + "learning_rate": 7.424454124881531e-06, + "loss": 0.1912, + "step": 3638 + }, + { + "epoch": 1.2152279178493905, + "grad_norm": 0.5280861811057292, + "learning_rate": 7.422754351663252e-06, + "loss": 0.2041, + "step": 3639 + }, + { + "epoch": 1.215561863416263, + "grad_norm": 0.42326684754756433, + "learning_rate": 7.4210542124594e-06, + "loss": 0.1907, + "step": 3640 + }, + { + "epoch": 1.2158958089831358, + "grad_norm": 0.44549725989101097, + "learning_rate": 7.419353707526804e-06, + "loss": 0.2083, + "step": 3641 + }, + { + "epoch": 1.2162297545500083, + "grad_norm": 0.43094208142967383, + "learning_rate": 7.417652837122345e-06, + "loss": 0.2167, + "step": 3642 + }, + { + "epoch": 1.216563700116881, + "grad_norm": 0.5041095084876982, + "learning_rate": 7.4159516015029545e-06, + "loss": 0.2003, + "step": 3643 + }, + { + "epoch": 1.2168976456837535, + "grad_norm": 0.415265396940736, + "learning_rate": 7.414250000925629e-06, + "loss": 0.1979, + "step": 3644 + }, + { + "epoch": 1.2172315912506262, + "grad_norm": 0.4362697950301838, + "learning_rate": 7.412548035647416e-06, + "loss": 0.2035, + "step": 3645 + }, + { + "epoch": 1.2175655368174987, + "grad_norm": 0.3976738370024975, + "learning_rate": 7.4108457059254135e-06, + "loss": 0.1947, + "step": 3646 + }, + { + "epoch": 1.2178994823843714, + "grad_norm": 0.44831231308322833, + "learning_rate": 7.40914301201678e-06, + "loss": 0.2095, + "step": 3647 + }, + { + "epoch": 1.218233427951244, + "grad_norm": 0.38688375643877443, + "learning_rate": 7.407439954178729e-06, + "loss": 0.1935, + "step": 3648 + }, + { + "epoch": 1.2185673735181166, + "grad_norm": 0.42809394482498125, + "learning_rate": 7.405736532668525e-06, + "loss": 0.1997, + "step": 3649 + }, + { + "epoch": 1.218901319084989, + "grad_norm": 0.4272147459706146, + "learning_rate": 7.4040327477434926e-06, + "loss": 0.1933, + "step": 3650 + }, + { + "epoch": 1.2192352646518618, + "grad_norm": 0.4233262886986238, + "learning_rate": 7.402328599661006e-06, + "loss": 0.1961, + "step": 3651 + }, + { + "epoch": 1.2195692102187343, + "grad_norm": 0.4380339243339666, + "learning_rate": 7.400624088678497e-06, + "loss": 0.202, + "step": 3652 + }, + { + "epoch": 1.219903155785607, + "grad_norm": 0.4327575082842074, + "learning_rate": 7.398919215053455e-06, + "loss": 0.205, + "step": 3653 + }, + { + "epoch": 1.2202371013524795, + "grad_norm": 0.4371121995452199, + "learning_rate": 7.397213979043418e-06, + "loss": 0.2089, + "step": 3654 + }, + { + "epoch": 1.220571046919352, + "grad_norm": 0.4384003879413628, + "learning_rate": 7.395508380905983e-06, + "loss": 0.2007, + "step": 3655 + }, + { + "epoch": 1.2209049924862247, + "grad_norm": 0.42353068846381453, + "learning_rate": 7.393802420898801e-06, + "loss": 0.1949, + "step": 3656 + }, + { + "epoch": 1.2212389380530975, + "grad_norm": 0.388881556989798, + "learning_rate": 7.392096099279579e-06, + "loss": 0.1945, + "step": 3657 + }, + { + "epoch": 1.22157288361997, + "grad_norm": 0.4694668160111598, + "learning_rate": 7.390389416306073e-06, + "loss": 0.1988, + "step": 3658 + }, + { + "epoch": 1.2219068291868425, + "grad_norm": 0.41573848399930674, + "learning_rate": 7.3886823722361e-06, + "loss": 0.195, + "step": 3659 + }, + { + "epoch": 1.2222407747537152, + "grad_norm": 0.4196740807199658, + "learning_rate": 7.386974967327531e-06, + "loss": 0.1941, + "step": 3660 + }, + { + "epoch": 1.2225747203205877, + "grad_norm": 0.3842082607552933, + "learning_rate": 7.385267201838284e-06, + "loss": 0.1816, + "step": 3661 + }, + { + "epoch": 1.2229086658874604, + "grad_norm": 0.4210064079129366, + "learning_rate": 7.383559076026343e-06, + "loss": 0.2085, + "step": 3662 + }, + { + "epoch": 1.2232426114543329, + "grad_norm": 0.40011050080472454, + "learning_rate": 7.381850590149737e-06, + "loss": 0.1921, + "step": 3663 + }, + { + "epoch": 1.2235765570212056, + "grad_norm": 0.41455617156730395, + "learning_rate": 7.380141744466555e-06, + "loss": 0.1862, + "step": 3664 + }, + { + "epoch": 1.223910502588078, + "grad_norm": 0.42642811953196785, + "learning_rate": 7.378432539234936e-06, + "loss": 0.1976, + "step": 3665 + }, + { + "epoch": 1.2242444481549508, + "grad_norm": 0.4540552515895627, + "learning_rate": 7.376722974713078e-06, + "loss": 0.201, + "step": 3666 + }, + { + "epoch": 1.2245783937218233, + "grad_norm": 0.4442490667004041, + "learning_rate": 7.3750130511592275e-06, + "loss": 0.1957, + "step": 3667 + }, + { + "epoch": 1.224912339288696, + "grad_norm": 0.49385689080880146, + "learning_rate": 7.373302768831694e-06, + "loss": 0.2133, + "step": 3668 + }, + { + "epoch": 1.2252462848555685, + "grad_norm": 0.434217175678413, + "learning_rate": 7.371592127988831e-06, + "loss": 0.2016, + "step": 3669 + }, + { + "epoch": 1.225580230422441, + "grad_norm": 0.39530828895439507, + "learning_rate": 7.369881128889052e-06, + "loss": 0.1923, + "step": 3670 + }, + { + "epoch": 1.2259141759893137, + "grad_norm": 0.40983470561623925, + "learning_rate": 7.368169771790825e-06, + "loss": 0.2084, + "step": 3671 + }, + { + "epoch": 1.2262481215561865, + "grad_norm": 0.4571876738058572, + "learning_rate": 7.366458056952668e-06, + "loss": 0.1997, + "step": 3672 + }, + { + "epoch": 1.226582067123059, + "grad_norm": 0.426026877765367, + "learning_rate": 7.36474598463316e-06, + "loss": 0.2005, + "step": 3673 + }, + { + "epoch": 1.2269160126899314, + "grad_norm": 0.4118963504455984, + "learning_rate": 7.363033555090925e-06, + "loss": 0.1886, + "step": 3674 + }, + { + "epoch": 1.2272499582568042, + "grad_norm": 0.40449592628596953, + "learning_rate": 7.361320768584648e-06, + "loss": 0.1994, + "step": 3675 + }, + { + "epoch": 1.2275839038236767, + "grad_norm": 0.4197205384614, + "learning_rate": 7.359607625373065e-06, + "loss": 0.1891, + "step": 3676 + }, + { + "epoch": 1.2279178493905494, + "grad_norm": 0.4325982836766197, + "learning_rate": 7.357894125714967e-06, + "loss": 0.2011, + "step": 3677 + }, + { + "epoch": 1.2282517949574219, + "grad_norm": 0.44776808435438165, + "learning_rate": 7.3561802698691976e-06, + "loss": 0.1929, + "step": 3678 + }, + { + "epoch": 1.2285857405242946, + "grad_norm": 0.4529997887257035, + "learning_rate": 7.354466058094656e-06, + "loss": 0.2019, + "step": 3679 + }, + { + "epoch": 1.228919686091167, + "grad_norm": 0.4780531134449325, + "learning_rate": 7.352751490650294e-06, + "loss": 0.2334, + "step": 3680 + }, + { + "epoch": 1.2292536316580398, + "grad_norm": 0.4327464219707674, + "learning_rate": 7.3510365677951155e-06, + "loss": 0.1923, + "step": 3681 + }, + { + "epoch": 1.2295875772249123, + "grad_norm": 0.4494358023469274, + "learning_rate": 7.349321289788181e-06, + "loss": 0.2073, + "step": 3682 + }, + { + "epoch": 1.229921522791785, + "grad_norm": 0.4186072999519254, + "learning_rate": 7.3476056568886036e-06, + "loss": 0.1897, + "step": 3683 + }, + { + "epoch": 1.2302554683586575, + "grad_norm": 0.4505587151771832, + "learning_rate": 7.34588966935555e-06, + "loss": 0.2074, + "step": 3684 + }, + { + "epoch": 1.2305894139255302, + "grad_norm": 0.4712172452986157, + "learning_rate": 7.344173327448238e-06, + "loss": 0.1955, + "step": 3685 + }, + { + "epoch": 1.2309233594924027, + "grad_norm": 0.42151892488420034, + "learning_rate": 7.342456631425945e-06, + "loss": 0.1968, + "step": 3686 + }, + { + "epoch": 1.2312573050592754, + "grad_norm": 0.4858147778552328, + "learning_rate": 7.340739581547996e-06, + "loss": 0.2112, + "step": 3687 + }, + { + "epoch": 1.231591250626148, + "grad_norm": 0.39299160270288175, + "learning_rate": 7.339022178073772e-06, + "loss": 0.1783, + "step": 3688 + }, + { + "epoch": 1.2319251961930204, + "grad_norm": 0.4347870834866963, + "learning_rate": 7.337304421262706e-06, + "loss": 0.2005, + "step": 3689 + }, + { + "epoch": 1.2322591417598932, + "grad_norm": 0.3944954395837461, + "learning_rate": 7.335586311374287e-06, + "loss": 0.1804, + "step": 3690 + }, + { + "epoch": 1.2325930873267656, + "grad_norm": 0.4511832507539604, + "learning_rate": 7.3338678486680545e-06, + "loss": 0.226, + "step": 3691 + }, + { + "epoch": 1.2329270328936384, + "grad_norm": 0.4951489024207722, + "learning_rate": 7.3321490334036035e-06, + "loss": 0.233, + "step": 3692 + }, + { + "epoch": 1.2332609784605109, + "grad_norm": 0.42695922666442165, + "learning_rate": 7.3304298658405815e-06, + "loss": 0.1988, + "step": 3693 + }, + { + "epoch": 1.2335949240273836, + "grad_norm": 0.4559841391821671, + "learning_rate": 7.328710346238688e-06, + "loss": 0.1873, + "step": 3694 + }, + { + "epoch": 1.233928869594256, + "grad_norm": 0.4176584911033391, + "learning_rate": 7.326990474857676e-06, + "loss": 0.1755, + "step": 3695 + }, + { + "epoch": 1.2342628151611288, + "grad_norm": 0.388833205756847, + "learning_rate": 7.3252702519573545e-06, + "loss": 0.1818, + "step": 3696 + }, + { + "epoch": 1.2345967607280013, + "grad_norm": 0.4741753202108229, + "learning_rate": 7.323549677797582e-06, + "loss": 0.2035, + "step": 3697 + }, + { + "epoch": 1.234930706294874, + "grad_norm": 0.41197049335397234, + "learning_rate": 7.3218287526382716e-06, + "loss": 0.1884, + "step": 3698 + }, + { + "epoch": 1.2352646518617465, + "grad_norm": 0.4519660783569149, + "learning_rate": 7.320107476739389e-06, + "loss": 0.1987, + "step": 3699 + }, + { + "epoch": 1.2355985974286192, + "grad_norm": 0.4441870961138195, + "learning_rate": 7.318385850360954e-06, + "loss": 0.2002, + "step": 3700 + }, + { + "epoch": 1.2359325429954917, + "grad_norm": 0.4771636776284061, + "learning_rate": 7.316663873763039e-06, + "loss": 0.2005, + "step": 3701 + }, + { + "epoch": 1.2362664885623644, + "grad_norm": 0.421346363486521, + "learning_rate": 7.314941547205767e-06, + "loss": 0.2057, + "step": 3702 + }, + { + "epoch": 1.236600434129237, + "grad_norm": 0.3981895128659755, + "learning_rate": 7.313218870949317e-06, + "loss": 0.1943, + "step": 3703 + }, + { + "epoch": 1.2369343796961094, + "grad_norm": 0.43622076090027484, + "learning_rate": 7.31149584525392e-06, + "loss": 0.2047, + "step": 3704 + }, + { + "epoch": 1.2372683252629821, + "grad_norm": 0.39492812624758417, + "learning_rate": 7.309772470379856e-06, + "loss": 0.2045, + "step": 3705 + }, + { + "epoch": 1.2376022708298549, + "grad_norm": 0.39980899433349076, + "learning_rate": 7.308048746587466e-06, + "loss": 0.1904, + "step": 3706 + }, + { + "epoch": 1.2379362163967274, + "grad_norm": 0.4226276960308284, + "learning_rate": 7.3063246741371365e-06, + "loss": 0.1934, + "step": 3707 + }, + { + "epoch": 1.2382701619635998, + "grad_norm": 0.4416996943577303, + "learning_rate": 7.304600253289308e-06, + "loss": 0.1986, + "step": 3708 + }, + { + "epoch": 1.2386041075304726, + "grad_norm": 0.4084631963077389, + "learning_rate": 7.302875484304476e-06, + "loss": 0.2003, + "step": 3709 + }, + { + "epoch": 1.238938053097345, + "grad_norm": 0.43539905341193225, + "learning_rate": 7.301150367443186e-06, + "loss": 0.2037, + "step": 3710 + }, + { + "epoch": 1.2392719986642178, + "grad_norm": 0.4253528674527778, + "learning_rate": 7.299424902966039e-06, + "loss": 0.2084, + "step": 3711 + }, + { + "epoch": 1.2396059442310903, + "grad_norm": 0.4139818088928367, + "learning_rate": 7.297699091133685e-06, + "loss": 0.2113, + "step": 3712 + }, + { + "epoch": 1.239939889797963, + "grad_norm": 0.42337946777331986, + "learning_rate": 7.295972932206827e-06, + "loss": 0.1946, + "step": 3713 + }, + { + "epoch": 1.2402738353648355, + "grad_norm": 0.42234185270273344, + "learning_rate": 7.2942464264462255e-06, + "loss": 0.1877, + "step": 3714 + }, + { + "epoch": 1.2406077809317082, + "grad_norm": 0.42468102269690094, + "learning_rate": 7.292519574112688e-06, + "loss": 0.1952, + "step": 3715 + }, + { + "epoch": 1.2409417264985807, + "grad_norm": 0.45061158016324065, + "learning_rate": 7.290792375467074e-06, + "loss": 0.2077, + "step": 3716 + }, + { + "epoch": 1.2412756720654534, + "grad_norm": 0.4374334755090788, + "learning_rate": 7.2890648307702985e-06, + "loss": 0.208, + "step": 3717 + }, + { + "epoch": 1.241609617632326, + "grad_norm": 0.44401302086261285, + "learning_rate": 7.287336940283327e-06, + "loss": 0.2005, + "step": 3718 + }, + { + "epoch": 1.2419435631991984, + "grad_norm": 0.4366476902185395, + "learning_rate": 7.28560870426718e-06, + "loss": 0.2007, + "step": 3719 + }, + { + "epoch": 1.2422775087660711, + "grad_norm": 0.43419928883049946, + "learning_rate": 7.2838801229829245e-06, + "loss": 0.1975, + "step": 3720 + }, + { + "epoch": 1.2426114543329438, + "grad_norm": 0.40640252159976503, + "learning_rate": 7.2821511966916845e-06, + "loss": 0.1994, + "step": 3721 + }, + { + "epoch": 1.2429453998998163, + "grad_norm": 0.42257019966830034, + "learning_rate": 7.280421925654635e-06, + "loss": 0.2021, + "step": 3722 + }, + { + "epoch": 1.2432793454666888, + "grad_norm": 0.40500129240346916, + "learning_rate": 7.278692310133003e-06, + "loss": 0.1919, + "step": 3723 + }, + { + "epoch": 1.2436132910335616, + "grad_norm": 0.42012490022988247, + "learning_rate": 7.276962350388067e-06, + "loss": 0.1966, + "step": 3724 + }, + { + "epoch": 1.243947236600434, + "grad_norm": 0.4029324763396112, + "learning_rate": 7.275232046681157e-06, + "loss": 0.1896, + "step": 3725 + }, + { + "epoch": 1.2442811821673068, + "grad_norm": 0.4113849025835322, + "learning_rate": 7.273501399273656e-06, + "loss": 0.1969, + "step": 3726 + }, + { + "epoch": 1.2446151277341793, + "grad_norm": 0.4507044786866005, + "learning_rate": 7.271770408427e-06, + "loss": 0.2062, + "step": 3727 + }, + { + "epoch": 1.244949073301052, + "grad_norm": 0.45701381903540794, + "learning_rate": 7.2700390744026735e-06, + "loss": 0.2079, + "step": 3728 + }, + { + "epoch": 1.2452830188679245, + "grad_norm": 0.47264780094018927, + "learning_rate": 7.2683073974622165e-06, + "loss": 0.21, + "step": 3729 + }, + { + "epoch": 1.2456169644347972, + "grad_norm": 0.46485397157026925, + "learning_rate": 7.26657537786722e-06, + "loss": 0.2165, + "step": 3730 + }, + { + "epoch": 1.2459509100016697, + "grad_norm": 0.40938039577104884, + "learning_rate": 7.264843015879321e-06, + "loss": 0.1923, + "step": 3731 + }, + { + "epoch": 1.2462848555685424, + "grad_norm": 0.5185887899593448, + "learning_rate": 7.263110311760221e-06, + "loss": 0.2086, + "step": 3732 + }, + { + "epoch": 1.246618801135415, + "grad_norm": 0.4628231707028568, + "learning_rate": 7.2613772657716585e-06, + "loss": 0.1927, + "step": 3733 + }, + { + "epoch": 1.2469527467022876, + "grad_norm": 0.49261373578200623, + "learning_rate": 7.259643878175434e-06, + "loss": 0.2183, + "step": 3734 + }, + { + "epoch": 1.2472866922691601, + "grad_norm": 0.4642674170595037, + "learning_rate": 7.2579101492333956e-06, + "loss": 0.194, + "step": 3735 + }, + { + "epoch": 1.2476206378360328, + "grad_norm": 0.4697562787124826, + "learning_rate": 7.256176079207442e-06, + "loss": 0.2052, + "step": 3736 + }, + { + "epoch": 1.2479545834029053, + "grad_norm": 0.43296428720065006, + "learning_rate": 7.254441668359527e-06, + "loss": 0.2019, + "step": 3737 + }, + { + "epoch": 1.2482885289697778, + "grad_norm": 0.5097000021139282, + "learning_rate": 7.252706916951653e-06, + "loss": 0.2192, + "step": 3738 + }, + { + "epoch": 1.2486224745366505, + "grad_norm": 0.4864215004236803, + "learning_rate": 7.250971825245874e-06, + "loss": 0.2168, + "step": 3739 + }, + { + "epoch": 1.248956420103523, + "grad_norm": 0.41838422249957957, + "learning_rate": 7.249236393504296e-06, + "loss": 0.2018, + "step": 3740 + }, + { + "epoch": 1.2492903656703958, + "grad_norm": 0.39261054860167127, + "learning_rate": 7.247500621989078e-06, + "loss": 0.1887, + "step": 3741 + }, + { + "epoch": 1.2496243112372682, + "grad_norm": 0.420859082136698, + "learning_rate": 7.245764510962426e-06, + "loss": 0.2072, + "step": 3742 + }, + { + "epoch": 1.249958256804141, + "grad_norm": 0.44399344347580727, + "learning_rate": 7.244028060686603e-06, + "loss": 0.2007, + "step": 3743 + }, + { + "epoch": 1.2502922023710135, + "grad_norm": 0.403279573354582, + "learning_rate": 7.242291271423919e-06, + "loss": 0.1983, + "step": 3744 + }, + { + "epoch": 1.2506261479378862, + "grad_norm": 0.40558518807409527, + "learning_rate": 7.240554143436735e-06, + "loss": 0.1942, + "step": 3745 + }, + { + "epoch": 1.2509600935047587, + "grad_norm": 0.45525307076929966, + "learning_rate": 7.238816676987467e-06, + "loss": 0.196, + "step": 3746 + }, + { + "epoch": 1.2512940390716314, + "grad_norm": 0.47302635450133046, + "learning_rate": 7.237078872338579e-06, + "loss": 0.2172, + "step": 3747 + }, + { + "epoch": 1.2516279846385039, + "grad_norm": 0.4115995099581479, + "learning_rate": 7.235340729752584e-06, + "loss": 0.202, + "step": 3748 + }, + { + "epoch": 1.2519619302053766, + "grad_norm": 0.40330391629967777, + "learning_rate": 7.233602249492055e-06, + "loss": 0.1861, + "step": 3749 + }, + { + "epoch": 1.252295875772249, + "grad_norm": 0.4611901621435869, + "learning_rate": 7.2318634318196045e-06, + "loss": 0.1993, + "step": 3750 + }, + { + "epoch": 1.2526298213391218, + "grad_norm": 0.444872798872787, + "learning_rate": 7.230124276997903e-06, + "loss": 0.1939, + "step": 3751 + }, + { + "epoch": 1.2529637669059943, + "grad_norm": 0.39062409444486607, + "learning_rate": 7.228384785289671e-06, + "loss": 0.1878, + "step": 3752 + }, + { + "epoch": 1.2532977124728668, + "grad_norm": 0.38750288835226704, + "learning_rate": 7.2266449569576804e-06, + "loss": 0.2006, + "step": 3753 + }, + { + "epoch": 1.2536316580397395, + "grad_norm": 0.4739440204799354, + "learning_rate": 7.224904792264748e-06, + "loss": 0.1972, + "step": 3754 + }, + { + "epoch": 1.2539656036066122, + "grad_norm": 0.43430096931368434, + "learning_rate": 7.223164291473752e-06, + "loss": 0.1988, + "step": 3755 + }, + { + "epoch": 1.2542995491734847, + "grad_norm": 0.4797983430080601, + "learning_rate": 7.221423454847611e-06, + "loss": 0.2029, + "step": 3756 + }, + { + "epoch": 1.2546334947403572, + "grad_norm": 0.43552537544104014, + "learning_rate": 7.219682282649302e-06, + "loss": 0.1982, + "step": 3757 + }, + { + "epoch": 1.25496744030723, + "grad_norm": 0.44330876808250125, + "learning_rate": 7.2179407751418485e-06, + "loss": 0.1996, + "step": 3758 + }, + { + "epoch": 1.2553013858741024, + "grad_norm": 0.40349763911316494, + "learning_rate": 7.216198932588325e-06, + "loss": 0.1898, + "step": 3759 + }, + { + "epoch": 1.2556353314409752, + "grad_norm": 0.4307633819537129, + "learning_rate": 7.214456755251858e-06, + "loss": 0.2011, + "step": 3760 + }, + { + "epoch": 1.2559692770078477, + "grad_norm": 0.43747344877924577, + "learning_rate": 7.212714243395623e-06, + "loss": 0.1935, + "step": 3761 + }, + { + "epoch": 1.2563032225747204, + "grad_norm": 0.439147201532576, + "learning_rate": 7.210971397282848e-06, + "loss": 0.1968, + "step": 3762 + }, + { + "epoch": 1.2566371681415929, + "grad_norm": 0.4091513057684739, + "learning_rate": 7.20922821717681e-06, + "loss": 0.1975, + "step": 3763 + }, + { + "epoch": 1.2569711137084656, + "grad_norm": 0.4001065589997931, + "learning_rate": 7.207484703340838e-06, + "loss": 0.1789, + "step": 3764 + }, + { + "epoch": 1.257305059275338, + "grad_norm": 0.41731976095296924, + "learning_rate": 7.205740856038308e-06, + "loss": 0.2007, + "step": 3765 + }, + { + "epoch": 1.2576390048422108, + "grad_norm": 0.4303382469501401, + "learning_rate": 7.2039966755326515e-06, + "loss": 0.2048, + "step": 3766 + }, + { + "epoch": 1.2579729504090833, + "grad_norm": 0.43856878267294563, + "learning_rate": 7.2022521620873456e-06, + "loss": 0.1904, + "step": 3767 + }, + { + "epoch": 1.2583068959759558, + "grad_norm": 0.4183896525795676, + "learning_rate": 7.2005073159659186e-06, + "loss": 0.1931, + "step": 3768 + }, + { + "epoch": 1.2586408415428285, + "grad_norm": 0.4647365271907042, + "learning_rate": 7.198762137431952e-06, + "loss": 0.2066, + "step": 3769 + }, + { + "epoch": 1.2589747871097012, + "grad_norm": 0.40618009076146466, + "learning_rate": 7.197016626749076e-06, + "loss": 0.185, + "step": 3770 + }, + { + "epoch": 1.2593087326765737, + "grad_norm": 0.4063402599280868, + "learning_rate": 7.195270784180968e-06, + "loss": 0.1964, + "step": 3771 + }, + { + "epoch": 1.2596426782434462, + "grad_norm": 0.4980873393441178, + "learning_rate": 7.193524609991359e-06, + "loss": 0.2167, + "step": 3772 + }, + { + "epoch": 1.259976623810319, + "grad_norm": 0.4203495972272948, + "learning_rate": 7.191778104444031e-06, + "loss": 0.1996, + "step": 3773 + }, + { + "epoch": 1.2603105693771914, + "grad_norm": 0.4691853538909953, + "learning_rate": 7.190031267802814e-06, + "loss": 0.2068, + "step": 3774 + }, + { + "epoch": 1.2606445149440642, + "grad_norm": 0.429747934431845, + "learning_rate": 7.188284100331585e-06, + "loss": 0.197, + "step": 3775 + }, + { + "epoch": 1.2609784605109366, + "grad_norm": 0.3989541459642685, + "learning_rate": 7.186536602294278e-06, + "loss": 0.1866, + "step": 3776 + }, + { + "epoch": 1.2613124060778094, + "grad_norm": 0.4079076916479964, + "learning_rate": 7.184788773954871e-06, + "loss": 0.191, + "step": 3777 + }, + { + "epoch": 1.2616463516446819, + "grad_norm": 0.37280995002049727, + "learning_rate": 7.1830406155773946e-06, + "loss": 0.1842, + "step": 3778 + }, + { + "epoch": 1.2619802972115546, + "grad_norm": 0.47418669121238, + "learning_rate": 7.181292127425928e-06, + "loss": 0.2017, + "step": 3779 + }, + { + "epoch": 1.262314242778427, + "grad_norm": 0.4557782177670249, + "learning_rate": 7.179543309764604e-06, + "loss": 0.2095, + "step": 3780 + }, + { + "epoch": 1.2626481883452998, + "grad_norm": 0.4038382372590161, + "learning_rate": 7.177794162857598e-06, + "loss": 0.1895, + "step": 3781 + }, + { + "epoch": 1.2629821339121723, + "grad_norm": 0.4257072698432259, + "learning_rate": 7.176044686969141e-06, + "loss": 0.2084, + "step": 3782 + }, + { + "epoch": 1.2633160794790448, + "grad_norm": 0.40296656493114924, + "learning_rate": 7.174294882363513e-06, + "loss": 0.2078, + "step": 3783 + }, + { + "epoch": 1.2636500250459175, + "grad_norm": 0.3972652707207378, + "learning_rate": 7.172544749305039e-06, + "loss": 0.2005, + "step": 3784 + }, + { + "epoch": 1.2639839706127902, + "grad_norm": 0.40141205047962275, + "learning_rate": 7.170794288058103e-06, + "loss": 0.1899, + "step": 3785 + }, + { + "epoch": 1.2643179161796627, + "grad_norm": 0.3924619438453352, + "learning_rate": 7.169043498887126e-06, + "loss": 0.1928, + "step": 3786 + }, + { + "epoch": 1.2646518617465352, + "grad_norm": 0.4310553864330334, + "learning_rate": 7.1672923820565925e-06, + "loss": 0.1945, + "step": 3787 + }, + { + "epoch": 1.264985807313408, + "grad_norm": 0.42295352764743993, + "learning_rate": 7.165540937831024e-06, + "loss": 0.1984, + "step": 3788 + }, + { + "epoch": 1.2653197528802806, + "grad_norm": 0.38478960335529383, + "learning_rate": 7.163789166474998e-06, + "loss": 0.1907, + "step": 3789 + }, + { + "epoch": 1.2656536984471531, + "grad_norm": 0.3964245436953759, + "learning_rate": 7.162037068253141e-06, + "loss": 0.1824, + "step": 3790 + }, + { + "epoch": 1.2659876440140256, + "grad_norm": 0.4650615379634907, + "learning_rate": 7.160284643430129e-06, + "loss": 0.2234, + "step": 3791 + }, + { + "epoch": 1.2663215895808984, + "grad_norm": 0.4988659718160951, + "learning_rate": 7.158531892270682e-06, + "loss": 0.1991, + "step": 3792 + }, + { + "epoch": 1.2666555351477709, + "grad_norm": 0.40544641816954785, + "learning_rate": 7.156778815039579e-06, + "loss": 0.1871, + "step": 3793 + }, + { + "epoch": 1.2669894807146436, + "grad_norm": 0.4331677677975272, + "learning_rate": 7.15502541200164e-06, + "loss": 0.2045, + "step": 3794 + }, + { + "epoch": 1.267323426281516, + "grad_norm": 0.4427416226804204, + "learning_rate": 7.153271683421738e-06, + "loss": 0.2012, + "step": 3795 + }, + { + "epoch": 1.2676573718483888, + "grad_norm": 0.43060810240200514, + "learning_rate": 7.151517629564795e-06, + "loss": 0.2067, + "step": 3796 + }, + { + "epoch": 1.2679913174152613, + "grad_norm": 0.4162223937732087, + "learning_rate": 7.14976325069578e-06, + "loss": 0.2001, + "step": 3797 + }, + { + "epoch": 1.268325262982134, + "grad_norm": 0.4283496532813782, + "learning_rate": 7.148008547079713e-06, + "loss": 0.1942, + "step": 3798 + }, + { + "epoch": 1.2686592085490065, + "grad_norm": 0.3919278069619941, + "learning_rate": 7.1462535189816636e-06, + "loss": 0.1978, + "step": 3799 + }, + { + "epoch": 1.2689931541158792, + "grad_norm": 0.4030159316869139, + "learning_rate": 7.14449816666675e-06, + "loss": 0.2024, + "step": 3800 + }, + { + "epoch": 1.2693270996827517, + "grad_norm": 0.4122621176678283, + "learning_rate": 7.142742490400135e-06, + "loss": 0.1939, + "step": 3801 + }, + { + "epoch": 1.2696610452496242, + "grad_norm": 0.4242214438614236, + "learning_rate": 7.140986490447039e-06, + "loss": 0.2014, + "step": 3802 + }, + { + "epoch": 1.269994990816497, + "grad_norm": 0.41850239250671534, + "learning_rate": 7.139230167072724e-06, + "loss": 0.2037, + "step": 3803 + }, + { + "epoch": 1.2703289363833696, + "grad_norm": 0.4148927143434032, + "learning_rate": 7.137473520542503e-06, + "loss": 0.1982, + "step": 3804 + }, + { + "epoch": 1.2706628819502421, + "grad_norm": 0.42901971140755585, + "learning_rate": 7.135716551121739e-06, + "loss": 0.1966, + "step": 3805 + }, + { + "epoch": 1.2709968275171146, + "grad_norm": 0.45317109667412, + "learning_rate": 7.133959259075844e-06, + "loss": 0.2161, + "step": 3806 + }, + { + "epoch": 1.2713307730839873, + "grad_norm": 0.38629809789963193, + "learning_rate": 7.132201644670274e-06, + "loss": 0.1946, + "step": 3807 + }, + { + "epoch": 1.2716647186508598, + "grad_norm": 0.43946380323346784, + "learning_rate": 7.13044370817054e-06, + "loss": 0.2185, + "step": 3808 + }, + { + "epoch": 1.2719986642177326, + "grad_norm": 0.4153870460305795, + "learning_rate": 7.128685449842201e-06, + "loss": 0.1829, + "step": 3809 + }, + { + "epoch": 1.272332609784605, + "grad_norm": 0.4070835430938735, + "learning_rate": 7.1269268699508574e-06, + "loss": 0.1914, + "step": 3810 + }, + { + "epoch": 1.2726665553514778, + "grad_norm": 0.5294790454069535, + "learning_rate": 7.1251679687621685e-06, + "loss": 0.197, + "step": 3811 + }, + { + "epoch": 1.2730005009183503, + "grad_norm": 0.43195916234280074, + "learning_rate": 7.123408746541835e-06, + "loss": 0.2067, + "step": 3812 + }, + { + "epoch": 1.273334446485223, + "grad_norm": 0.4223072878291587, + "learning_rate": 7.1216492035556075e-06, + "loss": 0.2024, + "step": 3813 + }, + { + "epoch": 1.2736683920520955, + "grad_norm": 0.43487357461786935, + "learning_rate": 7.119889340069286e-06, + "loss": 0.1966, + "step": 3814 + }, + { + "epoch": 1.2740023376189682, + "grad_norm": 0.4051402525802372, + "learning_rate": 7.1181291563487175e-06, + "loss": 0.1887, + "step": 3815 + }, + { + "epoch": 1.2743362831858407, + "grad_norm": 0.4456616551512561, + "learning_rate": 7.116368652659802e-06, + "loss": 0.2077, + "step": 3816 + }, + { + "epoch": 1.2746702287527132, + "grad_norm": 0.407498695448717, + "learning_rate": 7.114607829268481e-06, + "loss": 0.1967, + "step": 3817 + }, + { + "epoch": 1.275004174319586, + "grad_norm": 0.40906980972589485, + "learning_rate": 7.1128466864407486e-06, + "loss": 0.1986, + "step": 3818 + }, + { + "epoch": 1.2753381198864586, + "grad_norm": 0.38020921049437223, + "learning_rate": 7.111085224442647e-06, + "loss": 0.1864, + "step": 3819 + }, + { + "epoch": 1.2756720654533311, + "grad_norm": 0.41376376861187264, + "learning_rate": 7.109323443540263e-06, + "loss": 0.2022, + "step": 3820 + }, + { + "epoch": 1.2760060110202036, + "grad_norm": 0.4085034643758407, + "learning_rate": 7.107561343999739e-06, + "loss": 0.2002, + "step": 3821 + }, + { + "epoch": 1.2763399565870763, + "grad_norm": 0.4147881532976798, + "learning_rate": 7.105798926087257e-06, + "loss": 0.1929, + "step": 3822 + }, + { + "epoch": 1.2766739021539488, + "grad_norm": 0.4164403555278612, + "learning_rate": 7.104036190069052e-06, + "loss": 0.1979, + "step": 3823 + }, + { + "epoch": 1.2770078477208215, + "grad_norm": 0.388973867320647, + "learning_rate": 7.102273136211407e-06, + "loss": 0.1947, + "step": 3824 + }, + { + "epoch": 1.277341793287694, + "grad_norm": 0.43615613297218037, + "learning_rate": 7.10050976478065e-06, + "loss": 0.1904, + "step": 3825 + }, + { + "epoch": 1.2776757388545668, + "grad_norm": 0.4217476905256785, + "learning_rate": 7.098746076043162e-06, + "loss": 0.1976, + "step": 3826 + }, + { + "epoch": 1.2780096844214393, + "grad_norm": 0.4410477775249162, + "learning_rate": 7.096982070265366e-06, + "loss": 0.2111, + "step": 3827 + }, + { + "epoch": 1.278343629988312, + "grad_norm": 0.407553596713063, + "learning_rate": 7.0952177477137374e-06, + "loss": 0.1877, + "step": 3828 + }, + { + "epoch": 1.2786775755551845, + "grad_norm": 0.4226143283409012, + "learning_rate": 7.093453108654798e-06, + "loss": 0.1942, + "step": 3829 + }, + { + "epoch": 1.2790115211220572, + "grad_norm": 0.44448160702216344, + "learning_rate": 7.091688153355116e-06, + "loss": 0.2117, + "step": 3830 + }, + { + "epoch": 1.2793454666889297, + "grad_norm": 0.44023312970202644, + "learning_rate": 7.08992288208131e-06, + "loss": 0.2061, + "step": 3831 + }, + { + "epoch": 1.2796794122558022, + "grad_norm": 0.4289817202503459, + "learning_rate": 7.088157295100046e-06, + "loss": 0.1982, + "step": 3832 + }, + { + "epoch": 1.280013357822675, + "grad_norm": 0.41168121667242447, + "learning_rate": 7.0863913926780335e-06, + "loss": 0.1965, + "step": 3833 + }, + { + "epoch": 1.2803473033895476, + "grad_norm": 0.389066292024194, + "learning_rate": 7.084625175082036e-06, + "loss": 0.1969, + "step": 3834 + }, + { + "epoch": 1.28068124895642, + "grad_norm": 0.42211956082879903, + "learning_rate": 7.082858642578861e-06, + "loss": 0.1927, + "step": 3835 + }, + { + "epoch": 1.2810151945232926, + "grad_norm": 0.41487931684211865, + "learning_rate": 7.081091795435361e-06, + "loss": 0.2018, + "step": 3836 + }, + { + "epoch": 1.2813491400901653, + "grad_norm": 0.4355951362057153, + "learning_rate": 7.079324633918443e-06, + "loss": 0.187, + "step": 3837 + }, + { + "epoch": 1.281683085657038, + "grad_norm": 0.40177334082527777, + "learning_rate": 7.077557158295053e-06, + "loss": 0.1943, + "step": 3838 + }, + { + "epoch": 1.2820170312239105, + "grad_norm": 0.4688295048980153, + "learning_rate": 7.075789368832194e-06, + "loss": 0.2056, + "step": 3839 + }, + { + "epoch": 1.282350976790783, + "grad_norm": 0.45332907753482743, + "learning_rate": 7.074021265796909e-06, + "loss": 0.2151, + "step": 3840 + }, + { + "epoch": 1.2826849223576557, + "grad_norm": 0.4011388285723007, + "learning_rate": 7.072252849456291e-06, + "loss": 0.1948, + "step": 3841 + }, + { + "epoch": 1.2830188679245282, + "grad_norm": 0.4397419364652935, + "learning_rate": 7.07048412007748e-06, + "loss": 0.1973, + "step": 3842 + }, + { + "epoch": 1.283352813491401, + "grad_norm": 0.4191526878534424, + "learning_rate": 7.068715077927664e-06, + "loss": 0.1922, + "step": 3843 + }, + { + "epoch": 1.2836867590582735, + "grad_norm": 0.4622949980235757, + "learning_rate": 7.066945723274077e-06, + "loss": 0.1966, + "step": 3844 + }, + { + "epoch": 1.2840207046251462, + "grad_norm": 0.39544208070452724, + "learning_rate": 7.065176056383999e-06, + "loss": 0.1978, + "step": 3845 + }, + { + "epoch": 1.2843546501920187, + "grad_norm": 0.4669748844624701, + "learning_rate": 7.063406077524764e-06, + "loss": 0.2106, + "step": 3846 + }, + { + "epoch": 1.2846885957588914, + "grad_norm": 0.4045890789425728, + "learning_rate": 7.061635786963743e-06, + "loss": 0.1927, + "step": 3847 + }, + { + "epoch": 1.2850225413257639, + "grad_norm": 0.4297051183096818, + "learning_rate": 7.059865184968362e-06, + "loss": 0.1991, + "step": 3848 + }, + { + "epoch": 1.2853564868926366, + "grad_norm": 0.44503532628630693, + "learning_rate": 7.058094271806091e-06, + "loss": 0.2127, + "step": 3849 + }, + { + "epoch": 1.285690432459509, + "grad_norm": 0.38682645112211256, + "learning_rate": 7.056323047744447e-06, + "loss": 0.1753, + "step": 3850 + }, + { + "epoch": 1.2860243780263816, + "grad_norm": 0.44630819787955817, + "learning_rate": 7.054551513050993e-06, + "loss": 0.2156, + "step": 3851 + }, + { + "epoch": 1.2863583235932543, + "grad_norm": 0.4378283545797013, + "learning_rate": 7.052779667993342e-06, + "loss": 0.2057, + "step": 3852 + }, + { + "epoch": 1.286692269160127, + "grad_norm": 0.4304232637199404, + "learning_rate": 7.051007512839153e-06, + "loss": 0.2205, + "step": 3853 + }, + { + "epoch": 1.2870262147269995, + "grad_norm": 0.5159648005944082, + "learning_rate": 7.0492350478561275e-06, + "loss": 0.1995, + "step": 3854 + }, + { + "epoch": 1.287360160293872, + "grad_norm": 0.457847641860177, + "learning_rate": 7.04746227331202e-06, + "loss": 0.2067, + "step": 3855 + }, + { + "epoch": 1.2876941058607447, + "grad_norm": 0.41872580138211385, + "learning_rate": 7.045689189474628e-06, + "loss": 0.1934, + "step": 3856 + }, + { + "epoch": 1.2880280514276172, + "grad_norm": 0.4110936768730782, + "learning_rate": 7.0439157966117955e-06, + "loss": 0.1858, + "step": 3857 + }, + { + "epoch": 1.28836199699449, + "grad_norm": 0.38185640035698354, + "learning_rate": 7.042142094991418e-06, + "loss": 0.1816, + "step": 3858 + }, + { + "epoch": 1.2886959425613624, + "grad_norm": 0.3873522432517064, + "learning_rate": 7.04036808488143e-06, + "loss": 0.1964, + "step": 3859 + }, + { + "epoch": 1.2890298881282352, + "grad_norm": 0.4232734771641415, + "learning_rate": 7.038593766549817e-06, + "loss": 0.2104, + "step": 3860 + }, + { + "epoch": 1.2893638336951077, + "grad_norm": 0.4746183445203052, + "learning_rate": 7.0368191402646145e-06, + "loss": 0.2097, + "step": 3861 + }, + { + "epoch": 1.2896977792619804, + "grad_norm": 0.42621702835077613, + "learning_rate": 7.035044206293898e-06, + "loss": 0.2194, + "step": 3862 + }, + { + "epoch": 1.2900317248288529, + "grad_norm": 0.42742800253037144, + "learning_rate": 7.0332689649057905e-06, + "loss": 0.1985, + "step": 3863 + }, + { + "epoch": 1.2903656703957256, + "grad_norm": 0.4320688142367941, + "learning_rate": 7.031493416368466e-06, + "loss": 0.1984, + "step": 3864 + }, + { + "epoch": 1.290699615962598, + "grad_norm": 0.4611896460687471, + "learning_rate": 7.029717560950141e-06, + "loss": 0.2105, + "step": 3865 + }, + { + "epoch": 1.2910335615294706, + "grad_norm": 0.42167209030771285, + "learning_rate": 7.027941398919078e-06, + "loss": 0.2007, + "step": 3866 + }, + { + "epoch": 1.2913675070963433, + "grad_norm": 0.4486720053192747, + "learning_rate": 7.0261649305435895e-06, + "loss": 0.2107, + "step": 3867 + }, + { + "epoch": 1.291701452663216, + "grad_norm": 0.43703159019530075, + "learning_rate": 7.02438815609203e-06, + "loss": 0.2002, + "step": 3868 + }, + { + "epoch": 1.2920353982300885, + "grad_norm": 0.4206235426460402, + "learning_rate": 7.022611075832804e-06, + "loss": 0.1824, + "step": 3869 + }, + { + "epoch": 1.292369343796961, + "grad_norm": 0.45899661384875784, + "learning_rate": 7.02083369003436e-06, + "loss": 0.2091, + "step": 3870 + }, + { + "epoch": 1.2927032893638337, + "grad_norm": 0.4163150817565343, + "learning_rate": 7.019055998965191e-06, + "loss": 0.2037, + "step": 3871 + }, + { + "epoch": 1.2930372349307062, + "grad_norm": 0.48635183034398205, + "learning_rate": 7.017278002893841e-06, + "loss": 0.2072, + "step": 3872 + }, + { + "epoch": 1.293371180497579, + "grad_norm": 0.4210590900618153, + "learning_rate": 7.015499702088896e-06, + "loss": 0.1871, + "step": 3873 + }, + { + "epoch": 1.2937051260644514, + "grad_norm": 0.42679469662211317, + "learning_rate": 7.013721096818988e-06, + "loss": 0.2057, + "step": 3874 + }, + { + "epoch": 1.2940390716313241, + "grad_norm": 0.43109509020232506, + "learning_rate": 7.011942187352798e-06, + "loss": 0.1961, + "step": 3875 + }, + { + "epoch": 1.2943730171981966, + "grad_norm": 0.4261404803605642, + "learning_rate": 7.010162973959052e-06, + "loss": 0.1987, + "step": 3876 + }, + { + "epoch": 1.2947069627650694, + "grad_norm": 0.3984455245433112, + "learning_rate": 7.008383456906518e-06, + "loss": 0.1915, + "step": 3877 + }, + { + "epoch": 1.2950409083319419, + "grad_norm": 0.44213008748103527, + "learning_rate": 7.0066036364640165e-06, + "loss": 0.2063, + "step": 3878 + }, + { + "epoch": 1.2953748538988146, + "grad_norm": 0.4340151247449952, + "learning_rate": 7.004823512900408e-06, + "loss": 0.2061, + "step": 3879 + }, + { + "epoch": 1.295708799465687, + "grad_norm": 0.4353715360420167, + "learning_rate": 7.003043086484602e-06, + "loss": 0.2192, + "step": 3880 + }, + { + "epoch": 1.2960427450325596, + "grad_norm": 0.40389370415658, + "learning_rate": 7.001262357485553e-06, + "loss": 0.1915, + "step": 3881 + }, + { + "epoch": 1.2963766905994323, + "grad_norm": 0.4385006533052746, + "learning_rate": 6.99948132617226e-06, + "loss": 0.211, + "step": 3882 + }, + { + "epoch": 1.296710636166305, + "grad_norm": 0.42962689985450564, + "learning_rate": 6.99769999281377e-06, + "loss": 0.2004, + "step": 3883 + }, + { + "epoch": 1.2970445817331775, + "grad_norm": 0.3988243836403783, + "learning_rate": 6.9959183576791745e-06, + "loss": 0.1935, + "step": 3884 + }, + { + "epoch": 1.29737852730005, + "grad_norm": 0.46588350704208614, + "learning_rate": 6.9941364210376095e-06, + "loss": 0.2027, + "step": 3885 + }, + { + "epoch": 1.2977124728669227, + "grad_norm": 0.4423870351069402, + "learning_rate": 6.992354183158258e-06, + "loss": 0.2006, + "step": 3886 + }, + { + "epoch": 1.2980464184337954, + "grad_norm": 0.44696770311545875, + "learning_rate": 6.9905716443103475e-06, + "loss": 0.1977, + "step": 3887 + }, + { + "epoch": 1.298380364000668, + "grad_norm": 0.43550098856674935, + "learning_rate": 6.9887888047631525e-06, + "loss": 0.1935, + "step": 3888 + }, + { + "epoch": 1.2987143095675404, + "grad_norm": 0.45209412899280554, + "learning_rate": 6.987005664785991e-06, + "loss": 0.1968, + "step": 3889 + }, + { + "epoch": 1.2990482551344131, + "grad_norm": 0.44580123030250995, + "learning_rate": 6.985222224648227e-06, + "loss": 0.2001, + "step": 3890 + }, + { + "epoch": 1.2993822007012856, + "grad_norm": 0.45139711177023095, + "learning_rate": 6.983438484619272e-06, + "loss": 0.2037, + "step": 3891 + }, + { + "epoch": 1.2997161462681583, + "grad_norm": 0.4219811830085104, + "learning_rate": 6.981654444968578e-06, + "loss": 0.1982, + "step": 3892 + }, + { + "epoch": 1.3000500918350308, + "grad_norm": 0.44396824829876985, + "learning_rate": 6.979870105965648e-06, + "loss": 0.2021, + "step": 3893 + }, + { + "epoch": 1.3003840374019036, + "grad_norm": 0.41445057653912953, + "learning_rate": 6.978085467880027e-06, + "loss": 0.1894, + "step": 3894 + }, + { + "epoch": 1.300717982968776, + "grad_norm": 0.4947034290772902, + "learning_rate": 6.9763005309813025e-06, + "loss": 0.206, + "step": 3895 + }, + { + "epoch": 1.3010519285356488, + "grad_norm": 0.43818926332258024, + "learning_rate": 6.974515295539115e-06, + "loss": 0.2049, + "step": 3896 + }, + { + "epoch": 1.3013858741025213, + "grad_norm": 0.4498029142452314, + "learning_rate": 6.9727297618231416e-06, + "loss": 0.2033, + "step": 3897 + }, + { + "epoch": 1.301719819669394, + "grad_norm": 0.39991165215175273, + "learning_rate": 6.970943930103109e-06, + "loss": 0.2049, + "step": 3898 + }, + { + "epoch": 1.3020537652362665, + "grad_norm": 0.39753769966116426, + "learning_rate": 6.96915780064879e-06, + "loss": 0.1969, + "step": 3899 + }, + { + "epoch": 1.302387710803139, + "grad_norm": 0.4219028223395952, + "learning_rate": 6.96737137373e-06, + "loss": 0.2007, + "step": 3900 + }, + { + "epoch": 1.3027216563700117, + "grad_norm": 0.4405048022853998, + "learning_rate": 6.965584649616597e-06, + "loss": 0.2084, + "step": 3901 + }, + { + "epoch": 1.3030556019368844, + "grad_norm": 0.47780153655107577, + "learning_rate": 6.963797628578489e-06, + "loss": 0.189, + "step": 3902 + }, + { + "epoch": 1.303389547503757, + "grad_norm": 0.44621414800365433, + "learning_rate": 6.962010310885627e-06, + "loss": 0.201, + "step": 3903 + }, + { + "epoch": 1.3037234930706294, + "grad_norm": 0.4567887700056304, + "learning_rate": 6.960222696808004e-06, + "loss": 0.2142, + "step": 3904 + }, + { + "epoch": 1.3040574386375021, + "grad_norm": 0.4348089611652648, + "learning_rate": 6.958434786615663e-06, + "loss": 0.1969, + "step": 3905 + }, + { + "epoch": 1.3043913842043746, + "grad_norm": 0.4141676962452518, + "learning_rate": 6.956646580578687e-06, + "loss": 0.2024, + "step": 3906 + }, + { + "epoch": 1.3047253297712473, + "grad_norm": 0.5021452032817756, + "learning_rate": 6.954858078967207e-06, + "loss": 0.2167, + "step": 3907 + }, + { + "epoch": 1.3050592753381198, + "grad_norm": 0.42162857639320483, + "learning_rate": 6.953069282051397e-06, + "loss": 0.198, + "step": 3908 + }, + { + "epoch": 1.3053932209049925, + "grad_norm": 0.4012274684722401, + "learning_rate": 6.951280190101475e-06, + "loss": 0.1932, + "step": 3909 + }, + { + "epoch": 1.305727166471865, + "grad_norm": 0.4800296772397605, + "learning_rate": 6.949490803387704e-06, + "loss": 0.2146, + "step": 3910 + }, + { + "epoch": 1.3060611120387378, + "grad_norm": 0.430988613912435, + "learning_rate": 6.9477011221803935e-06, + "loss": 0.1926, + "step": 3911 + }, + { + "epoch": 1.3063950576056103, + "grad_norm": 0.5275440853144523, + "learning_rate": 6.945911146749894e-06, + "loss": 0.1949, + "step": 3912 + }, + { + "epoch": 1.306729003172483, + "grad_norm": 0.4382678861181215, + "learning_rate": 6.944120877366605e-06, + "loss": 0.2023, + "step": 3913 + }, + { + "epoch": 1.3070629487393555, + "grad_norm": 0.44380835569655847, + "learning_rate": 6.9423303143009644e-06, + "loss": 0.2155, + "step": 3914 + }, + { + "epoch": 1.307396894306228, + "grad_norm": 0.4166446811542404, + "learning_rate": 6.940539457823459e-06, + "loss": 0.2038, + "step": 3915 + }, + { + "epoch": 1.3077308398731007, + "grad_norm": 0.4268607935295313, + "learning_rate": 6.938748308204622e-06, + "loss": 0.2065, + "step": 3916 + }, + { + "epoch": 1.3080647854399734, + "grad_norm": 0.44848540251287616, + "learning_rate": 6.936956865715024e-06, + "loss": 0.1959, + "step": 3917 + }, + { + "epoch": 1.308398731006846, + "grad_norm": 0.4524449751352037, + "learning_rate": 6.9351651306252836e-06, + "loss": 0.2078, + "step": 3918 + }, + { + "epoch": 1.3087326765737184, + "grad_norm": 0.4111904238885648, + "learning_rate": 6.933373103206064e-06, + "loss": 0.1935, + "step": 3919 + }, + { + "epoch": 1.309066622140591, + "grad_norm": 0.49171063968711426, + "learning_rate": 6.931580783728075e-06, + "loss": 0.2076, + "step": 3920 + }, + { + "epoch": 1.3094005677074636, + "grad_norm": 0.42276347946771226, + "learning_rate": 6.929788172462063e-06, + "loss": 0.2004, + "step": 3921 + }, + { + "epoch": 1.3097345132743363, + "grad_norm": 0.45493648816395343, + "learning_rate": 6.927995269678826e-06, + "loss": 0.2091, + "step": 3922 + }, + { + "epoch": 1.3100684588412088, + "grad_norm": 0.4905831176410732, + "learning_rate": 6.926202075649202e-06, + "loss": 0.2073, + "step": 3923 + }, + { + "epoch": 1.3104024044080815, + "grad_norm": 0.4280595066960188, + "learning_rate": 6.924408590644073e-06, + "loss": 0.2102, + "step": 3924 + }, + { + "epoch": 1.310736349974954, + "grad_norm": 0.4519193608106046, + "learning_rate": 6.922614814934367e-06, + "loss": 0.2056, + "step": 3925 + }, + { + "epoch": 1.3110702955418267, + "grad_norm": 0.4404172492976198, + "learning_rate": 6.920820748791057e-06, + "loss": 0.1964, + "step": 3926 + }, + { + "epoch": 1.3114042411086992, + "grad_norm": 0.42868822460623446, + "learning_rate": 6.919026392485154e-06, + "loss": 0.1973, + "step": 3927 + }, + { + "epoch": 1.311738186675572, + "grad_norm": 0.43305908963203626, + "learning_rate": 6.91723174628772e-06, + "loss": 0.1928, + "step": 3928 + }, + { + "epoch": 1.3120721322424445, + "grad_norm": 0.4477015665448263, + "learning_rate": 6.915436810469856e-06, + "loss": 0.2061, + "step": 3929 + }, + { + "epoch": 1.312406077809317, + "grad_norm": 0.4061799452628013, + "learning_rate": 6.913641585302708e-06, + "loss": 0.1882, + "step": 3930 + }, + { + "epoch": 1.3127400233761897, + "grad_norm": 0.3957076000775846, + "learning_rate": 6.9118460710574665e-06, + "loss": 0.197, + "step": 3931 + }, + { + "epoch": 1.3130739689430624, + "grad_norm": 0.4529677675383033, + "learning_rate": 6.910050268005364e-06, + "loss": 0.1999, + "step": 3932 + }, + { + "epoch": 1.3134079145099349, + "grad_norm": 0.45692150570119494, + "learning_rate": 6.908254176417679e-06, + "loss": 0.1991, + "step": 3933 + }, + { + "epoch": 1.3137418600768074, + "grad_norm": 0.3908259281335957, + "learning_rate": 6.906457796565732e-06, + "loss": 0.1917, + "step": 3934 + }, + { + "epoch": 1.31407580564368, + "grad_norm": 0.44642695936390425, + "learning_rate": 6.904661128720887e-06, + "loss": 0.2102, + "step": 3935 + }, + { + "epoch": 1.3144097512105528, + "grad_norm": 0.4140422793919292, + "learning_rate": 6.902864173154551e-06, + "loss": 0.1954, + "step": 3936 + }, + { + "epoch": 1.3147436967774253, + "grad_norm": 0.4175994886561666, + "learning_rate": 6.9010669301381765e-06, + "loss": 0.197, + "step": 3937 + }, + { + "epoch": 1.3150776423442978, + "grad_norm": 0.43189349728431153, + "learning_rate": 6.899269399943258e-06, + "loss": 0.2212, + "step": 3938 + }, + { + "epoch": 1.3154115879111705, + "grad_norm": 0.4035715995235755, + "learning_rate": 6.897471582841333e-06, + "loss": 0.1968, + "step": 3939 + }, + { + "epoch": 1.315745533478043, + "grad_norm": 0.4664423426609285, + "learning_rate": 6.895673479103983e-06, + "loss": 0.2087, + "step": 3940 + }, + { + "epoch": 1.3160794790449157, + "grad_norm": 0.4664300811353399, + "learning_rate": 6.893875089002835e-06, + "loss": 0.2107, + "step": 3941 + }, + { + "epoch": 1.3164134246117882, + "grad_norm": 0.3890368808127858, + "learning_rate": 6.892076412809553e-06, + "loss": 0.1894, + "step": 3942 + }, + { + "epoch": 1.316747370178661, + "grad_norm": 0.4327141924488333, + "learning_rate": 6.890277450795851e-06, + "loss": 0.1968, + "step": 3943 + }, + { + "epoch": 1.3170813157455334, + "grad_norm": 0.42540004436257756, + "learning_rate": 6.888478203233484e-06, + "loss": 0.1837, + "step": 3944 + }, + { + "epoch": 1.3174152613124062, + "grad_norm": 0.40663174769799976, + "learning_rate": 6.886678670394247e-06, + "loss": 0.1884, + "step": 3945 + }, + { + "epoch": 1.3177492068792787, + "grad_norm": 0.4240067688960512, + "learning_rate": 6.884878852549982e-06, + "loss": 0.2086, + "step": 3946 + }, + { + "epoch": 1.3180831524461514, + "grad_norm": 0.46538495365988414, + "learning_rate": 6.883078749972573e-06, + "loss": 0.1947, + "step": 3947 + }, + { + "epoch": 1.3184170980130239, + "grad_norm": 0.39694236009597017, + "learning_rate": 6.881278362933947e-06, + "loss": 0.1926, + "step": 3948 + }, + { + "epoch": 1.3187510435798964, + "grad_norm": 0.4084027414378485, + "learning_rate": 6.879477691706071e-06, + "loss": 0.1912, + "step": 3949 + }, + { + "epoch": 1.319084989146769, + "grad_norm": 0.4483001951410644, + "learning_rate": 6.877676736560961e-06, + "loss": 0.2115, + "step": 3950 + }, + { + "epoch": 1.3194189347136418, + "grad_norm": 0.45057352996667105, + "learning_rate": 6.87587549777067e-06, + "loss": 0.2042, + "step": 3951 + }, + { + "epoch": 1.3197528802805143, + "grad_norm": 0.42211307538365817, + "learning_rate": 6.874073975607298e-06, + "loss": 0.2053, + "step": 3952 + }, + { + "epoch": 1.3200868258473868, + "grad_norm": 0.41256970624902645, + "learning_rate": 6.872272170342985e-06, + "loss": 0.2016, + "step": 3953 + }, + { + "epoch": 1.3204207714142595, + "grad_norm": 0.5930226936995437, + "learning_rate": 6.870470082249917e-06, + "loss": 0.2153, + "step": 3954 + }, + { + "epoch": 1.320754716981132, + "grad_norm": 0.4080686250723333, + "learning_rate": 6.868667711600318e-06, + "loss": 0.1957, + "step": 3955 + }, + { + "epoch": 1.3210886625480047, + "grad_norm": 0.4330981620220748, + "learning_rate": 6.866865058666459e-06, + "loss": 0.2001, + "step": 3956 + }, + { + "epoch": 1.3214226081148772, + "grad_norm": 0.41330007823441606, + "learning_rate": 6.86506212372065e-06, + "loss": 0.2064, + "step": 3957 + }, + { + "epoch": 1.32175655368175, + "grad_norm": 0.4314166600814895, + "learning_rate": 6.863258907035246e-06, + "loss": 0.2024, + "step": 3958 + }, + { + "epoch": 1.3220904992486224, + "grad_norm": 0.45440336445114765, + "learning_rate": 6.861455408882647e-06, + "loss": 0.2149, + "step": 3959 + }, + { + "epoch": 1.3224244448154951, + "grad_norm": 0.4156203304442072, + "learning_rate": 6.85965162953529e-06, + "loss": 0.1992, + "step": 3960 + }, + { + "epoch": 1.3227583903823676, + "grad_norm": 0.40586715685586255, + "learning_rate": 6.857847569265657e-06, + "loss": 0.1912, + "step": 3961 + }, + { + "epoch": 1.3230923359492404, + "grad_norm": 0.4198856216324367, + "learning_rate": 6.8560432283462745e-06, + "loss": 0.2046, + "step": 3962 + }, + { + "epoch": 1.3234262815161129, + "grad_norm": 0.4506385780580963, + "learning_rate": 6.854238607049707e-06, + "loss": 0.2107, + "step": 3963 + }, + { + "epoch": 1.3237602270829854, + "grad_norm": 0.4144978365544001, + "learning_rate": 6.852433705648566e-06, + "loss": 0.2053, + "step": 3964 + }, + { + "epoch": 1.324094172649858, + "grad_norm": 0.4206897818109025, + "learning_rate": 6.8506285244155e-06, + "loss": 0.195, + "step": 3965 + }, + { + "epoch": 1.3244281182167308, + "grad_norm": 0.4367717494660734, + "learning_rate": 6.848823063623207e-06, + "loss": 0.1964, + "step": 3966 + }, + { + "epoch": 1.3247620637836033, + "grad_norm": 0.42784053652612, + "learning_rate": 6.84701732354442e-06, + "loss": 0.2128, + "step": 3967 + }, + { + "epoch": 1.3250960093504758, + "grad_norm": 0.4303998202870131, + "learning_rate": 6.845211304451919e-06, + "loss": 0.1982, + "step": 3968 + }, + { + "epoch": 1.3254299549173485, + "grad_norm": 0.41585888294104695, + "learning_rate": 6.843405006618523e-06, + "loss": 0.1887, + "step": 3969 + }, + { + "epoch": 1.325763900484221, + "grad_norm": 0.3962726406636628, + "learning_rate": 6.841598430317096e-06, + "loss": 0.1932, + "step": 3970 + }, + { + "epoch": 1.3260978460510937, + "grad_norm": 0.41713419116526673, + "learning_rate": 6.839791575820541e-06, + "loss": 0.1938, + "step": 3971 + }, + { + "epoch": 1.3264317916179662, + "grad_norm": 0.3966219464596358, + "learning_rate": 6.837984443401807e-06, + "loss": 0.2006, + "step": 3972 + }, + { + "epoch": 1.326765737184839, + "grad_norm": 0.4215570099402985, + "learning_rate": 6.836177033333882e-06, + "loss": 0.2002, + "step": 3973 + }, + { + "epoch": 1.3270996827517114, + "grad_norm": 0.39620589259657973, + "learning_rate": 6.834369345889793e-06, + "loss": 0.1938, + "step": 3974 + }, + { + "epoch": 1.3274336283185841, + "grad_norm": 0.43563140461015154, + "learning_rate": 6.832561381342617e-06, + "loss": 0.2061, + "step": 3975 + }, + { + "epoch": 1.3277675738854566, + "grad_norm": 0.4041140648181498, + "learning_rate": 6.830753139965467e-06, + "loss": 0.1975, + "step": 3976 + }, + { + "epoch": 1.3281015194523293, + "grad_norm": 0.41885286235310515, + "learning_rate": 6.828944622031497e-06, + "loss": 0.2032, + "step": 3977 + }, + { + "epoch": 1.3284354650192018, + "grad_norm": 0.4670424648659507, + "learning_rate": 6.827135827813909e-06, + "loss": 0.2011, + "step": 3978 + }, + { + "epoch": 1.3287694105860743, + "grad_norm": 0.45994411686683434, + "learning_rate": 6.825326757585939e-06, + "loss": 0.2069, + "step": 3979 + }, + { + "epoch": 1.329103356152947, + "grad_norm": 0.4039507808169467, + "learning_rate": 6.823517411620871e-06, + "loss": 0.1889, + "step": 3980 + }, + { + "epoch": 1.3294373017198198, + "grad_norm": 0.40372411165182154, + "learning_rate": 6.821707790192025e-06, + "loss": 0.1921, + "step": 3981 + }, + { + "epoch": 1.3297712472866923, + "grad_norm": 0.44376795314218465, + "learning_rate": 6.819897893572769e-06, + "loss": 0.1962, + "step": 3982 + }, + { + "epoch": 1.3301051928535648, + "grad_norm": 0.39179303819655353, + "learning_rate": 6.818087722036507e-06, + "loss": 0.1891, + "step": 3983 + }, + { + "epoch": 1.3304391384204375, + "grad_norm": 0.44009288949714437, + "learning_rate": 6.8162772758566875e-06, + "loss": 0.2119, + "step": 3984 + }, + { + "epoch": 1.3307730839873102, + "grad_norm": 0.4444703006216181, + "learning_rate": 6.8144665553067975e-06, + "loss": 0.2195, + "step": 3985 + }, + { + "epoch": 1.3311070295541827, + "grad_norm": 0.4277024217685211, + "learning_rate": 6.812655560660373e-06, + "loss": 0.1933, + "step": 3986 + }, + { + "epoch": 1.3314409751210552, + "grad_norm": 0.43496112243374163, + "learning_rate": 6.810844292190982e-06, + "loss": 0.2074, + "step": 3987 + }, + { + "epoch": 1.331774920687928, + "grad_norm": 0.45009831755053864, + "learning_rate": 6.809032750172236e-06, + "loss": 0.2111, + "step": 3988 + }, + { + "epoch": 1.3321088662548004, + "grad_norm": 0.43335127708055415, + "learning_rate": 6.807220934877794e-06, + "loss": 0.2008, + "step": 3989 + }, + { + "epoch": 1.3324428118216731, + "grad_norm": 0.4212267467609862, + "learning_rate": 6.80540884658135e-06, + "loss": 0.1898, + "step": 3990 + }, + { + "epoch": 1.3327767573885456, + "grad_norm": 0.4700750950129188, + "learning_rate": 6.803596485556643e-06, + "loss": 0.2029, + "step": 3991 + }, + { + "epoch": 1.3331107029554183, + "grad_norm": 0.415428277497626, + "learning_rate": 6.8017838520774494e-06, + "loss": 0.1884, + "step": 3992 + }, + { + "epoch": 1.3334446485222908, + "grad_norm": 0.43456686746455836, + "learning_rate": 6.79997094641759e-06, + "loss": 0.2062, + "step": 3993 + }, + { + "epoch": 1.3337785940891635, + "grad_norm": 0.477054485573292, + "learning_rate": 6.798157768850924e-06, + "loss": 0.1951, + "step": 3994 + }, + { + "epoch": 1.334112539656036, + "grad_norm": 0.4216294974695778, + "learning_rate": 6.796344319651356e-06, + "loss": 0.2044, + "step": 3995 + }, + { + "epoch": 1.3344464852229088, + "grad_norm": 0.6541115831812444, + "learning_rate": 6.794530599092826e-06, + "loss": 0.2053, + "step": 3996 + }, + { + "epoch": 1.3347804307897813, + "grad_norm": 0.4641533174704176, + "learning_rate": 6.792716607449319e-06, + "loss": 0.2049, + "step": 3997 + }, + { + "epoch": 1.3351143763566538, + "grad_norm": 0.469313295525281, + "learning_rate": 6.790902344994861e-06, + "loss": 0.2025, + "step": 3998 + }, + { + "epoch": 1.3354483219235265, + "grad_norm": 0.4146298442288413, + "learning_rate": 6.789087812003516e-06, + "loss": 0.2056, + "step": 3999 + }, + { + "epoch": 1.3357822674903992, + "grad_norm": 0.4017430523249004, + "learning_rate": 6.787273008749391e-06, + "loss": 0.1972, + "step": 4000 + }, + { + "epoch": 1.3361162130572717, + "grad_norm": 0.4038683460405391, + "learning_rate": 6.785457935506634e-06, + "loss": 0.1851, + "step": 4001 + }, + { + "epoch": 1.3364501586241442, + "grad_norm": 0.4195513584233563, + "learning_rate": 6.783642592549433e-06, + "loss": 0.1981, + "step": 4002 + }, + { + "epoch": 1.336784104191017, + "grad_norm": 0.47255883489969897, + "learning_rate": 6.781826980152015e-06, + "loss": 0.1976, + "step": 4003 + }, + { + "epoch": 1.3371180497578894, + "grad_norm": 0.39785977899861913, + "learning_rate": 6.780011098588654e-06, + "loss": 0.1893, + "step": 4004 + }, + { + "epoch": 1.337451995324762, + "grad_norm": 0.4059364052664352, + "learning_rate": 6.778194948133656e-06, + "loss": 0.1934, + "step": 4005 + }, + { + "epoch": 1.3377859408916346, + "grad_norm": 0.38153108505480937, + "learning_rate": 6.776378529061374e-06, + "loss": 0.1838, + "step": 4006 + }, + { + "epoch": 1.3381198864585073, + "grad_norm": 0.4191943863581517, + "learning_rate": 6.774561841646199e-06, + "loss": 0.1969, + "step": 4007 + }, + { + "epoch": 1.3384538320253798, + "grad_norm": 0.43300160469021504, + "learning_rate": 6.772744886162563e-06, + "loss": 0.2016, + "step": 4008 + }, + { + "epoch": 1.3387877775922525, + "grad_norm": 0.4461504516470582, + "learning_rate": 6.770927662884937e-06, + "loss": 0.2038, + "step": 4009 + }, + { + "epoch": 1.339121723159125, + "grad_norm": 0.4468710082082139, + "learning_rate": 6.769110172087838e-06, + "loss": 0.2118, + "step": 4010 + }, + { + "epoch": 1.3394556687259978, + "grad_norm": 0.42922253720928116, + "learning_rate": 6.767292414045816e-06, + "loss": 0.1836, + "step": 4011 + }, + { + "epoch": 1.3397896142928702, + "grad_norm": 0.7166284001551869, + "learning_rate": 6.765474389033464e-06, + "loss": 0.1962, + "step": 4012 + }, + { + "epoch": 1.3401235598597427, + "grad_norm": 0.5407365657709906, + "learning_rate": 6.7636560973254195e-06, + "loss": 0.1867, + "step": 4013 + }, + { + "epoch": 1.3404575054266155, + "grad_norm": 0.4164038811937711, + "learning_rate": 6.761837539196355e-06, + "loss": 0.1919, + "step": 4014 + }, + { + "epoch": 1.3407914509934882, + "grad_norm": 0.42826189965553885, + "learning_rate": 6.760018714920985e-06, + "loss": 0.1958, + "step": 4015 + }, + { + "epoch": 1.3411253965603607, + "grad_norm": 0.434608355144472, + "learning_rate": 6.758199624774065e-06, + "loss": 0.1896, + "step": 4016 + }, + { + "epoch": 1.3414593421272332, + "grad_norm": 0.4068014354296992, + "learning_rate": 6.7563802690303895e-06, + "loss": 0.2008, + "step": 4017 + }, + { + "epoch": 1.3417932876941059, + "grad_norm": 0.4095906278662473, + "learning_rate": 6.7545606479647915e-06, + "loss": 0.1943, + "step": 4018 + }, + { + "epoch": 1.3421272332609784, + "grad_norm": 0.4169936089160469, + "learning_rate": 6.752740761852151e-06, + "loss": 0.1997, + "step": 4019 + }, + { + "epoch": 1.342461178827851, + "grad_norm": 0.4142264028021034, + "learning_rate": 6.7509206109673794e-06, + "loss": 0.1978, + "step": 4020 + }, + { + "epoch": 1.3427951243947236, + "grad_norm": 0.4059367415693497, + "learning_rate": 6.749100195585433e-06, + "loss": 0.1994, + "step": 4021 + }, + { + "epoch": 1.3431290699615963, + "grad_norm": 0.44463105396489705, + "learning_rate": 6.747279515981307e-06, + "loss": 0.2101, + "step": 4022 + }, + { + "epoch": 1.3434630155284688, + "grad_norm": 0.44157463328609936, + "learning_rate": 6.745458572430038e-06, + "loss": 0.1992, + "step": 4023 + }, + { + "epoch": 1.3437969610953415, + "grad_norm": 0.38829291132548344, + "learning_rate": 6.743637365206698e-06, + "loss": 0.199, + "step": 4024 + }, + { + "epoch": 1.344130906662214, + "grad_norm": 0.4474724309977797, + "learning_rate": 6.741815894586404e-06, + "loss": 0.2131, + "step": 4025 + }, + { + "epoch": 1.3444648522290867, + "grad_norm": 0.5159836340150592, + "learning_rate": 6.7399941608443096e-06, + "loss": 0.2019, + "step": 4026 + }, + { + "epoch": 1.3447987977959592, + "grad_norm": 0.4128853305002455, + "learning_rate": 6.7381721642556095e-06, + "loss": 0.184, + "step": 4027 + }, + { + "epoch": 1.3451327433628317, + "grad_norm": 0.47630481187131113, + "learning_rate": 6.736349905095538e-06, + "loss": 0.1956, + "step": 4028 + }, + { + "epoch": 1.3454666889297044, + "grad_norm": 0.43170867694495907, + "learning_rate": 6.734527383639369e-06, + "loss": 0.2004, + "step": 4029 + }, + { + "epoch": 1.3458006344965772, + "grad_norm": 0.44362403957101815, + "learning_rate": 6.732704600162414e-06, + "loss": 0.22, + "step": 4030 + }, + { + "epoch": 1.3461345800634497, + "grad_norm": 0.4137671915130916, + "learning_rate": 6.730881554940029e-06, + "loss": 0.202, + "step": 4031 + }, + { + "epoch": 1.3464685256303222, + "grad_norm": 0.43747702708097363, + "learning_rate": 6.729058248247602e-06, + "loss": 0.2066, + "step": 4032 + }, + { + "epoch": 1.3468024711971949, + "grad_norm": 0.4331403596331942, + "learning_rate": 6.727234680360569e-06, + "loss": 0.2067, + "step": 4033 + }, + { + "epoch": 1.3471364167640676, + "grad_norm": 0.3868967897202734, + "learning_rate": 6.725410851554401e-06, + "loss": 0.188, + "step": 4034 + }, + { + "epoch": 1.34747036233094, + "grad_norm": 0.5018421015205833, + "learning_rate": 6.7235867621046055e-06, + "loss": 0.1986, + "step": 4035 + }, + { + "epoch": 1.3478043078978126, + "grad_norm": 0.3939057183783324, + "learning_rate": 6.721762412286738e-06, + "loss": 0.1932, + "step": 4036 + }, + { + "epoch": 1.3481382534646853, + "grad_norm": 0.42393788403562216, + "learning_rate": 6.719937802376383e-06, + "loss": 0.1912, + "step": 4037 + }, + { + "epoch": 1.3484721990315578, + "grad_norm": 0.41288963328543327, + "learning_rate": 6.718112932649171e-06, + "loss": 0.1964, + "step": 4038 + }, + { + "epoch": 1.3488061445984305, + "grad_norm": 0.4732934923796236, + "learning_rate": 6.716287803380771e-06, + "loss": 0.2072, + "step": 4039 + }, + { + "epoch": 1.349140090165303, + "grad_norm": 0.4372044972174929, + "learning_rate": 6.714462414846891e-06, + "loss": 0.2041, + "step": 4040 + }, + { + "epoch": 1.3494740357321757, + "grad_norm": 0.4355557384762028, + "learning_rate": 6.712636767323273e-06, + "loss": 0.2052, + "step": 4041 + }, + { + "epoch": 1.3498079812990482, + "grad_norm": 0.4392920698842665, + "learning_rate": 6.710810861085708e-06, + "loss": 0.2097, + "step": 4042 + }, + { + "epoch": 1.3501419268659207, + "grad_norm": 0.406740095621287, + "learning_rate": 6.708984696410018e-06, + "loss": 0.1893, + "step": 4043 + }, + { + "epoch": 1.3504758724327934, + "grad_norm": 0.373673142911883, + "learning_rate": 6.707158273572066e-06, + "loss": 0.1867, + "step": 4044 + }, + { + "epoch": 1.3508098179996662, + "grad_norm": 0.40677087019915287, + "learning_rate": 6.7053315928477566e-06, + "loss": 0.1944, + "step": 4045 + }, + { + "epoch": 1.3511437635665386, + "grad_norm": 0.4062089141523285, + "learning_rate": 6.703504654513031e-06, + "loss": 0.1898, + "step": 4046 + }, + { + "epoch": 1.3514777091334111, + "grad_norm": 0.41024823502203445, + "learning_rate": 6.701677458843868e-06, + "loss": 0.1926, + "step": 4047 + }, + { + "epoch": 1.3518116547002839, + "grad_norm": 0.426250767229244, + "learning_rate": 6.6998500061162884e-06, + "loss": 0.2071, + "step": 4048 + }, + { + "epoch": 1.3521456002671566, + "grad_norm": 0.4039386500414676, + "learning_rate": 6.6980222966063516e-06, + "loss": 0.1885, + "step": 4049 + }, + { + "epoch": 1.352479545834029, + "grad_norm": 0.42655519261390634, + "learning_rate": 6.6961943305901515e-06, + "loss": 0.1971, + "step": 4050 + }, + { + "epoch": 1.3528134914009016, + "grad_norm": 0.5067100050466419, + "learning_rate": 6.694366108343827e-06, + "loss": 0.2232, + "step": 4051 + }, + { + "epoch": 1.3531474369677743, + "grad_norm": 0.48278513038763404, + "learning_rate": 6.692537630143551e-06, + "loss": 0.1997, + "step": 4052 + }, + { + "epoch": 1.3534813825346468, + "grad_norm": 0.4507838778276989, + "learning_rate": 6.6907088962655375e-06, + "loss": 0.1962, + "step": 4053 + }, + { + "epoch": 1.3538153281015195, + "grad_norm": 0.4551173960396972, + "learning_rate": 6.688879906986036e-06, + "loss": 0.2083, + "step": 4054 + }, + { + "epoch": 1.354149273668392, + "grad_norm": 0.4569334112005147, + "learning_rate": 6.687050662581341e-06, + "loss": 0.2214, + "step": 4055 + }, + { + "epoch": 1.3544832192352647, + "grad_norm": 0.4806277133503981, + "learning_rate": 6.685221163327778e-06, + "loss": 0.1979, + "step": 4056 + }, + { + "epoch": 1.3548171648021372, + "grad_norm": 0.4563806677226338, + "learning_rate": 6.683391409501715e-06, + "loss": 0.2132, + "step": 4057 + }, + { + "epoch": 1.35515111036901, + "grad_norm": 0.5042972618637712, + "learning_rate": 6.6815614013795595e-06, + "loss": 0.2192, + "step": 4058 + }, + { + "epoch": 1.3554850559358824, + "grad_norm": 0.44352802619886555, + "learning_rate": 6.679731139237753e-06, + "loss": 0.2, + "step": 4059 + }, + { + "epoch": 1.3558190015027551, + "grad_norm": 0.4350886660962178, + "learning_rate": 6.67790062335278e-06, + "loss": 0.1952, + "step": 4060 + }, + { + "epoch": 1.3561529470696276, + "grad_norm": 0.40422153509801434, + "learning_rate": 6.676069854001162e-06, + "loss": 0.195, + "step": 4061 + }, + { + "epoch": 1.3564868926365001, + "grad_norm": 0.41453093337842917, + "learning_rate": 6.674238831459456e-06, + "loss": 0.2004, + "step": 4062 + }, + { + "epoch": 1.3568208382033728, + "grad_norm": 0.43625037543673406, + "learning_rate": 6.672407556004262e-06, + "loss": 0.2025, + "step": 4063 + }, + { + "epoch": 1.3571547837702456, + "grad_norm": 0.4481372504164815, + "learning_rate": 6.670576027912215e-06, + "loss": 0.1952, + "step": 4064 + }, + { + "epoch": 1.357488729337118, + "grad_norm": 0.4335407448052665, + "learning_rate": 6.668744247459988e-06, + "loss": 0.1961, + "step": 4065 + }, + { + "epoch": 1.3578226749039906, + "grad_norm": 0.44753482938235967, + "learning_rate": 6.666912214924295e-06, + "loss": 0.2144, + "step": 4066 + }, + { + "epoch": 1.3581566204708633, + "grad_norm": 0.3991995293082414, + "learning_rate": 6.665079930581883e-06, + "loss": 0.1853, + "step": 4067 + }, + { + "epoch": 1.3584905660377358, + "grad_norm": 0.42486029468962033, + "learning_rate": 6.663247394709542e-06, + "loss": 0.1962, + "step": 4068 + }, + { + "epoch": 1.3588245116046085, + "grad_norm": 0.4266054882931973, + "learning_rate": 6.661414607584099e-06, + "loss": 0.1975, + "step": 4069 + }, + { + "epoch": 1.359158457171481, + "grad_norm": 0.432581270411452, + "learning_rate": 6.659581569482415e-06, + "loss": 0.2044, + "step": 4070 + }, + { + "epoch": 1.3594924027383537, + "grad_norm": 0.40150080646226216, + "learning_rate": 6.657748280681395e-06, + "loss": 0.2039, + "step": 4071 + }, + { + "epoch": 1.3598263483052262, + "grad_norm": 0.4225694559806857, + "learning_rate": 6.65591474145798e-06, + "loss": 0.204, + "step": 4072 + }, + { + "epoch": 1.360160293872099, + "grad_norm": 0.45371512908538403, + "learning_rate": 6.6540809520891425e-06, + "loss": 0.1996, + "step": 4073 + }, + { + "epoch": 1.3604942394389714, + "grad_norm": 0.4263029427537738, + "learning_rate": 6.652246912851903e-06, + "loss": 0.1985, + "step": 4074 + }, + { + "epoch": 1.3608281850058441, + "grad_norm": 0.4403693400801846, + "learning_rate": 6.650412624023311e-06, + "loss": 0.2079, + "step": 4075 + }, + { + "epoch": 1.3611621305727166, + "grad_norm": 0.4672726128852996, + "learning_rate": 6.648578085880461e-06, + "loss": 0.2114, + "step": 4076 + }, + { + "epoch": 1.3614960761395891, + "grad_norm": 0.42076978881075816, + "learning_rate": 6.64674329870048e-06, + "loss": 0.1915, + "step": 4077 + }, + { + "epoch": 1.3618300217064618, + "grad_norm": 0.4074124100582979, + "learning_rate": 6.644908262760531e-06, + "loss": 0.196, + "step": 4078 + }, + { + "epoch": 1.3621639672733346, + "grad_norm": 0.43709488826547427, + "learning_rate": 6.643072978337823e-06, + "loss": 0.2024, + "step": 4079 + }, + { + "epoch": 1.362497912840207, + "grad_norm": 0.4410887006441817, + "learning_rate": 6.641237445709595e-06, + "loss": 0.2083, + "step": 4080 + }, + { + "epoch": 1.3628318584070795, + "grad_norm": 0.41075833370114767, + "learning_rate": 6.639401665153126e-06, + "loss": 0.1874, + "step": 4081 + }, + { + "epoch": 1.3631658039739523, + "grad_norm": 0.4314059293595324, + "learning_rate": 6.637565636945731e-06, + "loss": 0.1969, + "step": 4082 + }, + { + "epoch": 1.363499749540825, + "grad_norm": 0.3962845145124328, + "learning_rate": 6.635729361364765e-06, + "loss": 0.182, + "step": 4083 + }, + { + "epoch": 1.3638336951076975, + "grad_norm": 0.45085289607224943, + "learning_rate": 6.633892838687621e-06, + "loss": 0.2153, + "step": 4084 + }, + { + "epoch": 1.36416764067457, + "grad_norm": 0.442502443264247, + "learning_rate": 6.632056069191723e-06, + "loss": 0.203, + "step": 4085 + }, + { + "epoch": 1.3645015862414427, + "grad_norm": 0.4331715292207473, + "learning_rate": 6.6302190531545395e-06, + "loss": 0.1989, + "step": 4086 + }, + { + "epoch": 1.3648355318083152, + "grad_norm": 0.4526521135216819, + "learning_rate": 6.628381790853573e-06, + "loss": 0.2094, + "step": 4087 + }, + { + "epoch": 1.365169477375188, + "grad_norm": 0.40449183201031125, + "learning_rate": 6.626544282566363e-06, + "loss": 0.1949, + "step": 4088 + }, + { + "epoch": 1.3655034229420604, + "grad_norm": 0.40387227018258237, + "learning_rate": 6.624706528570487e-06, + "loss": 0.1988, + "step": 4089 + }, + { + "epoch": 1.3658373685089331, + "grad_norm": 0.5711678934538252, + "learning_rate": 6.6228685291435605e-06, + "loss": 0.2188, + "step": 4090 + }, + { + "epoch": 1.3661713140758056, + "grad_norm": 0.4421358421448025, + "learning_rate": 6.621030284563232e-06, + "loss": 0.1824, + "step": 4091 + }, + { + "epoch": 1.366505259642678, + "grad_norm": 0.4559202259210313, + "learning_rate": 6.619191795107195e-06, + "loss": 0.217, + "step": 4092 + }, + { + "epoch": 1.3668392052095508, + "grad_norm": 0.4244083311195419, + "learning_rate": 6.617353061053171e-06, + "loss": 0.2009, + "step": 4093 + }, + { + "epoch": 1.3671731507764235, + "grad_norm": 0.4173771853428039, + "learning_rate": 6.615514082678922e-06, + "loss": 0.1872, + "step": 4094 + }, + { + "epoch": 1.367507096343296, + "grad_norm": 0.4377390377861039, + "learning_rate": 6.613674860262249e-06, + "loss": 0.2141, + "step": 4095 + }, + { + "epoch": 1.3678410419101685, + "grad_norm": 0.406886823763542, + "learning_rate": 6.61183539408099e-06, + "loss": 0.1906, + "step": 4096 + }, + { + "epoch": 1.3681749874770412, + "grad_norm": 0.42851766250941675, + "learning_rate": 6.609995684413013e-06, + "loss": 0.1965, + "step": 4097 + }, + { + "epoch": 1.368508933043914, + "grad_norm": 0.622902793386304, + "learning_rate": 6.608155731536233e-06, + "loss": 0.1968, + "step": 4098 + }, + { + "epoch": 1.3688428786107865, + "grad_norm": 0.3853625281666951, + "learning_rate": 6.606315535728594e-06, + "loss": 0.1816, + "step": 4099 + }, + { + "epoch": 1.369176824177659, + "grad_norm": 0.4666928758678988, + "learning_rate": 6.604475097268079e-06, + "loss": 0.202, + "step": 4100 + }, + { + "epoch": 1.3695107697445317, + "grad_norm": 0.3771661413941458, + "learning_rate": 6.602634416432708e-06, + "loss": 0.1874, + "step": 4101 + }, + { + "epoch": 1.3698447153114042, + "grad_norm": 0.4610010980819958, + "learning_rate": 6.600793493500539e-06, + "loss": 0.2075, + "step": 4102 + }, + { + "epoch": 1.3701786608782769, + "grad_norm": 0.41246814662170656, + "learning_rate": 6.5989523287496645e-06, + "loss": 0.1907, + "step": 4103 + }, + { + "epoch": 1.3705126064451494, + "grad_norm": 0.44901774277549816, + "learning_rate": 6.597110922458214e-06, + "loss": 0.1955, + "step": 4104 + }, + { + "epoch": 1.370846552012022, + "grad_norm": 0.3989788461433798, + "learning_rate": 6.595269274904351e-06, + "loss": 0.1971, + "step": 4105 + }, + { + "epoch": 1.3711804975788946, + "grad_norm": 0.4601759741943717, + "learning_rate": 6.593427386366282e-06, + "loss": 0.2044, + "step": 4106 + }, + { + "epoch": 1.3715144431457673, + "grad_norm": 0.4075278252757786, + "learning_rate": 6.591585257122244e-06, + "loss": 0.1929, + "step": 4107 + }, + { + "epoch": 1.3718483887126398, + "grad_norm": 0.45268827886168284, + "learning_rate": 6.589742887450512e-06, + "loss": 0.2012, + "step": 4108 + }, + { + "epoch": 1.3721823342795125, + "grad_norm": 0.39829538551301213, + "learning_rate": 6.5879002776294e-06, + "loss": 0.1934, + "step": 4109 + }, + { + "epoch": 1.372516279846385, + "grad_norm": 0.4075145132486534, + "learning_rate": 6.586057427937252e-06, + "loss": 0.2013, + "step": 4110 + }, + { + "epoch": 1.3728502254132575, + "grad_norm": 0.413224264908335, + "learning_rate": 6.584214338652455e-06, + "loss": 0.1885, + "step": 4111 + }, + { + "epoch": 1.3731841709801302, + "grad_norm": 0.42233907535924026, + "learning_rate": 6.582371010053429e-06, + "loss": 0.2007, + "step": 4112 + }, + { + "epoch": 1.373518116547003, + "grad_norm": 0.4689784474170459, + "learning_rate": 6.58052744241863e-06, + "loss": 0.2026, + "step": 4113 + }, + { + "epoch": 1.3738520621138754, + "grad_norm": 0.42477561232876815, + "learning_rate": 6.578683636026551e-06, + "loss": 0.203, + "step": 4114 + }, + { + "epoch": 1.374186007680748, + "grad_norm": 0.4309943871533634, + "learning_rate": 6.576839591155719e-06, + "loss": 0.1938, + "step": 4115 + }, + { + "epoch": 1.3745199532476207, + "grad_norm": 0.5615884354542333, + "learning_rate": 6.574995308084702e-06, + "loss": 0.1806, + "step": 4116 + }, + { + "epoch": 1.3748538988144932, + "grad_norm": 0.3813788802372199, + "learning_rate": 6.573150787092097e-06, + "loss": 0.1733, + "step": 4117 + }, + { + "epoch": 1.3751878443813659, + "grad_norm": 0.42148123715265845, + "learning_rate": 6.5713060284565435e-06, + "loss": 0.1896, + "step": 4118 + }, + { + "epoch": 1.3755217899482384, + "grad_norm": 0.431883451577582, + "learning_rate": 6.569461032456713e-06, + "loss": 0.2186, + "step": 4119 + }, + { + "epoch": 1.375855735515111, + "grad_norm": 0.4066532875757682, + "learning_rate": 6.567615799371313e-06, + "loss": 0.1902, + "step": 4120 + }, + { + "epoch": 1.3761896810819836, + "grad_norm": 0.4370833319403715, + "learning_rate": 6.565770329479089e-06, + "loss": 0.2035, + "step": 4121 + }, + { + "epoch": 1.3765236266488563, + "grad_norm": 0.39098258449611045, + "learning_rate": 6.5639246230588205e-06, + "loss": 0.1918, + "step": 4122 + }, + { + "epoch": 1.3768575722157288, + "grad_norm": 0.39463295296495227, + "learning_rate": 6.562078680389323e-06, + "loss": 0.1874, + "step": 4123 + }, + { + "epoch": 1.3771915177826015, + "grad_norm": 0.7838749092433236, + "learning_rate": 6.560232501749446e-06, + "loss": 0.2093, + "step": 4124 + }, + { + "epoch": 1.377525463349474, + "grad_norm": 0.45124048458861776, + "learning_rate": 6.558386087418082e-06, + "loss": 0.2058, + "step": 4125 + }, + { + "epoch": 1.3778594089163465, + "grad_norm": 0.44508050673501187, + "learning_rate": 6.556539437674147e-06, + "loss": 0.2119, + "step": 4126 + }, + { + "epoch": 1.3781933544832192, + "grad_norm": 0.4532245305883286, + "learning_rate": 6.554692552796604e-06, + "loss": 0.1988, + "step": 4127 + }, + { + "epoch": 1.378527300050092, + "grad_norm": 0.4366039970271216, + "learning_rate": 6.552845433064445e-06, + "loss": 0.2016, + "step": 4128 + }, + { + "epoch": 1.3788612456169644, + "grad_norm": 0.43991279957977425, + "learning_rate": 6.550998078756698e-06, + "loss": 0.2125, + "step": 4129 + }, + { + "epoch": 1.379195191183837, + "grad_norm": 0.45075051528764204, + "learning_rate": 6.549150490152429e-06, + "loss": 0.2124, + "step": 4130 + }, + { + "epoch": 1.3795291367507097, + "grad_norm": 0.4697620488513748, + "learning_rate": 6.5473026675307394e-06, + "loss": 0.214, + "step": 4131 + }, + { + "epoch": 1.3798630823175824, + "grad_norm": 0.4650812525046603, + "learning_rate": 6.545454611170762e-06, + "loss": 0.2046, + "step": 4132 + }, + { + "epoch": 1.3801970278844549, + "grad_norm": 0.4473205624182132, + "learning_rate": 6.543606321351668e-06, + "loss": 0.1995, + "step": 4133 + }, + { + "epoch": 1.3805309734513274, + "grad_norm": 0.43793988430119773, + "learning_rate": 6.541757798352664e-06, + "loss": 0.1964, + "step": 4134 + }, + { + "epoch": 1.3808649190182, + "grad_norm": 0.46122603750033925, + "learning_rate": 6.539909042452991e-06, + "loss": 0.2021, + "step": 4135 + }, + { + "epoch": 1.3811988645850726, + "grad_norm": 0.4431099270741144, + "learning_rate": 6.538060053931925e-06, + "loss": 0.2151, + "step": 4136 + }, + { + "epoch": 1.3815328101519453, + "grad_norm": 0.44155698593464704, + "learning_rate": 6.536210833068779e-06, + "loss": 0.1947, + "step": 4137 + }, + { + "epoch": 1.3818667557188178, + "grad_norm": 0.4109387527355864, + "learning_rate": 6.534361380142896e-06, + "loss": 0.1998, + "step": 4138 + }, + { + "epoch": 1.3822007012856905, + "grad_norm": 0.4603948218667672, + "learning_rate": 6.532511695433662e-06, + "loss": 0.1966, + "step": 4139 + }, + { + "epoch": 1.382534646852563, + "grad_norm": 0.44282145799352707, + "learning_rate": 6.5306617792204915e-06, + "loss": 0.1896, + "step": 4140 + }, + { + "epoch": 1.3828685924194355, + "grad_norm": 0.44104795387567414, + "learning_rate": 6.528811631782835e-06, + "loss": 0.2068, + "step": 4141 + }, + { + "epoch": 1.3832025379863082, + "grad_norm": 0.48858635612357, + "learning_rate": 6.526961253400181e-06, + "loss": 0.2144, + "step": 4142 + }, + { + "epoch": 1.383536483553181, + "grad_norm": 0.44312917050300416, + "learning_rate": 6.525110644352052e-06, + "loss": 0.2069, + "step": 4143 + }, + { + "epoch": 1.3838704291200534, + "grad_norm": 0.3981097773309502, + "learning_rate": 6.523259804918001e-06, + "loss": 0.1897, + "step": 4144 + }, + { + "epoch": 1.384204374686926, + "grad_norm": 0.4204274275796497, + "learning_rate": 6.52140873537762e-06, + "loss": 0.1838, + "step": 4145 + }, + { + "epoch": 1.3845383202537986, + "grad_norm": 0.46326487504021546, + "learning_rate": 6.519557436010535e-06, + "loss": 0.2145, + "step": 4146 + }, + { + "epoch": 1.3848722658206714, + "grad_norm": 0.4722683700335305, + "learning_rate": 6.51770590709641e-06, + "loss": 0.2081, + "step": 4147 + }, + { + "epoch": 1.3852062113875439, + "grad_norm": 0.4311480901777335, + "learning_rate": 6.515854148914935e-06, + "loss": 0.2024, + "step": 4148 + }, + { + "epoch": 1.3855401569544163, + "grad_norm": 0.46985472062528916, + "learning_rate": 6.514002161745844e-06, + "loss": 0.2183, + "step": 4149 + }, + { + "epoch": 1.385874102521289, + "grad_norm": 0.4803730015949404, + "learning_rate": 6.512149945868898e-06, + "loss": 0.2193, + "step": 4150 + }, + { + "epoch": 1.3862080480881616, + "grad_norm": 0.4597435238205787, + "learning_rate": 6.510297501563899e-06, + "loss": 0.2072, + "step": 4151 + }, + { + "epoch": 1.3865419936550343, + "grad_norm": 0.43218955473225323, + "learning_rate": 6.5084448291106785e-06, + "loss": 0.2076, + "step": 4152 + }, + { + "epoch": 1.3868759392219068, + "grad_norm": 0.4150408431997899, + "learning_rate": 6.506591928789105e-06, + "loss": 0.1936, + "step": 4153 + }, + { + "epoch": 1.3872098847887795, + "grad_norm": 0.4312232452910037, + "learning_rate": 6.504738800879081e-06, + "loss": 0.1933, + "step": 4154 + }, + { + "epoch": 1.387543830355652, + "grad_norm": 0.4064866596866989, + "learning_rate": 6.502885445660544e-06, + "loss": 0.184, + "step": 4155 + }, + { + "epoch": 1.3878777759225247, + "grad_norm": 0.3829518324598277, + "learning_rate": 6.501031863413464e-06, + "loss": 0.1746, + "step": 4156 + }, + { + "epoch": 1.3882117214893972, + "grad_norm": 0.45492095092003937, + "learning_rate": 6.499178054417847e-06, + "loss": 0.2152, + "step": 4157 + }, + { + "epoch": 1.38854566705627, + "grad_norm": 0.4457187903715593, + "learning_rate": 6.497324018953732e-06, + "loss": 0.2061, + "step": 4158 + }, + { + "epoch": 1.3888796126231424, + "grad_norm": 0.4466194927024366, + "learning_rate": 6.495469757301196e-06, + "loss": 0.1999, + "step": 4159 + }, + { + "epoch": 1.389213558190015, + "grad_norm": 0.4355326082302892, + "learning_rate": 6.493615269740343e-06, + "loss": 0.2141, + "step": 4160 + }, + { + "epoch": 1.3895475037568876, + "grad_norm": 0.3940978415054905, + "learning_rate": 6.491760556551315e-06, + "loss": 0.188, + "step": 4161 + }, + { + "epoch": 1.3898814493237603, + "grad_norm": 0.4292342258128804, + "learning_rate": 6.489905618014293e-06, + "loss": 0.2138, + "step": 4162 + }, + { + "epoch": 1.3902153948906328, + "grad_norm": 0.43189024487894684, + "learning_rate": 6.488050454409483e-06, + "loss": 0.1972, + "step": 4163 + }, + { + "epoch": 1.3905493404575053, + "grad_norm": 0.4053167522949468, + "learning_rate": 6.486195066017129e-06, + "loss": 0.1958, + "step": 4164 + }, + { + "epoch": 1.390883286024378, + "grad_norm": 0.45743707193697686, + "learning_rate": 6.484339453117514e-06, + "loss": 0.2064, + "step": 4165 + }, + { + "epoch": 1.3912172315912505, + "grad_norm": 0.3727514433378026, + "learning_rate": 6.482483615990945e-06, + "loss": 0.1789, + "step": 4166 + }, + { + "epoch": 1.3915511771581233, + "grad_norm": 0.38894137550668284, + "learning_rate": 6.480627554917771e-06, + "loss": 0.1897, + "step": 4167 + }, + { + "epoch": 1.3918851227249958, + "grad_norm": 0.3949005314394, + "learning_rate": 6.47877127017837e-06, + "loss": 0.184, + "step": 4168 + }, + { + "epoch": 1.3922190682918685, + "grad_norm": 0.5483421957144745, + "learning_rate": 6.476914762053158e-06, + "loss": 0.2248, + "step": 4169 + }, + { + "epoch": 1.392553013858741, + "grad_norm": 0.4349485020907497, + "learning_rate": 6.47505803082258e-06, + "loss": 0.2087, + "step": 4170 + }, + { + "epoch": 1.3928869594256137, + "grad_norm": 0.494753044097071, + "learning_rate": 6.473201076767119e-06, + "loss": 0.2029, + "step": 4171 + }, + { + "epoch": 1.3932209049924862, + "grad_norm": 0.402889777781755, + "learning_rate": 6.471343900167289e-06, + "loss": 0.1958, + "step": 4172 + }, + { + "epoch": 1.393554850559359, + "grad_norm": 0.42334287046577884, + "learning_rate": 6.469486501303639e-06, + "loss": 0.2183, + "step": 4173 + }, + { + "epoch": 1.3938887961262314, + "grad_norm": 0.3947978066614757, + "learning_rate": 6.467628880456749e-06, + "loss": 0.1865, + "step": 4174 + }, + { + "epoch": 1.394222741693104, + "grad_norm": 0.47629578193143574, + "learning_rate": 6.465771037907236e-06, + "loss": 0.2095, + "step": 4175 + }, + { + "epoch": 1.3945566872599766, + "grad_norm": 0.3880989484453497, + "learning_rate": 6.463912973935749e-06, + "loss": 0.1855, + "step": 4176 + }, + { + "epoch": 1.3948906328268493, + "grad_norm": 0.47037211387441913, + "learning_rate": 6.462054688822971e-06, + "loss": 0.2078, + "step": 4177 + }, + { + "epoch": 1.3952245783937218, + "grad_norm": 0.382314312114383, + "learning_rate": 6.460196182849616e-06, + "loss": 0.1982, + "step": 4178 + }, + { + "epoch": 1.3955585239605943, + "grad_norm": 0.41109789004350694, + "learning_rate": 6.458337456296434e-06, + "loss": 0.2024, + "step": 4179 + }, + { + "epoch": 1.395892469527467, + "grad_norm": 0.4899142261094805, + "learning_rate": 6.456478509444209e-06, + "loss": 0.1993, + "step": 4180 + }, + { + "epoch": 1.3962264150943398, + "grad_norm": 0.42665421483230975, + "learning_rate": 6.454619342573756e-06, + "loss": 0.2041, + "step": 4181 + }, + { + "epoch": 1.3965603606612123, + "grad_norm": 0.41216556645235136, + "learning_rate": 6.452759955965922e-06, + "loss": 0.1884, + "step": 4182 + }, + { + "epoch": 1.3968943062280847, + "grad_norm": 0.4322046656673873, + "learning_rate": 6.450900349901592e-06, + "loss": 0.2001, + "step": 4183 + }, + { + "epoch": 1.3972282517949575, + "grad_norm": 0.4285030575773755, + "learning_rate": 6.449040524661681e-06, + "loss": 0.1906, + "step": 4184 + }, + { + "epoch": 1.39756219736183, + "grad_norm": 0.43465596091020026, + "learning_rate": 6.447180480527135e-06, + "loss": 0.2021, + "step": 4185 + }, + { + "epoch": 1.3978961429287027, + "grad_norm": 0.41971935181118564, + "learning_rate": 6.445320217778939e-06, + "loss": 0.1957, + "step": 4186 + }, + { + "epoch": 1.3982300884955752, + "grad_norm": 0.4583440890267203, + "learning_rate": 6.443459736698106e-06, + "loss": 0.2051, + "step": 4187 + }, + { + "epoch": 1.398564034062448, + "grad_norm": 0.4406778185384973, + "learning_rate": 6.4415990375656826e-06, + "loss": 0.2003, + "step": 4188 + }, + { + "epoch": 1.3988979796293204, + "grad_norm": 0.5268157377750877, + "learning_rate": 6.4397381206627505e-06, + "loss": 0.2256, + "step": 4189 + }, + { + "epoch": 1.3992319251961929, + "grad_norm": 0.40175298789569275, + "learning_rate": 6.437876986270424e-06, + "loss": 0.1926, + "step": 4190 + }, + { + "epoch": 1.3995658707630656, + "grad_norm": 0.43113117207329726, + "learning_rate": 6.436015634669848e-06, + "loss": 0.2101, + "step": 4191 + }, + { + "epoch": 1.3998998163299383, + "grad_norm": 0.40792889401372284, + "learning_rate": 6.434154066142201e-06, + "loss": 0.1982, + "step": 4192 + }, + { + "epoch": 1.4002337618968108, + "grad_norm": 0.3980086772220684, + "learning_rate": 6.432292280968695e-06, + "loss": 0.2006, + "step": 4193 + }, + { + "epoch": 1.4005677074636833, + "grad_norm": 0.45321556751445036, + "learning_rate": 6.430430279430577e-06, + "loss": 0.2076, + "step": 4194 + }, + { + "epoch": 1.400901653030556, + "grad_norm": 0.3962084842360654, + "learning_rate": 6.428568061809122e-06, + "loss": 0.2022, + "step": 4195 + }, + { + "epoch": 1.4012355985974287, + "grad_norm": 0.38721195579949064, + "learning_rate": 6.426705628385641e-06, + "loss": 0.1928, + "step": 4196 + }, + { + "epoch": 1.4015695441643012, + "grad_norm": 0.43865314302481156, + "learning_rate": 6.4248429794414745e-06, + "loss": 0.1984, + "step": 4197 + }, + { + "epoch": 1.4019034897311737, + "grad_norm": 0.4245277931246651, + "learning_rate": 6.422980115258e-06, + "loss": 0.2047, + "step": 4198 + }, + { + "epoch": 1.4022374352980465, + "grad_norm": 0.40245781599580605, + "learning_rate": 6.421117036116624e-06, + "loss": 0.1939, + "step": 4199 + }, + { + "epoch": 1.402571380864919, + "grad_norm": 0.42437101178064845, + "learning_rate": 6.4192537422987864e-06, + "loss": 0.1898, + "step": 4200 + }, + { + "epoch": 1.4029053264317917, + "grad_norm": 0.41477729386611645, + "learning_rate": 6.417390234085961e-06, + "loss": 0.2016, + "step": 4201 + }, + { + "epoch": 1.4032392719986642, + "grad_norm": 0.39992406322586277, + "learning_rate": 6.415526511759649e-06, + "loss": 0.1922, + "step": 4202 + }, + { + "epoch": 1.4035732175655369, + "grad_norm": 0.45466666242275616, + "learning_rate": 6.413662575601391e-06, + "loss": 0.2148, + "step": 4203 + }, + { + "epoch": 1.4039071631324094, + "grad_norm": 0.3778303865454364, + "learning_rate": 6.4117984258927565e-06, + "loss": 0.1831, + "step": 4204 + }, + { + "epoch": 1.404241108699282, + "grad_norm": 0.4005745223294027, + "learning_rate": 6.409934062915345e-06, + "loss": 0.1847, + "step": 4205 + }, + { + "epoch": 1.4045750542661546, + "grad_norm": 0.4584571575972361, + "learning_rate": 6.408069486950793e-06, + "loss": 0.2045, + "step": 4206 + }, + { + "epoch": 1.4049089998330273, + "grad_norm": 0.41082051621126475, + "learning_rate": 6.406204698280766e-06, + "loss": 0.1912, + "step": 4207 + }, + { + "epoch": 1.4052429453998998, + "grad_norm": 0.3901252288030012, + "learning_rate": 6.40433969718696e-06, + "loss": 0.1807, + "step": 4208 + }, + { + "epoch": 1.4055768909667723, + "grad_norm": 0.4501067185058659, + "learning_rate": 6.402474483951109e-06, + "loss": 0.2018, + "step": 4209 + }, + { + "epoch": 1.405910836533645, + "grad_norm": 0.38450772473450306, + "learning_rate": 6.400609058854973e-06, + "loss": 0.1823, + "step": 4210 + }, + { + "epoch": 1.4062447821005177, + "grad_norm": 0.40515453066428003, + "learning_rate": 6.398743422180346e-06, + "loss": 0.2034, + "step": 4211 + }, + { + "epoch": 1.4065787276673902, + "grad_norm": 0.4239306652790811, + "learning_rate": 6.396877574209057e-06, + "loss": 0.2087, + "step": 4212 + }, + { + "epoch": 1.4069126732342627, + "grad_norm": 0.4566772627357021, + "learning_rate": 6.395011515222962e-06, + "loss": 0.1948, + "step": 4213 + }, + { + "epoch": 1.4072466188011354, + "grad_norm": 0.4000662579941561, + "learning_rate": 6.393145245503951e-06, + "loss": 0.1955, + "step": 4214 + }, + { + "epoch": 1.407580564368008, + "grad_norm": 0.4201143343811621, + "learning_rate": 6.391278765333948e-06, + "loss": 0.2011, + "step": 4215 + }, + { + "epoch": 1.4079145099348807, + "grad_norm": 0.39637080003077907, + "learning_rate": 6.389412074994906e-06, + "loss": 0.1937, + "step": 4216 + }, + { + "epoch": 1.4082484555017531, + "grad_norm": 0.4258908660767943, + "learning_rate": 6.387545174768809e-06, + "loss": 0.201, + "step": 4217 + }, + { + "epoch": 1.4085824010686259, + "grad_norm": 0.46190972235436634, + "learning_rate": 6.385678064937677e-06, + "loss": 0.219, + "step": 4218 + }, + { + "epoch": 1.4089163466354984, + "grad_norm": 0.40798704121606794, + "learning_rate": 6.383810745783556e-06, + "loss": 0.2036, + "step": 4219 + }, + { + "epoch": 1.409250292202371, + "grad_norm": 0.3900248306224213, + "learning_rate": 6.38194321758853e-06, + "loss": 0.182, + "step": 4220 + }, + { + "epoch": 1.4095842377692436, + "grad_norm": 0.41143107768977444, + "learning_rate": 6.3800754806347065e-06, + "loss": 0.2046, + "step": 4221 + }, + { + "epoch": 1.4099181833361163, + "grad_norm": 0.4227528210432267, + "learning_rate": 6.378207535204234e-06, + "loss": 0.2068, + "step": 4222 + }, + { + "epoch": 1.4102521289029888, + "grad_norm": 0.4496275000832063, + "learning_rate": 6.376339381579285e-06, + "loss": 0.188, + "step": 4223 + }, + { + "epoch": 1.4105860744698613, + "grad_norm": 0.3920155752449724, + "learning_rate": 6.374471020042067e-06, + "loss": 0.1915, + "step": 4224 + }, + { + "epoch": 1.410920020036734, + "grad_norm": 0.4893281271897962, + "learning_rate": 6.372602450874816e-06, + "loss": 0.2051, + "step": 4225 + }, + { + "epoch": 1.4112539656036067, + "grad_norm": 0.3895814494862588, + "learning_rate": 6.370733674359803e-06, + "loss": 0.2008, + "step": 4226 + }, + { + "epoch": 1.4115879111704792, + "grad_norm": 0.4464740149177025, + "learning_rate": 6.36886469077933e-06, + "loss": 0.1996, + "step": 4227 + }, + { + "epoch": 1.4119218567373517, + "grad_norm": 0.41982386936019905, + "learning_rate": 6.366995500415727e-06, + "loss": 0.1789, + "step": 4228 + }, + { + "epoch": 1.4122558023042244, + "grad_norm": 0.48692131344551154, + "learning_rate": 6.365126103551358e-06, + "loss": 0.1914, + "step": 4229 + }, + { + "epoch": 1.4125897478710971, + "grad_norm": 0.4188619004865136, + "learning_rate": 6.363256500468617e-06, + "loss": 0.1926, + "step": 4230 + }, + { + "epoch": 1.4129236934379696, + "grad_norm": 0.38428575637852025, + "learning_rate": 6.3613866914499285e-06, + "loss": 0.1913, + "step": 4231 + }, + { + "epoch": 1.4132576390048421, + "grad_norm": 0.4192523891713591, + "learning_rate": 6.359516676777751e-06, + "loss": 0.2058, + "step": 4232 + }, + { + "epoch": 1.4135915845717149, + "grad_norm": 0.4161796743739333, + "learning_rate": 6.357646456734574e-06, + "loss": 0.1986, + "step": 4233 + }, + { + "epoch": 1.4139255301385873, + "grad_norm": 0.412845390363555, + "learning_rate": 6.3557760316029115e-06, + "loss": 0.192, + "step": 4234 + }, + { + "epoch": 1.41425947570546, + "grad_norm": 0.3764465690310885, + "learning_rate": 6.353905401665317e-06, + "loss": 0.1832, + "step": 4235 + }, + { + "epoch": 1.4145934212723326, + "grad_norm": 0.3984816244207758, + "learning_rate": 6.35203456720437e-06, + "loss": 0.1959, + "step": 4236 + }, + { + "epoch": 1.4149273668392053, + "grad_norm": 0.4660411346747864, + "learning_rate": 6.35016352850268e-06, + "loss": 0.208, + "step": 4237 + }, + { + "epoch": 1.4152613124060778, + "grad_norm": 0.4208257324589308, + "learning_rate": 6.3482922858428915e-06, + "loss": 0.1906, + "step": 4238 + }, + { + "epoch": 1.4155952579729503, + "grad_norm": 0.4093879464060409, + "learning_rate": 6.34642083950768e-06, + "loss": 0.1919, + "step": 4239 + }, + { + "epoch": 1.415929203539823, + "grad_norm": 0.4169795445243217, + "learning_rate": 6.344549189779745e-06, + "loss": 0.1934, + "step": 4240 + }, + { + "epoch": 1.4162631491066957, + "grad_norm": 0.3999232718854513, + "learning_rate": 6.342677336941825e-06, + "loss": 0.201, + "step": 4241 + }, + { + "epoch": 1.4165970946735682, + "grad_norm": 0.43188532288830084, + "learning_rate": 6.340805281276683e-06, + "loss": 0.1915, + "step": 4242 + }, + { + "epoch": 1.4169310402404407, + "grad_norm": 0.4212897378007842, + "learning_rate": 6.338933023067114e-06, + "loss": 0.2031, + "step": 4243 + }, + { + "epoch": 1.4172649858073134, + "grad_norm": 0.39104495072727397, + "learning_rate": 6.337060562595949e-06, + "loss": 0.1838, + "step": 4244 + }, + { + "epoch": 1.4175989313741861, + "grad_norm": 0.4279064144901453, + "learning_rate": 6.3351879001460425e-06, + "loss": 0.203, + "step": 4245 + }, + { + "epoch": 1.4179328769410586, + "grad_norm": 0.44299570054290127, + "learning_rate": 6.333315036000281e-06, + "loss": 0.1909, + "step": 4246 + }, + { + "epoch": 1.4182668225079311, + "grad_norm": 0.4518229529900438, + "learning_rate": 6.331441970441585e-06, + "loss": 0.2016, + "step": 4247 + }, + { + "epoch": 1.4186007680748038, + "grad_norm": 0.437754191582659, + "learning_rate": 6.329568703752902e-06, + "loss": 0.2023, + "step": 4248 + }, + { + "epoch": 1.4189347136416763, + "grad_norm": 0.39214812360430923, + "learning_rate": 6.32769523621721e-06, + "loss": 0.1946, + "step": 4249 + }, + { + "epoch": 1.419268659208549, + "grad_norm": 0.40032628961599576, + "learning_rate": 6.3258215681175215e-06, + "loss": 0.1893, + "step": 4250 + }, + { + "epoch": 1.4196026047754216, + "grad_norm": 0.4135834268436745, + "learning_rate": 6.323947699736873e-06, + "loss": 0.1914, + "step": 4251 + }, + { + "epoch": 1.4199365503422943, + "grad_norm": 0.4408452075370049, + "learning_rate": 6.3220736313583345e-06, + "loss": 0.2073, + "step": 4252 + }, + { + "epoch": 1.4202704959091668, + "grad_norm": 0.46529501920804817, + "learning_rate": 6.320199363265008e-06, + "loss": 0.1994, + "step": 4253 + }, + { + "epoch": 1.4206044414760395, + "grad_norm": 0.4180415628920196, + "learning_rate": 6.318324895740023e-06, + "loss": 0.1979, + "step": 4254 + }, + { + "epoch": 1.420938387042912, + "grad_norm": 0.40285815975229106, + "learning_rate": 6.31645022906654e-06, + "loss": 0.1972, + "step": 4255 + }, + { + "epoch": 1.4212723326097847, + "grad_norm": 0.4245223489154361, + "learning_rate": 6.314575363527748e-06, + "loss": 0.1955, + "step": 4256 + }, + { + "epoch": 1.4216062781766572, + "grad_norm": 0.4107284657056354, + "learning_rate": 6.312700299406871e-06, + "loss": 0.1956, + "step": 4257 + }, + { + "epoch": 1.4219402237435297, + "grad_norm": 0.4123475914189533, + "learning_rate": 6.310825036987154e-06, + "loss": 0.1987, + "step": 4258 + }, + { + "epoch": 1.4222741693104024, + "grad_norm": 0.4113685742620576, + "learning_rate": 6.308949576551884e-06, + "loss": 0.189, + "step": 4259 + }, + { + "epoch": 1.4226081148772751, + "grad_norm": 0.4113493190674343, + "learning_rate": 6.3070739183843655e-06, + "loss": 0.1958, + "step": 4260 + }, + { + "epoch": 1.4229420604441476, + "grad_norm": 0.41318754392232565, + "learning_rate": 6.305198062767942e-06, + "loss": 0.1921, + "step": 4261 + }, + { + "epoch": 1.4232760060110201, + "grad_norm": 0.42444570405075827, + "learning_rate": 6.303322009985984e-06, + "loss": 0.2002, + "step": 4262 + }, + { + "epoch": 1.4236099515778928, + "grad_norm": 0.4036884323591814, + "learning_rate": 6.301445760321889e-06, + "loss": 0.1921, + "step": 4263 + }, + { + "epoch": 1.4239438971447653, + "grad_norm": 0.3868929082230903, + "learning_rate": 6.299569314059088e-06, + "loss": 0.1952, + "step": 4264 + }, + { + "epoch": 1.424277842711638, + "grad_norm": 0.3934660880793526, + "learning_rate": 6.297692671481042e-06, + "loss": 0.1961, + "step": 4265 + }, + { + "epoch": 1.4246117882785105, + "grad_norm": 0.5476517438338885, + "learning_rate": 6.295815832871235e-06, + "loss": 0.178, + "step": 4266 + }, + { + "epoch": 1.4249457338453833, + "grad_norm": 0.42403647877836415, + "learning_rate": 6.2939387985131905e-06, + "loss": 0.1926, + "step": 4267 + }, + { + "epoch": 1.4252796794122558, + "grad_norm": 0.4603384594375843, + "learning_rate": 6.292061568690455e-06, + "loss": 0.1962, + "step": 4268 + }, + { + "epoch": 1.4256136249791285, + "grad_norm": 0.42212567893770986, + "learning_rate": 6.290184143686606e-06, + "loss": 0.1825, + "step": 4269 + }, + { + "epoch": 1.425947570546001, + "grad_norm": 0.42720372519788125, + "learning_rate": 6.288306523785252e-06, + "loss": 0.2023, + "step": 4270 + }, + { + "epoch": 1.4262815161128737, + "grad_norm": 0.3850216851382417, + "learning_rate": 6.286428709270026e-06, + "loss": 0.19, + "step": 4271 + }, + { + "epoch": 1.4266154616797462, + "grad_norm": 0.42293406684487284, + "learning_rate": 6.284550700424597e-06, + "loss": 0.1934, + "step": 4272 + }, + { + "epoch": 1.4269494072466187, + "grad_norm": 0.4185602665867109, + "learning_rate": 6.282672497532659e-06, + "loss": 0.1862, + "step": 4273 + }, + { + "epoch": 1.4272833528134914, + "grad_norm": 0.42353635443489346, + "learning_rate": 6.280794100877938e-06, + "loss": 0.1976, + "step": 4274 + }, + { + "epoch": 1.427617298380364, + "grad_norm": 0.4193474465872973, + "learning_rate": 6.278915510744187e-06, + "loss": 0.1798, + "step": 4275 + }, + { + "epoch": 1.4279512439472366, + "grad_norm": 0.47731951370814735, + "learning_rate": 6.277036727415189e-06, + "loss": 0.2027, + "step": 4276 + }, + { + "epoch": 1.428285189514109, + "grad_norm": 0.41253445818583245, + "learning_rate": 6.2751577511747575e-06, + "loss": 0.1864, + "step": 4277 + }, + { + "epoch": 1.4286191350809818, + "grad_norm": 0.41072543870509975, + "learning_rate": 6.273278582306732e-06, + "loss": 0.2005, + "step": 4278 + }, + { + "epoch": 1.4289530806478545, + "grad_norm": 0.4464246228500059, + "learning_rate": 6.271399221094986e-06, + "loss": 0.1992, + "step": 4279 + }, + { + "epoch": 1.429287026214727, + "grad_norm": 0.4094540550192012, + "learning_rate": 6.269519667823416e-06, + "loss": 0.1973, + "step": 4280 + }, + { + "epoch": 1.4296209717815995, + "grad_norm": 0.41233321137674056, + "learning_rate": 6.267639922775952e-06, + "loss": 0.1979, + "step": 4281 + }, + { + "epoch": 1.4299549173484722, + "grad_norm": 0.424743809144049, + "learning_rate": 6.265759986236552e-06, + "loss": 0.2035, + "step": 4282 + }, + { + "epoch": 1.4302888629153447, + "grad_norm": 0.4005345858496865, + "learning_rate": 6.263879858489204e-06, + "loss": 0.1913, + "step": 4283 + }, + { + "epoch": 1.4306228084822175, + "grad_norm": 0.4053273447527923, + "learning_rate": 6.261999539817919e-06, + "loss": 0.1943, + "step": 4284 + }, + { + "epoch": 1.43095675404909, + "grad_norm": 0.4185492417076911, + "learning_rate": 6.260119030506746e-06, + "loss": 0.1975, + "step": 4285 + }, + { + "epoch": 1.4312906996159627, + "grad_norm": 0.3913801060486488, + "learning_rate": 6.258238330839754e-06, + "loss": 0.1798, + "step": 4286 + }, + { + "epoch": 1.4316246451828352, + "grad_norm": 0.426769833384345, + "learning_rate": 6.2563574411010485e-06, + "loss": 0.1964, + "step": 4287 + }, + { + "epoch": 1.4319585907497077, + "grad_norm": 0.41614648545349253, + "learning_rate": 6.254476361574757e-06, + "loss": 0.189, + "step": 4288 + }, + { + "epoch": 1.4322925363165804, + "grad_norm": 0.4241398703694006, + "learning_rate": 6.252595092545042e-06, + "loss": 0.1875, + "step": 4289 + }, + { + "epoch": 1.432626481883453, + "grad_norm": 0.4299458304622268, + "learning_rate": 6.250713634296087e-06, + "loss": 0.2018, + "step": 4290 + }, + { + "epoch": 1.4329604274503256, + "grad_norm": 0.4229363795139313, + "learning_rate": 6.248831987112113e-06, + "loss": 0.206, + "step": 4291 + }, + { + "epoch": 1.433294373017198, + "grad_norm": 0.401583294371539, + "learning_rate": 6.246950151277362e-06, + "loss": 0.1883, + "step": 4292 + }, + { + "epoch": 1.4336283185840708, + "grad_norm": 0.4519792653966738, + "learning_rate": 6.245068127076109e-06, + "loss": 0.2018, + "step": 4293 + }, + { + "epoch": 1.4339622641509435, + "grad_norm": 0.40227774137045846, + "learning_rate": 6.243185914792655e-06, + "loss": 0.1819, + "step": 4294 + }, + { + "epoch": 1.434296209717816, + "grad_norm": 0.559657939335022, + "learning_rate": 6.2413035147113295e-06, + "loss": 0.1913, + "step": 4295 + }, + { + "epoch": 1.4346301552846885, + "grad_norm": 0.3972251507876515, + "learning_rate": 6.239420927116493e-06, + "loss": 0.1886, + "step": 4296 + }, + { + "epoch": 1.4349641008515612, + "grad_norm": 0.41776370884070535, + "learning_rate": 6.2375381522925325e-06, + "loss": 0.1972, + "step": 4297 + }, + { + "epoch": 1.4352980464184337, + "grad_norm": 0.41015662981383944, + "learning_rate": 6.235655190523862e-06, + "loss": 0.1978, + "step": 4298 + }, + { + "epoch": 1.4356319919853064, + "grad_norm": 0.4229324755634245, + "learning_rate": 6.233772042094924e-06, + "loss": 0.1968, + "step": 4299 + }, + { + "epoch": 1.435965937552179, + "grad_norm": 0.4710852821204468, + "learning_rate": 6.231888707290194e-06, + "loss": 0.2016, + "step": 4300 + }, + { + "epoch": 1.4362998831190517, + "grad_norm": 0.5534094086271172, + "learning_rate": 6.230005186394169e-06, + "loss": 0.2034, + "step": 4301 + }, + { + "epoch": 1.4366338286859242, + "grad_norm": 0.40362990371947977, + "learning_rate": 6.228121479691377e-06, + "loss": 0.1976, + "step": 4302 + }, + { + "epoch": 1.4369677742527969, + "grad_norm": 0.46882248799019716, + "learning_rate": 6.226237587466375e-06, + "loss": 0.216, + "step": 4303 + }, + { + "epoch": 1.4373017198196694, + "grad_norm": 0.44983410588953954, + "learning_rate": 6.224353510003747e-06, + "loss": 0.1955, + "step": 4304 + }, + { + "epoch": 1.437635665386542, + "grad_norm": 0.4562795509116905, + "learning_rate": 6.222469247588105e-06, + "loss": 0.1959, + "step": 4305 + }, + { + "epoch": 1.4379696109534146, + "grad_norm": 0.40086144043627886, + "learning_rate": 6.220584800504091e-06, + "loss": 0.1897, + "step": 4306 + }, + { + "epoch": 1.438303556520287, + "grad_norm": 0.4220200763976442, + "learning_rate": 6.218700169036368e-06, + "loss": 0.2092, + "step": 4307 + }, + { + "epoch": 1.4386375020871598, + "grad_norm": 0.40218684127289284, + "learning_rate": 6.216815353469636e-06, + "loss": 0.1891, + "step": 4308 + }, + { + "epoch": 1.4389714476540325, + "grad_norm": 0.3699554890094108, + "learning_rate": 6.214930354088618e-06, + "loss": 0.1804, + "step": 4309 + }, + { + "epoch": 1.439305393220905, + "grad_norm": 0.46604002782555415, + "learning_rate": 6.213045171178063e-06, + "loss": 0.2052, + "step": 4310 + }, + { + "epoch": 1.4396393387877775, + "grad_norm": 0.4096947218921958, + "learning_rate": 6.2111598050227535e-06, + "loss": 0.1879, + "step": 4311 + }, + { + "epoch": 1.4399732843546502, + "grad_norm": 0.43814289672198536, + "learning_rate": 6.209274255907494e-06, + "loss": 0.2108, + "step": 4312 + }, + { + "epoch": 1.4403072299215227, + "grad_norm": 0.4349153906107477, + "learning_rate": 6.207388524117119e-06, + "loss": 0.1942, + "step": 4313 + }, + { + "epoch": 1.4406411754883954, + "grad_norm": 0.389711559079184, + "learning_rate": 6.205502609936491e-06, + "loss": 0.189, + "step": 4314 + }, + { + "epoch": 1.440975121055268, + "grad_norm": 0.39940637696014164, + "learning_rate": 6.2036165136505e-06, + "loss": 0.1912, + "step": 4315 + }, + { + "epoch": 1.4413090666221406, + "grad_norm": 0.4348476443490899, + "learning_rate": 6.201730235544062e-06, + "loss": 0.2026, + "step": 4316 + }, + { + "epoch": 1.4416430121890131, + "grad_norm": 0.4010344004022616, + "learning_rate": 6.1998437759021235e-06, + "loss": 0.19, + "step": 4317 + }, + { + "epoch": 1.4419769577558859, + "grad_norm": 0.4077069086603506, + "learning_rate": 6.197957135009653e-06, + "loss": 0.1992, + "step": 4318 + }, + { + "epoch": 1.4423109033227584, + "grad_norm": 0.454450027580997, + "learning_rate": 6.196070313151652e-06, + "loss": 0.2078, + "step": 4319 + }, + { + "epoch": 1.442644848889631, + "grad_norm": 0.42090639548690023, + "learning_rate": 6.194183310613147e-06, + "loss": 0.2042, + "step": 4320 + }, + { + "epoch": 1.4429787944565036, + "grad_norm": 0.3924425533047438, + "learning_rate": 6.1922961276791925e-06, + "loss": 0.1861, + "step": 4321 + }, + { + "epoch": 1.443312740023376, + "grad_norm": 0.43825434862695306, + "learning_rate": 6.190408764634869e-06, + "loss": 0.2036, + "step": 4322 + }, + { + "epoch": 1.4436466855902488, + "grad_norm": 0.4554298024237673, + "learning_rate": 6.188521221765285e-06, + "loss": 0.2075, + "step": 4323 + }, + { + "epoch": 1.4439806311571215, + "grad_norm": 0.40310377491021876, + "learning_rate": 6.186633499355576e-06, + "loss": 0.1849, + "step": 4324 + }, + { + "epoch": 1.444314576723994, + "grad_norm": 0.3958077216570012, + "learning_rate": 6.184745597690903e-06, + "loss": 0.2019, + "step": 4325 + }, + { + "epoch": 1.4446485222908665, + "grad_norm": 0.41846403968531265, + "learning_rate": 6.1828575170564595e-06, + "loss": 0.1921, + "step": 4326 + }, + { + "epoch": 1.4449824678577392, + "grad_norm": 0.4349377636186543, + "learning_rate": 6.18096925773746e-06, + "loss": 0.1972, + "step": 4327 + }, + { + "epoch": 1.445316413424612, + "grad_norm": 0.39603752468722014, + "learning_rate": 6.179080820019147e-06, + "loss": 0.1766, + "step": 4328 + }, + { + "epoch": 1.4456503589914844, + "grad_norm": 0.40669232085821777, + "learning_rate": 6.177192204186796e-06, + "loss": 0.1931, + "step": 4329 + }, + { + "epoch": 1.445984304558357, + "grad_norm": 0.3946887338662643, + "learning_rate": 6.1753034105257e-06, + "loss": 0.1957, + "step": 4330 + }, + { + "epoch": 1.4463182501252296, + "grad_norm": 0.41443060881225996, + "learning_rate": 6.173414439321185e-06, + "loss": 0.1964, + "step": 4331 + }, + { + "epoch": 1.4466521956921021, + "grad_norm": 0.4405029054870975, + "learning_rate": 6.171525290858602e-06, + "loss": 0.1927, + "step": 4332 + }, + { + "epoch": 1.4469861412589748, + "grad_norm": 0.4628203122774109, + "learning_rate": 6.169635965423331e-06, + "loss": 0.211, + "step": 4333 + }, + { + "epoch": 1.4473200868258473, + "grad_norm": 0.48872692289281494, + "learning_rate": 6.167746463300774e-06, + "loss": 0.2088, + "step": 4334 + }, + { + "epoch": 1.44765403239272, + "grad_norm": 0.4343259836847498, + "learning_rate": 6.1658567847763655e-06, + "loss": 0.2007, + "step": 4335 + }, + { + "epoch": 1.4479879779595926, + "grad_norm": 0.4398765922609618, + "learning_rate": 6.163966930135561e-06, + "loss": 0.2006, + "step": 4336 + }, + { + "epoch": 1.448321923526465, + "grad_norm": 0.45579456571854265, + "learning_rate": 6.162076899663846e-06, + "loss": 0.2107, + "step": 4337 + }, + { + "epoch": 1.4486558690933378, + "grad_norm": 0.40883179410081466, + "learning_rate": 6.160186693646732e-06, + "loss": 0.1849, + "step": 4338 + }, + { + "epoch": 1.4489898146602105, + "grad_norm": 0.49071716904008567, + "learning_rate": 6.158296312369759e-06, + "loss": 0.2147, + "step": 4339 + }, + { + "epoch": 1.449323760227083, + "grad_norm": 0.4314291467098978, + "learning_rate": 6.156405756118489e-06, + "loss": 0.2086, + "step": 4340 + }, + { + "epoch": 1.4496577057939555, + "grad_norm": 0.4593921097879424, + "learning_rate": 6.154515025178511e-06, + "loss": 0.2002, + "step": 4341 + }, + { + "epoch": 1.4499916513608282, + "grad_norm": 0.45040771494821547, + "learning_rate": 6.152624119835447e-06, + "loss": 0.2086, + "step": 4342 + }, + { + "epoch": 1.450325596927701, + "grad_norm": 0.4299923678581887, + "learning_rate": 6.150733040374937e-06, + "loss": 0.2014, + "step": 4343 + }, + { + "epoch": 1.4506595424945734, + "grad_norm": 0.400344990906151, + "learning_rate": 6.148841787082653e-06, + "loss": 0.1927, + "step": 4344 + }, + { + "epoch": 1.450993488061446, + "grad_norm": 0.5435972870488641, + "learning_rate": 6.146950360244288e-06, + "loss": 0.1967, + "step": 4345 + }, + { + "epoch": 1.4513274336283186, + "grad_norm": 0.5742306297652585, + "learning_rate": 6.145058760145568e-06, + "loss": 0.2234, + "step": 4346 + }, + { + "epoch": 1.4516613791951911, + "grad_norm": 0.39381531571315703, + "learning_rate": 6.14316698707224e-06, + "loss": 0.188, + "step": 4347 + }, + { + "epoch": 1.4519953247620638, + "grad_norm": 0.39767928183404444, + "learning_rate": 6.1412750413100754e-06, + "loss": 0.1967, + "step": 4348 + }, + { + "epoch": 1.4523292703289363, + "grad_norm": 0.42473106735652594, + "learning_rate": 6.13938292314488e-06, + "loss": 0.2017, + "step": 4349 + }, + { + "epoch": 1.452663215895809, + "grad_norm": 0.4608850855222034, + "learning_rate": 6.137490632862479e-06, + "loss": 0.2073, + "step": 4350 + }, + { + "epoch": 1.4529971614626815, + "grad_norm": 0.44194784147975535, + "learning_rate": 6.135598170748721e-06, + "loss": 0.1999, + "step": 4351 + }, + { + "epoch": 1.4533311070295543, + "grad_norm": 0.4336212713439643, + "learning_rate": 6.13370553708949e-06, + "loss": 0.21, + "step": 4352 + }, + { + "epoch": 1.4536650525964268, + "grad_norm": 0.4474198500961851, + "learning_rate": 6.13181273217069e-06, + "loss": 0.2107, + "step": 4353 + }, + { + "epoch": 1.4539989981632995, + "grad_norm": 0.3988946635025851, + "learning_rate": 6.129919756278248e-06, + "loss": 0.1998, + "step": 4354 + }, + { + "epoch": 1.454332943730172, + "grad_norm": 0.4289910244729747, + "learning_rate": 6.128026609698124e-06, + "loss": 0.1926, + "step": 4355 + }, + { + "epoch": 1.4546668892970445, + "grad_norm": 0.38926894257927563, + "learning_rate": 6.126133292716297e-06, + "loss": 0.1893, + "step": 4356 + }, + { + "epoch": 1.4550008348639172, + "grad_norm": 0.42221784850848676, + "learning_rate": 6.124239805618778e-06, + "loss": 0.1908, + "step": 4357 + }, + { + "epoch": 1.45533478043079, + "grad_norm": 0.39646952858495677, + "learning_rate": 6.122346148691598e-06, + "loss": 0.1877, + "step": 4358 + }, + { + "epoch": 1.4556687259976624, + "grad_norm": 0.40546148481183963, + "learning_rate": 6.120452322220818e-06, + "loss": 0.2004, + "step": 4359 + }, + { + "epoch": 1.456002671564535, + "grad_norm": 0.4547099784865291, + "learning_rate": 6.11855832649252e-06, + "loss": 0.2115, + "step": 4360 + }, + { + "epoch": 1.4563366171314076, + "grad_norm": 0.38112993327587635, + "learning_rate": 6.116664161792817e-06, + "loss": 0.1858, + "step": 4361 + }, + { + "epoch": 1.45667056269828, + "grad_norm": 0.4019263289774499, + "learning_rate": 6.114769828407845e-06, + "loss": 0.1955, + "step": 4362 + }, + { + "epoch": 1.4570045082651528, + "grad_norm": 0.41009801163114296, + "learning_rate": 6.112875326623763e-06, + "loss": 0.1882, + "step": 4363 + }, + { + "epoch": 1.4573384538320253, + "grad_norm": 0.4079447248544796, + "learning_rate": 6.110980656726759e-06, + "loss": 0.1889, + "step": 4364 + }, + { + "epoch": 1.457672399398898, + "grad_norm": 0.4516607016893233, + "learning_rate": 6.109085819003048e-06, + "loss": 0.2073, + "step": 4365 + }, + { + "epoch": 1.4580063449657705, + "grad_norm": 0.42566248368763016, + "learning_rate": 6.107190813738864e-06, + "loss": 0.2089, + "step": 4366 + }, + { + "epoch": 1.4583402905326432, + "grad_norm": 0.39147915382309184, + "learning_rate": 6.10529564122047e-06, + "loss": 0.1974, + "step": 4367 + }, + { + "epoch": 1.4586742360995157, + "grad_norm": 0.3982765548370061, + "learning_rate": 6.103400301734155e-06, + "loss": 0.1993, + "step": 4368 + }, + { + "epoch": 1.4590081816663885, + "grad_norm": 0.40681227759099087, + "learning_rate": 6.101504795566232e-06, + "loss": 0.2013, + "step": 4369 + }, + { + "epoch": 1.459342127233261, + "grad_norm": 0.44284559949396546, + "learning_rate": 6.099609123003041e-06, + "loss": 0.2055, + "step": 4370 + }, + { + "epoch": 1.4596760728001335, + "grad_norm": 0.3860492351094961, + "learning_rate": 6.097713284330944e-06, + "loss": 0.1948, + "step": 4371 + }, + { + "epoch": 1.4600100183670062, + "grad_norm": 0.41425735018239185, + "learning_rate": 6.095817279836329e-06, + "loss": 0.1912, + "step": 4372 + }, + { + "epoch": 1.4603439639338789, + "grad_norm": 0.38871439359382953, + "learning_rate": 6.093921109805612e-06, + "loss": 0.1928, + "step": 4373 + }, + { + "epoch": 1.4606779095007514, + "grad_norm": 0.4143118345237507, + "learning_rate": 6.092024774525231e-06, + "loss": 0.215, + "step": 4374 + }, + { + "epoch": 1.4610118550676239, + "grad_norm": 0.44030852410849036, + "learning_rate": 6.090128274281649e-06, + "loss": 0.2048, + "step": 4375 + }, + { + "epoch": 1.4613458006344966, + "grad_norm": 0.4075810714964566, + "learning_rate": 6.0882316093613555e-06, + "loss": 0.191, + "step": 4376 + }, + { + "epoch": 1.4616797462013693, + "grad_norm": 0.47178283797989656, + "learning_rate": 6.086334780050865e-06, + "loss": 0.2076, + "step": 4377 + }, + { + "epoch": 1.4620136917682418, + "grad_norm": 0.4385058940753501, + "learning_rate": 6.084437786636713e-06, + "loss": 0.1973, + "step": 4378 + }, + { + "epoch": 1.4623476373351143, + "grad_norm": 0.39454270917277967, + "learning_rate": 6.082540629405467e-06, + "loss": 0.1956, + "step": 4379 + }, + { + "epoch": 1.462681582901987, + "grad_norm": 0.4246463460236804, + "learning_rate": 6.08064330864371e-06, + "loss": 0.1897, + "step": 4380 + }, + { + "epoch": 1.4630155284688595, + "grad_norm": 0.4279872130659886, + "learning_rate": 6.078745824638058e-06, + "loss": 0.1909, + "step": 4381 + }, + { + "epoch": 1.4633494740357322, + "grad_norm": 0.4037107620637563, + "learning_rate": 6.076848177675148e-06, + "loss": 0.185, + "step": 4382 + }, + { + "epoch": 1.4636834196026047, + "grad_norm": 0.452895373409483, + "learning_rate": 6.07495036804164e-06, + "loss": 0.1987, + "step": 4383 + }, + { + "epoch": 1.4640173651694774, + "grad_norm": 0.46645780502615036, + "learning_rate": 6.073052396024222e-06, + "loss": 0.2091, + "step": 4384 + }, + { + "epoch": 1.46435131073635, + "grad_norm": 0.4041982376337457, + "learning_rate": 6.071154261909605e-06, + "loss": 0.1929, + "step": 4385 + }, + { + "epoch": 1.4646852563032224, + "grad_norm": 0.4356675688935076, + "learning_rate": 6.069255965984524e-06, + "loss": 0.2031, + "step": 4386 + }, + { + "epoch": 1.4650192018700952, + "grad_norm": 0.40081538939786715, + "learning_rate": 6.067357508535741e-06, + "loss": 0.1979, + "step": 4387 + }, + { + "epoch": 1.4653531474369679, + "grad_norm": 0.424597670187965, + "learning_rate": 6.065458889850037e-06, + "loss": 0.1921, + "step": 4388 + }, + { + "epoch": 1.4656870930038404, + "grad_norm": 0.4502702104798383, + "learning_rate": 6.063560110214224e-06, + "loss": 0.2006, + "step": 4389 + }, + { + "epoch": 1.4660210385707129, + "grad_norm": 0.4195663771268208, + "learning_rate": 6.061661169915132e-06, + "loss": 0.1984, + "step": 4390 + }, + { + "epoch": 1.4663549841375856, + "grad_norm": 0.4308479391309738, + "learning_rate": 6.05976206923962e-06, + "loss": 0.2084, + "step": 4391 + }, + { + "epoch": 1.4666889297044583, + "grad_norm": 0.4166693946797753, + "learning_rate": 6.057862808474569e-06, + "loss": 0.1985, + "step": 4392 + }, + { + "epoch": 1.4670228752713308, + "grad_norm": 0.4603888013323258, + "learning_rate": 6.055963387906884e-06, + "loss": 0.2015, + "step": 4393 + }, + { + "epoch": 1.4673568208382033, + "grad_norm": 0.4081769473587065, + "learning_rate": 6.054063807823497e-06, + "loss": 0.2027, + "step": 4394 + }, + { + "epoch": 1.467690766405076, + "grad_norm": 0.4134409770785201, + "learning_rate": 6.052164068511359e-06, + "loss": 0.1994, + "step": 4395 + }, + { + "epoch": 1.4680247119719485, + "grad_norm": 0.4692504389911519, + "learning_rate": 6.05026417025745e-06, + "loss": 0.2123, + "step": 4396 + }, + { + "epoch": 1.4683586575388212, + "grad_norm": 0.4493204264781914, + "learning_rate": 6.0483641133487736e-06, + "loss": 0.2065, + "step": 4397 + }, + { + "epoch": 1.4686926031056937, + "grad_norm": 0.42766406223034875, + "learning_rate": 6.046463898072351e-06, + "loss": 0.2048, + "step": 4398 + }, + { + "epoch": 1.4690265486725664, + "grad_norm": 0.5308801017267736, + "learning_rate": 6.044563524715237e-06, + "loss": 0.1971, + "step": 4399 + }, + { + "epoch": 1.469360494239439, + "grad_norm": 0.4262125710912439, + "learning_rate": 6.042662993564503e-06, + "loss": 0.1949, + "step": 4400 + }, + { + "epoch": 1.4696944398063116, + "grad_norm": 0.4025672852690035, + "learning_rate": 6.040762304907246e-06, + "loss": 0.1984, + "step": 4401 + }, + { + "epoch": 1.4700283853731841, + "grad_norm": 0.4026158150281825, + "learning_rate": 6.038861459030588e-06, + "loss": 0.1891, + "step": 4402 + }, + { + "epoch": 1.4703623309400569, + "grad_norm": 0.3956628314819311, + "learning_rate": 6.036960456221677e-06, + "loss": 0.1958, + "step": 4403 + }, + { + "epoch": 1.4706962765069294, + "grad_norm": 0.4044262098140246, + "learning_rate": 6.035059296767676e-06, + "loss": 0.1916, + "step": 4404 + }, + { + "epoch": 1.4710302220738019, + "grad_norm": 0.480407738770424, + "learning_rate": 6.033157980955782e-06, + "loss": 0.2004, + "step": 4405 + }, + { + "epoch": 1.4713641676406746, + "grad_norm": 0.46733025871686174, + "learning_rate": 6.0312565090732115e-06, + "loss": 0.2135, + "step": 4406 + }, + { + "epoch": 1.4716981132075473, + "grad_norm": 0.38309985347010694, + "learning_rate": 6.0293548814072004e-06, + "loss": 0.1973, + "step": 4407 + }, + { + "epoch": 1.4720320587744198, + "grad_norm": 0.43882038617597807, + "learning_rate": 6.0274530982450155e-06, + "loss": 0.2097, + "step": 4408 + }, + { + "epoch": 1.4723660043412923, + "grad_norm": 0.4763463732615725, + "learning_rate": 6.025551159873941e-06, + "loss": 0.2025, + "step": 4409 + }, + { + "epoch": 1.472699949908165, + "grad_norm": 0.4773269336687056, + "learning_rate": 6.023649066581288e-06, + "loss": 0.2237, + "step": 4410 + }, + { + "epoch": 1.4730338954750375, + "grad_norm": 0.4148665052375862, + "learning_rate": 6.021746818654393e-06, + "loss": 0.2008, + "step": 4411 + }, + { + "epoch": 1.4733678410419102, + "grad_norm": 0.43151291731531444, + "learning_rate": 6.019844416380609e-06, + "loss": 0.1983, + "step": 4412 + }, + { + "epoch": 1.4737017866087827, + "grad_norm": 0.4286672462162659, + "learning_rate": 6.017941860047318e-06, + "loss": 0.1993, + "step": 4413 + }, + { + "epoch": 1.4740357321756554, + "grad_norm": 0.42593830099460744, + "learning_rate": 6.016039149941924e-06, + "loss": 0.2, + "step": 4414 + }, + { + "epoch": 1.474369677742528, + "grad_norm": 0.44556539133547163, + "learning_rate": 6.01413628635185e-06, + "loss": 0.2224, + "step": 4415 + }, + { + "epoch": 1.4747036233094006, + "grad_norm": 0.40967464330179093, + "learning_rate": 6.012233269564551e-06, + "loss": 0.1948, + "step": 4416 + }, + { + "epoch": 1.4750375688762731, + "grad_norm": 0.43684591455417715, + "learning_rate": 6.010330099867497e-06, + "loss": 0.2075, + "step": 4417 + }, + { + "epoch": 1.4753715144431458, + "grad_norm": 0.4341408303231394, + "learning_rate": 6.008426777548186e-06, + "loss": 0.1982, + "step": 4418 + }, + { + "epoch": 1.4757054600100183, + "grad_norm": 0.4358707442878612, + "learning_rate": 6.0065233028941365e-06, + "loss": 0.2045, + "step": 4419 + }, + { + "epoch": 1.4760394055768908, + "grad_norm": 0.37328822493004127, + "learning_rate": 6.00461967619289e-06, + "loss": 0.1821, + "step": 4420 + }, + { + "epoch": 1.4763733511437636, + "grad_norm": 0.400014805788866, + "learning_rate": 6.002715897732013e-06, + "loss": 0.1847, + "step": 4421 + }, + { + "epoch": 1.4767072967106363, + "grad_norm": 0.4046246703398329, + "learning_rate": 6.000811967799092e-06, + "loss": 0.1899, + "step": 4422 + }, + { + "epoch": 1.4770412422775088, + "grad_norm": 0.40715797582521757, + "learning_rate": 5.99890788668174e-06, + "loss": 0.1941, + "step": 4423 + }, + { + "epoch": 1.4773751878443813, + "grad_norm": 0.4206959634130982, + "learning_rate": 5.997003654667589e-06, + "loss": 0.1991, + "step": 4424 + }, + { + "epoch": 1.477709133411254, + "grad_norm": 0.40769749828652163, + "learning_rate": 5.995099272044298e-06, + "loss": 0.1829, + "step": 4425 + }, + { + "epoch": 1.4780430789781267, + "grad_norm": 0.44404122589778716, + "learning_rate": 5.9931947390995435e-06, + "loss": 0.2092, + "step": 4426 + }, + { + "epoch": 1.4783770245449992, + "grad_norm": 0.42520813532123286, + "learning_rate": 5.99129005612103e-06, + "loss": 0.2081, + "step": 4427 + }, + { + "epoch": 1.4787109701118717, + "grad_norm": 0.4122469308763514, + "learning_rate": 5.989385223396482e-06, + "loss": 0.2011, + "step": 4428 + }, + { + "epoch": 1.4790449156787444, + "grad_norm": 0.3929382468133838, + "learning_rate": 5.987480241213646e-06, + "loss": 0.192, + "step": 4429 + }, + { + "epoch": 1.479378861245617, + "grad_norm": 0.5672122545325259, + "learning_rate": 5.985575109860292e-06, + "loss": 0.2116, + "step": 4430 + }, + { + "epoch": 1.4797128068124896, + "grad_norm": 0.4983677423174381, + "learning_rate": 5.983669829624214e-06, + "loss": 0.2018, + "step": 4431 + }, + { + "epoch": 1.4800467523793621, + "grad_norm": 0.42548655408845926, + "learning_rate": 5.981764400793224e-06, + "loss": 0.1971, + "step": 4432 + }, + { + "epoch": 1.4803806979462348, + "grad_norm": 0.39361510423237334, + "learning_rate": 5.9798588236551626e-06, + "loss": 0.1919, + "step": 4433 + }, + { + "epoch": 1.4807146435131073, + "grad_norm": 0.42721227153711655, + "learning_rate": 5.977953098497889e-06, + "loss": 0.2013, + "step": 4434 + }, + { + "epoch": 1.4810485890799798, + "grad_norm": 0.4093784539006786, + "learning_rate": 5.976047225609284e-06, + "loss": 0.1945, + "step": 4435 + }, + { + "epoch": 1.4813825346468525, + "grad_norm": 0.39877583576233977, + "learning_rate": 5.974141205277253e-06, + "loss": 0.1932, + "step": 4436 + }, + { + "epoch": 1.4817164802137253, + "grad_norm": 0.4241654493666607, + "learning_rate": 5.972235037789723e-06, + "loss": 0.1888, + "step": 4437 + }, + { + "epoch": 1.4820504257805978, + "grad_norm": 0.4127889957282086, + "learning_rate": 5.970328723434642e-06, + "loss": 0.1855, + "step": 4438 + }, + { + "epoch": 1.4823843713474703, + "grad_norm": 0.43974577195283593, + "learning_rate": 5.968422262499983e-06, + "loss": 0.2001, + "step": 4439 + }, + { + "epoch": 1.482718316914343, + "grad_norm": 0.43856634741945644, + "learning_rate": 5.966515655273739e-06, + "loss": 0.1909, + "step": 4440 + }, + { + "epoch": 1.4830522624812157, + "grad_norm": 0.43919243938383296, + "learning_rate": 5.9646089020439245e-06, + "loss": 0.1957, + "step": 4441 + }, + { + "epoch": 1.4833862080480882, + "grad_norm": 0.39521255006239037, + "learning_rate": 5.962702003098576e-06, + "loss": 0.1884, + "step": 4442 + }, + { + "epoch": 1.4837201536149607, + "grad_norm": 0.39010763406866694, + "learning_rate": 5.960794958725756e-06, + "loss": 0.1967, + "step": 4443 + }, + { + "epoch": 1.4840540991818334, + "grad_norm": 0.38308666558185406, + "learning_rate": 5.958887769213544e-06, + "loss": 0.1898, + "step": 4444 + }, + { + "epoch": 1.484388044748706, + "grad_norm": 0.3808395689544056, + "learning_rate": 5.956980434850044e-06, + "loss": 0.1926, + "step": 4445 + }, + { + "epoch": 1.4847219903155786, + "grad_norm": 0.4019120461912746, + "learning_rate": 5.955072955923381e-06, + "loss": 0.1901, + "step": 4446 + }, + { + "epoch": 1.485055935882451, + "grad_norm": 0.3914079755133911, + "learning_rate": 5.9531653327217035e-06, + "loss": 0.1858, + "step": 4447 + }, + { + "epoch": 1.4853898814493238, + "grad_norm": 0.4001292440001611, + "learning_rate": 5.951257565533177e-06, + "loss": 0.1942, + "step": 4448 + }, + { + "epoch": 1.4857238270161963, + "grad_norm": 0.44889699749599704, + "learning_rate": 5.949349654645997e-06, + "loss": 0.2144, + "step": 4449 + }, + { + "epoch": 1.486057772583069, + "grad_norm": 0.4376071738007426, + "learning_rate": 5.947441600348373e-06, + "loss": 0.2112, + "step": 4450 + }, + { + "epoch": 1.4863917181499415, + "grad_norm": 0.5002467028319895, + "learning_rate": 5.945533402928537e-06, + "loss": 0.2143, + "step": 4451 + }, + { + "epoch": 1.4867256637168142, + "grad_norm": 0.43261606233366906, + "learning_rate": 5.9436250626747505e-06, + "loss": 0.2231, + "step": 4452 + }, + { + "epoch": 1.4870596092836867, + "grad_norm": 0.42928106883312683, + "learning_rate": 5.941716579875286e-06, + "loss": 0.1944, + "step": 4453 + }, + { + "epoch": 1.4873935548505592, + "grad_norm": 0.44047447971144676, + "learning_rate": 5.939807954818443e-06, + "loss": 0.2087, + "step": 4454 + }, + { + "epoch": 1.487727500417432, + "grad_norm": 0.647500480455085, + "learning_rate": 5.937899187792544e-06, + "loss": 0.1897, + "step": 4455 + }, + { + "epoch": 1.4880614459843047, + "grad_norm": 0.39203680888737436, + "learning_rate": 5.935990279085928e-06, + "loss": 0.1926, + "step": 4456 + }, + { + "epoch": 1.4883953915511772, + "grad_norm": 0.7590048406114819, + "learning_rate": 5.93408122898696e-06, + "loss": 0.2066, + "step": 4457 + }, + { + "epoch": 1.4887293371180497, + "grad_norm": 0.3828687255820843, + "learning_rate": 5.9321720377840245e-06, + "loss": 0.1898, + "step": 4458 + }, + { + "epoch": 1.4890632826849224, + "grad_norm": 0.3989626077341179, + "learning_rate": 5.930262705765526e-06, + "loss": 0.1919, + "step": 4459 + }, + { + "epoch": 1.4893972282517949, + "grad_norm": 0.4435140197348797, + "learning_rate": 5.928353233219893e-06, + "loss": 0.2138, + "step": 4460 + }, + { + "epoch": 1.4897311738186676, + "grad_norm": 0.41263294889386565, + "learning_rate": 5.926443620435572e-06, + "loss": 0.199, + "step": 4461 + }, + { + "epoch": 1.49006511938554, + "grad_norm": 0.4331460485795739, + "learning_rate": 5.924533867701034e-06, + "loss": 0.188, + "step": 4462 + }, + { + "epoch": 1.4903990649524128, + "grad_norm": 0.41622877671317154, + "learning_rate": 5.922623975304771e-06, + "loss": 0.1985, + "step": 4463 + }, + { + "epoch": 1.4907330105192853, + "grad_norm": 0.4044199019408801, + "learning_rate": 5.920713943535291e-06, + "loss": 0.1978, + "step": 4464 + }, + { + "epoch": 1.491066956086158, + "grad_norm": 0.44762562378654125, + "learning_rate": 5.9188037726811285e-06, + "loss": 0.2139, + "step": 4465 + }, + { + "epoch": 1.4914009016530305, + "grad_norm": 0.4544481373494818, + "learning_rate": 5.9168934630308385e-06, + "loss": 0.2069, + "step": 4466 + }, + { + "epoch": 1.4917348472199032, + "grad_norm": 0.4158141847232926, + "learning_rate": 5.914983014872995e-06, + "loss": 0.2005, + "step": 4467 + }, + { + "epoch": 1.4920687927867757, + "grad_norm": 0.438988331436695, + "learning_rate": 5.9130724284961924e-06, + "loss": 0.1996, + "step": 4468 + }, + { + "epoch": 1.4924027383536482, + "grad_norm": 0.4092313246855316, + "learning_rate": 5.91116170418905e-06, + "loss": 0.1997, + "step": 4469 + }, + { + "epoch": 1.492736683920521, + "grad_norm": 0.5391760700288386, + "learning_rate": 5.909250842240203e-06, + "loss": 0.2231, + "step": 4470 + }, + { + "epoch": 1.4930706294873937, + "grad_norm": 0.42467270074369073, + "learning_rate": 5.907339842938309e-06, + "loss": 0.1921, + "step": 4471 + }, + { + "epoch": 1.4934045750542662, + "grad_norm": 0.4149774782856609, + "learning_rate": 5.90542870657205e-06, + "loss": 0.1953, + "step": 4472 + }, + { + "epoch": 1.4937385206211387, + "grad_norm": 0.4071411942291915, + "learning_rate": 5.903517433430123e-06, + "loss": 0.1945, + "step": 4473 + }, + { + "epoch": 1.4940724661880114, + "grad_norm": 0.4161620755423087, + "learning_rate": 5.901606023801248e-06, + "loss": 0.196, + "step": 4474 + }, + { + "epoch": 1.494406411754884, + "grad_norm": 0.44415983179036833, + "learning_rate": 5.899694477974168e-06, + "loss": 0.2036, + "step": 4475 + }, + { + "epoch": 1.4947403573217566, + "grad_norm": 0.48213305322394434, + "learning_rate": 5.897782796237645e-06, + "loss": 0.2017, + "step": 4476 + }, + { + "epoch": 1.495074302888629, + "grad_norm": 0.4565653405989367, + "learning_rate": 5.895870978880457e-06, + "loss": 0.2131, + "step": 4477 + }, + { + "epoch": 1.4954082484555018, + "grad_norm": 0.38413826906308257, + "learning_rate": 5.89395902619141e-06, + "loss": 0.1824, + "step": 4478 + }, + { + "epoch": 1.4957421940223743, + "grad_norm": 0.3865209015501584, + "learning_rate": 5.892046938459327e-06, + "loss": 0.1932, + "step": 4479 + }, + { + "epoch": 1.496076139589247, + "grad_norm": 0.4485955314433592, + "learning_rate": 5.890134715973049e-06, + "loss": 0.2067, + "step": 4480 + }, + { + "epoch": 1.4964100851561195, + "grad_norm": 0.4159945254787882, + "learning_rate": 5.888222359021443e-06, + "loss": 0.208, + "step": 4481 + }, + { + "epoch": 1.4967440307229922, + "grad_norm": 0.4705989200101088, + "learning_rate": 5.8863098678933896e-06, + "loss": 0.2105, + "step": 4482 + }, + { + "epoch": 1.4970779762898647, + "grad_norm": 0.4875225833597677, + "learning_rate": 5.884397242877795e-06, + "loss": 0.2036, + "step": 4483 + }, + { + "epoch": 1.4974119218567372, + "grad_norm": 0.46559568159684633, + "learning_rate": 5.882484484263584e-06, + "loss": 0.2078, + "step": 4484 + }, + { + "epoch": 1.49774586742361, + "grad_norm": 0.4104886972900785, + "learning_rate": 5.8805715923397e-06, + "loss": 0.2093, + "step": 4485 + }, + { + "epoch": 1.4980798129904827, + "grad_norm": 0.4270246748278954, + "learning_rate": 5.87865856739511e-06, + "loss": 0.1874, + "step": 4486 + }, + { + "epoch": 1.4984137585573551, + "grad_norm": 0.40156162163724024, + "learning_rate": 5.876745409718796e-06, + "loss": 0.1865, + "step": 4487 + }, + { + "epoch": 1.4987477041242276, + "grad_norm": 0.391157655694104, + "learning_rate": 5.874832119599766e-06, + "loss": 0.1849, + "step": 4488 + }, + { + "epoch": 1.4990816496911004, + "grad_norm": 0.4212459484944311, + "learning_rate": 5.872918697327042e-06, + "loss": 0.1948, + "step": 4489 + }, + { + "epoch": 1.499415595257973, + "grad_norm": 0.43743610694660373, + "learning_rate": 5.871005143189671e-06, + "loss": 0.2022, + "step": 4490 + }, + { + "epoch": 1.4997495408248456, + "grad_norm": 0.47954887154563275, + "learning_rate": 5.869091457476718e-06, + "loss": 0.2165, + "step": 4491 + }, + { + "epoch": 1.500083486391718, + "grad_norm": 0.40104355324589935, + "learning_rate": 5.8671776404772655e-06, + "loss": 0.2011, + "step": 4492 + }, + { + "epoch": 1.5004174319585908, + "grad_norm": 0.4065530502322676, + "learning_rate": 5.8652636924804206e-06, + "loss": 0.1917, + "step": 4493 + }, + { + "epoch": 1.5007513775254635, + "grad_norm": 0.39361953435018554, + "learning_rate": 5.863349613775308e-06, + "loss": 0.1871, + "step": 4494 + }, + { + "epoch": 1.501085323092336, + "grad_norm": 0.4220860375266858, + "learning_rate": 5.861435404651068e-06, + "loss": 0.197, + "step": 4495 + }, + { + "epoch": 1.5014192686592085, + "grad_norm": 0.4133784223860879, + "learning_rate": 5.859521065396869e-06, + "loss": 0.2039, + "step": 4496 + }, + { + "epoch": 1.5017532142260812, + "grad_norm": 0.46819851944987745, + "learning_rate": 5.857606596301892e-06, + "loss": 0.1887, + "step": 4497 + }, + { + "epoch": 1.5020871597929537, + "grad_norm": 0.4212627192628078, + "learning_rate": 5.85569199765534e-06, + "loss": 0.1992, + "step": 4498 + }, + { + "epoch": 1.5024211053598262, + "grad_norm": 0.5441159142194298, + "learning_rate": 5.853777269746438e-06, + "loss": 0.1945, + "step": 4499 + }, + { + "epoch": 1.502755050926699, + "grad_norm": 0.38319452570638174, + "learning_rate": 5.851862412864426e-06, + "loss": 0.1878, + "step": 4500 + }, + { + "epoch": 1.5030889964935716, + "grad_norm": 0.39792537321581023, + "learning_rate": 5.8499474272985654e-06, + "loss": 0.1871, + "step": 4501 + }, + { + "epoch": 1.5034229420604441, + "grad_norm": 0.42532885484991806, + "learning_rate": 5.848032313338139e-06, + "loss": 0.188, + "step": 4502 + }, + { + "epoch": 1.5037568876273166, + "grad_norm": 0.40589177813814775, + "learning_rate": 5.846117071272444e-06, + "loss": 0.1919, + "step": 4503 + }, + { + "epoch": 1.5040908331941893, + "grad_norm": 0.4038764967885529, + "learning_rate": 5.844201701390806e-06, + "loss": 0.1947, + "step": 4504 + }, + { + "epoch": 1.504424778761062, + "grad_norm": 0.42058966558438154, + "learning_rate": 5.842286203982559e-06, + "loss": 0.1875, + "step": 4505 + }, + { + "epoch": 1.5047587243279346, + "grad_norm": 0.44301115684433295, + "learning_rate": 5.840370579337063e-06, + "loss": 0.2088, + "step": 4506 + }, + { + "epoch": 1.505092669894807, + "grad_norm": 0.4506602942327276, + "learning_rate": 5.838454827743697e-06, + "loss": 0.214, + "step": 4507 + }, + { + "epoch": 1.5054266154616798, + "grad_norm": 0.41888527162391026, + "learning_rate": 5.8365389494918565e-06, + "loss": 0.1989, + "step": 4508 + }, + { + "epoch": 1.5057605610285525, + "grad_norm": 0.46234137248207496, + "learning_rate": 5.834622944870959e-06, + "loss": 0.2071, + "step": 4509 + }, + { + "epoch": 1.506094506595425, + "grad_norm": 0.3991349847409579, + "learning_rate": 5.832706814170437e-06, + "loss": 0.1921, + "step": 4510 + }, + { + "epoch": 1.5064284521622975, + "grad_norm": 0.45328443937762986, + "learning_rate": 5.830790557679746e-06, + "loss": 0.2094, + "step": 4511 + }, + { + "epoch": 1.5067623977291702, + "grad_norm": 0.4545739233729224, + "learning_rate": 5.8288741756883585e-06, + "loss": 0.2233, + "step": 4512 + }, + { + "epoch": 1.5070963432960427, + "grad_norm": 0.4173274151899909, + "learning_rate": 5.826957668485768e-06, + "loss": 0.1891, + "step": 4513 + }, + { + "epoch": 1.5074302888629152, + "grad_norm": 0.4514509261491327, + "learning_rate": 5.825041036361484e-06, + "loss": 0.2034, + "step": 4514 + }, + { + "epoch": 1.507764234429788, + "grad_norm": 0.4245100612780044, + "learning_rate": 5.823124279605037e-06, + "loss": 0.2083, + "step": 4515 + }, + { + "epoch": 1.5080981799966606, + "grad_norm": 0.41279249791467926, + "learning_rate": 5.821207398505976e-06, + "loss": 0.2131, + "step": 4516 + }, + { + "epoch": 1.5084321255635331, + "grad_norm": 0.4430510447851048, + "learning_rate": 5.819290393353867e-06, + "loss": 0.2233, + "step": 4517 + }, + { + "epoch": 1.5087660711304056, + "grad_norm": 0.4270138809636306, + "learning_rate": 5.817373264438297e-06, + "loss": 0.198, + "step": 4518 + }, + { + "epoch": 1.5091000166972783, + "grad_norm": 0.3855525070150281, + "learning_rate": 5.815456012048873e-06, + "loss": 0.1943, + "step": 4519 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 0.3951585582354066, + "learning_rate": 5.8135386364752154e-06, + "loss": 0.1973, + "step": 4520 + }, + { + "epoch": 1.5097679078310235, + "grad_norm": 0.44097574717875243, + "learning_rate": 5.8116211380069675e-06, + "loss": 0.1974, + "step": 4521 + }, + { + "epoch": 1.510101853397896, + "grad_norm": 0.418568197709888, + "learning_rate": 5.809703516933791e-06, + "loss": 0.1981, + "step": 4522 + }, + { + "epoch": 1.5104357989647688, + "grad_norm": 0.39429989332524545, + "learning_rate": 5.807785773545364e-06, + "loss": 0.1885, + "step": 4523 + }, + { + "epoch": 1.5107697445316415, + "grad_norm": 0.5133283842688638, + "learning_rate": 5.805867908131384e-06, + "loss": 0.2054, + "step": 4524 + }, + { + "epoch": 1.511103690098514, + "grad_norm": 0.4138694184824036, + "learning_rate": 5.803949920981568e-06, + "loss": 0.2066, + "step": 4525 + }, + { + "epoch": 1.5114376356653865, + "grad_norm": 0.451090788654152, + "learning_rate": 5.802031812385651e-06, + "loss": 0.207, + "step": 4526 + }, + { + "epoch": 1.5117715812322592, + "grad_norm": 0.4391511075816685, + "learning_rate": 5.800113582633384e-06, + "loss": 0.1972, + "step": 4527 + }, + { + "epoch": 1.512105526799132, + "grad_norm": 0.4062998929666656, + "learning_rate": 5.7981952320145405e-06, + "loss": 0.2005, + "step": 4528 + }, + { + "epoch": 1.5124394723660042, + "grad_norm": 0.4147263705858617, + "learning_rate": 5.796276760818908e-06, + "loss": 0.2, + "step": 4529 + }, + { + "epoch": 1.512773417932877, + "grad_norm": 0.4184233692356123, + "learning_rate": 5.794358169336295e-06, + "loss": 0.2072, + "step": 4530 + }, + { + "epoch": 1.5131073634997496, + "grad_norm": 0.42606865720950565, + "learning_rate": 5.792439457856528e-06, + "loss": 0.2058, + "step": 4531 + }, + { + "epoch": 1.513441309066622, + "grad_norm": 0.4031899587249856, + "learning_rate": 5.790520626669449e-06, + "loss": 0.1865, + "step": 4532 + }, + { + "epoch": 1.5137752546334946, + "grad_norm": 0.39822141992740223, + "learning_rate": 5.788601676064922e-06, + "loss": 0.1849, + "step": 4533 + }, + { + "epoch": 1.5141092002003673, + "grad_norm": 0.41169793772637675, + "learning_rate": 5.786682606332827e-06, + "loss": 0.198, + "step": 4534 + }, + { + "epoch": 1.51444314576724, + "grad_norm": 0.41321607553783124, + "learning_rate": 5.78476341776306e-06, + "loss": 0.1988, + "step": 4535 + }, + { + "epoch": 1.5147770913341125, + "grad_norm": 0.4333225224257672, + "learning_rate": 5.782844110645539e-06, + "loss": 0.1957, + "step": 4536 + }, + { + "epoch": 1.515111036900985, + "grad_norm": 0.42052885937644136, + "learning_rate": 5.780924685270198e-06, + "loss": 0.198, + "step": 4537 + }, + { + "epoch": 1.5154449824678577, + "grad_norm": 0.4037698414311071, + "learning_rate": 5.779005141926988e-06, + "loss": 0.2, + "step": 4538 + }, + { + "epoch": 1.5157789280347305, + "grad_norm": 0.3818155530534464, + "learning_rate": 5.777085480905877e-06, + "loss": 0.1915, + "step": 4539 + }, + { + "epoch": 1.516112873601603, + "grad_norm": 0.3963973853574785, + "learning_rate": 5.7751657024968565e-06, + "loss": 0.1988, + "step": 4540 + }, + { + "epoch": 1.5164468191684755, + "grad_norm": 0.45356922202417344, + "learning_rate": 5.773245806989929e-06, + "loss": 0.2118, + "step": 4541 + }, + { + "epoch": 1.5167807647353482, + "grad_norm": 0.41828720268999303, + "learning_rate": 5.771325794675117e-06, + "loss": 0.2065, + "step": 4542 + }, + { + "epoch": 1.517114710302221, + "grad_norm": 0.38469626361124054, + "learning_rate": 5.769405665842461e-06, + "loss": 0.1849, + "step": 4543 + }, + { + "epoch": 1.5174486558690934, + "grad_norm": 0.3785273566123282, + "learning_rate": 5.767485420782021e-06, + "loss": 0.1876, + "step": 4544 + }, + { + "epoch": 1.5177826014359659, + "grad_norm": 0.41324123757089976, + "learning_rate": 5.7655650597838704e-06, + "loss": 0.2107, + "step": 4545 + }, + { + "epoch": 1.5181165470028386, + "grad_norm": 0.4023559102615698, + "learning_rate": 5.7636445831381034e-06, + "loss": 0.1958, + "step": 4546 + }, + { + "epoch": 1.518450492569711, + "grad_norm": 0.4268003997100768, + "learning_rate": 5.761723991134831e-06, + "loss": 0.2005, + "step": 4547 + }, + { + "epoch": 1.5187844381365836, + "grad_norm": 0.41879175195622614, + "learning_rate": 5.759803284064181e-06, + "loss": 0.1974, + "step": 4548 + }, + { + "epoch": 1.5191183837034563, + "grad_norm": 0.41661555641918735, + "learning_rate": 5.757882462216299e-06, + "loss": 0.2028, + "step": 4549 + }, + { + "epoch": 1.519452329270329, + "grad_norm": 0.5008947195462263, + "learning_rate": 5.755961525881345e-06, + "loss": 0.211, + "step": 4550 + }, + { + "epoch": 1.5197862748372015, + "grad_norm": 0.43921728419522166, + "learning_rate": 5.7540404753495034e-06, + "loss": 0.2032, + "step": 4551 + }, + { + "epoch": 1.520120220404074, + "grad_norm": 0.44693077234088574, + "learning_rate": 5.75211931091097e-06, + "loss": 0.2265, + "step": 4552 + }, + { + "epoch": 1.5204541659709467, + "grad_norm": 0.4001694902689412, + "learning_rate": 5.750198032855956e-06, + "loss": 0.2, + "step": 4553 + }, + { + "epoch": 1.5207881115378195, + "grad_norm": 0.4045351162020478, + "learning_rate": 5.748276641474698e-06, + "loss": 0.1983, + "step": 4554 + }, + { + "epoch": 1.521122057104692, + "grad_norm": 0.3977245043542055, + "learning_rate": 5.746355137057442e-06, + "loss": 0.199, + "step": 4555 + }, + { + "epoch": 1.5214560026715644, + "grad_norm": 0.4176634955723099, + "learning_rate": 5.7444335198944555e-06, + "loss": 0.2065, + "step": 4556 + }, + { + "epoch": 1.5217899482384372, + "grad_norm": 0.4086844831410891, + "learning_rate": 5.7425117902760195e-06, + "loss": 0.2004, + "step": 4557 + }, + { + "epoch": 1.5221238938053099, + "grad_norm": 0.441434511195485, + "learning_rate": 5.7405899484924346e-06, + "loss": 0.2116, + "step": 4558 + }, + { + "epoch": 1.5224578393721824, + "grad_norm": 0.41164529074446093, + "learning_rate": 5.738667994834019e-06, + "loss": 0.1939, + "step": 4559 + }, + { + "epoch": 1.5227917849390549, + "grad_norm": 0.4069363998020001, + "learning_rate": 5.736745929591103e-06, + "loss": 0.1885, + "step": 4560 + }, + { + "epoch": 1.5231257305059276, + "grad_norm": 0.40733152134950845, + "learning_rate": 5.734823753054042e-06, + "loss": 0.2001, + "step": 4561 + }, + { + "epoch": 1.5234596760728, + "grad_norm": 0.3828618387327402, + "learning_rate": 5.732901465513199e-06, + "loss": 0.1982, + "step": 4562 + }, + { + "epoch": 1.5237936216396726, + "grad_norm": 0.4305349510985267, + "learning_rate": 5.73097906725896e-06, + "loss": 0.1974, + "step": 4563 + }, + { + "epoch": 1.5241275672065453, + "grad_norm": 0.40465587338482667, + "learning_rate": 5.729056558581727e-06, + "loss": 0.1986, + "step": 4564 + }, + { + "epoch": 1.524461512773418, + "grad_norm": 0.471282991054306, + "learning_rate": 5.727133939771915e-06, + "loss": 0.2081, + "step": 4565 + }, + { + "epoch": 1.5247954583402905, + "grad_norm": 0.4088188439945464, + "learning_rate": 5.725211211119961e-06, + "loss": 0.1952, + "step": 4566 + }, + { + "epoch": 1.525129403907163, + "grad_norm": 0.3934161375353832, + "learning_rate": 5.723288372916315e-06, + "loss": 0.1901, + "step": 4567 + }, + { + "epoch": 1.5254633494740357, + "grad_norm": 0.39417936401264786, + "learning_rate": 5.721365425451442e-06, + "loss": 0.195, + "step": 4568 + }, + { + "epoch": 1.5257972950409084, + "grad_norm": 0.41813761847460523, + "learning_rate": 5.719442369015828e-06, + "loss": 0.2002, + "step": 4569 + }, + { + "epoch": 1.526131240607781, + "grad_norm": 0.3928817442742676, + "learning_rate": 5.717519203899975e-06, + "loss": 0.1821, + "step": 4570 + }, + { + "epoch": 1.5264651861746534, + "grad_norm": 0.42060022944156716, + "learning_rate": 5.715595930394396e-06, + "loss": 0.197, + "step": 4571 + }, + { + "epoch": 1.5267991317415261, + "grad_norm": 0.414538136672965, + "learning_rate": 5.713672548789626e-06, + "loss": 0.2109, + "step": 4572 + }, + { + "epoch": 1.5271330773083989, + "grad_norm": 0.40300175281434797, + "learning_rate": 5.711749059376215e-06, + "loss": 0.194, + "step": 4573 + }, + { + "epoch": 1.5274670228752714, + "grad_norm": 0.4202683530403047, + "learning_rate": 5.7098254624447255e-06, + "loss": 0.1891, + "step": 4574 + }, + { + "epoch": 1.5278009684421439, + "grad_norm": 0.41121947338511855, + "learning_rate": 5.707901758285745e-06, + "loss": 0.1911, + "step": 4575 + }, + { + "epoch": 1.5281349140090166, + "grad_norm": 0.3973014662751213, + "learning_rate": 5.705977947189868e-06, + "loss": 0.1994, + "step": 4576 + }, + { + "epoch": 1.5284688595758893, + "grad_norm": 0.43891931027961867, + "learning_rate": 5.704054029447708e-06, + "loss": 0.2007, + "step": 4577 + }, + { + "epoch": 1.5288028051427616, + "grad_norm": 0.4502833076199682, + "learning_rate": 5.702130005349899e-06, + "loss": 0.214, + "step": 4578 + }, + { + "epoch": 1.5291367507096343, + "grad_norm": 0.44959299233942557, + "learning_rate": 5.700205875187084e-06, + "loss": 0.2153, + "step": 4579 + }, + { + "epoch": 1.529470696276507, + "grad_norm": 0.4305977208026748, + "learning_rate": 5.698281639249927e-06, + "loss": 0.2123, + "step": 4580 + }, + { + "epoch": 1.5298046418433795, + "grad_norm": 0.43603488906842863, + "learning_rate": 5.696357297829106e-06, + "loss": 0.2089, + "step": 4581 + }, + { + "epoch": 1.530138587410252, + "grad_norm": 0.4250706270087593, + "learning_rate": 5.6944328512153165e-06, + "loss": 0.1977, + "step": 4582 + }, + { + "epoch": 1.5304725329771247, + "grad_norm": 0.4198326353777762, + "learning_rate": 5.692508299699269e-06, + "loss": 0.2009, + "step": 4583 + }, + { + "epoch": 1.5308064785439974, + "grad_norm": 0.395956572492155, + "learning_rate": 5.690583643571687e-06, + "loss": 0.198, + "step": 4584 + }, + { + "epoch": 1.53114042411087, + "grad_norm": 0.43947140761901465, + "learning_rate": 5.688658883123315e-06, + "loss": 0.2027, + "step": 4585 + }, + { + "epoch": 1.5314743696777424, + "grad_norm": 0.4108183015582965, + "learning_rate": 5.68673401864491e-06, + "loss": 0.2118, + "step": 4586 + }, + { + "epoch": 1.5318083152446151, + "grad_norm": 0.41182034693533875, + "learning_rate": 5.684809050427247e-06, + "loss": 0.1987, + "step": 4587 + }, + { + "epoch": 1.5321422608114879, + "grad_norm": 0.4349586140015333, + "learning_rate": 5.682883978761111e-06, + "loss": 0.2002, + "step": 4588 + }, + { + "epoch": 1.5324762063783604, + "grad_norm": 0.39453978972461545, + "learning_rate": 5.680958803937311e-06, + "loss": 0.1962, + "step": 4589 + }, + { + "epoch": 1.5328101519452328, + "grad_norm": 0.4430655520347526, + "learning_rate": 5.6790335262466645e-06, + "loss": 0.2243, + "step": 4590 + }, + { + "epoch": 1.5331440975121056, + "grad_norm": 0.38426560740617555, + "learning_rate": 5.677108145980008e-06, + "loss": 0.1949, + "step": 4591 + }, + { + "epoch": 1.5334780430789783, + "grad_norm": 0.41412598049147914, + "learning_rate": 5.675182663428196e-06, + "loss": 0.2016, + "step": 4592 + }, + { + "epoch": 1.5338119886458508, + "grad_norm": 0.3952364153216541, + "learning_rate": 5.673257078882091e-06, + "loss": 0.1957, + "step": 4593 + }, + { + "epoch": 1.5341459342127233, + "grad_norm": 0.4347041827090324, + "learning_rate": 5.671331392632577e-06, + "loss": 0.2062, + "step": 4594 + }, + { + "epoch": 1.534479879779596, + "grad_norm": 0.4243751095481007, + "learning_rate": 5.6694056049705506e-06, + "loss": 0.2011, + "step": 4595 + }, + { + "epoch": 1.5348138253464685, + "grad_norm": 0.41160762842006665, + "learning_rate": 5.667479716186927e-06, + "loss": 0.2053, + "step": 4596 + }, + { + "epoch": 1.535147770913341, + "grad_norm": 0.45726125048358807, + "learning_rate": 5.665553726572631e-06, + "loss": 0.206, + "step": 4597 + }, + { + "epoch": 1.5354817164802137, + "grad_norm": 0.395285907262752, + "learning_rate": 5.663627636418611e-06, + "loss": 0.1837, + "step": 4598 + }, + { + "epoch": 1.5358156620470864, + "grad_norm": 0.3940866152777301, + "learning_rate": 5.661701446015821e-06, + "loss": 0.1945, + "step": 4599 + }, + { + "epoch": 1.536149607613959, + "grad_norm": 0.39866982139324986, + "learning_rate": 5.659775155655235e-06, + "loss": 0.1928, + "step": 4600 + }, + { + "epoch": 1.5364835531808314, + "grad_norm": 0.4186774736236097, + "learning_rate": 5.6578487656278446e-06, + "loss": 0.1981, + "step": 4601 + }, + { + "epoch": 1.5368174987477041, + "grad_norm": 0.41470326955184655, + "learning_rate": 5.655922276224652e-06, + "loss": 0.2064, + "step": 4602 + }, + { + "epoch": 1.5371514443145768, + "grad_norm": 0.4092155718896311, + "learning_rate": 5.653995687736676e-06, + "loss": 0.2006, + "step": 4603 + }, + { + "epoch": 1.5374853898814493, + "grad_norm": 0.4106065112514508, + "learning_rate": 5.652069000454951e-06, + "loss": 0.2022, + "step": 4604 + }, + { + "epoch": 1.5378193354483218, + "grad_norm": 0.4858555749514037, + "learning_rate": 5.650142214670527e-06, + "loss": 0.1805, + "step": 4605 + }, + { + "epoch": 1.5381532810151946, + "grad_norm": 0.4812494451824753, + "learning_rate": 5.648215330674464e-06, + "loss": 0.1882, + "step": 4606 + }, + { + "epoch": 1.5384872265820673, + "grad_norm": 0.4261470703710842, + "learning_rate": 5.646288348757845e-06, + "loss": 0.1926, + "step": 4607 + }, + { + "epoch": 1.5388211721489398, + "grad_norm": 0.3952136708762545, + "learning_rate": 5.64436126921176e-06, + "loss": 0.1975, + "step": 4608 + }, + { + "epoch": 1.5391551177158123, + "grad_norm": 0.3970827209666597, + "learning_rate": 5.642434092327318e-06, + "loss": 0.1999, + "step": 4609 + }, + { + "epoch": 1.539489063282685, + "grad_norm": 0.40998763479801215, + "learning_rate": 5.640506818395643e-06, + "loss": 0.1986, + "step": 4610 + }, + { + "epoch": 1.5398230088495575, + "grad_norm": 0.47084062645452907, + "learning_rate": 5.638579447707871e-06, + "loss": 0.193, + "step": 4611 + }, + { + "epoch": 1.54015695441643, + "grad_norm": 0.3860268369786831, + "learning_rate": 5.636651980555153e-06, + "loss": 0.1909, + "step": 4612 + }, + { + "epoch": 1.5404908999833027, + "grad_norm": 0.3942593539079482, + "learning_rate": 5.634724417228658e-06, + "loss": 0.1966, + "step": 4613 + }, + { + "epoch": 1.5408248455501754, + "grad_norm": 0.42574482432433103, + "learning_rate": 5.632796758019566e-06, + "loss": 0.1917, + "step": 4614 + }, + { + "epoch": 1.541158791117048, + "grad_norm": 0.39053070306639825, + "learning_rate": 5.630869003219072e-06, + "loss": 0.1977, + "step": 4615 + }, + { + "epoch": 1.5414927366839204, + "grad_norm": 0.43083185683956327, + "learning_rate": 5.628941153118388e-06, + "loss": 0.2072, + "step": 4616 + }, + { + "epoch": 1.5418266822507931, + "grad_norm": 0.36680215926548226, + "learning_rate": 5.627013208008737e-06, + "loss": 0.1708, + "step": 4617 + }, + { + "epoch": 1.5421606278176658, + "grad_norm": 0.4168771060059623, + "learning_rate": 5.625085168181357e-06, + "loss": 0.1957, + "step": 4618 + }, + { + "epoch": 1.5424945733845383, + "grad_norm": 0.4159514176383808, + "learning_rate": 5.623157033927503e-06, + "loss": 0.1933, + "step": 4619 + }, + { + "epoch": 1.5428285189514108, + "grad_norm": 0.45704579742530344, + "learning_rate": 5.621228805538443e-06, + "loss": 0.1997, + "step": 4620 + }, + { + "epoch": 1.5431624645182835, + "grad_norm": 0.428254804443801, + "learning_rate": 5.619300483305454e-06, + "loss": 0.1862, + "step": 4621 + }, + { + "epoch": 1.5434964100851563, + "grad_norm": 0.4203762697429112, + "learning_rate": 5.617372067519837e-06, + "loss": 0.2034, + "step": 4622 + }, + { + "epoch": 1.5438303556520288, + "grad_norm": 0.4200993106054955, + "learning_rate": 5.6154435584729e-06, + "loss": 0.208, + "step": 4623 + }, + { + "epoch": 1.5441643012189012, + "grad_norm": 0.39120604266589976, + "learning_rate": 5.6135149564559665e-06, + "loss": 0.1893, + "step": 4624 + }, + { + "epoch": 1.544498246785774, + "grad_norm": 0.4347411574630664, + "learning_rate": 5.611586261760375e-06, + "loss": 0.181, + "step": 4625 + }, + { + "epoch": 1.5448321923526467, + "grad_norm": 0.43018401156890745, + "learning_rate": 5.609657474677478e-06, + "loss": 0.1994, + "step": 4626 + }, + { + "epoch": 1.545166137919519, + "grad_norm": 0.4820806633908888, + "learning_rate": 5.607728595498641e-06, + "loss": 0.2073, + "step": 4627 + }, + { + "epoch": 1.5455000834863917, + "grad_norm": 0.40368430637167996, + "learning_rate": 5.6057996245152435e-06, + "loss": 0.1919, + "step": 4628 + }, + { + "epoch": 1.5458340290532644, + "grad_norm": 0.406341895380249, + "learning_rate": 5.603870562018679e-06, + "loss": 0.1957, + "step": 4629 + }, + { + "epoch": 1.5461679746201369, + "grad_norm": 0.4120838046068009, + "learning_rate": 5.601941408300358e-06, + "loss": 0.2073, + "step": 4630 + }, + { + "epoch": 1.5465019201870094, + "grad_norm": 0.40592326676295587, + "learning_rate": 5.600012163651698e-06, + "loss": 0.1984, + "step": 4631 + }, + { + "epoch": 1.546835865753882, + "grad_norm": 0.4363803396643685, + "learning_rate": 5.598082828364134e-06, + "loss": 0.1972, + "step": 4632 + }, + { + "epoch": 1.5471698113207548, + "grad_norm": 0.43058371265331713, + "learning_rate": 5.596153402729118e-06, + "loss": 0.196, + "step": 4633 + }, + { + "epoch": 1.5475037568876273, + "grad_norm": 0.39059793054857367, + "learning_rate": 5.594223887038113e-06, + "loss": 0.181, + "step": 4634 + }, + { + "epoch": 1.5478377024544998, + "grad_norm": 0.4261233357009109, + "learning_rate": 5.592294281582591e-06, + "loss": 0.1893, + "step": 4635 + }, + { + "epoch": 1.5481716480213725, + "grad_norm": 0.41289802429846045, + "learning_rate": 5.590364586654043e-06, + "loss": 0.2031, + "step": 4636 + }, + { + "epoch": 1.5485055935882452, + "grad_norm": 0.45183145302663547, + "learning_rate": 5.588434802543975e-06, + "loss": 0.1886, + "step": 4637 + }, + { + "epoch": 1.5488395391551177, + "grad_norm": 0.4250974981387959, + "learning_rate": 5.5865049295439e-06, + "loss": 0.1951, + "step": 4638 + }, + { + "epoch": 1.5491734847219902, + "grad_norm": 0.43975870182322807, + "learning_rate": 5.584574967945351e-06, + "loss": 0.2104, + "step": 4639 + }, + { + "epoch": 1.549507430288863, + "grad_norm": 0.44559355295469355, + "learning_rate": 5.582644918039869e-06, + "loss": 0.2092, + "step": 4640 + }, + { + "epoch": 1.5498413758557357, + "grad_norm": 0.41539145621347284, + "learning_rate": 5.580714780119011e-06, + "loss": 0.2034, + "step": 4641 + }, + { + "epoch": 1.5501753214226082, + "grad_norm": 0.40591271120703043, + "learning_rate": 5.578784554474348e-06, + "loss": 0.188, + "step": 4642 + }, + { + "epoch": 1.5505092669894807, + "grad_norm": 0.6532970191187181, + "learning_rate": 5.5768542413974645e-06, + "loss": 0.2037, + "step": 4643 + }, + { + "epoch": 1.5508432125563534, + "grad_norm": 0.4315096314647443, + "learning_rate": 5.574923841179953e-06, + "loss": 0.2007, + "step": 4644 + }, + { + "epoch": 1.5511771581232259, + "grad_norm": 0.38269782258347723, + "learning_rate": 5.572993354113429e-06, + "loss": 0.183, + "step": 4645 + }, + { + "epoch": 1.5515111036900984, + "grad_norm": 0.5076233296132883, + "learning_rate": 5.5710627804895105e-06, + "loss": 0.214, + "step": 4646 + }, + { + "epoch": 1.551845049256971, + "grad_norm": 0.41321256949804475, + "learning_rate": 5.569132120599834e-06, + "loss": 0.1927, + "step": 4647 + }, + { + "epoch": 1.5521789948238438, + "grad_norm": 0.39911707248154793, + "learning_rate": 5.567201374736051e-06, + "loss": 0.1849, + "step": 4648 + }, + { + "epoch": 1.5525129403907163, + "grad_norm": 0.44520911248754574, + "learning_rate": 5.565270543189821e-06, + "loss": 0.2107, + "step": 4649 + }, + { + "epoch": 1.5528468859575888, + "grad_norm": 0.3773856938490637, + "learning_rate": 5.563339626252819e-06, + "loss": 0.1834, + "step": 4650 + }, + { + "epoch": 1.5531808315244615, + "grad_norm": 0.39836943833180216, + "learning_rate": 5.561408624216734e-06, + "loss": 0.1834, + "step": 4651 + }, + { + "epoch": 1.5535147770913342, + "grad_norm": 0.39621414053364384, + "learning_rate": 5.559477537373267e-06, + "loss": 0.1918, + "step": 4652 + }, + { + "epoch": 1.5538487226582067, + "grad_norm": 0.4435868338139983, + "learning_rate": 5.557546366014129e-06, + "loss": 0.1853, + "step": 4653 + }, + { + "epoch": 1.5541826682250792, + "grad_norm": 0.3993984001133118, + "learning_rate": 5.555615110431049e-06, + "loss": 0.1993, + "step": 4654 + }, + { + "epoch": 1.554516613791952, + "grad_norm": 0.41979545755044656, + "learning_rate": 5.553683770915763e-06, + "loss": 0.2005, + "step": 4655 + }, + { + "epoch": 1.5548505593588247, + "grad_norm": 0.4128510095489835, + "learning_rate": 5.551752347760023e-06, + "loss": 0.1937, + "step": 4656 + }, + { + "epoch": 1.5551845049256972, + "grad_norm": 0.41228406880777296, + "learning_rate": 5.549820841255597e-06, + "loss": 0.1934, + "step": 4657 + }, + { + "epoch": 1.5555184504925696, + "grad_norm": 0.40502569693476115, + "learning_rate": 5.547889251694257e-06, + "loss": 0.1993, + "step": 4658 + }, + { + "epoch": 1.5558523960594424, + "grad_norm": 0.45309964934422253, + "learning_rate": 5.545957579367795e-06, + "loss": 0.2157, + "step": 4659 + }, + { + "epoch": 1.5561863416263149, + "grad_norm": 0.4075848451781985, + "learning_rate": 5.544025824568011e-06, + "loss": 0.1882, + "step": 4660 + }, + { + "epoch": 1.5565202871931874, + "grad_norm": 0.41425823932525097, + "learning_rate": 5.542093987586722e-06, + "loss": 0.1992, + "step": 4661 + }, + { + "epoch": 1.55685423276006, + "grad_norm": 0.40778580005587917, + "learning_rate": 5.540162068715752e-06, + "loss": 0.1909, + "step": 4662 + }, + { + "epoch": 1.5571881783269328, + "grad_norm": 0.40536508487131023, + "learning_rate": 5.538230068246942e-06, + "loss": 0.2013, + "step": 4663 + }, + { + "epoch": 1.5575221238938053, + "grad_norm": 0.45353755000496815, + "learning_rate": 5.536297986472142e-06, + "loss": 0.1986, + "step": 4664 + }, + { + "epoch": 1.5578560694606778, + "grad_norm": 0.38378580944056184, + "learning_rate": 5.534365823683219e-06, + "loss": 0.1869, + "step": 4665 + }, + { + "epoch": 1.5581900150275505, + "grad_norm": 0.5116844500041666, + "learning_rate": 5.532433580172044e-06, + "loss": 0.1968, + "step": 4666 + }, + { + "epoch": 1.5585239605944232, + "grad_norm": 0.40984005892330067, + "learning_rate": 5.5305012562305075e-06, + "loss": 0.1805, + "step": 4667 + }, + { + "epoch": 1.5588579061612957, + "grad_norm": 0.4695150357003394, + "learning_rate": 5.528568852150511e-06, + "loss": 0.2131, + "step": 4668 + }, + { + "epoch": 1.5591918517281682, + "grad_norm": 0.454406568295344, + "learning_rate": 5.526636368223965e-06, + "loss": 0.2069, + "step": 4669 + }, + { + "epoch": 1.559525797295041, + "grad_norm": 0.41198712780712915, + "learning_rate": 5.524703804742793e-06, + "loss": 0.2079, + "step": 4670 + }, + { + "epoch": 1.5598597428619136, + "grad_norm": 0.3930997269695332, + "learning_rate": 5.522771161998936e-06, + "loss": 0.2006, + "step": 4671 + }, + { + "epoch": 1.5601936884287861, + "grad_norm": 0.4132056747503601, + "learning_rate": 5.52083844028434e-06, + "loss": 0.1947, + "step": 4672 + }, + { + "epoch": 1.5605276339956586, + "grad_norm": 0.44323946531045155, + "learning_rate": 5.518905639890961e-06, + "loss": 0.2009, + "step": 4673 + }, + { + "epoch": 1.5608615795625314, + "grad_norm": 0.43128682055477907, + "learning_rate": 5.516972761110778e-06, + "loss": 0.2031, + "step": 4674 + }, + { + "epoch": 1.561195525129404, + "grad_norm": 0.4152986643855445, + "learning_rate": 5.515039804235772e-06, + "loss": 0.1924, + "step": 4675 + }, + { + "epoch": 1.5615294706962763, + "grad_norm": 0.43452047598785126, + "learning_rate": 5.51310676955794e-06, + "loss": 0.1928, + "step": 4676 + }, + { + "epoch": 1.561863416263149, + "grad_norm": 0.400631942313824, + "learning_rate": 5.511173657369287e-06, + "loss": 0.1914, + "step": 4677 + }, + { + "epoch": 1.5621973618300218, + "grad_norm": 0.45756926721328306, + "learning_rate": 5.509240467961835e-06, + "loss": 0.1983, + "step": 4678 + }, + { + "epoch": 1.5625313073968943, + "grad_norm": 0.37815230836023156, + "learning_rate": 5.507307201627614e-06, + "loss": 0.1795, + "step": 4679 + }, + { + "epoch": 1.5628652529637668, + "grad_norm": 0.42395660449614353, + "learning_rate": 5.505373858658668e-06, + "loss": 0.2012, + "step": 4680 + }, + { + "epoch": 1.5631991985306395, + "grad_norm": 0.38723972997180905, + "learning_rate": 5.503440439347048e-06, + "loss": 0.186, + "step": 4681 + }, + { + "epoch": 1.5635331440975122, + "grad_norm": 0.3784601820153379, + "learning_rate": 5.501506943984823e-06, + "loss": 0.183, + "step": 4682 + }, + { + "epoch": 1.5638670896643847, + "grad_norm": 0.4147470025596834, + "learning_rate": 5.4995733728640695e-06, + "loss": 0.1917, + "step": 4683 + }, + { + "epoch": 1.5642010352312572, + "grad_norm": 0.43067484554889957, + "learning_rate": 5.497639726276876e-06, + "loss": 0.2089, + "step": 4684 + }, + { + "epoch": 1.56453498079813, + "grad_norm": 0.4656315754036739, + "learning_rate": 5.49570600451534e-06, + "loss": 0.2105, + "step": 4685 + }, + { + "epoch": 1.5648689263650026, + "grad_norm": 0.4202462754676753, + "learning_rate": 5.493772207871577e-06, + "loss": 0.2008, + "step": 4686 + }, + { + "epoch": 1.5652028719318751, + "grad_norm": 0.45653138426163725, + "learning_rate": 5.491838336637708e-06, + "loss": 0.2045, + "step": 4687 + }, + { + "epoch": 1.5655368174987476, + "grad_norm": 0.41926970695465754, + "learning_rate": 5.4899043911058665e-06, + "loss": 0.202, + "step": 4688 + }, + { + "epoch": 1.5658707630656203, + "grad_norm": 0.41411270997069943, + "learning_rate": 5.487970371568199e-06, + "loss": 0.1866, + "step": 4689 + }, + { + "epoch": 1.566204708632493, + "grad_norm": 0.38632209581344107, + "learning_rate": 5.486036278316861e-06, + "loss": 0.1826, + "step": 4690 + }, + { + "epoch": 1.5665386541993656, + "grad_norm": 0.3689564655199529, + "learning_rate": 5.48410211164402e-06, + "loss": 0.1834, + "step": 4691 + }, + { + "epoch": 1.566872599766238, + "grad_norm": 0.4331808630136572, + "learning_rate": 5.482167871841855e-06, + "loss": 0.2066, + "step": 4692 + }, + { + "epoch": 1.5672065453331108, + "grad_norm": 0.41532024396149275, + "learning_rate": 5.480233559202556e-06, + "loss": 0.2061, + "step": 4693 + }, + { + "epoch": 1.5675404908999833, + "grad_norm": 0.41478007296781094, + "learning_rate": 5.4782991740183225e-06, + "loss": 0.2085, + "step": 4694 + }, + { + "epoch": 1.5678744364668558, + "grad_norm": 0.39634309936244644, + "learning_rate": 5.476364716581367e-06, + "loss": 0.1963, + "step": 4695 + }, + { + "epoch": 1.5682083820337285, + "grad_norm": 0.38011643978852666, + "learning_rate": 5.474430187183912e-06, + "loss": 0.1885, + "step": 4696 + }, + { + "epoch": 1.5685423276006012, + "grad_norm": 0.4050712387842652, + "learning_rate": 5.472495586118192e-06, + "loss": 0.2043, + "step": 4697 + }, + { + "epoch": 1.5688762731674737, + "grad_norm": 0.398212340908208, + "learning_rate": 5.47056091367645e-06, + "loss": 0.1949, + "step": 4698 + }, + { + "epoch": 1.5692102187343462, + "grad_norm": 0.43479513365376116, + "learning_rate": 5.468626170150942e-06, + "loss": 0.2019, + "step": 4699 + }, + { + "epoch": 1.569544164301219, + "grad_norm": 0.4110382738577834, + "learning_rate": 5.466691355833932e-06, + "loss": 0.189, + "step": 4700 + }, + { + "epoch": 1.5698781098680916, + "grad_norm": 0.4152423718197278, + "learning_rate": 5.464756471017696e-06, + "loss": 0.2, + "step": 4701 + }, + { + "epoch": 1.5702120554349641, + "grad_norm": 0.37504987375871496, + "learning_rate": 5.462821515994525e-06, + "loss": 0.1847, + "step": 4702 + }, + { + "epoch": 1.5705460010018366, + "grad_norm": 0.4007750379597849, + "learning_rate": 5.460886491056714e-06, + "loss": 0.1973, + "step": 4703 + }, + { + "epoch": 1.5708799465687093, + "grad_norm": 0.4162908356303437, + "learning_rate": 5.458951396496572e-06, + "loss": 0.2064, + "step": 4704 + }, + { + "epoch": 1.571213892135582, + "grad_norm": 0.3842156235774017, + "learning_rate": 5.457016232606417e-06, + "loss": 0.1938, + "step": 4705 + }, + { + "epoch": 1.5715478377024545, + "grad_norm": 0.3956066511286018, + "learning_rate": 5.455080999678579e-06, + "loss": 0.1965, + "step": 4706 + }, + { + "epoch": 1.571881783269327, + "grad_norm": 0.42278335408976025, + "learning_rate": 5.453145698005399e-06, + "loss": 0.203, + "step": 4707 + }, + { + "epoch": 1.5722157288361998, + "grad_norm": 0.3980754778029337, + "learning_rate": 5.451210327879223e-06, + "loss": 0.1885, + "step": 4708 + }, + { + "epoch": 1.5725496744030723, + "grad_norm": 0.41626036909895414, + "learning_rate": 5.449274889592416e-06, + "loss": 0.1933, + "step": 4709 + }, + { + "epoch": 1.5728836199699447, + "grad_norm": 0.45739490381726283, + "learning_rate": 5.4473393834373466e-06, + "loss": 0.2051, + "step": 4710 + }, + { + "epoch": 1.5732175655368175, + "grad_norm": 0.40969808362672155, + "learning_rate": 5.445403809706395e-06, + "loss": 0.1942, + "step": 4711 + }, + { + "epoch": 1.5735515111036902, + "grad_norm": 0.38965421516201565, + "learning_rate": 5.443468168691954e-06, + "loss": 0.1872, + "step": 4712 + }, + { + "epoch": 1.5738854566705627, + "grad_norm": 0.3802144601432017, + "learning_rate": 5.441532460686426e-06, + "loss": 0.1936, + "step": 4713 + }, + { + "epoch": 1.5742194022374352, + "grad_norm": 0.4282184037870892, + "learning_rate": 5.4395966859822195e-06, + "loss": 0.2006, + "step": 4714 + }, + { + "epoch": 1.574553347804308, + "grad_norm": 0.41529391109227304, + "learning_rate": 5.437660844871758e-06, + "loss": 0.2018, + "step": 4715 + }, + { + "epoch": 1.5748872933711806, + "grad_norm": 0.43028826862778147, + "learning_rate": 5.435724937647473e-06, + "loss": 0.2069, + "step": 4716 + }, + { + "epoch": 1.575221238938053, + "grad_norm": 0.42632722327309375, + "learning_rate": 5.433788964601804e-06, + "loss": 0.2035, + "step": 4717 + }, + { + "epoch": 1.5755551845049256, + "grad_norm": 0.42219790114922706, + "learning_rate": 5.431852926027206e-06, + "loss": 0.1933, + "step": 4718 + }, + { + "epoch": 1.5758891300717983, + "grad_norm": 0.41389183867680046, + "learning_rate": 5.429916822216138e-06, + "loss": 0.1997, + "step": 4719 + }, + { + "epoch": 1.576223075638671, + "grad_norm": 0.42005326382754093, + "learning_rate": 5.42798065346107e-06, + "loss": 0.1881, + "step": 4720 + }, + { + "epoch": 1.5765570212055435, + "grad_norm": 0.39753444162905593, + "learning_rate": 5.426044420054488e-06, + "loss": 0.1929, + "step": 4721 + }, + { + "epoch": 1.576890966772416, + "grad_norm": 0.41691063344852947, + "learning_rate": 5.424108122288878e-06, + "loss": 0.2058, + "step": 4722 + }, + { + "epoch": 1.5772249123392887, + "grad_norm": 0.4523145688376783, + "learning_rate": 5.4221717604567435e-06, + "loss": 0.2156, + "step": 4723 + }, + { + "epoch": 1.5775588579061615, + "grad_norm": 0.39679608858354004, + "learning_rate": 5.420235334850593e-06, + "loss": 0.1923, + "step": 4724 + }, + { + "epoch": 1.5778928034730337, + "grad_norm": 0.3958825165224562, + "learning_rate": 5.418298845762947e-06, + "loss": 0.1885, + "step": 4725 + }, + { + "epoch": 1.5782267490399065, + "grad_norm": 0.42180340518034565, + "learning_rate": 5.416362293486336e-06, + "loss": 0.1985, + "step": 4726 + }, + { + "epoch": 1.5785606946067792, + "grad_norm": 0.43679318826263547, + "learning_rate": 5.4144256783132975e-06, + "loss": 0.2089, + "step": 4727 + }, + { + "epoch": 1.5788946401736517, + "grad_norm": 0.38015308316695867, + "learning_rate": 5.41248900053638e-06, + "loss": 0.1811, + "step": 4728 + }, + { + "epoch": 1.5792285857405242, + "grad_norm": 0.41432535867088516, + "learning_rate": 5.4105522604481435e-06, + "loss": 0.1965, + "step": 4729 + }, + { + "epoch": 1.5795625313073969, + "grad_norm": 0.36628677647353347, + "learning_rate": 5.408615458341152e-06, + "loss": 0.1796, + "step": 4730 + }, + { + "epoch": 1.5798964768742696, + "grad_norm": 0.3869654366325382, + "learning_rate": 5.4066785945079855e-06, + "loss": 0.1973, + "step": 4731 + }, + { + "epoch": 1.580230422441142, + "grad_norm": 0.3878145330599434, + "learning_rate": 5.404741669241228e-06, + "loss": 0.1883, + "step": 4732 + }, + { + "epoch": 1.5805643680080146, + "grad_norm": 0.4021115624829742, + "learning_rate": 5.402804682833477e-06, + "loss": 0.193, + "step": 4733 + }, + { + "epoch": 1.5808983135748873, + "grad_norm": 0.49296268950806166, + "learning_rate": 5.400867635577335e-06, + "loss": 0.1977, + "step": 4734 + }, + { + "epoch": 1.58123225914176, + "grad_norm": 0.4345599581839175, + "learning_rate": 5.398930527765416e-06, + "loss": 0.1982, + "step": 4735 + }, + { + "epoch": 1.5815662047086325, + "grad_norm": 0.413964619926734, + "learning_rate": 5.396993359690345e-06, + "loss": 0.2034, + "step": 4736 + }, + { + "epoch": 1.581900150275505, + "grad_norm": 0.43552047249198406, + "learning_rate": 5.395056131644752e-06, + "loss": 0.1958, + "step": 4737 + }, + { + "epoch": 1.5822340958423777, + "grad_norm": 0.4109376138019141, + "learning_rate": 5.393118843921277e-06, + "loss": 0.1922, + "step": 4738 + }, + { + "epoch": 1.5825680414092504, + "grad_norm": 0.5086676990134947, + "learning_rate": 5.391181496812573e-06, + "loss": 0.1883, + "step": 4739 + }, + { + "epoch": 1.582901986976123, + "grad_norm": 0.39573926156137973, + "learning_rate": 5.389244090611298e-06, + "loss": 0.1994, + "step": 4740 + }, + { + "epoch": 1.5832359325429954, + "grad_norm": 0.4607457839079693, + "learning_rate": 5.38730662561012e-06, + "loss": 0.205, + "step": 4741 + }, + { + "epoch": 1.5835698781098682, + "grad_norm": 0.40835357222366736, + "learning_rate": 5.385369102101716e-06, + "loss": 0.1872, + "step": 4742 + }, + { + "epoch": 1.5839038236767407, + "grad_norm": 0.48689078188687296, + "learning_rate": 5.38343152037877e-06, + "loss": 0.1961, + "step": 4743 + }, + { + "epoch": 1.5842377692436131, + "grad_norm": 0.4066030628039563, + "learning_rate": 5.38149388073398e-06, + "loss": 0.1945, + "step": 4744 + }, + { + "epoch": 1.5845717148104859, + "grad_norm": 0.366013738429132, + "learning_rate": 5.379556183460047e-06, + "loss": 0.1856, + "step": 4745 + }, + { + "epoch": 1.5849056603773586, + "grad_norm": 0.43200895906800973, + "learning_rate": 5.377618428849683e-06, + "loss": 0.2068, + "step": 4746 + }, + { + "epoch": 1.585239605944231, + "grad_norm": 0.49242292623459655, + "learning_rate": 5.375680617195609e-06, + "loss": 0.2096, + "step": 4747 + }, + { + "epoch": 1.5855735515111036, + "grad_norm": 0.4132744338380143, + "learning_rate": 5.373742748790555e-06, + "loss": 0.1864, + "step": 4748 + }, + { + "epoch": 1.5859074970779763, + "grad_norm": 0.39706760556922854, + "learning_rate": 5.371804823927258e-06, + "loss": 0.1926, + "step": 4749 + }, + { + "epoch": 1.586241442644849, + "grad_norm": 0.4238854670573685, + "learning_rate": 5.369866842898465e-06, + "loss": 0.2077, + "step": 4750 + }, + { + "epoch": 1.5865753882117215, + "grad_norm": 0.42640493614431735, + "learning_rate": 5.367928805996929e-06, + "loss": 0.1972, + "step": 4751 + }, + { + "epoch": 1.586909333778594, + "grad_norm": 0.37013179888742664, + "learning_rate": 5.365990713515414e-06, + "loss": 0.1729, + "step": 4752 + }, + { + "epoch": 1.5872432793454667, + "grad_norm": 0.45907590421110495, + "learning_rate": 5.364052565746693e-06, + "loss": 0.2073, + "step": 4753 + }, + { + "epoch": 1.5875772249123394, + "grad_norm": 0.43304177739828936, + "learning_rate": 5.362114362983547e-06, + "loss": 0.2037, + "step": 4754 + }, + { + "epoch": 1.587911170479212, + "grad_norm": 0.43747626863293665, + "learning_rate": 5.360176105518761e-06, + "loss": 0.1922, + "step": 4755 + }, + { + "epoch": 1.5882451160460844, + "grad_norm": 0.5704783919256339, + "learning_rate": 5.358237793645133e-06, + "loss": 0.191, + "step": 4756 + }, + { + "epoch": 1.5885790616129571, + "grad_norm": 0.41877733795764044, + "learning_rate": 5.356299427655469e-06, + "loss": 0.1945, + "step": 4757 + }, + { + "epoch": 1.5889130071798296, + "grad_norm": 0.4566114150645618, + "learning_rate": 5.354361007842581e-06, + "loss": 0.1982, + "step": 4758 + }, + { + "epoch": 1.5892469527467021, + "grad_norm": 0.38852790236519913, + "learning_rate": 5.352422534499291e-06, + "loss": 0.1853, + "step": 4759 + }, + { + "epoch": 1.5895808983135749, + "grad_norm": 0.40708531373941814, + "learning_rate": 5.350484007918428e-06, + "loss": 0.2064, + "step": 4760 + }, + { + "epoch": 1.5899148438804476, + "grad_norm": 0.4253185652044943, + "learning_rate": 5.3485454283928265e-06, + "loss": 0.192, + "step": 4761 + }, + { + "epoch": 1.59024878944732, + "grad_norm": 0.41539393746379444, + "learning_rate": 5.346606796215335e-06, + "loss": 0.1835, + "step": 4762 + }, + { + "epoch": 1.5905827350141926, + "grad_norm": 0.41820448778568065, + "learning_rate": 5.344668111678805e-06, + "loss": 0.2028, + "step": 4763 + }, + { + "epoch": 1.5909166805810653, + "grad_norm": 0.41281987451744845, + "learning_rate": 5.3427293750761e-06, + "loss": 0.2046, + "step": 4764 + }, + { + "epoch": 1.591250626147938, + "grad_norm": 0.49736587210070504, + "learning_rate": 5.340790586700086e-06, + "loss": 0.2222, + "step": 4765 + }, + { + "epoch": 1.5915845717148105, + "grad_norm": 0.46561471285392064, + "learning_rate": 5.338851746843643e-06, + "loss": 0.2092, + "step": 4766 + }, + { + "epoch": 1.591918517281683, + "grad_norm": 0.4281307653749809, + "learning_rate": 5.336912855799652e-06, + "loss": 0.2, + "step": 4767 + }, + { + "epoch": 1.5922524628485557, + "grad_norm": 0.39860709481846124, + "learning_rate": 5.334973913861008e-06, + "loss": 0.1945, + "step": 4768 + }, + { + "epoch": 1.5925864084154284, + "grad_norm": 0.4217558786649774, + "learning_rate": 5.33303492132061e-06, + "loss": 0.196, + "step": 4769 + }, + { + "epoch": 1.592920353982301, + "grad_norm": 0.4309860097814307, + "learning_rate": 5.3310958784713655e-06, + "loss": 0.1979, + "step": 4770 + }, + { + "epoch": 1.5932542995491734, + "grad_norm": 0.42902743725005, + "learning_rate": 5.329156785606191e-06, + "loss": 0.2034, + "step": 4771 + }, + { + "epoch": 1.5935882451160461, + "grad_norm": 0.38985978116145, + "learning_rate": 5.327217643018008e-06, + "loss": 0.2006, + "step": 4772 + }, + { + "epoch": 1.5939221906829188, + "grad_norm": 0.4347966967493165, + "learning_rate": 5.325278450999747e-06, + "loss": 0.2114, + "step": 4773 + }, + { + "epoch": 1.5942561362497911, + "grad_norm": 0.39474589860208664, + "learning_rate": 5.323339209844346e-06, + "loss": 0.1921, + "step": 4774 + }, + { + "epoch": 1.5945900818166638, + "grad_norm": 0.38256053396845374, + "learning_rate": 5.32139991984475e-06, + "loss": 0.1939, + "step": 4775 + }, + { + "epoch": 1.5949240273835366, + "grad_norm": 0.3729647811843787, + "learning_rate": 5.319460581293911e-06, + "loss": 0.1874, + "step": 4776 + }, + { + "epoch": 1.595257972950409, + "grad_norm": 0.45375740690251337, + "learning_rate": 5.317521194484791e-06, + "loss": 0.2095, + "step": 4777 + }, + { + "epoch": 1.5955919185172815, + "grad_norm": 0.3939370561880113, + "learning_rate": 5.315581759710356e-06, + "loss": 0.1933, + "step": 4778 + }, + { + "epoch": 1.5959258640841543, + "grad_norm": 0.41746052414225593, + "learning_rate": 5.313642277263577e-06, + "loss": 0.189, + "step": 4779 + }, + { + "epoch": 1.596259809651027, + "grad_norm": 0.4516550553035685, + "learning_rate": 5.311702747437443e-06, + "loss": 0.1925, + "step": 4780 + }, + { + "epoch": 1.5965937552178995, + "grad_norm": 0.41772684280109756, + "learning_rate": 5.309763170524937e-06, + "loss": 0.192, + "step": 4781 + }, + { + "epoch": 1.596927700784772, + "grad_norm": 0.41226393435164765, + "learning_rate": 5.307823546819056e-06, + "loss": 0.1922, + "step": 4782 + }, + { + "epoch": 1.5972616463516447, + "grad_norm": 0.39301899832094617, + "learning_rate": 5.305883876612805e-06, + "loss": 0.1864, + "step": 4783 + }, + { + "epoch": 1.5975955919185174, + "grad_norm": 0.4344890610361969, + "learning_rate": 5.303944160199193e-06, + "loss": 0.1933, + "step": 4784 + }, + { + "epoch": 1.59792953748539, + "grad_norm": 0.4204667147166973, + "learning_rate": 5.302004397871237e-06, + "loss": 0.2018, + "step": 4785 + }, + { + "epoch": 1.5982634830522624, + "grad_norm": 0.4025801727517493, + "learning_rate": 5.3000645899219594e-06, + "loss": 0.1944, + "step": 4786 + }, + { + "epoch": 1.5985974286191351, + "grad_norm": 0.4300003951448309, + "learning_rate": 5.298124736644392e-06, + "loss": 0.1997, + "step": 4787 + }, + { + "epoch": 1.5989313741860078, + "grad_norm": 0.4322841928213523, + "learning_rate": 5.296184838331575e-06, + "loss": 0.1934, + "step": 4788 + }, + { + "epoch": 1.5992653197528803, + "grad_norm": 0.4268719199609731, + "learning_rate": 5.2942448952765495e-06, + "loss": 0.1878, + "step": 4789 + }, + { + "epoch": 1.5995992653197528, + "grad_norm": 0.4096006704992976, + "learning_rate": 5.292304907772367e-06, + "loss": 0.1984, + "step": 4790 + }, + { + "epoch": 1.5999332108866255, + "grad_norm": 0.44146154991878817, + "learning_rate": 5.290364876112088e-06, + "loss": 0.2028, + "step": 4791 + }, + { + "epoch": 1.600267156453498, + "grad_norm": 0.4127132107934224, + "learning_rate": 5.288424800588775e-06, + "loss": 0.1985, + "step": 4792 + }, + { + "epoch": 1.6006011020203705, + "grad_norm": 0.4089302263887995, + "learning_rate": 5.2864846814955e-06, + "loss": 0.201, + "step": 4793 + }, + { + "epoch": 1.6009350475872433, + "grad_norm": 0.4057658128571867, + "learning_rate": 5.28454451912534e-06, + "loss": 0.1919, + "step": 4794 + }, + { + "epoch": 1.601268993154116, + "grad_norm": 0.41550864844996815, + "learning_rate": 5.28260431377138e-06, + "loss": 0.193, + "step": 4795 + }, + { + "epoch": 1.6016029387209885, + "grad_norm": 0.4214643164993601, + "learning_rate": 5.280664065726712e-06, + "loss": 0.2037, + "step": 4796 + }, + { + "epoch": 1.601936884287861, + "grad_norm": 0.41574763678512067, + "learning_rate": 5.278723775284432e-06, + "loss": 0.1955, + "step": 4797 + }, + { + "epoch": 1.6022708298547337, + "grad_norm": 0.4156915810039953, + "learning_rate": 5.276783442737642e-06, + "loss": 0.1951, + "step": 4798 + }, + { + "epoch": 1.6026047754216064, + "grad_norm": 0.4504952363847089, + "learning_rate": 5.274843068379456e-06, + "loss": 0.2121, + "step": 4799 + }, + { + "epoch": 1.602938720988479, + "grad_norm": 0.4141410647043094, + "learning_rate": 5.272902652502988e-06, + "loss": 0.1908, + "step": 4800 + }, + { + "epoch": 1.6032726665553514, + "grad_norm": 0.4172694674527439, + "learning_rate": 5.27096219540136e-06, + "loss": 0.2045, + "step": 4801 + }, + { + "epoch": 1.603606612122224, + "grad_norm": 0.39101651996056197, + "learning_rate": 5.269021697367702e-06, + "loss": 0.1979, + "step": 4802 + }, + { + "epoch": 1.6039405576890968, + "grad_norm": 0.4008081930642851, + "learning_rate": 5.26708115869515e-06, + "loss": 0.1828, + "step": 4803 + }, + { + "epoch": 1.6042745032559693, + "grad_norm": 0.40199523795553577, + "learning_rate": 5.265140579676844e-06, + "loss": 0.1946, + "step": 4804 + }, + { + "epoch": 1.6046084488228418, + "grad_norm": 0.36475147506022104, + "learning_rate": 5.263199960605931e-06, + "loss": 0.186, + "step": 4805 + }, + { + "epoch": 1.6049423943897145, + "grad_norm": 0.44832636055257613, + "learning_rate": 5.261259301775564e-06, + "loss": 0.2019, + "step": 4806 + }, + { + "epoch": 1.605276339956587, + "grad_norm": 0.3988676531453092, + "learning_rate": 5.259318603478904e-06, + "loss": 0.1845, + "step": 4807 + }, + { + "epoch": 1.6056102855234595, + "grad_norm": 0.4394954645416581, + "learning_rate": 5.2573778660091156e-06, + "loss": 0.2012, + "step": 4808 + }, + { + "epoch": 1.6059442310903322, + "grad_norm": 0.4090850456456306, + "learning_rate": 5.255437089659371e-06, + "loss": 0.1831, + "step": 4809 + }, + { + "epoch": 1.606278176657205, + "grad_norm": 0.4802815380853867, + "learning_rate": 5.253496274722846e-06, + "loss": 0.1978, + "step": 4810 + }, + { + "epoch": 1.6066121222240775, + "grad_norm": 0.38421296402882255, + "learning_rate": 5.251555421492722e-06, + "loss": 0.1815, + "step": 4811 + }, + { + "epoch": 1.60694606779095, + "grad_norm": 0.40180199874469, + "learning_rate": 5.249614530262191e-06, + "loss": 0.1883, + "step": 4812 + }, + { + "epoch": 1.6072800133578227, + "grad_norm": 0.3883383467412946, + "learning_rate": 5.2476736013244475e-06, + "loss": 0.1819, + "step": 4813 + }, + { + "epoch": 1.6076139589246954, + "grad_norm": 0.4086793201262821, + "learning_rate": 5.245732634972688e-06, + "loss": 0.1958, + "step": 4814 + }, + { + "epoch": 1.6079479044915679, + "grad_norm": 0.4224168613202054, + "learning_rate": 5.243791631500122e-06, + "loss": 0.1926, + "step": 4815 + }, + { + "epoch": 1.6082818500584404, + "grad_norm": 0.42613150503363995, + "learning_rate": 5.24185059119996e-06, + "loss": 0.1975, + "step": 4816 + }, + { + "epoch": 1.608615795625313, + "grad_norm": 0.4415148575023946, + "learning_rate": 5.239909514365415e-06, + "loss": 0.2004, + "step": 4817 + }, + { + "epoch": 1.6089497411921858, + "grad_norm": 0.40970747948408875, + "learning_rate": 5.237968401289717e-06, + "loss": 0.1971, + "step": 4818 + }, + { + "epoch": 1.6092836867590583, + "grad_norm": 0.4112895680407661, + "learning_rate": 5.236027252266088e-06, + "loss": 0.1972, + "step": 4819 + }, + { + "epoch": 1.6096176323259308, + "grad_norm": 0.40428437979576776, + "learning_rate": 5.234086067587765e-06, + "loss": 0.1942, + "step": 4820 + }, + { + "epoch": 1.6099515778928035, + "grad_norm": 0.419338867445877, + "learning_rate": 5.232144847547983e-06, + "loss": 0.1785, + "step": 4821 + }, + { + "epoch": 1.6102855234596762, + "grad_norm": 0.4699954998213189, + "learning_rate": 5.230203592439989e-06, + "loss": 0.2027, + "step": 4822 + }, + { + "epoch": 1.6106194690265485, + "grad_norm": 0.38579142330375554, + "learning_rate": 5.228262302557034e-06, + "loss": 0.1933, + "step": 4823 + }, + { + "epoch": 1.6109534145934212, + "grad_norm": 0.44120355162315, + "learning_rate": 5.226320978192369e-06, + "loss": 0.2033, + "step": 4824 + }, + { + "epoch": 1.611287360160294, + "grad_norm": 0.40448539512862475, + "learning_rate": 5.224379619639253e-06, + "loss": 0.1851, + "step": 4825 + }, + { + "epoch": 1.6116213057271664, + "grad_norm": 0.4454285188019959, + "learning_rate": 5.222438227190957e-06, + "loss": 0.2021, + "step": 4826 + }, + { + "epoch": 1.611955251294039, + "grad_norm": 0.414760600019116, + "learning_rate": 5.220496801140746e-06, + "loss": 0.1996, + "step": 4827 + }, + { + "epoch": 1.6122891968609117, + "grad_norm": 0.42718840912720824, + "learning_rate": 5.218555341781897e-06, + "loss": 0.189, + "step": 4828 + }, + { + "epoch": 1.6126231424277844, + "grad_norm": 0.4267525421289148, + "learning_rate": 5.216613849407691e-06, + "loss": 0.197, + "step": 4829 + }, + { + "epoch": 1.6129570879946569, + "grad_norm": 0.4493424424103547, + "learning_rate": 5.214672324311412e-06, + "loss": 0.1982, + "step": 4830 + }, + { + "epoch": 1.6132910335615294, + "grad_norm": 0.4308674096556119, + "learning_rate": 5.21273076678635e-06, + "loss": 0.1943, + "step": 4831 + }, + { + "epoch": 1.613624979128402, + "grad_norm": 0.3925998201907294, + "learning_rate": 5.210789177125802e-06, + "loss": 0.1881, + "step": 4832 + }, + { + "epoch": 1.6139589246952748, + "grad_norm": 0.4254383389522204, + "learning_rate": 5.208847555623066e-06, + "loss": 0.195, + "step": 4833 + }, + { + "epoch": 1.6142928702621473, + "grad_norm": 0.3784003115153938, + "learning_rate": 5.206905902571447e-06, + "loss": 0.1845, + "step": 4834 + }, + { + "epoch": 1.6146268158290198, + "grad_norm": 0.4259770207037785, + "learning_rate": 5.204964218264258e-06, + "loss": 0.1833, + "step": 4835 + }, + { + "epoch": 1.6149607613958925, + "grad_norm": 0.39738746452523543, + "learning_rate": 5.203022502994808e-06, + "loss": 0.1976, + "step": 4836 + }, + { + "epoch": 1.6152947069627652, + "grad_norm": 0.39030271717351783, + "learning_rate": 5.201080757056418e-06, + "loss": 0.1999, + "step": 4837 + }, + { + "epoch": 1.6156286525296377, + "grad_norm": 0.4263335380461042, + "learning_rate": 5.1991389807424145e-06, + "loss": 0.2086, + "step": 4838 + }, + { + "epoch": 1.6159625980965102, + "grad_norm": 0.398745944842516, + "learning_rate": 5.1971971743461215e-06, + "loss": 0.1932, + "step": 4839 + }, + { + "epoch": 1.616296543663383, + "grad_norm": 0.42880316783788475, + "learning_rate": 5.195255338160873e-06, + "loss": 0.2033, + "step": 4840 + }, + { + "epoch": 1.6166304892302554, + "grad_norm": 0.4503242539328835, + "learning_rate": 5.193313472480007e-06, + "loss": 0.216, + "step": 4841 + }, + { + "epoch": 1.616964434797128, + "grad_norm": 0.40831941532509597, + "learning_rate": 5.191371577596866e-06, + "loss": 0.1872, + "step": 4842 + }, + { + "epoch": 1.6172983803640006, + "grad_norm": 0.41791124388464684, + "learning_rate": 5.189429653804794e-06, + "loss": 0.194, + "step": 4843 + }, + { + "epoch": 1.6176323259308734, + "grad_norm": 0.4292909114375624, + "learning_rate": 5.187487701397142e-06, + "loss": 0.1999, + "step": 4844 + }, + { + "epoch": 1.6179662714977459, + "grad_norm": 0.4136962868331666, + "learning_rate": 5.185545720667266e-06, + "loss": 0.1884, + "step": 4845 + }, + { + "epoch": 1.6183002170646184, + "grad_norm": 0.39331667168777334, + "learning_rate": 5.183603711908523e-06, + "loss": 0.1932, + "step": 4846 + }, + { + "epoch": 1.618634162631491, + "grad_norm": 0.380147314511648, + "learning_rate": 5.181661675414278e-06, + "loss": 0.1895, + "step": 4847 + }, + { + "epoch": 1.6189681081983638, + "grad_norm": 0.4066449462044048, + "learning_rate": 5.179719611477898e-06, + "loss": 0.1937, + "step": 4848 + }, + { + "epoch": 1.6193020537652363, + "grad_norm": 0.4258446181103049, + "learning_rate": 5.1777775203927535e-06, + "loss": 0.2068, + "step": 4849 + }, + { + "epoch": 1.6196359993321088, + "grad_norm": 0.4113383272205944, + "learning_rate": 5.175835402452223e-06, + "loss": 0.2044, + "step": 4850 + }, + { + "epoch": 1.6199699448989815, + "grad_norm": 0.413380097915521, + "learning_rate": 5.173893257949683e-06, + "loss": 0.2014, + "step": 4851 + }, + { + "epoch": 1.6203038904658542, + "grad_norm": 0.42517246513422285, + "learning_rate": 5.17195108717852e-06, + "loss": 0.2054, + "step": 4852 + }, + { + "epoch": 1.6206378360327267, + "grad_norm": 0.41524625901706064, + "learning_rate": 5.170008890432121e-06, + "loss": 0.1873, + "step": 4853 + }, + { + "epoch": 1.6209717815995992, + "grad_norm": 0.43808341311644106, + "learning_rate": 5.168066668003876e-06, + "loss": 0.2091, + "step": 4854 + }, + { + "epoch": 1.621305727166472, + "grad_norm": 0.4252977935303211, + "learning_rate": 5.166124420187182e-06, + "loss": 0.2007, + "step": 4855 + }, + { + "epoch": 1.6216396727333444, + "grad_norm": 0.4319004455143065, + "learning_rate": 5.164182147275439e-06, + "loss": 0.2144, + "step": 4856 + }, + { + "epoch": 1.621973618300217, + "grad_norm": 0.40036208771754045, + "learning_rate": 5.16223984956205e-06, + "loss": 0.1875, + "step": 4857 + }, + { + "epoch": 1.6223075638670896, + "grad_norm": 0.418519003967928, + "learning_rate": 5.1602975273404196e-06, + "loss": 0.1976, + "step": 4858 + }, + { + "epoch": 1.6226415094339623, + "grad_norm": 0.40350253708517614, + "learning_rate": 5.158355180903961e-06, + "loss": 0.1989, + "step": 4859 + }, + { + "epoch": 1.6229754550008348, + "grad_norm": 0.41204369381140116, + "learning_rate": 5.156412810546089e-06, + "loss": 0.1949, + "step": 4860 + }, + { + "epoch": 1.6233094005677073, + "grad_norm": 0.4382744192834387, + "learning_rate": 5.154470416560219e-06, + "loss": 0.2089, + "step": 4861 + }, + { + "epoch": 1.62364334613458, + "grad_norm": 0.40586800209662394, + "learning_rate": 5.152527999239774e-06, + "loss": 0.1748, + "step": 4862 + }, + { + "epoch": 1.6239772917014528, + "grad_norm": 0.43817177004856733, + "learning_rate": 5.150585558878177e-06, + "loss": 0.2037, + "step": 4863 + }, + { + "epoch": 1.6243112372683253, + "grad_norm": 0.40381422303951253, + "learning_rate": 5.148643095768861e-06, + "loss": 0.1881, + "step": 4864 + }, + { + "epoch": 1.6246451828351978, + "grad_norm": 0.4223072439345843, + "learning_rate": 5.146700610205254e-06, + "loss": 0.2005, + "step": 4865 + }, + { + "epoch": 1.6249791284020705, + "grad_norm": 0.39611701923208614, + "learning_rate": 5.144758102480792e-06, + "loss": 0.1915, + "step": 4866 + }, + { + "epoch": 1.6253130739689432, + "grad_norm": 0.39897896763142227, + "learning_rate": 5.142815572888915e-06, + "loss": 0.1801, + "step": 4867 + }, + { + "epoch": 1.6256470195358157, + "grad_norm": 0.3942908284104242, + "learning_rate": 5.140873021723065e-06, + "loss": 0.1873, + "step": 4868 + }, + { + "epoch": 1.6259809651026882, + "grad_norm": 0.39322301183965197, + "learning_rate": 5.138930449276686e-06, + "loss": 0.1992, + "step": 4869 + }, + { + "epoch": 1.626314910669561, + "grad_norm": 0.4303711635184881, + "learning_rate": 5.136987855843226e-06, + "loss": 0.1995, + "step": 4870 + }, + { + "epoch": 1.6266488562364336, + "grad_norm": 0.4345354327414109, + "learning_rate": 5.135045241716138e-06, + "loss": 0.2136, + "step": 4871 + }, + { + "epoch": 1.626982801803306, + "grad_norm": 0.539742638729196, + "learning_rate": 5.133102607188875e-06, + "loss": 0.2132, + "step": 4872 + }, + { + "epoch": 1.6273167473701786, + "grad_norm": 0.4040192184352221, + "learning_rate": 5.131159952554896e-06, + "loss": 0.1896, + "step": 4873 + }, + { + "epoch": 1.6276506929370513, + "grad_norm": 0.4169971044202786, + "learning_rate": 5.129217278107663e-06, + "loss": 0.194, + "step": 4874 + }, + { + "epoch": 1.6279846385039238, + "grad_norm": 0.40503849753645316, + "learning_rate": 5.127274584140636e-06, + "loss": 0.1903, + "step": 4875 + }, + { + "epoch": 1.6283185840707963, + "grad_norm": 0.4237099685305266, + "learning_rate": 5.125331870947287e-06, + "loss": 0.2008, + "step": 4876 + }, + { + "epoch": 1.628652529637669, + "grad_norm": 0.4142662739561068, + "learning_rate": 5.123389138821084e-06, + "loss": 0.1995, + "step": 4877 + }, + { + "epoch": 1.6289864752045418, + "grad_norm": 0.45354343375917766, + "learning_rate": 5.121446388055497e-06, + "loss": 0.21, + "step": 4878 + }, + { + "epoch": 1.6293204207714143, + "grad_norm": 0.37425067796417305, + "learning_rate": 5.119503618944004e-06, + "loss": 0.1834, + "step": 4879 + }, + { + "epoch": 1.6296543663382868, + "grad_norm": 0.3788887242903905, + "learning_rate": 5.117560831780082e-06, + "loss": 0.1904, + "step": 4880 + }, + { + "epoch": 1.6299883119051595, + "grad_norm": 0.3898545052130911, + "learning_rate": 5.115618026857211e-06, + "loss": 0.1888, + "step": 4881 + }, + { + "epoch": 1.6303222574720322, + "grad_norm": 0.4105364824350985, + "learning_rate": 5.113675204468876e-06, + "loss": 0.1911, + "step": 4882 + }, + { + "epoch": 1.6306562030389047, + "grad_norm": 0.3962203176947265, + "learning_rate": 5.111732364908564e-06, + "loss": 0.1764, + "step": 4883 + }, + { + "epoch": 1.6309901486057772, + "grad_norm": 0.3507120420190793, + "learning_rate": 5.109789508469761e-06, + "loss": 0.1671, + "step": 4884 + }, + { + "epoch": 1.63132409417265, + "grad_norm": 0.3949491699443807, + "learning_rate": 5.107846635445962e-06, + "loss": 0.1958, + "step": 4885 + }, + { + "epoch": 1.6316580397395226, + "grad_norm": 0.44189276954645507, + "learning_rate": 5.1059037461306586e-06, + "loss": 0.2031, + "step": 4886 + }, + { + "epoch": 1.631991985306395, + "grad_norm": 0.42594578365840874, + "learning_rate": 5.103960840817346e-06, + "loss": 0.2029, + "step": 4887 + }, + { + "epoch": 1.6323259308732676, + "grad_norm": 0.4073041041534602, + "learning_rate": 5.1020179197995245e-06, + "loss": 0.1948, + "step": 4888 + }, + { + "epoch": 1.6326598764401403, + "grad_norm": 0.4197429915528991, + "learning_rate": 5.1000749833706964e-06, + "loss": 0.179, + "step": 4889 + }, + { + "epoch": 1.6329938220070128, + "grad_norm": 0.4231299956266433, + "learning_rate": 5.098132031824362e-06, + "loss": 0.2036, + "step": 4890 + }, + { + "epoch": 1.6333277675738853, + "grad_norm": 0.43474360913815285, + "learning_rate": 5.096189065454029e-06, + "loss": 0.2072, + "step": 4891 + }, + { + "epoch": 1.633661713140758, + "grad_norm": 0.42739722495251686, + "learning_rate": 5.094246084553206e-06, + "loss": 0.1998, + "step": 4892 + }, + { + "epoch": 1.6339956587076307, + "grad_norm": 0.4068442630703969, + "learning_rate": 5.092303089415403e-06, + "loss": 0.1965, + "step": 4893 + }, + { + "epoch": 1.6343296042745032, + "grad_norm": 0.42075102506762035, + "learning_rate": 5.09036008033413e-06, + "loss": 0.1958, + "step": 4894 + }, + { + "epoch": 1.6346635498413757, + "grad_norm": 0.4751211507782048, + "learning_rate": 5.0884170576029034e-06, + "loss": 0.2173, + "step": 4895 + }, + { + "epoch": 1.6349974954082485, + "grad_norm": 0.4414325188339375, + "learning_rate": 5.086474021515238e-06, + "loss": 0.2036, + "step": 4896 + }, + { + "epoch": 1.6353314409751212, + "grad_norm": 0.43981042916979307, + "learning_rate": 5.084530972364656e-06, + "loss": 0.1957, + "step": 4897 + }, + { + "epoch": 1.6356653865419937, + "grad_norm": 0.4447308583849977, + "learning_rate": 5.082587910444674e-06, + "loss": 0.1966, + "step": 4898 + }, + { + "epoch": 1.6359993321088662, + "grad_norm": 0.44669721483940267, + "learning_rate": 5.080644836048815e-06, + "loss": 0.2017, + "step": 4899 + }, + { + "epoch": 1.6363332776757389, + "grad_norm": 0.37911277492603973, + "learning_rate": 5.0787017494706035e-06, + "loss": 0.1885, + "step": 4900 + }, + { + "epoch": 1.6366672232426116, + "grad_norm": 0.42209764612725126, + "learning_rate": 5.076758651003567e-06, + "loss": 0.1904, + "step": 4901 + }, + { + "epoch": 1.637001168809484, + "grad_norm": 0.41618819886027797, + "learning_rate": 5.0748155409412325e-06, + "loss": 0.1971, + "step": 4902 + }, + { + "epoch": 1.6373351143763566, + "grad_norm": 0.37968963437992637, + "learning_rate": 5.0728724195771295e-06, + "loss": 0.1933, + "step": 4903 + }, + { + "epoch": 1.6376690599432293, + "grad_norm": 0.49016772674623965, + "learning_rate": 5.070929287204789e-06, + "loss": 0.2005, + "step": 4904 + }, + { + "epoch": 1.6380030055101018, + "grad_norm": 0.4179568582831211, + "learning_rate": 5.068986144117746e-06, + "loss": 0.2006, + "step": 4905 + }, + { + "epoch": 1.6383369510769743, + "grad_norm": 0.4255338314834005, + "learning_rate": 5.067042990609533e-06, + "loss": 0.194, + "step": 4906 + }, + { + "epoch": 1.638670896643847, + "grad_norm": 0.4489710292882698, + "learning_rate": 5.065099826973685e-06, + "loss": 0.2143, + "step": 4907 + }, + { + "epoch": 1.6390048422107197, + "grad_norm": 0.43882456276403864, + "learning_rate": 5.0631566535037435e-06, + "loss": 0.1977, + "step": 4908 + }, + { + "epoch": 1.6393387877775922, + "grad_norm": 0.38272437413780414, + "learning_rate": 5.061213470493246e-06, + "loss": 0.187, + "step": 4909 + }, + { + "epoch": 1.6396727333444647, + "grad_norm": 0.4223324994778571, + "learning_rate": 5.059270278235732e-06, + "loss": 0.1967, + "step": 4910 + }, + { + "epoch": 1.6400066789113374, + "grad_norm": 0.3976067064634602, + "learning_rate": 5.057327077024745e-06, + "loss": 0.1903, + "step": 4911 + }, + { + "epoch": 1.6403406244782102, + "grad_norm": 0.4209543105402407, + "learning_rate": 5.055383867153829e-06, + "loss": 0.2025, + "step": 4912 + }, + { + "epoch": 1.6406745700450827, + "grad_norm": 0.3947766936457843, + "learning_rate": 5.053440648916526e-06, + "loss": 0.1819, + "step": 4913 + }, + { + "epoch": 1.6410085156119552, + "grad_norm": 0.4026201573480306, + "learning_rate": 5.051497422606385e-06, + "loss": 0.1935, + "step": 4914 + }, + { + "epoch": 1.6413424611788279, + "grad_norm": 0.37785493785849833, + "learning_rate": 5.049554188516952e-06, + "loss": 0.1956, + "step": 4915 + }, + { + "epoch": 1.6416764067457006, + "grad_norm": 0.4080022366885912, + "learning_rate": 5.047610946941775e-06, + "loss": 0.1959, + "step": 4916 + }, + { + "epoch": 1.642010352312573, + "grad_norm": 0.39521347259779677, + "learning_rate": 5.045667698174403e-06, + "loss": 0.1841, + "step": 4917 + }, + { + "epoch": 1.6423442978794456, + "grad_norm": 0.4052000993449699, + "learning_rate": 5.043724442508388e-06, + "loss": 0.1873, + "step": 4918 + }, + { + "epoch": 1.6426782434463183, + "grad_norm": 0.4289851629916548, + "learning_rate": 5.0417811802372815e-06, + "loss": 0.2038, + "step": 4919 + }, + { + "epoch": 1.643012189013191, + "grad_norm": 0.42873602680133893, + "learning_rate": 5.039837911654637e-06, + "loss": 0.1924, + "step": 4920 + }, + { + "epoch": 1.6433461345800633, + "grad_norm": 0.4163011122982262, + "learning_rate": 5.037894637054005e-06, + "loss": 0.1887, + "step": 4921 + }, + { + "epoch": 1.643680080146936, + "grad_norm": 0.41256746568734526, + "learning_rate": 5.035951356728942e-06, + "loss": 0.1868, + "step": 4922 + }, + { + "epoch": 1.6440140257138087, + "grad_norm": 0.38452956569882735, + "learning_rate": 5.034008070973004e-06, + "loss": 0.1955, + "step": 4923 + }, + { + "epoch": 1.6443479712806812, + "grad_norm": 0.47266399939167203, + "learning_rate": 5.032064780079746e-06, + "loss": 0.2108, + "step": 4924 + }, + { + "epoch": 1.6446819168475537, + "grad_norm": 0.3661920769763338, + "learning_rate": 5.030121484342725e-06, + "loss": 0.1821, + "step": 4925 + }, + { + "epoch": 1.6450158624144264, + "grad_norm": 0.38288278251589564, + "learning_rate": 5.0281781840555e-06, + "loss": 0.1872, + "step": 4926 + }, + { + "epoch": 1.6453498079812992, + "grad_norm": 0.3951674051282928, + "learning_rate": 5.026234879511629e-06, + "loss": 0.1919, + "step": 4927 + }, + { + "epoch": 1.6456837535481716, + "grad_norm": 0.4446820667734045, + "learning_rate": 5.024291571004668e-06, + "loss": 0.2049, + "step": 4928 + }, + { + "epoch": 1.6460176991150441, + "grad_norm": 0.3847089437235357, + "learning_rate": 5.022348258828181e-06, + "loss": 0.1854, + "step": 4929 + }, + { + "epoch": 1.6463516446819169, + "grad_norm": 0.4465777995021089, + "learning_rate": 5.020404943275727e-06, + "loss": 0.2032, + "step": 4930 + }, + { + "epoch": 1.6466855902487896, + "grad_norm": 0.4209000333850444, + "learning_rate": 5.018461624640864e-06, + "loss": 0.1881, + "step": 4931 + }, + { + "epoch": 1.647019535815662, + "grad_norm": 0.43874264508637995, + "learning_rate": 5.016518303217157e-06, + "loss": 0.1959, + "step": 4932 + }, + { + "epoch": 1.6473534813825346, + "grad_norm": 0.40621965004843547, + "learning_rate": 5.014574979298166e-06, + "loss": 0.195, + "step": 4933 + }, + { + "epoch": 1.6476874269494073, + "grad_norm": 0.4112489831025004, + "learning_rate": 5.012631653177451e-06, + "loss": 0.1962, + "step": 4934 + }, + { + "epoch": 1.64802137251628, + "grad_norm": 0.38501149745631036, + "learning_rate": 5.010688325148577e-06, + "loss": 0.1827, + "step": 4935 + }, + { + "epoch": 1.6483553180831525, + "grad_norm": 0.382230170818548, + "learning_rate": 5.008744995505107e-06, + "loss": 0.1816, + "step": 4936 + }, + { + "epoch": 1.648689263650025, + "grad_norm": 0.4269969102329425, + "learning_rate": 5.0068016645406e-06, + "loss": 0.1912, + "step": 4937 + }, + { + "epoch": 1.6490232092168977, + "grad_norm": 0.3911253797830778, + "learning_rate": 5.0048583325486234e-06, + "loss": 0.1907, + "step": 4938 + }, + { + "epoch": 1.6493571547837702, + "grad_norm": 0.3973399530499921, + "learning_rate": 5.002914999822737e-06, + "loss": 0.1993, + "step": 4939 + }, + { + "epoch": 1.6496911003506427, + "grad_norm": 0.381805883336254, + "learning_rate": 5.000971666656508e-06, + "loss": 0.1866, + "step": 4940 + }, + { + "epoch": 1.6500250459175154, + "grad_norm": 0.44398075937862946, + "learning_rate": 4.999028333343494e-06, + "loss": 0.2153, + "step": 4941 + }, + { + "epoch": 1.6503589914843881, + "grad_norm": 0.40495075548352927, + "learning_rate": 4.9970850001772634e-06, + "loss": 0.2006, + "step": 4942 + }, + { + "epoch": 1.6506929370512606, + "grad_norm": 0.47323246207653963, + "learning_rate": 4.995141667451378e-06, + "loss": 0.2111, + "step": 4943 + }, + { + "epoch": 1.6510268826181331, + "grad_norm": 0.40689036499077386, + "learning_rate": 4.993198335459401e-06, + "loss": 0.1961, + "step": 4944 + }, + { + "epoch": 1.6513608281850058, + "grad_norm": 0.45363419907785657, + "learning_rate": 4.991255004494896e-06, + "loss": 0.2002, + "step": 4945 + }, + { + "epoch": 1.6516947737518786, + "grad_norm": 0.39079299321552946, + "learning_rate": 4.989311674851424e-06, + "loss": 0.1918, + "step": 4946 + }, + { + "epoch": 1.652028719318751, + "grad_norm": 0.4079537560804328, + "learning_rate": 4.9873683468225495e-06, + "loss": 0.1934, + "step": 4947 + }, + { + "epoch": 1.6523626648856236, + "grad_norm": 0.3895830948669782, + "learning_rate": 4.985425020701836e-06, + "loss": 0.1966, + "step": 4948 + }, + { + "epoch": 1.6526966104524963, + "grad_norm": 0.39586577460541733, + "learning_rate": 4.983481696782844e-06, + "loss": 0.2038, + "step": 4949 + }, + { + "epoch": 1.653030556019369, + "grad_norm": 0.4316130617272694, + "learning_rate": 4.9815383753591365e-06, + "loss": 0.2015, + "step": 4950 + }, + { + "epoch": 1.6533645015862415, + "grad_norm": 0.4491142890206108, + "learning_rate": 4.9795950567242754e-06, + "loss": 0.2137, + "step": 4951 + }, + { + "epoch": 1.653698447153114, + "grad_norm": 0.371221820736492, + "learning_rate": 4.9776517411718214e-06, + "loss": 0.1865, + "step": 4952 + }, + { + "epoch": 1.6540323927199867, + "grad_norm": 0.8306835153106146, + "learning_rate": 4.9757084289953325e-06, + "loss": 0.1926, + "step": 4953 + }, + { + "epoch": 1.6543663382868592, + "grad_norm": 0.4009920133756041, + "learning_rate": 4.973765120488373e-06, + "loss": 0.2038, + "step": 4954 + }, + { + "epoch": 1.6547002838537317, + "grad_norm": 0.4568623426752501, + "learning_rate": 4.9718218159445015e-06, + "loss": 0.1989, + "step": 4955 + }, + { + "epoch": 1.6550342294206044, + "grad_norm": 0.48262334533879275, + "learning_rate": 4.969878515657276e-06, + "loss": 0.1881, + "step": 4956 + }, + { + "epoch": 1.6553681749874771, + "grad_norm": 0.40522771927191203, + "learning_rate": 4.967935219920257e-06, + "loss": 0.1892, + "step": 4957 + }, + { + "epoch": 1.6557021205543496, + "grad_norm": 0.37521846679027776, + "learning_rate": 4.9659919290269986e-06, + "loss": 0.1844, + "step": 4958 + }, + { + "epoch": 1.6560360661212221, + "grad_norm": 0.4083183199052804, + "learning_rate": 4.964048643271058e-06, + "loss": 0.2098, + "step": 4959 + }, + { + "epoch": 1.6563700116880948, + "grad_norm": 0.4004642626569134, + "learning_rate": 4.962105362945996e-06, + "loss": 0.1891, + "step": 4960 + }, + { + "epoch": 1.6567039572549676, + "grad_norm": 0.40776367914606315, + "learning_rate": 4.960162088345365e-06, + "loss": 0.1911, + "step": 4961 + }, + { + "epoch": 1.65703790282184, + "grad_norm": 0.4170345590008578, + "learning_rate": 4.958218819762719e-06, + "loss": 0.1925, + "step": 4962 + }, + { + "epoch": 1.6573718483887125, + "grad_norm": 0.3933408020439283, + "learning_rate": 4.9562755574916125e-06, + "loss": 0.1841, + "step": 4963 + }, + { + "epoch": 1.6577057939555853, + "grad_norm": 0.38270569831932166, + "learning_rate": 4.954332301825597e-06, + "loss": 0.185, + "step": 4964 + }, + { + "epoch": 1.658039739522458, + "grad_norm": 0.4416397735911187, + "learning_rate": 4.952389053058226e-06, + "loss": 0.1998, + "step": 4965 + }, + { + "epoch": 1.6583736850893305, + "grad_norm": 0.4038672656766227, + "learning_rate": 4.95044581148305e-06, + "loss": 0.1876, + "step": 4966 + }, + { + "epoch": 1.658707630656203, + "grad_norm": 0.45268259547839906, + "learning_rate": 4.948502577393617e-06, + "loss": 0.214, + "step": 4967 + }, + { + "epoch": 1.6590415762230757, + "grad_norm": 0.40485817055381734, + "learning_rate": 4.946559351083475e-06, + "loss": 0.1994, + "step": 4968 + }, + { + "epoch": 1.6593755217899484, + "grad_norm": 0.39575079659822177, + "learning_rate": 4.944616132846174e-06, + "loss": 0.1897, + "step": 4969 + }, + { + "epoch": 1.6597094673568207, + "grad_norm": 0.4236243366509332, + "learning_rate": 4.942672922975255e-06, + "loss": 0.1893, + "step": 4970 + }, + { + "epoch": 1.6600434129236934, + "grad_norm": 0.42327398359903823, + "learning_rate": 4.940729721764268e-06, + "loss": 0.1849, + "step": 4971 + }, + { + "epoch": 1.6603773584905661, + "grad_norm": 0.41896021825786833, + "learning_rate": 4.938786529506755e-06, + "loss": 0.1968, + "step": 4972 + }, + { + "epoch": 1.6607113040574386, + "grad_norm": 0.44274001256771034, + "learning_rate": 4.936843346496257e-06, + "loss": 0.2005, + "step": 4973 + }, + { + "epoch": 1.661045249624311, + "grad_norm": 0.4440886178498309, + "learning_rate": 4.934900173026316e-06, + "loss": 0.2138, + "step": 4974 + }, + { + "epoch": 1.6613791951911838, + "grad_norm": 0.4494201652824032, + "learning_rate": 4.93295700939047e-06, + "loss": 0.2151, + "step": 4975 + }, + { + "epoch": 1.6617131407580565, + "grad_norm": 0.4008141428086073, + "learning_rate": 4.931013855882255e-06, + "loss": 0.1846, + "step": 4976 + }, + { + "epoch": 1.662047086324929, + "grad_norm": 0.4187124211937765, + "learning_rate": 4.929070712795211e-06, + "loss": 0.1974, + "step": 4977 + }, + { + "epoch": 1.6623810318918015, + "grad_norm": 0.3800461821906711, + "learning_rate": 4.927127580422871e-06, + "loss": 0.18, + "step": 4978 + }, + { + "epoch": 1.6627149774586742, + "grad_norm": 0.3790661455201971, + "learning_rate": 4.925184459058769e-06, + "loss": 0.1794, + "step": 4979 + }, + { + "epoch": 1.663048923025547, + "grad_norm": 0.4088327927350755, + "learning_rate": 4.9232413489964345e-06, + "loss": 0.1928, + "step": 4980 + }, + { + "epoch": 1.6633828685924195, + "grad_norm": 0.39844429506903506, + "learning_rate": 4.921298250529398e-06, + "loss": 0.1873, + "step": 4981 + }, + { + "epoch": 1.663716814159292, + "grad_norm": 0.3728433467890873, + "learning_rate": 4.919355163951186e-06, + "loss": 0.1875, + "step": 4982 + }, + { + "epoch": 1.6640507597261647, + "grad_norm": 0.4049596237809687, + "learning_rate": 4.917412089555328e-06, + "loss": 0.1834, + "step": 4983 + }, + { + "epoch": 1.6643847052930374, + "grad_norm": 0.43046575539496446, + "learning_rate": 4.915469027635345e-06, + "loss": 0.2045, + "step": 4984 + }, + { + "epoch": 1.6647186508599099, + "grad_norm": 0.34374023895710515, + "learning_rate": 4.9135259784847625e-06, + "loss": 0.1614, + "step": 4985 + }, + { + "epoch": 1.6650525964267824, + "grad_norm": 0.4120359522713758, + "learning_rate": 4.911582942397098e-06, + "loss": 0.1893, + "step": 4986 + }, + { + "epoch": 1.665386541993655, + "grad_norm": 0.4527687274785145, + "learning_rate": 4.909639919665872e-06, + "loss": 0.2074, + "step": 4987 + }, + { + "epoch": 1.6657204875605276, + "grad_norm": 0.42010451152241596, + "learning_rate": 4.907696910584599e-06, + "loss": 0.1902, + "step": 4988 + }, + { + "epoch": 1.6660544331274, + "grad_norm": 0.4080591968473613, + "learning_rate": 4.905753915446795e-06, + "loss": 0.1869, + "step": 4989 + }, + { + "epoch": 1.6663883786942728, + "grad_norm": 0.45688748641848076, + "learning_rate": 4.903810934545972e-06, + "loss": 0.1957, + "step": 4990 + }, + { + "epoch": 1.6667223242611455, + "grad_norm": 0.39479981393128605, + "learning_rate": 4.90186796817564e-06, + "loss": 0.1892, + "step": 4991 + }, + { + "epoch": 1.667056269828018, + "grad_norm": 0.3691965681647722, + "learning_rate": 4.899925016629307e-06, + "loss": 0.1811, + "step": 4992 + }, + { + "epoch": 1.6673902153948905, + "grad_norm": 0.45179774449253307, + "learning_rate": 4.897982080200477e-06, + "loss": 0.1988, + "step": 4993 + }, + { + "epoch": 1.6677241609617632, + "grad_norm": 0.3740310306629665, + "learning_rate": 4.896039159182655e-06, + "loss": 0.1808, + "step": 4994 + }, + { + "epoch": 1.668058106528636, + "grad_norm": 0.4089047480386049, + "learning_rate": 4.894096253869343e-06, + "loss": 0.1927, + "step": 4995 + }, + { + "epoch": 1.6683920520955084, + "grad_norm": 0.399676505442746, + "learning_rate": 4.89215336455404e-06, + "loss": 0.1931, + "step": 4996 + }, + { + "epoch": 1.668725997662381, + "grad_norm": 0.3895788933672829, + "learning_rate": 4.89021049153024e-06, + "loss": 0.1849, + "step": 4997 + }, + { + "epoch": 1.6690599432292537, + "grad_norm": 0.3595030957949447, + "learning_rate": 4.888267635091439e-06, + "loss": 0.173, + "step": 4998 + }, + { + "epoch": 1.6693938887961264, + "grad_norm": 0.41403286511225557, + "learning_rate": 4.886324795531126e-06, + "loss": 0.1847, + "step": 4999 + }, + { + "epoch": 1.6697278343629989, + "grad_norm": 0.4112481211288421, + "learning_rate": 4.88438197314279e-06, + "loss": 0.1868, + "step": 5000 + }, + { + "epoch": 1.6700617799298714, + "grad_norm": 0.41924546376727767, + "learning_rate": 4.88243916821992e-06, + "loss": 0.1997, + "step": 5001 + }, + { + "epoch": 1.670395725496744, + "grad_norm": 0.3888547499719048, + "learning_rate": 4.880496381055998e-06, + "loss": 0.1854, + "step": 5002 + }, + { + "epoch": 1.6707296710636166, + "grad_norm": 0.41588935541900535, + "learning_rate": 4.878553611944505e-06, + "loss": 0.1913, + "step": 5003 + }, + { + "epoch": 1.671063616630489, + "grad_norm": 0.37772961201560074, + "learning_rate": 4.876610861178918e-06, + "loss": 0.1902, + "step": 5004 + }, + { + "epoch": 1.6713975621973618, + "grad_norm": 0.39759117151567896, + "learning_rate": 4.874668129052712e-06, + "loss": 0.2015, + "step": 5005 + }, + { + "epoch": 1.6717315077642345, + "grad_norm": 0.40235692971912135, + "learning_rate": 4.872725415859363e-06, + "loss": 0.1863, + "step": 5006 + }, + { + "epoch": 1.672065453331107, + "grad_norm": 0.36928962349677913, + "learning_rate": 4.8707827218923385e-06, + "loss": 0.1808, + "step": 5007 + }, + { + "epoch": 1.6723993988979795, + "grad_norm": 0.4562272715321814, + "learning_rate": 4.868840047445106e-06, + "loss": 0.2151, + "step": 5008 + }, + { + "epoch": 1.6727333444648522, + "grad_norm": 0.43138486747940097, + "learning_rate": 4.866897392811127e-06, + "loss": 0.213, + "step": 5009 + }, + { + "epoch": 1.673067290031725, + "grad_norm": 0.39105820437363875, + "learning_rate": 4.864954758283865e-06, + "loss": 0.1844, + "step": 5010 + }, + { + "epoch": 1.6734012355985974, + "grad_norm": 0.39624876568230644, + "learning_rate": 4.8630121441567755e-06, + "loss": 0.1976, + "step": 5011 + }, + { + "epoch": 1.67373518116547, + "grad_norm": 0.38463611162536465, + "learning_rate": 4.861069550723316e-06, + "loss": 0.1955, + "step": 5012 + }, + { + "epoch": 1.6740691267323426, + "grad_norm": 0.4218274037026937, + "learning_rate": 4.859126978276937e-06, + "loss": 0.2063, + "step": 5013 + }, + { + "epoch": 1.6744030722992154, + "grad_norm": 0.4055270643811191, + "learning_rate": 4.857184427111086e-06, + "loss": 0.2086, + "step": 5014 + }, + { + "epoch": 1.6747370178660879, + "grad_norm": 0.3660260268828606, + "learning_rate": 4.855241897519209e-06, + "loss": 0.1744, + "step": 5015 + }, + { + "epoch": 1.6750709634329604, + "grad_norm": 0.42399611512123225, + "learning_rate": 4.8532993897947464e-06, + "loss": 0.1971, + "step": 5016 + }, + { + "epoch": 1.675404908999833, + "grad_norm": 0.4052742336770543, + "learning_rate": 4.851356904231139e-06, + "loss": 0.1851, + "step": 5017 + }, + { + "epoch": 1.6757388545667058, + "grad_norm": 0.4318706621172862, + "learning_rate": 4.849414441121823e-06, + "loss": 0.2056, + "step": 5018 + }, + { + "epoch": 1.676072800133578, + "grad_norm": 0.4183164683620372, + "learning_rate": 4.847472000760228e-06, + "loss": 0.2033, + "step": 5019 + }, + { + "epoch": 1.6764067457004508, + "grad_norm": 0.41452196588285173, + "learning_rate": 4.845529583439783e-06, + "loss": 0.1937, + "step": 5020 + }, + { + "epoch": 1.6767406912673235, + "grad_norm": 0.44161534649268036, + "learning_rate": 4.843587189453914e-06, + "loss": 0.2104, + "step": 5021 + }, + { + "epoch": 1.677074636834196, + "grad_norm": 0.41291953906054923, + "learning_rate": 4.84164481909604e-06, + "loss": 0.1964, + "step": 5022 + }, + { + "epoch": 1.6774085824010685, + "grad_norm": 0.4107451131477934, + "learning_rate": 4.839702472659581e-06, + "loss": 0.1848, + "step": 5023 + }, + { + "epoch": 1.6777425279679412, + "grad_norm": 0.40593954437157626, + "learning_rate": 4.837760150437952e-06, + "loss": 0.1927, + "step": 5024 + }, + { + "epoch": 1.678076473534814, + "grad_norm": 0.41888894475916594, + "learning_rate": 4.8358178527245625e-06, + "loss": 0.2006, + "step": 5025 + }, + { + "epoch": 1.6784104191016864, + "grad_norm": 0.43080741370800135, + "learning_rate": 4.83387557981282e-06, + "loss": 0.196, + "step": 5026 + }, + { + "epoch": 1.678744364668559, + "grad_norm": 0.40951343834625403, + "learning_rate": 4.831933331996126e-06, + "loss": 0.1998, + "step": 5027 + }, + { + "epoch": 1.6790783102354316, + "grad_norm": 0.3875819486249406, + "learning_rate": 4.8299911095678816e-06, + "loss": 0.1833, + "step": 5028 + }, + { + "epoch": 1.6794122558023044, + "grad_norm": 0.4300987227755639, + "learning_rate": 4.82804891282148e-06, + "loss": 0.2069, + "step": 5029 + }, + { + "epoch": 1.6797462013691769, + "grad_norm": 0.37297544793032655, + "learning_rate": 4.8261067420503175e-06, + "loss": 0.188, + "step": 5030 + }, + { + "epoch": 1.6800801469360493, + "grad_norm": 0.39096748045381796, + "learning_rate": 4.8241645975477785e-06, + "loss": 0.1835, + "step": 5031 + }, + { + "epoch": 1.680414092502922, + "grad_norm": 0.3996077225322656, + "learning_rate": 4.822222479607247e-06, + "loss": 0.1943, + "step": 5032 + }, + { + "epoch": 1.6807480380697948, + "grad_norm": 0.4304762784482779, + "learning_rate": 4.820280388522104e-06, + "loss": 0.1948, + "step": 5033 + }, + { + "epoch": 1.6810819836366673, + "grad_norm": 0.3805492053734514, + "learning_rate": 4.818338324585725e-06, + "loss": 0.1838, + "step": 5034 + }, + { + "epoch": 1.6814159292035398, + "grad_norm": 0.37585210563270655, + "learning_rate": 4.816396288091478e-06, + "loss": 0.1854, + "step": 5035 + }, + { + "epoch": 1.6817498747704125, + "grad_norm": 0.42351245196254, + "learning_rate": 4.814454279332737e-06, + "loss": 0.2025, + "step": 5036 + }, + { + "epoch": 1.682083820337285, + "grad_norm": 0.41695687567784445, + "learning_rate": 4.81251229860286e-06, + "loss": 0.193, + "step": 5037 + }, + { + "epoch": 1.6824177659041575, + "grad_norm": 0.537261725808297, + "learning_rate": 4.810570346195207e-06, + "loss": 0.2114, + "step": 5038 + }, + { + "epoch": 1.6827517114710302, + "grad_norm": 0.4113025086849026, + "learning_rate": 4.808628422403135e-06, + "loss": 0.1948, + "step": 5039 + }, + { + "epoch": 1.683085657037903, + "grad_norm": 0.4076342773958272, + "learning_rate": 4.806686527519994e-06, + "loss": 0.1986, + "step": 5040 + }, + { + "epoch": 1.6834196026047754, + "grad_norm": 0.41025435360946494, + "learning_rate": 4.804744661839128e-06, + "loss": 0.2066, + "step": 5041 + }, + { + "epoch": 1.683753548171648, + "grad_norm": 0.3636218816609904, + "learning_rate": 4.80280282565388e-06, + "loss": 0.1872, + "step": 5042 + }, + { + "epoch": 1.6840874937385206, + "grad_norm": 0.3950953132425706, + "learning_rate": 4.800861019257587e-06, + "loss": 0.1984, + "step": 5043 + }, + { + "epoch": 1.6844214393053933, + "grad_norm": 0.42371769285822725, + "learning_rate": 4.798919242943583e-06, + "loss": 0.2037, + "step": 5044 + }, + { + "epoch": 1.6847553848722658, + "grad_norm": 0.392433056196255, + "learning_rate": 4.796977497005194e-06, + "loss": 0.1871, + "step": 5045 + }, + { + "epoch": 1.6850893304391383, + "grad_norm": 0.46155417346154815, + "learning_rate": 4.795035781735743e-06, + "loss": 0.1978, + "step": 5046 + }, + { + "epoch": 1.685423276006011, + "grad_norm": 0.440921144287475, + "learning_rate": 4.793094097428552e-06, + "loss": 0.2139, + "step": 5047 + }, + { + "epoch": 1.6857572215728838, + "grad_norm": 0.3794880760617637, + "learning_rate": 4.7911524443769346e-06, + "loss": 0.1911, + "step": 5048 + }, + { + "epoch": 1.6860911671397563, + "grad_norm": 0.395057876244426, + "learning_rate": 4.789210822874199e-06, + "loss": 0.1938, + "step": 5049 + }, + { + "epoch": 1.6864251127066288, + "grad_norm": 0.3775844552491621, + "learning_rate": 4.787269233213651e-06, + "loss": 0.188, + "step": 5050 + }, + { + "epoch": 1.6867590582735015, + "grad_norm": 0.3603621487839833, + "learning_rate": 4.785327675688591e-06, + "loss": 0.1885, + "step": 5051 + }, + { + "epoch": 1.687093003840374, + "grad_norm": 0.4027004483451677, + "learning_rate": 4.7833861505923096e-06, + "loss": 0.1893, + "step": 5052 + }, + { + "epoch": 1.6874269494072465, + "grad_norm": 0.4765942958527092, + "learning_rate": 4.781444658218103e-06, + "loss": 0.1933, + "step": 5053 + }, + { + "epoch": 1.6877608949741192, + "grad_norm": 0.41479948198174693, + "learning_rate": 4.779503198859255e-06, + "loss": 0.1885, + "step": 5054 + }, + { + "epoch": 1.688094840540992, + "grad_norm": 0.39411292837696527, + "learning_rate": 4.777561772809045e-06, + "loss": 0.1867, + "step": 5055 + }, + { + "epoch": 1.6884287861078644, + "grad_norm": 0.3932933436171054, + "learning_rate": 4.775620380360747e-06, + "loss": 0.1996, + "step": 5056 + }, + { + "epoch": 1.688762731674737, + "grad_norm": 0.5515039821445362, + "learning_rate": 4.773679021807634e-06, + "loss": 0.2012, + "step": 5057 + }, + { + "epoch": 1.6890966772416096, + "grad_norm": 0.4175879612749353, + "learning_rate": 4.771737697442968e-06, + "loss": 0.1885, + "step": 5058 + }, + { + "epoch": 1.6894306228084823, + "grad_norm": 0.4073593770421272, + "learning_rate": 4.7697964075600114e-06, + "loss": 0.1963, + "step": 5059 + }, + { + "epoch": 1.6897645683753548, + "grad_norm": 0.41602326668910145, + "learning_rate": 4.767855152452019e-06, + "loss": 0.1945, + "step": 5060 + }, + { + "epoch": 1.6900985139422273, + "grad_norm": 0.444598225558259, + "learning_rate": 4.765913932412237e-06, + "loss": 0.2069, + "step": 5061 + }, + { + "epoch": 1.6904324595091, + "grad_norm": 0.383752685178437, + "learning_rate": 4.763972747733913e-06, + "loss": 0.1897, + "step": 5062 + }, + { + "epoch": 1.6907664050759728, + "grad_norm": 0.42119196388224206, + "learning_rate": 4.762031598710285e-06, + "loss": 0.1927, + "step": 5063 + }, + { + "epoch": 1.6911003506428453, + "grad_norm": 0.4190290794400875, + "learning_rate": 4.760090485634584e-06, + "loss": 0.1973, + "step": 5064 + }, + { + "epoch": 1.6914342962097177, + "grad_norm": 0.423228700372625, + "learning_rate": 4.758149408800042e-06, + "loss": 0.2059, + "step": 5065 + }, + { + "epoch": 1.6917682417765905, + "grad_norm": 0.3900317630440837, + "learning_rate": 4.756208368499879e-06, + "loss": 0.1885, + "step": 5066 + }, + { + "epoch": 1.692102187343463, + "grad_norm": 0.412549195493742, + "learning_rate": 4.754267365027314e-06, + "loss": 0.1965, + "step": 5067 + }, + { + "epoch": 1.6924361329103355, + "grad_norm": 0.3953914528356778, + "learning_rate": 4.752326398675555e-06, + "loss": 0.2009, + "step": 5068 + }, + { + "epoch": 1.6927700784772082, + "grad_norm": 0.42031070024553674, + "learning_rate": 4.750385469737811e-06, + "loss": 0.1878, + "step": 5069 + }, + { + "epoch": 1.693104024044081, + "grad_norm": 0.4127570219253499, + "learning_rate": 4.748444578507278e-06, + "loss": 0.207, + "step": 5070 + }, + { + "epoch": 1.6934379696109534, + "grad_norm": 0.41827513152551066, + "learning_rate": 4.746503725277156e-06, + "loss": 0.2002, + "step": 5071 + }, + { + "epoch": 1.6937719151778259, + "grad_norm": 0.48196867703987134, + "learning_rate": 4.744562910340631e-06, + "loss": 0.2071, + "step": 5072 + }, + { + "epoch": 1.6941058607446986, + "grad_norm": 0.38681708500322204, + "learning_rate": 4.742622133990885e-06, + "loss": 0.185, + "step": 5073 + }, + { + "epoch": 1.6944398063115713, + "grad_norm": 0.4062271054342476, + "learning_rate": 4.740681396521097e-06, + "loss": 0.1876, + "step": 5074 + }, + { + "epoch": 1.6947737518784438, + "grad_norm": 0.516392208571396, + "learning_rate": 4.738740698224438e-06, + "loss": 0.2053, + "step": 5075 + }, + { + "epoch": 1.6951076974453163, + "grad_norm": 0.4361598083182339, + "learning_rate": 4.73680003939407e-06, + "loss": 0.2121, + "step": 5076 + }, + { + "epoch": 1.695441643012189, + "grad_norm": 0.41160087711576554, + "learning_rate": 4.734859420323158e-06, + "loss": 0.2036, + "step": 5077 + }, + { + "epoch": 1.6957755885790617, + "grad_norm": 0.49946880177831654, + "learning_rate": 4.7329188413048515e-06, + "loss": 0.1999, + "step": 5078 + }, + { + "epoch": 1.6961095341459342, + "grad_norm": 0.3666729685844792, + "learning_rate": 4.7309783026322995e-06, + "loss": 0.1833, + "step": 5079 + }, + { + "epoch": 1.6964434797128067, + "grad_norm": 0.40908472431553156, + "learning_rate": 4.7290378045986425e-06, + "loss": 0.1904, + "step": 5080 + }, + { + "epoch": 1.6967774252796795, + "grad_norm": 0.4479324104960931, + "learning_rate": 4.727097347497014e-06, + "loss": 0.1967, + "step": 5081 + }, + { + "epoch": 1.6971113708465522, + "grad_norm": 0.43959080462229055, + "learning_rate": 4.7251569316205455e-06, + "loss": 0.2051, + "step": 5082 + }, + { + "epoch": 1.6974453164134247, + "grad_norm": 0.38971023600971766, + "learning_rate": 4.723216557262359e-06, + "loss": 0.1886, + "step": 5083 + }, + { + "epoch": 1.6977792619802972, + "grad_norm": 0.4760777142908489, + "learning_rate": 4.721276224715569e-06, + "loss": 0.201, + "step": 5084 + }, + { + "epoch": 1.6981132075471699, + "grad_norm": 0.38860098507630997, + "learning_rate": 4.719335934273289e-06, + "loss": 0.1859, + "step": 5085 + }, + { + "epoch": 1.6984471531140424, + "grad_norm": 0.40103252102303055, + "learning_rate": 4.717395686228621e-06, + "loss": 0.1884, + "step": 5086 + }, + { + "epoch": 1.6987810986809149, + "grad_norm": 0.36902419962259236, + "learning_rate": 4.715455480874661e-06, + "loss": 0.1814, + "step": 5087 + }, + { + "epoch": 1.6991150442477876, + "grad_norm": 0.412125534705266, + "learning_rate": 4.713515318504501e-06, + "loss": 0.1913, + "step": 5088 + }, + { + "epoch": 1.6994489898146603, + "grad_norm": 0.39157748575603113, + "learning_rate": 4.711575199411226e-06, + "loss": 0.1829, + "step": 5089 + }, + { + "epoch": 1.6997829353815328, + "grad_norm": 0.41802116475269446, + "learning_rate": 4.7096351238879135e-06, + "loss": 0.1978, + "step": 5090 + }, + { + "epoch": 1.7001168809484053, + "grad_norm": 0.6279979025769422, + "learning_rate": 4.707695092227634e-06, + "loss": 0.1937, + "step": 5091 + }, + { + "epoch": 1.700450826515278, + "grad_norm": 0.4114760298428927, + "learning_rate": 4.705755104723453e-06, + "loss": 0.199, + "step": 5092 + }, + { + "epoch": 1.7007847720821507, + "grad_norm": 0.4395547405428648, + "learning_rate": 4.703815161668426e-06, + "loss": 0.1931, + "step": 5093 + }, + { + "epoch": 1.7011187176490232, + "grad_norm": 0.41225308183331766, + "learning_rate": 4.701875263355608e-06, + "loss": 0.191, + "step": 5094 + }, + { + "epoch": 1.7014526632158957, + "grad_norm": 0.3958758087990041, + "learning_rate": 4.699935410078042e-06, + "loss": 0.1941, + "step": 5095 + }, + { + "epoch": 1.7017866087827684, + "grad_norm": 0.41315215933016763, + "learning_rate": 4.697995602128766e-06, + "loss": 0.1947, + "step": 5096 + }, + { + "epoch": 1.7021205543496412, + "grad_norm": 0.3955180863149731, + "learning_rate": 4.696055839800809e-06, + "loss": 0.1991, + "step": 5097 + }, + { + "epoch": 1.7024544999165137, + "grad_norm": 0.43154636895066495, + "learning_rate": 4.694116123387197e-06, + "loss": 0.2104, + "step": 5098 + }, + { + "epoch": 1.7027884454833861, + "grad_norm": 0.4007233166701319, + "learning_rate": 4.692176453180944e-06, + "loss": 0.1902, + "step": 5099 + }, + { + "epoch": 1.7031223910502589, + "grad_norm": 0.40361257750068574, + "learning_rate": 4.6902368294750644e-06, + "loss": 0.2028, + "step": 5100 + }, + { + "epoch": 1.7034563366171314, + "grad_norm": 0.40699361291263014, + "learning_rate": 4.688297252562559e-06, + "loss": 0.1989, + "step": 5101 + }, + { + "epoch": 1.7037902821840039, + "grad_norm": 0.4079101725879806, + "learning_rate": 4.6863577227364235e-06, + "loss": 0.1909, + "step": 5102 + }, + { + "epoch": 1.7041242277508766, + "grad_norm": 0.38587148305952435, + "learning_rate": 4.684418240289648e-06, + "loss": 0.1861, + "step": 5103 + }, + { + "epoch": 1.7044581733177493, + "grad_norm": 0.4625478571203875, + "learning_rate": 4.682478805515212e-06, + "loss": 0.202, + "step": 5104 + }, + { + "epoch": 1.7047921188846218, + "grad_norm": 0.4358990786312495, + "learning_rate": 4.680539418706091e-06, + "loss": 0.1927, + "step": 5105 + }, + { + "epoch": 1.7051260644514943, + "grad_norm": 0.41133770169381584, + "learning_rate": 4.678600080155252e-06, + "loss": 0.1775, + "step": 5106 + }, + { + "epoch": 1.705460010018367, + "grad_norm": 0.3962294559870226, + "learning_rate": 4.676660790155656e-06, + "loss": 0.1981, + "step": 5107 + }, + { + "epoch": 1.7057939555852397, + "grad_norm": 0.40931109170500446, + "learning_rate": 4.674721549000255e-06, + "loss": 0.2042, + "step": 5108 + }, + { + "epoch": 1.7061279011521122, + "grad_norm": 0.4107315070526611, + "learning_rate": 4.6727823569819944e-06, + "loss": 0.1878, + "step": 5109 + }, + { + "epoch": 1.7064618467189847, + "grad_norm": 0.3752388358432499, + "learning_rate": 4.670843214393811e-06, + "loss": 0.1813, + "step": 5110 + }, + { + "epoch": 1.7067957922858574, + "grad_norm": 0.43157554986961477, + "learning_rate": 4.6689041215286344e-06, + "loss": 0.2003, + "step": 5111 + }, + { + "epoch": 1.7071297378527301, + "grad_norm": 0.42486623254964884, + "learning_rate": 4.666965078679391e-06, + "loss": 0.1971, + "step": 5112 + }, + { + "epoch": 1.7074636834196026, + "grad_norm": 0.405339605614925, + "learning_rate": 4.665026086138993e-06, + "loss": 0.2053, + "step": 5113 + }, + { + "epoch": 1.7077976289864751, + "grad_norm": 0.4002710266542765, + "learning_rate": 4.66308714420035e-06, + "loss": 0.186, + "step": 5114 + }, + { + "epoch": 1.7081315745533479, + "grad_norm": 0.4126660129106094, + "learning_rate": 4.6611482531563595e-06, + "loss": 0.1961, + "step": 5115 + }, + { + "epoch": 1.7084655201202203, + "grad_norm": 0.45263061369654795, + "learning_rate": 4.659209413299916e-06, + "loss": 0.2105, + "step": 5116 + }, + { + "epoch": 1.7087994656870928, + "grad_norm": 0.39704063413819723, + "learning_rate": 4.657270624923901e-06, + "loss": 0.1866, + "step": 5117 + }, + { + "epoch": 1.7091334112539656, + "grad_norm": 0.456588695870717, + "learning_rate": 4.6553318883211955e-06, + "loss": 0.1986, + "step": 5118 + }, + { + "epoch": 1.7094673568208383, + "grad_norm": 0.4056701874158402, + "learning_rate": 4.653393203784667e-06, + "loss": 0.1937, + "step": 5119 + }, + { + "epoch": 1.7098013023877108, + "grad_norm": 0.37801865981225746, + "learning_rate": 4.651454571607176e-06, + "loss": 0.1828, + "step": 5120 + }, + { + "epoch": 1.7101352479545833, + "grad_norm": 0.4340941520725217, + "learning_rate": 4.649515992081576e-06, + "loss": 0.1945, + "step": 5121 + }, + { + "epoch": 1.710469193521456, + "grad_norm": 0.41411971112825, + "learning_rate": 4.64757746550071e-06, + "loss": 0.2097, + "step": 5122 + }, + { + "epoch": 1.7108031390883287, + "grad_norm": 0.4212025495479919, + "learning_rate": 4.645638992157419e-06, + "loss": 0.1967, + "step": 5123 + }, + { + "epoch": 1.7111370846552012, + "grad_norm": 0.40864256762488815, + "learning_rate": 4.6437005723445316e-06, + "loss": 0.1974, + "step": 5124 + }, + { + "epoch": 1.7114710302220737, + "grad_norm": 0.4028769432832656, + "learning_rate": 4.6417622063548675e-06, + "loss": 0.1991, + "step": 5125 + }, + { + "epoch": 1.7118049757889464, + "grad_norm": 0.6276560821172299, + "learning_rate": 4.6398238944812414e-06, + "loss": 0.1923, + "step": 5126 + }, + { + "epoch": 1.7121389213558191, + "grad_norm": 0.4154958288569983, + "learning_rate": 4.637885637016456e-06, + "loss": 0.1945, + "step": 5127 + }, + { + "epoch": 1.7124728669226916, + "grad_norm": 0.3784544159799719, + "learning_rate": 4.635947434253308e-06, + "loss": 0.174, + "step": 5128 + }, + { + "epoch": 1.7128068124895641, + "grad_norm": 0.4121407590389507, + "learning_rate": 4.634009286484586e-06, + "loss": 0.195, + "step": 5129 + }, + { + "epoch": 1.7131407580564368, + "grad_norm": 0.3936720216642308, + "learning_rate": 4.632071194003073e-06, + "loss": 0.1804, + "step": 5130 + }, + { + "epoch": 1.7134747036233096, + "grad_norm": 0.41086226040341123, + "learning_rate": 4.630133157101537e-06, + "loss": 0.1921, + "step": 5131 + }, + { + "epoch": 1.713808649190182, + "grad_norm": 0.3884513041579706, + "learning_rate": 4.6281951760727435e-06, + "loss": 0.1786, + "step": 5132 + }, + { + "epoch": 1.7141425947570545, + "grad_norm": 0.39582994223607226, + "learning_rate": 4.626257251209446e-06, + "loss": 0.1835, + "step": 5133 + }, + { + "epoch": 1.7144765403239273, + "grad_norm": 0.5027499487784188, + "learning_rate": 4.624319382804391e-06, + "loss": 0.205, + "step": 5134 + }, + { + "epoch": 1.7148104858907998, + "grad_norm": 0.3865108788544779, + "learning_rate": 4.622381571150317e-06, + "loss": 0.1791, + "step": 5135 + }, + { + "epoch": 1.7151444314576723, + "grad_norm": 0.41736826198301175, + "learning_rate": 4.620443816539954e-06, + "loss": 0.1845, + "step": 5136 + }, + { + "epoch": 1.715478377024545, + "grad_norm": 0.4076416332201311, + "learning_rate": 4.618506119266021e-06, + "loss": 0.2023, + "step": 5137 + }, + { + "epoch": 1.7158123225914177, + "grad_norm": 0.3940577448952557, + "learning_rate": 4.6165684796212306e-06, + "loss": 0.1971, + "step": 5138 + }, + { + "epoch": 1.7161462681582902, + "grad_norm": 0.6699527350793285, + "learning_rate": 4.6146308978982865e-06, + "loss": 0.2015, + "step": 5139 + }, + { + "epoch": 1.7164802137251627, + "grad_norm": 0.41792420253823254, + "learning_rate": 4.612693374389881e-06, + "loss": 0.1996, + "step": 5140 + }, + { + "epoch": 1.7168141592920354, + "grad_norm": 0.486254656967708, + "learning_rate": 4.610755909388703e-06, + "loss": 0.2017, + "step": 5141 + }, + { + "epoch": 1.7171481048589081, + "grad_norm": 0.43032663564827006, + "learning_rate": 4.608818503187428e-06, + "loss": 0.2077, + "step": 5142 + }, + { + "epoch": 1.7174820504257806, + "grad_norm": 0.4197636003363948, + "learning_rate": 4.606881156078725e-06, + "loss": 0.2063, + "step": 5143 + }, + { + "epoch": 1.717815995992653, + "grad_norm": 0.41927575757137314, + "learning_rate": 4.604943868355251e-06, + "loss": 0.1971, + "step": 5144 + }, + { + "epoch": 1.7181499415595258, + "grad_norm": 0.4364890921897206, + "learning_rate": 4.603006640309658e-06, + "loss": 0.1992, + "step": 5145 + }, + { + "epoch": 1.7184838871263985, + "grad_norm": 0.40568220551742395, + "learning_rate": 4.601069472234584e-06, + "loss": 0.2016, + "step": 5146 + }, + { + "epoch": 1.718817832693271, + "grad_norm": 0.41925099662175647, + "learning_rate": 4.599132364422666e-06, + "loss": 0.2029, + "step": 5147 + }, + { + "epoch": 1.7191517782601435, + "grad_norm": 0.4064241820068898, + "learning_rate": 4.597195317166525e-06, + "loss": 0.1941, + "step": 5148 + }, + { + "epoch": 1.7194857238270163, + "grad_norm": 0.40433828939310057, + "learning_rate": 4.595258330758773e-06, + "loss": 0.1928, + "step": 5149 + }, + { + "epoch": 1.7198196693938888, + "grad_norm": 0.42000296483673916, + "learning_rate": 4.593321405492017e-06, + "loss": 0.1987, + "step": 5150 + }, + { + "epoch": 1.7201536149607612, + "grad_norm": 0.4031145987386355, + "learning_rate": 4.59138454165885e-06, + "loss": 0.1867, + "step": 5151 + }, + { + "epoch": 1.720487560527634, + "grad_norm": 0.3927642852721298, + "learning_rate": 4.589447739551857e-06, + "loss": 0.1886, + "step": 5152 + }, + { + "epoch": 1.7208215060945067, + "grad_norm": 0.4365198371307918, + "learning_rate": 4.58751099946362e-06, + "loss": 0.2067, + "step": 5153 + }, + { + "epoch": 1.7211554516613792, + "grad_norm": 0.47754776245942315, + "learning_rate": 4.585574321686704e-06, + "loss": 0.2044, + "step": 5154 + }, + { + "epoch": 1.7214893972282517, + "grad_norm": 0.41403702099913486, + "learning_rate": 4.583637706513665e-06, + "loss": 0.2045, + "step": 5155 + }, + { + "epoch": 1.7218233427951244, + "grad_norm": 0.4129816326090922, + "learning_rate": 4.5817011542370535e-06, + "loss": 0.1912, + "step": 5156 + }, + { + "epoch": 1.722157288361997, + "grad_norm": 0.42004603602811036, + "learning_rate": 4.579764665149409e-06, + "loss": 0.2005, + "step": 5157 + }, + { + "epoch": 1.7224912339288696, + "grad_norm": 0.4321750576374955, + "learning_rate": 4.577828239543257e-06, + "loss": 0.1997, + "step": 5158 + }, + { + "epoch": 1.722825179495742, + "grad_norm": 0.405430374187034, + "learning_rate": 4.575891877711123e-06, + "loss": 0.2003, + "step": 5159 + }, + { + "epoch": 1.7231591250626148, + "grad_norm": 0.3853641882041907, + "learning_rate": 4.573955579945514e-06, + "loss": 0.1809, + "step": 5160 + }, + { + "epoch": 1.7234930706294875, + "grad_norm": 0.39519841050530224, + "learning_rate": 4.572019346538931e-06, + "loss": 0.1826, + "step": 5161 + }, + { + "epoch": 1.72382701619636, + "grad_norm": 0.39830083588979237, + "learning_rate": 4.570083177783865e-06, + "loss": 0.1985, + "step": 5162 + }, + { + "epoch": 1.7241609617632325, + "grad_norm": 0.3912144272420254, + "learning_rate": 4.568147073972795e-06, + "loss": 0.1791, + "step": 5163 + }, + { + "epoch": 1.7244949073301052, + "grad_norm": 0.438731828398516, + "learning_rate": 4.566211035398196e-06, + "loss": 0.2143, + "step": 5164 + }, + { + "epoch": 1.7248288528969777, + "grad_norm": 0.38952165500853425, + "learning_rate": 4.564275062352529e-06, + "loss": 0.1898, + "step": 5165 + }, + { + "epoch": 1.7251627984638502, + "grad_norm": 0.46702184416618503, + "learning_rate": 4.5623391551282435e-06, + "loss": 0.2063, + "step": 5166 + }, + { + "epoch": 1.725496744030723, + "grad_norm": 0.3862830352258704, + "learning_rate": 4.560403314017782e-06, + "loss": 0.1919, + "step": 5167 + }, + { + "epoch": 1.7258306895975957, + "grad_norm": 0.3939486702748393, + "learning_rate": 4.558467539313576e-06, + "loss": 0.185, + "step": 5168 + }, + { + "epoch": 1.7261646351644682, + "grad_norm": 0.4475298224185486, + "learning_rate": 4.556531831308045e-06, + "loss": 0.2084, + "step": 5169 + }, + { + "epoch": 1.7264985807313407, + "grad_norm": 0.3740950936131439, + "learning_rate": 4.554596190293606e-06, + "loss": 0.1861, + "step": 5170 + }, + { + "epoch": 1.7268325262982134, + "grad_norm": 0.4279837775356034, + "learning_rate": 4.552660616562655e-06, + "loss": 0.1956, + "step": 5171 + }, + { + "epoch": 1.727166471865086, + "grad_norm": 0.38629605489548957, + "learning_rate": 4.550725110407586e-06, + "loss": 0.1882, + "step": 5172 + }, + { + "epoch": 1.7275004174319586, + "grad_norm": 0.40903630345175285, + "learning_rate": 4.548789672120779e-06, + "loss": 0.1947, + "step": 5173 + }, + { + "epoch": 1.727834362998831, + "grad_norm": 0.40754681344598553, + "learning_rate": 4.5468543019946045e-06, + "loss": 0.1895, + "step": 5174 + }, + { + "epoch": 1.7281683085657038, + "grad_norm": 0.4154808397286242, + "learning_rate": 4.544919000321421e-06, + "loss": 0.1979, + "step": 5175 + }, + { + "epoch": 1.7285022541325765, + "grad_norm": 0.3927989079761158, + "learning_rate": 4.542983767393584e-06, + "loss": 0.187, + "step": 5176 + }, + { + "epoch": 1.728836199699449, + "grad_norm": 0.4477597515047998, + "learning_rate": 4.541048603503429e-06, + "loss": 0.1982, + "step": 5177 + }, + { + "epoch": 1.7291701452663215, + "grad_norm": 0.4033600052314757, + "learning_rate": 4.539113508943287e-06, + "loss": 0.1842, + "step": 5178 + }, + { + "epoch": 1.7295040908331942, + "grad_norm": 0.38530844939756476, + "learning_rate": 4.537178484005476e-06, + "loss": 0.1906, + "step": 5179 + }, + { + "epoch": 1.729838036400067, + "grad_norm": 0.4345227695242709, + "learning_rate": 4.535243528982305e-06, + "loss": 0.2003, + "step": 5180 + }, + { + "epoch": 1.7301719819669394, + "grad_norm": 0.38337505336535044, + "learning_rate": 4.53330864416607e-06, + "loss": 0.1841, + "step": 5181 + }, + { + "epoch": 1.730505927533812, + "grad_norm": 0.38990039540591903, + "learning_rate": 4.531373829849061e-06, + "loss": 0.1947, + "step": 5182 + }, + { + "epoch": 1.7308398731006847, + "grad_norm": 0.3929342954019198, + "learning_rate": 4.529439086323552e-06, + "loss": 0.1867, + "step": 5183 + }, + { + "epoch": 1.7311738186675572, + "grad_norm": 0.42907331015138545, + "learning_rate": 4.52750441388181e-06, + "loss": 0.2045, + "step": 5184 + }, + { + "epoch": 1.7315077642344296, + "grad_norm": 0.3957492762261916, + "learning_rate": 4.52556981281609e-06, + "loss": 0.1965, + "step": 5185 + }, + { + "epoch": 1.7318417098013024, + "grad_norm": 0.3985478985780018, + "learning_rate": 4.523635283418635e-06, + "loss": 0.1836, + "step": 5186 + }, + { + "epoch": 1.732175655368175, + "grad_norm": 0.4100016585452721, + "learning_rate": 4.521700825981678e-06, + "loss": 0.2057, + "step": 5187 + }, + { + "epoch": 1.7325096009350476, + "grad_norm": 0.38648895139654443, + "learning_rate": 4.519766440797446e-06, + "loss": 0.1998, + "step": 5188 + }, + { + "epoch": 1.73284354650192, + "grad_norm": 0.3950585537509392, + "learning_rate": 4.517832128158147e-06, + "loss": 0.1954, + "step": 5189 + }, + { + "epoch": 1.7331774920687928, + "grad_norm": 0.351655257356069, + "learning_rate": 4.515897888355982e-06, + "loss": 0.1701, + "step": 5190 + }, + { + "epoch": 1.7335114376356655, + "grad_norm": 0.39426784941225645, + "learning_rate": 4.513963721683142e-06, + "loss": 0.1933, + "step": 5191 + }, + { + "epoch": 1.733845383202538, + "grad_norm": 0.4069564879031356, + "learning_rate": 4.5120296284318035e-06, + "loss": 0.1839, + "step": 5192 + }, + { + "epoch": 1.7341793287694105, + "grad_norm": 0.3679036421194138, + "learning_rate": 4.510095608894134e-06, + "loss": 0.1784, + "step": 5193 + }, + { + "epoch": 1.7345132743362832, + "grad_norm": 0.41535645901622537, + "learning_rate": 4.508161663362294e-06, + "loss": 0.2009, + "step": 5194 + }, + { + "epoch": 1.734847219903156, + "grad_norm": 0.4320307397620132, + "learning_rate": 4.506227792128424e-06, + "loss": 0.2062, + "step": 5195 + }, + { + "epoch": 1.7351811654700284, + "grad_norm": 0.42447532292063683, + "learning_rate": 4.504293995484662e-06, + "loss": 0.2063, + "step": 5196 + }, + { + "epoch": 1.735515111036901, + "grad_norm": 0.4315851980720862, + "learning_rate": 4.502360273723127e-06, + "loss": 0.2087, + "step": 5197 + }, + { + "epoch": 1.7358490566037736, + "grad_norm": 0.40956272992571285, + "learning_rate": 4.500426627135933e-06, + "loss": 0.187, + "step": 5198 + }, + { + "epoch": 1.7361830021706461, + "grad_norm": 0.4529666235929845, + "learning_rate": 4.4984930560151776e-06, + "loss": 0.1871, + "step": 5199 + }, + { + "epoch": 1.7365169477375186, + "grad_norm": 0.38754524741812935, + "learning_rate": 4.496559560652952e-06, + "loss": 0.1914, + "step": 5200 + }, + { + "epoch": 1.7368508933043914, + "grad_norm": 0.43465984631819043, + "learning_rate": 4.494626141341334e-06, + "loss": 0.1997, + "step": 5201 + }, + { + "epoch": 1.737184838871264, + "grad_norm": 0.383172124051344, + "learning_rate": 4.4926927983723876e-06, + "loss": 0.1932, + "step": 5202 + }, + { + "epoch": 1.7375187844381366, + "grad_norm": 0.4201088959332936, + "learning_rate": 4.490759532038166e-06, + "loss": 0.1946, + "step": 5203 + }, + { + "epoch": 1.737852730005009, + "grad_norm": 0.4688579896439726, + "learning_rate": 4.488826342630714e-06, + "loss": 0.2246, + "step": 5204 + }, + { + "epoch": 1.7381866755718818, + "grad_norm": 0.4749846175120122, + "learning_rate": 4.486893230442062e-06, + "loss": 0.2024, + "step": 5205 + }, + { + "epoch": 1.7385206211387545, + "grad_norm": 0.38787853638285946, + "learning_rate": 4.4849601957642295e-06, + "loss": 0.186, + "step": 5206 + }, + { + "epoch": 1.738854566705627, + "grad_norm": 0.38594915412595865, + "learning_rate": 4.483027238889223e-06, + "loss": 0.193, + "step": 5207 + }, + { + "epoch": 1.7391885122724995, + "grad_norm": 0.4113339936370685, + "learning_rate": 4.48109436010904e-06, + "loss": 0.1912, + "step": 5208 + }, + { + "epoch": 1.7395224578393722, + "grad_norm": 0.4094567096226062, + "learning_rate": 4.4791615597156635e-06, + "loss": 0.1948, + "step": 5209 + }, + { + "epoch": 1.739856403406245, + "grad_norm": 0.41900250037341225, + "learning_rate": 4.477228838001065e-06, + "loss": 0.1891, + "step": 5210 + }, + { + "epoch": 1.7401903489731174, + "grad_norm": 0.38308421909389073, + "learning_rate": 4.475296195257206e-06, + "loss": 0.1855, + "step": 5211 + }, + { + "epoch": 1.74052429453999, + "grad_norm": 0.38927592333160743, + "learning_rate": 4.4733636317760365e-06, + "loss": 0.1929, + "step": 5212 + }, + { + "epoch": 1.7408582401068626, + "grad_norm": 0.3896513984545134, + "learning_rate": 4.471431147849491e-06, + "loss": 0.1808, + "step": 5213 + }, + { + "epoch": 1.7411921856737351, + "grad_norm": 0.4339601478317658, + "learning_rate": 4.469498743769493e-06, + "loss": 0.2101, + "step": 5214 + }, + { + "epoch": 1.7415261312406076, + "grad_norm": 0.4446663564844156, + "learning_rate": 4.467566419827958e-06, + "loss": 0.1946, + "step": 5215 + }, + { + "epoch": 1.7418600768074803, + "grad_norm": 0.42455586753823066, + "learning_rate": 4.465634176316782e-06, + "loss": 0.1935, + "step": 5216 + }, + { + "epoch": 1.742194022374353, + "grad_norm": 0.39397487594186725, + "learning_rate": 4.463702013527857e-06, + "loss": 0.1798, + "step": 5217 + }, + { + "epoch": 1.7425279679412256, + "grad_norm": 0.44564863513382924, + "learning_rate": 4.4617699317530585e-06, + "loss": 0.209, + "step": 5218 + }, + { + "epoch": 1.742861913508098, + "grad_norm": 0.3815517649159005, + "learning_rate": 4.459837931284249e-06, + "loss": 0.1835, + "step": 5219 + }, + { + "epoch": 1.7431958590749708, + "grad_norm": 0.37966203750616717, + "learning_rate": 4.45790601241328e-06, + "loss": 0.1842, + "step": 5220 + }, + { + "epoch": 1.7435298046418435, + "grad_norm": 0.3877934659273845, + "learning_rate": 4.45597417543199e-06, + "loss": 0.1852, + "step": 5221 + }, + { + "epoch": 1.743863750208716, + "grad_norm": 0.41155888783147637, + "learning_rate": 4.454042420632206e-06, + "loss": 0.1863, + "step": 5222 + }, + { + "epoch": 1.7441976957755885, + "grad_norm": 0.405549495330632, + "learning_rate": 4.452110748305744e-06, + "loss": 0.1925, + "step": 5223 + }, + { + "epoch": 1.7445316413424612, + "grad_norm": 0.37776272889626006, + "learning_rate": 4.450179158744405e-06, + "loss": 0.1897, + "step": 5224 + }, + { + "epoch": 1.744865586909334, + "grad_norm": 0.41244283018850775, + "learning_rate": 4.448247652239978e-06, + "loss": 0.1966, + "step": 5225 + }, + { + "epoch": 1.7451995324762064, + "grad_norm": 0.37070906739068943, + "learning_rate": 4.4463162290842395e-06, + "loss": 0.185, + "step": 5226 + }, + { + "epoch": 1.745533478043079, + "grad_norm": 0.38778365798313136, + "learning_rate": 4.444384889568954e-06, + "loss": 0.1885, + "step": 5227 + }, + { + "epoch": 1.7458674236099516, + "grad_norm": 0.42132640207261324, + "learning_rate": 4.442453633985872e-06, + "loss": 0.1996, + "step": 5228 + }, + { + "epoch": 1.7462013691768243, + "grad_norm": 0.4647696066279634, + "learning_rate": 4.4405224626267345e-06, + "loss": 0.2121, + "step": 5229 + }, + { + "epoch": 1.7465353147436968, + "grad_norm": 0.40648419947551384, + "learning_rate": 4.438591375783267e-06, + "loss": 0.1898, + "step": 5230 + }, + { + "epoch": 1.7468692603105693, + "grad_norm": 0.48398101512288694, + "learning_rate": 4.4366603737471825e-06, + "loss": 0.2041, + "step": 5231 + }, + { + "epoch": 1.747203205877442, + "grad_norm": 0.3757616353194717, + "learning_rate": 4.434729456810182e-06, + "loss": 0.1725, + "step": 5232 + }, + { + "epoch": 1.7475371514443145, + "grad_norm": 0.439178547747858, + "learning_rate": 4.432798625263951e-06, + "loss": 0.2103, + "step": 5233 + }, + { + "epoch": 1.747871097011187, + "grad_norm": 0.3795864733687818, + "learning_rate": 4.430867879400167e-06, + "loss": 0.1806, + "step": 5234 + }, + { + "epoch": 1.7482050425780598, + "grad_norm": 0.41932040849791336, + "learning_rate": 4.428937219510491e-06, + "loss": 0.1955, + "step": 5235 + }, + { + "epoch": 1.7485389881449325, + "grad_norm": 0.4871060941331514, + "learning_rate": 4.427006645886573e-06, + "loss": 0.1906, + "step": 5236 + }, + { + "epoch": 1.748872933711805, + "grad_norm": 0.35978985995864865, + "learning_rate": 4.425076158820048e-06, + "loss": 0.1763, + "step": 5237 + }, + { + "epoch": 1.7492068792786775, + "grad_norm": 0.4361636932415916, + "learning_rate": 4.423145758602538e-06, + "loss": 0.1901, + "step": 5238 + }, + { + "epoch": 1.7495408248455502, + "grad_norm": 0.3897242187992088, + "learning_rate": 4.4212154455256535e-06, + "loss": 0.1854, + "step": 5239 + }, + { + "epoch": 1.749874770412423, + "grad_norm": 0.44303703845267467, + "learning_rate": 4.41928521988099e-06, + "loss": 0.2072, + "step": 5240 + }, + { + "epoch": 1.7502087159792954, + "grad_norm": 0.38297608026027435, + "learning_rate": 4.417355081960133e-06, + "loss": 0.1852, + "step": 5241 + }, + { + "epoch": 1.7505426615461679, + "grad_norm": 0.4035320423312304, + "learning_rate": 4.415425032054651e-06, + "loss": 0.1974, + "step": 5242 + }, + { + "epoch": 1.7508766071130406, + "grad_norm": 0.4211405489724584, + "learning_rate": 4.413495070456101e-06, + "loss": 0.2007, + "step": 5243 + }, + { + "epoch": 1.7512105526799133, + "grad_norm": 0.3643499101190018, + "learning_rate": 4.411565197456027e-06, + "loss": 0.185, + "step": 5244 + }, + { + "epoch": 1.7515444982467858, + "grad_norm": 0.411108277805822, + "learning_rate": 4.409635413345956e-06, + "loss": 0.2048, + "step": 5245 + }, + { + "epoch": 1.7518784438136583, + "grad_norm": 0.4194151678676051, + "learning_rate": 4.40770571841741e-06, + "loss": 0.1999, + "step": 5246 + }, + { + "epoch": 1.752212389380531, + "grad_norm": 0.40680894494308045, + "learning_rate": 4.405776112961889e-06, + "loss": 0.2074, + "step": 5247 + }, + { + "epoch": 1.7525463349474035, + "grad_norm": 0.41101451985867804, + "learning_rate": 4.4038465972708824e-06, + "loss": 0.2035, + "step": 5248 + }, + { + "epoch": 1.752880280514276, + "grad_norm": 0.4094941833906081, + "learning_rate": 4.4019171716358675e-06, + "loss": 0.2125, + "step": 5249 + }, + { + "epoch": 1.7532142260811487, + "grad_norm": 0.47029820090748287, + "learning_rate": 4.399987836348305e-06, + "loss": 0.2013, + "step": 5250 + }, + { + "epoch": 1.7535481716480215, + "grad_norm": 0.436547364677873, + "learning_rate": 4.398058591699645e-06, + "loss": 0.2054, + "step": 5251 + }, + { + "epoch": 1.753882117214894, + "grad_norm": 0.4223790978332405, + "learning_rate": 4.396129437981322e-06, + "loss": 0.2078, + "step": 5252 + }, + { + "epoch": 1.7542160627817664, + "grad_norm": 0.3751825180830145, + "learning_rate": 4.394200375484758e-06, + "loss": 0.1807, + "step": 5253 + }, + { + "epoch": 1.7545500083486392, + "grad_norm": 0.41355637546032376, + "learning_rate": 4.392271404501361e-06, + "loss": 0.1946, + "step": 5254 + }, + { + "epoch": 1.7548839539155119, + "grad_norm": 0.4283757551832914, + "learning_rate": 4.390342525322524e-06, + "loss": 0.1935, + "step": 5255 + }, + { + "epoch": 1.7552178994823844, + "grad_norm": 0.3614554875916809, + "learning_rate": 4.3884137382396255e-06, + "loss": 0.1699, + "step": 5256 + }, + { + "epoch": 1.7555518450492569, + "grad_norm": 0.4100290180136812, + "learning_rate": 4.3864850435440335e-06, + "loss": 0.1949, + "step": 5257 + }, + { + "epoch": 1.7558857906161296, + "grad_norm": 0.4447808535436557, + "learning_rate": 4.3845564415271e-06, + "loss": 0.1973, + "step": 5258 + }, + { + "epoch": 1.7562197361830023, + "grad_norm": 0.38742859585948136, + "learning_rate": 4.382627932480164e-06, + "loss": 0.1864, + "step": 5259 + }, + { + "epoch": 1.7565536817498748, + "grad_norm": 0.4042603667108564, + "learning_rate": 4.380699516694547e-06, + "loss": 0.1927, + "step": 5260 + }, + { + "epoch": 1.7568876273167473, + "grad_norm": 0.5355154040253798, + "learning_rate": 4.37877119446156e-06, + "loss": 0.2128, + "step": 5261 + }, + { + "epoch": 1.75722157288362, + "grad_norm": 0.41543428507131214, + "learning_rate": 4.3768429660725e-06, + "loss": 0.1852, + "step": 5262 + }, + { + "epoch": 1.7575555184504925, + "grad_norm": 0.39428337573752037, + "learning_rate": 4.374914831818643e-06, + "loss": 0.1965, + "step": 5263 + }, + { + "epoch": 1.757889464017365, + "grad_norm": 0.4158653114188019, + "learning_rate": 4.372986791991265e-06, + "loss": 0.1917, + "step": 5264 + }, + { + "epoch": 1.7582234095842377, + "grad_norm": 0.3700405425476926, + "learning_rate": 4.371058846881614e-06, + "loss": 0.1747, + "step": 5265 + }, + { + "epoch": 1.7585573551511104, + "grad_norm": 0.41343759916271333, + "learning_rate": 4.36913099678093e-06, + "loss": 0.2013, + "step": 5266 + }, + { + "epoch": 1.758891300717983, + "grad_norm": 0.42108912822588007, + "learning_rate": 4.367203241980437e-06, + "loss": 0.1909, + "step": 5267 + }, + { + "epoch": 1.7592252462848554, + "grad_norm": 0.3808692844158381, + "learning_rate": 4.3652755827713456e-06, + "loss": 0.1827, + "step": 5268 + }, + { + "epoch": 1.7595591918517282, + "grad_norm": 0.394990213378632, + "learning_rate": 4.363348019444848e-06, + "loss": 0.1926, + "step": 5269 + }, + { + "epoch": 1.7598931374186009, + "grad_norm": 0.38402085666635427, + "learning_rate": 4.361420552292132e-06, + "loss": 0.1904, + "step": 5270 + }, + { + "epoch": 1.7602270829854734, + "grad_norm": 0.34637884058258056, + "learning_rate": 4.35949318160436e-06, + "loss": 0.1812, + "step": 5271 + }, + { + "epoch": 1.7605610285523459, + "grad_norm": 0.3776613811676968, + "learning_rate": 4.357565907672684e-06, + "loss": 0.1903, + "step": 5272 + }, + { + "epoch": 1.7608949741192186, + "grad_norm": 0.4087198495663909, + "learning_rate": 4.355638730788242e-06, + "loss": 0.1876, + "step": 5273 + }, + { + "epoch": 1.7612289196860913, + "grad_norm": 0.38778910053833804, + "learning_rate": 4.353711651242157e-06, + "loss": 0.1813, + "step": 5274 + }, + { + "epoch": 1.7615628652529638, + "grad_norm": 0.37325883749294486, + "learning_rate": 4.3517846693255365e-06, + "loss": 0.1777, + "step": 5275 + }, + { + "epoch": 1.7618968108198363, + "grad_norm": 0.39984608759219054, + "learning_rate": 4.349857785329475e-06, + "loss": 0.1923, + "step": 5276 + }, + { + "epoch": 1.762230756386709, + "grad_norm": 0.3743519628081095, + "learning_rate": 4.34793099954505e-06, + "loss": 0.1841, + "step": 5277 + }, + { + "epoch": 1.7625647019535817, + "grad_norm": 0.3955250843428749, + "learning_rate": 4.3460043122633256e-06, + "loss": 0.1899, + "step": 5278 + }, + { + "epoch": 1.7628986475204542, + "grad_norm": 0.37322793088236206, + "learning_rate": 4.344077723775349e-06, + "loss": 0.1707, + "step": 5279 + }, + { + "epoch": 1.7632325930873267, + "grad_norm": 0.42561651349981755, + "learning_rate": 4.342151234372155e-06, + "loss": 0.2092, + "step": 5280 + }, + { + "epoch": 1.7635665386541994, + "grad_norm": 0.42631843408788267, + "learning_rate": 4.340224844344766e-06, + "loss": 0.2021, + "step": 5281 + }, + { + "epoch": 1.763900484221072, + "grad_norm": 0.40817135112216674, + "learning_rate": 4.338298553984181e-06, + "loss": 0.2068, + "step": 5282 + }, + { + "epoch": 1.7642344297879444, + "grad_norm": 0.39472505278629344, + "learning_rate": 4.336372363581391e-06, + "loss": 0.1853, + "step": 5283 + }, + { + "epoch": 1.7645683753548171, + "grad_norm": 0.4000097483831296, + "learning_rate": 4.33444627342737e-06, + "loss": 0.1919, + "step": 5284 + }, + { + "epoch": 1.7649023209216899, + "grad_norm": 0.42574844971222253, + "learning_rate": 4.332520283813075e-06, + "loss": 0.2056, + "step": 5285 + }, + { + "epoch": 1.7652362664885624, + "grad_norm": 0.4176968980323195, + "learning_rate": 4.330594395029449e-06, + "loss": 0.2053, + "step": 5286 + }, + { + "epoch": 1.7655702120554349, + "grad_norm": 0.41190325772011954, + "learning_rate": 4.328668607367424e-06, + "loss": 0.1903, + "step": 5287 + }, + { + "epoch": 1.7659041576223076, + "grad_norm": 0.4500552732783806, + "learning_rate": 4.326742921117911e-06, + "loss": 0.1977, + "step": 5288 + }, + { + "epoch": 1.7662381031891803, + "grad_norm": 0.451776540064002, + "learning_rate": 4.324817336571806e-06, + "loss": 0.2023, + "step": 5289 + }, + { + "epoch": 1.7665720487560528, + "grad_norm": 0.4188013135031576, + "learning_rate": 4.3228918540199926e-06, + "loss": 0.1844, + "step": 5290 + }, + { + "epoch": 1.7669059943229253, + "grad_norm": 0.3839938487428742, + "learning_rate": 4.320966473753337e-06, + "loss": 0.1842, + "step": 5291 + }, + { + "epoch": 1.767239939889798, + "grad_norm": 0.41400365926676513, + "learning_rate": 4.31904119606269e-06, + "loss": 0.1974, + "step": 5292 + }, + { + "epoch": 1.7675738854566707, + "grad_norm": 0.420903637964543, + "learning_rate": 4.31711602123889e-06, + "loss": 0.1956, + "step": 5293 + }, + { + "epoch": 1.7679078310235432, + "grad_norm": 0.3933732864451317, + "learning_rate": 4.315190949572755e-06, + "loss": 0.181, + "step": 5294 + }, + { + "epoch": 1.7682417765904157, + "grad_norm": 0.4002834826973489, + "learning_rate": 4.313265981355091e-06, + "loss": 0.1897, + "step": 5295 + }, + { + "epoch": 1.7685757221572884, + "grad_norm": 0.4534917662589281, + "learning_rate": 4.311341116876687e-06, + "loss": 0.1912, + "step": 5296 + }, + { + "epoch": 1.768909667724161, + "grad_norm": 0.3953727715420872, + "learning_rate": 4.309416356428315e-06, + "loss": 0.192, + "step": 5297 + }, + { + "epoch": 1.7692436132910334, + "grad_norm": 0.4235110543403725, + "learning_rate": 4.307491700300733e-06, + "loss": 0.2054, + "step": 5298 + }, + { + "epoch": 1.7695775588579061, + "grad_norm": 0.39646132981498244, + "learning_rate": 4.305567148784685e-06, + "loss": 0.195, + "step": 5299 + }, + { + "epoch": 1.7699115044247788, + "grad_norm": 0.373965105972047, + "learning_rate": 4.3036427021708955e-06, + "loss": 0.1876, + "step": 5300 + }, + { + "epoch": 1.7702454499916513, + "grad_norm": 0.3879823479267765, + "learning_rate": 4.301718360750074e-06, + "loss": 0.181, + "step": 5301 + }, + { + "epoch": 1.7705793955585238, + "grad_norm": 0.3861311454658933, + "learning_rate": 4.299794124812918e-06, + "loss": 0.1786, + "step": 5302 + }, + { + "epoch": 1.7709133411253966, + "grad_norm": 0.41958134378619033, + "learning_rate": 4.297869994650103e-06, + "loss": 0.2081, + "step": 5303 + }, + { + "epoch": 1.7712472866922693, + "grad_norm": 0.39575198482226387, + "learning_rate": 4.295945970552293e-06, + "loss": 0.1834, + "step": 5304 + }, + { + "epoch": 1.7715812322591418, + "grad_norm": 0.3988296719287663, + "learning_rate": 4.294022052810134e-06, + "loss": 0.1858, + "step": 5305 + }, + { + "epoch": 1.7719151778260143, + "grad_norm": 0.4327878844010829, + "learning_rate": 4.292098241714256e-06, + "loss": 0.1995, + "step": 5306 + }, + { + "epoch": 1.772249123392887, + "grad_norm": 0.3883451289026303, + "learning_rate": 4.290174537555275e-06, + "loss": 0.1912, + "step": 5307 + }, + { + "epoch": 1.7725830689597597, + "grad_norm": 0.45477611232921417, + "learning_rate": 4.2882509406237885e-06, + "loss": 0.1849, + "step": 5308 + }, + { + "epoch": 1.7729170145266322, + "grad_norm": 0.38385530123430495, + "learning_rate": 4.286327451210377e-06, + "loss": 0.1814, + "step": 5309 + }, + { + "epoch": 1.7732509600935047, + "grad_norm": 0.4346218793838651, + "learning_rate": 4.284404069605605e-06, + "loss": 0.1986, + "step": 5310 + }, + { + "epoch": 1.7735849056603774, + "grad_norm": 0.36896709229326324, + "learning_rate": 4.282480796100027e-06, + "loss": 0.1848, + "step": 5311 + }, + { + "epoch": 1.77391885122725, + "grad_norm": 0.3921476392351792, + "learning_rate": 4.280557630984173e-06, + "loss": 0.1888, + "step": 5312 + }, + { + "epoch": 1.7742527967941224, + "grad_norm": 0.4206459707979227, + "learning_rate": 4.27863457454856e-06, + "loss": 0.2004, + "step": 5313 + }, + { + "epoch": 1.7745867423609951, + "grad_norm": 0.36808858668646643, + "learning_rate": 4.276711627083688e-06, + "loss": 0.177, + "step": 5314 + }, + { + "epoch": 1.7749206879278678, + "grad_norm": 0.4227954793051248, + "learning_rate": 4.274788788880041e-06, + "loss": 0.2021, + "step": 5315 + }, + { + "epoch": 1.7752546334947403, + "grad_norm": 0.4271760624815115, + "learning_rate": 4.272866060228084e-06, + "loss": 0.2035, + "step": 5316 + }, + { + "epoch": 1.7755885790616128, + "grad_norm": 0.38758325418151324, + "learning_rate": 4.270943441418275e-06, + "loss": 0.1882, + "step": 5317 + }, + { + "epoch": 1.7759225246284855, + "grad_norm": 0.38762943385431875, + "learning_rate": 4.2690209327410406e-06, + "loss": 0.1907, + "step": 5318 + }, + { + "epoch": 1.7762564701953583, + "grad_norm": 0.4139981860338518, + "learning_rate": 4.267098534486803e-06, + "loss": 0.2032, + "step": 5319 + }, + { + "epoch": 1.7765904157622308, + "grad_norm": 0.4167718701052845, + "learning_rate": 4.26517624694596e-06, + "loss": 0.1997, + "step": 5320 + }, + { + "epoch": 1.7769243613291033, + "grad_norm": 0.3827532636411078, + "learning_rate": 4.2632540704088975e-06, + "loss": 0.1938, + "step": 5321 + }, + { + "epoch": 1.777258306895976, + "grad_norm": 0.40170330200281723, + "learning_rate": 4.261332005165984e-06, + "loss": 0.1712, + "step": 5322 + }, + { + "epoch": 1.7775922524628487, + "grad_norm": 0.39802116363666706, + "learning_rate": 4.259410051507567e-06, + "loss": 0.1896, + "step": 5323 + }, + { + "epoch": 1.7779261980297212, + "grad_norm": 0.4222010197604959, + "learning_rate": 4.257488209723981e-06, + "loss": 0.207, + "step": 5324 + }, + { + "epoch": 1.7782601435965937, + "grad_norm": 0.40495532187068245, + "learning_rate": 4.255566480105546e-06, + "loss": 0.1886, + "step": 5325 + }, + { + "epoch": 1.7785940891634664, + "grad_norm": 0.4075549108054149, + "learning_rate": 4.2536448629425585e-06, + "loss": 0.1796, + "step": 5326 + }, + { + "epoch": 1.7789280347303391, + "grad_norm": 0.37713030151299365, + "learning_rate": 4.2517233585253024e-06, + "loss": 0.1787, + "step": 5327 + }, + { + "epoch": 1.7792619802972116, + "grad_norm": 0.4343255349139648, + "learning_rate": 4.2498019671440435e-06, + "loss": 0.2084, + "step": 5328 + }, + { + "epoch": 1.779595925864084, + "grad_norm": 0.39420471849926414, + "learning_rate": 4.247880689089033e-06, + "loss": 0.1913, + "step": 5329 + }, + { + "epoch": 1.7799298714309568, + "grad_norm": 0.3672641930743383, + "learning_rate": 4.245959524650498e-06, + "loss": 0.1757, + "step": 5330 + }, + { + "epoch": 1.7802638169978293, + "grad_norm": 0.3904849887413555, + "learning_rate": 4.244038474118656e-06, + "loss": 0.1815, + "step": 5331 + }, + { + "epoch": 1.7805977625647018, + "grad_norm": 0.3761970196437007, + "learning_rate": 4.242117537783704e-06, + "loss": 0.1794, + "step": 5332 + }, + { + "epoch": 1.7809317081315745, + "grad_norm": 0.4138094357366333, + "learning_rate": 4.2401967159358195e-06, + "loss": 0.1933, + "step": 5333 + }, + { + "epoch": 1.7812656536984472, + "grad_norm": 0.4377054814288653, + "learning_rate": 4.2382760088651696e-06, + "loss": 0.2075, + "step": 5334 + }, + { + "epoch": 1.7815995992653197, + "grad_norm": 0.44167950165051495, + "learning_rate": 4.236355416861897e-06, + "loss": 0.2025, + "step": 5335 + }, + { + "epoch": 1.7819335448321922, + "grad_norm": 0.40362351633119864, + "learning_rate": 4.23443494021613e-06, + "loss": 0.192, + "step": 5336 + }, + { + "epoch": 1.782267490399065, + "grad_norm": 0.4000054483623267, + "learning_rate": 4.232514579217981e-06, + "loss": 0.1863, + "step": 5337 + }, + { + "epoch": 1.7826014359659377, + "grad_norm": 0.39822312077850025, + "learning_rate": 4.23059433415754e-06, + "loss": 0.1915, + "step": 5338 + }, + { + "epoch": 1.7829353815328102, + "grad_norm": 0.41333564470960643, + "learning_rate": 4.228674205324884e-06, + "loss": 0.1986, + "step": 5339 + }, + { + "epoch": 1.7832693270996827, + "grad_norm": 0.40541425631144945, + "learning_rate": 4.226754193010072e-06, + "loss": 0.1819, + "step": 5340 + }, + { + "epoch": 1.7836032726665554, + "grad_norm": 0.36616267473432135, + "learning_rate": 4.224834297503145e-06, + "loss": 0.1805, + "step": 5341 + }, + { + "epoch": 1.783937218233428, + "grad_norm": 0.4368241980696301, + "learning_rate": 4.222914519094124e-06, + "loss": 0.2082, + "step": 5342 + }, + { + "epoch": 1.7842711638003006, + "grad_norm": 0.4105369770801003, + "learning_rate": 4.220994858073014e-06, + "loss": 0.194, + "step": 5343 + }, + { + "epoch": 1.784605109367173, + "grad_norm": 0.3959702721339513, + "learning_rate": 4.2190753147298044e-06, + "loss": 0.1945, + "step": 5344 + }, + { + "epoch": 1.7849390549340458, + "grad_norm": 0.47707382396904957, + "learning_rate": 4.2171558893544626e-06, + "loss": 0.1815, + "step": 5345 + }, + { + "epoch": 1.7852730005009183, + "grad_norm": 0.42034793280157745, + "learning_rate": 4.215236582236941e-06, + "loss": 0.1976, + "step": 5346 + }, + { + "epoch": 1.7856069460677908, + "grad_norm": 0.3784248182395078, + "learning_rate": 4.213317393667175e-06, + "loss": 0.191, + "step": 5347 + }, + { + "epoch": 1.7859408916346635, + "grad_norm": 0.3947336613390132, + "learning_rate": 4.211398323935079e-06, + "loss": 0.1984, + "step": 5348 + }, + { + "epoch": 1.7862748372015362, + "grad_norm": 0.4745094890865065, + "learning_rate": 4.209479373330552e-06, + "loss": 0.2146, + "step": 5349 + }, + { + "epoch": 1.7866087827684087, + "grad_norm": 0.4115025463067994, + "learning_rate": 4.207560542143474e-06, + "loss": 0.2033, + "step": 5350 + }, + { + "epoch": 1.7869427283352812, + "grad_norm": 0.41096840682747465, + "learning_rate": 4.205641830663706e-06, + "loss": 0.1969, + "step": 5351 + }, + { + "epoch": 1.787276673902154, + "grad_norm": 0.37721407675737656, + "learning_rate": 4.2037232391810925e-06, + "loss": 0.1796, + "step": 5352 + }, + { + "epoch": 1.7876106194690267, + "grad_norm": 0.3712905961394812, + "learning_rate": 4.20180476798546e-06, + "loss": 0.1831, + "step": 5353 + }, + { + "epoch": 1.7879445650358992, + "grad_norm": 0.41330452273749596, + "learning_rate": 4.1998864173666174e-06, + "loss": 0.2077, + "step": 5354 + }, + { + "epoch": 1.7882785106027717, + "grad_norm": 0.4693570027309341, + "learning_rate": 4.197968187614351e-06, + "loss": 0.2115, + "step": 5355 + }, + { + "epoch": 1.7886124561696444, + "grad_norm": 0.4171457241557042, + "learning_rate": 4.196050079018433e-06, + "loss": 0.1869, + "step": 5356 + }, + { + "epoch": 1.788946401736517, + "grad_norm": 1.0151186082749302, + "learning_rate": 4.194132091868616e-06, + "loss": 0.2071, + "step": 5357 + }, + { + "epoch": 1.7892803473033896, + "grad_norm": 0.4187428989067555, + "learning_rate": 4.1922142264546365e-06, + "loss": 0.19, + "step": 5358 + }, + { + "epoch": 1.789614292870262, + "grad_norm": 0.3836619002670058, + "learning_rate": 4.1902964830662104e-06, + "loss": 0.1924, + "step": 5359 + }, + { + "epoch": 1.7899482384371348, + "grad_norm": 0.38919854186049957, + "learning_rate": 4.188378861993034e-06, + "loss": 0.1878, + "step": 5360 + }, + { + "epoch": 1.7902821840040073, + "grad_norm": 0.39768862778262004, + "learning_rate": 4.186461363524786e-06, + "loss": 0.198, + "step": 5361 + }, + { + "epoch": 1.7906161295708798, + "grad_norm": 0.39782728299988296, + "learning_rate": 4.184543987951127e-06, + "loss": 0.1801, + "step": 5362 + }, + { + "epoch": 1.7909500751377525, + "grad_norm": 0.495707344907115, + "learning_rate": 4.182626735561703e-06, + "loss": 0.181, + "step": 5363 + }, + { + "epoch": 1.7912840207046252, + "grad_norm": 0.44114708561246135, + "learning_rate": 4.180709606646134e-06, + "loss": 0.2039, + "step": 5364 + }, + { + "epoch": 1.7916179662714977, + "grad_norm": 0.3927854303581158, + "learning_rate": 4.178792601494026e-06, + "loss": 0.1896, + "step": 5365 + }, + { + "epoch": 1.7919519118383702, + "grad_norm": 0.421206797477163, + "learning_rate": 4.176875720394965e-06, + "loss": 0.2005, + "step": 5366 + }, + { + "epoch": 1.792285857405243, + "grad_norm": 0.5201455802658564, + "learning_rate": 4.174958963638518e-06, + "loss": 0.2143, + "step": 5367 + }, + { + "epoch": 1.7926198029721157, + "grad_norm": 0.4138358670591043, + "learning_rate": 4.173042331514234e-06, + "loss": 0.1827, + "step": 5368 + }, + { + "epoch": 1.7929537485389881, + "grad_norm": 0.44836688438557204, + "learning_rate": 4.171125824311642e-06, + "loss": 0.204, + "step": 5369 + }, + { + "epoch": 1.7932876941058606, + "grad_norm": 0.39838785544358446, + "learning_rate": 4.169209442320255e-06, + "loss": 0.1864, + "step": 5370 + }, + { + "epoch": 1.7936216396727334, + "grad_norm": 0.39065135717154886, + "learning_rate": 4.167293185829565e-06, + "loss": 0.1777, + "step": 5371 + }, + { + "epoch": 1.793955585239606, + "grad_norm": 0.3868324514391502, + "learning_rate": 4.165377055129043e-06, + "loss": 0.1898, + "step": 5372 + }, + { + "epoch": 1.7942895308064786, + "grad_norm": 0.4086998441668388, + "learning_rate": 4.163461050508144e-06, + "loss": 0.1874, + "step": 5373 + }, + { + "epoch": 1.794623476373351, + "grad_norm": 0.3777662978033055, + "learning_rate": 4.161545172256303e-06, + "loss": 0.1824, + "step": 5374 + }, + { + "epoch": 1.7949574219402238, + "grad_norm": 0.4382436302483726, + "learning_rate": 4.1596294206629375e-06, + "loss": 0.2086, + "step": 5375 + }, + { + "epoch": 1.7952913675070965, + "grad_norm": 0.41758627579949836, + "learning_rate": 4.157713796017442e-06, + "loss": 0.1874, + "step": 5376 + }, + { + "epoch": 1.795625313073969, + "grad_norm": 0.40993854049694717, + "learning_rate": 4.155798298609196e-06, + "loss": 0.2063, + "step": 5377 + }, + { + "epoch": 1.7959592586408415, + "grad_norm": 0.44807089311268883, + "learning_rate": 4.1538829287275565e-06, + "loss": 0.1874, + "step": 5378 + }, + { + "epoch": 1.7962932042077142, + "grad_norm": 0.41081845688238966, + "learning_rate": 4.151967686661864e-06, + "loss": 0.2055, + "step": 5379 + }, + { + "epoch": 1.7966271497745867, + "grad_norm": 0.3750452408616848, + "learning_rate": 4.150052572701435e-06, + "loss": 0.1923, + "step": 5380 + }, + { + "epoch": 1.7969610953414592, + "grad_norm": 0.3748777838197131, + "learning_rate": 4.148137587135575e-06, + "loss": 0.1856, + "step": 5381 + }, + { + "epoch": 1.797295040908332, + "grad_norm": 0.385221778710938, + "learning_rate": 4.146222730253563e-06, + "loss": 0.1847, + "step": 5382 + }, + { + "epoch": 1.7976289864752046, + "grad_norm": 0.4130555094657973, + "learning_rate": 4.1443080023446605e-06, + "loss": 0.2001, + "step": 5383 + }, + { + "epoch": 1.7979629320420771, + "grad_norm": 0.40847407861926593, + "learning_rate": 4.1423934036981096e-06, + "loss": 0.1968, + "step": 5384 + }, + { + "epoch": 1.7982968776089496, + "grad_norm": 0.45331715003457235, + "learning_rate": 4.140478934603133e-06, + "loss": 0.1954, + "step": 5385 + }, + { + "epoch": 1.7986308231758223, + "grad_norm": 0.3854167677351394, + "learning_rate": 4.138564595348932e-06, + "loss": 0.1847, + "step": 5386 + }, + { + "epoch": 1.798964768742695, + "grad_norm": 0.40379130860802437, + "learning_rate": 4.136650386224694e-06, + "loss": 0.1974, + "step": 5387 + }, + { + "epoch": 1.7992987143095676, + "grad_norm": 0.43616884197672473, + "learning_rate": 4.13473630751958e-06, + "loss": 0.2073, + "step": 5388 + }, + { + "epoch": 1.79963265987644, + "grad_norm": 0.39500763107731124, + "learning_rate": 4.132822359522735e-06, + "loss": 0.1964, + "step": 5389 + }, + { + "epoch": 1.7999666054433128, + "grad_norm": 0.5316276814151004, + "learning_rate": 4.130908542523285e-06, + "loss": 0.2012, + "step": 5390 + }, + { + "epoch": 1.8003005510101855, + "grad_norm": 0.4413915607742555, + "learning_rate": 4.128994856810332e-06, + "loss": 0.1929, + "step": 5391 + }, + { + "epoch": 1.800634496577058, + "grad_norm": 0.43359946931826776, + "learning_rate": 4.127081302672958e-06, + "loss": 0.1975, + "step": 5392 + }, + { + "epoch": 1.8009684421439305, + "grad_norm": 0.4326737933869161, + "learning_rate": 4.125167880400235e-06, + "loss": 0.1979, + "step": 5393 + }, + { + "epoch": 1.8013023877108032, + "grad_norm": 0.3999758339829761, + "learning_rate": 4.1232545902812046e-06, + "loss": 0.1782, + "step": 5394 + }, + { + "epoch": 1.8016363332776757, + "grad_norm": 0.39414865979855523, + "learning_rate": 4.121341432604892e-06, + "loss": 0.1908, + "step": 5395 + }, + { + "epoch": 1.8019702788445482, + "grad_norm": 0.40085607899323583, + "learning_rate": 4.1194284076603004e-06, + "loss": 0.1991, + "step": 5396 + }, + { + "epoch": 1.802304224411421, + "grad_norm": 0.3986732428896081, + "learning_rate": 4.117515515736418e-06, + "loss": 0.1892, + "step": 5397 + }, + { + "epoch": 1.8026381699782936, + "grad_norm": 0.38552180630358474, + "learning_rate": 4.1156027571222054e-06, + "loss": 0.1899, + "step": 5398 + }, + { + "epoch": 1.8029721155451661, + "grad_norm": 0.42639374577496114, + "learning_rate": 4.113690132106611e-06, + "loss": 0.2013, + "step": 5399 + }, + { + "epoch": 1.8033060611120386, + "grad_norm": 0.44076136474761535, + "learning_rate": 4.111777640978559e-06, + "loss": 0.1967, + "step": 5400 + }, + { + "epoch": 1.8036400066789113, + "grad_norm": 0.3715616165565932, + "learning_rate": 4.109865284026953e-06, + "loss": 0.1895, + "step": 5401 + }, + { + "epoch": 1.803973952245784, + "grad_norm": 0.413959384805662, + "learning_rate": 4.107953061540676e-06, + "loss": 0.1906, + "step": 5402 + }, + { + "epoch": 1.8043078978126565, + "grad_norm": 0.3986680001400406, + "learning_rate": 4.10604097380859e-06, + "loss": 0.1884, + "step": 5403 + }, + { + "epoch": 1.804641843379529, + "grad_norm": 0.39896994690686616, + "learning_rate": 4.104129021119543e-06, + "loss": 0.1864, + "step": 5404 + }, + { + "epoch": 1.8049757889464018, + "grad_norm": 0.41708702213753196, + "learning_rate": 4.102217203762357e-06, + "loss": 0.2042, + "step": 5405 + }, + { + "epoch": 1.8053097345132745, + "grad_norm": 0.4162521221231452, + "learning_rate": 4.1003055220258335e-06, + "loss": 0.1886, + "step": 5406 + }, + { + "epoch": 1.805643680080147, + "grad_norm": 0.388971195188673, + "learning_rate": 4.0983939761987535e-06, + "loss": 0.1804, + "step": 5407 + }, + { + "epoch": 1.8059776256470195, + "grad_norm": 0.3976488686723864, + "learning_rate": 4.09648256656988e-06, + "loss": 0.2036, + "step": 5408 + }, + { + "epoch": 1.8063115712138922, + "grad_norm": 0.43724410925090645, + "learning_rate": 4.094571293427951e-06, + "loss": 0.2063, + "step": 5409 + }, + { + "epoch": 1.8066455167807647, + "grad_norm": 0.42616248921069183, + "learning_rate": 4.092660157061691e-06, + "loss": 0.2003, + "step": 5410 + }, + { + "epoch": 1.8069794623476372, + "grad_norm": 0.4045940509370612, + "learning_rate": 4.090749157759799e-06, + "loss": 0.1894, + "step": 5411 + }, + { + "epoch": 1.80731340791451, + "grad_norm": 0.4173527749892698, + "learning_rate": 4.088838295810952e-06, + "loss": 0.2007, + "step": 5412 + }, + { + "epoch": 1.8076473534813826, + "grad_norm": 0.42528073955892315, + "learning_rate": 4.086927571503808e-06, + "loss": 0.2005, + "step": 5413 + }, + { + "epoch": 1.807981299048255, + "grad_norm": 0.41587887704528714, + "learning_rate": 4.0850169851270075e-06, + "loss": 0.2006, + "step": 5414 + }, + { + "epoch": 1.8083152446151276, + "grad_norm": 0.40738714104001256, + "learning_rate": 4.0831065369691615e-06, + "loss": 0.2008, + "step": 5415 + }, + { + "epoch": 1.8086491901820003, + "grad_norm": 0.44869021642959594, + "learning_rate": 4.0811962273188714e-06, + "loss": 0.2001, + "step": 5416 + }, + { + "epoch": 1.808983135748873, + "grad_norm": 0.4002553541976238, + "learning_rate": 4.0792860564647105e-06, + "loss": 0.1911, + "step": 5417 + }, + { + "epoch": 1.8093170813157455, + "grad_norm": 0.3786221254197682, + "learning_rate": 4.077376024695231e-06, + "loss": 0.1774, + "step": 5418 + }, + { + "epoch": 1.809651026882618, + "grad_norm": 0.3973451247400723, + "learning_rate": 4.075466132298967e-06, + "loss": 0.1918, + "step": 5419 + }, + { + "epoch": 1.8099849724494907, + "grad_norm": 0.40821244986487804, + "learning_rate": 4.073556379564429e-06, + "loss": 0.1921, + "step": 5420 + }, + { + "epoch": 1.8103189180163635, + "grad_norm": 0.4187248192126581, + "learning_rate": 4.071646766780109e-06, + "loss": 0.2055, + "step": 5421 + }, + { + "epoch": 1.810652863583236, + "grad_norm": 0.4009251034725688, + "learning_rate": 4.069737294234475e-06, + "loss": 0.1921, + "step": 5422 + }, + { + "epoch": 1.8109868091501085, + "grad_norm": 0.4153763020520176, + "learning_rate": 4.067827962215977e-06, + "loss": 0.1971, + "step": 5423 + }, + { + "epoch": 1.8113207547169812, + "grad_norm": 0.42059984630019226, + "learning_rate": 4.065918771013042e-06, + "loss": 0.1931, + "step": 5424 + }, + { + "epoch": 1.811654700283854, + "grad_norm": 0.38975550241965073, + "learning_rate": 4.064009720914074e-06, + "loss": 0.1939, + "step": 5425 + }, + { + "epoch": 1.8119886458507264, + "grad_norm": 0.4352813167969884, + "learning_rate": 4.062100812207459e-06, + "loss": 0.1984, + "step": 5426 + }, + { + "epoch": 1.8123225914175989, + "grad_norm": 0.41222394672549245, + "learning_rate": 4.060192045181558e-06, + "loss": 0.2003, + "step": 5427 + }, + { + "epoch": 1.8126565369844716, + "grad_norm": 0.4211144974261239, + "learning_rate": 4.058283420124716e-06, + "loss": 0.1946, + "step": 5428 + }, + { + "epoch": 1.812990482551344, + "grad_norm": 0.4033307519940219, + "learning_rate": 4.056374937325251e-06, + "loss": 0.1888, + "step": 5429 + }, + { + "epoch": 1.8133244281182166, + "grad_norm": 0.4234575464895144, + "learning_rate": 4.054466597071464e-06, + "loss": 0.1932, + "step": 5430 + }, + { + "epoch": 1.8136583736850893, + "grad_norm": 0.41837807879440336, + "learning_rate": 4.05255839965163e-06, + "loss": 0.2026, + "step": 5431 + }, + { + "epoch": 1.813992319251962, + "grad_norm": 0.43738094694833785, + "learning_rate": 4.050650345354006e-06, + "loss": 0.202, + "step": 5432 + }, + { + "epoch": 1.8143262648188345, + "grad_norm": 0.43660446022144006, + "learning_rate": 4.048742434466823e-06, + "loss": 0.2109, + "step": 5433 + }, + { + "epoch": 1.814660210385707, + "grad_norm": 0.3961921763065722, + "learning_rate": 4.046834667278298e-06, + "loss": 0.1926, + "step": 5434 + }, + { + "epoch": 1.8149941559525797, + "grad_norm": 0.39349069851042634, + "learning_rate": 4.04492704407662e-06, + "loss": 0.191, + "step": 5435 + }, + { + "epoch": 1.8153281015194525, + "grad_norm": 0.40519030905215747, + "learning_rate": 4.043019565149958e-06, + "loss": 0.1936, + "step": 5436 + }, + { + "epoch": 1.815662047086325, + "grad_norm": 0.37874729826316716, + "learning_rate": 4.041112230786458e-06, + "loss": 0.186, + "step": 5437 + }, + { + "epoch": 1.8159959926531974, + "grad_norm": 0.6081203465553392, + "learning_rate": 4.039205041274247e-06, + "loss": 0.2003, + "step": 5438 + }, + { + "epoch": 1.8163299382200702, + "grad_norm": 0.41622622386812613, + "learning_rate": 4.0372979969014245e-06, + "loss": 0.1904, + "step": 5439 + }, + { + "epoch": 1.8166638837869429, + "grad_norm": 0.4075713166770594, + "learning_rate": 4.035391097956077e-06, + "loss": 0.2012, + "step": 5440 + }, + { + "epoch": 1.8169978293538154, + "grad_norm": 0.4494439120686127, + "learning_rate": 4.0334843447262625e-06, + "loss": 0.2047, + "step": 5441 + }, + { + "epoch": 1.8173317749206879, + "grad_norm": 0.4137738402524847, + "learning_rate": 4.0315777375000185e-06, + "loss": 0.2019, + "step": 5442 + }, + { + "epoch": 1.8176657204875606, + "grad_norm": 0.393956419351833, + "learning_rate": 4.029671276565359e-06, + "loss": 0.1929, + "step": 5443 + }, + { + "epoch": 1.817999666054433, + "grad_norm": 0.3960846147046012, + "learning_rate": 4.027764962210278e-06, + "loss": 0.1828, + "step": 5444 + }, + { + "epoch": 1.8183336116213056, + "grad_norm": 0.3877136289033595, + "learning_rate": 4.025858794722749e-06, + "loss": 0.1908, + "step": 5445 + }, + { + "epoch": 1.8186675571881783, + "grad_norm": 0.38011780754177016, + "learning_rate": 4.0239527743907184e-06, + "loss": 0.1833, + "step": 5446 + }, + { + "epoch": 1.819001502755051, + "grad_norm": 0.41199492546924943, + "learning_rate": 4.022046901502114e-06, + "loss": 0.1817, + "step": 5447 + }, + { + "epoch": 1.8193354483219235, + "grad_norm": 0.401329738391802, + "learning_rate": 4.020141176344839e-06, + "loss": 0.1926, + "step": 5448 + }, + { + "epoch": 1.819669393888796, + "grad_norm": 0.41127280691800383, + "learning_rate": 4.018235599206778e-06, + "loss": 0.2009, + "step": 5449 + }, + { + "epoch": 1.8200033394556687, + "grad_norm": 0.4064474207715107, + "learning_rate": 4.016330170375787e-06, + "loss": 0.1935, + "step": 5450 + }, + { + "epoch": 1.8203372850225414, + "grad_norm": 0.4020536829609197, + "learning_rate": 4.014424890139709e-06, + "loss": 0.1837, + "step": 5451 + }, + { + "epoch": 1.820671230589414, + "grad_norm": 0.38407516465676206, + "learning_rate": 4.012519758786355e-06, + "loss": 0.1913, + "step": 5452 + }, + { + "epoch": 1.8210051761562864, + "grad_norm": 0.4123403170436455, + "learning_rate": 4.01061477660352e-06, + "loss": 0.1976, + "step": 5453 + }, + { + "epoch": 1.8213391217231591, + "grad_norm": 0.3839390900902544, + "learning_rate": 4.008709943878971e-06, + "loss": 0.1836, + "step": 5454 + }, + { + "epoch": 1.8216730672900319, + "grad_norm": 0.38868642954562516, + "learning_rate": 4.006805260900458e-06, + "loss": 0.1857, + "step": 5455 + }, + { + "epoch": 1.8220070128569044, + "grad_norm": 0.44277697324688925, + "learning_rate": 4.004900727955703e-06, + "loss": 0.203, + "step": 5456 + }, + { + "epoch": 1.8223409584237769, + "grad_norm": 0.41818991156707475, + "learning_rate": 4.0029963453324115e-06, + "loss": 0.1891, + "step": 5457 + }, + { + "epoch": 1.8226749039906496, + "grad_norm": 0.378807769453953, + "learning_rate": 4.001092113318261e-06, + "loss": 0.1809, + "step": 5458 + }, + { + "epoch": 1.823008849557522, + "grad_norm": 0.4201589268763546, + "learning_rate": 3.99918803220091e-06, + "loss": 0.2034, + "step": 5459 + }, + { + "epoch": 1.8233427951243946, + "grad_norm": 0.3896772152862828, + "learning_rate": 3.99728410226799e-06, + "loss": 0.1958, + "step": 5460 + }, + { + "epoch": 1.8236767406912673, + "grad_norm": 0.3922430722052448, + "learning_rate": 3.995380323807113e-06, + "loss": 0.1953, + "step": 5461 + }, + { + "epoch": 1.82401068625814, + "grad_norm": 0.35224053029208974, + "learning_rate": 3.993476697105864e-06, + "loss": 0.1756, + "step": 5462 + }, + { + "epoch": 1.8243446318250125, + "grad_norm": 0.4117629603867008, + "learning_rate": 3.991573222451815e-06, + "loss": 0.2055, + "step": 5463 + }, + { + "epoch": 1.824678577391885, + "grad_norm": 0.4119526641975354, + "learning_rate": 3.989669900132504e-06, + "loss": 0.1853, + "step": 5464 + }, + { + "epoch": 1.8250125229587577, + "grad_norm": 0.3944045127756812, + "learning_rate": 3.987766730435451e-06, + "loss": 0.1987, + "step": 5465 + }, + { + "epoch": 1.8253464685256304, + "grad_norm": 0.4015804217496364, + "learning_rate": 3.9858637136481515e-06, + "loss": 0.198, + "step": 5466 + }, + { + "epoch": 1.825680414092503, + "grad_norm": 0.37176769181449776, + "learning_rate": 3.98396085005808e-06, + "loss": 0.1862, + "step": 5467 + }, + { + "epoch": 1.8260143596593754, + "grad_norm": 0.40972865853666285, + "learning_rate": 3.982058139952684e-06, + "loss": 0.1886, + "step": 5468 + }, + { + "epoch": 1.8263483052262481, + "grad_norm": 0.3955918184023543, + "learning_rate": 3.980155583619392e-06, + "loss": 0.1969, + "step": 5469 + }, + { + "epoch": 1.8266822507931209, + "grad_norm": 0.3825992680611567, + "learning_rate": 3.978253181345609e-06, + "loss": 0.1867, + "step": 5470 + }, + { + "epoch": 1.8270161963599933, + "grad_norm": 0.38415661050520683, + "learning_rate": 3.9763509334187125e-06, + "loss": 0.1918, + "step": 5471 + }, + { + "epoch": 1.8273501419268658, + "grad_norm": 0.4392047476972209, + "learning_rate": 3.974448840126061e-06, + "loss": 0.1988, + "step": 5472 + }, + { + "epoch": 1.8276840874937386, + "grad_norm": 0.4115656594818337, + "learning_rate": 3.972546901754987e-06, + "loss": 0.1947, + "step": 5473 + }, + { + "epoch": 1.8280180330606113, + "grad_norm": 0.3900375859373049, + "learning_rate": 3.9706451185928e-06, + "loss": 0.1921, + "step": 5474 + }, + { + "epoch": 1.8283519786274836, + "grad_norm": 0.407212324644722, + "learning_rate": 3.968743490926791e-06, + "loss": 0.1978, + "step": 5475 + }, + { + "epoch": 1.8286859241943563, + "grad_norm": 0.3799461405429855, + "learning_rate": 3.966842019044219e-06, + "loss": 0.1794, + "step": 5476 + }, + { + "epoch": 1.829019869761229, + "grad_norm": 0.5359825297417488, + "learning_rate": 3.964940703232326e-06, + "loss": 0.2117, + "step": 5477 + }, + { + "epoch": 1.8293538153281015, + "grad_norm": 0.43912349342902457, + "learning_rate": 3.963039543778327e-06, + "loss": 0.1951, + "step": 5478 + }, + { + "epoch": 1.829687760894974, + "grad_norm": 0.35805460260955724, + "learning_rate": 3.961138540969411e-06, + "loss": 0.1778, + "step": 5479 + }, + { + "epoch": 1.8300217064618467, + "grad_norm": 0.3810672843512081, + "learning_rate": 3.9592376950927545e-06, + "loss": 0.187, + "step": 5480 + }, + { + "epoch": 1.8303556520287194, + "grad_norm": 0.4011484817855961, + "learning_rate": 3.957337006435499e-06, + "loss": 0.1785, + "step": 5481 + }, + { + "epoch": 1.830689597595592, + "grad_norm": 0.4789546571386921, + "learning_rate": 3.955436475284764e-06, + "loss": 0.2091, + "step": 5482 + }, + { + "epoch": 1.8310235431624644, + "grad_norm": 0.4063460886404937, + "learning_rate": 3.95353610192765e-06, + "loss": 0.1924, + "step": 5483 + }, + { + "epoch": 1.8313574887293371, + "grad_norm": 0.41526229249076346, + "learning_rate": 3.95163588665123e-06, + "loss": 0.1915, + "step": 5484 + }, + { + "epoch": 1.8316914342962098, + "grad_norm": 0.41928063093273427, + "learning_rate": 3.949735829742549e-06, + "loss": 0.2008, + "step": 5485 + }, + { + "epoch": 1.8320253798630823, + "grad_norm": 0.41388821065326953, + "learning_rate": 3.947835931488642e-06, + "loss": 0.1978, + "step": 5486 + }, + { + "epoch": 1.8323593254299548, + "grad_norm": 0.42225519497611547, + "learning_rate": 3.9459361921765045e-06, + "loss": 0.2089, + "step": 5487 + }, + { + "epoch": 1.8326932709968276, + "grad_norm": 0.3697474868956762, + "learning_rate": 3.944036612093117e-06, + "loss": 0.1834, + "step": 5488 + }, + { + "epoch": 1.8330272165637003, + "grad_norm": 0.40844338979490125, + "learning_rate": 3.942137191525434e-06, + "loss": 0.1815, + "step": 5489 + }, + { + "epoch": 1.8333611621305728, + "grad_norm": 0.37036000123683893, + "learning_rate": 3.9402379307603825e-06, + "loss": 0.1757, + "step": 5490 + }, + { + "epoch": 1.8336951076974453, + "grad_norm": 0.4416887840594564, + "learning_rate": 3.93833883008487e-06, + "loss": 0.1928, + "step": 5491 + }, + { + "epoch": 1.834029053264318, + "grad_norm": 0.4230338012823816, + "learning_rate": 3.936439889785778e-06, + "loss": 0.2049, + "step": 5492 + }, + { + "epoch": 1.8343629988311905, + "grad_norm": 0.4552356374064481, + "learning_rate": 3.934541110149964e-06, + "loss": 0.1859, + "step": 5493 + }, + { + "epoch": 1.834696944398063, + "grad_norm": 0.3996370964740024, + "learning_rate": 3.932642491464261e-06, + "loss": 0.1863, + "step": 5494 + }, + { + "epoch": 1.8350308899649357, + "grad_norm": 0.41349629542546434, + "learning_rate": 3.930744034015477e-06, + "loss": 0.2028, + "step": 5495 + }, + { + "epoch": 1.8353648355318084, + "grad_norm": 0.3874670370011834, + "learning_rate": 3.9288457380903954e-06, + "loss": 0.1946, + "step": 5496 + }, + { + "epoch": 1.835698781098681, + "grad_norm": 0.35173551946256226, + "learning_rate": 3.926947603975778e-06, + "loss": 0.1787, + "step": 5497 + }, + { + "epoch": 1.8360327266655534, + "grad_norm": 0.38153385633184733, + "learning_rate": 3.925049631958361e-06, + "loss": 0.1901, + "step": 5498 + }, + { + "epoch": 1.8363666722324261, + "grad_norm": 0.37437515777658054, + "learning_rate": 3.923151822324854e-06, + "loss": 0.1772, + "step": 5499 + }, + { + "epoch": 1.8367006177992988, + "grad_norm": 0.3666580407206624, + "learning_rate": 3.9212541753619435e-06, + "loss": 0.1753, + "step": 5500 + }, + { + "epoch": 1.8370345633661713, + "grad_norm": 0.41030532655032376, + "learning_rate": 3.9193566913562915e-06, + "loss": 0.2008, + "step": 5501 + }, + { + "epoch": 1.8373685089330438, + "grad_norm": 0.47705147398700304, + "learning_rate": 3.917459370594537e-06, + "loss": 0.2097, + "step": 5502 + }, + { + "epoch": 1.8377024544999165, + "grad_norm": 0.3711053442410675, + "learning_rate": 3.915562213363287e-06, + "loss": 0.1921, + "step": 5503 + }, + { + "epoch": 1.8380364000667893, + "grad_norm": 0.3925948723529021, + "learning_rate": 3.9136652199491365e-06, + "loss": 0.1895, + "step": 5504 + }, + { + "epoch": 1.8383703456336618, + "grad_norm": 0.44496015087618784, + "learning_rate": 3.911768390638645e-06, + "loss": 0.2036, + "step": 5505 + }, + { + "epoch": 1.8387042912005342, + "grad_norm": 0.4320342801300668, + "learning_rate": 3.909871725718353e-06, + "loss": 0.2007, + "step": 5506 + }, + { + "epoch": 1.839038236767407, + "grad_norm": 0.40009969508303217, + "learning_rate": 3.907975225474771e-06, + "loss": 0.1907, + "step": 5507 + }, + { + "epoch": 1.8393721823342795, + "grad_norm": 0.44359487303286543, + "learning_rate": 3.906078890194391e-06, + "loss": 0.2049, + "step": 5508 + }, + { + "epoch": 1.839706127901152, + "grad_norm": 0.38608393971174715, + "learning_rate": 3.904182720163672e-06, + "loss": 0.1898, + "step": 5509 + }, + { + "epoch": 1.8400400734680247, + "grad_norm": 0.3923128843531059, + "learning_rate": 3.902286715669058e-06, + "loss": 0.1837, + "step": 5510 + }, + { + "epoch": 1.8403740190348974, + "grad_norm": 0.4372944654236281, + "learning_rate": 3.9003908769969615e-06, + "loss": 0.1986, + "step": 5511 + }, + { + "epoch": 1.8407079646017699, + "grad_norm": 0.41187405317933395, + "learning_rate": 3.89849520443377e-06, + "loss": 0.1895, + "step": 5512 + }, + { + "epoch": 1.8410419101686424, + "grad_norm": 0.4036022582486776, + "learning_rate": 3.896599698265847e-06, + "loss": 0.1859, + "step": 5513 + }, + { + "epoch": 1.841375855735515, + "grad_norm": 0.39156933325917426, + "learning_rate": 3.894704358779533e-06, + "loss": 0.1933, + "step": 5514 + }, + { + "epoch": 1.8417098013023878, + "grad_norm": 0.41843329760041964, + "learning_rate": 3.892809186261138e-06, + "loss": 0.2076, + "step": 5515 + }, + { + "epoch": 1.8420437468692603, + "grad_norm": 0.456977264972126, + "learning_rate": 3.890914180996954e-06, + "loss": 0.2005, + "step": 5516 + }, + { + "epoch": 1.8423776924361328, + "grad_norm": 0.38080749341104936, + "learning_rate": 3.889019343273242e-06, + "loss": 0.1896, + "step": 5517 + }, + { + "epoch": 1.8427116380030055, + "grad_norm": 0.433536114286531, + "learning_rate": 3.887124673376239e-06, + "loss": 0.1972, + "step": 5518 + }, + { + "epoch": 1.8430455835698782, + "grad_norm": 0.4274587785341165, + "learning_rate": 3.885230171592157e-06, + "loss": 0.2031, + "step": 5519 + }, + { + "epoch": 1.8433795291367507, + "grad_norm": 0.4017626320601205, + "learning_rate": 3.883335838207183e-06, + "loss": 0.1903, + "step": 5520 + }, + { + "epoch": 1.8437134747036232, + "grad_norm": 0.40513921939830017, + "learning_rate": 3.881441673507481e-06, + "loss": 0.1864, + "step": 5521 + }, + { + "epoch": 1.844047420270496, + "grad_norm": 0.4158883732225394, + "learning_rate": 3.879547677779184e-06, + "loss": 0.186, + "step": 5522 + }, + { + "epoch": 1.8443813658373687, + "grad_norm": 0.39393567740605284, + "learning_rate": 3.8776538513084036e-06, + "loss": 0.1913, + "step": 5523 + }, + { + "epoch": 1.844715311404241, + "grad_norm": 0.3830752975691852, + "learning_rate": 3.875760194381224e-06, + "loss": 0.181, + "step": 5524 + }, + { + "epoch": 1.8450492569711137, + "grad_norm": 0.43236140310887133, + "learning_rate": 3.873866707283704e-06, + "loss": 0.195, + "step": 5525 + }, + { + "epoch": 1.8453832025379864, + "grad_norm": 0.3971639731949253, + "learning_rate": 3.871973390301876e-06, + "loss": 0.1908, + "step": 5526 + }, + { + "epoch": 1.8457171481048589, + "grad_norm": 0.4329975492794574, + "learning_rate": 3.8700802437217526e-06, + "loss": 0.1973, + "step": 5527 + }, + { + "epoch": 1.8460510936717314, + "grad_norm": 0.39079323807263294, + "learning_rate": 3.8681872678293115e-06, + "loss": 0.1805, + "step": 5528 + }, + { + "epoch": 1.846385039238604, + "grad_norm": 0.4575892417350222, + "learning_rate": 3.866294462910511e-06, + "loss": 0.1846, + "step": 5529 + }, + { + "epoch": 1.8467189848054768, + "grad_norm": 0.39421025346303856, + "learning_rate": 3.86440182925128e-06, + "loss": 0.1857, + "step": 5530 + }, + { + "epoch": 1.8470529303723493, + "grad_norm": 0.3889312076173911, + "learning_rate": 3.862509367137525e-06, + "loss": 0.1846, + "step": 5531 + }, + { + "epoch": 1.8473868759392218, + "grad_norm": 0.4334080310493205, + "learning_rate": 3.86061707685512e-06, + "loss": 0.2061, + "step": 5532 + }, + { + "epoch": 1.8477208215060945, + "grad_norm": 0.38246167969063505, + "learning_rate": 3.8587249586899245e-06, + "loss": 0.1822, + "step": 5533 + }, + { + "epoch": 1.8480547670729672, + "grad_norm": 0.38452054872558217, + "learning_rate": 3.856833012927762e-06, + "loss": 0.1843, + "step": 5534 + }, + { + "epoch": 1.8483887126398397, + "grad_norm": 0.36692188968122724, + "learning_rate": 3.854941239854433e-06, + "loss": 0.1859, + "step": 5535 + }, + { + "epoch": 1.8487226582067122, + "grad_norm": 0.4291853445087326, + "learning_rate": 3.853049639755713e-06, + "loss": 0.1927, + "step": 5536 + }, + { + "epoch": 1.849056603773585, + "grad_norm": 0.4024469130697564, + "learning_rate": 3.8511582129173495e-06, + "loss": 0.1853, + "step": 5537 + }, + { + "epoch": 1.8493905493404577, + "grad_norm": 0.4198881770666027, + "learning_rate": 3.8492669596250636e-06, + "loss": 0.1983, + "step": 5538 + }, + { + "epoch": 1.8497244949073302, + "grad_norm": 0.4290931844458051, + "learning_rate": 3.8473758801645535e-06, + "loss": 0.2179, + "step": 5539 + }, + { + "epoch": 1.8500584404742026, + "grad_norm": 0.4519053873864919, + "learning_rate": 3.84548497482149e-06, + "loss": 0.2008, + "step": 5540 + }, + { + "epoch": 1.8503923860410754, + "grad_norm": 0.39187413508386726, + "learning_rate": 3.843594243881513e-06, + "loss": 0.1898, + "step": 5541 + }, + { + "epoch": 1.8507263316079479, + "grad_norm": 0.4042759859011185, + "learning_rate": 3.841703687630243e-06, + "loss": 0.1902, + "step": 5542 + }, + { + "epoch": 1.8510602771748204, + "grad_norm": 0.42476477246752786, + "learning_rate": 3.8398133063532685e-06, + "loss": 0.2123, + "step": 5543 + }, + { + "epoch": 1.851394222741693, + "grad_norm": 0.4270712881303755, + "learning_rate": 3.837923100336155e-06, + "loss": 0.2102, + "step": 5544 + }, + { + "epoch": 1.8517281683085658, + "grad_norm": 0.40813919718254027, + "learning_rate": 3.836033069864441e-06, + "loss": 0.1959, + "step": 5545 + }, + { + "epoch": 1.8520621138754383, + "grad_norm": 0.5303707806361242, + "learning_rate": 3.834143215223637e-06, + "loss": 0.2061, + "step": 5546 + }, + { + "epoch": 1.8523960594423108, + "grad_norm": 0.5552333428331633, + "learning_rate": 3.832253536699227e-06, + "loss": 0.2078, + "step": 5547 + }, + { + "epoch": 1.8527300050091835, + "grad_norm": 0.38601722650526543, + "learning_rate": 3.8303640345766714e-06, + "loss": 0.1881, + "step": 5548 + }, + { + "epoch": 1.8530639505760562, + "grad_norm": 0.3968436369022213, + "learning_rate": 3.8284747091414e-06, + "loss": 0.1951, + "step": 5549 + }, + { + "epoch": 1.8533978961429287, + "grad_norm": 0.3590138296067034, + "learning_rate": 3.826585560678816e-06, + "loss": 0.1755, + "step": 5550 + }, + { + "epoch": 1.8537318417098012, + "grad_norm": 0.4474793230992464, + "learning_rate": 3.824696589474301e-06, + "loss": 0.1995, + "step": 5551 + }, + { + "epoch": 1.854065787276674, + "grad_norm": 0.4182641931292139, + "learning_rate": 3.8228077958132055e-06, + "loss": 0.2056, + "step": 5552 + }, + { + "epoch": 1.8543997328435466, + "grad_norm": 0.41418885944491485, + "learning_rate": 3.8209191799808535e-06, + "loss": 0.175, + "step": 5553 + }, + { + "epoch": 1.8547336784104191, + "grad_norm": 0.4098383640034617, + "learning_rate": 3.819030742262542e-06, + "loss": 0.2002, + "step": 5554 + }, + { + "epoch": 1.8550676239772916, + "grad_norm": 0.43699938557858403, + "learning_rate": 3.817142482943543e-06, + "loss": 0.1906, + "step": 5555 + }, + { + "epoch": 1.8554015695441644, + "grad_norm": 0.41360176753527556, + "learning_rate": 3.815254402309097e-06, + "loss": 0.1967, + "step": 5556 + }, + { + "epoch": 1.8557355151110368, + "grad_norm": 0.42771060635906877, + "learning_rate": 3.813366500644426e-06, + "loss": 0.2154, + "step": 5557 + }, + { + "epoch": 1.8560694606779093, + "grad_norm": 0.36870116160817834, + "learning_rate": 3.8114787782347172e-06, + "loss": 0.1812, + "step": 5558 + }, + { + "epoch": 1.856403406244782, + "grad_norm": 0.40901216532316104, + "learning_rate": 3.809591235365133e-06, + "loss": 0.1928, + "step": 5559 + }, + { + "epoch": 1.8567373518116548, + "grad_norm": 0.37696959669230656, + "learning_rate": 3.807703872320809e-06, + "loss": 0.1763, + "step": 5560 + }, + { + "epoch": 1.8570712973785273, + "grad_norm": 0.3596874200077695, + "learning_rate": 3.8058166893868543e-06, + "loss": 0.1714, + "step": 5561 + }, + { + "epoch": 1.8574052429453998, + "grad_norm": 0.35321557980968193, + "learning_rate": 3.8039296868483493e-06, + "loss": 0.1739, + "step": 5562 + }, + { + "epoch": 1.8577391885122725, + "grad_norm": 0.38771500147851023, + "learning_rate": 3.802042864990349e-06, + "loss": 0.1897, + "step": 5563 + }, + { + "epoch": 1.8580731340791452, + "grad_norm": 0.4151413292305868, + "learning_rate": 3.8001562240978785e-06, + "loss": 0.1842, + "step": 5564 + }, + { + "epoch": 1.8584070796460177, + "grad_norm": 0.40997672571652566, + "learning_rate": 3.7982697644559385e-06, + "loss": 0.1959, + "step": 5565 + }, + { + "epoch": 1.8587410252128902, + "grad_norm": 0.44628840605004716, + "learning_rate": 3.7963834863495013e-06, + "loss": 0.2154, + "step": 5566 + }, + { + "epoch": 1.859074970779763, + "grad_norm": 0.3581451638522709, + "learning_rate": 3.794497390063509e-06, + "loss": 0.1749, + "step": 5567 + }, + { + "epoch": 1.8594089163466356, + "grad_norm": 0.4141106417920125, + "learning_rate": 3.792611475882881e-06, + "loss": 0.1854, + "step": 5568 + }, + { + "epoch": 1.8597428619135081, + "grad_norm": 0.3939536353320779, + "learning_rate": 3.790725744092507e-06, + "loss": 0.1864, + "step": 5569 + }, + { + "epoch": 1.8600768074803806, + "grad_norm": 0.4176080845444923, + "learning_rate": 3.788840194977248e-06, + "loss": 0.19, + "step": 5570 + }, + { + "epoch": 1.8604107530472533, + "grad_norm": 0.3773316234488866, + "learning_rate": 3.7869548288219383e-06, + "loss": 0.1827, + "step": 5571 + }, + { + "epoch": 1.860744698614126, + "grad_norm": 0.45447820265924405, + "learning_rate": 3.7850696459113845e-06, + "loss": 0.2136, + "step": 5572 + }, + { + "epoch": 1.8610786441809983, + "grad_norm": 0.40973336466501775, + "learning_rate": 3.783184646530364e-06, + "loss": 0.1974, + "step": 5573 + }, + { + "epoch": 1.861412589747871, + "grad_norm": 0.38507853427875804, + "learning_rate": 3.7812998309636323e-06, + "loss": 0.1843, + "step": 5574 + }, + { + "epoch": 1.8617465353147438, + "grad_norm": 0.42639231490055507, + "learning_rate": 3.779415199495911e-06, + "loss": 0.19, + "step": 5575 + }, + { + "epoch": 1.8620804808816163, + "grad_norm": 0.39541454547140653, + "learning_rate": 3.777530752411896e-06, + "loss": 0.1969, + "step": 5576 + }, + { + "epoch": 1.8624144264484888, + "grad_norm": 0.41359196848838, + "learning_rate": 3.7756464899962546e-06, + "loss": 0.1888, + "step": 5577 + }, + { + "epoch": 1.8627483720153615, + "grad_norm": 0.36075624301082615, + "learning_rate": 3.773762412533627e-06, + "loss": 0.1792, + "step": 5578 + }, + { + "epoch": 1.8630823175822342, + "grad_norm": 0.4098671567571041, + "learning_rate": 3.771878520308624e-06, + "loss": 0.1912, + "step": 5579 + }, + { + "epoch": 1.8634162631491067, + "grad_norm": 0.3970704176477835, + "learning_rate": 3.7699948136058327e-06, + "loss": 0.2007, + "step": 5580 + }, + { + "epoch": 1.8637502087159792, + "grad_norm": 0.41258075692882656, + "learning_rate": 3.768111292709808e-06, + "loss": 0.1952, + "step": 5581 + }, + { + "epoch": 1.864084154282852, + "grad_norm": 0.4985649821234798, + "learning_rate": 3.7662279579050777e-06, + "loss": 0.1976, + "step": 5582 + }, + { + "epoch": 1.8644180998497246, + "grad_norm": 0.39543825043818337, + "learning_rate": 3.764344809476141e-06, + "loss": 0.1839, + "step": 5583 + }, + { + "epoch": 1.8647520454165971, + "grad_norm": 0.4169230905277188, + "learning_rate": 3.7624618477074705e-06, + "loss": 0.1792, + "step": 5584 + }, + { + "epoch": 1.8650859909834696, + "grad_norm": 0.40159431111505056, + "learning_rate": 3.760579072883508e-06, + "loss": 0.1851, + "step": 5585 + }, + { + "epoch": 1.8654199365503423, + "grad_norm": 0.42377227839386244, + "learning_rate": 3.758696485288672e-06, + "loss": 0.201, + "step": 5586 + }, + { + "epoch": 1.865753882117215, + "grad_norm": 0.4257414605932439, + "learning_rate": 3.7568140852073464e-06, + "loss": 0.1975, + "step": 5587 + }, + { + "epoch": 1.8660878276840875, + "grad_norm": 0.4169457936193867, + "learning_rate": 3.754931872923892e-06, + "loss": 0.1923, + "step": 5588 + }, + { + "epoch": 1.86642177325096, + "grad_norm": 0.4043813591817317, + "learning_rate": 3.7530498487226384e-06, + "loss": 0.1929, + "step": 5589 + }, + { + "epoch": 1.8667557188178328, + "grad_norm": 0.41635875636597164, + "learning_rate": 3.751168012887888e-06, + "loss": 0.1935, + "step": 5590 + }, + { + "epoch": 1.8670896643847052, + "grad_norm": 0.38859957319609856, + "learning_rate": 3.7492863657039126e-06, + "loss": 0.1879, + "step": 5591 + }, + { + "epoch": 1.8674236099515777, + "grad_norm": 0.4261527050568269, + "learning_rate": 3.7474049074549596e-06, + "loss": 0.1999, + "step": 5592 + }, + { + "epoch": 1.8677575555184505, + "grad_norm": 0.42976803187002927, + "learning_rate": 3.7455236384252435e-06, + "loss": 0.2078, + "step": 5593 + }, + { + "epoch": 1.8680915010853232, + "grad_norm": 0.3885539430483641, + "learning_rate": 3.743642558898953e-06, + "loss": 0.1803, + "step": 5594 + }, + { + "epoch": 1.8684254466521957, + "grad_norm": 0.475836117741637, + "learning_rate": 3.7417616691602477e-06, + "loss": 0.1962, + "step": 5595 + }, + { + "epoch": 1.8687593922190682, + "grad_norm": 0.4264306061995, + "learning_rate": 3.739880969493257e-06, + "loss": 0.2007, + "step": 5596 + }, + { + "epoch": 1.869093337785941, + "grad_norm": 0.4188722723961078, + "learning_rate": 3.738000460182081e-06, + "loss": 0.199, + "step": 5597 + }, + { + "epoch": 1.8694272833528136, + "grad_norm": 0.43449811676434813, + "learning_rate": 3.736120141510798e-06, + "loss": 0.2002, + "step": 5598 + }, + { + "epoch": 1.869761228919686, + "grad_norm": 0.4255881141855972, + "learning_rate": 3.734240013763448e-06, + "loss": 0.2031, + "step": 5599 + }, + { + "epoch": 1.8700951744865586, + "grad_norm": 0.3924986018956876, + "learning_rate": 3.732360077224049e-06, + "loss": 0.1834, + "step": 5600 + }, + { + "epoch": 1.8704291200534313, + "grad_norm": 0.41303090285019545, + "learning_rate": 3.730480332176586e-06, + "loss": 0.1947, + "step": 5601 + }, + { + "epoch": 1.870763065620304, + "grad_norm": 0.4173686398679209, + "learning_rate": 3.7286007789050147e-06, + "loss": 0.2048, + "step": 5602 + }, + { + "epoch": 1.8710970111871765, + "grad_norm": 0.42256726256761623, + "learning_rate": 3.726721417693268e-06, + "loss": 0.196, + "step": 5603 + }, + { + "epoch": 1.871430956754049, + "grad_norm": 0.37707316686734915, + "learning_rate": 3.7248422488252433e-06, + "loss": 0.184, + "step": 5604 + }, + { + "epoch": 1.8717649023209217, + "grad_norm": 0.42889234026369716, + "learning_rate": 3.722963272584812e-06, + "loss": 0.1908, + "step": 5605 + }, + { + "epoch": 1.8720988478877942, + "grad_norm": 0.3962460242481815, + "learning_rate": 3.721084489255815e-06, + "loss": 0.1839, + "step": 5606 + }, + { + "epoch": 1.8724327934546667, + "grad_norm": 0.4230266260559559, + "learning_rate": 3.719205899122064e-06, + "loss": 0.1995, + "step": 5607 + }, + { + "epoch": 1.8727667390215395, + "grad_norm": 0.42644054894229216, + "learning_rate": 3.7173275024673424e-06, + "loss": 0.197, + "step": 5608 + }, + { + "epoch": 1.8731006845884122, + "grad_norm": 0.4754937302170339, + "learning_rate": 3.7154492995754046e-06, + "loss": 0.2116, + "step": 5609 + }, + { + "epoch": 1.8734346301552847, + "grad_norm": 0.4370210334991797, + "learning_rate": 3.7135712907299753e-06, + "loss": 0.1991, + "step": 5610 + }, + { + "epoch": 1.8737685757221572, + "grad_norm": 0.3959125955396685, + "learning_rate": 3.7116934762147504e-06, + "loss": 0.1768, + "step": 5611 + }, + { + "epoch": 1.8741025212890299, + "grad_norm": 0.40337586546501275, + "learning_rate": 3.709815856313395e-06, + "loss": 0.1843, + "step": 5612 + }, + { + "epoch": 1.8744364668559026, + "grad_norm": 0.4036266931746901, + "learning_rate": 3.7079384313095464e-06, + "loss": 0.1948, + "step": 5613 + }, + { + "epoch": 1.874770412422775, + "grad_norm": 0.40647248544698983, + "learning_rate": 3.70606120148681e-06, + "loss": 0.1949, + "step": 5614 + }, + { + "epoch": 1.8751043579896476, + "grad_norm": 0.4983920786650823, + "learning_rate": 3.7041841671287654e-06, + "loss": 0.1782, + "step": 5615 + }, + { + "epoch": 1.8754383035565203, + "grad_norm": 0.42161416739832963, + "learning_rate": 3.70230732851896e-06, + "loss": 0.1899, + "step": 5616 + }, + { + "epoch": 1.875772249123393, + "grad_norm": 0.5092564041054836, + "learning_rate": 3.7004306859409134e-06, + "loss": 0.2121, + "step": 5617 + }, + { + "epoch": 1.8761061946902655, + "grad_norm": 0.42450305824700535, + "learning_rate": 3.6985542396781127e-06, + "loss": 0.1945, + "step": 5618 + }, + { + "epoch": 1.876440140257138, + "grad_norm": 0.41218596453526796, + "learning_rate": 3.6966779900140193e-06, + "loss": 0.1883, + "step": 5619 + }, + { + "epoch": 1.8767740858240107, + "grad_norm": 0.45585778407093536, + "learning_rate": 3.694801937232058e-06, + "loss": 0.1923, + "step": 5620 + }, + { + "epoch": 1.8771080313908834, + "grad_norm": 0.43555642104196896, + "learning_rate": 3.6929260816156353e-06, + "loss": 0.2081, + "step": 5621 + }, + { + "epoch": 1.8774419769577557, + "grad_norm": 0.38653206137255025, + "learning_rate": 3.691050423448118e-06, + "loss": 0.188, + "step": 5622 + }, + { + "epoch": 1.8777759225246284, + "grad_norm": 0.44802164116477605, + "learning_rate": 3.689174963012847e-06, + "loss": 0.204, + "step": 5623 + }, + { + "epoch": 1.8781098680915012, + "grad_norm": 0.3824309407516524, + "learning_rate": 3.6872997005931323e-06, + "loss": 0.1876, + "step": 5624 + }, + { + "epoch": 1.8784438136583737, + "grad_norm": 0.4373752040761639, + "learning_rate": 3.6854246364722534e-06, + "loss": 0.1988, + "step": 5625 + }, + { + "epoch": 1.8787777592252461, + "grad_norm": 0.3912895000237093, + "learning_rate": 3.683549770933461e-06, + "loss": 0.1888, + "step": 5626 + }, + { + "epoch": 1.8791117047921189, + "grad_norm": 0.46238272413161635, + "learning_rate": 3.6816751042599774e-06, + "loss": 0.2081, + "step": 5627 + }, + { + "epoch": 1.8794456503589916, + "grad_norm": 0.38589801494425297, + "learning_rate": 3.6798006367349926e-06, + "loss": 0.1829, + "step": 5628 + }, + { + "epoch": 1.879779595925864, + "grad_norm": 0.3752621047827673, + "learning_rate": 3.6779263686416668e-06, + "loss": 0.1925, + "step": 5629 + }, + { + "epoch": 1.8801135414927366, + "grad_norm": 0.3765889397099863, + "learning_rate": 3.676052300263129e-06, + "loss": 0.1846, + "step": 5630 + }, + { + "epoch": 1.8804474870596093, + "grad_norm": 0.3676402690847071, + "learning_rate": 3.6741784318824814e-06, + "loss": 0.1808, + "step": 5631 + }, + { + "epoch": 1.880781432626482, + "grad_norm": 0.3551659592860279, + "learning_rate": 3.6723047637827897e-06, + "loss": 0.1712, + "step": 5632 + }, + { + "epoch": 1.8811153781933545, + "grad_norm": 0.410879058236752, + "learning_rate": 3.670431296247099e-06, + "loss": 0.2003, + "step": 5633 + }, + { + "epoch": 1.881449323760227, + "grad_norm": 0.40283302371260815, + "learning_rate": 3.6685580295584162e-06, + "loss": 0.188, + "step": 5634 + }, + { + "epoch": 1.8817832693270997, + "grad_norm": 0.3891040548112664, + "learning_rate": 3.6666849639997205e-06, + "loss": 0.1849, + "step": 5635 + }, + { + "epoch": 1.8821172148939724, + "grad_norm": 0.41052509920344155, + "learning_rate": 3.6648120998539596e-06, + "loss": 0.1862, + "step": 5636 + }, + { + "epoch": 1.882451160460845, + "grad_norm": 0.4079072214569817, + "learning_rate": 3.662939437404053e-06, + "loss": 0.1982, + "step": 5637 + }, + { + "epoch": 1.8827851060277174, + "grad_norm": 0.42609676874734265, + "learning_rate": 3.6610669769328853e-06, + "loss": 0.1851, + "step": 5638 + }, + { + "epoch": 1.8831190515945901, + "grad_norm": 0.4182804674313846, + "learning_rate": 3.659194718723319e-06, + "loss": 0.1946, + "step": 5639 + }, + { + "epoch": 1.8834529971614626, + "grad_norm": 0.406048729629456, + "learning_rate": 3.657322663058177e-06, + "loss": 0.1954, + "step": 5640 + }, + { + "epoch": 1.8837869427283351, + "grad_norm": 0.4220538626404686, + "learning_rate": 3.655450810220257e-06, + "loss": 0.1911, + "step": 5641 + }, + { + "epoch": 1.8841208882952079, + "grad_norm": 0.45423348313302325, + "learning_rate": 3.6535791604923225e-06, + "loss": 0.1924, + "step": 5642 + }, + { + "epoch": 1.8844548338620806, + "grad_norm": 0.3981725690261841, + "learning_rate": 3.6517077141571076e-06, + "loss": 0.1844, + "step": 5643 + }, + { + "epoch": 1.884788779428953, + "grad_norm": 0.559974848713012, + "learning_rate": 3.649836471497321e-06, + "loss": 0.1981, + "step": 5644 + }, + { + "epoch": 1.8851227249958256, + "grad_norm": 0.42473441487267527, + "learning_rate": 3.6479654327956325e-06, + "loss": 0.2059, + "step": 5645 + }, + { + "epoch": 1.8854566705626983, + "grad_norm": 0.4574118801471747, + "learning_rate": 3.646094598334685e-06, + "loss": 0.1964, + "step": 5646 + }, + { + "epoch": 1.885790616129571, + "grad_norm": 0.4137780911436003, + "learning_rate": 3.64422396839709e-06, + "loss": 0.1848, + "step": 5647 + }, + { + "epoch": 1.8861245616964435, + "grad_norm": 0.40265674895445425, + "learning_rate": 3.642353543265429e-06, + "loss": 0.1887, + "step": 5648 + }, + { + "epoch": 1.886458507263316, + "grad_norm": 0.3859651283367934, + "learning_rate": 3.640483323222248e-06, + "loss": 0.1923, + "step": 5649 + }, + { + "epoch": 1.8867924528301887, + "grad_norm": 0.4131845833522721, + "learning_rate": 3.638613308550072e-06, + "loss": 0.2009, + "step": 5650 + }, + { + "epoch": 1.8871263983970614, + "grad_norm": 0.3875014614187485, + "learning_rate": 3.636743499531385e-06, + "loss": 0.1782, + "step": 5651 + }, + { + "epoch": 1.887460343963934, + "grad_norm": 0.38824358254338215, + "learning_rate": 3.634873896448644e-06, + "loss": 0.1885, + "step": 5652 + }, + { + "epoch": 1.8877942895308064, + "grad_norm": 0.35317513678915435, + "learning_rate": 3.633004499584275e-06, + "loss": 0.1774, + "step": 5653 + }, + { + "epoch": 1.8881282350976791, + "grad_norm": 0.3775118670972697, + "learning_rate": 3.6311353092206723e-06, + "loss": 0.186, + "step": 5654 + }, + { + "epoch": 1.8884621806645516, + "grad_norm": 0.4040450531349787, + "learning_rate": 3.6292663256401967e-06, + "loss": 0.1935, + "step": 5655 + }, + { + "epoch": 1.8887961262314241, + "grad_norm": 0.4223300761091552, + "learning_rate": 3.6273975491251844e-06, + "loss": 0.2026, + "step": 5656 + }, + { + "epoch": 1.8891300717982968, + "grad_norm": 0.3819512409614182, + "learning_rate": 3.625528979957935e-06, + "loss": 0.2008, + "step": 5657 + }, + { + "epoch": 1.8894640173651696, + "grad_norm": 0.40376004385819364, + "learning_rate": 3.6236606184207164e-06, + "loss": 0.184, + "step": 5658 + }, + { + "epoch": 1.889797962932042, + "grad_norm": 0.4018629191997057, + "learning_rate": 3.621792464795767e-06, + "loss": 0.1921, + "step": 5659 + }, + { + "epoch": 1.8901319084989145, + "grad_norm": 0.38134294753922704, + "learning_rate": 3.6199245193652944e-06, + "loss": 0.1823, + "step": 5660 + }, + { + "epoch": 1.8904658540657873, + "grad_norm": 0.4086245882829037, + "learning_rate": 3.6180567824114715e-06, + "loss": 0.2045, + "step": 5661 + }, + { + "epoch": 1.89079979963266, + "grad_norm": 0.4147921609611881, + "learning_rate": 3.6161892542164444e-06, + "loss": 0.2035, + "step": 5662 + }, + { + "epoch": 1.8911337451995325, + "grad_norm": 0.403633243120622, + "learning_rate": 3.614321935062325e-06, + "loss": 0.1905, + "step": 5663 + }, + { + "epoch": 1.891467690766405, + "grad_norm": 0.4062716446644573, + "learning_rate": 3.6124548252311918e-06, + "loss": 0.2017, + "step": 5664 + }, + { + "epoch": 1.8918016363332777, + "grad_norm": 0.43732722649221445, + "learning_rate": 3.610587925005097e-06, + "loss": 0.1922, + "step": 5665 + }, + { + "epoch": 1.8921355819001504, + "grad_norm": 0.4168207296619112, + "learning_rate": 3.608721234666054e-06, + "loss": 0.1872, + "step": 5666 + }, + { + "epoch": 1.892469527467023, + "grad_norm": 0.4001633405949691, + "learning_rate": 3.6068547544960493e-06, + "loss": 0.1999, + "step": 5667 + }, + { + "epoch": 1.8928034730338954, + "grad_norm": 0.359620384256815, + "learning_rate": 3.6049884847770396e-06, + "loss": 0.1879, + "step": 5668 + }, + { + "epoch": 1.8931374186007681, + "grad_norm": 0.3814699560183258, + "learning_rate": 3.6031224257909448e-06, + "loss": 0.1925, + "step": 5669 + }, + { + "epoch": 1.8934713641676408, + "grad_norm": 0.4249305582407571, + "learning_rate": 3.6012565778196552e-06, + "loss": 0.2018, + "step": 5670 + }, + { + "epoch": 1.893805309734513, + "grad_norm": 0.44929761623627773, + "learning_rate": 3.5993909411450297e-06, + "loss": 0.2176, + "step": 5671 + }, + { + "epoch": 1.8941392553013858, + "grad_norm": 0.4402284991993238, + "learning_rate": 3.597525516048894e-06, + "loss": 0.1985, + "step": 5672 + }, + { + "epoch": 1.8944732008682585, + "grad_norm": 0.41033454175195433, + "learning_rate": 3.5956603028130397e-06, + "loss": 0.179, + "step": 5673 + }, + { + "epoch": 1.894807146435131, + "grad_norm": 0.41702509272834387, + "learning_rate": 3.5937953017192356e-06, + "loss": 0.1944, + "step": 5674 + }, + { + "epoch": 1.8951410920020035, + "grad_norm": 0.417384380584999, + "learning_rate": 3.591930513049208e-06, + "loss": 0.2002, + "step": 5675 + }, + { + "epoch": 1.8954750375688763, + "grad_norm": 0.404210881524086, + "learning_rate": 3.5900659370846556e-06, + "loss": 0.1915, + "step": 5676 + }, + { + "epoch": 1.895808983135749, + "grad_norm": 0.40054054358372826, + "learning_rate": 3.5882015741072464e-06, + "loss": 0.1893, + "step": 5677 + }, + { + "epoch": 1.8961429287026215, + "grad_norm": 0.4716609545548075, + "learning_rate": 3.586337424398609e-06, + "loss": 0.2013, + "step": 5678 + }, + { + "epoch": 1.896476874269494, + "grad_norm": 0.4086034807646897, + "learning_rate": 3.584473488240352e-06, + "loss": 0.1891, + "step": 5679 + }, + { + "epoch": 1.8968108198363667, + "grad_norm": 0.3875982015231373, + "learning_rate": 3.5826097659140413e-06, + "loss": 0.189, + "step": 5680 + }, + { + "epoch": 1.8971447654032394, + "grad_norm": 0.35728633673821014, + "learning_rate": 3.5807462577012152e-06, + "loss": 0.181, + "step": 5681 + }, + { + "epoch": 1.897478710970112, + "grad_norm": 0.3834926713626789, + "learning_rate": 3.5788829638833777e-06, + "loss": 0.189, + "step": 5682 + }, + { + "epoch": 1.8978126565369844, + "grad_norm": 0.39649251448728734, + "learning_rate": 3.5770198847420016e-06, + "loss": 0.1892, + "step": 5683 + }, + { + "epoch": 1.898146602103857, + "grad_norm": 0.42377795991739803, + "learning_rate": 3.5751570205585264e-06, + "loss": 0.1914, + "step": 5684 + }, + { + "epoch": 1.8984805476707298, + "grad_norm": 0.4002763802703904, + "learning_rate": 3.573294371614361e-06, + "loss": 0.1905, + "step": 5685 + }, + { + "epoch": 1.8988144932376023, + "grad_norm": 0.4236073409479343, + "learning_rate": 3.571431938190879e-06, + "loss": 0.1981, + "step": 5686 + }, + { + "epoch": 1.8991484388044748, + "grad_norm": 0.3916550676455339, + "learning_rate": 3.5695697205694246e-06, + "loss": 0.1984, + "step": 5687 + }, + { + "epoch": 1.8994823843713475, + "grad_norm": 0.4034111232421996, + "learning_rate": 3.567707719031306e-06, + "loss": 0.195, + "step": 5688 + }, + { + "epoch": 1.89981632993822, + "grad_norm": 0.41960190854899787, + "learning_rate": 3.5658459338578016e-06, + "loss": 0.2041, + "step": 5689 + }, + { + "epoch": 1.9001502755050925, + "grad_norm": 0.40615605662054505, + "learning_rate": 3.563984365330153e-06, + "loss": 0.1915, + "step": 5690 + }, + { + "epoch": 1.9004842210719652, + "grad_norm": 0.3654123514969394, + "learning_rate": 3.562123013729577e-06, + "loss": 0.1834, + "step": 5691 + }, + { + "epoch": 1.900818166638838, + "grad_norm": 0.43052112290651406, + "learning_rate": 3.56026187933725e-06, + "loss": 0.2023, + "step": 5692 + }, + { + "epoch": 1.9011521122057105, + "grad_norm": 0.4043205360016579, + "learning_rate": 3.5584009624343187e-06, + "loss": 0.2009, + "step": 5693 + }, + { + "epoch": 1.901486057772583, + "grad_norm": 0.38893024556161726, + "learning_rate": 3.5565402633018963e-06, + "loss": 0.1861, + "step": 5694 + }, + { + "epoch": 1.9018200033394557, + "grad_norm": 0.4320452390717206, + "learning_rate": 3.554679782221063e-06, + "loss": 0.19, + "step": 5695 + }, + { + "epoch": 1.9021539489063284, + "grad_norm": 0.3916662102911691, + "learning_rate": 3.552819519472865e-06, + "loss": 0.1877, + "step": 5696 + }, + { + "epoch": 1.9024878944732009, + "grad_norm": 0.44565474629895935, + "learning_rate": 3.5509594753383202e-06, + "loss": 0.2008, + "step": 5697 + }, + { + "epoch": 1.9028218400400734, + "grad_norm": 0.41234932037291344, + "learning_rate": 3.5490996500984085e-06, + "loss": 0.1962, + "step": 5698 + }, + { + "epoch": 1.903155785606946, + "grad_norm": 0.3981277646668938, + "learning_rate": 3.547240044034079e-06, + "loss": 0.1839, + "step": 5699 + }, + { + "epoch": 1.9034897311738188, + "grad_norm": 0.3711733861198523, + "learning_rate": 3.545380657426247e-06, + "loss": 0.1824, + "step": 5700 + }, + { + "epoch": 1.9038236767406913, + "grad_norm": 0.4233679288176297, + "learning_rate": 3.5435214905557937e-06, + "loss": 0.1844, + "step": 5701 + }, + { + "epoch": 1.9041576223075638, + "grad_norm": 0.3896155487732712, + "learning_rate": 3.5416625437035656e-06, + "loss": 0.1896, + "step": 5702 + }, + { + "epoch": 1.9044915678744365, + "grad_norm": 0.37448542490424214, + "learning_rate": 3.539803817150385e-06, + "loss": 0.184, + "step": 5703 + }, + { + "epoch": 1.904825513441309, + "grad_norm": 0.4181501138090368, + "learning_rate": 3.5379453111770313e-06, + "loss": 0.2077, + "step": 5704 + }, + { + "epoch": 1.9051594590081815, + "grad_norm": 0.43828339837035857, + "learning_rate": 3.536087026064252e-06, + "loss": 0.2003, + "step": 5705 + }, + { + "epoch": 1.9054934045750542, + "grad_norm": 0.3939854984655365, + "learning_rate": 3.534228962092766e-06, + "loss": 0.1927, + "step": 5706 + }, + { + "epoch": 1.905827350141927, + "grad_norm": 0.3850822547742392, + "learning_rate": 3.5323711195432533e-06, + "loss": 0.1921, + "step": 5707 + }, + { + "epoch": 1.9061612957087994, + "grad_norm": 0.3797573178782776, + "learning_rate": 3.530513498696363e-06, + "loss": 0.1868, + "step": 5708 + }, + { + "epoch": 1.906495241275672, + "grad_norm": 0.3974351606525824, + "learning_rate": 3.5286560998327125e-06, + "loss": 0.1957, + "step": 5709 + }, + { + "epoch": 1.9068291868425447, + "grad_norm": 0.37982353045389616, + "learning_rate": 3.5267989232328827e-06, + "loss": 0.1912, + "step": 5710 + }, + { + "epoch": 1.9071631324094174, + "grad_norm": 0.4056309512887979, + "learning_rate": 3.5249419691774212e-06, + "loss": 0.1878, + "step": 5711 + }, + { + "epoch": 1.9074970779762899, + "grad_norm": 0.41189171348380066, + "learning_rate": 3.523085237946844e-06, + "loss": 0.1928, + "step": 5712 + }, + { + "epoch": 1.9078310235431624, + "grad_norm": 0.39755751196618816, + "learning_rate": 3.5212287298216306e-06, + "loss": 0.1903, + "step": 5713 + }, + { + "epoch": 1.908164969110035, + "grad_norm": 0.4965401018841489, + "learning_rate": 3.5193724450822296e-06, + "loss": 0.2001, + "step": 5714 + }, + { + "epoch": 1.9084989146769078, + "grad_norm": 0.41952782189260945, + "learning_rate": 3.517516384009056e-06, + "loss": 0.1897, + "step": 5715 + }, + { + "epoch": 1.9088328602437803, + "grad_norm": 0.36265255842951255, + "learning_rate": 3.515660546882488e-06, + "loss": 0.1814, + "step": 5716 + }, + { + "epoch": 1.9091668058106528, + "grad_norm": 0.41875774466473636, + "learning_rate": 3.5138049339828718e-06, + "loss": 0.1956, + "step": 5717 + }, + { + "epoch": 1.9095007513775255, + "grad_norm": 0.45427593055322385, + "learning_rate": 3.5119495455905194e-06, + "loss": 0.1929, + "step": 5718 + }, + { + "epoch": 1.9098346969443982, + "grad_norm": 0.4283125075634053, + "learning_rate": 3.5100943819857082e-06, + "loss": 0.1849, + "step": 5719 + }, + { + "epoch": 1.9101686425112705, + "grad_norm": 0.3912738538037045, + "learning_rate": 3.508239443448685e-06, + "loss": 0.1754, + "step": 5720 + }, + { + "epoch": 1.9105025880781432, + "grad_norm": 0.3783746014875604, + "learning_rate": 3.5063847302596587e-06, + "loss": 0.1858, + "step": 5721 + }, + { + "epoch": 1.910836533645016, + "grad_norm": 0.40411330753733493, + "learning_rate": 3.504530242698806e-06, + "loss": 0.185, + "step": 5722 + }, + { + "epoch": 1.9111704792118884, + "grad_norm": 0.367902521163063, + "learning_rate": 3.5026759810462687e-06, + "loss": 0.1823, + "step": 5723 + }, + { + "epoch": 1.911504424778761, + "grad_norm": 0.36691214379783854, + "learning_rate": 3.5008219455821546e-06, + "loss": 0.1837, + "step": 5724 + }, + { + "epoch": 1.9118383703456336, + "grad_norm": 0.4113856631417277, + "learning_rate": 3.4989681365865363e-06, + "loss": 0.1943, + "step": 5725 + }, + { + "epoch": 1.9121723159125064, + "grad_norm": 0.38908322636188303, + "learning_rate": 3.497114554339457e-06, + "loss": 0.175, + "step": 5726 + }, + { + "epoch": 1.9125062614793789, + "grad_norm": 0.4302171916585288, + "learning_rate": 3.4952611991209197e-06, + "loss": 0.1912, + "step": 5727 + }, + { + "epoch": 1.9128402070462514, + "grad_norm": 0.42508386899522627, + "learning_rate": 3.4934080712108964e-06, + "loss": 0.1986, + "step": 5728 + }, + { + "epoch": 1.913174152613124, + "grad_norm": 0.40828670712191045, + "learning_rate": 3.4915551708893236e-06, + "loss": 0.1983, + "step": 5729 + }, + { + "epoch": 1.9135080981799968, + "grad_norm": 0.3826098539335381, + "learning_rate": 3.489702498436103e-06, + "loss": 0.1892, + "step": 5730 + }, + { + "epoch": 1.9138420437468693, + "grad_norm": 0.37640018946704573, + "learning_rate": 3.487850054131103e-06, + "loss": 0.1825, + "step": 5731 + }, + { + "epoch": 1.9141759893137418, + "grad_norm": 0.423540829528593, + "learning_rate": 3.4859978382541575e-06, + "loss": 0.2008, + "step": 5732 + }, + { + "epoch": 1.9145099348806145, + "grad_norm": 0.40110211348855723, + "learning_rate": 3.4841458510850656e-06, + "loss": 0.1928, + "step": 5733 + }, + { + "epoch": 1.9148438804474872, + "grad_norm": 0.4202765037424674, + "learning_rate": 3.482294092903592e-06, + "loss": 0.1913, + "step": 5734 + }, + { + "epoch": 1.9151778260143597, + "grad_norm": 0.3992739294133403, + "learning_rate": 3.480442563989466e-06, + "loss": 0.1855, + "step": 5735 + }, + { + "epoch": 1.9155117715812322, + "grad_norm": 0.39103883707441306, + "learning_rate": 3.4785912646223813e-06, + "loss": 0.198, + "step": 5736 + }, + { + "epoch": 1.915845717148105, + "grad_norm": 0.42159492620557487, + "learning_rate": 3.4767401950820003e-06, + "loss": 0.2061, + "step": 5737 + }, + { + "epoch": 1.9161796627149774, + "grad_norm": 0.38979029077665317, + "learning_rate": 3.4748893556479497e-06, + "loss": 0.185, + "step": 5738 + }, + { + "epoch": 1.91651360828185, + "grad_norm": 0.3767039745449479, + "learning_rate": 3.4730387465998194e-06, + "loss": 0.1911, + "step": 5739 + }, + { + "epoch": 1.9168475538487226, + "grad_norm": 0.4421262922770817, + "learning_rate": 3.4711883682171666e-06, + "loss": 0.2017, + "step": 5740 + }, + { + "epoch": 1.9171814994155953, + "grad_norm": 0.4073953632957591, + "learning_rate": 3.4693382207795114e-06, + "loss": 0.1913, + "step": 5741 + }, + { + "epoch": 1.9175154449824678, + "grad_norm": 0.3976848565725884, + "learning_rate": 3.4674883045663404e-06, + "loss": 0.1888, + "step": 5742 + }, + { + "epoch": 1.9178493905493403, + "grad_norm": 0.3853218853012135, + "learning_rate": 3.465638619857104e-06, + "loss": 0.1828, + "step": 5743 + }, + { + "epoch": 1.918183336116213, + "grad_norm": 0.40753207467467006, + "learning_rate": 3.463789166931223e-06, + "loss": 0.1907, + "step": 5744 + }, + { + "epoch": 1.9185172816830858, + "grad_norm": 0.3765078672210008, + "learning_rate": 3.4619399460680757e-06, + "loss": 0.1825, + "step": 5745 + }, + { + "epoch": 1.9188512272499583, + "grad_norm": 0.434986647967096, + "learning_rate": 3.460090957547011e-06, + "loss": 0.1806, + "step": 5746 + }, + { + "epoch": 1.9191851728168308, + "grad_norm": 0.40851362414259823, + "learning_rate": 3.4582422016473384e-06, + "loss": 0.1935, + "step": 5747 + }, + { + "epoch": 1.9195191183837035, + "grad_norm": 0.41083715107382207, + "learning_rate": 3.4563936786483345e-06, + "loss": 0.1988, + "step": 5748 + }, + { + "epoch": 1.9198530639505762, + "grad_norm": 0.4074208018702095, + "learning_rate": 3.454545388829239e-06, + "loss": 0.1944, + "step": 5749 + }, + { + "epoch": 1.9201870095174487, + "grad_norm": 0.39520620444751453, + "learning_rate": 3.4526973324692614e-06, + "loss": 0.1908, + "step": 5750 + }, + { + "epoch": 1.9205209550843212, + "grad_norm": 0.43906524174111994, + "learning_rate": 3.4508495098475712e-06, + "loss": 0.1992, + "step": 5751 + }, + { + "epoch": 1.920854900651194, + "grad_norm": 0.4502858723264202, + "learning_rate": 3.4490019212433035e-06, + "loss": 0.2044, + "step": 5752 + }, + { + "epoch": 1.9211888462180664, + "grad_norm": 0.40705908631119775, + "learning_rate": 3.447154566935557e-06, + "loss": 0.1943, + "step": 5753 + }, + { + "epoch": 1.921522791784939, + "grad_norm": 0.39945366422919054, + "learning_rate": 3.4453074472033975e-06, + "loss": 0.1863, + "step": 5754 + }, + { + "epoch": 1.9218567373518116, + "grad_norm": 0.39100600573805244, + "learning_rate": 3.443460562325853e-06, + "loss": 0.1823, + "step": 5755 + }, + { + "epoch": 1.9221906829186843, + "grad_norm": 0.3867960086824861, + "learning_rate": 3.4416139125819204e-06, + "loss": 0.1833, + "step": 5756 + }, + { + "epoch": 1.9225246284855568, + "grad_norm": 0.41121060111634883, + "learning_rate": 3.4397674982505546e-06, + "loss": 0.1887, + "step": 5757 + }, + { + "epoch": 1.9228585740524293, + "grad_norm": 0.42275941705337255, + "learning_rate": 3.43792131961068e-06, + "loss": 0.2092, + "step": 5758 + }, + { + "epoch": 1.923192519619302, + "grad_norm": 0.3843148443080461, + "learning_rate": 3.4360753769411816e-06, + "loss": 0.1749, + "step": 5759 + }, + { + "epoch": 1.9235264651861748, + "grad_norm": 0.39501568437932283, + "learning_rate": 3.4342296705209112e-06, + "loss": 0.1731, + "step": 5760 + }, + { + "epoch": 1.9238604107530473, + "grad_norm": 0.3831115342716545, + "learning_rate": 3.432384200628688e-06, + "loss": 0.1848, + "step": 5761 + }, + { + "epoch": 1.9241943563199198, + "grad_norm": 0.3925566970493553, + "learning_rate": 3.4305389675432882e-06, + "loss": 0.1923, + "step": 5762 + }, + { + "epoch": 1.9245283018867925, + "grad_norm": 0.4353970565860827, + "learning_rate": 3.4286939715434573e-06, + "loss": 0.1941, + "step": 5763 + }, + { + "epoch": 1.9248622474536652, + "grad_norm": 0.3853234833941961, + "learning_rate": 3.4268492129079047e-06, + "loss": 0.1862, + "step": 5764 + }, + { + "epoch": 1.9251961930205377, + "grad_norm": 0.43244855006490596, + "learning_rate": 3.4250046919153e-06, + "loss": 0.199, + "step": 5765 + }, + { + "epoch": 1.9255301385874102, + "grad_norm": 0.44410365215339354, + "learning_rate": 3.4231604088442806e-06, + "loss": 0.196, + "step": 5766 + }, + { + "epoch": 1.925864084154283, + "grad_norm": 0.4221818596900024, + "learning_rate": 3.4213163639734504e-06, + "loss": 0.1965, + "step": 5767 + }, + { + "epoch": 1.9261980297211556, + "grad_norm": 0.40430757506097365, + "learning_rate": 3.4194725575813707e-06, + "loss": 0.1871, + "step": 5768 + }, + { + "epoch": 1.9265319752880279, + "grad_norm": 0.39690720279215563, + "learning_rate": 3.417628989946572e-06, + "loss": 0.1866, + "step": 5769 + }, + { + "epoch": 1.9268659208549006, + "grad_norm": 0.41882654166179956, + "learning_rate": 3.415785661347546e-06, + "loss": 0.1976, + "step": 5770 + }, + { + "epoch": 1.9271998664217733, + "grad_norm": 0.42589760681235683, + "learning_rate": 3.4139425720627494e-06, + "loss": 0.1931, + "step": 5771 + }, + { + "epoch": 1.9275338119886458, + "grad_norm": 0.42698531652640753, + "learning_rate": 3.412099722370601e-06, + "loss": 0.1954, + "step": 5772 + }, + { + "epoch": 1.9278677575555183, + "grad_norm": 0.4458391967702739, + "learning_rate": 3.4102571125494877e-06, + "loss": 0.2061, + "step": 5773 + }, + { + "epoch": 1.928201703122391, + "grad_norm": 0.42935606022669165, + "learning_rate": 3.408414742877757e-06, + "loss": 0.2027, + "step": 5774 + }, + { + "epoch": 1.9285356486892637, + "grad_norm": 0.4383502567902757, + "learning_rate": 3.406572613633719e-06, + "loss": 0.1894, + "step": 5775 + }, + { + "epoch": 1.9288695942561362, + "grad_norm": 0.40057496070461995, + "learning_rate": 3.40473072509565e-06, + "loss": 0.199, + "step": 5776 + }, + { + "epoch": 1.9292035398230087, + "grad_norm": 0.39138634692508933, + "learning_rate": 3.4028890775417887e-06, + "loss": 0.1884, + "step": 5777 + }, + { + "epoch": 1.9295374853898815, + "grad_norm": 0.4266213221820064, + "learning_rate": 3.4010476712503367e-06, + "loss": 0.1957, + "step": 5778 + }, + { + "epoch": 1.9298714309567542, + "grad_norm": 0.41962434822821776, + "learning_rate": 3.3992065064994615e-06, + "loss": 0.1986, + "step": 5779 + }, + { + "epoch": 1.9302053765236267, + "grad_norm": 0.40653053851480586, + "learning_rate": 3.3973655835672923e-06, + "loss": 0.1872, + "step": 5780 + }, + { + "epoch": 1.9305393220904992, + "grad_norm": 0.3992762796239201, + "learning_rate": 3.3955249027319214e-06, + "loss": 0.1926, + "step": 5781 + }, + { + "epoch": 1.9308732676573719, + "grad_norm": 0.39309596029124866, + "learning_rate": 3.3936844642714073e-06, + "loss": 0.1885, + "step": 5782 + }, + { + "epoch": 1.9312072132242446, + "grad_norm": 0.3967296969011724, + "learning_rate": 3.3918442684637687e-06, + "loss": 0.1945, + "step": 5783 + }, + { + "epoch": 1.931541158791117, + "grad_norm": 0.37389292859170953, + "learning_rate": 3.3900043155869865e-06, + "loss": 0.1857, + "step": 5784 + }, + { + "epoch": 1.9318751043579896, + "grad_norm": 0.406835201145131, + "learning_rate": 3.388164605919012e-06, + "loss": 0.1912, + "step": 5785 + }, + { + "epoch": 1.9322090499248623, + "grad_norm": 0.40644414055673206, + "learning_rate": 3.3863251397377516e-06, + "loss": 0.1882, + "step": 5786 + }, + { + "epoch": 1.9325429954917348, + "grad_norm": 0.4534582493017801, + "learning_rate": 3.3844859173210797e-06, + "loss": 0.2098, + "step": 5787 + }, + { + "epoch": 1.9328769410586073, + "grad_norm": 0.40570770824932884, + "learning_rate": 3.382646938946832e-06, + "loss": 0.182, + "step": 5788 + }, + { + "epoch": 1.93321088662548, + "grad_norm": 0.4125254859784587, + "learning_rate": 3.3808082048928083e-06, + "loss": 0.1827, + "step": 5789 + }, + { + "epoch": 1.9335448321923527, + "grad_norm": 0.42050886372039403, + "learning_rate": 3.378969715436767e-06, + "loss": 0.1788, + "step": 5790 + }, + { + "epoch": 1.9338787777592252, + "grad_norm": 0.4029343552361385, + "learning_rate": 3.3771314708564408e-06, + "loss": 0.1932, + "step": 5791 + }, + { + "epoch": 1.9342127233260977, + "grad_norm": 0.4431266567668056, + "learning_rate": 3.3752934714295146e-06, + "loss": 0.2059, + "step": 5792 + }, + { + "epoch": 1.9345466688929704, + "grad_norm": 0.393345686460719, + "learning_rate": 3.373455717433639e-06, + "loss": 0.2035, + "step": 5793 + }, + { + "epoch": 1.9348806144598432, + "grad_norm": 0.4071248248099155, + "learning_rate": 3.3716182091464295e-06, + "loss": 0.1797, + "step": 5794 + }, + { + "epoch": 1.9352145600267157, + "grad_norm": 0.41122336422676053, + "learning_rate": 3.3697809468454634e-06, + "loss": 0.1922, + "step": 5795 + }, + { + "epoch": 1.9355485055935882, + "grad_norm": 0.4212957583988024, + "learning_rate": 3.3679439308082777e-06, + "loss": 0.1996, + "step": 5796 + }, + { + "epoch": 1.9358824511604609, + "grad_norm": 0.3994840928223961, + "learning_rate": 3.366107161312381e-06, + "loss": 0.1845, + "step": 5797 + }, + { + "epoch": 1.9362163967273336, + "grad_norm": 0.42573497457455245, + "learning_rate": 3.3642706386352355e-06, + "loss": 0.1994, + "step": 5798 + }, + { + "epoch": 1.936550342294206, + "grad_norm": 0.3631236184721829, + "learning_rate": 3.3624343630542707e-06, + "loss": 0.176, + "step": 5799 + }, + { + "epoch": 1.9368842878610786, + "grad_norm": 0.38488554412626735, + "learning_rate": 3.3605983348468764e-06, + "loss": 0.1882, + "step": 5800 + }, + { + "epoch": 1.9372182334279513, + "grad_norm": 0.47199555184017095, + "learning_rate": 3.3587625542904063e-06, + "loss": 0.2026, + "step": 5801 + }, + { + "epoch": 1.9375521789948238, + "grad_norm": 0.4601046109909035, + "learning_rate": 3.356927021662178e-06, + "loss": 0.1916, + "step": 5802 + }, + { + "epoch": 1.9378861245616963, + "grad_norm": 0.3994198582194862, + "learning_rate": 3.3550917372394696e-06, + "loss": 0.184, + "step": 5803 + }, + { + "epoch": 1.938220070128569, + "grad_norm": 0.3734853608354193, + "learning_rate": 3.353256701299522e-06, + "loss": 0.1833, + "step": 5804 + }, + { + "epoch": 1.9385540156954417, + "grad_norm": 0.38879660452435616, + "learning_rate": 3.3514219141195404e-06, + "loss": 0.1845, + "step": 5805 + }, + { + "epoch": 1.9388879612623142, + "grad_norm": 0.38560538917320175, + "learning_rate": 3.3495873759766897e-06, + "loss": 0.1926, + "step": 5806 + }, + { + "epoch": 1.9392219068291867, + "grad_norm": 0.38353142779479343, + "learning_rate": 3.347753087148098e-06, + "loss": 0.1838, + "step": 5807 + }, + { + "epoch": 1.9395558523960594, + "grad_norm": 0.44448297494571126, + "learning_rate": 3.3459190479108583e-06, + "loss": 0.199, + "step": 5808 + }, + { + "epoch": 1.9398897979629321, + "grad_norm": 0.4529776498547492, + "learning_rate": 3.344085258542022e-06, + "loss": 0.1979, + "step": 5809 + }, + { + "epoch": 1.9402237435298046, + "grad_norm": 0.4124811126918051, + "learning_rate": 3.3422517193186056e-06, + "loss": 0.2068, + "step": 5810 + }, + { + "epoch": 1.9405576890966771, + "grad_norm": 0.39920971830243007, + "learning_rate": 3.340418430517586e-06, + "loss": 0.1938, + "step": 5811 + }, + { + "epoch": 1.9408916346635499, + "grad_norm": 0.4222813253850214, + "learning_rate": 3.338585392415904e-06, + "loss": 0.2053, + "step": 5812 + }, + { + "epoch": 1.9412255802304226, + "grad_norm": 0.4161721442405356, + "learning_rate": 3.3367526052904585e-06, + "loss": 0.1787, + "step": 5813 + }, + { + "epoch": 1.941559525797295, + "grad_norm": 0.4879776692990766, + "learning_rate": 3.3349200694181182e-06, + "loss": 0.2053, + "step": 5814 + }, + { + "epoch": 1.9418934713641676, + "grad_norm": 0.4021223636861502, + "learning_rate": 3.333087785075707e-06, + "loss": 0.1964, + "step": 5815 + }, + { + "epoch": 1.9422274169310403, + "grad_norm": 0.37651593810978384, + "learning_rate": 3.3312557525400133e-06, + "loss": 0.1715, + "step": 5816 + }, + { + "epoch": 1.942561362497913, + "grad_norm": 0.4319318531805005, + "learning_rate": 3.329423972087787e-06, + "loss": 0.1955, + "step": 5817 + }, + { + "epoch": 1.9428953080647853, + "grad_norm": 0.4141613206128405, + "learning_rate": 3.3275924439957397e-06, + "loss": 0.2074, + "step": 5818 + }, + { + "epoch": 1.943229253631658, + "grad_norm": 0.38424218929059795, + "learning_rate": 3.3257611685405444e-06, + "loss": 0.1827, + "step": 5819 + }, + { + "epoch": 1.9435631991985307, + "grad_norm": 0.45704517395065586, + "learning_rate": 3.3239301459988395e-06, + "loss": 0.204, + "step": 5820 + }, + { + "epoch": 1.9438971447654032, + "grad_norm": 0.4519944942975536, + "learning_rate": 3.322099376647221e-06, + "loss": 0.1961, + "step": 5821 + }, + { + "epoch": 1.9442310903322757, + "grad_norm": 0.3865117153162869, + "learning_rate": 3.320268860762249e-06, + "loss": 0.1842, + "step": 5822 + }, + { + "epoch": 1.9445650358991484, + "grad_norm": 0.39394807708092394, + "learning_rate": 3.318438598620444e-06, + "loss": 0.1934, + "step": 5823 + }, + { + "epoch": 1.9448989814660211, + "grad_norm": 0.4040471014908394, + "learning_rate": 3.316608590498287e-06, + "loss": 0.1833, + "step": 5824 + }, + { + "epoch": 1.9452329270328936, + "grad_norm": 0.42368161376371266, + "learning_rate": 3.314778836672224e-06, + "loss": 0.1989, + "step": 5825 + }, + { + "epoch": 1.9455668725997661, + "grad_norm": 0.3799786063158331, + "learning_rate": 3.312949337418661e-06, + "loss": 0.1903, + "step": 5826 + }, + { + "epoch": 1.9459008181666388, + "grad_norm": 0.42001314369222936, + "learning_rate": 3.311120093013964e-06, + "loss": 0.2029, + "step": 5827 + }, + { + "epoch": 1.9462347637335116, + "grad_norm": 0.4491441693231125, + "learning_rate": 3.3092911037344642e-06, + "loss": 0.1899, + "step": 5828 + }, + { + "epoch": 1.946568709300384, + "grad_norm": 0.40616158884667664, + "learning_rate": 3.30746236985645e-06, + "loss": 0.1902, + "step": 5829 + }, + { + "epoch": 1.9469026548672566, + "grad_norm": 0.41435867845292346, + "learning_rate": 3.305633891656175e-06, + "loss": 0.1944, + "step": 5830 + }, + { + "epoch": 1.9472366004341293, + "grad_norm": 0.3835569025294799, + "learning_rate": 3.3038056694098485e-06, + "loss": 0.1866, + "step": 5831 + }, + { + "epoch": 1.947570546001002, + "grad_norm": 0.41557182152841543, + "learning_rate": 3.3019777033936497e-06, + "loss": 0.1941, + "step": 5832 + }, + { + "epoch": 1.9479044915678745, + "grad_norm": 0.4136102028672037, + "learning_rate": 3.3001499938837124e-06, + "loss": 0.199, + "step": 5833 + }, + { + "epoch": 1.948238437134747, + "grad_norm": 0.42808162442284475, + "learning_rate": 3.2983225411561338e-06, + "loss": 0.1888, + "step": 5834 + }, + { + "epoch": 1.9485723827016197, + "grad_norm": 0.4239596741209335, + "learning_rate": 3.296495345486971e-06, + "loss": 0.1947, + "step": 5835 + }, + { + "epoch": 1.9489063282684922, + "grad_norm": 0.38609956620089547, + "learning_rate": 3.294668407152245e-06, + "loss": 0.1887, + "step": 5836 + }, + { + "epoch": 1.9492402738353647, + "grad_norm": 0.39282968924115996, + "learning_rate": 3.2928417264279338e-06, + "loss": 0.1866, + "step": 5837 + }, + { + "epoch": 1.9495742194022374, + "grad_norm": 0.37824694049603946, + "learning_rate": 3.2910153035899826e-06, + "loss": 0.1867, + "step": 5838 + }, + { + "epoch": 1.9499081649691101, + "grad_norm": 0.43800186138716113, + "learning_rate": 3.2891891389142933e-06, + "loss": 0.2164, + "step": 5839 + }, + { + "epoch": 1.9502421105359826, + "grad_norm": 0.38749810000277757, + "learning_rate": 3.2873632326767278e-06, + "loss": 0.1806, + "step": 5840 + }, + { + "epoch": 1.9505760561028551, + "grad_norm": 0.393525628067674, + "learning_rate": 3.2855375851531122e-06, + "loss": 0.1861, + "step": 5841 + }, + { + "epoch": 1.9509100016697278, + "grad_norm": 0.4351462404079153, + "learning_rate": 3.283712196619229e-06, + "loss": 0.2038, + "step": 5842 + }, + { + "epoch": 1.9512439472366006, + "grad_norm": 0.42090443021448815, + "learning_rate": 3.2818870673508297e-06, + "loss": 0.2048, + "step": 5843 + }, + { + "epoch": 1.951577892803473, + "grad_norm": 0.3790280867900267, + "learning_rate": 3.2800621976236184e-06, + "loss": 0.1874, + "step": 5844 + }, + { + "epoch": 1.9519118383703455, + "grad_norm": 0.40688808218095657, + "learning_rate": 3.2782375877132643e-06, + "loss": 0.1808, + "step": 5845 + }, + { + "epoch": 1.9522457839372183, + "grad_norm": 0.4256516285672986, + "learning_rate": 3.276413237895395e-06, + "loss": 0.1954, + "step": 5846 + }, + { + "epoch": 1.952579729504091, + "grad_norm": 0.4171161147626943, + "learning_rate": 3.2745891484456016e-06, + "loss": 0.2009, + "step": 5847 + }, + { + "epoch": 1.9529136750709635, + "grad_norm": 0.5044605426457384, + "learning_rate": 3.2727653196394314e-06, + "loss": 0.194, + "step": 5848 + }, + { + "epoch": 1.953247620637836, + "grad_norm": 0.40522989200496995, + "learning_rate": 3.270941751752398e-06, + "loss": 0.191, + "step": 5849 + }, + { + "epoch": 1.9535815662047087, + "grad_norm": 0.39708955867580614, + "learning_rate": 3.269118445059973e-06, + "loss": 0.1799, + "step": 5850 + }, + { + "epoch": 1.9539155117715812, + "grad_norm": 0.40876661685989146, + "learning_rate": 3.267295399837587e-06, + "loss": 0.1914, + "step": 5851 + }, + { + "epoch": 1.9542494573384537, + "grad_norm": 0.39832148352306906, + "learning_rate": 3.2654726163606333e-06, + "loss": 0.1843, + "step": 5852 + }, + { + "epoch": 1.9545834029053264, + "grad_norm": 0.4339968232593733, + "learning_rate": 3.2636500949044637e-06, + "loss": 0.204, + "step": 5853 + }, + { + "epoch": 1.9549173484721991, + "grad_norm": 0.40521624053581823, + "learning_rate": 3.2618278357443913e-06, + "loss": 0.1923, + "step": 5854 + }, + { + "epoch": 1.9552512940390716, + "grad_norm": 0.40925796162009104, + "learning_rate": 3.260005839155691e-06, + "loss": 0.1942, + "step": 5855 + }, + { + "epoch": 1.955585239605944, + "grad_norm": 0.4065519345018825, + "learning_rate": 3.258184105413597e-06, + "loss": 0.1929, + "step": 5856 + }, + { + "epoch": 1.9559191851728168, + "grad_norm": 0.40373783185327916, + "learning_rate": 3.256362634793303e-06, + "loss": 0.1836, + "step": 5857 + }, + { + "epoch": 1.9562531307396895, + "grad_norm": 0.466912546541022, + "learning_rate": 3.2545414275699638e-06, + "loss": 0.2051, + "step": 5858 + }, + { + "epoch": 1.956587076306562, + "grad_norm": 0.4119496217398985, + "learning_rate": 3.2527204840186944e-06, + "loss": 0.2009, + "step": 5859 + }, + { + "epoch": 1.9569210218734345, + "grad_norm": 0.427568161677971, + "learning_rate": 3.2508998044145674e-06, + "loss": 0.1853, + "step": 5860 + }, + { + "epoch": 1.9572549674403072, + "grad_norm": 0.3964003737577245, + "learning_rate": 3.249079389032621e-06, + "loss": 0.1931, + "step": 5861 + }, + { + "epoch": 1.95758891300718, + "grad_norm": 0.3733715502028118, + "learning_rate": 3.247259238147851e-06, + "loss": 0.1891, + "step": 5862 + }, + { + "epoch": 1.9579228585740525, + "grad_norm": 0.387516171710147, + "learning_rate": 3.245439352035209e-06, + "loss": 0.1937, + "step": 5863 + }, + { + "epoch": 1.958256804140925, + "grad_norm": 0.41829601573720177, + "learning_rate": 3.243619730969614e-06, + "loss": 0.2014, + "step": 5864 + }, + { + "epoch": 1.9585907497077977, + "grad_norm": 0.37178507032150165, + "learning_rate": 3.2418003752259374e-06, + "loss": 0.1829, + "step": 5865 + }, + { + "epoch": 1.9589246952746704, + "grad_norm": 0.3755322252943328, + "learning_rate": 3.239981285079016e-06, + "loss": 0.1816, + "step": 5866 + }, + { + "epoch": 1.9592586408415427, + "grad_norm": 0.5064830664211579, + "learning_rate": 3.238162460803646e-06, + "loss": 0.1801, + "step": 5867 + }, + { + "epoch": 1.9595925864084154, + "grad_norm": 0.40514585437042433, + "learning_rate": 3.2363439026745813e-06, + "loss": 0.1937, + "step": 5868 + }, + { + "epoch": 1.959926531975288, + "grad_norm": 0.3949337542067921, + "learning_rate": 3.2345256109665366e-06, + "loss": 0.1769, + "step": 5869 + }, + { + "epoch": 1.9602604775421606, + "grad_norm": 0.39658019897229246, + "learning_rate": 3.2327075859541867e-06, + "loss": 0.1755, + "step": 5870 + }, + { + "epoch": 1.960594423109033, + "grad_norm": 0.4007129170784194, + "learning_rate": 3.2308898279121646e-06, + "loss": 0.1887, + "step": 5871 + }, + { + "epoch": 1.9609283686759058, + "grad_norm": 0.407882174298797, + "learning_rate": 3.2290723371150627e-06, + "loss": 0.1933, + "step": 5872 + }, + { + "epoch": 1.9612623142427785, + "grad_norm": 0.38390199585380574, + "learning_rate": 3.2272551138374387e-06, + "loss": 0.1857, + "step": 5873 + }, + { + "epoch": 1.961596259809651, + "grad_norm": 0.4144353996140741, + "learning_rate": 3.2254381583538025e-06, + "loss": 0.2052, + "step": 5874 + }, + { + "epoch": 1.9619302053765235, + "grad_norm": 0.42520106945104263, + "learning_rate": 3.223621470938628e-06, + "loss": 0.1989, + "step": 5875 + }, + { + "epoch": 1.9622641509433962, + "grad_norm": 0.39528450044868774, + "learning_rate": 3.2218050518663457e-06, + "loss": 0.1952, + "step": 5876 + }, + { + "epoch": 1.962598096510269, + "grad_norm": 0.38997068609060653, + "learning_rate": 3.219988901411347e-06, + "loss": 0.1905, + "step": 5877 + }, + { + "epoch": 1.9629320420771414, + "grad_norm": 0.43396687286925384, + "learning_rate": 3.218173019847985e-06, + "loss": 0.195, + "step": 5878 + }, + { + "epoch": 1.963265987644014, + "grad_norm": 0.41610373323796274, + "learning_rate": 3.2163574074505686e-06, + "loss": 0.2019, + "step": 5879 + }, + { + "epoch": 1.9635999332108867, + "grad_norm": 0.41341146951828495, + "learning_rate": 3.214542064493367e-06, + "loss": 0.2027, + "step": 5880 + }, + { + "epoch": 1.9639338787777594, + "grad_norm": 0.41138072181872515, + "learning_rate": 3.2127269912506103e-06, + "loss": 0.1868, + "step": 5881 + }, + { + "epoch": 1.9642678243446319, + "grad_norm": 0.3866031619654102, + "learning_rate": 3.210912187996486e-06, + "loss": 0.1932, + "step": 5882 + }, + { + "epoch": 1.9646017699115044, + "grad_norm": 0.4016530046534839, + "learning_rate": 3.2090976550051393e-06, + "loss": 0.2022, + "step": 5883 + }, + { + "epoch": 1.964935715478377, + "grad_norm": 0.4546880535883009, + "learning_rate": 3.207283392550681e-06, + "loss": 0.1937, + "step": 5884 + }, + { + "epoch": 1.9652696610452496, + "grad_norm": 0.3751451368240068, + "learning_rate": 3.2054694009071753e-06, + "loss": 0.1766, + "step": 5885 + }, + { + "epoch": 1.965603606612122, + "grad_norm": 0.4249952733519631, + "learning_rate": 3.2036556803486465e-06, + "loss": 0.1897, + "step": 5886 + }, + { + "epoch": 1.9659375521789948, + "grad_norm": 0.4289533302323695, + "learning_rate": 3.2018422311490778e-06, + "loss": 0.1872, + "step": 5887 + }, + { + "epoch": 1.9662714977458675, + "grad_norm": 0.4156318288794827, + "learning_rate": 3.200029053582413e-06, + "loss": 0.2013, + "step": 5888 + }, + { + "epoch": 1.96660544331274, + "grad_norm": 0.4234857984820954, + "learning_rate": 3.1982161479225514e-06, + "loss": 0.2035, + "step": 5889 + }, + { + "epoch": 1.9669393888796125, + "grad_norm": 0.4631603579410054, + "learning_rate": 3.196403514443358e-06, + "loss": 0.212, + "step": 5890 + }, + { + "epoch": 1.9672733344464852, + "grad_norm": 0.39403994004750187, + "learning_rate": 3.19459115341865e-06, + "loss": 0.1785, + "step": 5891 + }, + { + "epoch": 1.967607280013358, + "grad_norm": 0.3926567448420319, + "learning_rate": 3.1927790651222073e-06, + "loss": 0.189, + "step": 5892 + }, + { + "epoch": 1.9679412255802304, + "grad_norm": 0.3989322852990989, + "learning_rate": 3.1909672498277656e-06, + "loss": 0.2059, + "step": 5893 + }, + { + "epoch": 1.968275171147103, + "grad_norm": 0.42083685608122595, + "learning_rate": 3.1891557078090218e-06, + "loss": 0.1998, + "step": 5894 + }, + { + "epoch": 1.9686091167139756, + "grad_norm": 0.4383329776305396, + "learning_rate": 3.187344439339628e-06, + "loss": 0.1941, + "step": 5895 + }, + { + "epoch": 1.9689430622808484, + "grad_norm": 0.41449458340242895, + "learning_rate": 3.1855334446932025e-06, + "loss": 0.1944, + "step": 5896 + }, + { + "epoch": 1.9692770078477209, + "grad_norm": 0.3717497165199124, + "learning_rate": 3.1837227241433145e-06, + "loss": 0.1877, + "step": 5897 + }, + { + "epoch": 1.9696109534145934, + "grad_norm": 0.4011174641338493, + "learning_rate": 3.181912277963495e-06, + "loss": 0.1987, + "step": 5898 + }, + { + "epoch": 1.969944898981466, + "grad_norm": 0.38902286671665814, + "learning_rate": 3.180102106427233e-06, + "loss": 0.1867, + "step": 5899 + }, + { + "epoch": 1.9702788445483386, + "grad_norm": 0.42521419169467267, + "learning_rate": 3.178292209807976e-06, + "loss": 0.2018, + "step": 5900 + }, + { + "epoch": 1.970612790115211, + "grad_norm": 0.4065707042117532, + "learning_rate": 3.1764825883791306e-06, + "loss": 0.1885, + "step": 5901 + }, + { + "epoch": 1.9709467356820838, + "grad_norm": 0.412725809040078, + "learning_rate": 3.174673242414062e-06, + "loss": 0.1897, + "step": 5902 + }, + { + "epoch": 1.9712806812489565, + "grad_norm": 0.375413582686983, + "learning_rate": 3.1728641721860925e-06, + "loss": 0.185, + "step": 5903 + }, + { + "epoch": 1.971614626815829, + "grad_norm": 0.41467689840017014, + "learning_rate": 3.1710553779685036e-06, + "loss": 0.1973, + "step": 5904 + }, + { + "epoch": 1.9719485723827015, + "grad_norm": 0.41671275410394143, + "learning_rate": 3.169246860034535e-06, + "loss": 0.2082, + "step": 5905 + }, + { + "epoch": 1.9722825179495742, + "grad_norm": 0.4057263289652559, + "learning_rate": 3.1674386186573853e-06, + "loss": 0.1897, + "step": 5906 + }, + { + "epoch": 1.972616463516447, + "grad_norm": 0.38913647089351844, + "learning_rate": 3.1656306541102073e-06, + "loss": 0.1856, + "step": 5907 + }, + { + "epoch": 1.9729504090833194, + "grad_norm": 0.3486855638934541, + "learning_rate": 3.16382296666612e-06, + "loss": 0.1679, + "step": 5908 + }, + { + "epoch": 1.973284354650192, + "grad_norm": 0.4014595623992815, + "learning_rate": 3.1620155565981942e-06, + "loss": 0.191, + "step": 5909 + }, + { + "epoch": 1.9736183002170646, + "grad_norm": 0.40381926251995, + "learning_rate": 3.1602084241794595e-06, + "loss": 0.1839, + "step": 5910 + }, + { + "epoch": 1.9739522457839374, + "grad_norm": 0.37605441183464833, + "learning_rate": 3.158401569682906e-06, + "loss": 0.1875, + "step": 5911 + }, + { + "epoch": 1.9742861913508098, + "grad_norm": 0.3777469896395668, + "learning_rate": 3.156594993381479e-06, + "loss": 0.1886, + "step": 5912 + }, + { + "epoch": 1.9746201369176823, + "grad_norm": 0.3736417903821561, + "learning_rate": 3.154788695548082e-06, + "loss": 0.1822, + "step": 5913 + }, + { + "epoch": 1.974954082484555, + "grad_norm": 0.3663210624038179, + "learning_rate": 3.152982676455581e-06, + "loss": 0.1726, + "step": 5914 + }, + { + "epoch": 1.9752880280514278, + "grad_norm": 0.39921583039669706, + "learning_rate": 3.151176936376794e-06, + "loss": 0.1952, + "step": 5915 + }, + { + "epoch": 1.9756219736183, + "grad_norm": 0.41004780294376225, + "learning_rate": 3.1493714755845013e-06, + "loss": 0.1892, + "step": 5916 + }, + { + "epoch": 1.9759559191851728, + "grad_norm": 0.3953073598725052, + "learning_rate": 3.1475662943514366e-06, + "loss": 0.1846, + "step": 5917 + }, + { + "epoch": 1.9762898647520455, + "grad_norm": 0.41934031809748096, + "learning_rate": 3.145761392950293e-06, + "loss": 0.1938, + "step": 5918 + }, + { + "epoch": 1.976623810318918, + "grad_norm": 0.3991536627450681, + "learning_rate": 3.1439567716537268e-06, + "loss": 0.1842, + "step": 5919 + }, + { + "epoch": 1.9769577558857905, + "grad_norm": 0.42211108035671385, + "learning_rate": 3.142152430734343e-06, + "loss": 0.1926, + "step": 5920 + }, + { + "epoch": 1.9772917014526632, + "grad_norm": 0.3874004858043167, + "learning_rate": 3.140348370464711e-06, + "loss": 0.1825, + "step": 5921 + }, + { + "epoch": 1.977625647019536, + "grad_norm": 0.4959700492993034, + "learning_rate": 3.138544591117354e-06, + "loss": 0.1881, + "step": 5922 + }, + { + "epoch": 1.9779595925864084, + "grad_norm": 0.4058894672658346, + "learning_rate": 3.1367410929647544e-06, + "loss": 0.2038, + "step": 5923 + }, + { + "epoch": 1.978293538153281, + "grad_norm": 0.4178550089316045, + "learning_rate": 3.1349378762793515e-06, + "loss": 0.2012, + "step": 5924 + }, + { + "epoch": 1.9786274837201536, + "grad_norm": 0.4004663381408806, + "learning_rate": 3.133134941333543e-06, + "loss": 0.1837, + "step": 5925 + }, + { + "epoch": 1.9789614292870263, + "grad_norm": 0.41889122105286186, + "learning_rate": 3.1313322883996833e-06, + "loss": 0.2016, + "step": 5926 + }, + { + "epoch": 1.9792953748538988, + "grad_norm": 0.39423179870453967, + "learning_rate": 3.129529917750085e-06, + "loss": 0.1859, + "step": 5927 + }, + { + "epoch": 1.9796293204207713, + "grad_norm": 0.4176399622736979, + "learning_rate": 3.1277278296570157e-06, + "loss": 0.1969, + "step": 5928 + }, + { + "epoch": 1.979963265987644, + "grad_norm": 0.720364647524329, + "learning_rate": 3.1259260243927035e-06, + "loss": 0.1926, + "step": 5929 + }, + { + "epoch": 1.9802972115545168, + "grad_norm": 0.41290547958373885, + "learning_rate": 3.12412450222933e-06, + "loss": 0.1898, + "step": 5930 + }, + { + "epoch": 1.9806311571213893, + "grad_norm": 0.3671561676737529, + "learning_rate": 3.12232326343904e-06, + "loss": 0.1741, + "step": 5931 + }, + { + "epoch": 1.9809651026882618, + "grad_norm": 0.4006350592225924, + "learning_rate": 3.1205223082939302e-06, + "loss": 0.1908, + "step": 5932 + }, + { + "epoch": 1.9812990482551345, + "grad_norm": 0.398681215978086, + "learning_rate": 3.1187216370660558e-06, + "loss": 0.1936, + "step": 5933 + }, + { + "epoch": 1.981632993822007, + "grad_norm": 0.43415498393757535, + "learning_rate": 3.1169212500274294e-06, + "loss": 0.1943, + "step": 5934 + }, + { + "epoch": 1.9819669393888795, + "grad_norm": 0.4135058440013624, + "learning_rate": 3.11512114745002e-06, + "loss": 0.2106, + "step": 5935 + }, + { + "epoch": 1.9823008849557522, + "grad_norm": 0.3846174813571026, + "learning_rate": 3.113321329605754e-06, + "loss": 0.1796, + "step": 5936 + }, + { + "epoch": 1.982634830522625, + "grad_norm": 0.383507587269331, + "learning_rate": 3.1115217967665174e-06, + "loss": 0.1855, + "step": 5937 + }, + { + "epoch": 1.9829687760894974, + "grad_norm": 0.37537114616087025, + "learning_rate": 3.1097225492041494e-06, + "loss": 0.1747, + "step": 5938 + }, + { + "epoch": 1.98330272165637, + "grad_norm": 0.39957620124069226, + "learning_rate": 3.107923587190448e-06, + "loss": 0.1931, + "step": 5939 + }, + { + "epoch": 1.9836366672232426, + "grad_norm": 0.403505274568005, + "learning_rate": 3.106124910997168e-06, + "loss": 0.1886, + "step": 5940 + }, + { + "epoch": 1.9839706127901153, + "grad_norm": 0.38362226676493627, + "learning_rate": 3.1043265208960187e-06, + "loss": 0.1823, + "step": 5941 + }, + { + "epoch": 1.9843045583569878, + "grad_norm": 0.39122961355599384, + "learning_rate": 3.102528417158668e-06, + "loss": 0.1859, + "step": 5942 + }, + { + "epoch": 1.9846385039238603, + "grad_norm": 0.41049641938952836, + "learning_rate": 3.1007306000567434e-06, + "loss": 0.1918, + "step": 5943 + }, + { + "epoch": 1.984972449490733, + "grad_norm": 0.36272757102046843, + "learning_rate": 3.0989330698618248e-06, + "loss": 0.175, + "step": 5944 + }, + { + "epoch": 1.9853063950576058, + "grad_norm": 0.4101938673103447, + "learning_rate": 3.097135826845451e-06, + "loss": 0.193, + "step": 5945 + }, + { + "epoch": 1.9856403406244783, + "grad_norm": 0.3690806719292599, + "learning_rate": 3.0953388712791155e-06, + "loss": 0.1597, + "step": 5946 + }, + { + "epoch": 1.9859742861913507, + "grad_norm": 0.4225456794662782, + "learning_rate": 3.09354220343427e-06, + "loss": 0.1867, + "step": 5947 + }, + { + "epoch": 1.9863082317582235, + "grad_norm": 0.4282830064350807, + "learning_rate": 3.0917458235823215e-06, + "loss": 0.2025, + "step": 5948 + }, + { + "epoch": 1.986642177325096, + "grad_norm": 0.4349543625503407, + "learning_rate": 3.089949731994637e-06, + "loss": 0.2014, + "step": 5949 + }, + { + "epoch": 1.9869761228919685, + "grad_norm": 0.37381968406717025, + "learning_rate": 3.088153928942535e-06, + "loss": 0.1842, + "step": 5950 + }, + { + "epoch": 1.9873100684588412, + "grad_norm": 0.39590274000895553, + "learning_rate": 3.0863584146972935e-06, + "loss": 0.1806, + "step": 5951 + }, + { + "epoch": 1.987644014025714, + "grad_norm": 0.3863725717307747, + "learning_rate": 3.084563189530146e-06, + "loss": 0.1792, + "step": 5952 + }, + { + "epoch": 1.9879779595925864, + "grad_norm": 0.40619770607208006, + "learning_rate": 3.0827682537122817e-06, + "loss": 0.1942, + "step": 5953 + }, + { + "epoch": 1.9883119051594589, + "grad_norm": 0.4483264558049833, + "learning_rate": 3.0809736075148456e-06, + "loss": 0.2014, + "step": 5954 + }, + { + "epoch": 1.9886458507263316, + "grad_norm": 0.4199360913214994, + "learning_rate": 3.0791792512089443e-06, + "loss": 0.182, + "step": 5955 + }, + { + "epoch": 1.9889797962932043, + "grad_norm": 0.4058012474543562, + "learning_rate": 3.0773851850656335e-06, + "loss": 0.1855, + "step": 5956 + }, + { + "epoch": 1.9893137418600768, + "grad_norm": 0.3894507983132781, + "learning_rate": 3.075591409355929e-06, + "loss": 0.1914, + "step": 5957 + }, + { + "epoch": 1.9896476874269493, + "grad_norm": 0.39437724621404885, + "learning_rate": 3.073797924350801e-06, + "loss": 0.195, + "step": 5958 + }, + { + "epoch": 1.989981632993822, + "grad_norm": 0.39982310703857066, + "learning_rate": 3.0720047303211746e-06, + "loss": 0.1943, + "step": 5959 + }, + { + "epoch": 1.9903155785606947, + "grad_norm": 0.43063169248758626, + "learning_rate": 3.0702118275379376e-06, + "loss": 0.1986, + "step": 5960 + }, + { + "epoch": 1.9906495241275672, + "grad_norm": 0.39695348018292154, + "learning_rate": 3.0684192162719263e-06, + "loss": 0.1922, + "step": 5961 + }, + { + "epoch": 1.9909834696944397, + "grad_norm": 0.398425444924352, + "learning_rate": 3.066626896793936e-06, + "loss": 0.1887, + "step": 5962 + }, + { + "epoch": 1.9913174152613125, + "grad_norm": 0.3756162081582355, + "learning_rate": 3.0648348693747177e-06, + "loss": 0.1855, + "step": 5963 + }, + { + "epoch": 1.9916513608281852, + "grad_norm": 0.4199243577842171, + "learning_rate": 3.063043134284979e-06, + "loss": 0.1933, + "step": 5964 + }, + { + "epoch": 1.9919853063950574, + "grad_norm": 0.38131482579075354, + "learning_rate": 3.0612516917953783e-06, + "loss": 0.1843, + "step": 5965 + }, + { + "epoch": 1.9923192519619302, + "grad_norm": 0.369216400903443, + "learning_rate": 3.0594605421765406e-06, + "loss": 0.1797, + "step": 5966 + }, + { + "epoch": 1.9926531975288029, + "grad_norm": 0.4121903275320649, + "learning_rate": 3.057669685699037e-06, + "loss": 0.1863, + "step": 5967 + }, + { + "epoch": 1.9929871430956754, + "grad_norm": 0.4355489715595582, + "learning_rate": 3.0558791226333974e-06, + "loss": 0.1911, + "step": 5968 + }, + { + "epoch": 1.9933210886625479, + "grad_norm": 0.3789799913064823, + "learning_rate": 3.0540888532501075e-06, + "loss": 0.1926, + "step": 5969 + }, + { + "epoch": 1.9936550342294206, + "grad_norm": 0.42365316429605293, + "learning_rate": 3.052298877819608e-06, + "loss": 0.1904, + "step": 5970 + }, + { + "epoch": 1.9939889797962933, + "grad_norm": 0.41698808702160367, + "learning_rate": 3.050509196612297e-06, + "loss": 0.1963, + "step": 5971 + }, + { + "epoch": 1.9943229253631658, + "grad_norm": 0.37543262198026217, + "learning_rate": 3.0487198098985265e-06, + "loss": 0.1944, + "step": 5972 + }, + { + "epoch": 1.9946568709300383, + "grad_norm": 0.43641995145584184, + "learning_rate": 3.046930717948604e-06, + "loss": 0.1892, + "step": 5973 + }, + { + "epoch": 1.994990816496911, + "grad_norm": 0.38182184018203114, + "learning_rate": 3.0451419210327935e-06, + "loss": 0.1806, + "step": 5974 + }, + { + "epoch": 1.9953247620637837, + "grad_norm": 0.3818787684355753, + "learning_rate": 3.0433534194213143e-06, + "loss": 0.1799, + "step": 5975 + }, + { + "epoch": 1.9956587076306562, + "grad_norm": 0.3744181477267095, + "learning_rate": 3.0415652133843375e-06, + "loss": 0.1816, + "step": 5976 + }, + { + "epoch": 1.9959926531975287, + "grad_norm": 0.40734828799825423, + "learning_rate": 3.0397773031919966e-06, + "loss": 0.2016, + "step": 5977 + }, + { + "epoch": 1.9963265987644014, + "grad_norm": 0.3871428637806214, + "learning_rate": 3.0379896891143746e-06, + "loss": 0.1806, + "step": 5978 + }, + { + "epoch": 1.9966605443312742, + "grad_norm": 0.39395345656310504, + "learning_rate": 3.036202371421513e-06, + "loss": 0.1812, + "step": 5979 + }, + { + "epoch": 1.9969944898981467, + "grad_norm": 0.41099460235481833, + "learning_rate": 3.034415350383405e-06, + "loss": 0.1939, + "step": 5980 + }, + { + "epoch": 1.9973284354650191, + "grad_norm": 0.3679936586206283, + "learning_rate": 3.0326286262700035e-06, + "loss": 0.1879, + "step": 5981 + }, + { + "epoch": 1.9976623810318919, + "grad_norm": 0.3814246949048082, + "learning_rate": 3.030842199351212e-06, + "loss": 0.1879, + "step": 5982 + }, + { + "epoch": 1.9979963265987644, + "grad_norm": 0.42714571464627366, + "learning_rate": 3.0290560698968907e-06, + "loss": 0.2029, + "step": 5983 + }, + { + "epoch": 1.9983302721656369, + "grad_norm": 0.4021653615871575, + "learning_rate": 3.0272702381768593e-06, + "loss": 0.1936, + "step": 5984 + }, + { + "epoch": 1.9986642177325096, + "grad_norm": 0.39922967076189847, + "learning_rate": 3.0254847044608872e-06, + "loss": 0.2034, + "step": 5985 + }, + { + "epoch": 1.9989981632993823, + "grad_norm": 0.4103626909173217, + "learning_rate": 3.0236994690186983e-06, + "loss": 0.1874, + "step": 5986 + }, + { + "epoch": 1.9993321088662548, + "grad_norm": 0.40222096398131774, + "learning_rate": 3.0219145321199763e-06, + "loss": 0.1868, + "step": 5987 + }, + { + "epoch": 1.9996660544331273, + "grad_norm": 0.37798334008408047, + "learning_rate": 3.0201298940343543e-06, + "loss": 0.1801, + "step": 5988 + }, + { + "epoch": 2.0, + "grad_norm": 0.41315481474910304, + "learning_rate": 3.018345555031422e-06, + "loss": 0.1861, + "step": 5989 + }, + { + "epoch": 2.0, + "eval_loss": 0.20493414998054504, + "eval_runtime": 184.3833, + "eval_samples_per_second": 109.408, + "eval_steps_per_second": 1.714, + "step": 5989 + }, + { + "epoch": 2.0003339455668727, + "grad_norm": 0.3534058618254128, + "learning_rate": 3.0165615153807293e-06, + "loss": 0.155, + "step": 5990 + }, + { + "epoch": 2.000667891133745, + "grad_norm": 0.4330126571772758, + "learning_rate": 3.014777775351774e-06, + "loss": 0.1767, + "step": 5991 + }, + { + "epoch": 2.0010018367006177, + "grad_norm": 0.3993798294281268, + "learning_rate": 3.012994335214011e-06, + "loss": 0.1656, + "step": 5992 + }, + { + "epoch": 2.0013357822674904, + "grad_norm": 0.35823939570937335, + "learning_rate": 3.0112111952368496e-06, + "loss": 0.152, + "step": 5993 + }, + { + "epoch": 2.001669727834363, + "grad_norm": 0.4274610122348606, + "learning_rate": 3.009428355689654e-06, + "loss": 0.1683, + "step": 5994 + }, + { + "epoch": 2.0020036734012354, + "grad_norm": 0.39209988341414004, + "learning_rate": 3.007645816841743e-06, + "loss": 0.1553, + "step": 5995 + }, + { + "epoch": 2.002337618968108, + "grad_norm": 0.41360416403789046, + "learning_rate": 3.0058635789623926e-06, + "loss": 0.1693, + "step": 5996 + }, + { + "epoch": 2.002671564534981, + "grad_norm": 0.3964326471268367, + "learning_rate": 3.0040816423208276e-06, + "loss": 0.157, + "step": 5997 + }, + { + "epoch": 2.0030055101018536, + "grad_norm": 0.38668198118986086, + "learning_rate": 3.002300007186232e-06, + "loss": 0.1524, + "step": 5998 + }, + { + "epoch": 2.003339455668726, + "grad_norm": 0.405645756289619, + "learning_rate": 3.0005186738277407e-06, + "loss": 0.1656, + "step": 5999 + }, + { + "epoch": 2.0036734012355986, + "grad_norm": 0.4397302560149037, + "learning_rate": 2.9987376425144477e-06, + "loss": 0.1577, + "step": 6000 + }, + { + "epoch": 2.0040073468024713, + "grad_norm": 0.4394388666470143, + "learning_rate": 2.9969569135153985e-06, + "loss": 0.1639, + "step": 6001 + }, + { + "epoch": 2.004341292369344, + "grad_norm": 0.4849570632205489, + "learning_rate": 2.9951764870995925e-06, + "loss": 0.1621, + "step": 6002 + }, + { + "epoch": 2.0046752379362163, + "grad_norm": 0.48705098971685307, + "learning_rate": 2.9933963635359847e-06, + "loss": 0.1646, + "step": 6003 + }, + { + "epoch": 2.005009183503089, + "grad_norm": 0.4641189129482545, + "learning_rate": 2.991616543093483e-06, + "loss": 0.1749, + "step": 6004 + }, + { + "epoch": 2.0053431290699617, + "grad_norm": 0.46677908584428857, + "learning_rate": 2.9898370260409502e-06, + "loss": 0.1623, + "step": 6005 + }, + { + "epoch": 2.005677074636834, + "grad_norm": 0.4711820568033053, + "learning_rate": 2.9880578126472015e-06, + "loss": 0.16, + "step": 6006 + }, + { + "epoch": 2.0060110202037067, + "grad_norm": 0.47617074156351524, + "learning_rate": 2.9862789031810126e-06, + "loss": 0.1721, + "step": 6007 + }, + { + "epoch": 2.0063449657705794, + "grad_norm": 0.4487052591604857, + "learning_rate": 2.984500297911106e-06, + "loss": 0.1584, + "step": 6008 + }, + { + "epoch": 2.006678911337452, + "grad_norm": 0.447671304592757, + "learning_rate": 2.9827219971061607e-06, + "loss": 0.1654, + "step": 6009 + }, + { + "epoch": 2.0070128569043244, + "grad_norm": 0.47125672755220205, + "learning_rate": 2.98094400103481e-06, + "loss": 0.1681, + "step": 6010 + }, + { + "epoch": 2.007346802471197, + "grad_norm": 0.45728456238338155, + "learning_rate": 2.9791663099656424e-06, + "loss": 0.1618, + "step": 6011 + }, + { + "epoch": 2.00768074803807, + "grad_norm": 0.47447060864994905, + "learning_rate": 2.977388924167196e-06, + "loss": 0.1492, + "step": 6012 + }, + { + "epoch": 2.0080146936049426, + "grad_norm": 0.446422979462353, + "learning_rate": 2.975611843907971e-06, + "loss": 0.1576, + "step": 6013 + }, + { + "epoch": 2.008348639171815, + "grad_norm": 0.48529971727386545, + "learning_rate": 2.9738350694564117e-06, + "loss": 0.1674, + "step": 6014 + }, + { + "epoch": 2.0086825847386875, + "grad_norm": 0.43999105968259355, + "learning_rate": 2.9720586010809234e-06, + "loss": 0.1584, + "step": 6015 + }, + { + "epoch": 2.0090165303055603, + "grad_norm": 0.42356091106399757, + "learning_rate": 2.9702824390498615e-06, + "loss": 0.155, + "step": 6016 + }, + { + "epoch": 2.009350475872433, + "grad_norm": 0.4529826679339405, + "learning_rate": 2.9685065836315362e-06, + "loss": 0.1594, + "step": 6017 + }, + { + "epoch": 2.0096844214393053, + "grad_norm": 0.44293372978393375, + "learning_rate": 2.9667310350942103e-06, + "loss": 0.1601, + "step": 6018 + }, + { + "epoch": 2.010018367006178, + "grad_norm": 0.4662546772405464, + "learning_rate": 2.964955793706104e-06, + "loss": 0.1651, + "step": 6019 + }, + { + "epoch": 2.0103523125730507, + "grad_norm": 0.4662905596333431, + "learning_rate": 2.963180859735387e-06, + "loss": 0.1564, + "step": 6020 + }, + { + "epoch": 2.0106862581399234, + "grad_norm": 0.5287577603093535, + "learning_rate": 2.961406233450184e-06, + "loss": 0.1658, + "step": 6021 + }, + { + "epoch": 2.0110202037067957, + "grad_norm": 0.5051506571548454, + "learning_rate": 2.9596319151185713e-06, + "loss": 0.178, + "step": 6022 + }, + { + "epoch": 2.0113541492736684, + "grad_norm": 0.4822552977434536, + "learning_rate": 2.9578579050085836e-06, + "loss": 0.1756, + "step": 6023 + }, + { + "epoch": 2.011688094840541, + "grad_norm": 0.5070476190943671, + "learning_rate": 2.956084203388204e-06, + "loss": 0.1565, + "step": 6024 + }, + { + "epoch": 2.0120220404074134, + "grad_norm": 0.4697832208358172, + "learning_rate": 2.9543108105253733e-06, + "loss": 0.1665, + "step": 6025 + }, + { + "epoch": 2.012355985974286, + "grad_norm": 0.47328140753003795, + "learning_rate": 2.9525377266879813e-06, + "loss": 0.1646, + "step": 6026 + }, + { + "epoch": 2.012689931541159, + "grad_norm": 0.45988128423824537, + "learning_rate": 2.950764952143874e-06, + "loss": 0.1568, + "step": 6027 + }, + { + "epoch": 2.0130238771080315, + "grad_norm": 0.4671425277097255, + "learning_rate": 2.9489924871608495e-06, + "loss": 0.1643, + "step": 6028 + }, + { + "epoch": 2.013357822674904, + "grad_norm": 0.4991890618376096, + "learning_rate": 2.9472203320066594e-06, + "loss": 0.1721, + "step": 6029 + }, + { + "epoch": 2.0136917682417765, + "grad_norm": 0.4946814103252608, + "learning_rate": 2.9454484869490074e-06, + "loss": 0.1696, + "step": 6030 + }, + { + "epoch": 2.0140257138086493, + "grad_norm": 0.5118310832695881, + "learning_rate": 2.943676952255554e-06, + "loss": 0.1657, + "step": 6031 + }, + { + "epoch": 2.014359659375522, + "grad_norm": 0.49387059069204864, + "learning_rate": 2.9419057281939106e-06, + "loss": 0.168, + "step": 6032 + }, + { + "epoch": 2.0146936049423942, + "grad_norm": 0.4596647029669583, + "learning_rate": 2.94013481503164e-06, + "loss": 0.1597, + "step": 6033 + }, + { + "epoch": 2.015027550509267, + "grad_norm": 0.49980994832903475, + "learning_rate": 2.9383642130362596e-06, + "loss": 0.1647, + "step": 6034 + }, + { + "epoch": 2.0153614960761397, + "grad_norm": 0.4739084813661146, + "learning_rate": 2.9365939224752394e-06, + "loss": 0.1564, + "step": 6035 + }, + { + "epoch": 2.0156954416430124, + "grad_norm": 0.48889393560437605, + "learning_rate": 2.934823943616001e-06, + "loss": 0.1683, + "step": 6036 + }, + { + "epoch": 2.0160293872098847, + "grad_norm": 0.46643889067317856, + "learning_rate": 2.933054276725925e-06, + "loss": 0.1613, + "step": 6037 + }, + { + "epoch": 2.0163633327767574, + "grad_norm": 0.45399739753399115, + "learning_rate": 2.9312849220723382e-06, + "loss": 0.1559, + "step": 6038 + }, + { + "epoch": 2.01669727834363, + "grad_norm": 0.45507324551278094, + "learning_rate": 2.929515879922522e-06, + "loss": 0.159, + "step": 6039 + }, + { + "epoch": 2.0170312239105024, + "grad_norm": 0.4612841631939222, + "learning_rate": 2.9277471505437105e-06, + "loss": 0.1602, + "step": 6040 + }, + { + "epoch": 2.017365169477375, + "grad_norm": 0.4776368978691176, + "learning_rate": 2.925978734203092e-06, + "loss": 0.1723, + "step": 6041 + }, + { + "epoch": 2.017699115044248, + "grad_norm": 0.47341321323873353, + "learning_rate": 2.924210631167807e-06, + "loss": 0.1518, + "step": 6042 + }, + { + "epoch": 2.0180330606111205, + "grad_norm": 0.4839747955511077, + "learning_rate": 2.922442841704948e-06, + "loss": 0.1621, + "step": 6043 + }, + { + "epoch": 2.018367006177993, + "grad_norm": 0.4883511258492533, + "learning_rate": 2.920675366081559e-06, + "loss": 0.1636, + "step": 6044 + }, + { + "epoch": 2.0187009517448655, + "grad_norm": 0.5464115849984775, + "learning_rate": 2.9189082045646404e-06, + "loss": 0.1777, + "step": 6045 + }, + { + "epoch": 2.0190348973117382, + "grad_norm": 0.46191595447785594, + "learning_rate": 2.9171413574211426e-06, + "loss": 0.1623, + "step": 6046 + }, + { + "epoch": 2.019368842878611, + "grad_norm": 0.49795505123926764, + "learning_rate": 2.9153748249179637e-06, + "loss": 0.1667, + "step": 6047 + }, + { + "epoch": 2.0197027884454832, + "grad_norm": 0.47251837346553677, + "learning_rate": 2.9136086073219665e-06, + "loss": 0.1539, + "step": 6048 + }, + { + "epoch": 2.020036734012356, + "grad_norm": 0.4581855742704899, + "learning_rate": 2.9118427048999544e-06, + "loss": 0.1555, + "step": 6049 + }, + { + "epoch": 2.0203706795792287, + "grad_norm": 0.5045197369778212, + "learning_rate": 2.9100771179186904e-06, + "loss": 0.17, + "step": 6050 + }, + { + "epoch": 2.0207046251461014, + "grad_norm": 0.4421223167410785, + "learning_rate": 2.9083118466448845e-06, + "loss": 0.1502, + "step": 6051 + }, + { + "epoch": 2.0210385707129737, + "grad_norm": 0.4474440028569155, + "learning_rate": 2.9065468913452045e-06, + "loss": 0.1462, + "step": 6052 + }, + { + "epoch": 2.0213725162798464, + "grad_norm": 0.4925003177758385, + "learning_rate": 2.904782252286264e-06, + "loss": 0.1618, + "step": 6053 + }, + { + "epoch": 2.021706461846719, + "grad_norm": 0.4821571393316654, + "learning_rate": 2.903017929734635e-06, + "loss": 0.1662, + "step": 6054 + }, + { + "epoch": 2.0220404074135914, + "grad_norm": 0.49103538206612174, + "learning_rate": 2.9012539239568405e-06, + "loss": 0.1633, + "step": 6055 + }, + { + "epoch": 2.022374352980464, + "grad_norm": 0.4891641946599009, + "learning_rate": 2.899490235219351e-06, + "loss": 0.1706, + "step": 6056 + }, + { + "epoch": 2.022708298547337, + "grad_norm": 0.4685788784764766, + "learning_rate": 2.897726863788595e-06, + "loss": 0.1602, + "step": 6057 + }, + { + "epoch": 2.0230422441142095, + "grad_norm": 0.4753894335902614, + "learning_rate": 2.8959638099309504e-06, + "loss": 0.1542, + "step": 6058 + }, + { + "epoch": 2.023376189681082, + "grad_norm": 0.4919778734814164, + "learning_rate": 2.8942010739127446e-06, + "loss": 0.1572, + "step": 6059 + }, + { + "epoch": 2.0237101352479545, + "grad_norm": 0.46606003547768277, + "learning_rate": 2.8924386560002627e-06, + "loss": 0.1574, + "step": 6060 + }, + { + "epoch": 2.0240440808148272, + "grad_norm": 0.48964811026319693, + "learning_rate": 2.8906765564597384e-06, + "loss": 0.1502, + "step": 6061 + }, + { + "epoch": 2.0243780263817, + "grad_norm": 0.48273759736055805, + "learning_rate": 2.8889147755573556e-06, + "loss": 0.1546, + "step": 6062 + }, + { + "epoch": 2.024711971948572, + "grad_norm": 0.5370134887436172, + "learning_rate": 2.8871533135592544e-06, + "loss": 0.1724, + "step": 6063 + }, + { + "epoch": 2.025045917515445, + "grad_norm": 0.45586288119980345, + "learning_rate": 2.8853921707315215e-06, + "loss": 0.1602, + "step": 6064 + }, + { + "epoch": 2.0253798630823177, + "grad_norm": 0.49510158520645964, + "learning_rate": 2.8836313473402e-06, + "loss": 0.1659, + "step": 6065 + }, + { + "epoch": 2.0257138086491904, + "grad_norm": 0.4668970296054312, + "learning_rate": 2.881870843651282e-06, + "loss": 0.1579, + "step": 6066 + }, + { + "epoch": 2.0260477542160626, + "grad_norm": 0.4694378333177316, + "learning_rate": 2.8801106599307164e-06, + "loss": 0.1492, + "step": 6067 + }, + { + "epoch": 2.0263816997829354, + "grad_norm": 0.4914932697805478, + "learning_rate": 2.8783507964443942e-06, + "loss": 0.1627, + "step": 6068 + }, + { + "epoch": 2.026715645349808, + "grad_norm": 0.4815900522040312, + "learning_rate": 2.8765912534581674e-06, + "loss": 0.1626, + "step": 6069 + }, + { + "epoch": 2.027049590916681, + "grad_norm": 0.48576180596656354, + "learning_rate": 2.874832031237833e-06, + "loss": 0.1565, + "step": 6070 + }, + { + "epoch": 2.027383536483553, + "grad_norm": 0.5102513368484094, + "learning_rate": 2.873073130049142e-06, + "loss": 0.1637, + "step": 6071 + }, + { + "epoch": 2.027717482050426, + "grad_norm": 0.4971694308223245, + "learning_rate": 2.8713145501578e-06, + "loss": 0.1609, + "step": 6072 + }, + { + "epoch": 2.0280514276172985, + "grad_norm": 0.5074347111244822, + "learning_rate": 2.869556291829461e-06, + "loss": 0.1503, + "step": 6073 + }, + { + "epoch": 2.028385373184171, + "grad_norm": 0.4216789477572703, + "learning_rate": 2.8677983553297266e-06, + "loss": 0.1452, + "step": 6074 + }, + { + "epoch": 2.0287193187510435, + "grad_norm": 0.49704788792406124, + "learning_rate": 2.8660407409241593e-06, + "loss": 0.1625, + "step": 6075 + }, + { + "epoch": 2.029053264317916, + "grad_norm": 0.49175915805051923, + "learning_rate": 2.864283448878262e-06, + "loss": 0.1694, + "step": 6076 + }, + { + "epoch": 2.029387209884789, + "grad_norm": 0.46571201952476754, + "learning_rate": 2.8625264794574975e-06, + "loss": 0.1572, + "step": 6077 + }, + { + "epoch": 2.029721155451661, + "grad_norm": 0.500277066125853, + "learning_rate": 2.860769832927276e-06, + "loss": 0.1662, + "step": 6078 + }, + { + "epoch": 2.030055101018534, + "grad_norm": 0.49439200808987693, + "learning_rate": 2.8590135095529624e-06, + "loss": 0.1676, + "step": 6079 + }, + { + "epoch": 2.0303890465854066, + "grad_norm": 0.44751887545978086, + "learning_rate": 2.8572575095998646e-06, + "loss": 0.1479, + "step": 6080 + }, + { + "epoch": 2.0307229921522794, + "grad_norm": 0.4366187834049901, + "learning_rate": 2.855501833333253e-06, + "loss": 0.1499, + "step": 6081 + }, + { + "epoch": 2.0310569377191516, + "grad_norm": 0.4463655726940175, + "learning_rate": 2.853746481018337e-06, + "loss": 0.1478, + "step": 6082 + }, + { + "epoch": 2.0313908832860244, + "grad_norm": 0.525002855673986, + "learning_rate": 2.8519914529202868e-06, + "loss": 0.1768, + "step": 6083 + }, + { + "epoch": 2.031724828852897, + "grad_norm": 0.46830169747916234, + "learning_rate": 2.8502367493042217e-06, + "loss": 0.1652, + "step": 6084 + }, + { + "epoch": 2.03205877441977, + "grad_norm": 0.474430022851497, + "learning_rate": 2.848482370435206e-06, + "loss": 0.1601, + "step": 6085 + }, + { + "epoch": 2.032392719986642, + "grad_norm": 0.4858834048312866, + "learning_rate": 2.8467283165782643e-06, + "loss": 0.1637, + "step": 6086 + }, + { + "epoch": 2.0327266655535148, + "grad_norm": 0.5000627831706353, + "learning_rate": 2.8449745879983614e-06, + "loss": 0.1562, + "step": 6087 + }, + { + "epoch": 2.0330606111203875, + "grad_norm": 0.48498567163926287, + "learning_rate": 2.8432211849604218e-06, + "loss": 0.1594, + "step": 6088 + }, + { + "epoch": 2.0333945566872598, + "grad_norm": 0.4706989935254033, + "learning_rate": 2.841468107729318e-06, + "loss": 0.1627, + "step": 6089 + }, + { + "epoch": 2.0337285022541325, + "grad_norm": 0.4897839906916831, + "learning_rate": 2.8397153565698744e-06, + "loss": 0.1567, + "step": 6090 + }, + { + "epoch": 2.034062447821005, + "grad_norm": 0.46017908487227666, + "learning_rate": 2.8379629317468604e-06, + "loss": 0.1588, + "step": 6091 + }, + { + "epoch": 2.034396393387878, + "grad_norm": 0.4692319755849913, + "learning_rate": 2.8362108335250044e-06, + "loss": 0.1522, + "step": 6092 + }, + { + "epoch": 2.03473033895475, + "grad_norm": 0.520679375837806, + "learning_rate": 2.834459062168978e-06, + "loss": 0.1718, + "step": 6093 + }, + { + "epoch": 2.035064284521623, + "grad_norm": 0.5246005906068195, + "learning_rate": 2.8327076179434088e-06, + "loss": 0.1669, + "step": 6094 + }, + { + "epoch": 2.0353982300884956, + "grad_norm": 0.45873793348961106, + "learning_rate": 2.8309565011128732e-06, + "loss": 0.1488, + "step": 6095 + }, + { + "epoch": 2.0357321756553683, + "grad_norm": 0.47134127061918596, + "learning_rate": 2.8292057119418994e-06, + "loss": 0.1631, + "step": 6096 + }, + { + "epoch": 2.0360661212222406, + "grad_norm": 0.4729789859407233, + "learning_rate": 2.827455250694961e-06, + "loss": 0.16, + "step": 6097 + }, + { + "epoch": 2.0364000667891133, + "grad_norm": 0.5146567715009788, + "learning_rate": 2.8257051176364903e-06, + "loss": 0.1588, + "step": 6098 + }, + { + "epoch": 2.036734012355986, + "grad_norm": 0.4937190219735824, + "learning_rate": 2.8239553130308604e-06, + "loss": 0.1733, + "step": 6099 + }, + { + "epoch": 2.0370679579228588, + "grad_norm": 0.49973854857161226, + "learning_rate": 2.8222058371424033e-06, + "loss": 0.1609, + "step": 6100 + }, + { + "epoch": 2.037401903489731, + "grad_norm": 0.5022894018079591, + "learning_rate": 2.820456690235397e-06, + "loss": 0.1618, + "step": 6101 + }, + { + "epoch": 2.0377358490566038, + "grad_norm": 0.5307860067204769, + "learning_rate": 2.8187078725740723e-06, + "loss": 0.1717, + "step": 6102 + }, + { + "epoch": 2.0380697946234765, + "grad_norm": 0.5005034713170023, + "learning_rate": 2.8169593844226063e-06, + "loss": 0.1656, + "step": 6103 + }, + { + "epoch": 2.0384037401903488, + "grad_norm": 0.44826699148693205, + "learning_rate": 2.815211226045131e-06, + "loss": 0.1444, + "step": 6104 + }, + { + "epoch": 2.0387376857572215, + "grad_norm": 0.4616708746298766, + "learning_rate": 2.8134633977057236e-06, + "loss": 0.1543, + "step": 6105 + }, + { + "epoch": 2.039071631324094, + "grad_norm": 0.456410361967302, + "learning_rate": 2.811715899668415e-06, + "loss": 0.1539, + "step": 6106 + }, + { + "epoch": 2.039405576890967, + "grad_norm": 0.47621252955779786, + "learning_rate": 2.8099687321971887e-06, + "loss": 0.1556, + "step": 6107 + }, + { + "epoch": 2.039739522457839, + "grad_norm": 0.4543629974389159, + "learning_rate": 2.80822189555597e-06, + "loss": 0.1537, + "step": 6108 + }, + { + "epoch": 2.040073468024712, + "grad_norm": 0.49525294210493037, + "learning_rate": 2.8064753900086427e-06, + "loss": 0.1654, + "step": 6109 + }, + { + "epoch": 2.0404074135915846, + "grad_norm": 0.512395040030274, + "learning_rate": 2.804729215819034e-06, + "loss": 0.1712, + "step": 6110 + }, + { + "epoch": 2.0407413591584573, + "grad_norm": 0.47930074814671675, + "learning_rate": 2.8029833732509282e-06, + "loss": 0.1545, + "step": 6111 + }, + { + "epoch": 2.0410753047253296, + "grad_norm": 0.496750207975215, + "learning_rate": 2.801237862568048e-06, + "loss": 0.1538, + "step": 6112 + }, + { + "epoch": 2.0414092502922023, + "grad_norm": 0.5533662779888088, + "learning_rate": 2.799492684034083e-06, + "loss": 0.159, + "step": 6113 + }, + { + "epoch": 2.041743195859075, + "grad_norm": 0.5055832988689022, + "learning_rate": 2.797747837912656e-06, + "loss": 0.1607, + "step": 6114 + }, + { + "epoch": 2.0420771414259478, + "grad_norm": 0.48537757955911165, + "learning_rate": 2.796003324467351e-06, + "loss": 0.1572, + "step": 6115 + }, + { + "epoch": 2.04241108699282, + "grad_norm": 0.5055379856535059, + "learning_rate": 2.794259143961693e-06, + "loss": 0.1621, + "step": 6116 + }, + { + "epoch": 2.0427450325596928, + "grad_norm": 0.5309785977167635, + "learning_rate": 2.7925152966591627e-06, + "loss": 0.168, + "step": 6117 + }, + { + "epoch": 2.0430789781265655, + "grad_norm": 0.48438645896285526, + "learning_rate": 2.7907717828231893e-06, + "loss": 0.158, + "step": 6118 + }, + { + "epoch": 2.043412923693438, + "grad_norm": 0.4991961469549531, + "learning_rate": 2.7890286027171532e-06, + "loss": 0.1591, + "step": 6119 + }, + { + "epoch": 2.0437468692603105, + "grad_norm": 0.45423321499364905, + "learning_rate": 2.7872857566043775e-06, + "loss": 0.158, + "step": 6120 + }, + { + "epoch": 2.044080814827183, + "grad_norm": 0.4865693553830601, + "learning_rate": 2.7855432447481444e-06, + "loss": 0.1649, + "step": 6121 + }, + { + "epoch": 2.044414760394056, + "grad_norm": 0.5337114471576638, + "learning_rate": 2.7838010674116767e-06, + "loss": 0.1718, + "step": 6122 + }, + { + "epoch": 2.044748705960928, + "grad_norm": 0.44948290522077555, + "learning_rate": 2.7820592248581523e-06, + "loss": 0.1487, + "step": 6123 + }, + { + "epoch": 2.045082651527801, + "grad_norm": 0.4938115357940406, + "learning_rate": 2.780317717350697e-06, + "loss": 0.163, + "step": 6124 + }, + { + "epoch": 2.0454165970946736, + "grad_norm": 0.49731409157250733, + "learning_rate": 2.7785765451523896e-06, + "loss": 0.165, + "step": 6125 + }, + { + "epoch": 2.0457505426615463, + "grad_norm": 0.45450185170041324, + "learning_rate": 2.7768357085262486e-06, + "loss": 0.1488, + "step": 6126 + }, + { + "epoch": 2.0460844882284186, + "grad_norm": 0.45184670089086093, + "learning_rate": 2.7750952077352534e-06, + "loss": 0.1578, + "step": 6127 + }, + { + "epoch": 2.0464184337952913, + "grad_norm": 0.4948991525337002, + "learning_rate": 2.7733550430423216e-06, + "loss": 0.1658, + "step": 6128 + }, + { + "epoch": 2.046752379362164, + "grad_norm": 0.5040836204485063, + "learning_rate": 2.7716152147103292e-06, + "loss": 0.1577, + "step": 6129 + }, + { + "epoch": 2.0470863249290367, + "grad_norm": 0.5335238908571901, + "learning_rate": 2.7698757230020986e-06, + "loss": 0.1572, + "step": 6130 + }, + { + "epoch": 2.047420270495909, + "grad_norm": 0.49828751734413135, + "learning_rate": 2.7681365681803967e-06, + "loss": 0.1657, + "step": 6131 + }, + { + "epoch": 2.0477542160627817, + "grad_norm": 0.4999406939225242, + "learning_rate": 2.7663977505079483e-06, + "loss": 0.1687, + "step": 6132 + }, + { + "epoch": 2.0480881616296545, + "grad_norm": 0.5085730563774729, + "learning_rate": 2.764659270247417e-06, + "loss": 0.1711, + "step": 6133 + }, + { + "epoch": 2.048422107196527, + "grad_norm": 0.47688020503244877, + "learning_rate": 2.7629211276614255e-06, + "loss": 0.1589, + "step": 6134 + }, + { + "epoch": 2.0487560527633994, + "grad_norm": 0.47192663378129107, + "learning_rate": 2.761183323012534e-06, + "loss": 0.1613, + "step": 6135 + }, + { + "epoch": 2.049089998330272, + "grad_norm": 0.4848663154654449, + "learning_rate": 2.7594458565632664e-06, + "loss": 0.1578, + "step": 6136 + }, + { + "epoch": 2.049423943897145, + "grad_norm": 0.5000980624774899, + "learning_rate": 2.757708728576083e-06, + "loss": 0.1556, + "step": 6137 + }, + { + "epoch": 2.049757889464017, + "grad_norm": 0.47224990165416336, + "learning_rate": 2.7559719393133987e-06, + "loss": 0.1538, + "step": 6138 + }, + { + "epoch": 2.05009183503089, + "grad_norm": 0.5215834737284637, + "learning_rate": 2.754235489037575e-06, + "loss": 0.1563, + "step": 6139 + }, + { + "epoch": 2.0504257805977626, + "grad_norm": 0.4972320969795835, + "learning_rate": 2.7524993780109254e-06, + "loss": 0.1491, + "step": 6140 + }, + { + "epoch": 2.0507597261646353, + "grad_norm": 0.49919640281659483, + "learning_rate": 2.750763606495704e-06, + "loss": 0.1581, + "step": 6141 + }, + { + "epoch": 2.0510936717315076, + "grad_norm": 0.5143857980365772, + "learning_rate": 2.7490281747541276e-06, + "loss": 0.1712, + "step": 6142 + }, + { + "epoch": 2.0514276172983803, + "grad_norm": 0.5088252050706469, + "learning_rate": 2.747293083048348e-06, + "loss": 0.1591, + "step": 6143 + }, + { + "epoch": 2.051761562865253, + "grad_norm": 0.5077613718070135, + "learning_rate": 2.7455583316404744e-06, + "loss": 0.1673, + "step": 6144 + }, + { + "epoch": 2.0520955084321257, + "grad_norm": 0.5097681301324266, + "learning_rate": 2.743823920792559e-06, + "loss": 0.1652, + "step": 6145 + }, + { + "epoch": 2.052429453998998, + "grad_norm": 0.5389169316301308, + "learning_rate": 2.742089850766607e-06, + "loss": 0.1742, + "step": 6146 + }, + { + "epoch": 2.0527633995658707, + "grad_norm": 0.48373136141051654, + "learning_rate": 2.7403561218245654e-06, + "loss": 0.1589, + "step": 6147 + }, + { + "epoch": 2.0530973451327434, + "grad_norm": 0.4750482606605971, + "learning_rate": 2.7386227342283423e-06, + "loss": 0.1573, + "step": 6148 + }, + { + "epoch": 2.053431290699616, + "grad_norm": 0.5005599752487294, + "learning_rate": 2.73688968823978e-06, + "loss": 0.1723, + "step": 6149 + }, + { + "epoch": 2.0537652362664884, + "grad_norm": 0.48098693092080774, + "learning_rate": 2.7351569841206792e-06, + "loss": 0.1565, + "step": 6150 + }, + { + "epoch": 2.054099181833361, + "grad_norm": 0.5329904578572746, + "learning_rate": 2.733424622132782e-06, + "loss": 0.1717, + "step": 6151 + }, + { + "epoch": 2.054433127400234, + "grad_norm": 0.4760378622271746, + "learning_rate": 2.7316926025377855e-06, + "loss": 0.1588, + "step": 6152 + }, + { + "epoch": 2.054767072967106, + "grad_norm": 0.4462805437410892, + "learning_rate": 2.729960925597328e-06, + "loss": 0.1476, + "step": 6153 + }, + { + "epoch": 2.055101018533979, + "grad_norm": 0.49345870296933825, + "learning_rate": 2.7282295915730016e-06, + "loss": 0.1642, + "step": 6154 + }, + { + "epoch": 2.0554349641008516, + "grad_norm": 0.48456295420893086, + "learning_rate": 2.726498600726346e-06, + "loss": 0.1632, + "step": 6155 + }, + { + "epoch": 2.0557689096677243, + "grad_norm": 0.45486223885521615, + "learning_rate": 2.7247679533188446e-06, + "loss": 0.1519, + "step": 6156 + }, + { + "epoch": 2.0561028552345966, + "grad_norm": 0.5200442659549483, + "learning_rate": 2.723037649611936e-06, + "loss": 0.1635, + "step": 6157 + }, + { + "epoch": 2.0564368008014693, + "grad_norm": 0.4909804430537013, + "learning_rate": 2.721307689866997e-06, + "loss": 0.1635, + "step": 6158 + }, + { + "epoch": 2.056770746368342, + "grad_norm": 0.4594697562211968, + "learning_rate": 2.719578074345366e-06, + "loss": 0.151, + "step": 6159 + }, + { + "epoch": 2.0571046919352147, + "grad_norm": 0.5065496098738638, + "learning_rate": 2.7178488033083163e-06, + "loss": 0.1652, + "step": 6160 + }, + { + "epoch": 2.057438637502087, + "grad_norm": 0.5436549815894377, + "learning_rate": 2.7161198770170784e-06, + "loss": 0.1702, + "step": 6161 + }, + { + "epoch": 2.0577725830689597, + "grad_norm": 0.49925937165886347, + "learning_rate": 2.714391295732822e-06, + "loss": 0.1647, + "step": 6162 + }, + { + "epoch": 2.0581065286358324, + "grad_norm": 0.4852644890591728, + "learning_rate": 2.712663059716675e-06, + "loss": 0.1458, + "step": 6163 + }, + { + "epoch": 2.058440474202705, + "grad_norm": 0.48613041699076365, + "learning_rate": 2.7109351692297015e-06, + "loss": 0.1657, + "step": 6164 + }, + { + "epoch": 2.0587744197695774, + "grad_norm": 0.4648594550935304, + "learning_rate": 2.7092076245329273e-06, + "loss": 0.1598, + "step": 6165 + }, + { + "epoch": 2.05910836533645, + "grad_norm": 0.4584401434296162, + "learning_rate": 2.7074804258873127e-06, + "loss": 0.1527, + "step": 6166 + }, + { + "epoch": 2.059442310903323, + "grad_norm": 0.48690854592041694, + "learning_rate": 2.7057535735537754e-06, + "loss": 0.1588, + "step": 6167 + }, + { + "epoch": 2.0597762564701956, + "grad_norm": 0.4726518812957776, + "learning_rate": 2.704027067793173e-06, + "loss": 0.1596, + "step": 6168 + }, + { + "epoch": 2.060110202037068, + "grad_norm": 0.4848631026115318, + "learning_rate": 2.7023009088663176e-06, + "loss": 0.1622, + "step": 6169 + }, + { + "epoch": 2.0604441476039406, + "grad_norm": 0.5010288195268352, + "learning_rate": 2.7005750970339607e-06, + "loss": 0.1493, + "step": 6170 + }, + { + "epoch": 2.0607780931708133, + "grad_norm": 0.5589263121127757, + "learning_rate": 2.698849632556815e-06, + "loss": 0.1801, + "step": 6171 + }, + { + "epoch": 2.0611120387376856, + "grad_norm": 0.4738720262047763, + "learning_rate": 2.697124515695524e-06, + "loss": 0.1539, + "step": 6172 + }, + { + "epoch": 2.0614459843045583, + "grad_norm": 0.4860672112480917, + "learning_rate": 2.695399746710693e-06, + "loss": 0.1598, + "step": 6173 + }, + { + "epoch": 2.061779929871431, + "grad_norm": 0.5056145335563673, + "learning_rate": 2.6936753258628643e-06, + "loss": 0.1633, + "step": 6174 + }, + { + "epoch": 2.0621138754383037, + "grad_norm": 0.526585125988662, + "learning_rate": 2.691951253412536e-06, + "loss": 0.1569, + "step": 6175 + }, + { + "epoch": 2.062447821005176, + "grad_norm": 0.5141605422386901, + "learning_rate": 2.6902275296201445e-06, + "loss": 0.1628, + "step": 6176 + }, + { + "epoch": 2.0627817665720487, + "grad_norm": 0.4813014518093507, + "learning_rate": 2.688504154746082e-06, + "loss": 0.1613, + "step": 6177 + }, + { + "epoch": 2.0631157121389214, + "grad_norm": 0.4515762636761851, + "learning_rate": 2.686781129050685e-06, + "loss": 0.1475, + "step": 6178 + }, + { + "epoch": 2.063449657705794, + "grad_norm": 0.4709359444645056, + "learning_rate": 2.685058452794235e-06, + "loss": 0.1603, + "step": 6179 + }, + { + "epoch": 2.0637836032726664, + "grad_norm": 0.4820755120947043, + "learning_rate": 2.6833361262369644e-06, + "loss": 0.1532, + "step": 6180 + }, + { + "epoch": 2.064117548839539, + "grad_norm": 0.45748798633211313, + "learning_rate": 2.681614149639048e-06, + "loss": 0.1603, + "step": 6181 + }, + { + "epoch": 2.064451494406412, + "grad_norm": 0.47928383265863794, + "learning_rate": 2.679892523260612e-06, + "loss": 0.1565, + "step": 6182 + }, + { + "epoch": 2.0647854399732846, + "grad_norm": 0.4912860713391019, + "learning_rate": 2.6781712473617293e-06, + "loss": 0.1631, + "step": 6183 + }, + { + "epoch": 2.065119385540157, + "grad_norm": 0.4937495874322415, + "learning_rate": 2.6764503222024202e-06, + "loss": 0.1631, + "step": 6184 + }, + { + "epoch": 2.0654533311070296, + "grad_norm": 0.5250388336695911, + "learning_rate": 2.674729748042647e-06, + "loss": 0.1712, + "step": 6185 + }, + { + "epoch": 2.0657872766739023, + "grad_norm": 0.47623079436105503, + "learning_rate": 2.673009525142326e-06, + "loss": 0.1564, + "step": 6186 + }, + { + "epoch": 2.0661212222407745, + "grad_norm": 0.4874432578932814, + "learning_rate": 2.6712896537613143e-06, + "loss": 0.1579, + "step": 6187 + }, + { + "epoch": 2.0664551678076473, + "grad_norm": 0.45987158697491903, + "learning_rate": 2.6695701341594193e-06, + "loss": 0.1505, + "step": 6188 + }, + { + "epoch": 2.06678911337452, + "grad_norm": 0.44396258874408284, + "learning_rate": 2.667850966596396e-06, + "loss": 0.1597, + "step": 6189 + }, + { + "epoch": 2.0671230589413927, + "grad_norm": 0.47997705349381775, + "learning_rate": 2.6661321513319467e-06, + "loss": 0.1575, + "step": 6190 + }, + { + "epoch": 2.067457004508265, + "grad_norm": 0.4639530958941431, + "learning_rate": 2.6644136886257138e-06, + "loss": 0.1554, + "step": 6191 + }, + { + "epoch": 2.0677909500751377, + "grad_norm": 0.47671499247486326, + "learning_rate": 2.6626955787372962e-06, + "loss": 0.1528, + "step": 6192 + }, + { + "epoch": 2.0681248956420104, + "grad_norm": 0.5172377476601987, + "learning_rate": 2.6609778219262296e-06, + "loss": 0.1572, + "step": 6193 + }, + { + "epoch": 2.068458841208883, + "grad_norm": 0.46150608992405706, + "learning_rate": 2.659260418452005e-06, + "loss": 0.1508, + "step": 6194 + }, + { + "epoch": 2.0687927867757554, + "grad_norm": 0.4864260006069014, + "learning_rate": 2.6575433685740547e-06, + "loss": 0.1503, + "step": 6195 + }, + { + "epoch": 2.069126732342628, + "grad_norm": 0.47462807507812255, + "learning_rate": 2.655826672551762e-06, + "loss": 0.1544, + "step": 6196 + }, + { + "epoch": 2.069460677909501, + "grad_norm": 0.5047881595787677, + "learning_rate": 2.6541103306444516e-06, + "loss": 0.1627, + "step": 6197 + }, + { + "epoch": 2.0697946234763736, + "grad_norm": 0.4847110255829372, + "learning_rate": 2.6523943431113985e-06, + "loss": 0.1575, + "step": 6198 + }, + { + "epoch": 2.070128569043246, + "grad_norm": 0.5684634058179412, + "learning_rate": 2.6506787102118204e-06, + "loss": 0.1793, + "step": 6199 + }, + { + "epoch": 2.0704625146101185, + "grad_norm": 0.4668023927238211, + "learning_rate": 2.6489634322048853e-06, + "loss": 0.1475, + "step": 6200 + }, + { + "epoch": 2.0707964601769913, + "grad_norm": 0.4823615249413857, + "learning_rate": 2.647248509349708e-06, + "loss": 0.1591, + "step": 6201 + }, + { + "epoch": 2.0711304057438635, + "grad_norm": 0.5284323022808656, + "learning_rate": 2.645533941905345e-06, + "loss": 0.1711, + "step": 6202 + }, + { + "epoch": 2.0714643513107363, + "grad_norm": 0.4629106304307049, + "learning_rate": 2.6438197301308045e-06, + "loss": 0.153, + "step": 6203 + }, + { + "epoch": 2.071798296877609, + "grad_norm": 0.4756702290381628, + "learning_rate": 2.6421058742850346e-06, + "loss": 0.1595, + "step": 6204 + }, + { + "epoch": 2.0721322424444817, + "grad_norm": 0.4651028891547631, + "learning_rate": 2.6403923746269368e-06, + "loss": 0.1439, + "step": 6205 + }, + { + "epoch": 2.072466188011354, + "grad_norm": 0.5038789264372514, + "learning_rate": 2.638679231415353e-06, + "loss": 0.1612, + "step": 6206 + }, + { + "epoch": 2.0728001335782267, + "grad_norm": 0.465142161854533, + "learning_rate": 2.636966444909077e-06, + "loss": 0.1581, + "step": 6207 + }, + { + "epoch": 2.0731340791450994, + "grad_norm": 0.471800296558469, + "learning_rate": 2.635254015366842e-06, + "loss": 0.1617, + "step": 6208 + }, + { + "epoch": 2.073468024711972, + "grad_norm": 0.4929302779325763, + "learning_rate": 2.633541943047334e-06, + "loss": 0.1617, + "step": 6209 + }, + { + "epoch": 2.0738019702788444, + "grad_norm": 0.5009578484887693, + "learning_rate": 2.6318302282091772e-06, + "loss": 0.1582, + "step": 6210 + }, + { + "epoch": 2.074135915845717, + "grad_norm": 0.4885347046833819, + "learning_rate": 2.6301188711109494e-06, + "loss": 0.1608, + "step": 6211 + }, + { + "epoch": 2.07446986141259, + "grad_norm": 0.46827738776381345, + "learning_rate": 2.6284078720111693e-06, + "loss": 0.1463, + "step": 6212 + }, + { + "epoch": 2.0748038069794625, + "grad_norm": 0.548235859732826, + "learning_rate": 2.626697231168308e-06, + "loss": 0.1672, + "step": 6213 + }, + { + "epoch": 2.075137752546335, + "grad_norm": 0.4807495227938576, + "learning_rate": 2.624986948840772e-06, + "loss": 0.1597, + "step": 6214 + }, + { + "epoch": 2.0754716981132075, + "grad_norm": 0.5208785862675631, + "learning_rate": 2.6232770252869243e-06, + "loss": 0.1652, + "step": 6215 + }, + { + "epoch": 2.0758056436800802, + "grad_norm": 0.5392394769474849, + "learning_rate": 2.6215674607650653e-06, + "loss": 0.1674, + "step": 6216 + }, + { + "epoch": 2.076139589246953, + "grad_norm": 0.4812349060192288, + "learning_rate": 2.619858255533446e-06, + "loss": 0.1555, + "step": 6217 + }, + { + "epoch": 2.0764735348138252, + "grad_norm": 0.4772963896311151, + "learning_rate": 2.6181494098502626e-06, + "loss": 0.1508, + "step": 6218 + }, + { + "epoch": 2.076807480380698, + "grad_norm": 0.4820130088588004, + "learning_rate": 2.616440923973659e-06, + "loss": 0.1579, + "step": 6219 + }, + { + "epoch": 2.0771414259475707, + "grad_norm": 0.47326464893761655, + "learning_rate": 2.6147327981617167e-06, + "loss": 0.156, + "step": 6220 + }, + { + "epoch": 2.077475371514443, + "grad_norm": 0.4814515104395897, + "learning_rate": 2.613025032672472e-06, + "loss": 0.1565, + "step": 6221 + }, + { + "epoch": 2.0778093170813157, + "grad_norm": 0.484941581282741, + "learning_rate": 2.611317627763901e-06, + "loss": 0.1559, + "step": 6222 + }, + { + "epoch": 2.0781432626481884, + "grad_norm": 0.46578841936370874, + "learning_rate": 2.609610583693928e-06, + "loss": 0.1565, + "step": 6223 + }, + { + "epoch": 2.078477208215061, + "grad_norm": 0.5122244564079962, + "learning_rate": 2.6079039007204238e-06, + "loss": 0.1628, + "step": 6224 + }, + { + "epoch": 2.0788111537819334, + "grad_norm": 0.4916632187074603, + "learning_rate": 2.6061975791011996e-06, + "loss": 0.1529, + "step": 6225 + }, + { + "epoch": 2.079145099348806, + "grad_norm": 0.4816831224754778, + "learning_rate": 2.6044916190940194e-06, + "loss": 0.1592, + "step": 6226 + }, + { + "epoch": 2.079479044915679, + "grad_norm": 0.4506436759636466, + "learning_rate": 2.6027860209565835e-06, + "loss": 0.145, + "step": 6227 + }, + { + "epoch": 2.0798129904825515, + "grad_norm": 0.5060128940585642, + "learning_rate": 2.6010807849465468e-06, + "loss": 0.1579, + "step": 6228 + }, + { + "epoch": 2.080146936049424, + "grad_norm": 0.5398542683218175, + "learning_rate": 2.5993759113215032e-06, + "loss": 0.178, + "step": 6229 + }, + { + "epoch": 2.0804808816162965, + "grad_norm": 0.46194809764874106, + "learning_rate": 2.5976714003389963e-06, + "loss": 0.1558, + "step": 6230 + }, + { + "epoch": 2.0808148271831692, + "grad_norm": 0.5504935947232371, + "learning_rate": 2.5959672522565095e-06, + "loss": 0.1697, + "step": 6231 + }, + { + "epoch": 2.081148772750042, + "grad_norm": 0.5461299363549844, + "learning_rate": 2.594263467331477e-06, + "loss": 0.1653, + "step": 6232 + }, + { + "epoch": 2.0814827183169142, + "grad_norm": 0.5349505003346677, + "learning_rate": 2.592560045821273e-06, + "loss": 0.1631, + "step": 6233 + }, + { + "epoch": 2.081816663883787, + "grad_norm": 0.5028722732399603, + "learning_rate": 2.5908569879832223e-06, + "loss": 0.1587, + "step": 6234 + }, + { + "epoch": 2.0821506094506597, + "grad_norm": 0.4847488245991641, + "learning_rate": 2.5891542940745873e-06, + "loss": 0.1645, + "step": 6235 + }, + { + "epoch": 2.082484555017532, + "grad_norm": 0.5179191384302583, + "learning_rate": 2.5874519643525864e-06, + "loss": 0.1652, + "step": 6236 + }, + { + "epoch": 2.0828185005844047, + "grad_norm": 0.5235684818125339, + "learning_rate": 2.5857499990743706e-06, + "loss": 0.1662, + "step": 6237 + }, + { + "epoch": 2.0831524461512774, + "grad_norm": 0.4954099960770102, + "learning_rate": 2.584048398497047e-06, + "loss": 0.1576, + "step": 6238 + }, + { + "epoch": 2.08348639171815, + "grad_norm": 0.5232717340566326, + "learning_rate": 2.5823471628776574e-06, + "loss": 0.1611, + "step": 6239 + }, + { + "epoch": 2.0838203372850224, + "grad_norm": 0.518556939706427, + "learning_rate": 2.5806462924731955e-06, + "loss": 0.1651, + "step": 6240 + }, + { + "epoch": 2.084154282851895, + "grad_norm": 0.4616700212286318, + "learning_rate": 2.5789457875405986e-06, + "loss": 0.1516, + "step": 6241 + }, + { + "epoch": 2.084488228418768, + "grad_norm": 0.5116149450430864, + "learning_rate": 2.57724564833675e-06, + "loss": 0.1602, + "step": 6242 + }, + { + "epoch": 2.0848221739856405, + "grad_norm": 0.4893670944675162, + "learning_rate": 2.5755458751184705e-06, + "loss": 0.1565, + "step": 6243 + }, + { + "epoch": 2.085156119552513, + "grad_norm": 0.4520354757290471, + "learning_rate": 2.5738464681425356e-06, + "loss": 0.1531, + "step": 6244 + }, + { + "epoch": 2.0854900651193855, + "grad_norm": 0.5233378545227989, + "learning_rate": 2.5721474276656566e-06, + "loss": 0.1656, + "step": 6245 + }, + { + "epoch": 2.0858240106862582, + "grad_norm": 0.5282369500802311, + "learning_rate": 2.5704487539444956e-06, + "loss": 0.17, + "step": 6246 + }, + { + "epoch": 2.086157956253131, + "grad_norm": 0.572079378431354, + "learning_rate": 2.5687504472356596e-06, + "loss": 0.1673, + "step": 6247 + }, + { + "epoch": 2.086491901820003, + "grad_norm": 0.5136764358982354, + "learning_rate": 2.5670525077956944e-06, + "loss": 0.1672, + "step": 6248 + }, + { + "epoch": 2.086825847386876, + "grad_norm": 0.4780306930943609, + "learning_rate": 2.5653549358810957e-06, + "loss": 0.1512, + "step": 6249 + }, + { + "epoch": 2.0871597929537486, + "grad_norm": 0.5351708052915466, + "learning_rate": 2.563657731748299e-06, + "loss": 0.1616, + "step": 6250 + }, + { + "epoch": 2.087493738520621, + "grad_norm": 0.48212931576855894, + "learning_rate": 2.5619608956536895e-06, + "loss": 0.1541, + "step": 6251 + }, + { + "epoch": 2.0878276840874936, + "grad_norm": 0.4882011252089522, + "learning_rate": 2.5602644278535937e-06, + "loss": 0.1689, + "step": 6252 + }, + { + "epoch": 2.0881616296543664, + "grad_norm": 0.47356586066482304, + "learning_rate": 2.558568328604285e-06, + "loss": 0.1535, + "step": 6253 + }, + { + "epoch": 2.088495575221239, + "grad_norm": 0.5437862879169038, + "learning_rate": 2.5568725981619747e-06, + "loss": 0.1729, + "step": 6254 + }, + { + "epoch": 2.0888295207881113, + "grad_norm": 0.5035531617051279, + "learning_rate": 2.5551772367828276e-06, + "loss": 0.1774, + "step": 6255 + }, + { + "epoch": 2.089163466354984, + "grad_norm": 0.4478316519506567, + "learning_rate": 2.5534822447229436e-06, + "loss": 0.146, + "step": 6256 + }, + { + "epoch": 2.089497411921857, + "grad_norm": 0.5394101004610315, + "learning_rate": 2.551787622238376e-06, + "loss": 0.1699, + "step": 6257 + }, + { + "epoch": 2.0898313574887295, + "grad_norm": 0.49097878747683316, + "learning_rate": 2.5500933695851104e-06, + "loss": 0.155, + "step": 6258 + }, + { + "epoch": 2.0901653030556018, + "grad_norm": 0.5146220478613275, + "learning_rate": 2.548399487019092e-06, + "loss": 0.1591, + "step": 6259 + }, + { + "epoch": 2.0904992486224745, + "grad_norm": 0.5347843554604156, + "learning_rate": 2.5467059747961953e-06, + "loss": 0.1591, + "step": 6260 + }, + { + "epoch": 2.090833194189347, + "grad_norm": 0.5233614282885114, + "learning_rate": 2.54501283317225e-06, + "loss": 0.1649, + "step": 6261 + }, + { + "epoch": 2.09116713975622, + "grad_norm": 0.5146286619898135, + "learning_rate": 2.5433200624030212e-06, + "loss": 0.161, + "step": 6262 + }, + { + "epoch": 2.091501085323092, + "grad_norm": 0.5020319234607654, + "learning_rate": 2.541627662744225e-06, + "loss": 0.1574, + "step": 6263 + }, + { + "epoch": 2.091835030889965, + "grad_norm": 0.5004055836984991, + "learning_rate": 2.5399356344515138e-06, + "loss": 0.1572, + "step": 6264 + }, + { + "epoch": 2.0921689764568376, + "grad_norm": 0.4979420565995763, + "learning_rate": 2.538243977780494e-06, + "loss": 0.1564, + "step": 6265 + }, + { + "epoch": 2.0925029220237104, + "grad_norm": 0.51833341034248, + "learning_rate": 2.5365526929867056e-06, + "loss": 0.1628, + "step": 6266 + }, + { + "epoch": 2.0928368675905826, + "grad_norm": 0.5459444259176912, + "learning_rate": 2.534861780325642e-06, + "loss": 0.1555, + "step": 6267 + }, + { + "epoch": 2.0931708131574553, + "grad_norm": 0.4788442607046698, + "learning_rate": 2.53317124005273e-06, + "loss": 0.1499, + "step": 6268 + }, + { + "epoch": 2.093504758724328, + "grad_norm": 0.5008550202971368, + "learning_rate": 2.5314810724233502e-06, + "loss": 0.1683, + "step": 6269 + }, + { + "epoch": 2.0938387042912003, + "grad_norm": 0.522704888977689, + "learning_rate": 2.529791277692818e-06, + "loss": 0.1655, + "step": 6270 + }, + { + "epoch": 2.094172649858073, + "grad_norm": 0.507328669366408, + "learning_rate": 2.5281018561163996e-06, + "loss": 0.1635, + "step": 6271 + }, + { + "epoch": 2.0945065954249458, + "grad_norm": 0.5081138020513082, + "learning_rate": 2.5264128079493033e-06, + "loss": 0.1694, + "step": 6272 + }, + { + "epoch": 2.0948405409918185, + "grad_norm": 0.4772258614297116, + "learning_rate": 2.524724133446676e-06, + "loss": 0.1616, + "step": 6273 + }, + { + "epoch": 2.0951744865586908, + "grad_norm": 0.495524661859938, + "learning_rate": 2.523035832863614e-06, + "loss": 0.1641, + "step": 6274 + }, + { + "epoch": 2.0955084321255635, + "grad_norm": 0.4529427960497026, + "learning_rate": 2.521347906455154e-06, + "loss": 0.1517, + "step": 6275 + }, + { + "epoch": 2.095842377692436, + "grad_norm": 0.511282048473958, + "learning_rate": 2.5196603544762804e-06, + "loss": 0.1628, + "step": 6276 + }, + { + "epoch": 2.096176323259309, + "grad_norm": 0.4720720037080335, + "learning_rate": 2.5179731771819133e-06, + "loss": 0.1573, + "step": 6277 + }, + { + "epoch": 2.096510268826181, + "grad_norm": 0.46615925544747905, + "learning_rate": 2.5162863748269247e-06, + "loss": 0.1518, + "step": 6278 + }, + { + "epoch": 2.096844214393054, + "grad_norm": 0.5095981778636587, + "learning_rate": 2.514599947666122e-06, + "loss": 0.1593, + "step": 6279 + }, + { + "epoch": 2.0971781599599266, + "grad_norm": 0.5176486525595663, + "learning_rate": 2.5129138959542633e-06, + "loss": 0.1702, + "step": 6280 + }, + { + "epoch": 2.0975121055267993, + "grad_norm": 0.476234499249456, + "learning_rate": 2.5112282199460415e-06, + "loss": 0.1543, + "step": 6281 + }, + { + "epoch": 2.0978460510936716, + "grad_norm": 0.522206312703102, + "learning_rate": 2.5095429198961056e-06, + "loss": 0.173, + "step": 6282 + }, + { + "epoch": 2.0981799966605443, + "grad_norm": 0.5078914776921112, + "learning_rate": 2.507857996059034e-06, + "loss": 0.1628, + "step": 6283 + }, + { + "epoch": 2.098513942227417, + "grad_norm": 0.48432480768589903, + "learning_rate": 2.5061734486893574e-06, + "loss": 0.1541, + "step": 6284 + }, + { + "epoch": 2.0988478877942893, + "grad_norm": 0.49121279758470837, + "learning_rate": 2.504489278041544e-06, + "loss": 0.1597, + "step": 6285 + }, + { + "epoch": 2.099181833361162, + "grad_norm": 0.47385292608458507, + "learning_rate": 2.5028054843700102e-06, + "loss": 0.1531, + "step": 6286 + }, + { + "epoch": 2.0995157789280348, + "grad_norm": 0.4841879216861, + "learning_rate": 2.501122067929108e-06, + "loss": 0.1517, + "step": 6287 + }, + { + "epoch": 2.0998497244949075, + "grad_norm": 0.479421292773347, + "learning_rate": 2.4994390289731446e-06, + "loss": 0.1551, + "step": 6288 + }, + { + "epoch": 2.1001836700617798, + "grad_norm": 0.5527499779534556, + "learning_rate": 2.497756367756357e-06, + "loss": 0.1618, + "step": 6289 + }, + { + "epoch": 2.1005176156286525, + "grad_norm": 0.4913185942010325, + "learning_rate": 2.496074084532935e-06, + "loss": 0.154, + "step": 6290 + }, + { + "epoch": 2.100851561195525, + "grad_norm": 0.5031384055965269, + "learning_rate": 2.4943921795570033e-06, + "loss": 0.1479, + "step": 6291 + }, + { + "epoch": 2.101185506762398, + "grad_norm": 0.49927324992800404, + "learning_rate": 2.4927106530826372e-06, + "loss": 0.1658, + "step": 6292 + }, + { + "epoch": 2.10151945232927, + "grad_norm": 0.5309889957366104, + "learning_rate": 2.491029505363848e-06, + "loss": 0.1422, + "step": 6293 + }, + { + "epoch": 2.101853397896143, + "grad_norm": 0.49560503238802656, + "learning_rate": 2.489348736654593e-06, + "loss": 0.1625, + "step": 6294 + }, + { + "epoch": 2.1021873434630156, + "grad_norm": 0.4782744370282557, + "learning_rate": 2.4876683472087767e-06, + "loss": 0.1508, + "step": 6295 + }, + { + "epoch": 2.1025212890298883, + "grad_norm": 0.4648251202295055, + "learning_rate": 2.4859883372802357e-06, + "loss": 0.1514, + "step": 6296 + }, + { + "epoch": 2.1028552345967606, + "grad_norm": 0.47761287277219516, + "learning_rate": 2.484308707122758e-06, + "loss": 0.1469, + "step": 6297 + }, + { + "epoch": 2.1031891801636333, + "grad_norm": 0.5120606241455523, + "learning_rate": 2.4826294569900725e-06, + "loss": 0.1599, + "step": 6298 + }, + { + "epoch": 2.103523125730506, + "grad_norm": 0.5192893295764826, + "learning_rate": 2.4809505871358476e-06, + "loss": 0.1647, + "step": 6299 + }, + { + "epoch": 2.1038570712973783, + "grad_norm": 0.49615949952418487, + "learning_rate": 2.4792720978136967e-06, + "loss": 0.1615, + "step": 6300 + }, + { + "epoch": 2.104191016864251, + "grad_norm": 0.5171662212170216, + "learning_rate": 2.4775939892771787e-06, + "loss": 0.1544, + "step": 6301 + }, + { + "epoch": 2.1045249624311237, + "grad_norm": 0.5549893785210516, + "learning_rate": 2.4759162617797873e-06, + "loss": 0.1687, + "step": 6302 + }, + { + "epoch": 2.1048589079979965, + "grad_norm": 0.5230583873016571, + "learning_rate": 2.4742389155749657e-06, + "loss": 0.1651, + "step": 6303 + }, + { + "epoch": 2.1051928535648687, + "grad_norm": 0.4862306544241593, + "learning_rate": 2.472561950916094e-06, + "loss": 0.1553, + "step": 6304 + }, + { + "epoch": 2.1055267991317415, + "grad_norm": 0.5309377784725847, + "learning_rate": 2.4708853680565e-06, + "loss": 0.1652, + "step": 6305 + }, + { + "epoch": 2.105860744698614, + "grad_norm": 0.5055232116428447, + "learning_rate": 2.4692091672494494e-06, + "loss": 0.163, + "step": 6306 + }, + { + "epoch": 2.106194690265487, + "grad_norm": 0.5642390943911669, + "learning_rate": 2.4675333487481558e-06, + "loss": 0.1711, + "step": 6307 + }, + { + "epoch": 2.106528635832359, + "grad_norm": 0.461679532517826, + "learning_rate": 2.4658579128057665e-06, + "loss": 0.1534, + "step": 6308 + }, + { + "epoch": 2.106862581399232, + "grad_norm": 0.5100394097368949, + "learning_rate": 2.4641828596753803e-06, + "loss": 0.1637, + "step": 6309 + }, + { + "epoch": 2.1071965269661046, + "grad_norm": 0.46839005082728324, + "learning_rate": 2.4625081896100294e-06, + "loss": 0.1535, + "step": 6310 + }, + { + "epoch": 2.1075304725329773, + "grad_norm": 0.4964670435707832, + "learning_rate": 2.4608339028626943e-06, + "loss": 0.1647, + "step": 6311 + }, + { + "epoch": 2.1078644180998496, + "grad_norm": 0.48855274829211875, + "learning_rate": 2.4591599996862957e-06, + "loss": 0.1639, + "step": 6312 + }, + { + "epoch": 2.1081983636667223, + "grad_norm": 0.5450080590558118, + "learning_rate": 2.457486480333699e-06, + "loss": 0.174, + "step": 6313 + }, + { + "epoch": 2.108532309233595, + "grad_norm": 0.5224423801079273, + "learning_rate": 2.4558133450577044e-06, + "loss": 0.1577, + "step": 6314 + }, + { + "epoch": 2.1088662548004677, + "grad_norm": 0.5174969356229879, + "learning_rate": 2.4541405941110626e-06, + "loss": 0.162, + "step": 6315 + }, + { + "epoch": 2.10920020036734, + "grad_norm": 0.4964356122182152, + "learning_rate": 2.452468227746459e-06, + "loss": 0.1575, + "step": 6316 + }, + { + "epoch": 2.1095341459342127, + "grad_norm": 0.5613011623009709, + "learning_rate": 2.4507962462165254e-06, + "loss": 0.1669, + "step": 6317 + }, + { + "epoch": 2.1098680915010855, + "grad_norm": 0.47625742049767633, + "learning_rate": 2.449124649773835e-06, + "loss": 0.1559, + "step": 6318 + }, + { + "epoch": 2.1102020370679577, + "grad_norm": 0.6372927108132521, + "learning_rate": 2.4474534386709036e-06, + "loss": 0.1766, + "step": 6319 + }, + { + "epoch": 2.1105359826348304, + "grad_norm": 0.5062800748969212, + "learning_rate": 2.4457826131601835e-06, + "loss": 0.1498, + "step": 6320 + }, + { + "epoch": 2.110869928201703, + "grad_norm": 0.5307208312611601, + "learning_rate": 2.444112173494077e-06, + "loss": 0.1615, + "step": 6321 + }, + { + "epoch": 2.111203873768576, + "grad_norm": 0.5407648531518704, + "learning_rate": 2.4424421199249194e-06, + "loss": 0.1691, + "step": 6322 + }, + { + "epoch": 2.111537819335448, + "grad_norm": 0.5343008151179298, + "learning_rate": 2.440772452704993e-06, + "loss": 0.1642, + "step": 6323 + }, + { + "epoch": 2.111871764902321, + "grad_norm": 0.49431699281947467, + "learning_rate": 2.4391031720865246e-06, + "loss": 0.1634, + "step": 6324 + }, + { + "epoch": 2.1122057104691936, + "grad_norm": 0.46526916827882536, + "learning_rate": 2.4374342783216732e-06, + "loss": 0.1483, + "step": 6325 + }, + { + "epoch": 2.1125396560360663, + "grad_norm": 0.5278048019270066, + "learning_rate": 2.435765771662549e-06, + "loss": 0.1569, + "step": 6326 + }, + { + "epoch": 2.1128736016029386, + "grad_norm": 0.48140891039036754, + "learning_rate": 2.4340976523611957e-06, + "loss": 0.1496, + "step": 6327 + }, + { + "epoch": 2.1132075471698113, + "grad_norm": 0.49088279957734005, + "learning_rate": 2.4324299206696057e-06, + "loss": 0.1609, + "step": 6328 + }, + { + "epoch": 2.113541492736684, + "grad_norm": 0.49893234907819617, + "learning_rate": 2.4307625768397077e-06, + "loss": 0.164, + "step": 6329 + }, + { + "epoch": 2.1138754383035567, + "grad_norm": 0.5409361980416141, + "learning_rate": 2.4290956211233757e-06, + "loss": 0.1665, + "step": 6330 + }, + { + "epoch": 2.114209383870429, + "grad_norm": 0.5101008229874319, + "learning_rate": 2.42742905377242e-06, + "loss": 0.1529, + "step": 6331 + }, + { + "epoch": 2.1145433294373017, + "grad_norm": 0.5238806985611678, + "learning_rate": 2.4257628750385987e-06, + "loss": 0.1633, + "step": 6332 + }, + { + "epoch": 2.1148772750041744, + "grad_norm": 0.4728995120386821, + "learning_rate": 2.424097085173604e-06, + "loss": 0.1476, + "step": 6333 + }, + { + "epoch": 2.1152112205710467, + "grad_norm": 0.5183928863262791, + "learning_rate": 2.4224316844290747e-06, + "loss": 0.1603, + "step": 6334 + }, + { + "epoch": 2.1155451661379194, + "grad_norm": 0.5244171017941279, + "learning_rate": 2.4207666730565893e-06, + "loss": 0.1601, + "step": 6335 + }, + { + "epoch": 2.115879111704792, + "grad_norm": 0.483669792524147, + "learning_rate": 2.4191020513076697e-06, + "loss": 0.1643, + "step": 6336 + }, + { + "epoch": 2.116213057271665, + "grad_norm": 0.5036617136208132, + "learning_rate": 2.4174378194337715e-06, + "loss": 0.1617, + "step": 6337 + }, + { + "epoch": 2.116547002838537, + "grad_norm": 0.5000061906999408, + "learning_rate": 2.4157739776863023e-06, + "loss": 0.16, + "step": 6338 + }, + { + "epoch": 2.11688094840541, + "grad_norm": 0.48562974176186696, + "learning_rate": 2.4141105263166e-06, + "loss": 0.1594, + "step": 6339 + }, + { + "epoch": 2.1172148939722826, + "grad_norm": 0.4796849682215605, + "learning_rate": 2.41244746557595e-06, + "loss": 0.152, + "step": 6340 + }, + { + "epoch": 2.1175488395391553, + "grad_norm": 0.45539955587257097, + "learning_rate": 2.4107847957155784e-06, + "loss": 0.157, + "step": 6341 + }, + { + "epoch": 2.1178827851060276, + "grad_norm": 0.46438375956752614, + "learning_rate": 2.409122516986652e-06, + "loss": 0.1549, + "step": 6342 + }, + { + "epoch": 2.1182167306729003, + "grad_norm": 0.4514297641602212, + "learning_rate": 2.4074606296402735e-06, + "loss": 0.1519, + "step": 6343 + }, + { + "epoch": 2.118550676239773, + "grad_norm": 0.4859800183560971, + "learning_rate": 2.405799133927496e-06, + "loss": 0.1593, + "step": 6344 + }, + { + "epoch": 2.1188846218066457, + "grad_norm": 0.5005344670210375, + "learning_rate": 2.404138030099303e-06, + "loss": 0.1597, + "step": 6345 + }, + { + "epoch": 2.119218567373518, + "grad_norm": 0.48375070616490734, + "learning_rate": 2.4024773184066253e-06, + "loss": 0.1665, + "step": 6346 + }, + { + "epoch": 2.1195525129403907, + "grad_norm": 0.472840448624495, + "learning_rate": 2.4008169991003356e-06, + "loss": 0.1452, + "step": 6347 + }, + { + "epoch": 2.1198864585072634, + "grad_norm": 0.49084422815911033, + "learning_rate": 2.3991570724312405e-06, + "loss": 0.1528, + "step": 6348 + }, + { + "epoch": 2.1202204040741357, + "grad_norm": 0.4692583452343208, + "learning_rate": 2.3974975386500958e-06, + "loss": 0.155, + "step": 6349 + }, + { + "epoch": 2.1205543496410084, + "grad_norm": 0.5062663536376966, + "learning_rate": 2.3958383980075896e-06, + "loss": 0.1633, + "step": 6350 + }, + { + "epoch": 2.120888295207881, + "grad_norm": 0.5403727659874564, + "learning_rate": 2.394179650754358e-06, + "loss": 0.1671, + "step": 6351 + }, + { + "epoch": 2.121222240774754, + "grad_norm": 0.5235630447457905, + "learning_rate": 2.3925212971409688e-06, + "loss": 0.1622, + "step": 6352 + }, + { + "epoch": 2.121556186341626, + "grad_norm": 0.4699026763388292, + "learning_rate": 2.3908633374179436e-06, + "loss": 0.1518, + "step": 6353 + }, + { + "epoch": 2.121890131908499, + "grad_norm": 0.5167884671729314, + "learning_rate": 2.3892057718357308e-06, + "loss": 0.1604, + "step": 6354 + }, + { + "epoch": 2.1222240774753716, + "grad_norm": 0.4623642417869892, + "learning_rate": 2.3875486006447294e-06, + "loss": 0.1491, + "step": 6355 + }, + { + "epoch": 2.1225580230422443, + "grad_norm": 0.5158989070059895, + "learning_rate": 2.3858918240952703e-06, + "loss": 0.1576, + "step": 6356 + }, + { + "epoch": 2.1228919686091166, + "grad_norm": 0.5110491439871296, + "learning_rate": 2.384235442437632e-06, + "loss": 0.1651, + "step": 6357 + }, + { + "epoch": 2.1232259141759893, + "grad_norm": 0.5119404415532335, + "learning_rate": 2.3825794559220296e-06, + "loss": 0.1729, + "step": 6358 + }, + { + "epoch": 2.123559859742862, + "grad_norm": 0.5501830648498629, + "learning_rate": 2.380923864798621e-06, + "loss": 0.1692, + "step": 6359 + }, + { + "epoch": 2.1238938053097347, + "grad_norm": 0.5038516827919598, + "learning_rate": 2.3792686693174993e-06, + "loss": 0.1627, + "step": 6360 + }, + { + "epoch": 2.124227750876607, + "grad_norm": 0.5332702528419736, + "learning_rate": 2.3776138697287055e-06, + "loss": 0.1651, + "step": 6361 + }, + { + "epoch": 2.1245616964434797, + "grad_norm": 0.4763237636455076, + "learning_rate": 2.3759594662822122e-06, + "loss": 0.1436, + "step": 6362 + }, + { + "epoch": 2.1248956420103524, + "grad_norm": 0.5286176751132988, + "learning_rate": 2.3743054592279386e-06, + "loss": 0.1605, + "step": 6363 + }, + { + "epoch": 2.125229587577225, + "grad_norm": 0.47547415889114697, + "learning_rate": 2.372651848815742e-06, + "loss": 0.1473, + "step": 6364 + }, + { + "epoch": 2.1255635331440974, + "grad_norm": 0.4533019654138708, + "learning_rate": 2.370998635295421e-06, + "loss": 0.1531, + "step": 6365 + }, + { + "epoch": 2.12589747871097, + "grad_norm": 0.45228899024811703, + "learning_rate": 2.3693458189167106e-06, + "loss": 0.1517, + "step": 6366 + }, + { + "epoch": 2.126231424277843, + "grad_norm": 0.5400654982070013, + "learning_rate": 2.3676933999292905e-06, + "loss": 0.1697, + "step": 6367 + }, + { + "epoch": 2.126565369844715, + "grad_norm": 0.5439371706519016, + "learning_rate": 2.366041378582775e-06, + "loss": 0.171, + "step": 6368 + }, + { + "epoch": 2.126899315411588, + "grad_norm": 0.5018152963433691, + "learning_rate": 2.364389755126723e-06, + "loss": 0.1528, + "step": 6369 + }, + { + "epoch": 2.1272332609784605, + "grad_norm": 0.487011525737873, + "learning_rate": 2.3627385298106344e-06, + "loss": 0.1598, + "step": 6370 + }, + { + "epoch": 2.1275672065453333, + "grad_norm": 0.5186675987624135, + "learning_rate": 2.361087702883941e-06, + "loss": 0.1655, + "step": 6371 + }, + { + "epoch": 2.1279011521122055, + "grad_norm": 0.4740219296383837, + "learning_rate": 2.359437274596024e-06, + "loss": 0.1535, + "step": 6372 + }, + { + "epoch": 2.1282350976790783, + "grad_norm": 0.4723892674328739, + "learning_rate": 2.357787245196197e-06, + "loss": 0.1461, + "step": 6373 + }, + { + "epoch": 2.128569043245951, + "grad_norm": 0.4653901182497193, + "learning_rate": 2.3561376149337188e-06, + "loss": 0.1542, + "step": 6374 + }, + { + "epoch": 2.1289029888128237, + "grad_norm": 0.5055198140337246, + "learning_rate": 2.3544883840577815e-06, + "loss": 0.1614, + "step": 6375 + }, + { + "epoch": 2.129236934379696, + "grad_norm": 0.49423346871659607, + "learning_rate": 2.352839552817527e-06, + "loss": 0.1606, + "step": 6376 + }, + { + "epoch": 2.1295708799465687, + "grad_norm": 0.5028984209451132, + "learning_rate": 2.3511911214620255e-06, + "loss": 0.1524, + "step": 6377 + }, + { + "epoch": 2.1299048255134414, + "grad_norm": 0.534742967358469, + "learning_rate": 2.3495430902402956e-06, + "loss": 0.1685, + "step": 6378 + }, + { + "epoch": 2.1302387710803137, + "grad_norm": 0.5180111072516133, + "learning_rate": 2.3478954594012884e-06, + "loss": 0.1613, + "step": 6379 + }, + { + "epoch": 2.1305727166471864, + "grad_norm": 0.4735330750800575, + "learning_rate": 2.346248229193901e-06, + "loss": 0.1488, + "step": 6380 + }, + { + "epoch": 2.130906662214059, + "grad_norm": 0.5283937925064244, + "learning_rate": 2.344601399866962e-06, + "loss": 0.1635, + "step": 6381 + }, + { + "epoch": 2.131240607780932, + "grad_norm": 0.5098981708034045, + "learning_rate": 2.342954971669252e-06, + "loss": 0.1621, + "step": 6382 + }, + { + "epoch": 2.131574553347804, + "grad_norm": 0.5297193277625377, + "learning_rate": 2.341308944849477e-06, + "loss": 0.1697, + "step": 6383 + }, + { + "epoch": 2.131908498914677, + "grad_norm": 0.4851259261560039, + "learning_rate": 2.3396633196562924e-06, + "loss": 0.154, + "step": 6384 + }, + { + "epoch": 2.1322424444815495, + "grad_norm": 0.5309045479819359, + "learning_rate": 2.3380180963382866e-06, + "loss": 0.1625, + "step": 6385 + }, + { + "epoch": 2.1325763900484223, + "grad_norm": 0.529874878718374, + "learning_rate": 2.3363732751439926e-06, + "loss": 0.1707, + "step": 6386 + }, + { + "epoch": 2.1329103356152945, + "grad_norm": 0.5019851431122059, + "learning_rate": 2.334728856321875e-06, + "loss": 0.1647, + "step": 6387 + }, + { + "epoch": 2.1332442811821672, + "grad_norm": 0.5565660036653327, + "learning_rate": 2.33308484012035e-06, + "loss": 0.1724, + "step": 6388 + }, + { + "epoch": 2.13357822674904, + "grad_norm": 0.4636960099887346, + "learning_rate": 2.33144122678776e-06, + "loss": 0.1483, + "step": 6389 + }, + { + "epoch": 2.1339121723159127, + "grad_norm": 0.4886503505718227, + "learning_rate": 2.3297980165723953e-06, + "loss": 0.1556, + "step": 6390 + }, + { + "epoch": 2.134246117882785, + "grad_norm": 0.5243203293316878, + "learning_rate": 2.3281552097224798e-06, + "loss": 0.1692, + "step": 6391 + }, + { + "epoch": 2.1345800634496577, + "grad_norm": 0.5512527072406636, + "learning_rate": 2.326512806486181e-06, + "loss": 0.1702, + "step": 6392 + }, + { + "epoch": 2.1349140090165304, + "grad_norm": 0.4869770099606462, + "learning_rate": 2.3248708071116005e-06, + "loss": 0.1604, + "step": 6393 + }, + { + "epoch": 2.135247954583403, + "grad_norm": 0.4892570672975904, + "learning_rate": 2.323229211846783e-06, + "loss": 0.1547, + "step": 6394 + }, + { + "epoch": 2.1355819001502754, + "grad_norm": 0.49250010065010924, + "learning_rate": 2.3215880209397133e-06, + "loss": 0.1537, + "step": 6395 + }, + { + "epoch": 2.135915845717148, + "grad_norm": 0.47927021514041757, + "learning_rate": 2.319947234638308e-06, + "loss": 0.1532, + "step": 6396 + }, + { + "epoch": 2.136249791284021, + "grad_norm": 0.5035140794080505, + "learning_rate": 2.3183068531904317e-06, + "loss": 0.1624, + "step": 6397 + }, + { + "epoch": 2.136583736850893, + "grad_norm": 0.5127475929816925, + "learning_rate": 2.3166668768438772e-06, + "loss": 0.1566, + "step": 6398 + }, + { + "epoch": 2.136917682417766, + "grad_norm": 0.5021756770030403, + "learning_rate": 2.31502730584639e-06, + "loss": 0.16, + "step": 6399 + }, + { + "epoch": 2.1372516279846385, + "grad_norm": 0.49666732574803907, + "learning_rate": 2.313388140445641e-06, + "loss": 0.1579, + "step": 6400 + }, + { + "epoch": 2.1375855735515112, + "grad_norm": 0.48350078859534124, + "learning_rate": 2.311749380889249e-06, + "loss": 0.1654, + "step": 6401 + }, + { + "epoch": 2.1379195191183835, + "grad_norm": 0.5439150444830929, + "learning_rate": 2.310111027424764e-06, + "loss": 0.1725, + "step": 6402 + }, + { + "epoch": 2.1382534646852562, + "grad_norm": 0.48456783877805504, + "learning_rate": 2.308473080299683e-06, + "loss": 0.1408, + "step": 6403 + }, + { + "epoch": 2.138587410252129, + "grad_norm": 0.4718955603006833, + "learning_rate": 2.3068355397614313e-06, + "loss": 0.1574, + "step": 6404 + }, + { + "epoch": 2.1389213558190017, + "grad_norm": 0.5054752478801913, + "learning_rate": 2.3051984060573855e-06, + "loss": 0.1601, + "step": 6405 + }, + { + "epoch": 2.139255301385874, + "grad_norm": 0.4728648535556213, + "learning_rate": 2.303561679434849e-06, + "loss": 0.1509, + "step": 6406 + }, + { + "epoch": 2.1395892469527467, + "grad_norm": 0.527707051909604, + "learning_rate": 2.3019253601410725e-06, + "loss": 0.1704, + "step": 6407 + }, + { + "epoch": 2.1399231925196194, + "grad_norm": 0.47206826548069847, + "learning_rate": 2.300289448423237e-06, + "loss": 0.1479, + "step": 6408 + }, + { + "epoch": 2.140257138086492, + "grad_norm": 0.5071150172469431, + "learning_rate": 2.2986539445284705e-06, + "loss": 0.1628, + "step": 6409 + }, + { + "epoch": 2.1405910836533644, + "grad_norm": 0.5114902739240026, + "learning_rate": 2.2970188487038293e-06, + "loss": 0.1616, + "step": 6410 + }, + { + "epoch": 2.140925029220237, + "grad_norm": 0.5143644585016027, + "learning_rate": 2.295384161196321e-06, + "loss": 0.1639, + "step": 6411 + }, + { + "epoch": 2.14125897478711, + "grad_norm": 0.47396643773274283, + "learning_rate": 2.293749882252879e-06, + "loss": 0.1453, + "step": 6412 + }, + { + "epoch": 2.1415929203539825, + "grad_norm": 0.5070902818006285, + "learning_rate": 2.2921160121203847e-06, + "loss": 0.1617, + "step": 6413 + }, + { + "epoch": 2.141926865920855, + "grad_norm": 0.5206899156340429, + "learning_rate": 2.290482551045649e-06, + "loss": 0.1673, + "step": 6414 + }, + { + "epoch": 2.1422608114877275, + "grad_norm": 0.4910791307640534, + "learning_rate": 2.2888494992754294e-06, + "loss": 0.163, + "step": 6415 + }, + { + "epoch": 2.1425947570546002, + "grad_norm": 0.49078588542590307, + "learning_rate": 2.2872168570564136e-06, + "loss": 0.1597, + "step": 6416 + }, + { + "epoch": 2.1429287026214725, + "grad_norm": 0.48656902115080175, + "learning_rate": 2.2855846246352335e-06, + "loss": 0.1573, + "step": 6417 + }, + { + "epoch": 2.143262648188345, + "grad_norm": 0.5285475116853615, + "learning_rate": 2.2839528022584596e-06, + "loss": 0.1678, + "step": 6418 + }, + { + "epoch": 2.143596593755218, + "grad_norm": 0.5097327251095314, + "learning_rate": 2.2823213901725927e-06, + "loss": 0.1722, + "step": 6419 + }, + { + "epoch": 2.1439305393220907, + "grad_norm": 0.5020115119724882, + "learning_rate": 2.2806903886240815e-06, + "loss": 0.1567, + "step": 6420 + }, + { + "epoch": 2.144264484888963, + "grad_norm": 0.5005087225398663, + "learning_rate": 2.2790597978593044e-06, + "loss": 0.1608, + "step": 6421 + }, + { + "epoch": 2.1445984304558356, + "grad_norm": 0.4972452001072568, + "learning_rate": 2.2774296181245825e-06, + "loss": 0.1708, + "step": 6422 + }, + { + "epoch": 2.1449323760227084, + "grad_norm": 0.5307429371345069, + "learning_rate": 2.275799849666174e-06, + "loss": 0.1578, + "step": 6423 + }, + { + "epoch": 2.145266321589581, + "grad_norm": 0.5104050954727964, + "learning_rate": 2.274170492730277e-06, + "loss": 0.1669, + "step": 6424 + }, + { + "epoch": 2.1456002671564534, + "grad_norm": 0.5447982842445855, + "learning_rate": 2.27254154756302e-06, + "loss": 0.1671, + "step": 6425 + }, + { + "epoch": 2.145934212723326, + "grad_norm": 0.539396360023142, + "learning_rate": 2.2709130144104795e-06, + "loss": 0.1722, + "step": 6426 + }, + { + "epoch": 2.146268158290199, + "grad_norm": 0.48061812916772473, + "learning_rate": 2.26928489351866e-06, + "loss": 0.1572, + "step": 6427 + }, + { + "epoch": 2.146602103857071, + "grad_norm": 0.5558718837443882, + "learning_rate": 2.267657185133511e-06, + "loss": 0.1679, + "step": 6428 + }, + { + "epoch": 2.146936049423944, + "grad_norm": 0.4912517189710352, + "learning_rate": 2.2660298895009157e-06, + "loss": 0.1533, + "step": 6429 + }, + { + "epoch": 2.1472699949908165, + "grad_norm": 0.5449170299577375, + "learning_rate": 2.2644030068666993e-06, + "loss": 0.1701, + "step": 6430 + }, + { + "epoch": 2.147603940557689, + "grad_norm": 0.5249455334713858, + "learning_rate": 2.2627765374766175e-06, + "loss": 0.1519, + "step": 6431 + }, + { + "epoch": 2.1479378861245615, + "grad_norm": 0.5490812081136831, + "learning_rate": 2.2611504815763715e-06, + "loss": 0.175, + "step": 6432 + }, + { + "epoch": 2.148271831691434, + "grad_norm": 0.501323784094873, + "learning_rate": 2.259524839411592e-06, + "loss": 0.1545, + "step": 6433 + }, + { + "epoch": 2.148605777258307, + "grad_norm": 0.46873001297372796, + "learning_rate": 2.2578996112278535e-06, + "loss": 0.1472, + "step": 6434 + }, + { + "epoch": 2.1489397228251796, + "grad_norm": 0.5482354098239963, + "learning_rate": 2.2562747972706663e-06, + "loss": 0.17, + "step": 6435 + }, + { + "epoch": 2.149273668392052, + "grad_norm": 0.5127389817917001, + "learning_rate": 2.254650397785479e-06, + "loss": 0.1563, + "step": 6436 + }, + { + "epoch": 2.1496076139589246, + "grad_norm": 0.5439560842821645, + "learning_rate": 2.253026413017672e-06, + "loss": 0.1631, + "step": 6437 + }, + { + "epoch": 2.1499415595257974, + "grad_norm": 0.5200199301047186, + "learning_rate": 2.2514028432125722e-06, + "loss": 0.1587, + "step": 6438 + }, + { + "epoch": 2.15027550509267, + "grad_norm": 0.48845583828397887, + "learning_rate": 2.249779688615435e-06, + "loss": 0.1572, + "step": 6439 + }, + { + "epoch": 2.1506094506595423, + "grad_norm": 0.4553983917786923, + "learning_rate": 2.248156949471459e-06, + "loss": 0.1534, + "step": 6440 + }, + { + "epoch": 2.150943396226415, + "grad_norm": 0.4848004378929125, + "learning_rate": 2.2465346260257786e-06, + "loss": 0.1607, + "step": 6441 + }, + { + "epoch": 2.151277341793288, + "grad_norm": 0.49347866729377543, + "learning_rate": 2.2449127185234626e-06, + "loss": 0.1541, + "step": 6442 + }, + { + "epoch": 2.1516112873601605, + "grad_norm": 0.5741045672451875, + "learning_rate": 2.2432912272095227e-06, + "loss": 0.1637, + "step": 6443 + }, + { + "epoch": 2.1519452329270328, + "grad_norm": 0.5120871107028994, + "learning_rate": 2.2416701523288997e-06, + "loss": 0.1607, + "step": 6444 + }, + { + "epoch": 2.1522791784939055, + "grad_norm": 0.4437091895118815, + "learning_rate": 2.240049494126479e-06, + "loss": 0.1463, + "step": 6445 + }, + { + "epoch": 2.152613124060778, + "grad_norm": 0.4871523787141582, + "learning_rate": 2.238429252847079e-06, + "loss": 0.1627, + "step": 6446 + }, + { + "epoch": 2.1529470696276505, + "grad_norm": 0.49786741973353504, + "learning_rate": 2.2368094287354586e-06, + "loss": 0.162, + "step": 6447 + }, + { + "epoch": 2.153281015194523, + "grad_norm": 0.5147125301457852, + "learning_rate": 2.2351900220363083e-06, + "loss": 0.1634, + "step": 6448 + }, + { + "epoch": 2.153614960761396, + "grad_norm": 0.5019185934700248, + "learning_rate": 2.2335710329942613e-06, + "loss": 0.1608, + "step": 6449 + }, + { + "epoch": 2.1539489063282686, + "grad_norm": 0.5206661699341952, + "learning_rate": 2.2319524618538814e-06, + "loss": 0.1593, + "step": 6450 + }, + { + "epoch": 2.154282851895141, + "grad_norm": 0.5048674347182872, + "learning_rate": 2.2303343088596753e-06, + "loss": 0.1581, + "step": 6451 + }, + { + "epoch": 2.1546167974620136, + "grad_norm": 0.48713040989661927, + "learning_rate": 2.2287165742560828e-06, + "loss": 0.1578, + "step": 6452 + }, + { + "epoch": 2.1549507430288863, + "grad_norm": 0.49585282458056723, + "learning_rate": 2.227099258287485e-06, + "loss": 0.157, + "step": 6453 + }, + { + "epoch": 2.155284688595759, + "grad_norm": 0.5328333267249918, + "learning_rate": 2.2254823611981926e-06, + "loss": 0.1658, + "step": 6454 + }, + { + "epoch": 2.1556186341626313, + "grad_norm": 0.4931587019632607, + "learning_rate": 2.2238658832324593e-06, + "loss": 0.1547, + "step": 6455 + }, + { + "epoch": 2.155952579729504, + "grad_norm": 0.4506269112296143, + "learning_rate": 2.222249824634471e-06, + "loss": 0.1439, + "step": 6456 + }, + { + "epoch": 2.1562865252963768, + "grad_norm": 0.5495482012766242, + "learning_rate": 2.220634185648354e-06, + "loss": 0.1773, + "step": 6457 + }, + { + "epoch": 2.1566204708632495, + "grad_norm": 0.483556197943497, + "learning_rate": 2.2190189665181684e-06, + "loss": 0.1548, + "step": 6458 + }, + { + "epoch": 2.1569544164301218, + "grad_norm": 0.504658616614687, + "learning_rate": 2.2174041674879152e-06, + "loss": 0.1605, + "step": 6459 + }, + { + "epoch": 2.1572883619969945, + "grad_norm": 0.47666557877987864, + "learning_rate": 2.2157897888015247e-06, + "loss": 0.1489, + "step": 6460 + }, + { + "epoch": 2.157622307563867, + "grad_norm": 0.5125882112106799, + "learning_rate": 2.214175830702871e-06, + "loss": 0.1576, + "step": 6461 + }, + { + "epoch": 2.15795625313074, + "grad_norm": 0.47157304371108405, + "learning_rate": 2.2125622934357588e-06, + "loss": 0.1528, + "step": 6462 + }, + { + "epoch": 2.158290198697612, + "grad_norm": 0.5841106675855774, + "learning_rate": 2.210949177243933e-06, + "loss": 0.1491, + "step": 6463 + }, + { + "epoch": 2.158624144264485, + "grad_norm": 0.4945144495146848, + "learning_rate": 2.209336482371076e-06, + "loss": 0.161, + "step": 6464 + }, + { + "epoch": 2.1589580898313576, + "grad_norm": 0.5245463934666308, + "learning_rate": 2.2077242090608e-06, + "loss": 0.1638, + "step": 6465 + }, + { + "epoch": 2.15929203539823, + "grad_norm": 0.5189920938025878, + "learning_rate": 2.206112357556662e-06, + "loss": 0.1619, + "step": 6466 + }, + { + "epoch": 2.1596259809651026, + "grad_norm": 0.4710356006327489, + "learning_rate": 2.2045009281021486e-06, + "loss": 0.1492, + "step": 6467 + }, + { + "epoch": 2.1599599265319753, + "grad_norm": 0.5391720083168611, + "learning_rate": 2.202889920940685e-06, + "loss": 0.1673, + "step": 6468 + }, + { + "epoch": 2.160293872098848, + "grad_norm": 0.5300999004989326, + "learning_rate": 2.2012793363156337e-06, + "loss": 0.1662, + "step": 6469 + }, + { + "epoch": 2.1606278176657203, + "grad_norm": 0.545770302502262, + "learning_rate": 2.199669174470295e-06, + "loss": 0.177, + "step": 6470 + }, + { + "epoch": 2.160961763232593, + "grad_norm": 0.5544787542265645, + "learning_rate": 2.1980594356478977e-06, + "loss": 0.1717, + "step": 6471 + }, + { + "epoch": 2.1612957087994658, + "grad_norm": 0.47765190280150527, + "learning_rate": 2.196450120091617e-06, + "loss": 0.1532, + "step": 6472 + }, + { + "epoch": 2.1616296543663385, + "grad_norm": 0.48763213938191263, + "learning_rate": 2.194841228044554e-06, + "loss": 0.1495, + "step": 6473 + }, + { + "epoch": 2.1619635999332107, + "grad_norm": 0.47668729251630615, + "learning_rate": 2.1932327597497537e-06, + "loss": 0.1552, + "step": 6474 + }, + { + "epoch": 2.1622975455000835, + "grad_norm": 0.509947791860357, + "learning_rate": 2.1916247154501937e-06, + "loss": 0.1653, + "step": 6475 + }, + { + "epoch": 2.162631491066956, + "grad_norm": 0.5242288336740697, + "learning_rate": 2.190017095388789e-06, + "loss": 0.167, + "step": 6476 + }, + { + "epoch": 2.1629654366338285, + "grad_norm": 0.4842299004185554, + "learning_rate": 2.1884098998083867e-06, + "loss": 0.1544, + "step": 6477 + }, + { + "epoch": 2.163299382200701, + "grad_norm": 0.5657409384245479, + "learning_rate": 2.1868031289517773e-06, + "loss": 0.17, + "step": 6478 + }, + { + "epoch": 2.163633327767574, + "grad_norm": 0.5057846468272276, + "learning_rate": 2.1851967830616773e-06, + "loss": 0.1608, + "step": 6479 + }, + { + "epoch": 2.1639672733344466, + "grad_norm": 0.5045992465263685, + "learning_rate": 2.1835908623807462e-06, + "loss": 0.1587, + "step": 6480 + }, + { + "epoch": 2.164301218901319, + "grad_norm": 0.5337740972894388, + "learning_rate": 2.1819853671515774e-06, + "loss": 0.1659, + "step": 6481 + }, + { + "epoch": 2.1646351644681916, + "grad_norm": 0.5210496803965817, + "learning_rate": 2.180380297616702e-06, + "loss": 0.1591, + "step": 6482 + }, + { + "epoch": 2.1649691100350643, + "grad_norm": 0.5248088157313302, + "learning_rate": 2.178775654018581e-06, + "loss": 0.1624, + "step": 6483 + }, + { + "epoch": 2.165303055601937, + "grad_norm": 0.5262130819709666, + "learning_rate": 2.177171436599618e-06, + "loss": 0.1688, + "step": 6484 + }, + { + "epoch": 2.1656370011688093, + "grad_norm": 0.5361036965685855, + "learning_rate": 2.1755676456021454e-06, + "loss": 0.1605, + "step": 6485 + }, + { + "epoch": 2.165970946735682, + "grad_norm": 0.4938672738227419, + "learning_rate": 2.173964281268436e-06, + "loss": 0.1545, + "step": 6486 + }, + { + "epoch": 2.1663048923025547, + "grad_norm": 0.5366176125755003, + "learning_rate": 2.1723613438407e-06, + "loss": 0.1582, + "step": 6487 + }, + { + "epoch": 2.1666388378694275, + "grad_norm": 0.5677703842430186, + "learning_rate": 2.170758833561075e-06, + "loss": 0.1607, + "step": 6488 + }, + { + "epoch": 2.1669727834362997, + "grad_norm": 0.5831020985138208, + "learning_rate": 2.1691567506716433e-06, + "loss": 0.1692, + "step": 6489 + }, + { + "epoch": 2.1673067290031724, + "grad_norm": 0.5308398773677225, + "learning_rate": 2.1675550954144147e-06, + "loss": 0.1566, + "step": 6490 + }, + { + "epoch": 2.167640674570045, + "grad_norm": 0.47316973548002056, + "learning_rate": 2.1659538680313403e-06, + "loss": 0.1487, + "step": 6491 + }, + { + "epoch": 2.167974620136918, + "grad_norm": 0.5451574177675723, + "learning_rate": 2.1643530687643036e-06, + "loss": 0.1648, + "step": 6492 + }, + { + "epoch": 2.16830856570379, + "grad_norm": 0.559874991831784, + "learning_rate": 2.1627526978551265e-06, + "loss": 0.1522, + "step": 6493 + }, + { + "epoch": 2.168642511270663, + "grad_norm": 0.5653685743132745, + "learning_rate": 2.1611527555455604e-06, + "loss": 0.1705, + "step": 6494 + }, + { + "epoch": 2.1689764568375356, + "grad_norm": 0.4836390569478203, + "learning_rate": 2.159553242077298e-06, + "loss": 0.1603, + "step": 6495 + }, + { + "epoch": 2.169310402404408, + "grad_norm": 0.5285691812855493, + "learning_rate": 2.1579541576919624e-06, + "loss": 0.1624, + "step": 6496 + }, + { + "epoch": 2.1696443479712806, + "grad_norm": 0.5499127677294646, + "learning_rate": 2.1563555026311166e-06, + "loss": 0.1644, + "step": 6497 + }, + { + "epoch": 2.1699782935381533, + "grad_norm": 0.5111505875055791, + "learning_rate": 2.154757277136251e-06, + "loss": 0.1594, + "step": 6498 + }, + { + "epoch": 2.170312239105026, + "grad_norm": 0.4626953791952365, + "learning_rate": 2.153159481448805e-06, + "loss": 0.1402, + "step": 6499 + }, + { + "epoch": 2.1706461846718983, + "grad_norm": 0.5450349768292784, + "learning_rate": 2.1515621158101372e-06, + "loss": 0.1658, + "step": 6500 + }, + { + "epoch": 2.170980130238771, + "grad_norm": 0.4631639748506876, + "learning_rate": 2.1499651804615534e-06, + "loss": 0.1496, + "step": 6501 + }, + { + "epoch": 2.1713140758056437, + "grad_norm": 0.5041192964368789, + "learning_rate": 2.148368675644285e-06, + "loss": 0.1693, + "step": 6502 + }, + { + "epoch": 2.1716480213725164, + "grad_norm": 0.5020294331070693, + "learning_rate": 2.146772601599507e-06, + "loss": 0.1599, + "step": 6503 + }, + { + "epoch": 2.1719819669393887, + "grad_norm": 0.5070209282890003, + "learning_rate": 2.1451769585683196e-06, + "loss": 0.153, + "step": 6504 + }, + { + "epoch": 2.1723159125062614, + "grad_norm": 0.5004602093524981, + "learning_rate": 2.14358174679177e-06, + "loss": 0.1534, + "step": 6505 + }, + { + "epoch": 2.172649858073134, + "grad_norm": 0.4939325771882108, + "learning_rate": 2.1419869665108303e-06, + "loss": 0.1525, + "step": 6506 + }, + { + "epoch": 2.172983803640007, + "grad_norm": 0.5500117190899586, + "learning_rate": 2.140392617966412e-06, + "loss": 0.1658, + "step": 6507 + }, + { + "epoch": 2.173317749206879, + "grad_norm": 0.536575903530373, + "learning_rate": 2.1387987013993583e-06, + "loss": 0.1588, + "step": 6508 + }, + { + "epoch": 2.173651694773752, + "grad_norm": 0.49710428559563785, + "learning_rate": 2.137205217050452e-06, + "loss": 0.1607, + "step": 6509 + }, + { + "epoch": 2.1739856403406246, + "grad_norm": 0.46432099575708613, + "learning_rate": 2.135612165160404e-06, + "loss": 0.1476, + "step": 6510 + }, + { + "epoch": 2.1743195859074973, + "grad_norm": 0.5301977692094965, + "learning_rate": 2.1340195459698653e-06, + "loss": 0.1652, + "step": 6511 + }, + { + "epoch": 2.1746535314743696, + "grad_norm": 0.4947325177627652, + "learning_rate": 2.1324273597194223e-06, + "loss": 0.159, + "step": 6512 + }, + { + "epoch": 2.1749874770412423, + "grad_norm": 0.4855377166862449, + "learning_rate": 2.1308356066495893e-06, + "loss": 0.1516, + "step": 6513 + }, + { + "epoch": 2.175321422608115, + "grad_norm": 0.47997522170058093, + "learning_rate": 2.1292442870008213e-06, + "loss": 0.155, + "step": 6514 + }, + { + "epoch": 2.1756553681749873, + "grad_norm": 0.5003823360484102, + "learning_rate": 2.1276534010135053e-06, + "loss": 0.1468, + "step": 6515 + }, + { + "epoch": 2.17598931374186, + "grad_norm": 0.5190481271339713, + "learning_rate": 2.1260629489279662e-06, + "loss": 0.1649, + "step": 6516 + }, + { + "epoch": 2.1763232593087327, + "grad_norm": 0.4604055659642761, + "learning_rate": 2.1244729309844564e-06, + "loss": 0.1495, + "step": 6517 + }, + { + "epoch": 2.1766572048756054, + "grad_norm": 0.558870729021529, + "learning_rate": 2.1228833474231703e-06, + "loss": 0.1622, + "step": 6518 + }, + { + "epoch": 2.1769911504424777, + "grad_norm": 0.48777199157089124, + "learning_rate": 2.1212941984842295e-06, + "loss": 0.1514, + "step": 6519 + }, + { + "epoch": 2.1773250960093504, + "grad_norm": 0.49435336792484225, + "learning_rate": 2.1197054844076975e-06, + "loss": 0.1488, + "step": 6520 + }, + { + "epoch": 2.177659041576223, + "grad_norm": 0.5453124726321699, + "learning_rate": 2.118117205433563e-06, + "loss": 0.167, + "step": 6521 + }, + { + "epoch": 2.177992987143096, + "grad_norm": 0.4889668124604241, + "learning_rate": 2.1165293618017612e-06, + "loss": 0.1562, + "step": 6522 + }, + { + "epoch": 2.178326932709968, + "grad_norm": 0.5060322916216825, + "learning_rate": 2.1149419537521495e-06, + "loss": 0.1636, + "step": 6523 + }, + { + "epoch": 2.178660878276841, + "grad_norm": 0.572919121563999, + "learning_rate": 2.1133549815245273e-06, + "loss": 0.1706, + "step": 6524 + }, + { + "epoch": 2.1789948238437136, + "grad_norm": 0.5270190608334708, + "learning_rate": 2.1117684453586236e-06, + "loss": 0.1531, + "step": 6525 + }, + { + "epoch": 2.179328769410586, + "grad_norm": 0.5559601743872964, + "learning_rate": 2.110182345494105e-06, + "loss": 0.1719, + "step": 6526 + }, + { + "epoch": 2.1796627149774586, + "grad_norm": 0.5192431372887695, + "learning_rate": 2.1085966821705662e-06, + "loss": 0.1595, + "step": 6527 + }, + { + "epoch": 2.1799966605443313, + "grad_norm": 0.501123467668914, + "learning_rate": 2.1070114556275473e-06, + "loss": 0.1564, + "step": 6528 + }, + { + "epoch": 2.180330606111204, + "grad_norm": 0.4801142572023988, + "learning_rate": 2.1054266661045105e-06, + "loss": 0.145, + "step": 6529 + }, + { + "epoch": 2.1806645516780763, + "grad_norm": 0.5100950044547728, + "learning_rate": 2.103842313840859e-06, + "loss": 0.1618, + "step": 6530 + }, + { + "epoch": 2.180998497244949, + "grad_norm": 0.49876239967556707, + "learning_rate": 2.1022583990759265e-06, + "loss": 0.1579, + "step": 6531 + }, + { + "epoch": 2.1813324428118217, + "grad_norm": 0.5300376773967295, + "learning_rate": 2.1006749220489834e-06, + "loss": 0.1601, + "step": 6532 + }, + { + "epoch": 2.1816663883786944, + "grad_norm": 0.5139517459957516, + "learning_rate": 2.0990918829992307e-06, + "loss": 0.1638, + "step": 6533 + }, + { + "epoch": 2.1820003339455667, + "grad_norm": 0.4956864720621521, + "learning_rate": 2.097509282165806e-06, + "loss": 0.1678, + "step": 6534 + }, + { + "epoch": 2.1823342795124394, + "grad_norm": 0.5514089951815736, + "learning_rate": 2.0959271197877816e-06, + "loss": 0.1699, + "step": 6535 + }, + { + "epoch": 2.182668225079312, + "grad_norm": 0.529402674892044, + "learning_rate": 2.0943453961041587e-06, + "loss": 0.1583, + "step": 6536 + }, + { + "epoch": 2.183002170646185, + "grad_norm": 0.5027930243812394, + "learning_rate": 2.0927641113538764e-06, + "loss": 0.1558, + "step": 6537 + }, + { + "epoch": 2.183336116213057, + "grad_norm": 0.48247284060734513, + "learning_rate": 2.0911832657758086e-06, + "loss": 0.1504, + "step": 6538 + }, + { + "epoch": 2.18367006177993, + "grad_norm": 0.553224289771926, + "learning_rate": 2.089602859608757e-06, + "loss": 0.1612, + "step": 6539 + }, + { + "epoch": 2.1840040073468026, + "grad_norm": 0.5128855380439065, + "learning_rate": 2.088022893091462e-06, + "loss": 0.1688, + "step": 6540 + }, + { + "epoch": 2.1843379529136753, + "grad_norm": 0.4987615340223241, + "learning_rate": 2.086443366462598e-06, + "loss": 0.1584, + "step": 6541 + }, + { + "epoch": 2.1846718984805475, + "grad_norm": 0.45996958264592197, + "learning_rate": 2.084864279960768e-06, + "loss": 0.1414, + "step": 6542 + }, + { + "epoch": 2.1850058440474203, + "grad_norm": 0.5060506307251191, + "learning_rate": 2.0832856338245157e-06, + "loss": 0.1662, + "step": 6543 + }, + { + "epoch": 2.185339789614293, + "grad_norm": 0.5240449344599418, + "learning_rate": 2.0817074282923087e-06, + "loss": 0.17, + "step": 6544 + }, + { + "epoch": 2.1856737351811653, + "grad_norm": 0.48387681849094627, + "learning_rate": 2.080129663602557e-06, + "loss": 0.1501, + "step": 6545 + }, + { + "epoch": 2.186007680748038, + "grad_norm": 0.4837620146066662, + "learning_rate": 2.0785523399935996e-06, + "loss": 0.1548, + "step": 6546 + }, + { + "epoch": 2.1863416263149107, + "grad_norm": 0.5719904583600143, + "learning_rate": 2.076975457703712e-06, + "loss": 0.1661, + "step": 6547 + }, + { + "epoch": 2.1866755718817834, + "grad_norm": 0.5294141126429812, + "learning_rate": 2.0753990169710973e-06, + "loss": 0.1555, + "step": 6548 + }, + { + "epoch": 2.1870095174486557, + "grad_norm": 0.5038994789068824, + "learning_rate": 2.0738230180338993e-06, + "loss": 0.1579, + "step": 6549 + }, + { + "epoch": 2.1873434630155284, + "grad_norm": 0.5180801238233425, + "learning_rate": 2.0722474611301868e-06, + "loss": 0.1565, + "step": 6550 + }, + { + "epoch": 2.187677408582401, + "grad_norm": 0.517550029930567, + "learning_rate": 2.0706723464979687e-06, + "loss": 0.1631, + "step": 6551 + }, + { + "epoch": 2.188011354149274, + "grad_norm": 0.5632560056493515, + "learning_rate": 2.0690976743751844e-06, + "loss": 0.1737, + "step": 6552 + }, + { + "epoch": 2.188345299716146, + "grad_norm": 0.5057496168693179, + "learning_rate": 2.0675234449997085e-06, + "loss": 0.1558, + "step": 6553 + }, + { + "epoch": 2.188679245283019, + "grad_norm": 0.49701932491339085, + "learning_rate": 2.065949658609343e-06, + "loss": 0.1564, + "step": 6554 + }, + { + "epoch": 2.1890131908498915, + "grad_norm": 0.49770206149039964, + "learning_rate": 2.0643763154418304e-06, + "loss": 0.1545, + "step": 6555 + }, + { + "epoch": 2.1893471364167643, + "grad_norm": 0.5063235022252858, + "learning_rate": 2.06280341573484e-06, + "loss": 0.1538, + "step": 6556 + }, + { + "epoch": 2.1896810819836365, + "grad_norm": 0.4810621047438368, + "learning_rate": 2.0612309597259776e-06, + "loss": 0.1518, + "step": 6557 + }, + { + "epoch": 2.1900150275505093, + "grad_norm": 0.5329849457976076, + "learning_rate": 2.059658947652784e-06, + "loss": 0.1588, + "step": 6558 + }, + { + "epoch": 2.190348973117382, + "grad_norm": 0.4806564305506935, + "learning_rate": 2.058087379752725e-06, + "loss": 0.1555, + "step": 6559 + }, + { + "epoch": 2.1906829186842547, + "grad_norm": 0.5063446638868672, + "learning_rate": 2.056516256263208e-06, + "loss": 0.1567, + "step": 6560 + }, + { + "epoch": 2.191016864251127, + "grad_norm": 0.5230077420968994, + "learning_rate": 2.0549455774215705e-06, + "loss": 0.156, + "step": 6561 + }, + { + "epoch": 2.1913508098179997, + "grad_norm": 0.49869886677540565, + "learning_rate": 2.0533753434650784e-06, + "loss": 0.1652, + "step": 6562 + }, + { + "epoch": 2.1916847553848724, + "grad_norm": 0.5172552699731698, + "learning_rate": 2.0518055546309362e-06, + "loss": 0.1523, + "step": 6563 + }, + { + "epoch": 2.1920187009517447, + "grad_norm": 0.4894924237454037, + "learning_rate": 2.0502362111562806e-06, + "loss": 0.1578, + "step": 6564 + }, + { + "epoch": 2.1923526465186174, + "grad_norm": 0.5085326999824232, + "learning_rate": 2.048667313278176e-06, + "loss": 0.1537, + "step": 6565 + }, + { + "epoch": 2.19268659208549, + "grad_norm": 0.49822549747118766, + "learning_rate": 2.0470988612336264e-06, + "loss": 0.1536, + "step": 6566 + }, + { + "epoch": 2.193020537652363, + "grad_norm": 0.5148739261403503, + "learning_rate": 2.045530855259561e-06, + "loss": 0.1637, + "step": 6567 + }, + { + "epoch": 2.193354483219235, + "grad_norm": 0.5168387604417966, + "learning_rate": 2.043963295592848e-06, + "loss": 0.166, + "step": 6568 + }, + { + "epoch": 2.193688428786108, + "grad_norm": 0.5076478055233222, + "learning_rate": 2.042396182470285e-06, + "loss": 0.1676, + "step": 6569 + }, + { + "epoch": 2.1940223743529805, + "grad_norm": 0.4850803716080977, + "learning_rate": 2.040829516128605e-06, + "loss": 0.1507, + "step": 6570 + }, + { + "epoch": 2.1943563199198532, + "grad_norm": 0.5096244454579222, + "learning_rate": 2.0392632968044686e-06, + "loss": 0.1642, + "step": 6571 + }, + { + "epoch": 2.1946902654867255, + "grad_norm": 0.4942711483631219, + "learning_rate": 2.0376975247344736e-06, + "loss": 0.1559, + "step": 6572 + }, + { + "epoch": 2.1950242110535982, + "grad_norm": 0.5207629645400701, + "learning_rate": 2.0361322001551466e-06, + "loss": 0.1721, + "step": 6573 + }, + { + "epoch": 2.195358156620471, + "grad_norm": 0.45191543742406315, + "learning_rate": 2.034567323302949e-06, + "loss": 0.1337, + "step": 6574 + }, + { + "epoch": 2.1956921021873432, + "grad_norm": 0.48335848242486656, + "learning_rate": 2.0330028944142736e-06, + "loss": 0.1551, + "step": 6575 + }, + { + "epoch": 2.196026047754216, + "grad_norm": 0.4956348807447585, + "learning_rate": 2.031438913725448e-06, + "loss": 0.1504, + "step": 6576 + }, + { + "epoch": 2.1963599933210887, + "grad_norm": 0.5681269369080285, + "learning_rate": 2.0298753814727267e-06, + "loss": 0.1531, + "step": 6577 + }, + { + "epoch": 2.1966939388879614, + "grad_norm": 0.5291428022534318, + "learning_rate": 2.028312297892303e-06, + "loss": 0.1621, + "step": 6578 + }, + { + "epoch": 2.1970278844548337, + "grad_norm": 0.5239390275772546, + "learning_rate": 2.0267496632202953e-06, + "loss": 0.166, + "step": 6579 + }, + { + "epoch": 2.1973618300217064, + "grad_norm": 0.5409865518744863, + "learning_rate": 2.0251874776927598e-06, + "loss": 0.1643, + "step": 6580 + }, + { + "epoch": 2.197695775588579, + "grad_norm": 0.5372965064486103, + "learning_rate": 2.0236257415456833e-06, + "loss": 0.1646, + "step": 6581 + }, + { + "epoch": 2.198029721155452, + "grad_norm": 0.509285554214082, + "learning_rate": 2.022064455014986e-06, + "loss": 0.1623, + "step": 6582 + }, + { + "epoch": 2.198363666722324, + "grad_norm": 0.757210617776752, + "learning_rate": 2.0205036183365145e-06, + "loss": 0.1749, + "step": 6583 + }, + { + "epoch": 2.198697612289197, + "grad_norm": 0.4667375146591432, + "learning_rate": 2.018943231746056e-06, + "loss": 0.1455, + "step": 6584 + }, + { + "epoch": 2.1990315578560695, + "grad_norm": 0.5693617648213951, + "learning_rate": 2.0173832954793216e-06, + "loss": 0.164, + "step": 6585 + }, + { + "epoch": 2.1993655034229422, + "grad_norm": 0.49312895306196874, + "learning_rate": 2.0158238097719597e-06, + "loss": 0.1543, + "step": 6586 + }, + { + "epoch": 2.1996994489898145, + "grad_norm": 0.5064476054065244, + "learning_rate": 2.0142647748595502e-06, + "loss": 0.1651, + "step": 6587 + }, + { + "epoch": 2.2000333945566872, + "grad_norm": 0.5332088995867316, + "learning_rate": 2.0127061909776e-06, + "loss": 0.1661, + "step": 6588 + }, + { + "epoch": 2.20036734012356, + "grad_norm": 0.47982702274764294, + "learning_rate": 2.0111480583615566e-06, + "loss": 0.1466, + "step": 6589 + }, + { + "epoch": 2.2007012856904327, + "grad_norm": 0.4910497844263571, + "learning_rate": 2.00959037724679e-06, + "loss": 0.1515, + "step": 6590 + }, + { + "epoch": 2.201035231257305, + "grad_norm": 0.5585938097349513, + "learning_rate": 2.0080331478686087e-06, + "loss": 0.1732, + "step": 6591 + }, + { + "epoch": 2.2013691768241777, + "grad_norm": 0.4896922496577059, + "learning_rate": 2.006476370462247e-06, + "loss": 0.1557, + "step": 6592 + }, + { + "epoch": 2.2017031223910504, + "grad_norm": 0.4576228652187881, + "learning_rate": 2.0049200452628803e-06, + "loss": 0.1437, + "step": 6593 + }, + { + "epoch": 2.2020370679579226, + "grad_norm": 0.4734216591875249, + "learning_rate": 2.0033641725056048e-06, + "loss": 0.1487, + "step": 6594 + }, + { + "epoch": 2.2023710135247954, + "grad_norm": 0.5060810259961187, + "learning_rate": 2.001808752425457e-06, + "loss": 0.1471, + "step": 6595 + }, + { + "epoch": 2.202704959091668, + "grad_norm": 0.48329648438901496, + "learning_rate": 2.000253785257398e-06, + "loss": 0.1525, + "step": 6596 + }, + { + "epoch": 2.203038904658541, + "grad_norm": 0.5357710590885335, + "learning_rate": 1.998699271236326e-06, + "loss": 0.1612, + "step": 6597 + }, + { + "epoch": 2.203372850225413, + "grad_norm": 0.525150917969299, + "learning_rate": 1.997145210597068e-06, + "loss": 0.1697, + "step": 6598 + }, + { + "epoch": 2.203706795792286, + "grad_norm": 0.47723396449168365, + "learning_rate": 1.9955916035743855e-06, + "loss": 0.1512, + "step": 6599 + }, + { + "epoch": 2.2040407413591585, + "grad_norm": 0.47291493505957904, + "learning_rate": 1.9940384504029647e-06, + "loss": 0.1562, + "step": 6600 + }, + { + "epoch": 2.2043746869260312, + "grad_norm": 0.5084750289652352, + "learning_rate": 1.9924857513174324e-06, + "loss": 0.1588, + "step": 6601 + }, + { + "epoch": 2.2047086324929035, + "grad_norm": 0.5015177899122328, + "learning_rate": 1.990933506552337e-06, + "loss": 0.1591, + "step": 6602 + }, + { + "epoch": 2.205042578059776, + "grad_norm": 0.5140862563797094, + "learning_rate": 1.989381716342167e-06, + "loss": 0.1669, + "step": 6603 + }, + { + "epoch": 2.205376523626649, + "grad_norm": 0.5335787711446098, + "learning_rate": 1.9878303809213367e-06, + "loss": 0.1512, + "step": 6604 + }, + { + "epoch": 2.2057104691935217, + "grad_norm": 0.5424751411330692, + "learning_rate": 1.986279500524197e-06, + "loss": 0.1657, + "step": 6605 + }, + { + "epoch": 2.206044414760394, + "grad_norm": 0.5056230951340244, + "learning_rate": 1.984729075385022e-06, + "loss": 0.1596, + "step": 6606 + }, + { + "epoch": 2.2063783603272666, + "grad_norm": 0.47996255878953, + "learning_rate": 1.983179105738026e-06, + "loss": 0.1582, + "step": 6607 + }, + { + "epoch": 2.2067123058941394, + "grad_norm": 0.4911356123723574, + "learning_rate": 1.9816295918173462e-06, + "loss": 0.1534, + "step": 6608 + }, + { + "epoch": 2.207046251461012, + "grad_norm": 0.465393956829766, + "learning_rate": 1.9800805338570562e-06, + "loss": 0.1601, + "step": 6609 + }, + { + "epoch": 2.2073801970278843, + "grad_norm": 0.4979367096913454, + "learning_rate": 1.9785319320911623e-06, + "loss": 0.1463, + "step": 6610 + }, + { + "epoch": 2.207714142594757, + "grad_norm": 0.486772494636018, + "learning_rate": 1.9769837867535948e-06, + "loss": 0.1557, + "step": 6611 + }, + { + "epoch": 2.20804808816163, + "grad_norm": 0.487323805440558, + "learning_rate": 1.9754360980782227e-06, + "loss": 0.1563, + "step": 6612 + }, + { + "epoch": 2.208382033728502, + "grad_norm": 0.5575220348139566, + "learning_rate": 1.973888866298839e-06, + "loss": 0.1676, + "step": 6613 + }, + { + "epoch": 2.2087159792953748, + "grad_norm": 0.537349301156599, + "learning_rate": 1.972342091649176e-06, + "loss": 0.1667, + "step": 6614 + }, + { + "epoch": 2.2090499248622475, + "grad_norm": 0.49476253066855386, + "learning_rate": 1.9707957743628854e-06, + "loss": 0.1534, + "step": 6615 + }, + { + "epoch": 2.20938387042912, + "grad_norm": 0.5405416629754679, + "learning_rate": 1.9692499146735646e-06, + "loss": 0.1642, + "step": 6616 + }, + { + "epoch": 2.2097178159959925, + "grad_norm": 0.5165616973685996, + "learning_rate": 1.967704512814728e-06, + "loss": 0.164, + "step": 6617 + }, + { + "epoch": 2.210051761562865, + "grad_norm": 0.524024480111212, + "learning_rate": 1.966159569019831e-06, + "loss": 0.1629, + "step": 6618 + }, + { + "epoch": 2.210385707129738, + "grad_norm": 0.528009150373322, + "learning_rate": 1.9646150835222517e-06, + "loss": 0.1641, + "step": 6619 + }, + { + "epoch": 2.2107196526966106, + "grad_norm": 0.48772901021730125, + "learning_rate": 1.9630710565553063e-06, + "loss": 0.1532, + "step": 6620 + }, + { + "epoch": 2.211053598263483, + "grad_norm": 0.4371984282049327, + "learning_rate": 1.9615274883522327e-06, + "loss": 0.1386, + "step": 6621 + }, + { + "epoch": 2.2113875438303556, + "grad_norm": 0.5506478629606789, + "learning_rate": 1.9599843791462123e-06, + "loss": 0.1703, + "step": 6622 + }, + { + "epoch": 2.2117214893972283, + "grad_norm": 0.5183244278342396, + "learning_rate": 1.958441729170345e-06, + "loss": 0.1558, + "step": 6623 + }, + { + "epoch": 2.2120554349641006, + "grad_norm": 0.4653062989884138, + "learning_rate": 1.9568995386576695e-06, + "loss": 0.148, + "step": 6624 + }, + { + "epoch": 2.2123893805309733, + "grad_norm": 0.49138499728836343, + "learning_rate": 1.9553578078411476e-06, + "loss": 0.1541, + "step": 6625 + }, + { + "epoch": 2.212723326097846, + "grad_norm": 0.5254863963640038, + "learning_rate": 1.953816536953681e-06, + "loss": 0.1546, + "step": 6626 + }, + { + "epoch": 2.2130572716647188, + "grad_norm": 0.5206964772367224, + "learning_rate": 1.95227572622809e-06, + "loss": 0.1621, + "step": 6627 + }, + { + "epoch": 2.213391217231591, + "grad_norm": 0.563970641509512, + "learning_rate": 1.95073537589714e-06, + "loss": 0.1722, + "step": 6628 + }, + { + "epoch": 2.2137251627984638, + "grad_norm": 0.4877735248249902, + "learning_rate": 1.949195486193514e-06, + "loss": 0.1466, + "step": 6629 + }, + { + "epoch": 2.2140591083653365, + "grad_norm": 0.5032630664511833, + "learning_rate": 1.9476560573498332e-06, + "loss": 0.1651, + "step": 6630 + }, + { + "epoch": 2.214393053932209, + "grad_norm": 0.5375345394822361, + "learning_rate": 1.946117089598644e-06, + "loss": 0.1737, + "step": 6631 + }, + { + "epoch": 2.2147269994990815, + "grad_norm": 0.4730562669425479, + "learning_rate": 1.9445785831724274e-06, + "loss": 0.1511, + "step": 6632 + }, + { + "epoch": 2.215060945065954, + "grad_norm": 0.5274486320082097, + "learning_rate": 1.943040538303591e-06, + "loss": 0.1596, + "step": 6633 + }, + { + "epoch": 2.215394890632827, + "grad_norm": 0.5020720245794152, + "learning_rate": 1.9415029552244758e-06, + "loss": 0.1585, + "step": 6634 + }, + { + "epoch": 2.2157288361996996, + "grad_norm": 0.5225443854090014, + "learning_rate": 1.939965834167354e-06, + "loss": 0.1642, + "step": 6635 + }, + { + "epoch": 2.216062781766572, + "grad_norm": 0.5246272127996099, + "learning_rate": 1.9384291753644215e-06, + "loss": 0.1531, + "step": 6636 + }, + { + "epoch": 2.2163967273334446, + "grad_norm": 0.4890983704136873, + "learning_rate": 1.9368929790478126e-06, + "loss": 0.155, + "step": 6637 + }, + { + "epoch": 2.2167306729003173, + "grad_norm": 0.5091358082023, + "learning_rate": 1.935357245449583e-06, + "loss": 0.1508, + "step": 6638 + }, + { + "epoch": 2.21706461846719, + "grad_norm": 0.5688713170703724, + "learning_rate": 1.9338219748017297e-06, + "loss": 0.1745, + "step": 6639 + }, + { + "epoch": 2.2173985640340623, + "grad_norm": 0.5001120410713611, + "learning_rate": 1.932287167336168e-06, + "loss": 0.1617, + "step": 6640 + }, + { + "epoch": 2.217732509600935, + "grad_norm": 0.5223274403848096, + "learning_rate": 1.9307528232847533e-06, + "loss": 0.1588, + "step": 6641 + }, + { + "epoch": 2.2180664551678078, + "grad_norm": 0.49231056426484826, + "learning_rate": 1.9292189428792617e-06, + "loss": 0.1453, + "step": 6642 + }, + { + "epoch": 2.21840040073468, + "grad_norm": 0.5447393922671808, + "learning_rate": 1.927685526351408e-06, + "loss": 0.1614, + "step": 6643 + }, + { + "epoch": 2.2187343463015528, + "grad_norm": 0.5130642995180773, + "learning_rate": 1.9261525739328273e-06, + "loss": 0.1539, + "step": 6644 + }, + { + "epoch": 2.2190682918684255, + "grad_norm": 0.4673620218126964, + "learning_rate": 1.924620085855097e-06, + "loss": 0.1416, + "step": 6645 + }, + { + "epoch": 2.219402237435298, + "grad_norm": 0.5239963893912938, + "learning_rate": 1.923088062349713e-06, + "loss": 0.1574, + "step": 6646 + }, + { + "epoch": 2.2197361830021705, + "grad_norm": 0.5192341161140694, + "learning_rate": 1.9215565036481083e-06, + "loss": 0.1606, + "step": 6647 + }, + { + "epoch": 2.220070128569043, + "grad_norm": 0.520663322092088, + "learning_rate": 1.920025409981639e-06, + "loss": 0.1663, + "step": 6648 + }, + { + "epoch": 2.220404074135916, + "grad_norm": 0.5032776512637247, + "learning_rate": 1.918494781581599e-06, + "loss": 0.158, + "step": 6649 + }, + { + "epoch": 2.2207380197027886, + "grad_norm": 0.4713101365681985, + "learning_rate": 1.9169646186792025e-06, + "loss": 0.1467, + "step": 6650 + }, + { + "epoch": 2.221071965269661, + "grad_norm": 0.46675389053456917, + "learning_rate": 1.9154349215056052e-06, + "loss": 0.1546, + "step": 6651 + }, + { + "epoch": 2.2214059108365336, + "grad_norm": 0.5200622300655069, + "learning_rate": 1.9139056902918805e-06, + "loss": 0.1665, + "step": 6652 + }, + { + "epoch": 2.2217398564034063, + "grad_norm": 0.48668102810544783, + "learning_rate": 1.912376925269041e-06, + "loss": 0.1544, + "step": 6653 + }, + { + "epoch": 2.222073801970279, + "grad_norm": 0.5121790498856402, + "learning_rate": 1.910848626668021e-06, + "loss": 0.1533, + "step": 6654 + }, + { + "epoch": 2.2224077475371513, + "grad_norm": 0.4908382616540141, + "learning_rate": 1.9093207947196908e-06, + "loss": 0.1604, + "step": 6655 + }, + { + "epoch": 2.222741693104024, + "grad_norm": 0.4999744558438863, + "learning_rate": 1.9077934296548445e-06, + "loss": 0.1542, + "step": 6656 + }, + { + "epoch": 2.2230756386708967, + "grad_norm": 0.5256425135871804, + "learning_rate": 1.9062665317042106e-06, + "loss": 0.1519, + "step": 6657 + }, + { + "epoch": 2.2234095842377695, + "grad_norm": 0.4849691863222047, + "learning_rate": 1.9047401010984456e-06, + "loss": 0.1504, + "step": 6658 + }, + { + "epoch": 2.2237435298046417, + "grad_norm": 0.4791081942937301, + "learning_rate": 1.9032141380681329e-06, + "loss": 0.1499, + "step": 6659 + }, + { + "epoch": 2.2240774753715145, + "grad_norm": 0.5202987778444312, + "learning_rate": 1.9016886428437893e-06, + "loss": 0.1667, + "step": 6660 + }, + { + "epoch": 2.224411420938387, + "grad_norm": 0.4901000495769656, + "learning_rate": 1.9001636156558562e-06, + "loss": 0.1565, + "step": 6661 + }, + { + "epoch": 2.2247453665052594, + "grad_norm": 0.4823202571420205, + "learning_rate": 1.8986390567347085e-06, + "loss": 0.1529, + "step": 6662 + }, + { + "epoch": 2.225079312072132, + "grad_norm": 0.5025149193666405, + "learning_rate": 1.8971149663106482e-06, + "loss": 0.1572, + "step": 6663 + }, + { + "epoch": 2.225413257639005, + "grad_norm": 0.4810470147277487, + "learning_rate": 1.8955913446139096e-06, + "loss": 0.1569, + "step": 6664 + }, + { + "epoch": 2.2257472032058776, + "grad_norm": 0.46343686657363625, + "learning_rate": 1.8940681918746495e-06, + "loss": 0.1452, + "step": 6665 + }, + { + "epoch": 2.22608114877275, + "grad_norm": 0.47263048459042295, + "learning_rate": 1.8925455083229622e-06, + "loss": 0.1542, + "step": 6666 + }, + { + "epoch": 2.2264150943396226, + "grad_norm": 0.48227650724640997, + "learning_rate": 1.891023294188863e-06, + "loss": 0.1566, + "step": 6667 + }, + { + "epoch": 2.2267490399064953, + "grad_norm": 0.5703992870293059, + "learning_rate": 1.8895015497023022e-06, + "loss": 0.1628, + "step": 6668 + }, + { + "epoch": 2.227082985473368, + "grad_norm": 0.5171876577597354, + "learning_rate": 1.8879802750931574e-06, + "loss": 0.1594, + "step": 6669 + }, + { + "epoch": 2.2274169310402403, + "grad_norm": 0.4918589655419636, + "learning_rate": 1.886459470591237e-06, + "loss": 0.1492, + "step": 6670 + }, + { + "epoch": 2.227750876607113, + "grad_norm": 0.535274808727, + "learning_rate": 1.8849391364262721e-06, + "loss": 0.1616, + "step": 6671 + }, + { + "epoch": 2.2280848221739857, + "grad_norm": 0.4918788564052449, + "learning_rate": 1.883419272827931e-06, + "loss": 0.1544, + "step": 6672 + }, + { + "epoch": 2.228418767740858, + "grad_norm": 0.5305209595696436, + "learning_rate": 1.881899880025802e-06, + "loss": 0.1537, + "step": 6673 + }, + { + "epoch": 2.2287527133077307, + "grad_norm": 0.5543131551389201, + "learning_rate": 1.8803809582494143e-06, + "loss": 0.1598, + "step": 6674 + }, + { + "epoch": 2.2290866588746034, + "grad_norm": 0.5080012549587938, + "learning_rate": 1.878862507728213e-06, + "loss": 0.1602, + "step": 6675 + }, + { + "epoch": 2.229420604441476, + "grad_norm": 0.48004398539465265, + "learning_rate": 1.877344528691582e-06, + "loss": 0.147, + "step": 6676 + }, + { + "epoch": 2.2297545500083484, + "grad_norm": 0.5318763956775434, + "learning_rate": 1.8758270213688263e-06, + "loss": 0.1656, + "step": 6677 + }, + { + "epoch": 2.230088495575221, + "grad_norm": 0.527960686138271, + "learning_rate": 1.8743099859891866e-06, + "loss": 0.1584, + "step": 6678 + }, + { + "epoch": 2.230422441142094, + "grad_norm": 0.4634772222288885, + "learning_rate": 1.8727934227818255e-06, + "loss": 0.1459, + "step": 6679 + }, + { + "epoch": 2.2307563867089666, + "grad_norm": 0.5262489510400422, + "learning_rate": 1.8712773319758398e-06, + "loss": 0.1623, + "step": 6680 + }, + { + "epoch": 2.231090332275839, + "grad_norm": 0.49489280932867763, + "learning_rate": 1.8697617138002545e-06, + "loss": 0.1478, + "step": 6681 + }, + { + "epoch": 2.2314242778427116, + "grad_norm": 0.5031062159733811, + "learning_rate": 1.8682465684840178e-06, + "loss": 0.1564, + "step": 6682 + }, + { + "epoch": 2.2317582234095843, + "grad_norm": 0.4953788465486049, + "learning_rate": 1.8667318962560137e-06, + "loss": 0.1626, + "step": 6683 + }, + { + "epoch": 2.232092168976457, + "grad_norm": 0.5329151547726576, + "learning_rate": 1.865217697345048e-06, + "loss": 0.1674, + "step": 6684 + }, + { + "epoch": 2.2324261145433293, + "grad_norm": 0.5510845744808649, + "learning_rate": 1.86370397197986e-06, + "loss": 0.1663, + "step": 6685 + }, + { + "epoch": 2.232760060110202, + "grad_norm": 0.5461167706937765, + "learning_rate": 1.8621907203891159e-06, + "loss": 0.1704, + "step": 6686 + }, + { + "epoch": 2.2330940056770747, + "grad_norm": 0.5425557341551985, + "learning_rate": 1.8606779428014116e-06, + "loss": 0.1778, + "step": 6687 + }, + { + "epoch": 2.2334279512439474, + "grad_norm": 0.5394947897686756, + "learning_rate": 1.8591656394452667e-06, + "loss": 0.1649, + "step": 6688 + }, + { + "epoch": 2.2337618968108197, + "grad_norm": 0.4755957408400034, + "learning_rate": 1.8576538105491359e-06, + "loss": 0.1519, + "step": 6689 + }, + { + "epoch": 2.2340958423776924, + "grad_norm": 0.49279914369472505, + "learning_rate": 1.8561424563413949e-06, + "loss": 0.1529, + "step": 6690 + }, + { + "epoch": 2.234429787944565, + "grad_norm": 0.495172322606566, + "learning_rate": 1.8546315770503537e-06, + "loss": 0.153, + "step": 6691 + }, + { + "epoch": 2.2347637335114374, + "grad_norm": 0.4707511082921486, + "learning_rate": 1.8531211729042486e-06, + "loss": 0.1505, + "step": 6692 + }, + { + "epoch": 2.23509767907831, + "grad_norm": 0.5004010993757639, + "learning_rate": 1.8516112441312451e-06, + "loss": 0.1559, + "step": 6693 + }, + { + "epoch": 2.235431624645183, + "grad_norm": 0.5388923994203958, + "learning_rate": 1.8501017909594327e-06, + "loss": 0.1623, + "step": 6694 + }, + { + "epoch": 2.2357655702120556, + "grad_norm": 0.5725007414524714, + "learning_rate": 1.8485928136168353e-06, + "loss": 0.1733, + "step": 6695 + }, + { + "epoch": 2.236099515778928, + "grad_norm": 0.5413884294057945, + "learning_rate": 1.8470843123313982e-06, + "loss": 0.1526, + "step": 6696 + }, + { + "epoch": 2.2364334613458006, + "grad_norm": 0.5215501259534671, + "learning_rate": 1.8455762873309995e-06, + "loss": 0.1599, + "step": 6697 + }, + { + "epoch": 2.2367674069126733, + "grad_norm": 0.5434730278673744, + "learning_rate": 1.844068738843446e-06, + "loss": 0.1697, + "step": 6698 + }, + { + "epoch": 2.237101352479546, + "grad_norm": 0.8447780480576517, + "learning_rate": 1.8425616670964702e-06, + "loss": 0.1591, + "step": 6699 + }, + { + "epoch": 2.2374352980464183, + "grad_norm": 0.4889453910112141, + "learning_rate": 1.8410550723177306e-06, + "loss": 0.148, + "step": 6700 + }, + { + "epoch": 2.237769243613291, + "grad_norm": 0.5348087835055323, + "learning_rate": 1.8395489547348193e-06, + "loss": 0.1586, + "step": 6701 + }, + { + "epoch": 2.2381031891801637, + "grad_norm": 0.509290022703682, + "learning_rate": 1.8380433145752502e-06, + "loss": 0.159, + "step": 6702 + }, + { + "epoch": 2.2384371347470364, + "grad_norm": 0.5065409444970693, + "learning_rate": 1.8365381520664695e-06, + "loss": 0.1551, + "step": 6703 + }, + { + "epoch": 2.2387710803139087, + "grad_norm": 0.5002223281275832, + "learning_rate": 1.8350334674358505e-06, + "loss": 0.1524, + "step": 6704 + }, + { + "epoch": 2.2391050258807814, + "grad_norm": 0.521909205621699, + "learning_rate": 1.8335292609106914e-06, + "loss": 0.1507, + "step": 6705 + }, + { + "epoch": 2.239438971447654, + "grad_norm": 0.5051898762790804, + "learning_rate": 1.8320255327182224e-06, + "loss": 0.1597, + "step": 6706 + }, + { + "epoch": 2.239772917014527, + "grad_norm": 0.46375293585113725, + "learning_rate": 1.8305222830855973e-06, + "loss": 0.1426, + "step": 6707 + }, + { + "epoch": 2.240106862581399, + "grad_norm": 0.45694266168008607, + "learning_rate": 1.8290195122399007e-06, + "loss": 0.141, + "step": 6708 + }, + { + "epoch": 2.240440808148272, + "grad_norm": 0.5090966358483844, + "learning_rate": 1.8275172204081437e-06, + "loss": 0.1543, + "step": 6709 + }, + { + "epoch": 2.2407747537151446, + "grad_norm": 0.5076709222116111, + "learning_rate": 1.826015407817267e-06, + "loss": 0.1548, + "step": 6710 + }, + { + "epoch": 2.241108699282017, + "grad_norm": 0.553645802415818, + "learning_rate": 1.8245140746941336e-06, + "loss": 0.1673, + "step": 6711 + }, + { + "epoch": 2.2414426448488896, + "grad_norm": 0.5243245790384291, + "learning_rate": 1.823013221265541e-06, + "loss": 0.1592, + "step": 6712 + }, + { + "epoch": 2.2417765904157623, + "grad_norm": 0.5330623020965718, + "learning_rate": 1.8215128477582077e-06, + "loss": 0.1623, + "step": 6713 + }, + { + "epoch": 2.242110535982635, + "grad_norm": 0.5257624493735769, + "learning_rate": 1.8200129543987843e-06, + "loss": 0.1633, + "step": 6714 + }, + { + "epoch": 2.2424444815495073, + "grad_norm": 0.5021319140625929, + "learning_rate": 1.818513541413847e-06, + "loss": 0.1648, + "step": 6715 + }, + { + "epoch": 2.24277842711638, + "grad_norm": 0.5189532423799506, + "learning_rate": 1.8170146090299018e-06, + "loss": 0.1596, + "step": 6716 + }, + { + "epoch": 2.2431123726832527, + "grad_norm": 0.5187290309316134, + "learning_rate": 1.8155161574733772e-06, + "loss": 0.1553, + "step": 6717 + }, + { + "epoch": 2.2434463182501254, + "grad_norm": 0.5275964325368073, + "learning_rate": 1.8140181869706341e-06, + "loss": 0.1555, + "step": 6718 + }, + { + "epoch": 2.2437802638169977, + "grad_norm": 0.5053082731154795, + "learning_rate": 1.812520697747956e-06, + "loss": 0.1592, + "step": 6719 + }, + { + "epoch": 2.2441142093838704, + "grad_norm": 0.539300185313967, + "learning_rate": 1.8110236900315582e-06, + "loss": 0.1582, + "step": 6720 + }, + { + "epoch": 2.244448154950743, + "grad_norm": 0.5128000481869541, + "learning_rate": 1.8095271640475802e-06, + "loss": 0.1615, + "step": 6721 + }, + { + "epoch": 2.2447821005176154, + "grad_norm": 0.5339868466354106, + "learning_rate": 1.8080311200220935e-06, + "loss": 0.1661, + "step": 6722 + }, + { + "epoch": 2.245116046084488, + "grad_norm": 0.5083433012779742, + "learning_rate": 1.8065355581810878e-06, + "loss": 0.1551, + "step": 6723 + }, + { + "epoch": 2.245449991651361, + "grad_norm": 0.5628766905570722, + "learning_rate": 1.80504047875049e-06, + "loss": 0.1724, + "step": 6724 + }, + { + "epoch": 2.2457839372182336, + "grad_norm": 0.5084207068862423, + "learning_rate": 1.8035458819561453e-06, + "loss": 0.158, + "step": 6725 + }, + { + "epoch": 2.246117882785106, + "grad_norm": 0.5065025201452854, + "learning_rate": 1.8020517680238326e-06, + "loss": 0.1598, + "step": 6726 + }, + { + "epoch": 2.2464518283519785, + "grad_norm": 0.4990795520486874, + "learning_rate": 1.8005581371792564e-06, + "loss": 0.1525, + "step": 6727 + }, + { + "epoch": 2.2467857739188513, + "grad_norm": 0.5080788566646063, + "learning_rate": 1.799064989648044e-06, + "loss": 0.1528, + "step": 6728 + }, + { + "epoch": 2.247119719485724, + "grad_norm": 0.4974786247986726, + "learning_rate": 1.797572325655756e-06, + "loss": 0.1557, + "step": 6729 + }, + { + "epoch": 2.2474536650525963, + "grad_norm": 0.47239598611834804, + "learning_rate": 1.7960801454278742e-06, + "loss": 0.1466, + "step": 6730 + }, + { + "epoch": 2.247787610619469, + "grad_norm": 0.4902956696179493, + "learning_rate": 1.7945884491898119e-06, + "loss": 0.1496, + "step": 6731 + }, + { + "epoch": 2.2481215561863417, + "grad_norm": 0.4867643676693888, + "learning_rate": 1.7930972371669064e-06, + "loss": 0.153, + "step": 6732 + }, + { + "epoch": 2.2484555017532144, + "grad_norm": 0.49740646202701094, + "learning_rate": 1.791606509584425e-06, + "loss": 0.1581, + "step": 6733 + }, + { + "epoch": 2.2487894473200867, + "grad_norm": 0.4636440286036032, + "learning_rate": 1.7901162666675564e-06, + "loss": 0.1425, + "step": 6734 + }, + { + "epoch": 2.2491233928869594, + "grad_norm": 0.5257749950431144, + "learning_rate": 1.7886265086414222e-06, + "loss": 0.1714, + "step": 6735 + }, + { + "epoch": 2.249457338453832, + "grad_norm": 0.5635419552815703, + "learning_rate": 1.7871372357310651e-06, + "loss": 0.1699, + "step": 6736 + }, + { + "epoch": 2.249791284020705, + "grad_norm": 0.6170420468842089, + "learning_rate": 1.7856484481614605e-06, + "loss": 0.1536, + "step": 6737 + }, + { + "epoch": 2.250125229587577, + "grad_norm": 0.5167276951617383, + "learning_rate": 1.784160146157502e-06, + "loss": 0.1624, + "step": 6738 + }, + { + "epoch": 2.25045917515445, + "grad_norm": 0.54841599016874, + "learning_rate": 1.7826723299440224e-06, + "loss": 0.1693, + "step": 6739 + }, + { + "epoch": 2.2507931207213225, + "grad_norm": 0.45416893488770504, + "learning_rate": 1.7811849997457681e-06, + "loss": 0.1435, + "step": 6740 + }, + { + "epoch": 2.251127066288195, + "grad_norm": 0.5136555475743064, + "learning_rate": 1.779698155787422e-06, + "loss": 0.1577, + "step": 6741 + }, + { + "epoch": 2.2514610118550675, + "grad_norm": 0.4719315935157619, + "learning_rate": 1.7782117982935854e-06, + "loss": 0.1522, + "step": 6742 + }, + { + "epoch": 2.2517949574219402, + "grad_norm": 0.5301261871177757, + "learning_rate": 1.7767259274887937e-06, + "loss": 0.163, + "step": 6743 + }, + { + "epoch": 2.252128902988813, + "grad_norm": 0.5431816850628656, + "learning_rate": 1.7752405435975002e-06, + "loss": 0.1704, + "step": 6744 + }, + { + "epoch": 2.2524628485556852, + "grad_norm": 0.49674970018012937, + "learning_rate": 1.7737556468440964e-06, + "loss": 0.1541, + "step": 6745 + }, + { + "epoch": 2.252796794122558, + "grad_norm": 0.5211417957528897, + "learning_rate": 1.7722712374528877e-06, + "loss": 0.1613, + "step": 6746 + }, + { + "epoch": 2.2531307396894307, + "grad_norm": 0.5097312100588599, + "learning_rate": 1.7707873156481158e-06, + "loss": 0.1553, + "step": 6747 + }, + { + "epoch": 2.2534646852563034, + "grad_norm": 0.5066032252735806, + "learning_rate": 1.7693038816539416e-06, + "loss": 0.1571, + "step": 6748 + }, + { + "epoch": 2.2537986308231757, + "grad_norm": 0.5454789116485014, + "learning_rate": 1.767820935694457e-06, + "loss": 0.1717, + "step": 6749 + }, + { + "epoch": 2.2541325763900484, + "grad_norm": 0.524425253837954, + "learning_rate": 1.7663384779936764e-06, + "loss": 0.1659, + "step": 6750 + }, + { + "epoch": 2.254466521956921, + "grad_norm": 0.49825581432779875, + "learning_rate": 1.7648565087755442e-06, + "loss": 0.1494, + "step": 6751 + }, + { + "epoch": 2.2548004675237934, + "grad_norm": 0.5309709923110758, + "learning_rate": 1.76337502826393e-06, + "loss": 0.1589, + "step": 6752 + }, + { + "epoch": 2.255134413090666, + "grad_norm": 0.5173890025549714, + "learning_rate": 1.7618940366826266e-06, + "loss": 0.1596, + "step": 6753 + }, + { + "epoch": 2.255468358657539, + "grad_norm": 0.538202002454078, + "learning_rate": 1.7604135342553564e-06, + "loss": 0.1688, + "step": 6754 + }, + { + "epoch": 2.2558023042244115, + "grad_norm": 0.5511066879412111, + "learning_rate": 1.7589335212057663e-06, + "loss": 0.1716, + "step": 6755 + }, + { + "epoch": 2.2561362497912842, + "grad_norm": 0.49185964257268533, + "learning_rate": 1.7574539977574323e-06, + "loss": 0.1553, + "step": 6756 + }, + { + "epoch": 2.2564701953581565, + "grad_norm": 0.5284689435240054, + "learning_rate": 1.7559749641338497e-06, + "loss": 0.157, + "step": 6757 + }, + { + "epoch": 2.2568041409250292, + "grad_norm": 0.5137649551069989, + "learning_rate": 1.7544964205584476e-06, + "loss": 0.1509, + "step": 6758 + }, + { + "epoch": 2.257138086491902, + "grad_norm": 0.5155409321355809, + "learning_rate": 1.7530183672545743e-06, + "loss": 0.1557, + "step": 6759 + }, + { + "epoch": 2.2574720320587742, + "grad_norm": 0.5312821280939001, + "learning_rate": 1.7515408044455102e-06, + "loss": 0.1541, + "step": 6760 + }, + { + "epoch": 2.257805977625647, + "grad_norm": 0.5410274974402473, + "learning_rate": 1.7500637323544534e-06, + "loss": 0.1625, + "step": 6761 + }, + { + "epoch": 2.2581399231925197, + "grad_norm": 0.5369298073405431, + "learning_rate": 1.74858715120454e-06, + "loss": 0.1623, + "step": 6762 + }, + { + "epoch": 2.2584738687593924, + "grad_norm": 0.5330721160665443, + "learning_rate": 1.7471110612188203e-06, + "loss": 0.1667, + "step": 6763 + }, + { + "epoch": 2.2588078143262647, + "grad_norm": 0.5564739677028375, + "learning_rate": 1.7456354626202775e-06, + "loss": 0.1622, + "step": 6764 + }, + { + "epoch": 2.2591417598931374, + "grad_norm": 0.5025246524131759, + "learning_rate": 1.7441603556318155e-06, + "loss": 0.1591, + "step": 6765 + }, + { + "epoch": 2.25947570546001, + "grad_norm": 0.5102844772368513, + "learning_rate": 1.74268574047627e-06, + "loss": 0.1605, + "step": 6766 + }, + { + "epoch": 2.259809651026883, + "grad_norm": 0.528306743042957, + "learning_rate": 1.7412116173763931e-06, + "loss": 0.1564, + "step": 6767 + }, + { + "epoch": 2.260143596593755, + "grad_norm": 0.5076408569071325, + "learning_rate": 1.7397379865548758e-06, + "loss": 0.1603, + "step": 6768 + }, + { + "epoch": 2.260477542160628, + "grad_norm": 0.5293441364353967, + "learning_rate": 1.7382648482343229e-06, + "loss": 0.1626, + "step": 6769 + }, + { + "epoch": 2.2608114877275005, + "grad_norm": 0.5160283033863646, + "learning_rate": 1.7367922026372713e-06, + "loss": 0.1491, + "step": 6770 + }, + { + "epoch": 2.261145433294373, + "grad_norm": 0.5243537166726169, + "learning_rate": 1.7353200499861794e-06, + "loss": 0.1659, + "step": 6771 + }, + { + "epoch": 2.2614793788612455, + "grad_norm": 0.46787144815206605, + "learning_rate": 1.733848390503436e-06, + "loss": 0.1473, + "step": 6772 + }, + { + "epoch": 2.261813324428118, + "grad_norm": 0.5408034010548239, + "learning_rate": 1.732377224411349e-06, + "loss": 0.1628, + "step": 6773 + }, + { + "epoch": 2.262147269994991, + "grad_norm": 0.5580667063910909, + "learning_rate": 1.7309065519321572e-06, + "loss": 0.1675, + "step": 6774 + }, + { + "epoch": 2.2624812155618637, + "grad_norm": 0.5029465093966058, + "learning_rate": 1.729436373288025e-06, + "loss": 0.1554, + "step": 6775 + }, + { + "epoch": 2.262815161128736, + "grad_norm": 0.515217227344929, + "learning_rate": 1.7279666887010361e-06, + "loss": 0.1618, + "step": 6776 + }, + { + "epoch": 2.2631491066956086, + "grad_norm": 0.5002295957612295, + "learning_rate": 1.726497498393206e-06, + "loss": 0.1531, + "step": 6777 + }, + { + "epoch": 2.2634830522624814, + "grad_norm": 0.5206621476137773, + "learning_rate": 1.7250288025864747e-06, + "loss": 0.17, + "step": 6778 + }, + { + "epoch": 2.2638169978293536, + "grad_norm": 0.5521129732067034, + "learning_rate": 1.7235606015027029e-06, + "loss": 0.1707, + "step": 6779 + }, + { + "epoch": 2.2641509433962264, + "grad_norm": 0.5102176164588842, + "learning_rate": 1.7220928953636812e-06, + "loss": 0.1572, + "step": 6780 + }, + { + "epoch": 2.264484888963099, + "grad_norm": 0.5209070051551632, + "learning_rate": 1.7206256843911252e-06, + "loss": 0.1541, + "step": 6781 + }, + { + "epoch": 2.264818834529972, + "grad_norm": 0.48372276583702023, + "learning_rate": 1.7191589688066706e-06, + "loss": 0.1577, + "step": 6782 + }, + { + "epoch": 2.265152780096844, + "grad_norm": 0.5580522540630803, + "learning_rate": 1.7176927488318868e-06, + "loss": 0.1672, + "step": 6783 + }, + { + "epoch": 2.265486725663717, + "grad_norm": 0.5402796189908685, + "learning_rate": 1.7162270246882595e-06, + "loss": 0.1607, + "step": 6784 + }, + { + "epoch": 2.2658206712305895, + "grad_norm": 0.48998808382575204, + "learning_rate": 1.7147617965972052e-06, + "loss": 0.1558, + "step": 6785 + }, + { + "epoch": 2.266154616797462, + "grad_norm": 0.4631863279760277, + "learning_rate": 1.7132970647800639e-06, + "loss": 0.1402, + "step": 6786 + }, + { + "epoch": 2.2664885623643345, + "grad_norm": 0.5319179193859932, + "learning_rate": 1.7118328294581028e-06, + "loss": 0.1637, + "step": 6787 + }, + { + "epoch": 2.266822507931207, + "grad_norm": 0.5057429440108493, + "learning_rate": 1.7103690908525072e-06, + "loss": 0.1508, + "step": 6788 + }, + { + "epoch": 2.26715645349808, + "grad_norm": 0.5396050693212004, + "learning_rate": 1.7089058491843967e-06, + "loss": 0.1644, + "step": 6789 + }, + { + "epoch": 2.267490399064952, + "grad_norm": 0.5070077732229445, + "learning_rate": 1.7074431046748075e-06, + "loss": 0.1552, + "step": 6790 + }, + { + "epoch": 2.267824344631825, + "grad_norm": 0.5506435678104178, + "learning_rate": 1.7059808575447057e-06, + "loss": 0.1587, + "step": 6791 + }, + { + "epoch": 2.2681582901986976, + "grad_norm": 0.5180851984442151, + "learning_rate": 1.7045191080149815e-06, + "loss": 0.1545, + "step": 6792 + }, + { + "epoch": 2.2684922357655704, + "grad_norm": 0.503753635614517, + "learning_rate": 1.7030578563064504e-06, + "loss": 0.1575, + "step": 6793 + }, + { + "epoch": 2.2688261813324426, + "grad_norm": 0.5181212723409958, + "learning_rate": 1.7015971026398487e-06, + "loss": 0.1563, + "step": 6794 + }, + { + "epoch": 2.2691601268993153, + "grad_norm": 0.47584629353603225, + "learning_rate": 1.7001368472358442e-06, + "loss": 0.149, + "step": 6795 + }, + { + "epoch": 2.269494072466188, + "grad_norm": 0.5001463375155726, + "learning_rate": 1.6986770903150213e-06, + "loss": 0.1497, + "step": 6796 + }, + { + "epoch": 2.269828018033061, + "grad_norm": 0.5233528316290307, + "learning_rate": 1.697217832097896e-06, + "loss": 0.1543, + "step": 6797 + }, + { + "epoch": 2.270161963599933, + "grad_norm": 0.5574253388658429, + "learning_rate": 1.6957590728049078e-06, + "loss": 0.1696, + "step": 6798 + }, + { + "epoch": 2.2704959091668058, + "grad_norm": 0.5284625902947807, + "learning_rate": 1.6943008126564164e-06, + "loss": 0.1522, + "step": 6799 + }, + { + "epoch": 2.2708298547336785, + "grad_norm": 0.5126830539761008, + "learning_rate": 1.6928430518727102e-06, + "loss": 0.159, + "step": 6800 + }, + { + "epoch": 2.2711638003005508, + "grad_norm": 0.5004642345379817, + "learning_rate": 1.6913857906740033e-06, + "loss": 0.1561, + "step": 6801 + }, + { + "epoch": 2.2714977458674235, + "grad_norm": 0.5114750316659729, + "learning_rate": 1.6899290292804288e-06, + "loss": 0.1614, + "step": 6802 + }, + { + "epoch": 2.271831691434296, + "grad_norm": 0.5395301012866076, + "learning_rate": 1.6884727679120493e-06, + "loss": 0.1575, + "step": 6803 + }, + { + "epoch": 2.272165637001169, + "grad_norm": 0.5343051941291189, + "learning_rate": 1.687017006788852e-06, + "loss": 0.1649, + "step": 6804 + }, + { + "epoch": 2.2724995825680416, + "grad_norm": 0.49336411451085405, + "learning_rate": 1.6855617461307427e-06, + "loss": 0.1535, + "step": 6805 + }, + { + "epoch": 2.272833528134914, + "grad_norm": 0.5168391221141233, + "learning_rate": 1.6841069861575598e-06, + "loss": 0.1534, + "step": 6806 + }, + { + "epoch": 2.2731674737017866, + "grad_norm": 0.49208671517873626, + "learning_rate": 1.6826527270890587e-06, + "loss": 0.1514, + "step": 6807 + }, + { + "epoch": 2.2735014192686593, + "grad_norm": 0.4972768325081571, + "learning_rate": 1.6811989691449232e-06, + "loss": 0.1572, + "step": 6808 + }, + { + "epoch": 2.2738353648355316, + "grad_norm": 0.48863264662669803, + "learning_rate": 1.6797457125447614e-06, + "loss": 0.1481, + "step": 6809 + }, + { + "epoch": 2.2741693104024043, + "grad_norm": 0.5221620981515211, + "learning_rate": 1.678292957508106e-06, + "loss": 0.1627, + "step": 6810 + }, + { + "epoch": 2.274503255969277, + "grad_norm": 0.519752893108589, + "learning_rate": 1.6768407042544093e-06, + "loss": 0.1641, + "step": 6811 + }, + { + "epoch": 2.2748372015361498, + "grad_norm": 0.5210756938106716, + "learning_rate": 1.6753889530030554e-06, + "loss": 0.156, + "step": 6812 + }, + { + "epoch": 2.275171147103022, + "grad_norm": 0.5504910072599424, + "learning_rate": 1.673937703973344e-06, + "loss": 0.166, + "step": 6813 + }, + { + "epoch": 2.2755050926698948, + "grad_norm": 0.4910304113541981, + "learning_rate": 1.6724869573845054e-06, + "loss": 0.1502, + "step": 6814 + }, + { + "epoch": 2.2758390382367675, + "grad_norm": 0.5560014050143508, + "learning_rate": 1.6710367134556926e-06, + "loss": 0.1667, + "step": 6815 + }, + { + "epoch": 2.27617298380364, + "grad_norm": 0.5421536902759155, + "learning_rate": 1.6695869724059827e-06, + "loss": 0.1513, + "step": 6816 + }, + { + "epoch": 2.2765069293705125, + "grad_norm": 0.5104732647871095, + "learning_rate": 1.6681377344543737e-06, + "loss": 0.1632, + "step": 6817 + }, + { + "epoch": 2.276840874937385, + "grad_norm": 0.5161082704973411, + "learning_rate": 1.6666889998197927e-06, + "loss": 0.1588, + "step": 6818 + }, + { + "epoch": 2.277174820504258, + "grad_norm": 0.5211932452386149, + "learning_rate": 1.6652407687210853e-06, + "loss": 0.1645, + "step": 6819 + }, + { + "epoch": 2.27750876607113, + "grad_norm": 0.47349743298012514, + "learning_rate": 1.6637930413770249e-06, + "loss": 0.1438, + "step": 6820 + }, + { + "epoch": 2.277842711638003, + "grad_norm": 0.5511636785563299, + "learning_rate": 1.6623458180063084e-06, + "loss": 0.1632, + "step": 6821 + }, + { + "epoch": 2.2781766572048756, + "grad_norm": 0.5180653642051466, + "learning_rate": 1.6608990988275575e-06, + "loss": 0.1581, + "step": 6822 + }, + { + "epoch": 2.2785106027717483, + "grad_norm": 0.48902595955559913, + "learning_rate": 1.6594528840593128e-06, + "loss": 0.1577, + "step": 6823 + }, + { + "epoch": 2.278844548338621, + "grad_norm": 0.5283771989671097, + "learning_rate": 1.6580071739200448e-06, + "loss": 0.1637, + "step": 6824 + }, + { + "epoch": 2.2791784939054933, + "grad_norm": 0.4798685050156958, + "learning_rate": 1.6565619686281425e-06, + "loss": 0.1425, + "step": 6825 + }, + { + "epoch": 2.279512439472366, + "grad_norm": 0.5262706305004466, + "learning_rate": 1.6551172684019224e-06, + "loss": 0.1572, + "step": 6826 + }, + { + "epoch": 2.2798463850392388, + "grad_norm": 0.5158131133227103, + "learning_rate": 1.6536730734596257e-06, + "loss": 0.1578, + "step": 6827 + }, + { + "epoch": 2.280180330606111, + "grad_norm": 0.5469011331347828, + "learning_rate": 1.652229384019411e-06, + "loss": 0.171, + "step": 6828 + }, + { + "epoch": 2.2805142761729837, + "grad_norm": 0.5075499952775651, + "learning_rate": 1.650786200299368e-06, + "loss": 0.1542, + "step": 6829 + }, + { + "epoch": 2.2808482217398565, + "grad_norm": 0.5425927659062526, + "learning_rate": 1.6493435225175042e-06, + "loss": 0.1612, + "step": 6830 + }, + { + "epoch": 2.281182167306729, + "grad_norm": 0.46335339234518974, + "learning_rate": 1.6479013508917552e-06, + "loss": 0.1502, + "step": 6831 + }, + { + "epoch": 2.2815161128736015, + "grad_norm": 0.48052008185076805, + "learning_rate": 1.6464596856399734e-06, + "loss": 0.1493, + "step": 6832 + }, + { + "epoch": 2.281850058440474, + "grad_norm": 0.48377398516358, + "learning_rate": 1.6450185269799462e-06, + "loss": 0.1529, + "step": 6833 + }, + { + "epoch": 2.282184004007347, + "grad_norm": 0.5249199169347003, + "learning_rate": 1.6435778751293723e-06, + "loss": 0.1536, + "step": 6834 + }, + { + "epoch": 2.2825179495742196, + "grad_norm": 0.49911936444525085, + "learning_rate": 1.6421377303058829e-06, + "loss": 0.1509, + "step": 6835 + }, + { + "epoch": 2.282851895141092, + "grad_norm": 0.5020572435479131, + "learning_rate": 1.640698092727025e-06, + "loss": 0.1572, + "step": 6836 + }, + { + "epoch": 2.2831858407079646, + "grad_norm": 0.5029497729673867, + "learning_rate": 1.639258962610275e-06, + "loss": 0.1536, + "step": 6837 + }, + { + "epoch": 2.2835197862748373, + "grad_norm": 0.4852289148568792, + "learning_rate": 1.6378203401730303e-06, + "loss": 0.163, + "step": 6838 + }, + { + "epoch": 2.2838537318417096, + "grad_norm": 0.5788851829372151, + "learning_rate": 1.6363822256326128e-06, + "loss": 0.1653, + "step": 6839 + }, + { + "epoch": 2.2841876774085823, + "grad_norm": 0.5116983762288065, + "learning_rate": 1.6349446192062635e-06, + "loss": 0.1602, + "step": 6840 + }, + { + "epoch": 2.284521622975455, + "grad_norm": 0.5586774089313628, + "learning_rate": 1.633507521111154e-06, + "loss": 0.1619, + "step": 6841 + }, + { + "epoch": 2.2848555685423277, + "grad_norm": 0.5331816877090615, + "learning_rate": 1.6320709315643708e-06, + "loss": 0.1701, + "step": 6842 + }, + { + "epoch": 2.2851895141092, + "grad_norm": 0.49596637489137746, + "learning_rate": 1.6306348507829294e-06, + "loss": 0.1521, + "step": 6843 + }, + { + "epoch": 2.2855234596760727, + "grad_norm": 0.5525180134705676, + "learning_rate": 1.6291992789837669e-06, + "loss": 0.1614, + "step": 6844 + }, + { + "epoch": 2.2858574052429455, + "grad_norm": 0.5308161541932059, + "learning_rate": 1.6277642163837444e-06, + "loss": 0.1607, + "step": 6845 + }, + { + "epoch": 2.286191350809818, + "grad_norm": 0.49825308016711456, + "learning_rate": 1.6263296631996422e-06, + "loss": 0.1527, + "step": 6846 + }, + { + "epoch": 2.2865252963766904, + "grad_norm": 0.4718547788653712, + "learning_rate": 1.6248956196481701e-06, + "loss": 0.148, + "step": 6847 + }, + { + "epoch": 2.286859241943563, + "grad_norm": 0.5382050275600502, + "learning_rate": 1.6234620859459537e-06, + "loss": 0.1613, + "step": 6848 + }, + { + "epoch": 2.287193187510436, + "grad_norm": 0.49960602289495415, + "learning_rate": 1.6220290623095463e-06, + "loss": 0.1491, + "step": 6849 + }, + { + "epoch": 2.287527133077308, + "grad_norm": 0.5156720699510375, + "learning_rate": 1.6205965489554248e-06, + "loss": 0.1561, + "step": 6850 + }, + { + "epoch": 2.287861078644181, + "grad_norm": 0.5025226699485877, + "learning_rate": 1.619164546099985e-06, + "loss": 0.1593, + "step": 6851 + }, + { + "epoch": 2.2881950242110536, + "grad_norm": 0.5185730184204812, + "learning_rate": 1.6177330539595493e-06, + "loss": 0.15, + "step": 6852 + }, + { + "epoch": 2.2885289697779263, + "grad_norm": 0.48338359238850986, + "learning_rate": 1.6163020727503592e-06, + "loss": 0.1486, + "step": 6853 + }, + { + "epoch": 2.288862915344799, + "grad_norm": 0.5446527503447174, + "learning_rate": 1.6148716026885847e-06, + "loss": 0.1715, + "step": 6854 + }, + { + "epoch": 2.2891968609116713, + "grad_norm": 0.5200794562046593, + "learning_rate": 1.61344164399031e-06, + "loss": 0.1543, + "step": 6855 + }, + { + "epoch": 2.289530806478544, + "grad_norm": 0.47904725129674774, + "learning_rate": 1.6120121968715535e-06, + "loss": 0.1538, + "step": 6856 + }, + { + "epoch": 2.2898647520454167, + "grad_norm": 0.544473388151401, + "learning_rate": 1.6105832615482453e-06, + "loss": 0.1712, + "step": 6857 + }, + { + "epoch": 2.290198697612289, + "grad_norm": 0.4891963514385171, + "learning_rate": 1.609154838236246e-06, + "loss": 0.15, + "step": 6858 + }, + { + "epoch": 2.2905326431791617, + "grad_norm": 0.5130192652682918, + "learning_rate": 1.6077269271513328e-06, + "loss": 0.1616, + "step": 6859 + }, + { + "epoch": 2.2908665887460344, + "grad_norm": 0.5118177941331495, + "learning_rate": 1.606299528509212e-06, + "loss": 0.1592, + "step": 6860 + }, + { + "epoch": 2.291200534312907, + "grad_norm": 0.49636585798317306, + "learning_rate": 1.604872642525503e-06, + "loss": 0.1486, + "step": 6861 + }, + { + "epoch": 2.2915344798797794, + "grad_norm": 0.5108213993992322, + "learning_rate": 1.6034462694157615e-06, + "loss": 0.1589, + "step": 6862 + }, + { + "epoch": 2.291868425446652, + "grad_norm": 0.5216001516467106, + "learning_rate": 1.6020204093954523e-06, + "loss": 0.1618, + "step": 6863 + }, + { + "epoch": 2.292202371013525, + "grad_norm": 0.4612499122336571, + "learning_rate": 1.6005950626799716e-06, + "loss": 0.1478, + "step": 6864 + }, + { + "epoch": 2.2925363165803976, + "grad_norm": 0.5496928863070641, + "learning_rate": 1.5991702294846318e-06, + "loss": 0.169, + "step": 6865 + }, + { + "epoch": 2.29287026214727, + "grad_norm": 0.519491010757705, + "learning_rate": 1.597745910024674e-06, + "loss": 0.1535, + "step": 6866 + }, + { + "epoch": 2.2932042077141426, + "grad_norm": 0.5395166624198875, + "learning_rate": 1.5963221045152537e-06, + "loss": 0.1679, + "step": 6867 + }, + { + "epoch": 2.2935381532810153, + "grad_norm": 0.49544413691093875, + "learning_rate": 1.5948988131714594e-06, + "loss": 0.1502, + "step": 6868 + }, + { + "epoch": 2.2938720988478876, + "grad_norm": 0.5089430397982071, + "learning_rate": 1.593476036208292e-06, + "loss": 0.1584, + "step": 6869 + }, + { + "epoch": 2.2942060444147603, + "grad_norm": 0.4761749830623971, + "learning_rate": 1.5920537738406811e-06, + "loss": 0.1438, + "step": 6870 + }, + { + "epoch": 2.294539989981633, + "grad_norm": 0.5128220624654717, + "learning_rate": 1.5906320262834735e-06, + "loss": 0.1582, + "step": 6871 + }, + { + "epoch": 2.2948739355485057, + "grad_norm": 0.4769933038550351, + "learning_rate": 1.5892107937514424e-06, + "loss": 0.1482, + "step": 6872 + }, + { + "epoch": 2.2952078811153784, + "grad_norm": 0.49727500860357027, + "learning_rate": 1.587790076459283e-06, + "loss": 0.1569, + "step": 6873 + }, + { + "epoch": 2.2955418266822507, + "grad_norm": 0.5198335573936426, + "learning_rate": 1.5863698746216082e-06, + "loss": 0.1573, + "step": 6874 + }, + { + "epoch": 2.2958757722491234, + "grad_norm": 0.5199506171376683, + "learning_rate": 1.58495018845296e-06, + "loss": 0.1652, + "step": 6875 + }, + { + "epoch": 2.296209717815996, + "grad_norm": 0.5041888828958186, + "learning_rate": 1.5835310181677954e-06, + "loss": 0.1588, + "step": 6876 + }, + { + "epoch": 2.2965436633828684, + "grad_norm": 0.5595007893689801, + "learning_rate": 1.5821123639804992e-06, + "loss": 0.1639, + "step": 6877 + }, + { + "epoch": 2.296877608949741, + "grad_norm": 0.49220921014348185, + "learning_rate": 1.5806942261053715e-06, + "loss": 0.1582, + "step": 6878 + }, + { + "epoch": 2.297211554516614, + "grad_norm": 0.5223549427049144, + "learning_rate": 1.5792766047566455e-06, + "loss": 0.1577, + "step": 6879 + }, + { + "epoch": 2.2975455000834866, + "grad_norm": 0.4778172847309024, + "learning_rate": 1.5778595001484648e-06, + "loss": 0.1468, + "step": 6880 + }, + { + "epoch": 2.297879445650359, + "grad_norm": 0.5503150749267659, + "learning_rate": 1.5764429124949022e-06, + "loss": 0.1622, + "step": 6881 + }, + { + "epoch": 2.2982133912172316, + "grad_norm": 0.48813966328418384, + "learning_rate": 1.5750268420099468e-06, + "loss": 0.1537, + "step": 6882 + }, + { + "epoch": 2.2985473367841043, + "grad_norm": 0.48710079923452626, + "learning_rate": 1.5736112889075167e-06, + "loss": 0.1499, + "step": 6883 + }, + { + "epoch": 2.298881282350977, + "grad_norm": 0.48893154112588416, + "learning_rate": 1.5721962534014424e-06, + "loss": 0.1566, + "step": 6884 + }, + { + "epoch": 2.2992152279178493, + "grad_norm": 0.4801053992347205, + "learning_rate": 1.5707817357054882e-06, + "loss": 0.15, + "step": 6885 + }, + { + "epoch": 2.299549173484722, + "grad_norm": 0.5580430430657689, + "learning_rate": 1.5693677360333293e-06, + "loss": 0.1615, + "step": 6886 + }, + { + "epoch": 2.2998831190515947, + "grad_norm": 0.530284558249108, + "learning_rate": 1.56795425459857e-06, + "loss": 0.1566, + "step": 6887 + }, + { + "epoch": 2.300217064618467, + "grad_norm": 0.5378779347275838, + "learning_rate": 1.5665412916147298e-06, + "loss": 0.1532, + "step": 6888 + }, + { + "epoch": 2.3005510101853397, + "grad_norm": 0.4847888072829989, + "learning_rate": 1.5651288472952564e-06, + "loss": 0.1509, + "step": 6889 + }, + { + "epoch": 2.3008849557522124, + "grad_norm": 0.5658711305785874, + "learning_rate": 1.563716921853512e-06, + "loss": 0.162, + "step": 6890 + }, + { + "epoch": 2.301218901319085, + "grad_norm": 0.5439624589912487, + "learning_rate": 1.562305515502791e-06, + "loss": 0.1656, + "step": 6891 + }, + { + "epoch": 2.3015528468859574, + "grad_norm": 0.555316715494243, + "learning_rate": 1.5608946284562977e-06, + "loss": 0.1656, + "step": 6892 + }, + { + "epoch": 2.30188679245283, + "grad_norm": 0.5347395221649406, + "learning_rate": 1.559484260927166e-06, + "loss": 0.1592, + "step": 6893 + }, + { + "epoch": 2.302220738019703, + "grad_norm": 0.557941240584311, + "learning_rate": 1.5580744131284464e-06, + "loss": 0.1677, + "step": 6894 + }, + { + "epoch": 2.3025546835865756, + "grad_norm": 0.5276421760511721, + "learning_rate": 1.5566650852731151e-06, + "loss": 0.1685, + "step": 6895 + }, + { + "epoch": 2.302888629153448, + "grad_norm": 0.5020349892960838, + "learning_rate": 1.5552562775740654e-06, + "loss": 0.1594, + "step": 6896 + }, + { + "epoch": 2.3032225747203205, + "grad_norm": 0.5293814651715417, + "learning_rate": 1.5538479902441156e-06, + "loss": 0.1678, + "step": 6897 + }, + { + "epoch": 2.3035565202871933, + "grad_norm": 0.48716428596425015, + "learning_rate": 1.5524402234960056e-06, + "loss": 0.147, + "step": 6898 + }, + { + "epoch": 2.3038904658540655, + "grad_norm": 0.48127955691998964, + "learning_rate": 1.5510329775423916e-06, + "loss": 0.1542, + "step": 6899 + }, + { + "epoch": 2.3042244114209383, + "grad_norm": 0.49897847992147765, + "learning_rate": 1.5496262525958583e-06, + "loss": 0.1628, + "step": 6900 + }, + { + "epoch": 2.304558356987811, + "grad_norm": 0.48569413278480206, + "learning_rate": 1.5482200488689054e-06, + "loss": 0.1399, + "step": 6901 + }, + { + "epoch": 2.3048923025546837, + "grad_norm": 0.5315498993933447, + "learning_rate": 1.5468143665739565e-06, + "loss": 0.1608, + "step": 6902 + }, + { + "epoch": 2.3052262481215564, + "grad_norm": 0.5276387365311821, + "learning_rate": 1.5454092059233583e-06, + "loss": 0.1563, + "step": 6903 + }, + { + "epoch": 2.3055601936884287, + "grad_norm": 0.5473811887845702, + "learning_rate": 1.5440045671293774e-06, + "loss": 0.1657, + "step": 6904 + }, + { + "epoch": 2.3058941392553014, + "grad_norm": 0.519692883836073, + "learning_rate": 1.542600450404198e-06, + "loss": 0.1621, + "step": 6905 + }, + { + "epoch": 2.306228084822174, + "grad_norm": 0.5274237692456049, + "learning_rate": 1.5411968559599317e-06, + "loss": 0.1502, + "step": 6906 + }, + { + "epoch": 2.3065620303890464, + "grad_norm": 0.5165181375891817, + "learning_rate": 1.5397937840086048e-06, + "loss": 0.167, + "step": 6907 + }, + { + "epoch": 2.306895975955919, + "grad_norm": 0.5392613904904744, + "learning_rate": 1.5383912347621693e-06, + "loss": 0.1609, + "step": 6908 + }, + { + "epoch": 2.307229921522792, + "grad_norm": 0.5352550993399279, + "learning_rate": 1.5369892084324972e-06, + "loss": 0.157, + "step": 6909 + }, + { + "epoch": 2.3075638670896645, + "grad_norm": 0.5132398793438658, + "learning_rate": 1.5355877052313822e-06, + "loss": 0.162, + "step": 6910 + }, + { + "epoch": 2.307897812656537, + "grad_norm": 0.5059045512819826, + "learning_rate": 1.534186725370535e-06, + "loss": 0.1569, + "step": 6911 + }, + { + "epoch": 2.3082317582234095, + "grad_norm": 0.5241489880536726, + "learning_rate": 1.532786269061593e-06, + "loss": 0.1645, + "step": 6912 + }, + { + "epoch": 2.3085657037902823, + "grad_norm": 0.5862344966512784, + "learning_rate": 1.531386336516107e-06, + "loss": 0.1482, + "step": 6913 + }, + { + "epoch": 2.308899649357155, + "grad_norm": 0.4937850033656664, + "learning_rate": 1.52998692794556e-06, + "loss": 0.16, + "step": 6914 + }, + { + "epoch": 2.3092335949240272, + "grad_norm": 0.5025478647760935, + "learning_rate": 1.5285880435613438e-06, + "loss": 0.1536, + "step": 6915 + }, + { + "epoch": 2.3095675404909, + "grad_norm": 0.5395976195462507, + "learning_rate": 1.5271896835747795e-06, + "loss": 0.1602, + "step": 6916 + }, + { + "epoch": 2.3099014860577727, + "grad_norm": 0.50885180620015, + "learning_rate": 1.5257918481971028e-06, + "loss": 0.1584, + "step": 6917 + }, + { + "epoch": 2.310235431624645, + "grad_norm": 0.49907541237904407, + "learning_rate": 1.524394537639477e-06, + "loss": 0.1468, + "step": 6918 + }, + { + "epoch": 2.3105693771915177, + "grad_norm": 0.520098742948566, + "learning_rate": 1.5229977521129785e-06, + "loss": 0.152, + "step": 6919 + }, + { + "epoch": 2.3109033227583904, + "grad_norm": 0.555034188223172, + "learning_rate": 1.5216014918286097e-06, + "loss": 0.1728, + "step": 6920 + }, + { + "epoch": 2.311237268325263, + "grad_norm": 0.5277476198952158, + "learning_rate": 1.5202057569972945e-06, + "loss": 0.1559, + "step": 6921 + }, + { + "epoch": 2.311571213892136, + "grad_norm": 0.5082237210587632, + "learning_rate": 1.518810547829871e-06, + "loss": 0.1546, + "step": 6922 + }, + { + "epoch": 2.311905159459008, + "grad_norm": 0.5498238792531575, + "learning_rate": 1.517415864537105e-06, + "loss": 0.1672, + "step": 6923 + }, + { + "epoch": 2.312239105025881, + "grad_norm": 0.5102413698777625, + "learning_rate": 1.516021707329678e-06, + "loss": 0.1639, + "step": 6924 + }, + { + "epoch": 2.3125730505927535, + "grad_norm": 0.49159724301878027, + "learning_rate": 1.5146280764181942e-06, + "loss": 0.1535, + "step": 6925 + }, + { + "epoch": 2.312906996159626, + "grad_norm": 0.5166047110747541, + "learning_rate": 1.5132349720131783e-06, + "loss": 0.1588, + "step": 6926 + }, + { + "epoch": 2.3132409417264985, + "grad_norm": 0.5055406381133966, + "learning_rate": 1.511842394325077e-06, + "loss": 0.1483, + "step": 6927 + }, + { + "epoch": 2.3135748872933712, + "grad_norm": 0.5191101090446806, + "learning_rate": 1.5104503435642526e-06, + "loss": 0.158, + "step": 6928 + }, + { + "epoch": 2.313908832860244, + "grad_norm": 0.5014849155315904, + "learning_rate": 1.5090588199409927e-06, + "loss": 0.159, + "step": 6929 + }, + { + "epoch": 2.3142427784271162, + "grad_norm": 0.5270760259726192, + "learning_rate": 1.5076678236655018e-06, + "loss": 0.1663, + "step": 6930 + }, + { + "epoch": 2.314576723993989, + "grad_norm": 0.5265525170252408, + "learning_rate": 1.5062773549479064e-06, + "loss": 0.1674, + "step": 6931 + }, + { + "epoch": 2.3149106695608617, + "grad_norm": 0.4754623491282869, + "learning_rate": 1.504887413998254e-06, + "loss": 0.1517, + "step": 6932 + }, + { + "epoch": 2.3152446151277344, + "grad_norm": 0.4902506423363311, + "learning_rate": 1.5034980010265127e-06, + "loss": 0.1553, + "step": 6933 + }, + { + "epoch": 2.3155785606946067, + "grad_norm": 0.5575953129911351, + "learning_rate": 1.5021091162425672e-06, + "loss": 0.1666, + "step": 6934 + }, + { + "epoch": 2.3159125062614794, + "grad_norm": 0.5099824545247369, + "learning_rate": 1.5007207598562268e-06, + "loss": 0.1579, + "step": 6935 + }, + { + "epoch": 2.316246451828352, + "grad_norm": 0.539874027836098, + "learning_rate": 1.4993329320772177e-06, + "loss": 0.1662, + "step": 6936 + }, + { + "epoch": 2.3165803973952244, + "grad_norm": 0.5295612734416608, + "learning_rate": 1.4979456331151875e-06, + "loss": 0.1597, + "step": 6937 + }, + { + "epoch": 2.316914342962097, + "grad_norm": 0.5894263333785696, + "learning_rate": 1.4965588631797052e-06, + "loss": 0.159, + "step": 6938 + }, + { + "epoch": 2.31724828852897, + "grad_norm": 0.4741039633929316, + "learning_rate": 1.4951726224802593e-06, + "loss": 0.1494, + "step": 6939 + }, + { + "epoch": 2.3175822340958425, + "grad_norm": 0.4862856197350301, + "learning_rate": 1.493786911226256e-06, + "loss": 0.1569, + "step": 6940 + }, + { + "epoch": 2.317916179662715, + "grad_norm": 0.5529752111878695, + "learning_rate": 1.492401729627025e-06, + "loss": 0.1608, + "step": 6941 + }, + { + "epoch": 2.3182501252295875, + "grad_norm": 0.5507021790387356, + "learning_rate": 1.491017077891812e-06, + "loss": 0.168, + "step": 6942 + }, + { + "epoch": 2.3185840707964602, + "grad_norm": 0.5125603052330294, + "learning_rate": 1.4896329562297863e-06, + "loss": 0.1602, + "step": 6943 + }, + { + "epoch": 2.318918016363333, + "grad_norm": 0.5257969566496764, + "learning_rate": 1.4882493648500373e-06, + "loss": 0.1576, + "step": 6944 + }, + { + "epoch": 2.319251961930205, + "grad_norm": 0.5381499105664779, + "learning_rate": 1.48686630396157e-06, + "loss": 0.1683, + "step": 6945 + }, + { + "epoch": 2.319585907497078, + "grad_norm": 0.5422363917617385, + "learning_rate": 1.4854837737733147e-06, + "loss": 0.1679, + "step": 6946 + }, + { + "epoch": 2.3199198530639507, + "grad_norm": 0.5641296285837942, + "learning_rate": 1.484101774494116e-06, + "loss": 0.1722, + "step": 6947 + }, + { + "epoch": 2.320253798630823, + "grad_norm": 0.5253053775187845, + "learning_rate": 1.4827203063327427e-06, + "loss": 0.1608, + "step": 6948 + }, + { + "epoch": 2.3205877441976956, + "grad_norm": 0.4501898716795521, + "learning_rate": 1.4813393694978812e-06, + "loss": 0.1527, + "step": 6949 + }, + { + "epoch": 2.3209216897645684, + "grad_norm": 0.4766070843342987, + "learning_rate": 1.479958964198141e-06, + "loss": 0.1473, + "step": 6950 + }, + { + "epoch": 2.321255635331441, + "grad_norm": 0.5543755055251879, + "learning_rate": 1.4785790906420445e-06, + "loss": 0.1729, + "step": 6951 + }, + { + "epoch": 2.321589580898314, + "grad_norm": 0.47207588022242153, + "learning_rate": 1.4771997490380414e-06, + "loss": 0.1449, + "step": 6952 + }, + { + "epoch": 2.321923526465186, + "grad_norm": 0.5489402475099499, + "learning_rate": 1.4758209395944945e-06, + "loss": 0.1666, + "step": 6953 + }, + { + "epoch": 2.322257472032059, + "grad_norm": 0.4807129427731627, + "learning_rate": 1.47444266251969e-06, + "loss": 0.1518, + "step": 6954 + }, + { + "epoch": 2.3225914175989315, + "grad_norm": 0.554604022659153, + "learning_rate": 1.4730649180218337e-06, + "loss": 0.1757, + "step": 6955 + }, + { + "epoch": 2.322925363165804, + "grad_norm": 0.5348203406510965, + "learning_rate": 1.4716877063090517e-06, + "loss": 0.1606, + "step": 6956 + }, + { + "epoch": 2.3232593087326765, + "grad_norm": 0.525792610223351, + "learning_rate": 1.4703110275893846e-06, + "loss": 0.1645, + "step": 6957 + }, + { + "epoch": 2.323593254299549, + "grad_norm": 0.5191996867761393, + "learning_rate": 1.4689348820707988e-06, + "loss": 0.1601, + "step": 6958 + }, + { + "epoch": 2.323927199866422, + "grad_norm": 0.5021670260891927, + "learning_rate": 1.4675592699611741e-06, + "loss": 0.157, + "step": 6959 + }, + { + "epoch": 2.324261145433294, + "grad_norm": 0.5189366470501476, + "learning_rate": 1.4661841914683156e-06, + "loss": 0.1561, + "step": 6960 + }, + { + "epoch": 2.324595091000167, + "grad_norm": 0.5367046655871626, + "learning_rate": 1.464809646799944e-06, + "loss": 0.1705, + "step": 6961 + }, + { + "epoch": 2.3249290365670396, + "grad_norm": 0.46937388491530546, + "learning_rate": 1.463435636163702e-06, + "loss": 0.1535, + "step": 6962 + }, + { + "epoch": 2.3252629821339124, + "grad_norm": 0.482195068948941, + "learning_rate": 1.4620621597671476e-06, + "loss": 0.1498, + "step": 6963 + }, + { + "epoch": 2.3255969277007846, + "grad_norm": 0.4782325808602372, + "learning_rate": 1.4606892178177633e-06, + "loss": 0.1492, + "step": 6964 + }, + { + "epoch": 2.3259308732676574, + "grad_norm": 0.49949896473890126, + "learning_rate": 1.459316810522945e-06, + "loss": 0.1541, + "step": 6965 + }, + { + "epoch": 2.32626481883453, + "grad_norm": 0.5514134284700918, + "learning_rate": 1.457944938090013e-06, + "loss": 0.1627, + "step": 6966 + }, + { + "epoch": 2.3265987644014023, + "grad_norm": 0.5122505150543959, + "learning_rate": 1.456573600726206e-06, + "loss": 0.1582, + "step": 6967 + }, + { + "epoch": 2.326932709968275, + "grad_norm": 0.5155388868093768, + "learning_rate": 1.4552027986386775e-06, + "loss": 0.1605, + "step": 6968 + }, + { + "epoch": 2.3272666555351478, + "grad_norm": 0.5319645910926085, + "learning_rate": 1.453832532034506e-06, + "loss": 0.1672, + "step": 6969 + }, + { + "epoch": 2.3276006011020205, + "grad_norm": 0.5212564670307227, + "learning_rate": 1.4524628011206843e-06, + "loss": 0.1606, + "step": 6970 + }, + { + "epoch": 2.327934546668893, + "grad_norm": 0.5197062994619243, + "learning_rate": 1.4510936061041269e-06, + "loss": 0.1586, + "step": 6971 + }, + { + "epoch": 2.3282684922357655, + "grad_norm": 0.5793340057434153, + "learning_rate": 1.449724947191668e-06, + "loss": 0.1625, + "step": 6972 + }, + { + "epoch": 2.328602437802638, + "grad_norm": 0.5080639803068403, + "learning_rate": 1.4483568245900597e-06, + "loss": 0.1554, + "step": 6973 + }, + { + "epoch": 2.328936383369511, + "grad_norm": 0.5268549200084457, + "learning_rate": 1.4469892385059713e-06, + "loss": 0.1655, + "step": 6974 + }, + { + "epoch": 2.329270328936383, + "grad_norm": 0.4921015168951849, + "learning_rate": 1.4456221891459953e-06, + "loss": 0.1559, + "step": 6975 + }, + { + "epoch": 2.329604274503256, + "grad_norm": 0.507333813102178, + "learning_rate": 1.4442556767166371e-06, + "loss": 0.1478, + "step": 6976 + }, + { + "epoch": 2.3299382200701286, + "grad_norm": 0.4805039087114265, + "learning_rate": 1.4428897014243288e-06, + "loss": 0.143, + "step": 6977 + }, + { + "epoch": 2.3302721656370013, + "grad_norm": 0.525864995462506, + "learning_rate": 1.4415242634754107e-06, + "loss": 0.1564, + "step": 6978 + }, + { + "epoch": 2.3306061112038736, + "grad_norm": 0.48453778085656507, + "learning_rate": 1.4401593630761562e-06, + "loss": 0.153, + "step": 6979 + }, + { + "epoch": 2.3309400567707463, + "grad_norm": 0.5560825048911766, + "learning_rate": 1.4387950004327434e-06, + "loss": 0.1696, + "step": 6980 + }, + { + "epoch": 2.331274002337619, + "grad_norm": 0.4999034443705413, + "learning_rate": 1.4374311757512798e-06, + "loss": 0.1489, + "step": 6981 + }, + { + "epoch": 2.3316079479044918, + "grad_norm": 0.48674712053155916, + "learning_rate": 1.4360678892377833e-06, + "loss": 0.1558, + "step": 6982 + }, + { + "epoch": 2.331941893471364, + "grad_norm": 0.5376662163779152, + "learning_rate": 1.434705141098197e-06, + "loss": 0.1618, + "step": 6983 + }, + { + "epoch": 2.3322758390382368, + "grad_norm": 0.49634944316675383, + "learning_rate": 1.4333429315383768e-06, + "loss": 0.1536, + "step": 6984 + }, + { + "epoch": 2.3326097846051095, + "grad_norm": 0.5261178930730273, + "learning_rate": 1.4319812607641055e-06, + "loss": 0.1491, + "step": 6985 + }, + { + "epoch": 2.3329437301719818, + "grad_norm": 0.4925740465169854, + "learning_rate": 1.4306201289810756e-06, + "loss": 0.1542, + "step": 6986 + }, + { + "epoch": 2.3332776757388545, + "grad_norm": 0.5190410270947525, + "learning_rate": 1.4292595363949047e-06, + "loss": 0.1499, + "step": 6987 + }, + { + "epoch": 2.333611621305727, + "grad_norm": 0.5062847873051785, + "learning_rate": 1.4278994832111232e-06, + "loss": 0.1559, + "step": 6988 + }, + { + "epoch": 2.3339455668726, + "grad_norm": 0.5152470669811914, + "learning_rate": 1.4265399696351867e-06, + "loss": 0.1543, + "step": 6989 + }, + { + "epoch": 2.334279512439472, + "grad_norm": 0.5110787108439315, + "learning_rate": 1.4251809958724623e-06, + "loss": 0.1607, + "step": 6990 + }, + { + "epoch": 2.334613458006345, + "grad_norm": 0.4988509374702272, + "learning_rate": 1.4238225621282403e-06, + "loss": 0.1588, + "step": 6991 + }, + { + "epoch": 2.3349474035732176, + "grad_norm": 0.5005557600957211, + "learning_rate": 1.4224646686077303e-06, + "loss": 0.1569, + "step": 6992 + }, + { + "epoch": 2.3352813491400903, + "grad_norm": 0.48763438079425747, + "learning_rate": 1.4211073155160544e-06, + "loss": 0.158, + "step": 6993 + }, + { + "epoch": 2.3356152947069626, + "grad_norm": 0.5314112983450026, + "learning_rate": 1.4197505030582588e-06, + "loss": 0.164, + "step": 6994 + }, + { + "epoch": 2.3359492402738353, + "grad_norm": 0.502506767280615, + "learning_rate": 1.4183942314393056e-06, + "loss": 0.1553, + "step": 6995 + }, + { + "epoch": 2.336283185840708, + "grad_norm": 0.514455411162069, + "learning_rate": 1.4170385008640774e-06, + "loss": 0.1561, + "step": 6996 + }, + { + "epoch": 2.3366171314075803, + "grad_norm": 0.5256261869279392, + "learning_rate": 1.4156833115373702e-06, + "loss": 0.1522, + "step": 6997 + }, + { + "epoch": 2.336951076974453, + "grad_norm": 0.5419599369164668, + "learning_rate": 1.4143286636639043e-06, + "loss": 0.1648, + "step": 6998 + }, + { + "epoch": 2.3372850225413258, + "grad_norm": 0.4543607056514028, + "learning_rate": 1.4129745574483123e-06, + "loss": 0.1443, + "step": 6999 + }, + { + "epoch": 2.3376189681081985, + "grad_norm": 0.5119302019907201, + "learning_rate": 1.4116209930951508e-06, + "loss": 0.1589, + "step": 7000 + }, + { + "epoch": 2.337952913675071, + "grad_norm": 0.5143893370915871, + "learning_rate": 1.4102679708088867e-06, + "loss": 0.1617, + "step": 7001 + }, + { + "epoch": 2.3382868592419435, + "grad_norm": 0.4951445956972631, + "learning_rate": 1.4089154907939162e-06, + "loss": 0.1552, + "step": 7002 + }, + { + "epoch": 2.338620804808816, + "grad_norm": 0.4944981712221224, + "learning_rate": 1.4075635532545435e-06, + "loss": 0.1471, + "step": 7003 + }, + { + "epoch": 2.338954750375689, + "grad_norm": 0.5217606879242871, + "learning_rate": 1.4062121583949967e-06, + "loss": 0.1527, + "step": 7004 + }, + { + "epoch": 2.339288695942561, + "grad_norm": 0.5372747665208512, + "learning_rate": 1.4048613064194178e-06, + "loss": 0.1639, + "step": 7005 + }, + { + "epoch": 2.339622641509434, + "grad_norm": 0.5084540072164366, + "learning_rate": 1.4035109975318712e-06, + "loss": 0.1522, + "step": 7006 + }, + { + "epoch": 2.3399565870763066, + "grad_norm": 0.4915624674536933, + "learning_rate": 1.4021612319363326e-06, + "loss": 0.1475, + "step": 7007 + }, + { + "epoch": 2.3402905326431793, + "grad_norm": 0.5141788309732952, + "learning_rate": 1.4008120098367062e-06, + "loss": 0.1611, + "step": 7008 + }, + { + "epoch": 2.3406244782100516, + "grad_norm": 0.5340670488732328, + "learning_rate": 1.3994633314368034e-06, + "loss": 0.1635, + "step": 7009 + }, + { + "epoch": 2.3409584237769243, + "grad_norm": 0.4839970244154489, + "learning_rate": 1.3981151969403606e-06, + "loss": 0.1458, + "step": 7010 + }, + { + "epoch": 2.341292369343797, + "grad_norm": 0.4786078131814078, + "learning_rate": 1.3967676065510266e-06, + "loss": 0.145, + "step": 7011 + }, + { + "epoch": 2.3416263149106697, + "grad_norm": 0.47065456298226127, + "learning_rate": 1.3954205604723742e-06, + "loss": 0.1455, + "step": 7012 + }, + { + "epoch": 2.341960260477542, + "grad_norm": 0.550572446756861, + "learning_rate": 1.3940740589078872e-06, + "loss": 0.1606, + "step": 7013 + }, + { + "epoch": 2.3422942060444147, + "grad_norm": 0.5654689482348924, + "learning_rate": 1.3927281020609712e-06, + "loss": 0.172, + "step": 7014 + }, + { + "epoch": 2.3426281516112875, + "grad_norm": 0.5893082288878831, + "learning_rate": 1.391382690134952e-06, + "loss": 0.1689, + "step": 7015 + }, + { + "epoch": 2.3429620971781597, + "grad_norm": 0.532586165834147, + "learning_rate": 1.3900378233330658e-06, + "loss": 0.1589, + "step": 7016 + }, + { + "epoch": 2.3432960427450324, + "grad_norm": 0.5434759431816953, + "learning_rate": 1.3886935018584719e-06, + "loss": 0.1561, + "step": 7017 + }, + { + "epoch": 2.343629988311905, + "grad_norm": 0.4986753275861255, + "learning_rate": 1.3873497259142483e-06, + "loss": 0.1576, + "step": 7018 + }, + { + "epoch": 2.343963933878778, + "grad_norm": 0.4850699716446126, + "learning_rate": 1.3860064957033847e-06, + "loss": 0.1565, + "step": 7019 + }, + { + "epoch": 2.3442978794456506, + "grad_norm": 0.4626717680688309, + "learning_rate": 1.384663811428793e-06, + "loss": 0.1403, + "step": 7020 + }, + { + "epoch": 2.344631825012523, + "grad_norm": 0.5748882353001997, + "learning_rate": 1.3833216732933035e-06, + "loss": 0.1677, + "step": 7021 + }, + { + "epoch": 2.3449657705793956, + "grad_norm": 0.4886924233962931, + "learning_rate": 1.3819800814996587e-06, + "loss": 0.1476, + "step": 7022 + }, + { + "epoch": 2.3452997161462683, + "grad_norm": 0.5189394050352707, + "learning_rate": 1.3806390362505251e-06, + "loss": 0.1625, + "step": 7023 + }, + { + "epoch": 2.3456336617131406, + "grad_norm": 0.4929703711875753, + "learning_rate": 1.3792985377484796e-06, + "loss": 0.153, + "step": 7024 + }, + { + "epoch": 2.3459676072800133, + "grad_norm": 0.5074258924655396, + "learning_rate": 1.3779585861960226e-06, + "loss": 0.1479, + "step": 7025 + }, + { + "epoch": 2.346301552846886, + "grad_norm": 0.5344322232369463, + "learning_rate": 1.3766191817955699e-06, + "loss": 0.1666, + "step": 7026 + }, + { + "epoch": 2.3466354984137587, + "grad_norm": 0.5450383787251549, + "learning_rate": 1.3752803247494545e-06, + "loss": 0.1697, + "step": 7027 + }, + { + "epoch": 2.346969443980631, + "grad_norm": 0.5356447852167889, + "learning_rate": 1.3739420152599247e-06, + "loss": 0.1625, + "step": 7028 + }, + { + "epoch": 2.3473033895475037, + "grad_norm": 0.49739392807914606, + "learning_rate": 1.37260425352915e-06, + "loss": 0.1442, + "step": 7029 + }, + { + "epoch": 2.3476373351143764, + "grad_norm": 0.4677708828080009, + "learning_rate": 1.3712670397592127e-06, + "loss": 0.1455, + "step": 7030 + }, + { + "epoch": 2.347971280681249, + "grad_norm": 0.531422867823814, + "learning_rate": 1.3699303741521158e-06, + "loss": 0.1503, + "step": 7031 + }, + { + "epoch": 2.3483052262481214, + "grad_norm": 0.5219314248338266, + "learning_rate": 1.3685942569097793e-06, + "loss": 0.1666, + "step": 7032 + }, + { + "epoch": 2.348639171814994, + "grad_norm": 0.5289721391712382, + "learning_rate": 1.3672586882340393e-06, + "loss": 0.1646, + "step": 7033 + }, + { + "epoch": 2.348973117381867, + "grad_norm": 0.4755425996576312, + "learning_rate": 1.3659236683266475e-06, + "loss": 0.1514, + "step": 7034 + }, + { + "epoch": 2.349307062948739, + "grad_norm": 0.49561200390500754, + "learning_rate": 1.3645891973892772e-06, + "loss": 0.1493, + "step": 7035 + }, + { + "epoch": 2.349641008515612, + "grad_norm": 0.5361377733253501, + "learning_rate": 1.3632552756235124e-06, + "loss": 0.1635, + "step": 7036 + }, + { + "epoch": 2.3499749540824846, + "grad_norm": 0.5245007107369157, + "learning_rate": 1.3619219032308594e-06, + "loss": 0.1621, + "step": 7037 + }, + { + "epoch": 2.3503088996493573, + "grad_norm": 0.5233641249374307, + "learning_rate": 1.3605890804127415e-06, + "loss": 0.1567, + "step": 7038 + }, + { + "epoch": 2.3506428452162296, + "grad_norm": 0.529716471906781, + "learning_rate": 1.3592568073704943e-06, + "loss": 0.1567, + "step": 7039 + }, + { + "epoch": 2.3509767907831023, + "grad_norm": 0.5282219003779071, + "learning_rate": 1.3579250843053747e-06, + "loss": 0.1522, + "step": 7040 + }, + { + "epoch": 2.351310736349975, + "grad_norm": 0.5234598148920355, + "learning_rate": 1.3565939114185568e-06, + "loss": 0.1503, + "step": 7041 + }, + { + "epoch": 2.3516446819168477, + "grad_norm": 0.6006249908501967, + "learning_rate": 1.3552632889111266e-06, + "loss": 0.1674, + "step": 7042 + }, + { + "epoch": 2.35197862748372, + "grad_norm": 0.5012523499275392, + "learning_rate": 1.3539332169840918e-06, + "loss": 0.1537, + "step": 7043 + }, + { + "epoch": 2.3523125730505927, + "grad_norm": 0.47883312000552264, + "learning_rate": 1.3526036958383777e-06, + "loss": 0.1444, + "step": 7044 + }, + { + "epoch": 2.3526465186174654, + "grad_norm": 0.5046804575270472, + "learning_rate": 1.35127472567482e-06, + "loss": 0.1516, + "step": 7045 + }, + { + "epoch": 2.3529804641843377, + "grad_norm": 0.4807311309572237, + "learning_rate": 1.3499463066941787e-06, + "loss": 0.1466, + "step": 7046 + }, + { + "epoch": 2.3533144097512104, + "grad_norm": 0.5251926780869718, + "learning_rate": 1.3486184390971246e-06, + "loss": 0.1545, + "step": 7047 + }, + { + "epoch": 2.353648355318083, + "grad_norm": 0.5442022137332256, + "learning_rate": 1.347291123084249e-06, + "loss": 0.1689, + "step": 7048 + }, + { + "epoch": 2.353982300884956, + "grad_norm": 0.5188350766445398, + "learning_rate": 1.3459643588560583e-06, + "loss": 0.1597, + "step": 7049 + }, + { + "epoch": 2.3543162464518286, + "grad_norm": 0.47155341824463987, + "learning_rate": 1.3446381466129777e-06, + "loss": 0.1477, + "step": 7050 + }, + { + "epoch": 2.354650192018701, + "grad_norm": 0.5474209459200585, + "learning_rate": 1.3433124865553437e-06, + "loss": 0.1697, + "step": 7051 + }, + { + "epoch": 2.3549841375855736, + "grad_norm": 0.5271510936720203, + "learning_rate": 1.3419873788834164e-06, + "loss": 0.1657, + "step": 7052 + }, + { + "epoch": 2.3553180831524463, + "grad_norm": 0.5277962735942314, + "learning_rate": 1.3406628237973662e-06, + "loss": 0.1608, + "step": 7053 + }, + { + "epoch": 2.3556520287193186, + "grad_norm": 0.5556972174352737, + "learning_rate": 1.339338821497283e-06, + "loss": 0.1625, + "step": 7054 + }, + { + "epoch": 2.3559859742861913, + "grad_norm": 0.5248699594279046, + "learning_rate": 1.3380153721831745e-06, + "loss": 0.1619, + "step": 7055 + }, + { + "epoch": 2.356319919853064, + "grad_norm": 0.505305439890515, + "learning_rate": 1.3366924760549632e-06, + "loss": 0.1605, + "step": 7056 + }, + { + "epoch": 2.3566538654199367, + "grad_norm": 0.509624361874994, + "learning_rate": 1.3353701333124863e-06, + "loss": 0.1558, + "step": 7057 + }, + { + "epoch": 2.356987810986809, + "grad_norm": 0.5139996192629374, + "learning_rate": 1.3340483441555024e-06, + "loss": 0.1569, + "step": 7058 + }, + { + "epoch": 2.3573217565536817, + "grad_norm": 0.50576323330744, + "learning_rate": 1.3327271087836792e-06, + "loss": 0.1528, + "step": 7059 + }, + { + "epoch": 2.3576557021205544, + "grad_norm": 0.4936700282126162, + "learning_rate": 1.331406427396607e-06, + "loss": 0.1513, + "step": 7060 + }, + { + "epoch": 2.357989647687427, + "grad_norm": 0.5475069928862222, + "learning_rate": 1.3300863001937902e-06, + "loss": 0.1588, + "step": 7061 + }, + { + "epoch": 2.3583235932542994, + "grad_norm": 0.5259688821837174, + "learning_rate": 1.3287667273746513e-06, + "loss": 0.1646, + "step": 7062 + }, + { + "epoch": 2.358657538821172, + "grad_norm": 0.6091072760868147, + "learning_rate": 1.3274477091385241e-06, + "loss": 0.1632, + "step": 7063 + }, + { + "epoch": 2.358991484388045, + "grad_norm": 0.49664916653441177, + "learning_rate": 1.3261292456846648e-06, + "loss": 0.1483, + "step": 7064 + }, + { + "epoch": 2.359325429954917, + "grad_norm": 0.5428777150582343, + "learning_rate": 1.3248113372122395e-06, + "loss": 0.1596, + "step": 7065 + }, + { + "epoch": 2.35965937552179, + "grad_norm": 0.5426577923297033, + "learning_rate": 1.3234939839203358e-06, + "loss": 0.1602, + "step": 7066 + }, + { + "epoch": 2.3599933210886626, + "grad_norm": 0.5188785999223405, + "learning_rate": 1.3221771860079569e-06, + "loss": 0.1591, + "step": 7067 + }, + { + "epoch": 2.3603272666555353, + "grad_norm": 0.5086994931226321, + "learning_rate": 1.3208609436740178e-06, + "loss": 0.1522, + "step": 7068 + }, + { + "epoch": 2.360661212222408, + "grad_norm": 0.47362864053642506, + "learning_rate": 1.3195452571173551e-06, + "loss": 0.1467, + "step": 7069 + }, + { + "epoch": 2.3609951577892803, + "grad_norm": 0.5264254283456309, + "learning_rate": 1.3182301265367154e-06, + "loss": 0.1555, + "step": 7070 + }, + { + "epoch": 2.361329103356153, + "grad_norm": 0.4742773192417172, + "learning_rate": 1.3169155521307664e-06, + "loss": 0.1433, + "step": 7071 + }, + { + "epoch": 2.3616630489230257, + "grad_norm": 0.5145066895595479, + "learning_rate": 1.3156015340980904e-06, + "loss": 0.1574, + "step": 7072 + }, + { + "epoch": 2.361996994489898, + "grad_norm": 0.5604203763870581, + "learning_rate": 1.3142880726371865e-06, + "loss": 0.1657, + "step": 7073 + }, + { + "epoch": 2.3623309400567707, + "grad_norm": 0.53274867372277, + "learning_rate": 1.312975167946466e-06, + "loss": 0.1718, + "step": 7074 + }, + { + "epoch": 2.3626648856236434, + "grad_norm": 0.53957363798226, + "learning_rate": 1.3116628202242603e-06, + "loss": 0.1594, + "step": 7075 + }, + { + "epoch": 2.362998831190516, + "grad_norm": 0.5006375891688938, + "learning_rate": 1.3103510296688137e-06, + "loss": 0.1598, + "step": 7076 + }, + { + "epoch": 2.3633327767573884, + "grad_norm": 0.48159573025330465, + "learning_rate": 1.309039796478288e-06, + "loss": 0.1495, + "step": 7077 + }, + { + "epoch": 2.363666722324261, + "grad_norm": 0.5301067970927092, + "learning_rate": 1.307729120850761e-06, + "loss": 0.1578, + "step": 7078 + }, + { + "epoch": 2.364000667891134, + "grad_norm": 0.5357552915413548, + "learning_rate": 1.306419002984226e-06, + "loss": 0.1566, + "step": 7079 + }, + { + "epoch": 2.3643346134580066, + "grad_norm": 0.5482638058241193, + "learning_rate": 1.3051094430765905e-06, + "loss": 0.1611, + "step": 7080 + }, + { + "epoch": 2.364668559024879, + "grad_norm": 0.5096740195373038, + "learning_rate": 1.3038004413256805e-06, + "loss": 0.1604, + "step": 7081 + }, + { + "epoch": 2.3650025045917515, + "grad_norm": 0.470098411066193, + "learning_rate": 1.3024919979292338e-06, + "loss": 0.1474, + "step": 7082 + }, + { + "epoch": 2.3653364501586243, + "grad_norm": 0.538932422734086, + "learning_rate": 1.3011841130849079e-06, + "loss": 0.1491, + "step": 7083 + }, + { + "epoch": 2.3656703957254965, + "grad_norm": 0.48937676072754, + "learning_rate": 1.2998767869902733e-06, + "loss": 0.159, + "step": 7084 + }, + { + "epoch": 2.3660043412923693, + "grad_norm": 0.49651132883192295, + "learning_rate": 1.2985700198428197e-06, + "loss": 0.1492, + "step": 7085 + }, + { + "epoch": 2.366338286859242, + "grad_norm": 0.5336066250058544, + "learning_rate": 1.2972638118399456e-06, + "loss": 0.1509, + "step": 7086 + }, + { + "epoch": 2.3666722324261147, + "grad_norm": 0.5022650378372203, + "learning_rate": 1.2959581631789725e-06, + "loss": 0.1521, + "step": 7087 + }, + { + "epoch": 2.367006177992987, + "grad_norm": 0.5256152756288829, + "learning_rate": 1.2946530740571316e-06, + "loss": 0.1561, + "step": 7088 + }, + { + "epoch": 2.3673401235598597, + "grad_norm": 0.5106806485581995, + "learning_rate": 1.293348544671572e-06, + "loss": 0.1584, + "step": 7089 + }, + { + "epoch": 2.3676740691267324, + "grad_norm": 0.4815349175488414, + "learning_rate": 1.2920445752193617e-06, + "loss": 0.1494, + "step": 7090 + }, + { + "epoch": 2.368008014693605, + "grad_norm": 0.49516867886375865, + "learning_rate": 1.2907411658974756e-06, + "loss": 0.1536, + "step": 7091 + }, + { + "epoch": 2.3683419602604774, + "grad_norm": 0.5133737571155627, + "learning_rate": 1.2894383169028134e-06, + "loss": 0.1514, + "step": 7092 + }, + { + "epoch": 2.36867590582735, + "grad_norm": 0.5047295137569292, + "learning_rate": 1.2881360284321825e-06, + "loss": 0.1537, + "step": 7093 + }, + { + "epoch": 2.369009851394223, + "grad_norm": 0.5233980435478595, + "learning_rate": 1.2868343006823113e-06, + "loss": 0.1617, + "step": 7094 + }, + { + "epoch": 2.369343796961095, + "grad_norm": 0.47283027887814555, + "learning_rate": 1.2855331338498377e-06, + "loss": 0.1528, + "step": 7095 + }, + { + "epoch": 2.369677742527968, + "grad_norm": 0.5649361822819848, + "learning_rate": 1.2842325281313233e-06, + "loss": 0.1699, + "step": 7096 + }, + { + "epoch": 2.3700116880948405, + "grad_norm": 0.5228406368655593, + "learning_rate": 1.282932483723236e-06, + "loss": 0.1571, + "step": 7097 + }, + { + "epoch": 2.3703456336617132, + "grad_norm": 0.528258600013772, + "learning_rate": 1.2816330008219656e-06, + "loss": 0.1673, + "step": 7098 + }, + { + "epoch": 2.370679579228586, + "grad_norm": 0.5126074736580588, + "learning_rate": 1.280334079623811e-06, + "loss": 0.1586, + "step": 7099 + }, + { + "epoch": 2.3710135247954582, + "grad_norm": 0.5209165501480114, + "learning_rate": 1.2790357203249931e-06, + "loss": 0.1612, + "step": 7100 + }, + { + "epoch": 2.371347470362331, + "grad_norm": 0.5650474597014316, + "learning_rate": 1.2777379231216391e-06, + "loss": 0.1616, + "step": 7101 + }, + { + "epoch": 2.3716814159292037, + "grad_norm": 0.5315493028470518, + "learning_rate": 1.2764406882098035e-06, + "loss": 0.1576, + "step": 7102 + }, + { + "epoch": 2.372015361496076, + "grad_norm": 0.511469997004763, + "learning_rate": 1.2751440157854439e-06, + "loss": 0.1546, + "step": 7103 + }, + { + "epoch": 2.3723493070629487, + "grad_norm": 0.5397007074652154, + "learning_rate": 1.2738479060444408e-06, + "loss": 0.1623, + "step": 7104 + }, + { + "epoch": 2.3726832526298214, + "grad_norm": 0.47300263395113906, + "learning_rate": 1.2725523591825845e-06, + "loss": 0.1443, + "step": 7105 + }, + { + "epoch": 2.373017198196694, + "grad_norm": 0.49067687314657127, + "learning_rate": 1.2712573753955842e-06, + "loss": 0.1523, + "step": 7106 + }, + { + "epoch": 2.3733511437635664, + "grad_norm": 0.5354579285385755, + "learning_rate": 1.2699629548790599e-06, + "loss": 0.1634, + "step": 7107 + }, + { + "epoch": 2.373685089330439, + "grad_norm": 0.5125476972638018, + "learning_rate": 1.2686690978285533e-06, + "loss": 0.1618, + "step": 7108 + }, + { + "epoch": 2.374019034897312, + "grad_norm": 0.5010154309189855, + "learning_rate": 1.267375804439513e-06, + "loss": 0.1514, + "step": 7109 + }, + { + "epoch": 2.3743529804641845, + "grad_norm": 0.4951422731618649, + "learning_rate": 1.2660830749073093e-06, + "loss": 0.1505, + "step": 7110 + }, + { + "epoch": 2.374686926031057, + "grad_norm": 0.5662049963465066, + "learning_rate": 1.2647909094272215e-06, + "loss": 0.1686, + "step": 7111 + }, + { + "epoch": 2.3750208715979295, + "grad_norm": 0.496822761224137, + "learning_rate": 1.2634993081944469e-06, + "loss": 0.15, + "step": 7112 + }, + { + "epoch": 2.3753548171648022, + "grad_norm": 0.48684393051910746, + "learning_rate": 1.2622082714040995e-06, + "loss": 0.1555, + "step": 7113 + }, + { + "epoch": 2.3756887627316745, + "grad_norm": 0.51894267837415, + "learning_rate": 1.2609177992512022e-06, + "loss": 0.1644, + "step": 7114 + }, + { + "epoch": 2.3760227082985472, + "grad_norm": 0.5292677495026286, + "learning_rate": 1.2596278919306993e-06, + "loss": 0.1585, + "step": 7115 + }, + { + "epoch": 2.37635665386542, + "grad_norm": 0.48845207606014346, + "learning_rate": 1.2583385496374428e-06, + "loss": 0.152, + "step": 7116 + }, + { + "epoch": 2.3766905994322927, + "grad_norm": 0.5379775111433409, + "learning_rate": 1.2570497725662067e-06, + "loss": 0.1645, + "step": 7117 + }, + { + "epoch": 2.3770245449991654, + "grad_norm": 0.5055506542870268, + "learning_rate": 1.2557615609116713e-06, + "loss": 0.1555, + "step": 7118 + }, + { + "epoch": 2.3773584905660377, + "grad_norm": 0.4733144294957525, + "learning_rate": 1.254473914868442e-06, + "loss": 0.1484, + "step": 7119 + }, + { + "epoch": 2.3776924361329104, + "grad_norm": 0.5299212758598031, + "learning_rate": 1.2531868346310288e-06, + "loss": 0.16, + "step": 7120 + }, + { + "epoch": 2.378026381699783, + "grad_norm": 0.5284052191730892, + "learning_rate": 1.2519003203938628e-06, + "loss": 0.1529, + "step": 7121 + }, + { + "epoch": 2.3783603272666554, + "grad_norm": 0.5238906543650232, + "learning_rate": 1.2506143723512842e-06, + "loss": 0.1557, + "step": 7122 + }, + { + "epoch": 2.378694272833528, + "grad_norm": 0.5335370952252194, + "learning_rate": 1.2493289906975543e-06, + "loss": 0.1533, + "step": 7123 + }, + { + "epoch": 2.379028218400401, + "grad_norm": 0.5786503237493286, + "learning_rate": 1.2480441756268397e-06, + "loss": 0.1715, + "step": 7124 + }, + { + "epoch": 2.3793621639672735, + "grad_norm": 0.4843155928013435, + "learning_rate": 1.2467599273332332e-06, + "loss": 0.156, + "step": 7125 + }, + { + "epoch": 2.379696109534146, + "grad_norm": 0.5227159067590956, + "learning_rate": 1.245476246010731e-06, + "loss": 0.1642, + "step": 7126 + }, + { + "epoch": 2.3800300551010185, + "grad_norm": 0.4944621882634703, + "learning_rate": 1.244193131853252e-06, + "loss": 0.1475, + "step": 7127 + }, + { + "epoch": 2.3803640006678912, + "grad_norm": 0.5358983175601851, + "learning_rate": 1.2429105850546213e-06, + "loss": 0.1548, + "step": 7128 + }, + { + "epoch": 2.380697946234764, + "grad_norm": 0.5032449469880587, + "learning_rate": 1.241628605808587e-06, + "loss": 0.1589, + "step": 7129 + }, + { + "epoch": 2.381031891801636, + "grad_norm": 0.5243288444558761, + "learning_rate": 1.2403471943088018e-06, + "loss": 0.1556, + "step": 7130 + }, + { + "epoch": 2.381365837368509, + "grad_norm": 0.5599001289068225, + "learning_rate": 1.239066350748845e-06, + "loss": 0.1747, + "step": 7131 + }, + { + "epoch": 2.3816997829353816, + "grad_norm": 0.5533477090829723, + "learning_rate": 1.2377860753221976e-06, + "loss": 0.1721, + "step": 7132 + }, + { + "epoch": 2.382033728502254, + "grad_norm": 0.551804825022015, + "learning_rate": 1.236506368222264e-06, + "loss": 0.1633, + "step": 7133 + }, + { + "epoch": 2.3823676740691266, + "grad_norm": 0.5217476920386144, + "learning_rate": 1.235227229642355e-06, + "loss": 0.156, + "step": 7134 + }, + { + "epoch": 2.3827016196359994, + "grad_norm": 0.5338330593096421, + "learning_rate": 1.2339486597757038e-06, + "loss": 0.1665, + "step": 7135 + }, + { + "epoch": 2.383035565202872, + "grad_norm": 0.5231766549666796, + "learning_rate": 1.2326706588154496e-06, + "loss": 0.1622, + "step": 7136 + }, + { + "epoch": 2.3833695107697443, + "grad_norm": 0.5193547058090123, + "learning_rate": 1.2313932269546518e-06, + "loss": 0.1553, + "step": 7137 + }, + { + "epoch": 2.383703456336617, + "grad_norm": 0.5268208854231688, + "learning_rate": 1.2301163643862817e-06, + "loss": 0.1584, + "step": 7138 + }, + { + "epoch": 2.38403740190349, + "grad_norm": 0.509012014482607, + "learning_rate": 1.2288400713032227e-06, + "loss": 0.1524, + "step": 7139 + }, + { + "epoch": 2.3843713474703625, + "grad_norm": 0.5645944044633189, + "learning_rate": 1.2275643478982762e-06, + "loss": 0.1684, + "step": 7140 + }, + { + "epoch": 2.3847052930372348, + "grad_norm": 0.5293226597996646, + "learning_rate": 1.2262891943641526e-06, + "loss": 0.1536, + "step": 7141 + }, + { + "epoch": 2.3850392386041075, + "grad_norm": 0.567921584560172, + "learning_rate": 1.2250146108934802e-06, + "loss": 0.1659, + "step": 7142 + }, + { + "epoch": 2.38537318417098, + "grad_norm": 0.5097541913627487, + "learning_rate": 1.2237405976787997e-06, + "loss": 0.1544, + "step": 7143 + }, + { + "epoch": 2.3857071297378525, + "grad_norm": 0.5253260119018623, + "learning_rate": 1.2224671549125673e-06, + "loss": 0.1627, + "step": 7144 + }, + { + "epoch": 2.386041075304725, + "grad_norm": 0.5148031038721778, + "learning_rate": 1.2211942827871486e-06, + "loss": 0.1573, + "step": 7145 + }, + { + "epoch": 2.386375020871598, + "grad_norm": 0.515311953727318, + "learning_rate": 1.2199219814948294e-06, + "loss": 0.154, + "step": 7146 + }, + { + "epoch": 2.3867089664384706, + "grad_norm": 0.5236022358304645, + "learning_rate": 1.218650251227802e-06, + "loss": 0.1574, + "step": 7147 + }, + { + "epoch": 2.3870429120053434, + "grad_norm": 0.5614360461378991, + "learning_rate": 1.2173790921781786e-06, + "loss": 0.1605, + "step": 7148 + }, + { + "epoch": 2.3873768575722156, + "grad_norm": 0.5241197638766266, + "learning_rate": 1.2161085045379818e-06, + "loss": 0.155, + "step": 7149 + }, + { + "epoch": 2.3877108031390883, + "grad_norm": 0.5220473602267451, + "learning_rate": 1.214838488499151e-06, + "loss": 0.1545, + "step": 7150 + }, + { + "epoch": 2.388044748705961, + "grad_norm": 0.5446184370250129, + "learning_rate": 1.2135690442535335e-06, + "loss": 0.1576, + "step": 7151 + }, + { + "epoch": 2.3883786942728333, + "grad_norm": 0.5369404959958738, + "learning_rate": 1.2123001719928972e-06, + "loss": 0.1645, + "step": 7152 + }, + { + "epoch": 2.388712639839706, + "grad_norm": 0.5412541896274068, + "learning_rate": 1.211031871908916e-06, + "loss": 0.1587, + "step": 7153 + }, + { + "epoch": 2.3890465854065788, + "grad_norm": 0.5880386251671798, + "learning_rate": 1.2097641441931868e-06, + "loss": 0.1785, + "step": 7154 + }, + { + "epoch": 2.3893805309734515, + "grad_norm": 0.5251849578213053, + "learning_rate": 1.2084969890372111e-06, + "loss": 0.1567, + "step": 7155 + }, + { + "epoch": 2.3897144765403238, + "grad_norm": 0.5583506662971707, + "learning_rate": 1.2072304066324103e-06, + "loss": 0.1698, + "step": 7156 + }, + { + "epoch": 2.3900484221071965, + "grad_norm": 0.5335369655122735, + "learning_rate": 1.205964397170113e-06, + "loss": 0.1592, + "step": 7157 + }, + { + "epoch": 2.390382367674069, + "grad_norm": 0.5418466727615724, + "learning_rate": 1.2046989608415682e-06, + "loss": 0.1598, + "step": 7158 + }, + { + "epoch": 2.390716313240942, + "grad_norm": 0.4969750469808351, + "learning_rate": 1.2034340978379328e-06, + "loss": 0.1478, + "step": 7159 + }, + { + "epoch": 2.391050258807814, + "grad_norm": 0.6032598631872019, + "learning_rate": 1.2021698083502797e-06, + "loss": 0.172, + "step": 7160 + }, + { + "epoch": 2.391384204374687, + "grad_norm": 0.5109947419002258, + "learning_rate": 1.2009060925695965e-06, + "loss": 0.146, + "step": 7161 + }, + { + "epoch": 2.3917181499415596, + "grad_norm": 0.5009788689546751, + "learning_rate": 1.1996429506867797e-06, + "loss": 0.1501, + "step": 7162 + }, + { + "epoch": 2.392052095508432, + "grad_norm": 0.5335564451937221, + "learning_rate": 1.1983803828926438e-06, + "loss": 0.1633, + "step": 7163 + }, + { + "epoch": 2.3923860410753046, + "grad_norm": 0.5471286776914387, + "learning_rate": 1.1971183893779125e-06, + "loss": 0.1637, + "step": 7164 + }, + { + "epoch": 2.3927199866421773, + "grad_norm": 0.5296431778636541, + "learning_rate": 1.1958569703332262e-06, + "loss": 0.1612, + "step": 7165 + }, + { + "epoch": 2.39305393220905, + "grad_norm": 0.5313318060463695, + "learning_rate": 1.1945961259491368e-06, + "loss": 0.1544, + "step": 7166 + }, + { + "epoch": 2.3933878777759228, + "grad_norm": 0.5573335599867503, + "learning_rate": 1.1933358564161108e-06, + "loss": 0.1561, + "step": 7167 + }, + { + "epoch": 2.393721823342795, + "grad_norm": 0.5278835623914241, + "learning_rate": 1.1920761619245246e-06, + "loss": 0.1661, + "step": 7168 + }, + { + "epoch": 2.3940557689096678, + "grad_norm": 0.5286543182285065, + "learning_rate": 1.1908170426646726e-06, + "loss": 0.1576, + "step": 7169 + }, + { + "epoch": 2.3943897144765405, + "grad_norm": 0.4797109888500716, + "learning_rate": 1.189558498826756e-06, + "loss": 0.1383, + "step": 7170 + }, + { + "epoch": 2.3947236600434127, + "grad_norm": 0.5215268741066937, + "learning_rate": 1.1883005306008955e-06, + "loss": 0.1531, + "step": 7171 + }, + { + "epoch": 2.3950576056102855, + "grad_norm": 0.4951612997717505, + "learning_rate": 1.1870431381771203e-06, + "loss": 0.152, + "step": 7172 + }, + { + "epoch": 2.395391551177158, + "grad_norm": 0.5105466271334883, + "learning_rate": 1.185786321745377e-06, + "loss": 0.1489, + "step": 7173 + }, + { + "epoch": 2.395725496744031, + "grad_norm": 0.557204326375292, + "learning_rate": 1.1845300814955192e-06, + "loss": 0.1576, + "step": 7174 + }, + { + "epoch": 2.396059442310903, + "grad_norm": 0.5630209033895921, + "learning_rate": 1.18327441761732e-06, + "loss": 0.1585, + "step": 7175 + }, + { + "epoch": 2.396393387877776, + "grad_norm": 0.5298445211408632, + "learning_rate": 1.1820193303004584e-06, + "loss": 0.1627, + "step": 7176 + }, + { + "epoch": 2.3967273334446486, + "grad_norm": 0.46722816889909324, + "learning_rate": 1.1807648197345327e-06, + "loss": 0.1376, + "step": 7177 + }, + { + "epoch": 2.3970612790115213, + "grad_norm": 0.5325098603893473, + "learning_rate": 1.1795108861090515e-06, + "loss": 0.1551, + "step": 7178 + }, + { + "epoch": 2.3973952245783936, + "grad_norm": 0.5301349354335696, + "learning_rate": 1.1782575296134363e-06, + "loss": 0.1519, + "step": 7179 + }, + { + "epoch": 2.3977291701452663, + "grad_norm": 0.5145757838651948, + "learning_rate": 1.1770047504370197e-06, + "loss": 0.1538, + "step": 7180 + }, + { + "epoch": 2.398063115712139, + "grad_norm": 0.5300739981678517, + "learning_rate": 1.1757525487690513e-06, + "loss": 0.1541, + "step": 7181 + }, + { + "epoch": 2.3983970612790113, + "grad_norm": 0.5601765735049783, + "learning_rate": 1.1745009247986882e-06, + "loss": 0.1669, + "step": 7182 + }, + { + "epoch": 2.398731006845884, + "grad_norm": 0.5090285498231324, + "learning_rate": 1.1732498787150044e-06, + "loss": 0.1448, + "step": 7183 + }, + { + "epoch": 2.3990649524127567, + "grad_norm": 0.5003809889361199, + "learning_rate": 1.171999410706986e-06, + "loss": 0.1451, + "step": 7184 + }, + { + "epoch": 2.3993988979796295, + "grad_norm": 0.5417432938953705, + "learning_rate": 1.1707495209635283e-06, + "loss": 0.1557, + "step": 7185 + }, + { + "epoch": 2.3997328435465017, + "grad_norm": 0.5604087075100392, + "learning_rate": 1.1695002096734454e-06, + "loss": 0.1657, + "step": 7186 + }, + { + "epoch": 2.4000667891133745, + "grad_norm": 0.5283839371353753, + "learning_rate": 1.1682514770254567e-06, + "loss": 0.1552, + "step": 7187 + }, + { + "epoch": 2.400400734680247, + "grad_norm": 0.5167426973502384, + "learning_rate": 1.1670033232081995e-06, + "loss": 0.1596, + "step": 7188 + }, + { + "epoch": 2.40073468024712, + "grad_norm": 0.4966963801218271, + "learning_rate": 1.1657557484102228e-06, + "loss": 0.1551, + "step": 7189 + }, + { + "epoch": 2.401068625813992, + "grad_norm": 0.5768210088793992, + "learning_rate": 1.1645087528199883e-06, + "loss": 0.1669, + "step": 7190 + }, + { + "epoch": 2.401402571380865, + "grad_norm": 0.5000691629042625, + "learning_rate": 1.1632623366258666e-06, + "loss": 0.1472, + "step": 7191 + }, + { + "epoch": 2.4017365169477376, + "grad_norm": 0.5345663390184785, + "learning_rate": 1.162016500016147e-06, + "loss": 0.1532, + "step": 7192 + }, + { + "epoch": 2.40207046251461, + "grad_norm": 0.5230805429006006, + "learning_rate": 1.1607712431790242e-06, + "loss": 0.1613, + "step": 7193 + }, + { + "epoch": 2.4024044080814826, + "grad_norm": 0.47784139221190725, + "learning_rate": 1.15952656630261e-06, + "loss": 0.1378, + "step": 7194 + }, + { + "epoch": 2.4027383536483553, + "grad_norm": 0.5413001588245746, + "learning_rate": 1.158282469574929e-06, + "loss": 0.16, + "step": 7195 + }, + { + "epoch": 2.403072299215228, + "grad_norm": 0.5172176939927573, + "learning_rate": 1.1570389531839165e-06, + "loss": 0.1449, + "step": 7196 + }, + { + "epoch": 2.4034062447821007, + "grad_norm": 0.5126788675251615, + "learning_rate": 1.1557960173174183e-06, + "loss": 0.1583, + "step": 7197 + }, + { + "epoch": 2.403740190348973, + "grad_norm": 0.5071441617182192, + "learning_rate": 1.154553662163197e-06, + "loss": 0.1485, + "step": 7198 + }, + { + "epoch": 2.4040741359158457, + "grad_norm": 0.5710809229470581, + "learning_rate": 1.1533118879089227e-06, + "loss": 0.1681, + "step": 7199 + }, + { + "epoch": 2.4044080814827185, + "grad_norm": 0.5293626105834608, + "learning_rate": 1.1520706947421806e-06, + "loss": 0.149, + "step": 7200 + }, + { + "epoch": 2.4047420270495907, + "grad_norm": 0.5452941336660879, + "learning_rate": 1.1508300828504682e-06, + "loss": 0.1697, + "step": 7201 + }, + { + "epoch": 2.4050759726164634, + "grad_norm": 0.5166114823495399, + "learning_rate": 1.1495900524211955e-06, + "loss": 0.1532, + "step": 7202 + }, + { + "epoch": 2.405409918183336, + "grad_norm": 0.5471123464355846, + "learning_rate": 1.1483506036416814e-06, + "loss": 0.1664, + "step": 7203 + }, + { + "epoch": 2.405743863750209, + "grad_norm": 0.5141430169803471, + "learning_rate": 1.1471117366991613e-06, + "loss": 0.1555, + "step": 7204 + }, + { + "epoch": 2.406077809317081, + "grad_norm": 0.46243845794224187, + "learning_rate": 1.1458734517807785e-06, + "loss": 0.1418, + "step": 7205 + }, + { + "epoch": 2.406411754883954, + "grad_norm": 0.5162303747719997, + "learning_rate": 1.1446357490735921e-06, + "loss": 0.1535, + "step": 7206 + }, + { + "epoch": 2.4067457004508266, + "grad_norm": 0.48927363828819215, + "learning_rate": 1.143398628764572e-06, + "loss": 0.1525, + "step": 7207 + }, + { + "epoch": 2.4070796460176993, + "grad_norm": 0.5390800009438081, + "learning_rate": 1.1421620910405977e-06, + "loss": 0.1674, + "step": 7208 + }, + { + "epoch": 2.4074135915845716, + "grad_norm": 0.4994447846500062, + "learning_rate": 1.1409261360884661e-06, + "loss": 0.1584, + "step": 7209 + }, + { + "epoch": 2.4077475371514443, + "grad_norm": 0.5197712049684485, + "learning_rate": 1.1396907640948785e-06, + "loss": 0.1478, + "step": 7210 + }, + { + "epoch": 2.408081482718317, + "grad_norm": 0.5135963024209774, + "learning_rate": 1.1384559752464553e-06, + "loss": 0.1488, + "step": 7211 + }, + { + "epoch": 2.4084154282851893, + "grad_norm": 0.531386611449885, + "learning_rate": 1.137221769729725e-06, + "loss": 0.153, + "step": 7212 + }, + { + "epoch": 2.408749373852062, + "grad_norm": 0.5578052539668118, + "learning_rate": 1.1359881477311301e-06, + "loss": 0.1591, + "step": 7213 + }, + { + "epoch": 2.4090833194189347, + "grad_norm": 0.5529075972559461, + "learning_rate": 1.1347551094370224e-06, + "loss": 0.1704, + "step": 7214 + }, + { + "epoch": 2.4094172649858074, + "grad_norm": 0.5441030100011386, + "learning_rate": 1.1335226550336676e-06, + "loss": 0.1587, + "step": 7215 + }, + { + "epoch": 2.40975121055268, + "grad_norm": 0.485355416601119, + "learning_rate": 1.1322907847072411e-06, + "loss": 0.1419, + "step": 7216 + }, + { + "epoch": 2.4100851561195524, + "grad_norm": 0.5148035517585307, + "learning_rate": 1.1310594986438339e-06, + "loss": 0.1503, + "step": 7217 + }, + { + "epoch": 2.410419101686425, + "grad_norm": 0.5021996340564928, + "learning_rate": 1.129828797029442e-06, + "loss": 0.1545, + "step": 7218 + }, + { + "epoch": 2.410753047253298, + "grad_norm": 0.5582216534507548, + "learning_rate": 1.128598680049982e-06, + "loss": 0.1584, + "step": 7219 + }, + { + "epoch": 2.41108699282017, + "grad_norm": 0.5244566063420066, + "learning_rate": 1.1273691478912752e-06, + "loss": 0.1464, + "step": 7220 + }, + { + "epoch": 2.411420938387043, + "grad_norm": 0.5195593318131434, + "learning_rate": 1.1261402007390587e-06, + "loss": 0.1592, + "step": 7221 + }, + { + "epoch": 2.4117548839539156, + "grad_norm": 0.5493008998470231, + "learning_rate": 1.1249118387789764e-06, + "loss": 0.1627, + "step": 7222 + }, + { + "epoch": 2.4120888295207883, + "grad_norm": 0.525214705071319, + "learning_rate": 1.12368406219659e-06, + "loss": 0.1606, + "step": 7223 + }, + { + "epoch": 2.4124227750876606, + "grad_norm": 0.4939289053178912, + "learning_rate": 1.1224568711773653e-06, + "loss": 0.151, + "step": 7224 + }, + { + "epoch": 2.4127567206545333, + "grad_norm": 0.48455079432812664, + "learning_rate": 1.1212302659066898e-06, + "loss": 0.1501, + "step": 7225 + }, + { + "epoch": 2.413090666221406, + "grad_norm": 0.5156418088336056, + "learning_rate": 1.1200042465698518e-06, + "loss": 0.151, + "step": 7226 + }, + { + "epoch": 2.4134246117882787, + "grad_norm": 0.5148384016761979, + "learning_rate": 1.1187788133520594e-06, + "loss": 0.1642, + "step": 7227 + }, + { + "epoch": 2.413758557355151, + "grad_norm": 0.5138997748746464, + "learning_rate": 1.1175539664384261e-06, + "loss": 0.1587, + "step": 7228 + }, + { + "epoch": 2.4140925029220237, + "grad_norm": 0.519066569925353, + "learning_rate": 1.1163297060139815e-06, + "loss": 0.1533, + "step": 7229 + }, + { + "epoch": 2.4144264484888964, + "grad_norm": 0.49248902007645573, + "learning_rate": 1.1151060322636625e-06, + "loss": 0.1506, + "step": 7230 + }, + { + "epoch": 2.4147603940557687, + "grad_norm": 0.5312541778053379, + "learning_rate": 1.1138829453723204e-06, + "loss": 0.1593, + "step": 7231 + }, + { + "epoch": 2.4150943396226414, + "grad_norm": 0.5018480303953734, + "learning_rate": 1.112660445524718e-06, + "loss": 0.1461, + "step": 7232 + }, + { + "epoch": 2.415428285189514, + "grad_norm": 0.5226338743612154, + "learning_rate": 1.1114385329055262e-06, + "loss": 0.1608, + "step": 7233 + }, + { + "epoch": 2.415762230756387, + "grad_norm": 0.5635333501894039, + "learning_rate": 1.1102172076993301e-06, + "loss": 0.1633, + "step": 7234 + }, + { + "epoch": 2.416096176323259, + "grad_norm": 0.5602180181468333, + "learning_rate": 1.1089964700906257e-06, + "loss": 0.1655, + "step": 7235 + }, + { + "epoch": 2.416430121890132, + "grad_norm": 0.5756321012974656, + "learning_rate": 1.1077763202638208e-06, + "loss": 0.1613, + "step": 7236 + }, + { + "epoch": 2.4167640674570046, + "grad_norm": 0.5029427815181746, + "learning_rate": 1.106556758403231e-06, + "loss": 0.1515, + "step": 7237 + }, + { + "epoch": 2.4170980130238773, + "grad_norm": 0.5262956548357789, + "learning_rate": 1.105337784693088e-06, + "loss": 0.1634, + "step": 7238 + }, + { + "epoch": 2.4174319585907496, + "grad_norm": 0.5626068139042897, + "learning_rate": 1.1041193993175293e-06, + "loss": 0.1579, + "step": 7239 + }, + { + "epoch": 2.4177659041576223, + "grad_norm": 0.5082947467016112, + "learning_rate": 1.1029016024606093e-06, + "loss": 0.1532, + "step": 7240 + }, + { + "epoch": 2.418099849724495, + "grad_norm": 0.5297964029071952, + "learning_rate": 1.101684394306286e-06, + "loss": 0.1582, + "step": 7241 + }, + { + "epoch": 2.4184337952913673, + "grad_norm": 0.5296061030264333, + "learning_rate": 1.100467775038439e-06, + "loss": 0.1555, + "step": 7242 + }, + { + "epoch": 2.41876774085824, + "grad_norm": 0.5951302915676688, + "learning_rate": 1.099251744840849e-06, + "loss": 0.1609, + "step": 7243 + }, + { + "epoch": 2.4191016864251127, + "grad_norm": 0.5448858703349814, + "learning_rate": 1.0980363038972141e-06, + "loss": 0.1512, + "step": 7244 + }, + { + "epoch": 2.4194356319919854, + "grad_norm": 0.5240412587293782, + "learning_rate": 1.096821452391138e-06, + "loss": 0.1472, + "step": 7245 + }, + { + "epoch": 2.419769577558858, + "grad_norm": 0.5340001181165048, + "learning_rate": 1.0956071905061415e-06, + "loss": 0.1524, + "step": 7246 + }, + { + "epoch": 2.4201035231257304, + "grad_norm": 0.5621964258233278, + "learning_rate": 1.0943935184256487e-06, + "loss": 0.162, + "step": 7247 + }, + { + "epoch": 2.420437468692603, + "grad_norm": 0.49430836108102416, + "learning_rate": 1.093180436333005e-06, + "loss": 0.1499, + "step": 7248 + }, + { + "epoch": 2.420771414259476, + "grad_norm": 0.49277358371660024, + "learning_rate": 1.091967944411456e-06, + "loss": 0.1503, + "step": 7249 + }, + { + "epoch": 2.421105359826348, + "grad_norm": 0.5566152601905626, + "learning_rate": 1.0907560428441666e-06, + "loss": 0.163, + "step": 7250 + }, + { + "epoch": 2.421439305393221, + "grad_norm": 0.5354411967166895, + "learning_rate": 1.0895447318142043e-06, + "loss": 0.1592, + "step": 7251 + }, + { + "epoch": 2.4217732509600935, + "grad_norm": 0.5178968638492157, + "learning_rate": 1.0883340115045566e-06, + "loss": 0.1555, + "step": 7252 + }, + { + "epoch": 2.4221071965269663, + "grad_norm": 0.5064556197403924, + "learning_rate": 1.0871238820981133e-06, + "loss": 0.1553, + "step": 7253 + }, + { + "epoch": 2.4224411420938385, + "grad_norm": 0.513639948844901, + "learning_rate": 1.0859143437776803e-06, + "loss": 0.1611, + "step": 7254 + }, + { + "epoch": 2.4227750876607113, + "grad_norm": 0.539271199077283, + "learning_rate": 1.0847053967259736e-06, + "loss": 0.1529, + "step": 7255 + }, + { + "epoch": 2.423109033227584, + "grad_norm": 0.52303917750466, + "learning_rate": 1.0834970411256167e-06, + "loss": 0.1611, + "step": 7256 + }, + { + "epoch": 2.4234429787944567, + "grad_norm": 0.5294641295234923, + "learning_rate": 1.082289277159147e-06, + "loss": 0.1598, + "step": 7257 + }, + { + "epoch": 2.423776924361329, + "grad_norm": 0.550231135188987, + "learning_rate": 1.0810821050090132e-06, + "loss": 0.1721, + "step": 7258 + }, + { + "epoch": 2.4241108699282017, + "grad_norm": 0.6018154425280414, + "learning_rate": 1.0798755248575694e-06, + "loss": 0.1592, + "step": 7259 + }, + { + "epoch": 2.4244448154950744, + "grad_norm": 0.5102466501385734, + "learning_rate": 1.078669536887086e-06, + "loss": 0.1615, + "step": 7260 + }, + { + "epoch": 2.4247787610619467, + "grad_norm": 0.5042309590141728, + "learning_rate": 1.077464141279742e-06, + "loss": 0.1518, + "step": 7261 + }, + { + "epoch": 2.4251127066288194, + "grad_norm": 0.5388065888923106, + "learning_rate": 1.0762593382176244e-06, + "loss": 0.1687, + "step": 7262 + }, + { + "epoch": 2.425446652195692, + "grad_norm": 0.48205881440780235, + "learning_rate": 1.0750551278827365e-06, + "loss": 0.1533, + "step": 7263 + }, + { + "epoch": 2.425780597762565, + "grad_norm": 0.5594387794673621, + "learning_rate": 1.073851510456984e-06, + "loss": 0.1672, + "step": 7264 + }, + { + "epoch": 2.4261145433294375, + "grad_norm": 0.4905063271692703, + "learning_rate": 1.0726484861221902e-06, + "loss": 0.149, + "step": 7265 + }, + { + "epoch": 2.42644848889631, + "grad_norm": 0.5205108609752576, + "learning_rate": 1.0714460550600859e-06, + "loss": 0.1485, + "step": 7266 + }, + { + "epoch": 2.4267824344631825, + "grad_norm": 0.5231560925068052, + "learning_rate": 1.0702442174523132e-06, + "loss": 0.1552, + "step": 7267 + }, + { + "epoch": 2.4271163800300553, + "grad_norm": 0.4769453332959195, + "learning_rate": 1.0690429734804214e-06, + "loss": 0.1434, + "step": 7268 + }, + { + "epoch": 2.4274503255969275, + "grad_norm": 0.5312419128658573, + "learning_rate": 1.0678423233258755e-06, + "loss": 0.1514, + "step": 7269 + }, + { + "epoch": 2.4277842711638002, + "grad_norm": 0.5457120778252545, + "learning_rate": 1.0666422671700438e-06, + "loss": 0.1529, + "step": 7270 + }, + { + "epoch": 2.428118216730673, + "grad_norm": 0.5443040289592945, + "learning_rate": 1.065442805194214e-06, + "loss": 0.1643, + "step": 7271 + }, + { + "epoch": 2.4284521622975457, + "grad_norm": 0.5274373283039757, + "learning_rate": 1.0642439375795748e-06, + "loss": 0.1488, + "step": 7272 + }, + { + "epoch": 2.428786107864418, + "grad_norm": 0.5390642593642723, + "learning_rate": 1.0630456645072324e-06, + "loss": 0.1586, + "step": 7273 + }, + { + "epoch": 2.4291200534312907, + "grad_norm": 0.5089151199607201, + "learning_rate": 1.0618479861581971e-06, + "loss": 0.1537, + "step": 7274 + }, + { + "epoch": 2.4294539989981634, + "grad_norm": 0.5258576186084203, + "learning_rate": 1.060650902713395e-06, + "loss": 0.1556, + "step": 7275 + }, + { + "epoch": 2.429787944565036, + "grad_norm": 0.5113060552599415, + "learning_rate": 1.0594544143536572e-06, + "loss": 0.1517, + "step": 7276 + }, + { + "epoch": 2.4301218901319084, + "grad_norm": 0.5072388702252578, + "learning_rate": 1.0582585212597286e-06, + "loss": 0.1499, + "step": 7277 + }, + { + "epoch": 2.430455835698781, + "grad_norm": 0.5557065639821541, + "learning_rate": 1.0570632236122641e-06, + "loss": 0.1621, + "step": 7278 + }, + { + "epoch": 2.430789781265654, + "grad_norm": 0.4741104501250161, + "learning_rate": 1.0558685215918246e-06, + "loss": 0.139, + "step": 7279 + }, + { + "epoch": 2.431123726832526, + "grad_norm": 0.5313610251633167, + "learning_rate": 1.0546744153788858e-06, + "loss": 0.1554, + "step": 7280 + }, + { + "epoch": 2.431457672399399, + "grad_norm": 0.5144511403714003, + "learning_rate": 1.0534809051538324e-06, + "loss": 0.1471, + "step": 7281 + }, + { + "epoch": 2.4317916179662715, + "grad_norm": 0.5905335390702079, + "learning_rate": 1.0522879910969563e-06, + "loss": 0.1742, + "step": 7282 + }, + { + "epoch": 2.4321255635331442, + "grad_norm": 0.49916027008308816, + "learning_rate": 1.0510956733884614e-06, + "loss": 0.1487, + "step": 7283 + }, + { + "epoch": 2.4324595091000165, + "grad_norm": 0.5277947791001935, + "learning_rate": 1.0499039522084637e-06, + "loss": 0.1534, + "step": 7284 + }, + { + "epoch": 2.4327934546668892, + "grad_norm": 0.5430657315225573, + "learning_rate": 1.0487128277369829e-06, + "loss": 0.1649, + "step": 7285 + }, + { + "epoch": 2.433127400233762, + "grad_norm": 0.539630403103098, + "learning_rate": 1.0475223001539564e-06, + "loss": 0.1602, + "step": 7286 + }, + { + "epoch": 2.4334613458006347, + "grad_norm": 0.5264937727044785, + "learning_rate": 1.0463323696392236e-06, + "loss": 0.1561, + "step": 7287 + }, + { + "epoch": 2.433795291367507, + "grad_norm": 0.47187721488243567, + "learning_rate": 1.0451430363725395e-06, + "loss": 0.1526, + "step": 7288 + }, + { + "epoch": 2.4341292369343797, + "grad_norm": 0.532862403776023, + "learning_rate": 1.043954300533566e-06, + "loss": 0.1615, + "step": 7289 + }, + { + "epoch": 2.4344631825012524, + "grad_norm": 0.540835650588637, + "learning_rate": 1.0427661623018786e-06, + "loss": 0.1648, + "step": 7290 + }, + { + "epoch": 2.4347971280681246, + "grad_norm": 0.5395899185409637, + "learning_rate": 1.0415786218569557e-06, + "loss": 0.1655, + "step": 7291 + }, + { + "epoch": 2.4351310736349974, + "grad_norm": 0.5003448121019961, + "learning_rate": 1.0403916793781922e-06, + "loss": 0.1498, + "step": 7292 + }, + { + "epoch": 2.43546501920187, + "grad_norm": 0.4713027523101309, + "learning_rate": 1.0392053350448867e-06, + "loss": 0.14, + "step": 7293 + }, + { + "epoch": 2.435798964768743, + "grad_norm": 0.49901626838835894, + "learning_rate": 1.0380195890362527e-06, + "loss": 0.1591, + "step": 7294 + }, + { + "epoch": 2.4361329103356155, + "grad_norm": 0.4962709060760146, + "learning_rate": 1.0368344415314101e-06, + "loss": 0.1559, + "step": 7295 + }, + { + "epoch": 2.436466855902488, + "grad_norm": 0.5031119480742257, + "learning_rate": 1.0356498927093916e-06, + "loss": 0.1496, + "step": 7296 + }, + { + "epoch": 2.4368008014693605, + "grad_norm": 0.4904804267379514, + "learning_rate": 1.0344659427491343e-06, + "loss": 0.1566, + "step": 7297 + }, + { + "epoch": 2.4371347470362332, + "grad_norm": 0.5219797130767747, + "learning_rate": 1.0332825918294898e-06, + "loss": 0.1554, + "step": 7298 + }, + { + "epoch": 2.4374686926031055, + "grad_norm": 0.5189968707324387, + "learning_rate": 1.0320998401292154e-06, + "loss": 0.1472, + "step": 7299 + }, + { + "epoch": 2.437802638169978, + "grad_norm": 0.5253702042666821, + "learning_rate": 1.0309176878269806e-06, + "loss": 0.1489, + "step": 7300 + }, + { + "epoch": 2.438136583736851, + "grad_norm": 0.49138094461283727, + "learning_rate": 1.0297361351013646e-06, + "loss": 0.1465, + "step": 7301 + }, + { + "epoch": 2.4384705293037237, + "grad_norm": 0.5309848647784869, + "learning_rate": 1.028555182130853e-06, + "loss": 0.1605, + "step": 7302 + }, + { + "epoch": 2.438804474870596, + "grad_norm": 0.538951743332172, + "learning_rate": 1.027374829093843e-06, + "loss": 0.1658, + "step": 7303 + }, + { + "epoch": 2.4391384204374686, + "grad_norm": 0.5411757444422154, + "learning_rate": 1.0261950761686423e-06, + "loss": 0.1581, + "step": 7304 + }, + { + "epoch": 2.4394723660043414, + "grad_norm": 0.5200715406012469, + "learning_rate": 1.0250159235334645e-06, + "loss": 0.158, + "step": 7305 + }, + { + "epoch": 2.439806311571214, + "grad_norm": 0.558830853232633, + "learning_rate": 1.0238373713664351e-06, + "loss": 0.1662, + "step": 7306 + }, + { + "epoch": 2.4401402571380864, + "grad_norm": 0.4868370076213395, + "learning_rate": 1.0226594198455903e-06, + "loss": 0.146, + "step": 7307 + }, + { + "epoch": 2.440474202704959, + "grad_norm": 0.49711149600547216, + "learning_rate": 1.0214820691488698e-06, + "loss": 0.1514, + "step": 7308 + }, + { + "epoch": 2.440808148271832, + "grad_norm": 0.5147765431980813, + "learning_rate": 1.02030531945413e-06, + "loss": 0.1631, + "step": 7309 + }, + { + "epoch": 2.441142093838704, + "grad_norm": 0.5618697583716376, + "learning_rate": 1.0191291709391298e-06, + "loss": 0.1437, + "step": 7310 + }, + { + "epoch": 2.441476039405577, + "grad_norm": 0.5719539944950716, + "learning_rate": 1.0179536237815413e-06, + "loss": 0.1691, + "step": 7311 + }, + { + "epoch": 2.4418099849724495, + "grad_norm": 0.5234878850474092, + "learning_rate": 1.016778678158945e-06, + "loss": 0.155, + "step": 7312 + }, + { + "epoch": 2.442143930539322, + "grad_norm": 0.5452399489884633, + "learning_rate": 1.015604334248832e-06, + "loss": 0.1589, + "step": 7313 + }, + { + "epoch": 2.442477876106195, + "grad_norm": 0.5534133198761949, + "learning_rate": 1.0144305922285975e-06, + "loss": 0.1523, + "step": 7314 + }, + { + "epoch": 2.442811821673067, + "grad_norm": 0.5535102460042848, + "learning_rate": 1.0132574522755518e-06, + "loss": 0.1592, + "step": 7315 + }, + { + "epoch": 2.44314576723994, + "grad_norm": 0.5204726820983054, + "learning_rate": 1.0120849145669093e-06, + "loss": 0.1534, + "step": 7316 + }, + { + "epoch": 2.4434797128068126, + "grad_norm": 0.5244522290512145, + "learning_rate": 1.010912979279796e-06, + "loss": 0.161, + "step": 7317 + }, + { + "epoch": 2.443813658373685, + "grad_norm": 0.4986635964399862, + "learning_rate": 1.009741646591248e-06, + "loss": 0.1537, + "step": 7318 + }, + { + "epoch": 2.4441476039405576, + "grad_norm": 0.5206734432989935, + "learning_rate": 1.0085709166782088e-06, + "loss": 0.1612, + "step": 7319 + }, + { + "epoch": 2.4444815495074304, + "grad_norm": 0.5336710529700994, + "learning_rate": 1.0074007897175291e-06, + "loss": 0.1531, + "step": 7320 + }, + { + "epoch": 2.444815495074303, + "grad_norm": 0.5533500819209353, + "learning_rate": 1.0062312658859723e-06, + "loss": 0.1611, + "step": 7321 + }, + { + "epoch": 2.4451494406411753, + "grad_norm": 0.5658983196079189, + "learning_rate": 1.0050623453602075e-06, + "loss": 0.1667, + "step": 7322 + }, + { + "epoch": 2.445483386208048, + "grad_norm": 0.5287095854597083, + "learning_rate": 1.0038940283168136e-06, + "loss": 0.1505, + "step": 7323 + }, + { + "epoch": 2.4458173317749208, + "grad_norm": 0.5375178265696032, + "learning_rate": 1.0027263149322797e-06, + "loss": 0.1637, + "step": 7324 + }, + { + "epoch": 2.4461512773417935, + "grad_norm": 0.47445833611149235, + "learning_rate": 1.001559205383003e-06, + "loss": 0.1446, + "step": 7325 + }, + { + "epoch": 2.4464852229086658, + "grad_norm": 0.4768203964078314, + "learning_rate": 1.000392699845288e-06, + "loss": 0.1345, + "step": 7326 + }, + { + "epoch": 2.4468191684755385, + "grad_norm": 0.5206013386975504, + "learning_rate": 9.992267984953503e-07, + "loss": 0.1528, + "step": 7327 + }, + { + "epoch": 2.447153114042411, + "grad_norm": 0.5182522042264855, + "learning_rate": 9.98061501509311e-07, + "loss": 0.1566, + "step": 7328 + }, + { + "epoch": 2.4474870596092835, + "grad_norm": 0.505189403897211, + "learning_rate": 9.968968090632032e-07, + "loss": 0.1522, + "step": 7329 + }, + { + "epoch": 2.447821005176156, + "grad_norm": 0.5362073600754579, + "learning_rate": 9.957327213329687e-07, + "loss": 0.1546, + "step": 7330 + }, + { + "epoch": 2.448154950743029, + "grad_norm": 0.5102731648397336, + "learning_rate": 9.945692384944544e-07, + "loss": 0.1461, + "step": 7331 + }, + { + "epoch": 2.4484888963099016, + "grad_norm": 0.5226654494134502, + "learning_rate": 9.934063607234202e-07, + "loss": 0.1532, + "step": 7332 + }, + { + "epoch": 2.448822841876774, + "grad_norm": 0.5230643942856512, + "learning_rate": 9.922440881955298e-07, + "loss": 0.1523, + "step": 7333 + }, + { + "epoch": 2.4491567874436466, + "grad_norm": 0.5655913741188585, + "learning_rate": 9.910824210863611e-07, + "loss": 0.1639, + "step": 7334 + }, + { + "epoch": 2.4494907330105193, + "grad_norm": 0.5386421493301948, + "learning_rate": 9.899213595713935e-07, + "loss": 0.1628, + "step": 7335 + }, + { + "epoch": 2.449824678577392, + "grad_norm": 0.49776939645132456, + "learning_rate": 9.887609038260243e-07, + "loss": 0.1549, + "step": 7336 + }, + { + "epoch": 2.4501586241442643, + "grad_norm": 0.558716304445521, + "learning_rate": 9.876010540255504e-07, + "loss": 0.1635, + "step": 7337 + }, + { + "epoch": 2.450492569711137, + "grad_norm": 0.4917972097451291, + "learning_rate": 9.86441810345183e-07, + "loss": 0.1527, + "step": 7338 + }, + { + "epoch": 2.4508265152780098, + "grad_norm": 0.5377757832010998, + "learning_rate": 9.852831729600365e-07, + "loss": 0.162, + "step": 7339 + }, + { + "epoch": 2.451160460844882, + "grad_norm": 0.4868757826124942, + "learning_rate": 9.841251420451398e-07, + "loss": 0.1444, + "step": 7340 + }, + { + "epoch": 2.4514944064117548, + "grad_norm": 0.5439255876381551, + "learning_rate": 9.829677177754231e-07, + "loss": 0.1568, + "step": 7341 + }, + { + "epoch": 2.4518283519786275, + "grad_norm": 0.5411457197387434, + "learning_rate": 9.818109003257348e-07, + "loss": 0.1649, + "step": 7342 + }, + { + "epoch": 2.4521622975455, + "grad_norm": 0.5373406277835615, + "learning_rate": 9.806546898708213e-07, + "loss": 0.1587, + "step": 7343 + }, + { + "epoch": 2.452496243112373, + "grad_norm": 0.5184857587685223, + "learning_rate": 9.794990865853444e-07, + "loss": 0.1553, + "step": 7344 + }, + { + "epoch": 2.452830188679245, + "grad_norm": 0.4972829968409121, + "learning_rate": 9.783440906438686e-07, + "loss": 0.1518, + "step": 7345 + }, + { + "epoch": 2.453164134246118, + "grad_norm": 0.5263298934429176, + "learning_rate": 9.771897022208732e-07, + "loss": 0.1605, + "step": 7346 + }, + { + "epoch": 2.4534980798129906, + "grad_norm": 0.5039481262297244, + "learning_rate": 9.760359214907372e-07, + "loss": 0.1489, + "step": 7347 + }, + { + "epoch": 2.453832025379863, + "grad_norm": 0.5540379369889635, + "learning_rate": 9.74882748627759e-07, + "loss": 0.1582, + "step": 7348 + }, + { + "epoch": 2.4541659709467356, + "grad_norm": 0.5561596840406983, + "learning_rate": 9.737301838061342e-07, + "loss": 0.1647, + "step": 7349 + }, + { + "epoch": 2.4544999165136083, + "grad_norm": 0.5045374898134211, + "learning_rate": 9.725782271999744e-07, + "loss": 0.1477, + "step": 7350 + }, + { + "epoch": 2.454833862080481, + "grad_norm": 0.49671437127259194, + "learning_rate": 9.714268789832937e-07, + "loss": 0.1501, + "step": 7351 + }, + { + "epoch": 2.4551678076473533, + "grad_norm": 0.5284559165780311, + "learning_rate": 9.702761393300176e-07, + "loss": 0.1578, + "step": 7352 + }, + { + "epoch": 2.455501753214226, + "grad_norm": 0.5117383212099872, + "learning_rate": 9.691260084139802e-07, + "loss": 0.1496, + "step": 7353 + }, + { + "epoch": 2.4558356987810988, + "grad_norm": 0.526036454760328, + "learning_rate": 9.679764864089203e-07, + "loss": 0.1562, + "step": 7354 + }, + { + "epoch": 2.4561696443479715, + "grad_norm": 0.49859254666734404, + "learning_rate": 9.668275734884885e-07, + "loss": 0.1517, + "step": 7355 + }, + { + "epoch": 2.4565035899148437, + "grad_norm": 0.5353355704871284, + "learning_rate": 9.656792698262402e-07, + "loss": 0.1527, + "step": 7356 + }, + { + "epoch": 2.4568375354817165, + "grad_norm": 0.5062378256192873, + "learning_rate": 9.645315755956413e-07, + "loss": 0.1534, + "step": 7357 + }, + { + "epoch": 2.457171481048589, + "grad_norm": 0.46223832301303525, + "learning_rate": 9.633844909700618e-07, + "loss": 0.1401, + "step": 7358 + }, + { + "epoch": 2.4575054266154615, + "grad_norm": 0.5524054906187581, + "learning_rate": 9.622380161227873e-07, + "loss": 0.161, + "step": 7359 + }, + { + "epoch": 2.457839372182334, + "grad_norm": 0.5056246623447084, + "learning_rate": 9.61092151227002e-07, + "loss": 0.1548, + "step": 7360 + }, + { + "epoch": 2.458173317749207, + "grad_norm": 0.5121677065114587, + "learning_rate": 9.599468964558051e-07, + "loss": 0.1459, + "step": 7361 + }, + { + "epoch": 2.4585072633160796, + "grad_norm": 0.5588974512360972, + "learning_rate": 9.588022519821983e-07, + "loss": 0.161, + "step": 7362 + }, + { + "epoch": 2.4588412088829523, + "grad_norm": 0.5641228129018973, + "learning_rate": 9.576582179790967e-07, + "loss": 0.1607, + "step": 7363 + }, + { + "epoch": 2.4591751544498246, + "grad_norm": 0.5565054266004965, + "learning_rate": 9.565147946193149e-07, + "loss": 0.1578, + "step": 7364 + }, + { + "epoch": 2.4595091000166973, + "grad_norm": 0.5954752685742998, + "learning_rate": 9.553719820755869e-07, + "loss": 0.1652, + "step": 7365 + }, + { + "epoch": 2.45984304558357, + "grad_norm": 0.5018003233841967, + "learning_rate": 9.542297805205436e-07, + "loss": 0.1422, + "step": 7366 + }, + { + "epoch": 2.4601769911504423, + "grad_norm": 0.5505488375112121, + "learning_rate": 9.530881901267308e-07, + "loss": 0.1484, + "step": 7367 + }, + { + "epoch": 2.460510936717315, + "grad_norm": 0.5195725678865587, + "learning_rate": 9.519472110665967e-07, + "loss": 0.1467, + "step": 7368 + }, + { + "epoch": 2.4608448822841877, + "grad_norm": 0.5151373816065521, + "learning_rate": 9.508068435125012e-07, + "loss": 0.1553, + "step": 7369 + }, + { + "epoch": 2.4611788278510605, + "grad_norm": 0.5281539463812102, + "learning_rate": 9.496670876367076e-07, + "loss": 0.1568, + "step": 7370 + }, + { + "epoch": 2.4615127734179327, + "grad_norm": 0.5334649152791151, + "learning_rate": 9.485279436113942e-07, + "loss": 0.1671, + "step": 7371 + }, + { + "epoch": 2.4618467189848054, + "grad_norm": 0.541217305277803, + "learning_rate": 9.473894116086379e-07, + "loss": 0.1582, + "step": 7372 + }, + { + "epoch": 2.462180664551678, + "grad_norm": 0.4819910188513477, + "learning_rate": 9.462514918004301e-07, + "loss": 0.1401, + "step": 7373 + }, + { + "epoch": 2.462514610118551, + "grad_norm": 0.5450310802132138, + "learning_rate": 9.451141843586647e-07, + "loss": 0.1556, + "step": 7374 + }, + { + "epoch": 2.462848555685423, + "grad_norm": 0.5413919382336527, + "learning_rate": 9.439774894551479e-07, + "loss": 0.1603, + "step": 7375 + }, + { + "epoch": 2.463182501252296, + "grad_norm": 0.5547122615105633, + "learning_rate": 9.428414072615877e-07, + "loss": 0.1618, + "step": 7376 + }, + { + "epoch": 2.4635164468191686, + "grad_norm": 0.5160881731381368, + "learning_rate": 9.417059379496047e-07, + "loss": 0.1609, + "step": 7377 + }, + { + "epoch": 2.463850392386041, + "grad_norm": 0.5382470923154085, + "learning_rate": 9.40571081690726e-07, + "loss": 0.1624, + "step": 7378 + }, + { + "epoch": 2.4641843379529136, + "grad_norm": 0.5136862931411745, + "learning_rate": 9.394368386563823e-07, + "loss": 0.1537, + "step": 7379 + }, + { + "epoch": 2.4645182835197863, + "grad_norm": 0.5130561945345334, + "learning_rate": 9.383032090179173e-07, + "loss": 0.1595, + "step": 7380 + }, + { + "epoch": 2.464852229086659, + "grad_norm": 0.4979229387087476, + "learning_rate": 9.371701929465759e-07, + "loss": 0.1538, + "step": 7381 + }, + { + "epoch": 2.4651861746535313, + "grad_norm": 0.5288085451267187, + "learning_rate": 9.360377906135148e-07, + "loss": 0.1653, + "step": 7382 + }, + { + "epoch": 2.465520120220404, + "grad_norm": 0.5246843887272186, + "learning_rate": 9.349060021897976e-07, + "loss": 0.1599, + "step": 7383 + }, + { + "epoch": 2.4658540657872767, + "grad_norm": 0.5128075936979838, + "learning_rate": 9.337748278463948e-07, + "loss": 0.1592, + "step": 7384 + }, + { + "epoch": 2.4661880113541494, + "grad_norm": 0.5244124581207874, + "learning_rate": 9.326442677541813e-07, + "loss": 0.1576, + "step": 7385 + }, + { + "epoch": 2.4665219569210217, + "grad_norm": 0.5204631164355119, + "learning_rate": 9.31514322083944e-07, + "loss": 0.153, + "step": 7386 + }, + { + "epoch": 2.4668559024878944, + "grad_norm": 0.5045026609152344, + "learning_rate": 9.303849910063717e-07, + "loss": 0.1523, + "step": 7387 + }, + { + "epoch": 2.467189848054767, + "grad_norm": 0.5230456092218726, + "learning_rate": 9.292562746920647e-07, + "loss": 0.1592, + "step": 7388 + }, + { + "epoch": 2.4675237936216394, + "grad_norm": 0.5378631540624517, + "learning_rate": 9.281281733115288e-07, + "loss": 0.1546, + "step": 7389 + }, + { + "epoch": 2.467857739188512, + "grad_norm": 0.5030886402668243, + "learning_rate": 9.270006870351789e-07, + "loss": 0.1416, + "step": 7390 + }, + { + "epoch": 2.468191684755385, + "grad_norm": 0.5286770365980373, + "learning_rate": 9.258738160333314e-07, + "loss": 0.1533, + "step": 7391 + }, + { + "epoch": 2.4685256303222576, + "grad_norm": 0.5174511810173182, + "learning_rate": 9.247475604762168e-07, + "loss": 0.1629, + "step": 7392 + }, + { + "epoch": 2.4688595758891303, + "grad_norm": 0.5775562479058309, + "learning_rate": 9.236219205339647e-07, + "loss": 0.1591, + "step": 7393 + }, + { + "epoch": 2.4691935214560026, + "grad_norm": 0.5530670682386785, + "learning_rate": 9.224968963766223e-07, + "loss": 0.1617, + "step": 7394 + }, + { + "epoch": 2.4695274670228753, + "grad_norm": 0.5588841493266121, + "learning_rate": 9.213724881741337e-07, + "loss": 0.1697, + "step": 7395 + }, + { + "epoch": 2.469861412589748, + "grad_norm": 0.496013975272932, + "learning_rate": 9.202486960963559e-07, + "loss": 0.1531, + "step": 7396 + }, + { + "epoch": 2.4701953581566203, + "grad_norm": 0.5076710510224289, + "learning_rate": 9.191255203130489e-07, + "loss": 0.1587, + "step": 7397 + }, + { + "epoch": 2.470529303723493, + "grad_norm": 0.5020408243905019, + "learning_rate": 9.18002960993884e-07, + "loss": 0.1518, + "step": 7398 + }, + { + "epoch": 2.4708632492903657, + "grad_norm": 0.5348771472192876, + "learning_rate": 9.168810183084348e-07, + "loss": 0.1601, + "step": 7399 + }, + { + "epoch": 2.4711971948572384, + "grad_norm": 0.5044766853939331, + "learning_rate": 9.157596924261847e-07, + "loss": 0.1542, + "step": 7400 + }, + { + "epoch": 2.4715311404241107, + "grad_norm": 0.5757230051543882, + "learning_rate": 9.146389835165248e-07, + "loss": 0.1769, + "step": 7401 + }, + { + "epoch": 2.4718650859909834, + "grad_norm": 0.5052922924503145, + "learning_rate": 9.135188917487487e-07, + "loss": 0.1507, + "step": 7402 + }, + { + "epoch": 2.472199031557856, + "grad_norm": 0.5365350669677343, + "learning_rate": 9.12399417292062e-07, + "loss": 0.1488, + "step": 7403 + }, + { + "epoch": 2.472532977124729, + "grad_norm": 0.507603981314568, + "learning_rate": 9.112805603155716e-07, + "loss": 0.1534, + "step": 7404 + }, + { + "epoch": 2.472866922691601, + "grad_norm": 0.8258984417878298, + "learning_rate": 9.101623209882965e-07, + "loss": 0.157, + "step": 7405 + }, + { + "epoch": 2.473200868258474, + "grad_norm": 0.5625468162092421, + "learning_rate": 9.090446994791585e-07, + "loss": 0.1589, + "step": 7406 + }, + { + "epoch": 2.4735348138253466, + "grad_norm": 0.5349082547026425, + "learning_rate": 9.079276959569899e-07, + "loss": 0.1647, + "step": 7407 + }, + { + "epoch": 2.473868759392219, + "grad_norm": 0.5270376007769813, + "learning_rate": 9.068113105905235e-07, + "loss": 0.155, + "step": 7408 + }, + { + "epoch": 2.4742027049590916, + "grad_norm": 0.5257667676560155, + "learning_rate": 9.056955435484061e-07, + "loss": 0.1517, + "step": 7409 + }, + { + "epoch": 2.4745366505259643, + "grad_norm": 0.5310131029906042, + "learning_rate": 9.045803949991843e-07, + "loss": 0.1627, + "step": 7410 + }, + { + "epoch": 2.474870596092837, + "grad_norm": 0.5110165371528869, + "learning_rate": 9.034658651113154e-07, + "loss": 0.1547, + "step": 7411 + }, + { + "epoch": 2.4752045416597097, + "grad_norm": 0.5240970004480227, + "learning_rate": 9.023519540531633e-07, + "loss": 0.1574, + "step": 7412 + }, + { + "epoch": 2.475538487226582, + "grad_norm": 0.5146289857699795, + "learning_rate": 9.01238661992998e-07, + "loss": 0.1519, + "step": 7413 + }, + { + "epoch": 2.4758724327934547, + "grad_norm": 0.5125323382835603, + "learning_rate": 9.001259890989927e-07, + "loss": 0.1452, + "step": 7414 + }, + { + "epoch": 2.4762063783603274, + "grad_norm": 0.548528572666765, + "learning_rate": 8.990139355392324e-07, + "loss": 0.1633, + "step": 7415 + }, + { + "epoch": 2.4765403239271997, + "grad_norm": 0.5426017894190176, + "learning_rate": 8.979025014817039e-07, + "loss": 0.1554, + "step": 7416 + }, + { + "epoch": 2.4768742694940724, + "grad_norm": 0.5273871603200672, + "learning_rate": 8.967916870943028e-07, + "loss": 0.156, + "step": 7417 + }, + { + "epoch": 2.477208215060945, + "grad_norm": 0.5015673721907438, + "learning_rate": 8.956814925448309e-07, + "loss": 0.1551, + "step": 7418 + }, + { + "epoch": 2.477542160627818, + "grad_norm": 0.5351497045095686, + "learning_rate": 8.945719180009977e-07, + "loss": 0.1593, + "step": 7419 + }, + { + "epoch": 2.47787610619469, + "grad_norm": 0.5248370885000369, + "learning_rate": 8.934629636304149e-07, + "loss": 0.1505, + "step": 7420 + }, + { + "epoch": 2.478210051761563, + "grad_norm": 0.5879544927092775, + "learning_rate": 8.923546296006058e-07, + "loss": 0.166, + "step": 7421 + }, + { + "epoch": 2.4785439973284356, + "grad_norm": 0.5316868901778447, + "learning_rate": 8.912469160789944e-07, + "loss": 0.1548, + "step": 7422 + }, + { + "epoch": 2.4788779428953083, + "grad_norm": 0.5293767236125144, + "learning_rate": 8.901398232329156e-07, + "loss": 0.1555, + "step": 7423 + }, + { + "epoch": 2.4792118884621805, + "grad_norm": 0.5766018928369575, + "learning_rate": 8.890333512296095e-07, + "loss": 0.1685, + "step": 7424 + }, + { + "epoch": 2.4795458340290533, + "grad_norm": 0.49283680451273953, + "learning_rate": 8.879275002362197e-07, + "loss": 0.1481, + "step": 7425 + }, + { + "epoch": 2.479879779595926, + "grad_norm": 0.7372173936319003, + "learning_rate": 8.868222704198004e-07, + "loss": 0.1562, + "step": 7426 + }, + { + "epoch": 2.4802137251627983, + "grad_norm": 0.5586198661064684, + "learning_rate": 8.857176619473068e-07, + "loss": 0.1478, + "step": 7427 + }, + { + "epoch": 2.480547670729671, + "grad_norm": 0.48785260397998287, + "learning_rate": 8.846136749856044e-07, + "loss": 0.1443, + "step": 7428 + }, + { + "epoch": 2.4808816162965437, + "grad_norm": 0.5223348130739237, + "learning_rate": 8.835103097014636e-07, + "loss": 0.1563, + "step": 7429 + }, + { + "epoch": 2.4812155618634164, + "grad_norm": 0.5289422818248141, + "learning_rate": 8.824075662615617e-07, + "loss": 0.1539, + "step": 7430 + }, + { + "epoch": 2.4815495074302887, + "grad_norm": 0.5366617936145673, + "learning_rate": 8.813054448324792e-07, + "loss": 0.1564, + "step": 7431 + }, + { + "epoch": 2.4818834529971614, + "grad_norm": 0.5091155974176387, + "learning_rate": 8.80203945580706e-07, + "loss": 0.1421, + "step": 7432 + }, + { + "epoch": 2.482217398564034, + "grad_norm": 0.5162217183502404, + "learning_rate": 8.791030686726349e-07, + "loss": 0.1594, + "step": 7433 + }, + { + "epoch": 2.482551344130907, + "grad_norm": 0.53574017718763, + "learning_rate": 8.780028142745673e-07, + "loss": 0.158, + "step": 7434 + }, + { + "epoch": 2.482885289697779, + "grad_norm": 0.5170105505185464, + "learning_rate": 8.769031825527097e-07, + "loss": 0.1585, + "step": 7435 + }, + { + "epoch": 2.483219235264652, + "grad_norm": 0.544046520845691, + "learning_rate": 8.758041736731753e-07, + "loss": 0.1589, + "step": 7436 + }, + { + "epoch": 2.4835531808315245, + "grad_norm": 0.5531553610216963, + "learning_rate": 8.747057878019799e-07, + "loss": 0.1604, + "step": 7437 + }, + { + "epoch": 2.483887126398397, + "grad_norm": 0.4998496915162335, + "learning_rate": 8.736080251050505e-07, + "loss": 0.1451, + "step": 7438 + }, + { + "epoch": 2.4842210719652695, + "grad_norm": 0.5360141755777315, + "learning_rate": 8.725108857482145e-07, + "loss": 0.1601, + "step": 7439 + }, + { + "epoch": 2.4845550175321423, + "grad_norm": 0.5323788988525391, + "learning_rate": 8.714143698972083e-07, + "loss": 0.155, + "step": 7440 + }, + { + "epoch": 2.484888963099015, + "grad_norm": 0.48108964205325405, + "learning_rate": 8.703184777176743e-07, + "loss": 0.1426, + "step": 7441 + }, + { + "epoch": 2.4852229086658877, + "grad_norm": 0.5182627973890735, + "learning_rate": 8.692232093751613e-07, + "loss": 0.1491, + "step": 7442 + }, + { + "epoch": 2.48555685423276, + "grad_norm": 0.48494737578990094, + "learning_rate": 8.68128565035119e-07, + "loss": 0.1471, + "step": 7443 + }, + { + "epoch": 2.4858907997996327, + "grad_norm": 0.49811481612701375, + "learning_rate": 8.670345448629097e-07, + "loss": 0.1515, + "step": 7444 + }, + { + "epoch": 2.4862247453665054, + "grad_norm": 0.5458036809897917, + "learning_rate": 8.659411490237951e-07, + "loss": 0.1586, + "step": 7445 + }, + { + "epoch": 2.4865586909333777, + "grad_norm": 0.5294276700638828, + "learning_rate": 8.648483776829469e-07, + "loss": 0.1663, + "step": 7446 + }, + { + "epoch": 2.4868926365002504, + "grad_norm": 0.491083599496525, + "learning_rate": 8.637562310054425e-07, + "loss": 0.1483, + "step": 7447 + }, + { + "epoch": 2.487226582067123, + "grad_norm": 0.551263065477862, + "learning_rate": 8.626647091562612e-07, + "loss": 0.1617, + "step": 7448 + }, + { + "epoch": 2.487560527633996, + "grad_norm": 0.49702157711243133, + "learning_rate": 8.61573812300292e-07, + "loss": 0.1504, + "step": 7449 + }, + { + "epoch": 2.487894473200868, + "grad_norm": 0.5200659870372368, + "learning_rate": 8.604835406023254e-07, + "loss": 0.1598, + "step": 7450 + }, + { + "epoch": 2.488228418767741, + "grad_norm": 0.49403596255559235, + "learning_rate": 8.593938942270613e-07, + "loss": 0.1489, + "step": 7451 + }, + { + "epoch": 2.4885623643346135, + "grad_norm": 0.5572892670064632, + "learning_rate": 8.583048733391036e-07, + "loss": 0.1634, + "step": 7452 + }, + { + "epoch": 2.4888963099014862, + "grad_norm": 0.524050270984186, + "learning_rate": 8.57216478102963e-07, + "loss": 0.152, + "step": 7453 + }, + { + "epoch": 2.4892302554683585, + "grad_norm": 0.5450655789511571, + "learning_rate": 8.561287086830516e-07, + "loss": 0.1578, + "step": 7454 + }, + { + "epoch": 2.4895642010352312, + "grad_norm": 0.4938587213270162, + "learning_rate": 8.550415652436927e-07, + "loss": 0.1541, + "step": 7455 + }, + { + "epoch": 2.489898146602104, + "grad_norm": 0.5114068685962905, + "learning_rate": 8.539550479491093e-07, + "loss": 0.151, + "step": 7456 + }, + { + "epoch": 2.4902320921689762, + "grad_norm": 0.5449036600871238, + "learning_rate": 8.528691569634357e-07, + "loss": 0.1687, + "step": 7457 + }, + { + "epoch": 2.490566037735849, + "grad_norm": 0.5858119072813122, + "learning_rate": 8.517838924507039e-07, + "loss": 0.1708, + "step": 7458 + }, + { + "epoch": 2.4908999833027217, + "grad_norm": 0.5347934032616631, + "learning_rate": 8.50699254574861e-07, + "loss": 0.1602, + "step": 7459 + }, + { + "epoch": 2.4912339288695944, + "grad_norm": 0.511032385630495, + "learning_rate": 8.496152434997518e-07, + "loss": 0.1505, + "step": 7460 + }, + { + "epoch": 2.491567874436467, + "grad_norm": 0.5590357970061134, + "learning_rate": 8.485318593891295e-07, + "loss": 0.1422, + "step": 7461 + }, + { + "epoch": 2.4919018200033394, + "grad_norm": 0.5920055950996739, + "learning_rate": 8.474491024066512e-07, + "loss": 0.1667, + "step": 7462 + }, + { + "epoch": 2.492235765570212, + "grad_norm": 0.5492390339266932, + "learning_rate": 8.463669727158819e-07, + "loss": 0.1587, + "step": 7463 + }, + { + "epoch": 2.492569711137085, + "grad_norm": 0.526164253031194, + "learning_rate": 8.45285470480286e-07, + "loss": 0.1534, + "step": 7464 + }, + { + "epoch": 2.492903656703957, + "grad_norm": 0.5364159996478518, + "learning_rate": 8.442045958632428e-07, + "loss": 0.1597, + "step": 7465 + }, + { + "epoch": 2.49323760227083, + "grad_norm": 0.5481814028546967, + "learning_rate": 8.431243490280267e-07, + "loss": 0.1607, + "step": 7466 + }, + { + "epoch": 2.4935715478377025, + "grad_norm": 0.5497464097584005, + "learning_rate": 8.420447301378249e-07, + "loss": 0.1603, + "step": 7467 + }, + { + "epoch": 2.4939054934045752, + "grad_norm": 0.49499461566929853, + "learning_rate": 8.409657393557236e-07, + "loss": 0.1532, + "step": 7468 + }, + { + "epoch": 2.4942394389714475, + "grad_norm": 0.5280999383444667, + "learning_rate": 8.39887376844718e-07, + "loss": 0.1513, + "step": 7469 + }, + { + "epoch": 2.4945733845383202, + "grad_norm": 0.541491593990084, + "learning_rate": 8.388096427677095e-07, + "loss": 0.1575, + "step": 7470 + }, + { + "epoch": 2.494907330105193, + "grad_norm": 0.5567512139651408, + "learning_rate": 8.377325372874995e-07, + "loss": 0.1587, + "step": 7471 + }, + { + "epoch": 2.4952412756720657, + "grad_norm": 0.4789075537156718, + "learning_rate": 8.366560605668006e-07, + "loss": 0.1466, + "step": 7472 + }, + { + "epoch": 2.495575221238938, + "grad_norm": 0.5241227372625797, + "learning_rate": 8.355802127682238e-07, + "loss": 0.1575, + "step": 7473 + }, + { + "epoch": 2.4959091668058107, + "grad_norm": 0.5261817341228623, + "learning_rate": 8.345049940542904e-07, + "loss": 0.1473, + "step": 7474 + }, + { + "epoch": 2.4962431123726834, + "grad_norm": 0.5125544170513542, + "learning_rate": 8.334304045874248e-07, + "loss": 0.1622, + "step": 7475 + }, + { + "epoch": 2.4965770579395556, + "grad_norm": 0.4886019125041492, + "learning_rate": 8.323564445299575e-07, + "loss": 0.1477, + "step": 7476 + }, + { + "epoch": 2.4969110035064284, + "grad_norm": 0.46844423797008194, + "learning_rate": 8.312831140441207e-07, + "loss": 0.1444, + "step": 7477 + }, + { + "epoch": 2.497244949073301, + "grad_norm": 0.5183590771956148, + "learning_rate": 8.302104132920552e-07, + "loss": 0.1565, + "step": 7478 + }, + { + "epoch": 2.497578894640174, + "grad_norm": 0.5577300530187361, + "learning_rate": 8.291383424358041e-07, + "loss": 0.1617, + "step": 7479 + }, + { + "epoch": 2.497912840207046, + "grad_norm": 0.5636308968550843, + "learning_rate": 8.280669016373172e-07, + "loss": 0.1691, + "step": 7480 + }, + { + "epoch": 2.498246785773919, + "grad_norm": 0.5084283575774319, + "learning_rate": 8.269960910584457e-07, + "loss": 0.155, + "step": 7481 + }, + { + "epoch": 2.4985807313407915, + "grad_norm": 0.5246190756503197, + "learning_rate": 8.259259108609524e-07, + "loss": 0.1592, + "step": 7482 + }, + { + "epoch": 2.4989146769076642, + "grad_norm": 0.4765757170409379, + "learning_rate": 8.248563612064969e-07, + "loss": 0.1451, + "step": 7483 + }, + { + "epoch": 2.4992486224745365, + "grad_norm": 0.5183633745167002, + "learning_rate": 8.237874422566505e-07, + "loss": 0.1571, + "step": 7484 + }, + { + "epoch": 2.499582568041409, + "grad_norm": 0.49644317659578663, + "learning_rate": 8.227191541728829e-07, + "loss": 0.1486, + "step": 7485 + }, + { + "epoch": 2.499916513608282, + "grad_norm": 0.5302602187154111, + "learning_rate": 8.21651497116574e-07, + "loss": 0.1544, + "step": 7486 + }, + { + "epoch": 2.500250459175154, + "grad_norm": 0.5532769666070599, + "learning_rate": 8.205844712490024e-07, + "loss": 0.1677, + "step": 7487 + }, + { + "epoch": 2.500584404742027, + "grad_norm": 0.555633213185817, + "learning_rate": 8.195180767313604e-07, + "loss": 0.1645, + "step": 7488 + }, + { + "epoch": 2.5009183503088996, + "grad_norm": 0.5607026487442186, + "learning_rate": 8.184523137247346e-07, + "loss": 0.1576, + "step": 7489 + }, + { + "epoch": 2.5012522958757724, + "grad_norm": 0.5288814062908609, + "learning_rate": 8.173871823901247e-07, + "loss": 0.1486, + "step": 7490 + }, + { + "epoch": 2.501586241442645, + "grad_norm": 0.5171926986273441, + "learning_rate": 8.16322682888428e-07, + "loss": 0.1588, + "step": 7491 + }, + { + "epoch": 2.5019201870095173, + "grad_norm": 0.5293127044915098, + "learning_rate": 8.15258815380453e-07, + "loss": 0.1583, + "step": 7492 + }, + { + "epoch": 2.50225413257639, + "grad_norm": 0.5176749228194966, + "learning_rate": 8.141955800269058e-07, + "loss": 0.1549, + "step": 7493 + }, + { + "epoch": 2.502588078143263, + "grad_norm": 0.5159921514636313, + "learning_rate": 8.131329769884027e-07, + "loss": 0.1512, + "step": 7494 + }, + { + "epoch": 2.502922023710135, + "grad_norm": 0.5257063522299663, + "learning_rate": 8.120710064254634e-07, + "loss": 0.1536, + "step": 7495 + }, + { + "epoch": 2.5032559692770078, + "grad_norm": 0.5264810763788741, + "learning_rate": 8.110096684985086e-07, + "loss": 0.1622, + "step": 7496 + }, + { + "epoch": 2.5035899148438805, + "grad_norm": 0.5639565998252986, + "learning_rate": 8.099489633678676e-07, + "loss": 0.1718, + "step": 7497 + }, + { + "epoch": 2.503923860410753, + "grad_norm": 0.5502371897141942, + "learning_rate": 8.088888911937726e-07, + "loss": 0.1676, + "step": 7498 + }, + { + "epoch": 2.5042578059776255, + "grad_norm": 0.6154170180004221, + "learning_rate": 8.078294521363584e-07, + "loss": 0.1745, + "step": 7499 + }, + { + "epoch": 2.504591751544498, + "grad_norm": 0.5371791465358621, + "learning_rate": 8.067706463556663e-07, + "loss": 0.1613, + "step": 7500 + }, + { + "epoch": 2.504925697111371, + "grad_norm": 0.5207840621091431, + "learning_rate": 8.057124740116434e-07, + "loss": 0.1594, + "step": 7501 + }, + { + "epoch": 2.5052596426782436, + "grad_norm": 0.4840124847038789, + "learning_rate": 8.046549352641359e-07, + "loss": 0.1549, + "step": 7502 + }, + { + "epoch": 2.505593588245116, + "grad_norm": 0.4843225011059664, + "learning_rate": 8.035980302729008e-07, + "loss": 0.1444, + "step": 7503 + }, + { + "epoch": 2.5059275338119886, + "grad_norm": 0.5015309666946438, + "learning_rate": 8.025417591975926e-07, + "loss": 0.1495, + "step": 7504 + }, + { + "epoch": 2.5062614793788613, + "grad_norm": 0.5588866802180681, + "learning_rate": 8.014861221977749e-07, + "loss": 0.1632, + "step": 7505 + }, + { + "epoch": 2.5065954249457336, + "grad_norm": 0.5258734952648931, + "learning_rate": 8.004311194329145e-07, + "loss": 0.1564, + "step": 7506 + }, + { + "epoch": 2.5069293705126063, + "grad_norm": 0.5018390452316217, + "learning_rate": 7.993767510623834e-07, + "loss": 0.1539, + "step": 7507 + }, + { + "epoch": 2.507263316079479, + "grad_norm": 0.5238239925463696, + "learning_rate": 7.983230172454531e-07, + "loss": 0.1519, + "step": 7508 + }, + { + "epoch": 2.5075972616463518, + "grad_norm": 0.5573481196677725, + "learning_rate": 7.972699181413058e-07, + "loss": 0.1591, + "step": 7509 + }, + { + "epoch": 2.5079312072132245, + "grad_norm": 0.5166173807653006, + "learning_rate": 7.962174539090201e-07, + "loss": 0.1494, + "step": 7510 + }, + { + "epoch": 2.5082651527800968, + "grad_norm": 0.5055095687381181, + "learning_rate": 7.951656247075884e-07, + "loss": 0.1519, + "step": 7511 + }, + { + "epoch": 2.5085990983469695, + "grad_norm": 0.5560200453792518, + "learning_rate": 7.941144306958986e-07, + "loss": 0.1676, + "step": 7512 + }, + { + "epoch": 2.508933043913842, + "grad_norm": 0.539443573117671, + "learning_rate": 7.930638720327477e-07, + "loss": 0.1644, + "step": 7513 + }, + { + "epoch": 2.5092669894807145, + "grad_norm": 0.5078255433817627, + "learning_rate": 7.920139488768325e-07, + "loss": 0.1542, + "step": 7514 + }, + { + "epoch": 2.509600935047587, + "grad_norm": 0.5588296693905245, + "learning_rate": 7.909646613867594e-07, + "loss": 0.1689, + "step": 7515 + }, + { + "epoch": 2.50993488061446, + "grad_norm": 0.5243310356959708, + "learning_rate": 7.899160097210329e-07, + "loss": 0.1535, + "step": 7516 + }, + { + "epoch": 2.510268826181332, + "grad_norm": 0.4984528436569994, + "learning_rate": 7.888679940380644e-07, + "loss": 0.1573, + "step": 7517 + }, + { + "epoch": 2.510602771748205, + "grad_norm": 0.4913467718422485, + "learning_rate": 7.87820614496172e-07, + "loss": 0.1558, + "step": 7518 + }, + { + "epoch": 2.5109367173150776, + "grad_norm": 0.5518878812557201, + "learning_rate": 7.867738712535711e-07, + "loss": 0.1622, + "step": 7519 + }, + { + "epoch": 2.5112706628819503, + "grad_norm": 0.5117546474410686, + "learning_rate": 7.857277644683858e-07, + "loss": 0.151, + "step": 7520 + }, + { + "epoch": 2.511604608448823, + "grad_norm": 0.5716066786013106, + "learning_rate": 7.846822942986449e-07, + "loss": 0.1585, + "step": 7521 + }, + { + "epoch": 2.5119385540156953, + "grad_norm": 0.526200387575101, + "learning_rate": 7.836374609022756e-07, + "loss": 0.146, + "step": 7522 + }, + { + "epoch": 2.512272499582568, + "grad_norm": 0.5237426581490771, + "learning_rate": 7.825932644371137e-07, + "loss": 0.1565, + "step": 7523 + }, + { + "epoch": 2.5126064451494408, + "grad_norm": 0.5157049326498191, + "learning_rate": 7.815497050608989e-07, + "loss": 0.1586, + "step": 7524 + }, + { + "epoch": 2.512940390716313, + "grad_norm": 0.5075710985330287, + "learning_rate": 7.805067829312707e-07, + "loss": 0.1526, + "step": 7525 + }, + { + "epoch": 2.5132743362831858, + "grad_norm": 0.5214226974333179, + "learning_rate": 7.79464498205777e-07, + "loss": 0.153, + "step": 7526 + }, + { + "epoch": 2.5136082818500585, + "grad_norm": 0.519026372135375, + "learning_rate": 7.78422851041865e-07, + "loss": 0.1557, + "step": 7527 + }, + { + "epoch": 2.513942227416931, + "grad_norm": 0.5682299743488699, + "learning_rate": 7.773818415968887e-07, + "loss": 0.1679, + "step": 7528 + }, + { + "epoch": 2.514276172983804, + "grad_norm": 0.527132465649919, + "learning_rate": 7.763414700281053e-07, + "loss": 0.1512, + "step": 7529 + }, + { + "epoch": 2.514610118550676, + "grad_norm": 0.5662292574706226, + "learning_rate": 7.753017364926757e-07, + "loss": 0.1618, + "step": 7530 + }, + { + "epoch": 2.514944064117549, + "grad_norm": 0.5394910775395201, + "learning_rate": 7.742626411476617e-07, + "loss": 0.1461, + "step": 7531 + }, + { + "epoch": 2.5152780096844216, + "grad_norm": 0.5124017265010042, + "learning_rate": 7.732241841500332e-07, + "loss": 0.1497, + "step": 7532 + }, + { + "epoch": 2.515611955251294, + "grad_norm": 0.5236527981564442, + "learning_rate": 7.721863656566597e-07, + "loss": 0.1495, + "step": 7533 + }, + { + "epoch": 2.5159459008181666, + "grad_norm": 0.45904066998357373, + "learning_rate": 7.711491858243164e-07, + "loss": 0.1397, + "step": 7534 + }, + { + "epoch": 2.5162798463850393, + "grad_norm": 0.5023271030916909, + "learning_rate": 7.701126448096813e-07, + "loss": 0.1515, + "step": 7535 + }, + { + "epoch": 2.5166137919519116, + "grad_norm": 0.5461686273133802, + "learning_rate": 7.69076742769338e-07, + "loss": 0.1552, + "step": 7536 + }, + { + "epoch": 2.5169477375187843, + "grad_norm": 0.4674555571007729, + "learning_rate": 7.68041479859769e-07, + "loss": 0.1399, + "step": 7537 + }, + { + "epoch": 2.517281683085657, + "grad_norm": 0.5515888155967048, + "learning_rate": 7.670068562373656e-07, + "loss": 0.1637, + "step": 7538 + }, + { + "epoch": 2.5176156286525297, + "grad_norm": 0.5957056156113891, + "learning_rate": 7.65972872058417e-07, + "loss": 0.1634, + "step": 7539 + }, + { + "epoch": 2.5179495742194025, + "grad_norm": 0.5036132612130619, + "learning_rate": 7.6493952747912e-07, + "loss": 0.1535, + "step": 7540 + }, + { + "epoch": 2.5182835197862747, + "grad_norm": 0.5190668427338941, + "learning_rate": 7.639068226555751e-07, + "loss": 0.1519, + "step": 7541 + }, + { + "epoch": 2.5186174653531475, + "grad_norm": 0.5037790938432386, + "learning_rate": 7.628747577437817e-07, + "loss": 0.1524, + "step": 7542 + }, + { + "epoch": 2.51895141092002, + "grad_norm": 0.5004546836079737, + "learning_rate": 7.618433328996466e-07, + "loss": 0.1532, + "step": 7543 + }, + { + "epoch": 2.5192853564868924, + "grad_norm": 0.5501512241433032, + "learning_rate": 7.608125482789802e-07, + "loss": 0.1653, + "step": 7544 + }, + { + "epoch": 2.519619302053765, + "grad_norm": 0.6007238389340253, + "learning_rate": 7.597824040374918e-07, + "loss": 0.1758, + "step": 7545 + }, + { + "epoch": 2.519953247620638, + "grad_norm": 0.49556874191589473, + "learning_rate": 7.587529003307981e-07, + "loss": 0.1545, + "step": 7546 + }, + { + "epoch": 2.5202871931875106, + "grad_norm": 0.5251067061399489, + "learning_rate": 7.57724037314419e-07, + "loss": 0.1582, + "step": 7547 + }, + { + "epoch": 2.520621138754383, + "grad_norm": 0.5381560130179013, + "learning_rate": 7.566958151437743e-07, + "loss": 0.159, + "step": 7548 + }, + { + "epoch": 2.5209550843212556, + "grad_norm": 0.4986522998380296, + "learning_rate": 7.556682339741911e-07, + "loss": 0.1498, + "step": 7549 + }, + { + "epoch": 2.5212890298881283, + "grad_norm": 0.4996807633077003, + "learning_rate": 7.546412939608955e-07, + "loss": 0.1471, + "step": 7550 + }, + { + "epoch": 2.521622975455001, + "grad_norm": 0.6054073039657178, + "learning_rate": 7.5361499525902e-07, + "loss": 0.1613, + "step": 7551 + }, + { + "epoch": 2.5219569210218733, + "grad_norm": 0.48971374377963084, + "learning_rate": 7.525893380235988e-07, + "loss": 0.1441, + "step": 7552 + }, + { + "epoch": 2.522290866588746, + "grad_norm": 0.5391665062112588, + "learning_rate": 7.515643224095709e-07, + "loss": 0.1617, + "step": 7553 + }, + { + "epoch": 2.5226248121556187, + "grad_norm": 0.5135011269872816, + "learning_rate": 7.505399485717746e-07, + "loss": 0.1564, + "step": 7554 + }, + { + "epoch": 2.522958757722491, + "grad_norm": 0.5283825677531684, + "learning_rate": 7.495162166649561e-07, + "loss": 0.1627, + "step": 7555 + }, + { + "epoch": 2.5232927032893637, + "grad_norm": 0.5178565756705336, + "learning_rate": 7.484931268437595e-07, + "loss": 0.1548, + "step": 7556 + }, + { + "epoch": 2.5236266488562364, + "grad_norm": 0.5152821898488568, + "learning_rate": 7.474706792627362e-07, + "loss": 0.1573, + "step": 7557 + }, + { + "epoch": 2.523960594423109, + "grad_norm": 0.5314921955835342, + "learning_rate": 7.464488740763387e-07, + "loss": 0.152, + "step": 7558 + }, + { + "epoch": 2.524294539989982, + "grad_norm": 0.4911118017182647, + "learning_rate": 7.454277114389241e-07, + "loss": 0.1472, + "step": 7559 + }, + { + "epoch": 2.524628485556854, + "grad_norm": 0.5249373913173525, + "learning_rate": 7.444071915047479e-07, + "loss": 0.1576, + "step": 7560 + }, + { + "epoch": 2.524962431123727, + "grad_norm": 0.5337994119643109, + "learning_rate": 7.433873144279751e-07, + "loss": 0.1643, + "step": 7561 + }, + { + "epoch": 2.5252963766905996, + "grad_norm": 0.6119106329771593, + "learning_rate": 7.42368080362667e-07, + "loss": 0.1687, + "step": 7562 + }, + { + "epoch": 2.525630322257472, + "grad_norm": 0.5577030632990558, + "learning_rate": 7.413494894627926e-07, + "loss": 0.1777, + "step": 7563 + }, + { + "epoch": 2.5259642678243446, + "grad_norm": 0.5203702420065588, + "learning_rate": 7.403315418822215e-07, + "loss": 0.1485, + "step": 7564 + }, + { + "epoch": 2.5262982133912173, + "grad_norm": 0.5038351465089509, + "learning_rate": 7.393142377747287e-07, + "loss": 0.1475, + "step": 7565 + }, + { + "epoch": 2.5266321589580896, + "grad_norm": 0.48326180757043785, + "learning_rate": 7.382975772939866e-07, + "loss": 0.1471, + "step": 7566 + }, + { + "epoch": 2.5269661045249623, + "grad_norm": 0.5366216732331346, + "learning_rate": 7.372815605935763e-07, + "loss": 0.1617, + "step": 7567 + }, + { + "epoch": 2.527300050091835, + "grad_norm": 0.5061743583902273, + "learning_rate": 7.362661878269772e-07, + "loss": 0.1571, + "step": 7568 + }, + { + "epoch": 2.5276339956587077, + "grad_norm": 0.5307042051406543, + "learning_rate": 7.352514591475746e-07, + "loss": 0.1586, + "step": 7569 + }, + { + "epoch": 2.5279679412255804, + "grad_norm": 0.680745479156831, + "learning_rate": 7.342373747086557e-07, + "loss": 0.1666, + "step": 7570 + }, + { + "epoch": 2.5283018867924527, + "grad_norm": 0.5242025885825541, + "learning_rate": 7.332239346634079e-07, + "loss": 0.1573, + "step": 7571 + }, + { + "epoch": 2.5286358323593254, + "grad_norm": 0.5393298239285648, + "learning_rate": 7.322111391649261e-07, + "loss": 0.1593, + "step": 7572 + }, + { + "epoch": 2.528969777926198, + "grad_norm": 0.5185798461075484, + "learning_rate": 7.311989883662018e-07, + "loss": 0.1588, + "step": 7573 + }, + { + "epoch": 2.5293037234930704, + "grad_norm": 0.592503504928077, + "learning_rate": 7.301874824201349e-07, + "loss": 0.1737, + "step": 7574 + }, + { + "epoch": 2.529637669059943, + "grad_norm": 0.5289070259578227, + "learning_rate": 7.29176621479522e-07, + "loss": 0.1541, + "step": 7575 + }, + { + "epoch": 2.529971614626816, + "grad_norm": 0.5399286735957974, + "learning_rate": 7.2816640569707e-07, + "loss": 0.154, + "step": 7576 + }, + { + "epoch": 2.5303055601936886, + "grad_norm": 0.5254361535936164, + "learning_rate": 7.271568352253804e-07, + "loss": 0.1564, + "step": 7577 + }, + { + "epoch": 2.5306395057605613, + "grad_norm": 0.555206381458067, + "learning_rate": 7.261479102169627e-07, + "loss": 0.1725, + "step": 7578 + }, + { + "epoch": 2.5309734513274336, + "grad_norm": 0.5290307988418901, + "learning_rate": 7.251396308242259e-07, + "loss": 0.1484, + "step": 7579 + }, + { + "epoch": 2.5313073968943063, + "grad_norm": 0.504515982522606, + "learning_rate": 7.241319971994831e-07, + "loss": 0.1496, + "step": 7580 + }, + { + "epoch": 2.531641342461179, + "grad_norm": 0.49559680913923954, + "learning_rate": 7.231250094949472e-07, + "loss": 0.1461, + "step": 7581 + }, + { + "epoch": 2.5319752880280513, + "grad_norm": 0.5978243995818697, + "learning_rate": 7.221186678627389e-07, + "loss": 0.1729, + "step": 7582 + }, + { + "epoch": 2.532309233594924, + "grad_norm": 0.5606973816163814, + "learning_rate": 7.211129724548754e-07, + "loss": 0.1581, + "step": 7583 + }, + { + "epoch": 2.5326431791617967, + "grad_norm": 0.5306020480428629, + "learning_rate": 7.201079234232805e-07, + "loss": 0.1624, + "step": 7584 + }, + { + "epoch": 2.532977124728669, + "grad_norm": 0.5223762718058418, + "learning_rate": 7.191035209197772e-07, + "loss": 0.1497, + "step": 7585 + }, + { + "epoch": 2.5333110702955417, + "grad_norm": 0.540345518499979, + "learning_rate": 7.180997650960936e-07, + "loss": 0.16, + "step": 7586 + }, + { + "epoch": 2.5336450158624144, + "grad_norm": 0.5450407713397211, + "learning_rate": 7.170966561038561e-07, + "loss": 0.1657, + "step": 7587 + }, + { + "epoch": 2.533978961429287, + "grad_norm": 0.5391559776735416, + "learning_rate": 7.160941940946009e-07, + "loss": 0.1577, + "step": 7588 + }, + { + "epoch": 2.53431290699616, + "grad_norm": 0.5230788079516275, + "learning_rate": 7.150923792197579e-07, + "loss": 0.1556, + "step": 7589 + }, + { + "epoch": 2.534646852563032, + "grad_norm": 0.5216974254081449, + "learning_rate": 7.140912116306648e-07, + "loss": 0.1486, + "step": 7590 + }, + { + "epoch": 2.534980798129905, + "grad_norm": 0.6321980599276655, + "learning_rate": 7.130906914785585e-07, + "loss": 0.1547, + "step": 7591 + }, + { + "epoch": 2.5353147436967776, + "grad_norm": 0.506483027802621, + "learning_rate": 7.120908189145798e-07, + "loss": 0.1447, + "step": 7592 + }, + { + "epoch": 2.53564868926365, + "grad_norm": 0.5169580171339891, + "learning_rate": 7.110915940897722e-07, + "loss": 0.1485, + "step": 7593 + }, + { + "epoch": 2.5359826348305226, + "grad_norm": 0.5278085164861606, + "learning_rate": 7.100930171550785e-07, + "loss": 0.1552, + "step": 7594 + }, + { + "epoch": 2.5363165803973953, + "grad_norm": 0.5350544885579419, + "learning_rate": 7.090950882613479e-07, + "loss": 0.1638, + "step": 7595 + }, + { + "epoch": 2.536650525964268, + "grad_norm": 0.5060006641486688, + "learning_rate": 7.08097807559327e-07, + "loss": 0.148, + "step": 7596 + }, + { + "epoch": 2.5369844715311403, + "grad_norm": 0.5396958297533546, + "learning_rate": 7.071011751996687e-07, + "loss": 0.1581, + "step": 7597 + }, + { + "epoch": 2.537318417098013, + "grad_norm": 0.556091790600878, + "learning_rate": 7.061051913329231e-07, + "loss": 0.1697, + "step": 7598 + }, + { + "epoch": 2.5376523626648857, + "grad_norm": 0.5181420886400087, + "learning_rate": 7.051098561095493e-07, + "loss": 0.1489, + "step": 7599 + }, + { + "epoch": 2.5379863082317584, + "grad_norm": 0.4768470665798708, + "learning_rate": 7.041151696799014e-07, + "loss": 0.1363, + "step": 7600 + }, + { + "epoch": 2.5383202537986307, + "grad_norm": 0.5196161421818665, + "learning_rate": 7.031211321942405e-07, + "loss": 0.1533, + "step": 7601 + }, + { + "epoch": 2.5386541993655034, + "grad_norm": 0.5232659742245588, + "learning_rate": 7.021277438027258e-07, + "loss": 0.1568, + "step": 7602 + }, + { + "epoch": 2.538988144932376, + "grad_norm": 0.5176128865084071, + "learning_rate": 7.011350046554227e-07, + "loss": 0.1542, + "step": 7603 + }, + { + "epoch": 2.5393220904992484, + "grad_norm": 0.5651500840876793, + "learning_rate": 7.001429149022915e-07, + "loss": 0.1647, + "step": 7604 + }, + { + "epoch": 2.539656036066121, + "grad_norm": 0.5098263760222316, + "learning_rate": 6.991514746932048e-07, + "loss": 0.1498, + "step": 7605 + }, + { + "epoch": 2.539989981632994, + "grad_norm": 0.536465839742826, + "learning_rate": 6.981606841779281e-07, + "loss": 0.1533, + "step": 7606 + }, + { + "epoch": 2.5403239271998665, + "grad_norm": 0.5226597424328394, + "learning_rate": 6.971705435061333e-07, + "loss": 0.1516, + "step": 7607 + }, + { + "epoch": 2.5406578727667393, + "grad_norm": 0.5804541135430479, + "learning_rate": 6.96181052827391e-07, + "loss": 0.165, + "step": 7608 + }, + { + "epoch": 2.5409918183336115, + "grad_norm": 0.5237353158113098, + "learning_rate": 6.951922122911775e-07, + "loss": 0.1538, + "step": 7609 + }, + { + "epoch": 2.5413257639004843, + "grad_norm": 0.5080706830627284, + "learning_rate": 6.942040220468654e-07, + "loss": 0.148, + "step": 7610 + }, + { + "epoch": 2.541659709467357, + "grad_norm": 0.5140049188783807, + "learning_rate": 6.932164822437371e-07, + "loss": 0.1557, + "step": 7611 + }, + { + "epoch": 2.5419936550342292, + "grad_norm": 0.5930589485845684, + "learning_rate": 6.922295930309691e-07, + "loss": 0.165, + "step": 7612 + }, + { + "epoch": 2.542327600601102, + "grad_norm": 0.549299614539444, + "learning_rate": 6.912433545576446e-07, + "loss": 0.1579, + "step": 7613 + }, + { + "epoch": 2.5426615461679747, + "grad_norm": 0.562070790263285, + "learning_rate": 6.90257766972744e-07, + "loss": 0.1699, + "step": 7614 + }, + { + "epoch": 2.542995491734847, + "grad_norm": 0.5227209625996607, + "learning_rate": 6.892728304251544e-07, + "loss": 0.1612, + "step": 7615 + }, + { + "epoch": 2.5433294373017197, + "grad_norm": 0.5487714357697593, + "learning_rate": 6.8828854506366e-07, + "loss": 0.16, + "step": 7616 + }, + { + "epoch": 2.5436633828685924, + "grad_norm": 0.5139354728097603, + "learning_rate": 6.873049110369495e-07, + "loss": 0.1473, + "step": 7617 + }, + { + "epoch": 2.543997328435465, + "grad_norm": 0.495487042816524, + "learning_rate": 6.863219284936135e-07, + "loss": 0.143, + "step": 7618 + }, + { + "epoch": 2.544331274002338, + "grad_norm": 0.5240554996595517, + "learning_rate": 6.853395975821414e-07, + "loss": 0.158, + "step": 7619 + }, + { + "epoch": 2.54466521956921, + "grad_norm": 0.5291080480147916, + "learning_rate": 6.843579184509275e-07, + "loss": 0.1543, + "step": 7620 + }, + { + "epoch": 2.544999165136083, + "grad_norm": 0.49305356830405517, + "learning_rate": 6.833768912482636e-07, + "loss": 0.1466, + "step": 7621 + }, + { + "epoch": 2.5453331107029555, + "grad_norm": 0.4901391626064939, + "learning_rate": 6.823965161223472e-07, + "loss": 0.142, + "step": 7622 + }, + { + "epoch": 2.545667056269828, + "grad_norm": 0.49462502258324764, + "learning_rate": 6.814167932212751e-07, + "loss": 0.1467, + "step": 7623 + }, + { + "epoch": 2.5460010018367005, + "grad_norm": 0.5141470034471365, + "learning_rate": 6.804377226930469e-07, + "loss": 0.1573, + "step": 7624 + }, + { + "epoch": 2.5463349474035732, + "grad_norm": 0.5089120455293014, + "learning_rate": 6.794593046855613e-07, + "loss": 0.1534, + "step": 7625 + }, + { + "epoch": 2.546668892970446, + "grad_norm": 0.5090069224737189, + "learning_rate": 6.784815393466215e-07, + "loss": 0.1551, + "step": 7626 + }, + { + "epoch": 2.5470028385373187, + "grad_norm": 0.518767697913172, + "learning_rate": 6.775044268239278e-07, + "loss": 0.1463, + "step": 7627 + }, + { + "epoch": 2.547336784104191, + "grad_norm": 0.5376965894711552, + "learning_rate": 6.765279672650865e-07, + "loss": 0.1537, + "step": 7628 + }, + { + "epoch": 2.5476707296710637, + "grad_norm": 0.5257888120052993, + "learning_rate": 6.75552160817603e-07, + "loss": 0.1505, + "step": 7629 + }, + { + "epoch": 2.5480046752379364, + "grad_norm": 0.5316600959696877, + "learning_rate": 6.745770076288854e-07, + "loss": 0.1562, + "step": 7630 + }, + { + "epoch": 2.5483386208048087, + "grad_norm": 0.5346487077712377, + "learning_rate": 6.736025078462399e-07, + "loss": 0.1618, + "step": 7631 + }, + { + "epoch": 2.5486725663716814, + "grad_norm": 0.5060699607051514, + "learning_rate": 6.726286616168781e-07, + "loss": 0.1487, + "step": 7632 + }, + { + "epoch": 2.549006511938554, + "grad_norm": 0.5685768568375997, + "learning_rate": 6.716554690879085e-07, + "loss": 0.1582, + "step": 7633 + }, + { + "epoch": 2.5493404575054264, + "grad_norm": 0.5704678810184574, + "learning_rate": 6.706829304063467e-07, + "loss": 0.1697, + "step": 7634 + }, + { + "epoch": 2.549674403072299, + "grad_norm": 0.5273554677422916, + "learning_rate": 6.697110457191031e-07, + "loss": 0.1525, + "step": 7635 + }, + { + "epoch": 2.550008348639172, + "grad_norm": 0.5291739725258179, + "learning_rate": 6.687398151729951e-07, + "loss": 0.1556, + "step": 7636 + }, + { + "epoch": 2.5503422942060445, + "grad_norm": 0.49722507845623964, + "learning_rate": 6.677692389147355e-07, + "loss": 0.1444, + "step": 7637 + }, + { + "epoch": 2.5506762397729172, + "grad_norm": 0.5218911341868374, + "learning_rate": 6.667993170909437e-07, + "loss": 0.1534, + "step": 7638 + }, + { + "epoch": 2.5510101853397895, + "grad_norm": 0.4754035667843285, + "learning_rate": 6.658300498481363e-07, + "loss": 0.143, + "step": 7639 + }, + { + "epoch": 2.5513441309066622, + "grad_norm": 0.5646948692153895, + "learning_rate": 6.648614373327328e-07, + "loss": 0.1604, + "step": 7640 + }, + { + "epoch": 2.551678076473535, + "grad_norm": 0.5619083057889515, + "learning_rate": 6.638934796910545e-07, + "loss": 0.1614, + "step": 7641 + }, + { + "epoch": 2.5520120220404072, + "grad_norm": 0.5161418977249806, + "learning_rate": 6.629261770693213e-07, + "loss": 0.1586, + "step": 7642 + }, + { + "epoch": 2.55234596760728, + "grad_norm": 0.5378163686695885, + "learning_rate": 6.619595296136577e-07, + "loss": 0.1624, + "step": 7643 + }, + { + "epoch": 2.5526799131741527, + "grad_norm": 0.4863809933072381, + "learning_rate": 6.609935374700849e-07, + "loss": 0.1483, + "step": 7644 + }, + { + "epoch": 2.5530138587410254, + "grad_norm": 0.5311657608751162, + "learning_rate": 6.600282007845277e-07, + "loss": 0.158, + "step": 7645 + }, + { + "epoch": 2.5533478043078977, + "grad_norm": 0.5304450691241134, + "learning_rate": 6.590635197028128e-07, + "loss": 0.16, + "step": 7646 + }, + { + "epoch": 2.5536817498747704, + "grad_norm": 0.5094024551256224, + "learning_rate": 6.580994943706675e-07, + "loss": 0.161, + "step": 7647 + }, + { + "epoch": 2.554015695441643, + "grad_norm": 0.4882401458282333, + "learning_rate": 6.571361249337161e-07, + "loss": 0.1455, + "step": 7648 + }, + { + "epoch": 2.554349641008516, + "grad_norm": 0.5183521869165364, + "learning_rate": 6.561734115374901e-07, + "loss": 0.1589, + "step": 7649 + }, + { + "epoch": 2.554683586575388, + "grad_norm": 0.4919516392113919, + "learning_rate": 6.552113543274158e-07, + "loss": 0.1466, + "step": 7650 + }, + { + "epoch": 2.555017532142261, + "grad_norm": 0.4819998758087338, + "learning_rate": 6.54249953448825e-07, + "loss": 0.1441, + "step": 7651 + }, + { + "epoch": 2.5553514777091335, + "grad_norm": 0.5147802225320831, + "learning_rate": 6.532892090469484e-07, + "loss": 0.1563, + "step": 7652 + }, + { + "epoch": 2.555685423276006, + "grad_norm": 0.5320623190525292, + "learning_rate": 6.52329121266918e-07, + "loss": 0.1685, + "step": 7653 + }, + { + "epoch": 2.5560193688428785, + "grad_norm": 0.5253504225252258, + "learning_rate": 6.513696902537653e-07, + "loss": 0.166, + "step": 7654 + }, + { + "epoch": 2.556353314409751, + "grad_norm": 0.5296212624005909, + "learning_rate": 6.504109161524257e-07, + "loss": 0.1573, + "step": 7655 + }, + { + "epoch": 2.556687259976624, + "grad_norm": 0.5441043227921218, + "learning_rate": 6.494527991077304e-07, + "loss": 0.162, + "step": 7656 + }, + { + "epoch": 2.5570212055434967, + "grad_norm": 0.5589909412419124, + "learning_rate": 6.484953392644161e-07, + "loss": 0.1505, + "step": 7657 + }, + { + "epoch": 2.557355151110369, + "grad_norm": 0.5117170772974657, + "learning_rate": 6.475385367671183e-07, + "loss": 0.1514, + "step": 7658 + }, + { + "epoch": 2.5576890966772416, + "grad_norm": 0.534672576883815, + "learning_rate": 6.465823917603742e-07, + "loss": 0.1478, + "step": 7659 + }, + { + "epoch": 2.5580230422441144, + "grad_norm": 0.5313783504428939, + "learning_rate": 6.456269043886182e-07, + "loss": 0.1628, + "step": 7660 + }, + { + "epoch": 2.5583569878109866, + "grad_norm": 0.5160524410564582, + "learning_rate": 6.446720747961904e-07, + "loss": 0.1531, + "step": 7661 + }, + { + "epoch": 2.5586909333778594, + "grad_norm": 0.47096063309538416, + "learning_rate": 6.437179031273272e-07, + "loss": 0.1446, + "step": 7662 + }, + { + "epoch": 2.559024878944732, + "grad_norm": 0.537685199712028, + "learning_rate": 6.427643895261687e-07, + "loss": 0.1578, + "step": 7663 + }, + { + "epoch": 2.5593588245116043, + "grad_norm": 0.5315214663957065, + "learning_rate": 6.418115341367543e-07, + "loss": 0.1524, + "step": 7664 + }, + { + "epoch": 2.559692770078477, + "grad_norm": 0.5027992814477966, + "learning_rate": 6.408593371030231e-07, + "loss": 0.1503, + "step": 7665 + }, + { + "epoch": 2.56002671564535, + "grad_norm": 0.5622015188810098, + "learning_rate": 6.399077985688168e-07, + "loss": 0.1647, + "step": 7666 + }, + { + "epoch": 2.5603606612122225, + "grad_norm": 0.5444389360370473, + "learning_rate": 6.389569186778754e-07, + "loss": 0.1541, + "step": 7667 + }, + { + "epoch": 2.560694606779095, + "grad_norm": 0.4900284290776257, + "learning_rate": 6.38006697573842e-07, + "loss": 0.1419, + "step": 7668 + }, + { + "epoch": 2.5610285523459675, + "grad_norm": 0.5383726943088658, + "learning_rate": 6.370571354002553e-07, + "loss": 0.1616, + "step": 7669 + }, + { + "epoch": 2.56136249791284, + "grad_norm": 0.5003686273859252, + "learning_rate": 6.361082323005624e-07, + "loss": 0.1542, + "step": 7670 + }, + { + "epoch": 2.561696443479713, + "grad_norm": 0.5788182480274164, + "learning_rate": 6.351599884181037e-07, + "loss": 0.1705, + "step": 7671 + }, + { + "epoch": 2.562030389046585, + "grad_norm": 0.5398324783213839, + "learning_rate": 6.342124038961234e-07, + "loss": 0.1549, + "step": 7672 + }, + { + "epoch": 2.562364334613458, + "grad_norm": 0.5253027875595168, + "learning_rate": 6.332654788777642e-07, + "loss": 0.1559, + "step": 7673 + }, + { + "epoch": 2.5626982801803306, + "grad_norm": 0.5641699534412207, + "learning_rate": 6.323192135060713e-07, + "loss": 0.1604, + "step": 7674 + }, + { + "epoch": 2.5630322257472034, + "grad_norm": 0.5536198139751572, + "learning_rate": 6.31373607923989e-07, + "loss": 0.1575, + "step": 7675 + }, + { + "epoch": 2.563366171314076, + "grad_norm": 0.5608823102074348, + "learning_rate": 6.304286622743627e-07, + "loss": 0.1548, + "step": 7676 + }, + { + "epoch": 2.5637001168809483, + "grad_norm": 0.5147008886535758, + "learning_rate": 6.294843766999364e-07, + "loss": 0.1644, + "step": 7677 + }, + { + "epoch": 2.564034062447821, + "grad_norm": 0.5051122394513354, + "learning_rate": 6.285407513433572e-07, + "loss": 0.1504, + "step": 7678 + }, + { + "epoch": 2.564368008014694, + "grad_norm": 0.5089827237941057, + "learning_rate": 6.275977863471683e-07, + "loss": 0.1541, + "step": 7679 + }, + { + "epoch": 2.564701953581566, + "grad_norm": 0.5833479789208602, + "learning_rate": 6.266554818538173e-07, + "loss": 0.1733, + "step": 7680 + }, + { + "epoch": 2.5650358991484388, + "grad_norm": 0.49383382112180885, + "learning_rate": 6.257138380056505e-07, + "loss": 0.1437, + "step": 7681 + }, + { + "epoch": 2.5653698447153115, + "grad_norm": 0.5396466133422341, + "learning_rate": 6.24772854944915e-07, + "loss": 0.164, + "step": 7682 + }, + { + "epoch": 2.5657037902821838, + "grad_norm": 0.5253514663001958, + "learning_rate": 6.238325328137552e-07, + "loss": 0.1444, + "step": 7683 + }, + { + "epoch": 2.5660377358490565, + "grad_norm": 0.5067795635086644, + "learning_rate": 6.228928717542205e-07, + "loss": 0.1549, + "step": 7684 + }, + { + "epoch": 2.566371681415929, + "grad_norm": 0.5002550750718825, + "learning_rate": 6.219538719082546e-07, + "loss": 0.1482, + "step": 7685 + }, + { + "epoch": 2.566705626982802, + "grad_norm": 0.5179834571022827, + "learning_rate": 6.210155334177064e-07, + "loss": 0.1456, + "step": 7686 + }, + { + "epoch": 2.5670395725496746, + "grad_norm": 0.5463570828984406, + "learning_rate": 6.200778564243237e-07, + "loss": 0.1557, + "step": 7687 + }, + { + "epoch": 2.567373518116547, + "grad_norm": 0.533683576059173, + "learning_rate": 6.19140841069752e-07, + "loss": 0.1511, + "step": 7688 + }, + { + "epoch": 2.5677074636834196, + "grad_norm": 0.5512649791188792, + "learning_rate": 6.1820448749554e-07, + "loss": 0.1568, + "step": 7689 + }, + { + "epoch": 2.5680414092502923, + "grad_norm": 0.5157111237491274, + "learning_rate": 6.172687958431328e-07, + "loss": 0.1499, + "step": 7690 + }, + { + "epoch": 2.5683753548171646, + "grad_norm": 0.5054894343421307, + "learning_rate": 6.163337662538793e-07, + "loss": 0.1648, + "step": 7691 + }, + { + "epoch": 2.5687093003840373, + "grad_norm": 0.5738044330278376, + "learning_rate": 6.153993988690266e-07, + "loss": 0.1696, + "step": 7692 + }, + { + "epoch": 2.56904324595091, + "grad_norm": 0.5139856319427727, + "learning_rate": 6.144656938297227e-07, + "loss": 0.1616, + "step": 7693 + }, + { + "epoch": 2.5693771915177828, + "grad_norm": 0.529624079244088, + "learning_rate": 6.135326512770124e-07, + "loss": 0.1554, + "step": 7694 + }, + { + "epoch": 2.569711137084655, + "grad_norm": 0.532186839181486, + "learning_rate": 6.126002713518453e-07, + "loss": 0.1634, + "step": 7695 + }, + { + "epoch": 2.5700450826515278, + "grad_norm": 0.4761868956330876, + "learning_rate": 6.116685541950663e-07, + "loss": 0.1485, + "step": 7696 + }, + { + "epoch": 2.5703790282184005, + "grad_norm": 0.5616544286220363, + "learning_rate": 6.107374999474236e-07, + "loss": 0.1708, + "step": 7697 + }, + { + "epoch": 2.570712973785273, + "grad_norm": 0.5155203837062691, + "learning_rate": 6.098071087495621e-07, + "loss": 0.1484, + "step": 7698 + }, + { + "epoch": 2.5710469193521455, + "grad_norm": 0.5225906206355545, + "learning_rate": 6.088773807420312e-07, + "loss": 0.1591, + "step": 7699 + }, + { + "epoch": 2.571380864919018, + "grad_norm": 0.5466633655649673, + "learning_rate": 6.07948316065275e-07, + "loss": 0.1612, + "step": 7700 + }, + { + "epoch": 2.571714810485891, + "grad_norm": 0.5276109996287601, + "learning_rate": 6.070199148596411e-07, + "loss": 0.1543, + "step": 7701 + }, + { + "epoch": 2.572048756052763, + "grad_norm": 0.5516155736454269, + "learning_rate": 6.060921772653738e-07, + "loss": 0.1618, + "step": 7702 + }, + { + "epoch": 2.572382701619636, + "grad_norm": 0.4830812549050556, + "learning_rate": 6.051651034226208e-07, + "loss": 0.1473, + "step": 7703 + }, + { + "epoch": 2.5727166471865086, + "grad_norm": 0.5248643245059543, + "learning_rate": 6.042386934714245e-07, + "loss": 0.156, + "step": 7704 + }, + { + "epoch": 2.5730505927533813, + "grad_norm": 0.5189440442245319, + "learning_rate": 6.03312947551734e-07, + "loss": 0.153, + "step": 7705 + }, + { + "epoch": 2.573384538320254, + "grad_norm": 0.5311742748839656, + "learning_rate": 6.02387865803391e-07, + "loss": 0.1588, + "step": 7706 + }, + { + "epoch": 2.5737184838871263, + "grad_norm": 0.5306508792963632, + "learning_rate": 6.014634483661419e-07, + "loss": 0.1582, + "step": 7707 + }, + { + "epoch": 2.574052429453999, + "grad_norm": 0.5187201844876844, + "learning_rate": 6.005396953796294e-07, + "loss": 0.1506, + "step": 7708 + }, + { + "epoch": 2.5743863750208718, + "grad_norm": 0.50422011790657, + "learning_rate": 5.996166069833976e-07, + "loss": 0.1513, + "step": 7709 + }, + { + "epoch": 2.574720320587744, + "grad_norm": 0.5131733480453597, + "learning_rate": 5.986941833168913e-07, + "loss": 0.1477, + "step": 7710 + }, + { + "epoch": 2.5750542661546167, + "grad_norm": 0.5346311131575376, + "learning_rate": 5.97772424519451e-07, + "loss": 0.1571, + "step": 7711 + }, + { + "epoch": 2.5753882117214895, + "grad_norm": 0.5096595724529045, + "learning_rate": 5.96851330730322e-07, + "loss": 0.1472, + "step": 7712 + }, + { + "epoch": 2.5757221572883617, + "grad_norm": 0.4972298467703904, + "learning_rate": 5.959309020886433e-07, + "loss": 0.1452, + "step": 7713 + }, + { + "epoch": 2.5760561028552345, + "grad_norm": 0.5252331142466029, + "learning_rate": 5.950111387334584e-07, + "loss": 0.1486, + "step": 7714 + }, + { + "epoch": 2.576390048422107, + "grad_norm": 0.5088947881046305, + "learning_rate": 5.940920408037081e-07, + "loss": 0.1479, + "step": 7715 + }, + { + "epoch": 2.57672399398898, + "grad_norm": 0.5705717088562303, + "learning_rate": 5.93173608438234e-07, + "loss": 0.1641, + "step": 7716 + }, + { + "epoch": 2.5770579395558526, + "grad_norm": 0.5117709453123926, + "learning_rate": 5.92255841775774e-07, + "loss": 0.1552, + "step": 7717 + }, + { + "epoch": 2.577391885122725, + "grad_norm": 0.5272255582904503, + "learning_rate": 5.913387409549693e-07, + "loss": 0.1486, + "step": 7718 + }, + { + "epoch": 2.5777258306895976, + "grad_norm": 0.5500948968492612, + "learning_rate": 5.904223061143577e-07, + "loss": 0.1614, + "step": 7719 + }, + { + "epoch": 2.5780597762564703, + "grad_norm": 0.5422871702657105, + "learning_rate": 5.895065373923781e-07, + "loss": 0.1612, + "step": 7720 + }, + { + "epoch": 2.5783937218233426, + "grad_norm": 0.5311152125105255, + "learning_rate": 5.885914349273664e-07, + "loss": 0.1588, + "step": 7721 + }, + { + "epoch": 2.5787276673902153, + "grad_norm": 0.5100562305085944, + "learning_rate": 5.876769988575631e-07, + "loss": 0.1479, + "step": 7722 + }, + { + "epoch": 2.579061612957088, + "grad_norm": 0.5539085439821296, + "learning_rate": 5.867632293211011e-07, + "loss": 0.1611, + "step": 7723 + }, + { + "epoch": 2.5793955585239607, + "grad_norm": 0.5398868308395143, + "learning_rate": 5.85850126456019e-07, + "loss": 0.1503, + "step": 7724 + }, + { + "epoch": 2.5797295040908335, + "grad_norm": 0.5099924237947704, + "learning_rate": 5.84937690400249e-07, + "loss": 0.1465, + "step": 7725 + }, + { + "epoch": 2.5800634496577057, + "grad_norm": 0.5346486269242904, + "learning_rate": 5.840259212916277e-07, + "loss": 0.1515, + "step": 7726 + }, + { + "epoch": 2.5803973952245784, + "grad_norm": 0.5473017465597664, + "learning_rate": 5.831148192678853e-07, + "loss": 0.1598, + "step": 7727 + }, + { + "epoch": 2.580731340791451, + "grad_norm": 0.5494409248977583, + "learning_rate": 5.822043844666586e-07, + "loss": 0.1605, + "step": 7728 + }, + { + "epoch": 2.5810652863583234, + "grad_norm": 0.5465152627591858, + "learning_rate": 5.812946170254763e-07, + "loss": 0.1607, + "step": 7729 + }, + { + "epoch": 2.581399231925196, + "grad_norm": 0.547006910687151, + "learning_rate": 5.803855170817718e-07, + "loss": 0.1538, + "step": 7730 + }, + { + "epoch": 2.581733177492069, + "grad_norm": 0.5075310278681255, + "learning_rate": 5.794770847728736e-07, + "loss": 0.147, + "step": 7731 + }, + { + "epoch": 2.582067123058941, + "grad_norm": 0.5263962669768227, + "learning_rate": 5.785693202360121e-07, + "loss": 0.155, + "step": 7732 + }, + { + "epoch": 2.582401068625814, + "grad_norm": 0.5384871321201414, + "learning_rate": 5.776622236083146e-07, + "loss": 0.1469, + "step": 7733 + }, + { + "epoch": 2.5827350141926866, + "grad_norm": 0.5167209940597584, + "learning_rate": 5.767557950268099e-07, + "loss": 0.1554, + "step": 7734 + }, + { + "epoch": 2.5830689597595593, + "grad_norm": 0.5339870635390116, + "learning_rate": 5.758500346284252e-07, + "loss": 0.1633, + "step": 7735 + }, + { + "epoch": 2.583402905326432, + "grad_norm": 0.5639288536519077, + "learning_rate": 5.749449425499843e-07, + "loss": 0.1646, + "step": 7736 + }, + { + "epoch": 2.5837368508933043, + "grad_norm": 0.5703969992909854, + "learning_rate": 5.740405189282134e-07, + "loss": 0.1692, + "step": 7737 + }, + { + "epoch": 2.584070796460177, + "grad_norm": 0.49620284954011423, + "learning_rate": 5.73136763899737e-07, + "loss": 0.1574, + "step": 7738 + }, + { + "epoch": 2.5844047420270497, + "grad_norm": 0.5336886804139199, + "learning_rate": 5.722336776010756e-07, + "loss": 0.158, + "step": 7739 + }, + { + "epoch": 2.584738687593922, + "grad_norm": 0.564930073801695, + "learning_rate": 5.713312601686533e-07, + "loss": 0.1654, + "step": 7740 + }, + { + "epoch": 2.5850726331607947, + "grad_norm": 0.5116232221341999, + "learning_rate": 5.704295117387904e-07, + "loss": 0.1563, + "step": 7741 + }, + { + "epoch": 2.5854065787276674, + "grad_norm": 0.5427584437481574, + "learning_rate": 5.695284324477052e-07, + "loss": 0.1608, + "step": 7742 + }, + { + "epoch": 2.58574052429454, + "grad_norm": 0.49116718894623634, + "learning_rate": 5.686280224315189e-07, + "loss": 0.1455, + "step": 7743 + }, + { + "epoch": 2.5860744698614124, + "grad_norm": 0.4711470768494421, + "learning_rate": 5.677282818262464e-07, + "loss": 0.1355, + "step": 7744 + }, + { + "epoch": 2.586408415428285, + "grad_norm": 0.5313007777846921, + "learning_rate": 5.668292107678048e-07, + "loss": 0.1594, + "step": 7745 + }, + { + "epoch": 2.586742360995158, + "grad_norm": 0.5555662849152072, + "learning_rate": 5.659308093920101e-07, + "loss": 0.1595, + "step": 7746 + }, + { + "epoch": 2.5870763065620306, + "grad_norm": 0.5467008793166378, + "learning_rate": 5.650330778345776e-07, + "loss": 0.1571, + "step": 7747 + }, + { + "epoch": 2.587410252128903, + "grad_norm": 0.5230866932851201, + "learning_rate": 5.641360162311171e-07, + "loss": 0.1468, + "step": 7748 + }, + { + "epoch": 2.5877441976957756, + "grad_norm": 0.49357033180205706, + "learning_rate": 5.632396247171429e-07, + "loss": 0.1513, + "step": 7749 + }, + { + "epoch": 2.5880781432626483, + "grad_norm": 0.5286758416649487, + "learning_rate": 5.623439034280625e-07, + "loss": 0.1546, + "step": 7750 + }, + { + "epoch": 2.5884120888295206, + "grad_norm": 0.5572992513940127, + "learning_rate": 5.614488524991896e-07, + "loss": 0.1607, + "step": 7751 + }, + { + "epoch": 2.5887460343963933, + "grad_norm": 0.5053585297967675, + "learning_rate": 5.605544720657286e-07, + "loss": 0.1557, + "step": 7752 + }, + { + "epoch": 2.589079979963266, + "grad_norm": 0.5642451185748008, + "learning_rate": 5.596607622627887e-07, + "loss": 0.1607, + "step": 7753 + }, + { + "epoch": 2.5894139255301387, + "grad_norm": 0.5401260030717812, + "learning_rate": 5.587677232253725e-07, + "loss": 0.1525, + "step": 7754 + }, + { + "epoch": 2.5897478710970114, + "grad_norm": 0.4954060068895129, + "learning_rate": 5.57875355088387e-07, + "loss": 0.1487, + "step": 7755 + }, + { + "epoch": 2.5900818166638837, + "grad_norm": 0.5493464971995057, + "learning_rate": 5.569836579866316e-07, + "loss": 0.1593, + "step": 7756 + }, + { + "epoch": 2.5904157622307564, + "grad_norm": 0.5442184853873584, + "learning_rate": 5.560926320548105e-07, + "loss": 0.1657, + "step": 7757 + }, + { + "epoch": 2.590749707797629, + "grad_norm": 0.5006345373798716, + "learning_rate": 5.552022774275228e-07, + "loss": 0.153, + "step": 7758 + }, + { + "epoch": 2.5910836533645014, + "grad_norm": 0.5469704888693632, + "learning_rate": 5.543125942392664e-07, + "loss": 0.1609, + "step": 7759 + }, + { + "epoch": 2.591417598931374, + "grad_norm": 0.494422781019865, + "learning_rate": 5.534235826244389e-07, + "loss": 0.1493, + "step": 7760 + }, + { + "epoch": 2.591751544498247, + "grad_norm": 0.5474099861552523, + "learning_rate": 5.525352427173369e-07, + "loss": 0.1608, + "step": 7761 + }, + { + "epoch": 2.592085490065119, + "grad_norm": 0.5147734733031684, + "learning_rate": 5.516475746521527e-07, + "loss": 0.1536, + "step": 7762 + }, + { + "epoch": 2.592419435631992, + "grad_norm": 0.5449219405525347, + "learning_rate": 5.507605785629794e-07, + "loss": 0.1661, + "step": 7763 + }, + { + "epoch": 2.5927533811988646, + "grad_norm": 0.510727938826208, + "learning_rate": 5.498742545838104e-07, + "loss": 0.1602, + "step": 7764 + }, + { + "epoch": 2.5930873267657373, + "grad_norm": 0.49424743101460045, + "learning_rate": 5.48988602848533e-07, + "loss": 0.1438, + "step": 7765 + }, + { + "epoch": 2.59342127233261, + "grad_norm": 0.5161401437755033, + "learning_rate": 5.481036234909365e-07, + "loss": 0.1544, + "step": 7766 + }, + { + "epoch": 2.5937552178994823, + "grad_norm": 0.5239447561809009, + "learning_rate": 5.472193166447065e-07, + "loss": 0.1617, + "step": 7767 + }, + { + "epoch": 2.594089163466355, + "grad_norm": 0.48449270455326193, + "learning_rate": 5.463356824434285e-07, + "loss": 0.1475, + "step": 7768 + }, + { + "epoch": 2.5944231090332277, + "grad_norm": 0.5139041345004897, + "learning_rate": 5.454527210205857e-07, + "loss": 0.1523, + "step": 7769 + }, + { + "epoch": 2.5947570546001, + "grad_norm": 0.5735900578841117, + "learning_rate": 5.445704325095613e-07, + "loss": 0.1689, + "step": 7770 + }, + { + "epoch": 2.5950910001669727, + "grad_norm": 0.5461313985515455, + "learning_rate": 5.436888170436327e-07, + "loss": 0.1595, + "step": 7771 + }, + { + "epoch": 2.5954249457338454, + "grad_norm": 0.5188701878410461, + "learning_rate": 5.428078747559806e-07, + "loss": 0.1524, + "step": 7772 + }, + { + "epoch": 2.595758891300718, + "grad_norm": 0.5380374188365196, + "learning_rate": 5.419276057796802e-07, + "loss": 0.1522, + "step": 7773 + }, + { + "epoch": 2.596092836867591, + "grad_norm": 0.5483299645095472, + "learning_rate": 5.410480102477067e-07, + "loss": 0.1706, + "step": 7774 + }, + { + "epoch": 2.596426782434463, + "grad_norm": 0.5307610952040726, + "learning_rate": 5.401690882929333e-07, + "loss": 0.1538, + "step": 7775 + }, + { + "epoch": 2.596760728001336, + "grad_norm": 0.51063905487827, + "learning_rate": 5.392908400481334e-07, + "loss": 0.1476, + "step": 7776 + }, + { + "epoch": 2.5970946735682086, + "grad_norm": 0.4993184207547867, + "learning_rate": 5.384132656459745e-07, + "loss": 0.1525, + "step": 7777 + }, + { + "epoch": 2.597428619135081, + "grad_norm": 0.5401423593539602, + "learning_rate": 5.375363652190257e-07, + "loss": 0.1602, + "step": 7778 + }, + { + "epoch": 2.5977625647019535, + "grad_norm": 0.5034998004381569, + "learning_rate": 5.366601388997522e-07, + "loss": 0.1438, + "step": 7779 + }, + { + "epoch": 2.5980965102688263, + "grad_norm": 0.48936768353800086, + "learning_rate": 5.357845868205191e-07, + "loss": 0.1464, + "step": 7780 + }, + { + "epoch": 2.5984304558356985, + "grad_norm": 0.5169519189655872, + "learning_rate": 5.34909709113589e-07, + "loss": 0.1577, + "step": 7781 + }, + { + "epoch": 2.5987644014025713, + "grad_norm": 0.5433344403290058, + "learning_rate": 5.340355059111213e-07, + "loss": 0.1594, + "step": 7782 + }, + { + "epoch": 2.599098346969444, + "grad_norm": 0.5137287693888043, + "learning_rate": 5.331619773451757e-07, + "loss": 0.1555, + "step": 7783 + }, + { + "epoch": 2.5994322925363167, + "grad_norm": 0.543531749495905, + "learning_rate": 5.32289123547709e-07, + "loss": 0.1582, + "step": 7784 + }, + { + "epoch": 2.5997662381031894, + "grad_norm": 0.5571790592000054, + "learning_rate": 5.314169446505757e-07, + "loss": 0.1595, + "step": 7785 + }, + { + "epoch": 2.6001001836700617, + "grad_norm": 0.5359827903002512, + "learning_rate": 5.305454407855282e-07, + "loss": 0.1579, + "step": 7786 + }, + { + "epoch": 2.6004341292369344, + "grad_norm": 0.5327339424957118, + "learning_rate": 5.296746120842189e-07, + "loss": 0.1504, + "step": 7787 + }, + { + "epoch": 2.600768074803807, + "grad_norm": 0.532171748237851, + "learning_rate": 5.288044586781955e-07, + "loss": 0.161, + "step": 7788 + }, + { + "epoch": 2.6011020203706794, + "grad_norm": 0.5941419346349204, + "learning_rate": 5.279349806989054e-07, + "loss": 0.1709, + "step": 7789 + }, + { + "epoch": 2.601435965937552, + "grad_norm": 0.5568723344691379, + "learning_rate": 5.270661782776931e-07, + "loss": 0.1656, + "step": 7790 + }, + { + "epoch": 2.601769911504425, + "grad_norm": 0.5382829971872203, + "learning_rate": 5.26198051545801e-07, + "loss": 0.1664, + "step": 7791 + }, + { + "epoch": 2.6021038570712975, + "grad_norm": 0.4687505363311479, + "learning_rate": 5.253306006343706e-07, + "loss": 0.1445, + "step": 7792 + }, + { + "epoch": 2.60243780263817, + "grad_norm": 0.5365972383720856, + "learning_rate": 5.244638256744422e-07, + "loss": 0.1563, + "step": 7793 + }, + { + "epoch": 2.6027717482050425, + "grad_norm": 0.531629913868646, + "learning_rate": 5.235977267969489e-07, + "loss": 0.1545, + "step": 7794 + }, + { + "epoch": 2.6031056937719153, + "grad_norm": 0.5512052735448711, + "learning_rate": 5.227323041327281e-07, + "loss": 0.1651, + "step": 7795 + }, + { + "epoch": 2.603439639338788, + "grad_norm": 0.5192438240687594, + "learning_rate": 5.218675578125099e-07, + "loss": 0.1593, + "step": 7796 + }, + { + "epoch": 2.6037735849056602, + "grad_norm": 0.5341457063593223, + "learning_rate": 5.210034879669257e-07, + "loss": 0.1606, + "step": 7797 + }, + { + "epoch": 2.604107530472533, + "grad_norm": 0.4849767604435831, + "learning_rate": 5.201400947265029e-07, + "loss": 0.1402, + "step": 7798 + }, + { + "epoch": 2.6044414760394057, + "grad_norm": 0.5178804361452914, + "learning_rate": 5.192773782216681e-07, + "loss": 0.1586, + "step": 7799 + }, + { + "epoch": 2.604775421606278, + "grad_norm": 0.5509222801070119, + "learning_rate": 5.184153385827434e-07, + "loss": 0.1608, + "step": 7800 + }, + { + "epoch": 2.6051093671731507, + "grad_norm": 0.5055391926707585, + "learning_rate": 5.175539759399518e-07, + "loss": 0.149, + "step": 7801 + }, + { + "epoch": 2.6054433127400234, + "grad_norm": 0.4990687566409661, + "learning_rate": 5.166932904234101e-07, + "loss": 0.1407, + "step": 7802 + }, + { + "epoch": 2.605777258306896, + "grad_norm": 0.5445711995101518, + "learning_rate": 5.158332821631362e-07, + "loss": 0.1567, + "step": 7803 + }, + { + "epoch": 2.606111203873769, + "grad_norm": 0.5676814520874788, + "learning_rate": 5.149739512890445e-07, + "loss": 0.1597, + "step": 7804 + }, + { + "epoch": 2.606445149440641, + "grad_norm": 0.49071635784943063, + "learning_rate": 5.141152979309477e-07, + "loss": 0.1433, + "step": 7805 + }, + { + "epoch": 2.606779095007514, + "grad_norm": 0.5593982162095931, + "learning_rate": 5.132573222185539e-07, + "loss": 0.1624, + "step": 7806 + }, + { + "epoch": 2.6071130405743865, + "grad_norm": 0.5026472702629329, + "learning_rate": 5.124000242814725e-07, + "loss": 0.1545, + "step": 7807 + }, + { + "epoch": 2.607446986141259, + "grad_norm": 0.5215743662653198, + "learning_rate": 5.115434042492057e-07, + "loss": 0.1497, + "step": 7808 + }, + { + "epoch": 2.6077809317081315, + "grad_norm": 0.5329900153400169, + "learning_rate": 5.106874622511576e-07, + "loss": 0.1571, + "step": 7809 + }, + { + "epoch": 2.6081148772750042, + "grad_norm": 0.5110805817757201, + "learning_rate": 5.098321984166293e-07, + "loss": 0.1449, + "step": 7810 + }, + { + "epoch": 2.6084488228418765, + "grad_norm": 0.5610884253526799, + "learning_rate": 5.089776128748169e-07, + "loss": 0.1589, + "step": 7811 + }, + { + "epoch": 2.6087827684087492, + "grad_norm": 0.5378244387007248, + "learning_rate": 5.081237057548166e-07, + "loss": 0.157, + "step": 7812 + }, + { + "epoch": 2.609116713975622, + "grad_norm": 0.5931449426137503, + "learning_rate": 5.072704771856201e-07, + "loss": 0.1687, + "step": 7813 + }, + { + "epoch": 2.6094506595424947, + "grad_norm": 0.4805946751761879, + "learning_rate": 5.06417927296119e-07, + "loss": 0.1457, + "step": 7814 + }, + { + "epoch": 2.6097846051093674, + "grad_norm": 0.4980998435747012, + "learning_rate": 5.055660562150983e-07, + "loss": 0.1444, + "step": 7815 + }, + { + "epoch": 2.6101185506762397, + "grad_norm": 0.531313199842911, + "learning_rate": 5.047148640712468e-07, + "loss": 0.1558, + "step": 7816 + }, + { + "epoch": 2.6104524962431124, + "grad_norm": 0.5085288422637632, + "learning_rate": 5.038643509931446e-07, + "loss": 0.156, + "step": 7817 + }, + { + "epoch": 2.610786441809985, + "grad_norm": 0.5819241309614197, + "learning_rate": 5.030145171092732e-07, + "loss": 0.1652, + "step": 7818 + }, + { + "epoch": 2.6111203873768574, + "grad_norm": 0.5511100906751708, + "learning_rate": 5.021653625480089e-07, + "loss": 0.1678, + "step": 7819 + }, + { + "epoch": 2.61145433294373, + "grad_norm": 0.5176029698496152, + "learning_rate": 5.013168874376273e-07, + "loss": 0.1526, + "step": 7820 + }, + { + "epoch": 2.611788278510603, + "grad_norm": 0.6173691909315729, + "learning_rate": 5.004690919062983e-07, + "loss": 0.1704, + "step": 7821 + }, + { + "epoch": 2.6121222240774755, + "grad_norm": 0.539288615553018, + "learning_rate": 4.996219760820947e-07, + "loss": 0.1529, + "step": 7822 + }, + { + "epoch": 2.6124561696443482, + "grad_norm": 0.5481896336994918, + "learning_rate": 4.987755400929817e-07, + "loss": 0.1596, + "step": 7823 + }, + { + "epoch": 2.6127901152112205, + "grad_norm": 0.5191460081896047, + "learning_rate": 4.97929784066824e-07, + "loss": 0.151, + "step": 7824 + }, + { + "epoch": 2.6131240607780932, + "grad_norm": 0.5264355185799506, + "learning_rate": 4.970847081313818e-07, + "loss": 0.1561, + "step": 7825 + }, + { + "epoch": 2.613458006344966, + "grad_norm": 0.5277097383950515, + "learning_rate": 4.962403124143156e-07, + "loss": 0.1513, + "step": 7826 + }, + { + "epoch": 2.613791951911838, + "grad_norm": 0.5003241442742133, + "learning_rate": 4.953965970431779e-07, + "loss": 0.153, + "step": 7827 + }, + { + "epoch": 2.614125897478711, + "grad_norm": 0.5230613526678042, + "learning_rate": 4.945535621454268e-07, + "loss": 0.1479, + "step": 7828 + }, + { + "epoch": 2.6144598430455837, + "grad_norm": 0.5437893134594357, + "learning_rate": 4.937112078484086e-07, + "loss": 0.1573, + "step": 7829 + }, + { + "epoch": 2.614793788612456, + "grad_norm": 0.5504021915641389, + "learning_rate": 4.928695342793733e-07, + "loss": 0.1626, + "step": 7830 + }, + { + "epoch": 2.6151277341793286, + "grad_norm": 0.5153465880324457, + "learning_rate": 4.92028541565464e-07, + "loss": 0.1548, + "step": 7831 + }, + { + "epoch": 2.6154616797462014, + "grad_norm": 0.5340166597075059, + "learning_rate": 4.911882298337228e-07, + "loss": 0.1628, + "step": 7832 + }, + { + "epoch": 2.615795625313074, + "grad_norm": 0.5363461554034484, + "learning_rate": 4.903485992110901e-07, + "loss": 0.1536, + "step": 7833 + }, + { + "epoch": 2.616129570879947, + "grad_norm": 0.5255258415056727, + "learning_rate": 4.895096498243995e-07, + "loss": 0.1517, + "step": 7834 + }, + { + "epoch": 2.616463516446819, + "grad_norm": 0.5926188747420524, + "learning_rate": 4.886713818003874e-07, + "loss": 0.1681, + "step": 7835 + }, + { + "epoch": 2.616797462013692, + "grad_norm": 0.5227398551663379, + "learning_rate": 4.878337952656809e-07, + "loss": 0.1522, + "step": 7836 + }, + { + "epoch": 2.6171314075805645, + "grad_norm": 0.5447007227711369, + "learning_rate": 4.869968903468092e-07, + "loss": 0.1609, + "step": 7837 + }, + { + "epoch": 2.617465353147437, + "grad_norm": 0.5237554693670284, + "learning_rate": 4.861606671701946e-07, + "loss": 0.1559, + "step": 7838 + }, + { + "epoch": 2.6177992987143095, + "grad_norm": 0.5797396060428281, + "learning_rate": 4.853251258621621e-07, + "loss": 0.1723, + "step": 7839 + }, + { + "epoch": 2.618133244281182, + "grad_norm": 0.5201124138243138, + "learning_rate": 4.844902665489265e-07, + "loss": 0.159, + "step": 7840 + }, + { + "epoch": 2.618467189848055, + "grad_norm": 0.514676268580547, + "learning_rate": 4.836560893566056e-07, + "loss": 0.1527, + "step": 7841 + }, + { + "epoch": 2.618801135414927, + "grad_norm": 0.5207632098157826, + "learning_rate": 4.828225944112097e-07, + "loss": 0.1545, + "step": 7842 + }, + { + "epoch": 2.6191350809818, + "grad_norm": 0.5262119712435508, + "learning_rate": 4.819897818386499e-07, + "loss": 0.1511, + "step": 7843 + }, + { + "epoch": 2.6194690265486726, + "grad_norm": 0.5143227879462103, + "learning_rate": 4.811576517647299e-07, + "loss": 0.148, + "step": 7844 + }, + { + "epoch": 2.6198029721155454, + "grad_norm": 0.5622213956904916, + "learning_rate": 4.803262043151557e-07, + "loss": 0.1617, + "step": 7845 + }, + { + "epoch": 2.6201369176824176, + "grad_norm": 0.5978754119797444, + "learning_rate": 4.794954396155249e-07, + "loss": 0.1688, + "step": 7846 + }, + { + "epoch": 2.6204708632492903, + "grad_norm": 0.5177420641465259, + "learning_rate": 4.786653577913364e-07, + "loss": 0.1503, + "step": 7847 + }, + { + "epoch": 2.620804808816163, + "grad_norm": 0.5276437691064124, + "learning_rate": 4.77835958967981e-07, + "loss": 0.1479, + "step": 7848 + }, + { + "epoch": 2.6211387543830353, + "grad_norm": 0.5363122643509928, + "learning_rate": 4.770072432707523e-07, + "loss": 0.1486, + "step": 7849 + }, + { + "epoch": 2.621472699949908, + "grad_norm": 0.5652686462454588, + "learning_rate": 4.761792108248342e-07, + "loss": 0.1667, + "step": 7850 + }, + { + "epoch": 2.6218066455167808, + "grad_norm": 0.5305584330166254, + "learning_rate": 4.753518617553138e-07, + "loss": 0.1591, + "step": 7851 + }, + { + "epoch": 2.6221405910836535, + "grad_norm": 0.5253770844239304, + "learning_rate": 4.745251961871705e-07, + "loss": 0.156, + "step": 7852 + }, + { + "epoch": 2.622474536650526, + "grad_norm": 0.5468790587793163, + "learning_rate": 4.736992142452823e-07, + "loss": 0.1598, + "step": 7853 + }, + { + "epoch": 2.6228084822173985, + "grad_norm": 0.5425759314571648, + "learning_rate": 4.728739160544227e-07, + "loss": 0.153, + "step": 7854 + }, + { + "epoch": 2.623142427784271, + "grad_norm": 0.5271469526144371, + "learning_rate": 4.720493017392641e-07, + "loss": 0.152, + "step": 7855 + }, + { + "epoch": 2.623476373351144, + "grad_norm": 0.49647538460886176, + "learning_rate": 4.712253714243725e-07, + "loss": 0.1552, + "step": 7856 + }, + { + "epoch": 2.623810318918016, + "grad_norm": 0.5114529568024085, + "learning_rate": 4.7040212523421335e-07, + "loss": 0.1582, + "step": 7857 + }, + { + "epoch": 2.624144264484889, + "grad_norm": 0.5079040388836952, + "learning_rate": 4.695795632931477e-07, + "loss": 0.1553, + "step": 7858 + }, + { + "epoch": 2.6244782100517616, + "grad_norm": 0.48829885332335254, + "learning_rate": 4.687576857254328e-07, + "loss": 0.1446, + "step": 7859 + }, + { + "epoch": 2.624812155618634, + "grad_norm": 0.4728253401593144, + "learning_rate": 4.679364926552238e-07, + "loss": 0.1451, + "step": 7860 + }, + { + "epoch": 2.6251461011855066, + "grad_norm": 0.49875372399465623, + "learning_rate": 4.671159842065698e-07, + "loss": 0.1521, + "step": 7861 + }, + { + "epoch": 2.6254800467523793, + "grad_norm": 0.5076996510828747, + "learning_rate": 4.662961605034194e-07, + "loss": 0.1442, + "step": 7862 + }, + { + "epoch": 2.625813992319252, + "grad_norm": 0.5122407403110252, + "learning_rate": 4.654770216696169e-07, + "loss": 0.15, + "step": 7863 + }, + { + "epoch": 2.6261479378861248, + "grad_norm": 0.5321463370987862, + "learning_rate": 4.646585678289034e-07, + "loss": 0.1531, + "step": 7864 + }, + { + "epoch": 2.626481883452997, + "grad_norm": 0.5579442129363771, + "learning_rate": 4.6384079910491376e-07, + "loss": 0.1634, + "step": 7865 + }, + { + "epoch": 2.6268158290198698, + "grad_norm": 0.5191857872148999, + "learning_rate": 4.630237156211842e-07, + "loss": 0.1521, + "step": 7866 + }, + { + "epoch": 2.6271497745867425, + "grad_norm": 0.537176066982126, + "learning_rate": 4.6220731750114267e-07, + "loss": 0.1581, + "step": 7867 + }, + { + "epoch": 2.6274837201536148, + "grad_norm": 0.5506531503517323, + "learning_rate": 4.6139160486811663e-07, + "loss": 0.1521, + "step": 7868 + }, + { + "epoch": 2.6278176657204875, + "grad_norm": 0.48213991422421537, + "learning_rate": 4.605765778453292e-07, + "loss": 0.1415, + "step": 7869 + }, + { + "epoch": 2.62815161128736, + "grad_norm": 0.6021241570026848, + "learning_rate": 4.597622365559007e-07, + "loss": 0.1767, + "step": 7870 + }, + { + "epoch": 2.628485556854233, + "grad_norm": 0.5383826080314704, + "learning_rate": 4.5894858112284445e-07, + "loss": 0.1555, + "step": 7871 + }, + { + "epoch": 2.6288195024211056, + "grad_norm": 0.5133156672319394, + "learning_rate": 4.581356116690755e-07, + "loss": 0.1429, + "step": 7872 + }, + { + "epoch": 2.629153447987978, + "grad_norm": 0.45845178214461585, + "learning_rate": 4.573233283173989e-07, + "loss": 0.1351, + "step": 7873 + }, + { + "epoch": 2.6294873935548506, + "grad_norm": 0.5028748544138391, + "learning_rate": 4.5651173119052427e-07, + "loss": 0.1568, + "step": 7874 + }, + { + "epoch": 2.6298213391217233, + "grad_norm": 0.523847639748306, + "learning_rate": 4.5570082041104915e-07, + "loss": 0.1592, + "step": 7875 + }, + { + "epoch": 2.6301552846885956, + "grad_norm": 0.5533083847144648, + "learning_rate": 4.5489059610147323e-07, + "loss": 0.1642, + "step": 7876 + }, + { + "epoch": 2.6304892302554683, + "grad_norm": 0.5741688540753057, + "learning_rate": 4.5408105838418924e-07, + "loss": 0.1677, + "step": 7877 + }, + { + "epoch": 2.630823175822341, + "grad_norm": 0.5274978341726365, + "learning_rate": 4.5327220738148823e-07, + "loss": 0.1568, + "step": 7878 + }, + { + "epoch": 2.6311571213892133, + "grad_norm": 0.49812528970220843, + "learning_rate": 4.524640432155558e-07, + "loss": 0.1485, + "step": 7879 + }, + { + "epoch": 2.631491066956086, + "grad_norm": 0.5285848686406707, + "learning_rate": 4.516565660084754e-07, + "loss": 0.1573, + "step": 7880 + }, + { + "epoch": 2.6318250125229588, + "grad_norm": 0.544151028133756, + "learning_rate": 4.5084977588222613e-07, + "loss": 0.1612, + "step": 7881 + }, + { + "epoch": 2.6321589580898315, + "grad_norm": 0.5246947358977803, + "learning_rate": 4.500436729586821e-07, + "loss": 0.1584, + "step": 7882 + }, + { + "epoch": 2.632492903656704, + "grad_norm": 0.5428300093923333, + "learning_rate": 4.4923825735961604e-07, + "loss": 0.165, + "step": 7883 + }, + { + "epoch": 2.6328268492235765, + "grad_norm": 0.4968719145895019, + "learning_rate": 4.484335292066938e-07, + "loss": 0.1451, + "step": 7884 + }, + { + "epoch": 2.633160794790449, + "grad_norm": 0.5269566759015472, + "learning_rate": 4.476294886214799e-07, + "loss": 0.1479, + "step": 7885 + }, + { + "epoch": 2.633494740357322, + "grad_norm": 0.5511501631755724, + "learning_rate": 4.468261357254339e-07, + "loss": 0.1517, + "step": 7886 + }, + { + "epoch": 2.633828685924194, + "grad_norm": 0.5047376296992027, + "learning_rate": 4.46023470639913e-07, + "loss": 0.1522, + "step": 7887 + }, + { + "epoch": 2.634162631491067, + "grad_norm": 0.5883433464805333, + "learning_rate": 4.452214934861676e-07, + "loss": 0.165, + "step": 7888 + }, + { + "epoch": 2.6344965770579396, + "grad_norm": 0.5735857232549988, + "learning_rate": 4.4442020438534737e-07, + "loss": 0.1584, + "step": 7889 + }, + { + "epoch": 2.6348305226248123, + "grad_norm": 0.5085681511536392, + "learning_rate": 4.436196034584944e-07, + "loss": 0.1553, + "step": 7890 + }, + { + "epoch": 2.6351644681916846, + "grad_norm": 0.5134750451469148, + "learning_rate": 4.4281969082654976e-07, + "loss": 0.1521, + "step": 7891 + }, + { + "epoch": 2.6354984137585573, + "grad_norm": 0.5501752039523184, + "learning_rate": 4.4202046661035e-07, + "loss": 0.1586, + "step": 7892 + }, + { + "epoch": 2.63583235932543, + "grad_norm": 0.4935318484140184, + "learning_rate": 4.4122193093062815e-07, + "loss": 0.1499, + "step": 7893 + }, + { + "epoch": 2.6361663048923027, + "grad_norm": 0.5381149489809288, + "learning_rate": 4.4042408390801097e-07, + "loss": 0.1524, + "step": 7894 + }, + { + "epoch": 2.636500250459175, + "grad_norm": 0.48618271758439613, + "learning_rate": 4.3962692566302366e-07, + "loss": 0.1459, + "step": 7895 + }, + { + "epoch": 2.6368341960260477, + "grad_norm": 0.5542917107201565, + "learning_rate": 4.38830456316085e-07, + "loss": 0.1621, + "step": 7896 + }, + { + "epoch": 2.6371681415929205, + "grad_norm": 0.5934052591504712, + "learning_rate": 4.38034675987512e-07, + "loss": 0.1693, + "step": 7897 + }, + { + "epoch": 2.6375020871597927, + "grad_norm": 0.6050578302721633, + "learning_rate": 4.372395847975164e-07, + "loss": 0.1661, + "step": 7898 + }, + { + "epoch": 2.6378360327266654, + "grad_norm": 0.5416419907657133, + "learning_rate": 4.364451828662075e-07, + "loss": 0.1546, + "step": 7899 + }, + { + "epoch": 2.638169978293538, + "grad_norm": 0.5494613936282062, + "learning_rate": 4.356514703135867e-07, + "loss": 0.1629, + "step": 7900 + }, + { + "epoch": 2.638503923860411, + "grad_norm": 0.5203439914354697, + "learning_rate": 4.348584472595557e-07, + "loss": 0.1557, + "step": 7901 + }, + { + "epoch": 2.6388378694272836, + "grad_norm": 0.5468240192643921, + "learning_rate": 4.3406611382390826e-07, + "loss": 0.1554, + "step": 7902 + }, + { + "epoch": 2.639171814994156, + "grad_norm": 0.5255586767892553, + "learning_rate": 4.3327447012633695e-07, + "loss": 0.1536, + "step": 7903 + }, + { + "epoch": 2.6395057605610286, + "grad_norm": 0.5141342995815591, + "learning_rate": 4.324835162864283e-07, + "loss": 0.1467, + "step": 7904 + }, + { + "epoch": 2.6398397061279013, + "grad_norm": 0.5304691027531064, + "learning_rate": 4.31693252423665e-07, + "loss": 0.1543, + "step": 7905 + }, + { + "epoch": 2.6401736516947736, + "grad_norm": 0.5023980897506727, + "learning_rate": 4.3090367865742666e-07, + "loss": 0.1553, + "step": 7906 + }, + { + "epoch": 2.6405075972616463, + "grad_norm": 0.5085208473006091, + "learning_rate": 4.3011479510698615e-07, + "loss": 0.1499, + "step": 7907 + }, + { + "epoch": 2.640841542828519, + "grad_norm": 0.5327667092975095, + "learning_rate": 4.293266018915149e-07, + "loss": 0.1558, + "step": 7908 + }, + { + "epoch": 2.6411754883953913, + "grad_norm": 0.5607038079211306, + "learning_rate": 4.2853909913007807e-07, + "loss": 0.1694, + "step": 7909 + }, + { + "epoch": 2.641509433962264, + "grad_norm": 0.5499665080788384, + "learning_rate": 4.277522869416384e-07, + "loss": 0.1645, + "step": 7910 + }, + { + "epoch": 2.6418433795291367, + "grad_norm": 0.5290539398893629, + "learning_rate": 4.269661654450513e-07, + "loss": 0.1529, + "step": 7911 + }, + { + "epoch": 2.6421773250960094, + "grad_norm": 0.49108126097882077, + "learning_rate": 4.261807347590713e-07, + "loss": 0.1364, + "step": 7912 + }, + { + "epoch": 2.642511270662882, + "grad_norm": 0.5284309578302578, + "learning_rate": 4.253959950023456e-07, + "loss": 0.1559, + "step": 7913 + }, + { + "epoch": 2.6428452162297544, + "grad_norm": 0.5074179173504525, + "learning_rate": 4.246119462934195e-07, + "loss": 0.1559, + "step": 7914 + }, + { + "epoch": 2.643179161796627, + "grad_norm": 0.5065896426675138, + "learning_rate": 4.238285887507315e-07, + "loss": 0.155, + "step": 7915 + }, + { + "epoch": 2.6435131073635, + "grad_norm": 0.5039285388619513, + "learning_rate": 4.230459224926198e-07, + "loss": 0.1531, + "step": 7916 + }, + { + "epoch": 2.643847052930372, + "grad_norm": 0.5283799716103906, + "learning_rate": 4.222639476373119e-07, + "loss": 0.1448, + "step": 7917 + }, + { + "epoch": 2.644180998497245, + "grad_norm": 0.5626566515484439, + "learning_rate": 4.2148266430293627e-07, + "loss": 0.154, + "step": 7918 + }, + { + "epoch": 2.6445149440641176, + "grad_norm": 0.5156261937650805, + "learning_rate": 4.207020726075145e-07, + "loss": 0.1585, + "step": 7919 + }, + { + "epoch": 2.6448488896309903, + "grad_norm": 0.5440110225549866, + "learning_rate": 4.199221726689634e-07, + "loss": 0.1593, + "step": 7920 + }, + { + "epoch": 2.645182835197863, + "grad_norm": 0.570266964313994, + "learning_rate": 4.191429646050971e-07, + "loss": 0.1671, + "step": 7921 + }, + { + "epoch": 2.6455167807647353, + "grad_norm": 0.5221227396321205, + "learning_rate": 4.1836444853362465e-07, + "loss": 0.1506, + "step": 7922 + }, + { + "epoch": 2.645850726331608, + "grad_norm": 0.497219251017224, + "learning_rate": 4.1758662457214884e-07, + "loss": 0.1373, + "step": 7923 + }, + { + "epoch": 2.6461846718984807, + "grad_norm": 0.541261652338555, + "learning_rate": 4.1680949283816996e-07, + "loss": 0.1576, + "step": 7924 + }, + { + "epoch": 2.646518617465353, + "grad_norm": 0.6069496137295698, + "learning_rate": 4.160330534490814e-07, + "loss": 0.158, + "step": 7925 + }, + { + "epoch": 2.6468525630322257, + "grad_norm": 0.5806445366255634, + "learning_rate": 4.152573065221749e-07, + "loss": 0.1691, + "step": 7926 + }, + { + "epoch": 2.6471865085990984, + "grad_norm": 0.5463692447613053, + "learning_rate": 4.1448225217463724e-07, + "loss": 0.1595, + "step": 7927 + }, + { + "epoch": 2.6475204541659707, + "grad_norm": 0.493295458582906, + "learning_rate": 4.1370789052354644e-07, + "loss": 0.1481, + "step": 7928 + }, + { + "epoch": 2.6478543997328434, + "grad_norm": 0.5120650298647899, + "learning_rate": 4.129342216858817e-07, + "loss": 0.1541, + "step": 7929 + }, + { + "epoch": 2.648188345299716, + "grad_norm": 0.5501003904091634, + "learning_rate": 4.1216124577851293e-07, + "loss": 0.1646, + "step": 7930 + }, + { + "epoch": 2.648522290866589, + "grad_norm": 0.5994137158090266, + "learning_rate": 4.113889629182083e-07, + "loss": 0.1743, + "step": 7931 + }, + { + "epoch": 2.6488562364334616, + "grad_norm": 0.5321524927305665, + "learning_rate": 4.106173732216295e-07, + "loss": 0.1548, + "step": 7932 + }, + { + "epoch": 2.649190182000334, + "grad_norm": 0.5392827869317124, + "learning_rate": 4.0984647680533564e-07, + "loss": 0.1551, + "step": 7933 + }, + { + "epoch": 2.6495241275672066, + "grad_norm": 0.5489927227974719, + "learning_rate": 4.090762737857784e-07, + "loss": 0.1408, + "step": 7934 + }, + { + "epoch": 2.6498580731340793, + "grad_norm": 0.5787246971378018, + "learning_rate": 4.0830676427930646e-07, + "loss": 0.1632, + "step": 7935 + }, + { + "epoch": 2.6501920187009516, + "grad_norm": 0.5134979626899223, + "learning_rate": 4.0753794840216296e-07, + "loss": 0.1568, + "step": 7936 + }, + { + "epoch": 2.6505259642678243, + "grad_norm": 0.5604056768059363, + "learning_rate": 4.067698262704878e-07, + "loss": 0.1671, + "step": 7937 + }, + { + "epoch": 2.650859909834697, + "grad_norm": 0.5144901739013071, + "learning_rate": 4.0600239800031136e-07, + "loss": 0.1545, + "step": 7938 + }, + { + "epoch": 2.6511938554015697, + "grad_norm": 0.5203460567839596, + "learning_rate": 4.0523566370756774e-07, + "loss": 0.1491, + "step": 7939 + }, + { + "epoch": 2.651527800968442, + "grad_norm": 0.5067411197472471, + "learning_rate": 4.044696235080775e-07, + "loss": 0.1451, + "step": 7940 + }, + { + "epoch": 2.6518617465353147, + "grad_norm": 0.5252416651013114, + "learning_rate": 4.037042775175626e-07, + "loss": 0.1578, + "step": 7941 + }, + { + "epoch": 2.6521956921021874, + "grad_norm": 0.6042433021264649, + "learning_rate": 4.0293962585163493e-07, + "loss": 0.1583, + "step": 7942 + }, + { + "epoch": 2.65252963766906, + "grad_norm": 0.49441861117276115, + "learning_rate": 4.02175668625806e-07, + "loss": 0.1454, + "step": 7943 + }, + { + "epoch": 2.6528635832359324, + "grad_norm": 0.5240774832604242, + "learning_rate": 4.014124059554786e-07, + "loss": 0.1564, + "step": 7944 + }, + { + "epoch": 2.653197528802805, + "grad_norm": 0.5320277327365568, + "learning_rate": 4.006498379559559e-07, + "loss": 0.1569, + "step": 7945 + }, + { + "epoch": 2.653531474369678, + "grad_norm": 0.530128976732692, + "learning_rate": 3.9988796474242977e-07, + "loss": 0.149, + "step": 7946 + }, + { + "epoch": 2.65386541993655, + "grad_norm": 0.5389535705555505, + "learning_rate": 3.9912678642999134e-07, + "loss": 0.1621, + "step": 7947 + }, + { + "epoch": 2.654199365503423, + "grad_norm": 0.511151976632125, + "learning_rate": 3.983663031336249e-07, + "loss": 0.1574, + "step": 7948 + }, + { + "epoch": 2.6545333110702956, + "grad_norm": 0.5122387147490539, + "learning_rate": 3.976065149682112e-07, + "loss": 0.1439, + "step": 7949 + }, + { + "epoch": 2.6548672566371683, + "grad_norm": 0.5142207485921048, + "learning_rate": 3.968474220485252e-07, + "loss": 0.1555, + "step": 7950 + }, + { + "epoch": 2.655201202204041, + "grad_norm": 0.6583635200461101, + "learning_rate": 3.960890244892362e-07, + "loss": 0.1672, + "step": 7951 + }, + { + "epoch": 2.6555351477709133, + "grad_norm": 0.4891978157653677, + "learning_rate": 3.953313224049099e-07, + "loss": 0.151, + "step": 7952 + }, + { + "epoch": 2.655869093337786, + "grad_norm": 0.535158030555081, + "learning_rate": 3.945743159100046e-07, + "loss": 0.1524, + "step": 7953 + }, + { + "epoch": 2.6562030389046587, + "grad_norm": 0.5370561320841315, + "learning_rate": 3.938180051188756e-07, + "loss": 0.1574, + "step": 7954 + }, + { + "epoch": 2.656536984471531, + "grad_norm": 0.5358457597336919, + "learning_rate": 3.930623901457736e-07, + "loss": 0.1575, + "step": 7955 + }, + { + "epoch": 2.6568709300384037, + "grad_norm": 0.55410806790667, + "learning_rate": 3.92307471104843e-07, + "loss": 0.1572, + "step": 7956 + }, + { + "epoch": 2.6572048756052764, + "grad_norm": 0.5379106010763365, + "learning_rate": 3.915532481101225e-07, + "loss": 0.1611, + "step": 7957 + }, + { + "epoch": 2.6575388211721487, + "grad_norm": 0.5380414994383523, + "learning_rate": 3.9079972127554657e-07, + "loss": 0.1574, + "step": 7958 + }, + { + "epoch": 2.6578727667390214, + "grad_norm": 0.5306946489832371, + "learning_rate": 3.9004689071494406e-07, + "loss": 0.1552, + "step": 7959 + }, + { + "epoch": 2.658206712305894, + "grad_norm": 0.5083058729379549, + "learning_rate": 3.8929475654203963e-07, + "loss": 0.1509, + "step": 7960 + }, + { + "epoch": 2.658540657872767, + "grad_norm": 0.5221341981384049, + "learning_rate": 3.8854331887045016e-07, + "loss": 0.1501, + "step": 7961 + }, + { + "epoch": 2.6588746034396396, + "grad_norm": 0.5406700255249568, + "learning_rate": 3.877925778136921e-07, + "loss": 0.1588, + "step": 7962 + }, + { + "epoch": 2.659208549006512, + "grad_norm": 0.5407476407616856, + "learning_rate": 3.870425334851713e-07, + "loss": 0.158, + "step": 7963 + }, + { + "epoch": 2.6595424945733845, + "grad_norm": 0.5643950493663836, + "learning_rate": 3.8629318599819224e-07, + "loss": 0.1672, + "step": 7964 + }, + { + "epoch": 2.6598764401402573, + "grad_norm": 0.4808623015760249, + "learning_rate": 3.855445354659515e-07, + "loss": 0.1474, + "step": 7965 + }, + { + "epoch": 2.6602103857071295, + "grad_norm": 0.554313310672822, + "learning_rate": 3.847965820015426e-07, + "loss": 0.1649, + "step": 7966 + }, + { + "epoch": 2.6605443312740022, + "grad_norm": 0.49048685231778866, + "learning_rate": 3.8404932571795115e-07, + "loss": 0.1489, + "step": 7967 + }, + { + "epoch": 2.660878276840875, + "grad_norm": 0.5770239422265581, + "learning_rate": 3.833027667280614e-07, + "loss": 0.1627, + "step": 7968 + }, + { + "epoch": 2.6612122224077477, + "grad_norm": 0.5259826441398753, + "learning_rate": 3.825569051446476e-07, + "loss": 0.1535, + "step": 7969 + }, + { + "epoch": 2.6615461679746204, + "grad_norm": 0.5236874169599129, + "learning_rate": 3.8181174108038286e-07, + "loss": 0.164, + "step": 7970 + }, + { + "epoch": 2.6618801135414927, + "grad_norm": 0.5653557919006781, + "learning_rate": 3.810672746478317e-07, + "loss": 0.1617, + "step": 7971 + }, + { + "epoch": 2.6622140591083654, + "grad_norm": 0.5668011201139045, + "learning_rate": 3.803235059594551e-07, + "loss": 0.1694, + "step": 7972 + }, + { + "epoch": 2.662548004675238, + "grad_norm": 0.5805953775265659, + "learning_rate": 3.795804351276072e-07, + "loss": 0.1654, + "step": 7973 + }, + { + "epoch": 2.6628819502421104, + "grad_norm": 0.5111928242314996, + "learning_rate": 3.788380622645382e-07, + "loss": 0.1611, + "step": 7974 + }, + { + "epoch": 2.663215895808983, + "grad_norm": 0.5324550619095361, + "learning_rate": 3.780963874823934e-07, + "loss": 0.152, + "step": 7975 + }, + { + "epoch": 2.663549841375856, + "grad_norm": 0.533264878533902, + "learning_rate": 3.773554108932093e-07, + "loss": 0.1625, + "step": 7976 + }, + { + "epoch": 2.663883786942728, + "grad_norm": 0.5447286073196415, + "learning_rate": 3.7661513260892067e-07, + "loss": 0.1549, + "step": 7977 + }, + { + "epoch": 2.664217732509601, + "grad_norm": 0.5342876864651773, + "learning_rate": 3.7587555274135544e-07, + "loss": 0.1578, + "step": 7978 + }, + { + "epoch": 2.6645516780764735, + "grad_norm": 0.5094628488311667, + "learning_rate": 3.751366714022342e-07, + "loss": 0.1491, + "step": 7979 + }, + { + "epoch": 2.6648856236433462, + "grad_norm": 0.5346577689522607, + "learning_rate": 3.7439848870317487e-07, + "loss": 0.1602, + "step": 7980 + }, + { + "epoch": 2.665219569210219, + "grad_norm": 0.5777180909450799, + "learning_rate": 3.7366100475568935e-07, + "loss": 0.1643, + "step": 7981 + }, + { + "epoch": 2.6655535147770912, + "grad_norm": 0.552682877062851, + "learning_rate": 3.7292421967118185e-07, + "loss": 0.1615, + "step": 7982 + }, + { + "epoch": 2.665887460343964, + "grad_norm": 0.5213827805993274, + "learning_rate": 3.72188133560954e-07, + "loss": 0.154, + "step": 7983 + }, + { + "epoch": 2.6662214059108367, + "grad_norm": 0.5552413572866788, + "learning_rate": 3.7145274653619776e-07, + "loss": 0.158, + "step": 7984 + }, + { + "epoch": 2.666555351477709, + "grad_norm": 0.4755073461749714, + "learning_rate": 3.7071805870800395e-07, + "loss": 0.145, + "step": 7985 + }, + { + "epoch": 2.6668892970445817, + "grad_norm": 0.5444612344044898, + "learning_rate": 3.6998407018735525e-07, + "loss": 0.1573, + "step": 7986 + }, + { + "epoch": 2.6672232426114544, + "grad_norm": 0.5403035052984416, + "learning_rate": 3.6925078108513033e-07, + "loss": 0.1538, + "step": 7987 + }, + { + "epoch": 2.667557188178327, + "grad_norm": 0.4817666520309885, + "learning_rate": 3.6851819151209947e-07, + "loss": 0.1466, + "step": 7988 + }, + { + "epoch": 2.6678911337451994, + "grad_norm": 0.5338765353416609, + "learning_rate": 3.677863015789307e-07, + "loss": 0.1503, + "step": 7989 + }, + { + "epoch": 2.668225079312072, + "grad_norm": 0.4705920965256273, + "learning_rate": 3.6705511139618177e-07, + "loss": 0.1401, + "step": 7990 + }, + { + "epoch": 2.668559024878945, + "grad_norm": 0.5343266008666374, + "learning_rate": 3.66324621074311e-07, + "loss": 0.1522, + "step": 7991 + }, + { + "epoch": 2.6688929704458175, + "grad_norm": 0.5223355051647398, + "learning_rate": 3.6559483072366506e-07, + "loss": 0.1549, + "step": 7992 + }, + { + "epoch": 2.66922691601269, + "grad_norm": 0.5378562856207899, + "learning_rate": 3.6486574045448973e-07, + "loss": 0.1561, + "step": 7993 + }, + { + "epoch": 2.6695608615795625, + "grad_norm": 0.5167077818281667, + "learning_rate": 3.6413735037691966e-07, + "loss": 0.1433, + "step": 7994 + }, + { + "epoch": 2.6698948071464352, + "grad_norm": 0.5341312147095918, + "learning_rate": 3.634096606009896e-07, + "loss": 0.1602, + "step": 7995 + }, + { + "epoch": 2.6702287527133075, + "grad_norm": 0.5225832292402557, + "learning_rate": 3.626826712366233e-07, + "loss": 0.1424, + "step": 7996 + }, + { + "epoch": 2.6705626982801802, + "grad_norm": 0.5189043060999742, + "learning_rate": 3.6195638239364225e-07, + "loss": 0.1518, + "step": 7997 + }, + { + "epoch": 2.670896643847053, + "grad_norm": 0.5300387936707925, + "learning_rate": 3.612307941817622e-07, + "loss": 0.1572, + "step": 7998 + }, + { + "epoch": 2.6712305894139257, + "grad_norm": 0.45974788723278426, + "learning_rate": 3.605059067105887e-07, + "loss": 0.1372, + "step": 7999 + }, + { + "epoch": 2.6715645349807984, + "grad_norm": 0.5306840434506943, + "learning_rate": 3.59781720089627e-07, + "loss": 0.1524, + "step": 8000 + }, + { + "epoch": 2.6718984805476707, + "grad_norm": 0.5618883169905055, + "learning_rate": 3.5905823442827393e-07, + "loss": 0.1654, + "step": 8001 + }, + { + "epoch": 2.6722324261145434, + "grad_norm": 0.5110715136379484, + "learning_rate": 3.583354498358188e-07, + "loss": 0.1548, + "step": 8002 + }, + { + "epoch": 2.672566371681416, + "grad_norm": 0.5165177851710339, + "learning_rate": 3.576133664214476e-07, + "loss": 0.1534, + "step": 8003 + }, + { + "epoch": 2.6729003172482884, + "grad_norm": 0.5427360752665066, + "learning_rate": 3.568919842942409e-07, + "loss": 0.1529, + "step": 8004 + }, + { + "epoch": 2.673234262815161, + "grad_norm": 0.5339557782418821, + "learning_rate": 3.5617130356316977e-07, + "loss": 0.1593, + "step": 8005 + }, + { + "epoch": 2.673568208382034, + "grad_norm": 0.556828539859666, + "learning_rate": 3.554513243371038e-07, + "loss": 0.1574, + "step": 8006 + }, + { + "epoch": 2.673902153948906, + "grad_norm": 0.538619853386981, + "learning_rate": 3.5473204672480224e-07, + "loss": 0.1667, + "step": 8007 + }, + { + "epoch": 2.674236099515779, + "grad_norm": 0.5434513800104122, + "learning_rate": 3.5401347083492077e-07, + "loss": 0.1479, + "step": 8008 + }, + { + "epoch": 2.6745700450826515, + "grad_norm": 0.5290721889738227, + "learning_rate": 3.532955967760093e-07, + "loss": 0.1585, + "step": 8009 + }, + { + "epoch": 2.674903990649524, + "grad_norm": 0.49434759453802185, + "learning_rate": 3.5257842465651226e-07, + "loss": 0.1473, + "step": 8010 + }, + { + "epoch": 2.675237936216397, + "grad_norm": 0.5113588034851804, + "learning_rate": 3.5186195458476515e-07, + "loss": 0.1493, + "step": 8011 + }, + { + "epoch": 2.675571881783269, + "grad_norm": 0.5688152683995327, + "learning_rate": 3.5114618666900023e-07, + "loss": 0.1628, + "step": 8012 + }, + { + "epoch": 2.675905827350142, + "grad_norm": 0.5039811736577918, + "learning_rate": 3.5043112101734166e-07, + "loss": 0.151, + "step": 8013 + }, + { + "epoch": 2.6762397729170146, + "grad_norm": 0.4828551437518733, + "learning_rate": 3.4971675773780913e-07, + "loss": 0.1432, + "step": 8014 + }, + { + "epoch": 2.676573718483887, + "grad_norm": 0.5549856306713297, + "learning_rate": 3.490030969383157e-07, + "loss": 0.1592, + "step": 8015 + }, + { + "epoch": 2.6769076640507596, + "grad_norm": 0.5379246934070531, + "learning_rate": 3.482901387266685e-07, + "loss": 0.1618, + "step": 8016 + }, + { + "epoch": 2.6772416096176324, + "grad_norm": 0.5072517308727558, + "learning_rate": 3.475778832105681e-07, + "loss": 0.1423, + "step": 8017 + }, + { + "epoch": 2.677575555184505, + "grad_norm": 0.4998020934814151, + "learning_rate": 3.468663304976089e-07, + "loss": 0.1552, + "step": 8018 + }, + { + "epoch": 2.677909500751378, + "grad_norm": 0.5309140304675917, + "learning_rate": 3.4615548069527883e-07, + "loss": 0.1557, + "step": 8019 + }, + { + "epoch": 2.67824344631825, + "grad_norm": 0.5489735365073403, + "learning_rate": 3.4544533391096093e-07, + "loss": 0.154, + "step": 8020 + }, + { + "epoch": 2.678577391885123, + "grad_norm": 0.6022169835721196, + "learning_rate": 3.4473589025193155e-07, + "loss": 0.1569, + "step": 8021 + }, + { + "epoch": 2.6789113374519955, + "grad_norm": 0.5545056781397845, + "learning_rate": 3.440271498253589e-07, + "loss": 0.1612, + "step": 8022 + }, + { + "epoch": 2.6792452830188678, + "grad_norm": 0.5777137091933007, + "learning_rate": 3.433191127383079e-07, + "loss": 0.1696, + "step": 8023 + }, + { + "epoch": 2.6795792285857405, + "grad_norm": 0.496180545244153, + "learning_rate": 3.4261177909773624e-07, + "loss": 0.1493, + "step": 8024 + }, + { + "epoch": 2.679913174152613, + "grad_norm": 0.5524161786976409, + "learning_rate": 3.419051490104935e-07, + "loss": 0.1599, + "step": 8025 + }, + { + "epoch": 2.6802471197194855, + "grad_norm": 0.5548262424062345, + "learning_rate": 3.4119922258332496e-07, + "loss": 0.1652, + "step": 8026 + }, + { + "epoch": 2.680581065286358, + "grad_norm": 0.5153735852895094, + "learning_rate": 3.4049399992287067e-07, + "loss": 0.1486, + "step": 8027 + }, + { + "epoch": 2.680915010853231, + "grad_norm": 0.5127645749070047, + "learning_rate": 3.3978948113566056e-07, + "loss": 0.1538, + "step": 8028 + }, + { + "epoch": 2.6812489564201036, + "grad_norm": 0.5397099844954941, + "learning_rate": 3.390856663281228e-07, + "loss": 0.1595, + "step": 8029 + }, + { + "epoch": 2.6815829019869764, + "grad_norm": 0.5237062122977899, + "learning_rate": 3.3838255560657453e-07, + "loss": 0.1513, + "step": 8030 + }, + { + "epoch": 2.6819168475538486, + "grad_norm": 0.5395864823693816, + "learning_rate": 3.3768014907722966e-07, + "loss": 0.1548, + "step": 8031 + }, + { + "epoch": 2.6822507931207213, + "grad_norm": 0.4867347579548099, + "learning_rate": 3.369784468461956e-07, + "loss": 0.1428, + "step": 8032 + }, + { + "epoch": 2.682584738687594, + "grad_norm": 0.5527869275755876, + "learning_rate": 3.3627744901947313e-07, + "loss": 0.1527, + "step": 8033 + }, + { + "epoch": 2.6829186842544663, + "grad_norm": 0.5023108032433418, + "learning_rate": 3.3557715570295523e-07, + "loss": 0.1514, + "step": 8034 + }, + { + "epoch": 2.683252629821339, + "grad_norm": 0.5360412608959365, + "learning_rate": 3.3487756700243014e-07, + "loss": 0.1604, + "step": 8035 + }, + { + "epoch": 2.6835865753882118, + "grad_norm": 0.5424139704297755, + "learning_rate": 3.341786830235777e-07, + "loss": 0.1544, + "step": 8036 + }, + { + "epoch": 2.6839205209550845, + "grad_norm": 0.5048209881726411, + "learning_rate": 3.334805038719735e-07, + "loss": 0.1545, + "step": 8037 + }, + { + "epoch": 2.6842544665219568, + "grad_norm": 0.5486406304826595, + "learning_rate": 3.3278302965308593e-07, + "loss": 0.1686, + "step": 8038 + }, + { + "epoch": 2.6845884120888295, + "grad_norm": 0.5300204256810848, + "learning_rate": 3.3208626047227687e-07, + "loss": 0.1601, + "step": 8039 + }, + { + "epoch": 2.684922357655702, + "grad_norm": 0.5323499926681304, + "learning_rate": 3.313901964348004e-07, + "loss": 0.1554, + "step": 8040 + }, + { + "epoch": 2.685256303222575, + "grad_norm": 0.5355346384229268, + "learning_rate": 3.306948376458069e-07, + "loss": 0.163, + "step": 8041 + }, + { + "epoch": 2.685590248789447, + "grad_norm": 0.5435803333076292, + "learning_rate": 3.3000018421033675e-07, + "loss": 0.1511, + "step": 8042 + }, + { + "epoch": 2.68592419435632, + "grad_norm": 0.5284470262839093, + "learning_rate": 3.29306236233326e-07, + "loss": 0.1517, + "step": 8043 + }, + { + "epoch": 2.6862581399231926, + "grad_norm": 0.5375116338923487, + "learning_rate": 3.286129938196048e-07, + "loss": 0.149, + "step": 8044 + }, + { + "epoch": 2.686592085490065, + "grad_norm": 0.5196788029953022, + "learning_rate": 3.279204570738936e-07, + "loss": 0.1494, + "step": 8045 + }, + { + "epoch": 2.6869260310569376, + "grad_norm": 0.5989969976700711, + "learning_rate": 3.272286261008095e-07, + "loss": 0.1594, + "step": 8046 + }, + { + "epoch": 2.6872599766238103, + "grad_norm": 0.533802596049426, + "learning_rate": 3.2653750100486213e-07, + "loss": 0.1606, + "step": 8047 + }, + { + "epoch": 2.687593922190683, + "grad_norm": 0.5985362027338466, + "learning_rate": 3.25847081890453e-07, + "loss": 0.1645, + "step": 8048 + }, + { + "epoch": 2.6879278677575558, + "grad_norm": 0.5123641039133764, + "learning_rate": 3.251573688618781e-07, + "loss": 0.1541, + "step": 8049 + }, + { + "epoch": 2.688261813324428, + "grad_norm": 0.5633459684436906, + "learning_rate": 3.2446836202332854e-07, + "loss": 0.1641, + "step": 8050 + }, + { + "epoch": 2.6885957588913008, + "grad_norm": 0.5383588555865818, + "learning_rate": 3.237800614788844e-07, + "loss": 0.1596, + "step": 8051 + }, + { + "epoch": 2.6889297044581735, + "grad_norm": 0.5287494546807463, + "learning_rate": 3.230924673325231e-07, + "loss": 0.1483, + "step": 8052 + }, + { + "epoch": 2.6892636500250457, + "grad_norm": 0.5349375464782349, + "learning_rate": 3.2240557968811315e-07, + "loss": 0.1511, + "step": 8053 + }, + { + "epoch": 2.6895975955919185, + "grad_norm": 0.5826177066382261, + "learning_rate": 3.217193986494177e-07, + "loss": 0.1605, + "step": 8054 + }, + { + "epoch": 2.689931541158791, + "grad_norm": 0.5289988608050177, + "learning_rate": 3.2103392432009105e-07, + "loss": 0.1425, + "step": 8055 + }, + { + "epoch": 2.6902654867256635, + "grad_norm": 0.5460451950316173, + "learning_rate": 3.203491568036843e-07, + "loss": 0.1566, + "step": 8056 + }, + { + "epoch": 2.690599432292536, + "grad_norm": 0.47868697237246965, + "learning_rate": 3.196650962036374e-07, + "loss": 0.148, + "step": 8057 + }, + { + "epoch": 2.690933377859409, + "grad_norm": 0.5647617020216217, + "learning_rate": 3.189817426232883e-07, + "loss": 0.1633, + "step": 8058 + }, + { + "epoch": 2.6912673234262816, + "grad_norm": 0.5517400532178436, + "learning_rate": 3.182990961658633e-07, + "loss": 0.1611, + "step": 8059 + }, + { + "epoch": 2.6916012689931543, + "grad_norm": 0.5601119667361114, + "learning_rate": 3.1761715693448546e-07, + "loss": 0.1586, + "step": 8060 + }, + { + "epoch": 2.6919352145600266, + "grad_norm": 0.5071398052612933, + "learning_rate": 3.1693592503216795e-07, + "loss": 0.1504, + "step": 8061 + }, + { + "epoch": 2.6922691601268993, + "grad_norm": 0.5063705615598098, + "learning_rate": 3.162554005618218e-07, + "loss": 0.1574, + "step": 8062 + }, + { + "epoch": 2.692603105693772, + "grad_norm": 0.5263427681179844, + "learning_rate": 3.155755836262464e-07, + "loss": 0.1428, + "step": 8063 + }, + { + "epoch": 2.6929370512606443, + "grad_norm": 0.5440253785171091, + "learning_rate": 3.148964743281363e-07, + "loss": 0.1609, + "step": 8064 + }, + { + "epoch": 2.693270996827517, + "grad_norm": 0.5213216657932812, + "learning_rate": 3.1421807277007885e-07, + "loss": 0.1525, + "step": 8065 + }, + { + "epoch": 2.6936049423943897, + "grad_norm": 0.5149269451276092, + "learning_rate": 3.1354037905455547e-07, + "loss": 0.1546, + "step": 8066 + }, + { + "epoch": 2.6939388879612625, + "grad_norm": 0.5446427194292313, + "learning_rate": 3.1286339328393755e-07, + "loss": 0.166, + "step": 8067 + }, + { + "epoch": 2.694272833528135, + "grad_norm": 0.5545769562726003, + "learning_rate": 3.1218711556049494e-07, + "loss": 0.1616, + "step": 8068 + }, + { + "epoch": 2.6946067790950075, + "grad_norm": 0.5233064501341601, + "learning_rate": 3.115115459863849e-07, + "loss": 0.1564, + "step": 8069 + }, + { + "epoch": 2.69494072466188, + "grad_norm": 0.5760191622579325, + "learning_rate": 3.108366846636618e-07, + "loss": 0.162, + "step": 8070 + }, + { + "epoch": 2.695274670228753, + "grad_norm": 0.52977035763498, + "learning_rate": 3.101625316942697e-07, + "loss": 0.1517, + "step": 8071 + }, + { + "epoch": 2.695608615795625, + "grad_norm": 0.5592586244479276, + "learning_rate": 3.094890871800488e-07, + "loss": 0.1644, + "step": 8072 + }, + { + "epoch": 2.695942561362498, + "grad_norm": 0.5035640831811913, + "learning_rate": 3.0881635122273047e-07, + "loss": 0.1459, + "step": 8073 + }, + { + "epoch": 2.6962765069293706, + "grad_norm": 0.5410337877985646, + "learning_rate": 3.0814432392393847e-07, + "loss": 0.1599, + "step": 8074 + }, + { + "epoch": 2.696610452496243, + "grad_norm": 0.5273773774689604, + "learning_rate": 3.074730053851921e-07, + "loss": 0.1553, + "step": 8075 + }, + { + "epoch": 2.6969443980631156, + "grad_norm": 0.48100970564002676, + "learning_rate": 3.068023957078997e-07, + "loss": 0.1479, + "step": 8076 + }, + { + "epoch": 2.6972783436299883, + "grad_norm": 0.49067868878960497, + "learning_rate": 3.061324949933675e-07, + "loss": 0.1342, + "step": 8077 + }, + { + "epoch": 2.697612289196861, + "grad_norm": 0.5583056416030368, + "learning_rate": 3.054633033427884e-07, + "loss": 0.1661, + "step": 8078 + }, + { + "epoch": 2.6979462347637337, + "grad_norm": 0.5215917195872181, + "learning_rate": 3.0479482085725545e-07, + "loss": 0.1538, + "step": 8079 + }, + { + "epoch": 2.698280180330606, + "grad_norm": 0.5729067492916314, + "learning_rate": 3.0412704763774836e-07, + "loss": 0.1695, + "step": 8080 + }, + { + "epoch": 2.6986141258974787, + "grad_norm": 0.5332376143310963, + "learning_rate": 3.034599837851432e-07, + "loss": 0.1607, + "step": 8081 + }, + { + "epoch": 2.6989480714643515, + "grad_norm": 0.4942390417812226, + "learning_rate": 3.027936294002071e-07, + "loss": 0.1476, + "step": 8082 + }, + { + "epoch": 2.6992820170312237, + "grad_norm": 0.5414771162644846, + "learning_rate": 3.021279845836017e-07, + "loss": 0.1559, + "step": 8083 + }, + { + "epoch": 2.6996159625980964, + "grad_norm": 0.5527997876111395, + "learning_rate": 3.0146304943587833e-07, + "loss": 0.1595, + "step": 8084 + }, + { + "epoch": 2.699949908164969, + "grad_norm": 0.5062233776814364, + "learning_rate": 3.007988240574866e-07, + "loss": 0.1522, + "step": 8085 + }, + { + "epoch": 2.7002838537318414, + "grad_norm": 0.5204497543953683, + "learning_rate": 3.0013530854876296e-07, + "loss": 0.1471, + "step": 8086 + }, + { + "epoch": 2.700617799298714, + "grad_norm": 0.5209478565356521, + "learning_rate": 2.9947250300994046e-07, + "loss": 0.1586, + "step": 8087 + }, + { + "epoch": 2.700951744865587, + "grad_norm": 0.5312789973530363, + "learning_rate": 2.98810407541143e-07, + "loss": 0.1588, + "step": 8088 + }, + { + "epoch": 2.7012856904324596, + "grad_norm": 0.5075938953896761, + "learning_rate": 2.9814902224238886e-07, + "loss": 0.1484, + "step": 8089 + }, + { + "epoch": 2.7016196359993323, + "grad_norm": 0.5160275474594337, + "learning_rate": 2.974883472135859e-07, + "loss": 0.147, + "step": 8090 + }, + { + "epoch": 2.7019535815662046, + "grad_norm": 0.5232544001518299, + "learning_rate": 2.968283825545398e-07, + "loss": 0.1496, + "step": 8091 + }, + { + "epoch": 2.7022875271330773, + "grad_norm": 0.5034655279070774, + "learning_rate": 2.961691283649437e-07, + "loss": 0.1468, + "step": 8092 + }, + { + "epoch": 2.70262147269995, + "grad_norm": 0.5510510157293431, + "learning_rate": 2.955105847443873e-07, + "loss": 0.1611, + "step": 8093 + }, + { + "epoch": 2.7029554182668223, + "grad_norm": 0.5315473818061955, + "learning_rate": 2.9485275179235e-07, + "loss": 0.15, + "step": 8094 + }, + { + "epoch": 2.703289363833695, + "grad_norm": 0.536253149180482, + "learning_rate": 2.9419562960820656e-07, + "loss": 0.1589, + "step": 8095 + }, + { + "epoch": 2.7036233094005677, + "grad_norm": 0.5306591757905975, + "learning_rate": 2.9353921829122167e-07, + "loss": 0.153, + "step": 8096 + }, + { + "epoch": 2.7039572549674404, + "grad_norm": 0.5436819952810656, + "learning_rate": 2.928835179405548e-07, + "loss": 0.1599, + "step": 8097 + }, + { + "epoch": 2.704291200534313, + "grad_norm": 0.5402673068926906, + "learning_rate": 2.922285286552579e-07, + "loss": 0.1529, + "step": 8098 + }, + { + "epoch": 2.7046251461011854, + "grad_norm": 0.5004065904993527, + "learning_rate": 2.915742505342728e-07, + "loss": 0.1487, + "step": 8099 + }, + { + "epoch": 2.704959091668058, + "grad_norm": 0.5229279567002475, + "learning_rate": 2.9092068367643776e-07, + "loss": 0.1545, + "step": 8100 + }, + { + "epoch": 2.705293037234931, + "grad_norm": 0.48729652498503284, + "learning_rate": 2.902678281804805e-07, + "loss": 0.1414, + "step": 8101 + }, + { + "epoch": 2.705626982801803, + "grad_norm": 0.4747083845884613, + "learning_rate": 2.896156841450232e-07, + "loss": 0.1441, + "step": 8102 + }, + { + "epoch": 2.705960928368676, + "grad_norm": 0.5342365800558092, + "learning_rate": 2.8896425166857976e-07, + "loss": 0.1543, + "step": 8103 + }, + { + "epoch": 2.7062948739355486, + "grad_norm": 0.5292850700939392, + "learning_rate": 2.8831353084955717e-07, + "loss": 0.1537, + "step": 8104 + }, + { + "epoch": 2.706628819502421, + "grad_norm": 0.5500522283182325, + "learning_rate": 2.8766352178625387e-07, + "loss": 0.1616, + "step": 8105 + }, + { + "epoch": 2.7069627650692936, + "grad_norm": 0.5134535972496134, + "learning_rate": 2.87014224576862e-07, + "loss": 0.1523, + "step": 8106 + }, + { + "epoch": 2.7072967106361663, + "grad_norm": 0.5471782056045605, + "learning_rate": 2.863656393194636e-07, + "loss": 0.1475, + "step": 8107 + }, + { + "epoch": 2.707630656203039, + "grad_norm": 0.5110760153803138, + "learning_rate": 2.8571776611203804e-07, + "loss": 0.1513, + "step": 8108 + }, + { + "epoch": 2.7079646017699117, + "grad_norm": 0.5385910497219654, + "learning_rate": 2.850706050524521e-07, + "loss": 0.1604, + "step": 8109 + }, + { + "epoch": 2.708298547336784, + "grad_norm": 0.5334396636556735, + "learning_rate": 2.844241562384686e-07, + "loss": 0.1556, + "step": 8110 + }, + { + "epoch": 2.7086324929036567, + "grad_norm": 0.5204610290097634, + "learning_rate": 2.8377841976773955e-07, + "loss": 0.1543, + "step": 8111 + }, + { + "epoch": 2.7089664384705294, + "grad_norm": 0.5496955299983243, + "learning_rate": 2.83133395737813e-07, + "loss": 0.1698, + "step": 8112 + }, + { + "epoch": 2.7093003840374017, + "grad_norm": 0.5536709752852906, + "learning_rate": 2.824890842461242e-07, + "loss": 0.1603, + "step": 8113 + }, + { + "epoch": 2.7096343296042744, + "grad_norm": 0.5010653680915442, + "learning_rate": 2.818454853900082e-07, + "loss": 0.1461, + "step": 8114 + }, + { + "epoch": 2.709968275171147, + "grad_norm": 0.5267900896237233, + "learning_rate": 2.8120259926668505e-07, + "loss": 0.152, + "step": 8115 + }, + { + "epoch": 2.71030222073802, + "grad_norm": 0.5478437369126266, + "learning_rate": 2.8056042597327196e-07, + "loss": 0.1687, + "step": 8116 + }, + { + "epoch": 2.7106361663048926, + "grad_norm": 0.5359815228385162, + "learning_rate": 2.799189656067758e-07, + "loss": 0.156, + "step": 8117 + }, + { + "epoch": 2.710970111871765, + "grad_norm": 0.5151592514466387, + "learning_rate": 2.792782182640974e-07, + "loss": 0.153, + "step": 8118 + }, + { + "epoch": 2.7113040574386376, + "grad_norm": 0.5344553467307052, + "learning_rate": 2.7863818404202823e-07, + "loss": 0.16, + "step": 8119 + }, + { + "epoch": 2.7116380030055103, + "grad_norm": 0.5700785819621733, + "learning_rate": 2.7799886303725376e-07, + "loss": 0.1635, + "step": 8120 + }, + { + "epoch": 2.7119719485723826, + "grad_norm": 0.5333711626924268, + "learning_rate": 2.7736025534635115e-07, + "loss": 0.1471, + "step": 8121 + }, + { + "epoch": 2.7123058941392553, + "grad_norm": 0.5161369376262971, + "learning_rate": 2.767223610657888e-07, + "loss": 0.1532, + "step": 8122 + }, + { + "epoch": 2.712639839706128, + "grad_norm": 0.579164591351255, + "learning_rate": 2.7608518029192897e-07, + "loss": 0.1704, + "step": 8123 + }, + { + "epoch": 2.7129737852730003, + "grad_norm": 0.556780216361864, + "learning_rate": 2.7544871312102485e-07, + "loss": 0.1614, + "step": 8124 + }, + { + "epoch": 2.713307730839873, + "grad_norm": 0.5134496409120298, + "learning_rate": 2.7481295964922216e-07, + "loss": 0.1524, + "step": 8125 + }, + { + "epoch": 2.7136416764067457, + "grad_norm": 0.5205512225709891, + "learning_rate": 2.7417791997255916e-07, + "loss": 0.1511, + "step": 8126 + }, + { + "epoch": 2.7139756219736184, + "grad_norm": 0.5524009827531768, + "learning_rate": 2.735435941869663e-07, + "loss": 0.1591, + "step": 8127 + }, + { + "epoch": 2.714309567540491, + "grad_norm": 0.5320081308305175, + "learning_rate": 2.7290998238826584e-07, + "loss": 0.1518, + "step": 8128 + }, + { + "epoch": 2.7146435131073634, + "grad_norm": 0.5274646254287043, + "learning_rate": 2.7227708467217227e-07, + "loss": 0.1586, + "step": 8129 + }, + { + "epoch": 2.714977458674236, + "grad_norm": 0.5314673716570912, + "learning_rate": 2.71644901134292e-07, + "loss": 0.1594, + "step": 8130 + }, + { + "epoch": 2.715311404241109, + "grad_norm": 0.5680112090581357, + "learning_rate": 2.7101343187012354e-07, + "loss": 0.166, + "step": 8131 + }, + { + "epoch": 2.715645349807981, + "grad_norm": 0.5302166166756883, + "learning_rate": 2.7038267697505894e-07, + "loss": 0.1601, + "step": 8132 + }, + { + "epoch": 2.715979295374854, + "grad_norm": 0.5334984611340821, + "learning_rate": 2.697526365443803e-07, + "loss": 0.1538, + "step": 8133 + }, + { + "epoch": 2.7163132409417265, + "grad_norm": 0.533924310055932, + "learning_rate": 2.691233106732627e-07, + "loss": 0.1499, + "step": 8134 + }, + { + "epoch": 2.716647186508599, + "grad_norm": 0.5165454426854755, + "learning_rate": 2.684946994567733e-07, + "loss": 0.1507, + "step": 8135 + }, + { + "epoch": 2.7169811320754715, + "grad_norm": 0.5319683743964586, + "learning_rate": 2.678668029898712e-07, + "loss": 0.1534, + "step": 8136 + }, + { + "epoch": 2.7173150776423443, + "grad_norm": 0.49055927054446136, + "learning_rate": 2.672396213674072e-07, + "loss": 0.1473, + "step": 8137 + }, + { + "epoch": 2.717649023209217, + "grad_norm": 0.5013407224064176, + "learning_rate": 2.66613154684125e-07, + "loss": 0.1515, + "step": 8138 + }, + { + "epoch": 2.7179829687760897, + "grad_norm": 0.557314050415258, + "learning_rate": 2.659874030346604e-07, + "loss": 0.16, + "step": 8139 + }, + { + "epoch": 2.718316914342962, + "grad_norm": 0.5188704409179276, + "learning_rate": 2.653623665135391e-07, + "loss": 0.1485, + "step": 8140 + }, + { + "epoch": 2.7186508599098347, + "grad_norm": 0.4733489482475054, + "learning_rate": 2.6473804521518097e-07, + "loss": 0.1369, + "step": 8141 + }, + { + "epoch": 2.7189848054767074, + "grad_norm": 0.5158640174213729, + "learning_rate": 2.641144392338968e-07, + "loss": 0.1615, + "step": 8142 + }, + { + "epoch": 2.7193187510435797, + "grad_norm": 0.5175936291666983, + "learning_rate": 2.6349154866389e-07, + "loss": 0.1487, + "step": 8143 + }, + { + "epoch": 2.7196526966104524, + "grad_norm": 0.4851398068455481, + "learning_rate": 2.6286937359925545e-07, + "loss": 0.1484, + "step": 8144 + }, + { + "epoch": 2.719986642177325, + "grad_norm": 0.4612050712346406, + "learning_rate": 2.622479141339801e-07, + "loss": 0.1426, + "step": 8145 + }, + { + "epoch": 2.720320587744198, + "grad_norm": 0.5103788486703124, + "learning_rate": 2.6162717036194274e-07, + "loss": 0.1545, + "step": 8146 + }, + { + "epoch": 2.7206545333110705, + "grad_norm": 0.5508718255938416, + "learning_rate": 2.610071423769128e-07, + "loss": 0.1552, + "step": 8147 + }, + { + "epoch": 2.720988478877943, + "grad_norm": 0.5482206957546422, + "learning_rate": 2.603878302725543e-07, + "loss": 0.162, + "step": 8148 + }, + { + "epoch": 2.7213224244448155, + "grad_norm": 0.5354816153222969, + "learning_rate": 2.5976923414242126e-07, + "loss": 0.1591, + "step": 8149 + }, + { + "epoch": 2.7216563700116883, + "grad_norm": 0.508383617507333, + "learning_rate": 2.5915135407996005e-07, + "loss": 0.1486, + "step": 8150 + }, + { + "epoch": 2.7219903155785605, + "grad_norm": 0.4846830693165029, + "learning_rate": 2.585341901785082e-07, + "loss": 0.1463, + "step": 8151 + }, + { + "epoch": 2.7223242611454332, + "grad_norm": 0.5589614124312149, + "learning_rate": 2.579177425312962e-07, + "loss": 0.1631, + "step": 8152 + }, + { + "epoch": 2.722658206712306, + "grad_norm": 0.5085255581901547, + "learning_rate": 2.5730201123144503e-07, + "loss": 0.156, + "step": 8153 + }, + { + "epoch": 2.7229921522791782, + "grad_norm": 0.5678891288995872, + "learning_rate": 2.566869963719681e-07, + "loss": 0.1583, + "step": 8154 + }, + { + "epoch": 2.723326097846051, + "grad_norm": 0.5160552742462742, + "learning_rate": 2.5607269804577174e-07, + "loss": 0.1487, + "step": 8155 + }, + { + "epoch": 2.7236600434129237, + "grad_norm": 0.5679998884899234, + "learning_rate": 2.5545911634565266e-07, + "loss": 0.1675, + "step": 8156 + }, + { + "epoch": 2.7239939889797964, + "grad_norm": 0.49259962902633603, + "learning_rate": 2.5484625136429854e-07, + "loss": 0.1528, + "step": 8157 + }, + { + "epoch": 2.724327934546669, + "grad_norm": 0.5321498066466733, + "learning_rate": 2.5423410319429075e-07, + "loss": 0.1574, + "step": 8158 + }, + { + "epoch": 2.7246618801135414, + "grad_norm": 0.5491822405598497, + "learning_rate": 2.5362267192810095e-07, + "loss": 0.1565, + "step": 8159 + }, + { + "epoch": 2.724995825680414, + "grad_norm": 0.5484143142827499, + "learning_rate": 2.530119576580936e-07, + "loss": 0.1648, + "step": 8160 + }, + { + "epoch": 2.725329771247287, + "grad_norm": 0.5123547918351172, + "learning_rate": 2.5240196047652377e-07, + "loss": 0.1561, + "step": 8161 + }, + { + "epoch": 2.725663716814159, + "grad_norm": 0.5079554577300277, + "learning_rate": 2.5179268047553937e-07, + "loss": 0.1473, + "step": 8162 + }, + { + "epoch": 2.725997662381032, + "grad_norm": 0.5048876817838505, + "learning_rate": 2.5118411774717857e-07, + "loss": 0.1455, + "step": 8163 + }, + { + "epoch": 2.7263316079479045, + "grad_norm": 0.5285246244585439, + "learning_rate": 2.5057627238337324e-07, + "loss": 0.1548, + "step": 8164 + }, + { + "epoch": 2.7266655535147772, + "grad_norm": 0.5301222062922429, + "learning_rate": 2.4996914447594334e-07, + "loss": 0.1638, + "step": 8165 + }, + { + "epoch": 2.72699949908165, + "grad_norm": 0.5486195673192618, + "learning_rate": 2.493627341166044e-07, + "loss": 0.1557, + "step": 8166 + }, + { + "epoch": 2.7273334446485222, + "grad_norm": 0.5147618754850339, + "learning_rate": 2.48757041396962e-07, + "loss": 0.1524, + "step": 8167 + }, + { + "epoch": 2.727667390215395, + "grad_norm": 0.47570026933007525, + "learning_rate": 2.481520664085113e-07, + "loss": 0.1414, + "step": 8168 + }, + { + "epoch": 2.7280013357822677, + "grad_norm": 0.5695233117084313, + "learning_rate": 2.4754780924264366e-07, + "loss": 0.1671, + "step": 8169 + }, + { + "epoch": 2.72833528134914, + "grad_norm": 0.5044921765852178, + "learning_rate": 2.4694426999063657e-07, + "loss": 0.1566, + "step": 8170 + }, + { + "epoch": 2.7286692269160127, + "grad_norm": 0.5059445019609731, + "learning_rate": 2.463414487436633e-07, + "loss": 0.151, + "step": 8171 + }, + { + "epoch": 2.7290031724828854, + "grad_norm": 0.5047425599919322, + "learning_rate": 2.4573934559278646e-07, + "loss": 0.1518, + "step": 8172 + }, + { + "epoch": 2.7293371180497576, + "grad_norm": 0.5073492122785407, + "learning_rate": 2.4513796062896166e-07, + "loss": 0.1472, + "step": 8173 + }, + { + "epoch": 2.7296710636166304, + "grad_norm": 0.5427657012511299, + "learning_rate": 2.4453729394303404e-07, + "loss": 0.1567, + "step": 8174 + }, + { + "epoch": 2.730005009183503, + "grad_norm": 0.5360771115077887, + "learning_rate": 2.439373456257427e-07, + "loss": 0.1562, + "step": 8175 + }, + { + "epoch": 2.730338954750376, + "grad_norm": 0.5371756975065656, + "learning_rate": 2.433381157677156e-07, + "loss": 0.1519, + "step": 8176 + }, + { + "epoch": 2.7306729003172485, + "grad_norm": 0.5073709026677747, + "learning_rate": 2.427396044594743e-07, + "loss": 0.1501, + "step": 8177 + }, + { + "epoch": 2.731006845884121, + "grad_norm": 0.6418391887464554, + "learning_rate": 2.421418117914298e-07, + "loss": 0.1538, + "step": 8178 + }, + { + "epoch": 2.7313407914509935, + "grad_norm": 0.5285420352825599, + "learning_rate": 2.415447378538871e-07, + "loss": 0.1597, + "step": 8179 + }, + { + "epoch": 2.7316747370178662, + "grad_norm": 0.594718310704362, + "learning_rate": 2.409483827370407e-07, + "loss": 0.1739, + "step": 8180 + }, + { + "epoch": 2.7320086825847385, + "grad_norm": 0.4977407152777746, + "learning_rate": 2.4035274653097797e-07, + "loss": 0.1474, + "step": 8181 + }, + { + "epoch": 2.732342628151611, + "grad_norm": 0.5529928738083278, + "learning_rate": 2.3975782932567473e-07, + "loss": 0.1595, + "step": 8182 + }, + { + "epoch": 2.732676573718484, + "grad_norm": 0.5064571077259249, + "learning_rate": 2.391636312110024e-07, + "loss": 0.146, + "step": 8183 + }, + { + "epoch": 2.733010519285356, + "grad_norm": 0.5539490252056876, + "learning_rate": 2.385701522767192e-07, + "loss": 0.1657, + "step": 8184 + }, + { + "epoch": 2.733344464852229, + "grad_norm": 0.5679411279580505, + "learning_rate": 2.3797739261247955e-07, + "loss": 0.1631, + "step": 8185 + }, + { + "epoch": 2.7336784104191016, + "grad_norm": 0.5833955890630944, + "learning_rate": 2.3738535230782568e-07, + "loss": 0.1643, + "step": 8186 + }, + { + "epoch": 2.7340123559859744, + "grad_norm": 0.5280431470599531, + "learning_rate": 2.3679403145219214e-07, + "loss": 0.1478, + "step": 8187 + }, + { + "epoch": 2.734346301552847, + "grad_norm": 0.5736288252761815, + "learning_rate": 2.362034301349053e-07, + "loss": 0.1657, + "step": 8188 + }, + { + "epoch": 2.7346802471197194, + "grad_norm": 0.5530244579608455, + "learning_rate": 2.3561354844518157e-07, + "loss": 0.1642, + "step": 8189 + }, + { + "epoch": 2.735014192686592, + "grad_norm": 0.5518902439512378, + "learning_rate": 2.3502438647213132e-07, + "loss": 0.1647, + "step": 8190 + }, + { + "epoch": 2.735348138253465, + "grad_norm": 0.5247716871272328, + "learning_rate": 2.3443594430475224e-07, + "loss": 0.1542, + "step": 8191 + }, + { + "epoch": 2.735682083820337, + "grad_norm": 0.5177961555436752, + "learning_rate": 2.3384822203193714e-07, + "loss": 0.1528, + "step": 8192 + }, + { + "epoch": 2.73601602938721, + "grad_norm": 0.5447187428407865, + "learning_rate": 2.332612197424672e-07, + "loss": 0.159, + "step": 8193 + }, + { + "epoch": 2.7363499749540825, + "grad_norm": 0.5400715115866794, + "learning_rate": 2.32674937525017e-07, + "loss": 0.1518, + "step": 8194 + }, + { + "epoch": 2.736683920520955, + "grad_norm": 0.518061402756023, + "learning_rate": 2.3208937546815026e-07, + "loss": 0.1563, + "step": 8195 + }, + { + "epoch": 2.737017866087828, + "grad_norm": 0.5073899376170143, + "learning_rate": 2.3150453366032445e-07, + "loss": 0.1533, + "step": 8196 + }, + { + "epoch": 2.7373518116547, + "grad_norm": 0.5308916049545908, + "learning_rate": 2.309204121898856e-07, + "loss": 0.1586, + "step": 8197 + }, + { + "epoch": 2.737685757221573, + "grad_norm": 0.5483262825685362, + "learning_rate": 2.3033701114507313e-07, + "loss": 0.1642, + "step": 8198 + }, + { + "epoch": 2.7380197027884456, + "grad_norm": 0.5920765856051325, + "learning_rate": 2.2975433061401541e-07, + "loss": 0.1734, + "step": 8199 + }, + { + "epoch": 2.738353648355318, + "grad_norm": 0.5495984257125057, + "learning_rate": 2.2917237068473484e-07, + "loss": 0.1591, + "step": 8200 + }, + { + "epoch": 2.7386875939221906, + "grad_norm": 0.543004345814433, + "learning_rate": 2.2859113144514055e-07, + "loss": 0.1581, + "step": 8201 + }, + { + "epoch": 2.7390215394890634, + "grad_norm": 0.49894758949368256, + "learning_rate": 2.2801061298303895e-07, + "loss": 0.154, + "step": 8202 + }, + { + "epoch": 2.7393554850559356, + "grad_norm": 0.5334926320488186, + "learning_rate": 2.2743081538612154e-07, + "loss": 0.1525, + "step": 8203 + }, + { + "epoch": 2.7396894306228083, + "grad_norm": 0.5413584792641948, + "learning_rate": 2.268517387419761e-07, + "loss": 0.1558, + "step": 8204 + }, + { + "epoch": 2.740023376189681, + "grad_norm": 0.5686222507446349, + "learning_rate": 2.2627338313807645e-07, + "loss": 0.1636, + "step": 8205 + }, + { + "epoch": 2.7403573217565538, + "grad_norm": 0.5348153988527491, + "learning_rate": 2.2569574866179166e-07, + "loss": 0.1592, + "step": 8206 + }, + { + "epoch": 2.7406912673234265, + "grad_norm": 0.5642140442258513, + "learning_rate": 2.2511883540037805e-07, + "loss": 0.1573, + "step": 8207 + }, + { + "epoch": 2.7410252128902988, + "grad_norm": 0.5024714154716956, + "learning_rate": 2.2454264344098865e-07, + "loss": 0.1479, + "step": 8208 + }, + { + "epoch": 2.7413591584571715, + "grad_norm": 0.517979664407899, + "learning_rate": 2.2396717287066106e-07, + "loss": 0.145, + "step": 8209 + }, + { + "epoch": 2.741693104024044, + "grad_norm": 0.5289911278223267, + "learning_rate": 2.233924237763291e-07, + "loss": 0.1537, + "step": 8210 + }, + { + "epoch": 2.7420270495909165, + "grad_norm": 0.519553175252985, + "learning_rate": 2.2281839624481328e-07, + "loss": 0.1495, + "step": 8211 + }, + { + "epoch": 2.742360995157789, + "grad_norm": 0.5335564407178233, + "learning_rate": 2.222450903628287e-07, + "loss": 0.1557, + "step": 8212 + }, + { + "epoch": 2.742694940724662, + "grad_norm": 0.5617213957103794, + "learning_rate": 2.2167250621697944e-07, + "loss": 0.1529, + "step": 8213 + }, + { + "epoch": 2.7430288862915346, + "grad_norm": 0.533394370268705, + "learning_rate": 2.2110064389376017e-07, + "loss": 0.1554, + "step": 8214 + }, + { + "epoch": 2.7433628318584073, + "grad_norm": 0.5242839412788696, + "learning_rate": 2.205295034795596e-07, + "loss": 0.1545, + "step": 8215 + }, + { + "epoch": 2.7436967774252796, + "grad_norm": 0.5180468545852273, + "learning_rate": 2.1995908506065366e-07, + "loss": 0.1562, + "step": 8216 + }, + { + "epoch": 2.7440307229921523, + "grad_norm": 0.5525667020861501, + "learning_rate": 2.1938938872321014e-07, + "loss": 0.153, + "step": 8217 + }, + { + "epoch": 2.744364668559025, + "grad_norm": 0.5713615936439899, + "learning_rate": 2.1882041455329073e-07, + "loss": 0.1606, + "step": 8218 + }, + { + "epoch": 2.7446986141258973, + "grad_norm": 0.5264399545928933, + "learning_rate": 2.1825216263684336e-07, + "loss": 0.1556, + "step": 8219 + }, + { + "epoch": 2.74503255969277, + "grad_norm": 0.4902137034825769, + "learning_rate": 2.176846330597099e-07, + "loss": 0.1515, + "step": 8220 + }, + { + "epoch": 2.7453665052596428, + "grad_norm": 0.5132192422380915, + "learning_rate": 2.1711782590762344e-07, + "loss": 0.148, + "step": 8221 + }, + { + "epoch": 2.745700450826515, + "grad_norm": 0.5177457173738288, + "learning_rate": 2.165517412662055e-07, + "loss": 0.1553, + "step": 8222 + }, + { + "epoch": 2.7460343963933878, + "grad_norm": 0.5390903052995786, + "learning_rate": 2.1598637922097098e-07, + "loss": 0.1537, + "step": 8223 + }, + { + "epoch": 2.7463683419602605, + "grad_norm": 0.5250086474438852, + "learning_rate": 2.1542173985732274e-07, + "loss": 0.1467, + "step": 8224 + }, + { + "epoch": 2.746702287527133, + "grad_norm": 0.4842237891444192, + "learning_rate": 2.148578232605575e-07, + "loss": 0.1439, + "step": 8225 + }, + { + "epoch": 2.747036233094006, + "grad_norm": 0.4949766102656577, + "learning_rate": 2.14294629515861e-07, + "loss": 0.1492, + "step": 8226 + }, + { + "epoch": 2.747370178660878, + "grad_norm": 0.5230593113441051, + "learning_rate": 2.137321587083119e-07, + "loss": 0.1517, + "step": 8227 + }, + { + "epoch": 2.747704124227751, + "grad_norm": 0.5123186979953736, + "learning_rate": 2.1317041092287548e-07, + "loss": 0.1467, + "step": 8228 + }, + { + "epoch": 2.7480380697946236, + "grad_norm": 0.5123779178667591, + "learning_rate": 2.126093862444123e-07, + "loss": 0.1604, + "step": 8229 + }, + { + "epoch": 2.748372015361496, + "grad_norm": 0.5225226900228949, + "learning_rate": 2.1204908475767005e-07, + "loss": 0.1521, + "step": 8230 + }, + { + "epoch": 2.7487059609283686, + "grad_norm": 0.5023831154513653, + "learning_rate": 2.114895065472905e-07, + "loss": 0.1449, + "step": 8231 + }, + { + "epoch": 2.7490399064952413, + "grad_norm": 0.5847888545067589, + "learning_rate": 2.109306516978038e-07, + "loss": 0.1677, + "step": 8232 + }, + { + "epoch": 2.7493738520621136, + "grad_norm": 0.49554499992889467, + "learning_rate": 2.1037252029363242e-07, + "loss": 0.1469, + "step": 8233 + }, + { + "epoch": 2.7497077976289863, + "grad_norm": 0.5556392349656654, + "learning_rate": 2.098151124190867e-07, + "loss": 0.164, + "step": 8234 + }, + { + "epoch": 2.750041743195859, + "grad_norm": 0.5186943346251153, + "learning_rate": 2.092584281583715e-07, + "loss": 0.1501, + "step": 8235 + }, + { + "epoch": 2.7503756887627318, + "grad_norm": 0.5265346475920051, + "learning_rate": 2.0870246759557956e-07, + "loss": 0.1491, + "step": 8236 + }, + { + "epoch": 2.7507096343296045, + "grad_norm": 0.5111789881374325, + "learning_rate": 2.0814723081469535e-07, + "loss": 0.1546, + "step": 8237 + }, + { + "epoch": 2.7510435798964767, + "grad_norm": 0.5401318445314278, + "learning_rate": 2.0759271789959513e-07, + "loss": 0.1534, + "step": 8238 + }, + { + "epoch": 2.7513775254633495, + "grad_norm": 0.534377040910652, + "learning_rate": 2.0703892893404299e-07, + "loss": 0.1541, + "step": 8239 + }, + { + "epoch": 2.751711471030222, + "grad_norm": 0.5721615250971579, + "learning_rate": 2.064858640016959e-07, + "loss": 0.1592, + "step": 8240 + }, + { + "epoch": 2.7520454165970945, + "grad_norm": 0.5306616003142766, + "learning_rate": 2.0593352318610093e-07, + "loss": 0.1585, + "step": 8241 + }, + { + "epoch": 2.752379362163967, + "grad_norm": 0.502808647723062, + "learning_rate": 2.0538190657069523e-07, + "loss": 0.1422, + "step": 8242 + }, + { + "epoch": 2.75271330773084, + "grad_norm": 0.5580436841293485, + "learning_rate": 2.048310142388077e-07, + "loss": 0.161, + "step": 8243 + }, + { + "epoch": 2.7530472532977126, + "grad_norm": 0.5351425052272484, + "learning_rate": 2.0428084627365729e-07, + "loss": 0.149, + "step": 8244 + }, + { + "epoch": 2.7533811988645853, + "grad_norm": 0.4946587104254831, + "learning_rate": 2.0373140275835203e-07, + "loss": 0.1361, + "step": 8245 + }, + { + "epoch": 2.7537151444314576, + "grad_norm": 0.5167893750141161, + "learning_rate": 2.0318268377589323e-07, + "loss": 0.1504, + "step": 8246 + }, + { + "epoch": 2.7540490899983303, + "grad_norm": 0.550129526570199, + "learning_rate": 2.026346894091702e-07, + "loss": 0.1593, + "step": 8247 + }, + { + "epoch": 2.754383035565203, + "grad_norm": 0.5486648519214529, + "learning_rate": 2.0208741974096445e-07, + "loss": 0.1584, + "step": 8248 + }, + { + "epoch": 2.7547169811320753, + "grad_norm": 0.5266687534141231, + "learning_rate": 2.0154087485394713e-07, + "loss": 0.1468, + "step": 8249 + }, + { + "epoch": 2.755050926698948, + "grad_norm": 0.5303620209898399, + "learning_rate": 2.0099505483068216e-07, + "loss": 0.1524, + "step": 8250 + }, + { + "epoch": 2.7553848722658207, + "grad_norm": 0.5017401492165686, + "learning_rate": 2.0044995975361914e-07, + "loss": 0.1476, + "step": 8251 + }, + { + "epoch": 2.755718817832693, + "grad_norm": 0.5512204576696247, + "learning_rate": 1.9990558970510388e-07, + "loss": 0.1627, + "step": 8252 + }, + { + "epoch": 2.7560527633995657, + "grad_norm": 0.5328549457695625, + "learning_rate": 1.9936194476736782e-07, + "loss": 0.1469, + "step": 8253 + }, + { + "epoch": 2.7563867089664384, + "grad_norm": 0.5332369435035683, + "learning_rate": 1.9881902502253525e-07, + "loss": 0.1586, + "step": 8254 + }, + { + "epoch": 2.756720654533311, + "grad_norm": 0.5442681969901398, + "learning_rate": 1.9827683055262114e-07, + "loss": 0.1625, + "step": 8255 + }, + { + "epoch": 2.757054600100184, + "grad_norm": 0.5073616134036689, + "learning_rate": 1.977353614395311e-07, + "loss": 0.1479, + "step": 8256 + }, + { + "epoch": 2.757388545667056, + "grad_norm": 0.5378551125440103, + "learning_rate": 1.971946177650591e-07, + "loss": 0.1518, + "step": 8257 + }, + { + "epoch": 2.757722491233929, + "grad_norm": 0.549121416484626, + "learning_rate": 1.966545996108915e-07, + "loss": 0.1548, + "step": 8258 + }, + { + "epoch": 2.7580564368008016, + "grad_norm": 0.550399829623678, + "learning_rate": 1.961153070586036e-07, + "loss": 0.1606, + "step": 8259 + }, + { + "epoch": 2.758390382367674, + "grad_norm": 0.5449800142650917, + "learning_rate": 1.9557674018966244e-07, + "loss": 0.1538, + "step": 8260 + }, + { + "epoch": 2.7587243279345466, + "grad_norm": 0.5752625807188275, + "learning_rate": 1.9503889908542572e-07, + "loss": 0.1673, + "step": 8261 + }, + { + "epoch": 2.7590582735014193, + "grad_norm": 0.5737368577234173, + "learning_rate": 1.9450178382713957e-07, + "loss": 0.1677, + "step": 8262 + }, + { + "epoch": 2.759392219068292, + "grad_norm": 0.5051673849530389, + "learning_rate": 1.9396539449594131e-07, + "loss": 0.1503, + "step": 8263 + }, + { + "epoch": 2.7597261646351647, + "grad_norm": 0.4914480531561124, + "learning_rate": 1.9342973117286056e-07, + "loss": 0.144, + "step": 8264 + }, + { + "epoch": 2.760060110202037, + "grad_norm": 0.5631823006449259, + "learning_rate": 1.9289479393881317e-07, + "loss": 0.165, + "step": 8265 + }, + { + "epoch": 2.7603940557689097, + "grad_norm": 0.5355383401496868, + "learning_rate": 1.9236058287460946e-07, + "loss": 0.1565, + "step": 8266 + }, + { + "epoch": 2.7607280013357824, + "grad_norm": 0.5450111267966022, + "learning_rate": 1.9182709806094823e-07, + "loss": 0.1658, + "step": 8267 + }, + { + "epoch": 2.7610619469026547, + "grad_norm": 0.5399961091071, + "learning_rate": 1.9129433957841781e-07, + "loss": 0.1513, + "step": 8268 + }, + { + "epoch": 2.7613958924695274, + "grad_norm": 0.5322165741350056, + "learning_rate": 1.907623075074988e-07, + "loss": 0.1516, + "step": 8269 + }, + { + "epoch": 2.7617298380364, + "grad_norm": 0.5481681573013868, + "learning_rate": 1.9023100192855914e-07, + "loss": 0.1608, + "step": 8270 + }, + { + "epoch": 2.7620637836032724, + "grad_norm": 0.544718465373997, + "learning_rate": 1.897004229218602e-07, + "loss": 0.1524, + "step": 8271 + }, + { + "epoch": 2.762397729170145, + "grad_norm": 0.5542344358536933, + "learning_rate": 1.8917057056755172e-07, + "loss": 0.1545, + "step": 8272 + }, + { + "epoch": 2.762731674737018, + "grad_norm": 0.4983410315703037, + "learning_rate": 1.8864144494567528e-07, + "loss": 0.1503, + "step": 8273 + }, + { + "epoch": 2.7630656203038906, + "grad_norm": 0.5290491211012668, + "learning_rate": 1.881130461361591e-07, + "loss": 0.1575, + "step": 8274 + }, + { + "epoch": 2.7633995658707633, + "grad_norm": 0.5105092858702401, + "learning_rate": 1.8758537421882662e-07, + "loss": 0.1462, + "step": 8275 + }, + { + "epoch": 2.7637335114376356, + "grad_norm": 0.5636688731155922, + "learning_rate": 1.870584292733868e-07, + "loss": 0.1596, + "step": 8276 + }, + { + "epoch": 2.7640674570045083, + "grad_norm": 0.547183058318822, + "learning_rate": 1.8653221137944155e-07, + "loss": 0.1678, + "step": 8277 + }, + { + "epoch": 2.764401402571381, + "grad_norm": 0.49128257503638373, + "learning_rate": 1.8600672061648283e-07, + "loss": 0.1442, + "step": 8278 + }, + { + "epoch": 2.7647353481382533, + "grad_norm": 0.5670280593322085, + "learning_rate": 1.8548195706389272e-07, + "loss": 0.1703, + "step": 8279 + }, + { + "epoch": 2.765069293705126, + "grad_norm": 0.4735398261466528, + "learning_rate": 1.849579208009411e-07, + "loss": 0.1331, + "step": 8280 + }, + { + "epoch": 2.7654032392719987, + "grad_norm": 0.5617939142612081, + "learning_rate": 1.844346119067919e-07, + "loss": 0.167, + "step": 8281 + }, + { + "epoch": 2.765737184838871, + "grad_norm": 0.5642449533519758, + "learning_rate": 1.8391203046049522e-07, + "loss": 0.1643, + "step": 8282 + }, + { + "epoch": 2.7660711304057437, + "grad_norm": 0.5244528138173514, + "learning_rate": 1.8339017654099344e-07, + "loss": 0.1534, + "step": 8283 + }, + { + "epoch": 2.7664050759726164, + "grad_norm": 0.5292603150792725, + "learning_rate": 1.828690502271202e-07, + "loss": 0.1611, + "step": 8284 + }, + { + "epoch": 2.766739021539489, + "grad_norm": 0.5377384654986338, + "learning_rate": 1.823486515975964e-07, + "loss": 0.1575, + "step": 8285 + }, + { + "epoch": 2.767072967106362, + "grad_norm": 0.5817154299132605, + "learning_rate": 1.818289807310347e-07, + "loss": 0.1533, + "step": 8286 + }, + { + "epoch": 2.767406912673234, + "grad_norm": 0.5228418557171426, + "learning_rate": 1.813100377059379e-07, + "loss": 0.1563, + "step": 8287 + }, + { + "epoch": 2.767740858240107, + "grad_norm": 0.5760879601371929, + "learning_rate": 1.8079182260069773e-07, + "loss": 0.1561, + "step": 8288 + }, + { + "epoch": 2.7680748038069796, + "grad_norm": 0.49105535586314847, + "learning_rate": 1.8027433549359764e-07, + "loss": 0.1542, + "step": 8289 + }, + { + "epoch": 2.768408749373852, + "grad_norm": 0.5680155038927197, + "learning_rate": 1.7975757646280955e-07, + "loss": 0.1668, + "step": 8290 + }, + { + "epoch": 2.7687426949407246, + "grad_norm": 0.5082235918459275, + "learning_rate": 1.792415455863955e-07, + "loss": 0.1557, + "step": 8291 + }, + { + "epoch": 2.7690766405075973, + "grad_norm": 0.5112458964828165, + "learning_rate": 1.7872624294230924e-07, + "loss": 0.1538, + "step": 8292 + }, + { + "epoch": 2.76941058607447, + "grad_norm": 0.5128409883230706, + "learning_rate": 1.7821166860839179e-07, + "loss": 0.1502, + "step": 8293 + }, + { + "epoch": 2.7697445316413427, + "grad_norm": 0.5237625902927323, + "learning_rate": 1.7769782266237767e-07, + "loss": 0.1508, + "step": 8294 + }, + { + "epoch": 2.770078477208215, + "grad_norm": 0.4762761343236909, + "learning_rate": 1.7718470518188645e-07, + "loss": 0.1441, + "step": 8295 + }, + { + "epoch": 2.7704124227750877, + "grad_norm": 0.5057082621318426, + "learning_rate": 1.7667231624443393e-07, + "loss": 0.1431, + "step": 8296 + }, + { + "epoch": 2.7707463683419604, + "grad_norm": 0.5366015119904906, + "learning_rate": 1.7616065592742038e-07, + "loss": 0.1633, + "step": 8297 + }, + { + "epoch": 2.7710803139088327, + "grad_norm": 0.5063382829156947, + "learning_rate": 1.7564972430813899e-07, + "loss": 0.1511, + "step": 8298 + }, + { + "epoch": 2.7714142594757054, + "grad_norm": 0.584640844229602, + "learning_rate": 1.751395214637708e-07, + "loss": 0.165, + "step": 8299 + }, + { + "epoch": 2.771748205042578, + "grad_norm": 0.51047234479896, + "learning_rate": 1.7463004747138967e-07, + "loss": 0.1502, + "step": 8300 + }, + { + "epoch": 2.7720821506094504, + "grad_norm": 0.5145726980495454, + "learning_rate": 1.7412130240795578e-07, + "loss": 0.1496, + "step": 8301 + }, + { + "epoch": 2.772416096176323, + "grad_norm": 0.5479476488710432, + "learning_rate": 1.736132863503226e-07, + "loss": 0.1591, + "step": 8302 + }, + { + "epoch": 2.772750041743196, + "grad_norm": 0.5223980288241024, + "learning_rate": 1.7310599937523153e-07, + "loss": 0.16, + "step": 8303 + }, + { + "epoch": 2.7730839873100686, + "grad_norm": 0.5203609186908762, + "learning_rate": 1.7259944155931407e-07, + "loss": 0.154, + "step": 8304 + }, + { + "epoch": 2.7734179328769413, + "grad_norm": 0.5262438153454855, + "learning_rate": 1.720936129790912e-07, + "loss": 0.1534, + "step": 8305 + }, + { + "epoch": 2.7737518784438135, + "grad_norm": 0.5572448507311574, + "learning_rate": 1.7158851371097518e-07, + "loss": 0.164, + "step": 8306 + }, + { + "epoch": 2.7740858240106863, + "grad_norm": 0.5516458484449425, + "learning_rate": 1.7108414383126658e-07, + "loss": 0.1575, + "step": 8307 + }, + { + "epoch": 2.774419769577559, + "grad_norm": 0.5305221309834752, + "learning_rate": 1.7058050341615783e-07, + "loss": 0.1504, + "step": 8308 + }, + { + "epoch": 2.7747537151444313, + "grad_norm": 0.5465339511703754, + "learning_rate": 1.7007759254172752e-07, + "loss": 0.1597, + "step": 8309 + }, + { + "epoch": 2.775087660711304, + "grad_norm": 0.5236311349194479, + "learning_rate": 1.6957541128394817e-07, + "loss": 0.151, + "step": 8310 + }, + { + "epoch": 2.7754216062781767, + "grad_norm": 0.5794917232299244, + "learning_rate": 1.6907395971867858e-07, + "loss": 0.165, + "step": 8311 + }, + { + "epoch": 2.7757555518450494, + "grad_norm": 0.5111090214492914, + "learning_rate": 1.685732379216698e-07, + "loss": 0.1537, + "step": 8312 + }, + { + "epoch": 2.776089497411922, + "grad_norm": 0.5182270741000585, + "learning_rate": 1.680732459685619e-07, + "loss": 0.1485, + "step": 8313 + }, + { + "epoch": 2.7764234429787944, + "grad_norm": 0.5299871536028107, + "learning_rate": 1.6757398393488443e-07, + "loss": 0.1615, + "step": 8314 + }, + { + "epoch": 2.776757388545667, + "grad_norm": 0.5229619474271693, + "learning_rate": 1.6707545189605657e-07, + "loss": 0.1549, + "step": 8315 + }, + { + "epoch": 2.77709133411254, + "grad_norm": 0.5312633993540525, + "learning_rate": 1.6657764992738746e-07, + "loss": 0.1508, + "step": 8316 + }, + { + "epoch": 2.777425279679412, + "grad_norm": 0.5207042176983809, + "learning_rate": 1.6608057810407586e-07, + "loss": 0.1548, + "step": 8317 + }, + { + "epoch": 2.777759225246285, + "grad_norm": 0.509134113266697, + "learning_rate": 1.6558423650121003e-07, + "loss": 0.149, + "step": 8318 + }, + { + "epoch": 2.7780931708131575, + "grad_norm": 0.5635185316038204, + "learning_rate": 1.6508862519376945e-07, + "loss": 0.166, + "step": 8319 + }, + { + "epoch": 2.77842711638003, + "grad_norm": 0.5039009942903678, + "learning_rate": 1.6459374425662088e-07, + "loss": 0.152, + "step": 8320 + }, + { + "epoch": 2.7787610619469025, + "grad_norm": 0.5185011394458492, + "learning_rate": 1.6409959376452289e-07, + "loss": 0.1497, + "step": 8321 + }, + { + "epoch": 2.7790950075137753, + "grad_norm": 0.5035741229676389, + "learning_rate": 1.6360617379212185e-07, + "loss": 0.147, + "step": 8322 + }, + { + "epoch": 2.779428953080648, + "grad_norm": 0.5234105413015845, + "learning_rate": 1.6311348441395535e-07, + "loss": 0.1525, + "step": 8323 + }, + { + "epoch": 2.7797628986475207, + "grad_norm": 0.5402213344741763, + "learning_rate": 1.6262152570444777e-07, + "loss": 0.1567, + "step": 8324 + }, + { + "epoch": 2.780096844214393, + "grad_norm": 0.500714119408257, + "learning_rate": 1.6213029773791912e-07, + "loss": 0.1503, + "step": 8325 + }, + { + "epoch": 2.7804307897812657, + "grad_norm": 0.5327017668640379, + "learning_rate": 1.6163980058857164e-07, + "loss": 0.159, + "step": 8326 + }, + { + "epoch": 2.7807647353481384, + "grad_norm": 0.5552130569413134, + "learning_rate": 1.6115003433050336e-07, + "loss": 0.1523, + "step": 8327 + }, + { + "epoch": 2.7810986809150107, + "grad_norm": 0.5036265248479774, + "learning_rate": 1.6066099903769726e-07, + "loss": 0.1443, + "step": 8328 + }, + { + "epoch": 2.7814326264818834, + "grad_norm": 0.5280869369504387, + "learning_rate": 1.6017269478402875e-07, + "loss": 0.1594, + "step": 8329 + }, + { + "epoch": 2.781766572048756, + "grad_norm": 0.5299118807830349, + "learning_rate": 1.59685121643261e-07, + "loss": 0.1493, + "step": 8330 + }, + { + "epoch": 2.7821005176156284, + "grad_norm": 0.5115595941536983, + "learning_rate": 1.5919827968904955e-07, + "loss": 0.1498, + "step": 8331 + }, + { + "epoch": 2.782434463182501, + "grad_norm": 0.5517706631282778, + "learning_rate": 1.5871216899493612e-07, + "loss": 0.1623, + "step": 8332 + }, + { + "epoch": 2.782768408749374, + "grad_norm": 0.5322940156215018, + "learning_rate": 1.5822678963435479e-07, + "loss": 0.1637, + "step": 8333 + }, + { + "epoch": 2.7831023543162465, + "grad_norm": 0.5279044532161087, + "learning_rate": 1.5774214168062575e-07, + "loss": 0.1544, + "step": 8334 + }, + { + "epoch": 2.7834362998831192, + "grad_norm": 0.5358377006180688, + "learning_rate": 1.5725822520696267e-07, + "loss": 0.159, + "step": 8335 + }, + { + "epoch": 2.7837702454499915, + "grad_norm": 0.5181269469955557, + "learning_rate": 1.567750402864654e-07, + "loss": 0.149, + "step": 8336 + }, + { + "epoch": 2.7841041910168642, + "grad_norm": 0.5627060721708065, + "learning_rate": 1.5629258699212613e-07, + "loss": 0.1598, + "step": 8337 + }, + { + "epoch": 2.784438136583737, + "grad_norm": 0.5672074227846221, + "learning_rate": 1.5581086539682433e-07, + "loss": 0.1647, + "step": 8338 + }, + { + "epoch": 2.7847720821506092, + "grad_norm": 0.5532121190782273, + "learning_rate": 1.5532987557332902e-07, + "loss": 0.1606, + "step": 8339 + }, + { + "epoch": 2.785106027717482, + "grad_norm": 0.5144356673510838, + "learning_rate": 1.5484961759430095e-07, + "loss": 0.1483, + "step": 8340 + }, + { + "epoch": 2.7854399732843547, + "grad_norm": 0.5476944503372652, + "learning_rate": 1.5437009153228766e-07, + "loss": 0.1581, + "step": 8341 + }, + { + "epoch": 2.7857739188512274, + "grad_norm": 0.5373962416791143, + "learning_rate": 1.538912974597273e-07, + "loss": 0.1578, + "step": 8342 + }, + { + "epoch": 2.7861078644181, + "grad_norm": 0.5169478563051959, + "learning_rate": 1.5341323544894758e-07, + "loss": 0.1531, + "step": 8343 + }, + { + "epoch": 2.7864418099849724, + "grad_norm": 0.5270574826065255, + "learning_rate": 1.5293590557216577e-07, + "loss": 0.1554, + "step": 8344 + }, + { + "epoch": 2.786775755551845, + "grad_norm": 0.5322542069948488, + "learning_rate": 1.5245930790148743e-07, + "loss": 0.1533, + "step": 8345 + }, + { + "epoch": 2.787109701118718, + "grad_norm": 0.5002626708460715, + "learning_rate": 1.5198344250890894e-07, + "loss": 0.1479, + "step": 8346 + }, + { + "epoch": 2.78744364668559, + "grad_norm": 0.510836318152733, + "learning_rate": 1.515083094663139e-07, + "loss": 0.1564, + "step": 8347 + }, + { + "epoch": 2.787777592252463, + "grad_norm": 0.5024767543651935, + "learning_rate": 1.5103390884547931e-07, + "loss": 0.1496, + "step": 8348 + }, + { + "epoch": 2.7881115378193355, + "grad_norm": 0.5256020147012425, + "learning_rate": 1.5056024071806674e-07, + "loss": 0.156, + "step": 8349 + }, + { + "epoch": 2.788445483386208, + "grad_norm": 0.5513860801672124, + "learning_rate": 1.5008730515563064e-07, + "loss": 0.1702, + "step": 8350 + }, + { + "epoch": 2.7887794289530805, + "grad_norm": 0.5583302740203661, + "learning_rate": 1.4961510222961216e-07, + "loss": 0.1689, + "step": 8351 + }, + { + "epoch": 2.7891133745199532, + "grad_norm": 0.5442873203295383, + "learning_rate": 1.4914363201134486e-07, + "loss": 0.1543, + "step": 8352 + }, + { + "epoch": 2.789447320086826, + "grad_norm": 0.5524028432461694, + "learning_rate": 1.4867289457204726e-07, + "loss": 0.1726, + "step": 8353 + }, + { + "epoch": 2.7897812656536987, + "grad_norm": 0.530554499586277, + "learning_rate": 1.4820288998283304e-07, + "loss": 0.1517, + "step": 8354 + }, + { + "epoch": 2.790115211220571, + "grad_norm": 0.556591191428861, + "learning_rate": 1.477336183146999e-07, + "loss": 0.1557, + "step": 8355 + }, + { + "epoch": 2.7904491567874437, + "grad_norm": 0.544705956885302, + "learning_rate": 1.4726507963853776e-07, + "loss": 0.1627, + "step": 8356 + }, + { + "epoch": 2.7907831023543164, + "grad_norm": 0.5220600913224868, + "learning_rate": 1.4679727402512334e-07, + "loss": 0.1527, + "step": 8357 + }, + { + "epoch": 2.7911170479211886, + "grad_norm": 0.52810589422968, + "learning_rate": 1.4633020154512677e-07, + "loss": 0.1551, + "step": 8358 + }, + { + "epoch": 2.7914509934880614, + "grad_norm": 0.4970005355575271, + "learning_rate": 1.458638622691022e-07, + "loss": 0.1479, + "step": 8359 + }, + { + "epoch": 2.791784939054934, + "grad_norm": 0.5212873276869456, + "learning_rate": 1.4539825626749715e-07, + "loss": 0.1541, + "step": 8360 + }, + { + "epoch": 2.792118884621807, + "grad_norm": 0.5042382836183906, + "learning_rate": 1.4493338361064646e-07, + "loss": 0.1455, + "step": 8361 + }, + { + "epoch": 2.7924528301886795, + "grad_norm": 0.5309456541518355, + "learning_rate": 1.4446924436877507e-07, + "loss": 0.1609, + "step": 8362 + }, + { + "epoch": 2.792786775755552, + "grad_norm": 0.5118129734037632, + "learning_rate": 1.4400583861199636e-07, + "loss": 0.1541, + "step": 8363 + }, + { + "epoch": 2.7931207213224245, + "grad_norm": 0.4853226783481245, + "learning_rate": 1.4354316641031263e-07, + "loss": 0.1543, + "step": 8364 + }, + { + "epoch": 2.7934546668892972, + "grad_norm": 0.541481071236972, + "learning_rate": 1.4308122783361688e-07, + "loss": 0.1523, + "step": 8365 + }, + { + "epoch": 2.7937886124561695, + "grad_norm": 0.5278220146033721, + "learning_rate": 1.4262002295168997e-07, + "loss": 0.1547, + "step": 8366 + }, + { + "epoch": 2.794122558023042, + "grad_norm": 0.5135966918537538, + "learning_rate": 1.4215955183420282e-07, + "loss": 0.1502, + "step": 8367 + }, + { + "epoch": 2.794456503589915, + "grad_norm": 0.4920830612333697, + "learning_rate": 1.4169981455071368e-07, + "loss": 0.1571, + "step": 8368 + }, + { + "epoch": 2.794790449156787, + "grad_norm": 0.5148476681687206, + "learning_rate": 1.4124081117067313e-07, + "loss": 0.1553, + "step": 8369 + }, + { + "epoch": 2.79512439472366, + "grad_norm": 0.5234172849783939, + "learning_rate": 1.4078254176341788e-07, + "loss": 0.1583, + "step": 8370 + }, + { + "epoch": 2.7954583402905326, + "grad_norm": 0.5072703583829362, + "learning_rate": 1.4032500639817426e-07, + "loss": 0.154, + "step": 8371 + }, + { + "epoch": 2.7957922858574054, + "grad_norm": 0.5591966083853513, + "learning_rate": 1.3986820514405973e-07, + "loss": 0.1635, + "step": 8372 + }, + { + "epoch": 2.796126231424278, + "grad_norm": 0.5831281500644744, + "learning_rate": 1.394121380700797e-07, + "loss": 0.1666, + "step": 8373 + }, + { + "epoch": 2.7964601769911503, + "grad_norm": 0.5324043094043531, + "learning_rate": 1.3895680524512734e-07, + "loss": 0.1529, + "step": 8374 + }, + { + "epoch": 2.796794122558023, + "grad_norm": 0.5407819919549826, + "learning_rate": 1.3850220673798655e-07, + "loss": 0.1523, + "step": 8375 + }, + { + "epoch": 2.797128068124896, + "grad_norm": 0.571743758557889, + "learning_rate": 1.3804834261732957e-07, + "loss": 0.1604, + "step": 8376 + }, + { + "epoch": 2.797462013691768, + "grad_norm": 0.5261550026856082, + "learning_rate": 1.3759521295171773e-07, + "loss": 0.1579, + "step": 8377 + }, + { + "epoch": 2.7977959592586408, + "grad_norm": 0.5077553496486351, + "learning_rate": 1.3714281780960237e-07, + "loss": 0.1549, + "step": 8378 + }, + { + "epoch": 2.7981299048255135, + "grad_norm": 0.5512498148977449, + "learning_rate": 1.366911572593227e-07, + "loss": 0.1462, + "step": 8379 + }, + { + "epoch": 2.7984638503923858, + "grad_norm": 0.5278418783066358, + "learning_rate": 1.3624023136910691e-07, + "loss": 0.1527, + "step": 8380 + }, + { + "epoch": 2.7987977959592585, + "grad_norm": 0.5720969297382702, + "learning_rate": 1.3579004020707387e-07, + "loss": 0.1616, + "step": 8381 + }, + { + "epoch": 2.799131741526131, + "grad_norm": 0.5391404210914308, + "learning_rate": 1.3534058384122862e-07, + "loss": 0.1588, + "step": 8382 + }, + { + "epoch": 2.799465687093004, + "grad_norm": 0.5391505004911138, + "learning_rate": 1.3489186233946793e-07, + "loss": 0.1586, + "step": 8383 + }, + { + "epoch": 2.7997996326598766, + "grad_norm": 0.5405451852070394, + "learning_rate": 1.3444387576957706e-07, + "loss": 0.162, + "step": 8384 + }, + { + "epoch": 2.800133578226749, + "grad_norm": 0.5523476563053142, + "learning_rate": 1.33996624199228e-07, + "loss": 0.1588, + "step": 8385 + }, + { + "epoch": 2.8004675237936216, + "grad_norm": 0.520024555301227, + "learning_rate": 1.335501076959844e-07, + "loss": 0.1544, + "step": 8386 + }, + { + "epoch": 2.8008014693604943, + "grad_norm": 0.5772233450893358, + "learning_rate": 1.331043263272974e-07, + "loss": 0.1648, + "step": 8387 + }, + { + "epoch": 2.8011354149273666, + "grad_norm": 0.5240091069680041, + "learning_rate": 1.3265928016050756e-07, + "loss": 0.1468, + "step": 8388 + }, + { + "epoch": 2.8014693604942393, + "grad_norm": 0.5134959826858604, + "learning_rate": 1.3221496926284493e-07, + "loss": 0.1523, + "step": 8389 + }, + { + "epoch": 2.801803306061112, + "grad_norm": 0.53411701119722, + "learning_rate": 1.3177139370142755e-07, + "loss": 0.1533, + "step": 8390 + }, + { + "epoch": 2.8021372516279848, + "grad_norm": 0.5451994225507716, + "learning_rate": 1.3132855354326236e-07, + "loss": 0.1565, + "step": 8391 + }, + { + "epoch": 2.8024711971948575, + "grad_norm": 0.4988335998137933, + "learning_rate": 1.3088644885524637e-07, + "loss": 0.1479, + "step": 8392 + }, + { + "epoch": 2.8028051427617298, + "grad_norm": 0.5470946888485676, + "learning_rate": 1.3044507970416398e-07, + "loss": 0.1557, + "step": 8393 + }, + { + "epoch": 2.8031390883286025, + "grad_norm": 0.5434547017527317, + "learning_rate": 1.3000444615668906e-07, + "loss": 0.1557, + "step": 8394 + }, + { + "epoch": 2.803473033895475, + "grad_norm": 0.5210177368074225, + "learning_rate": 1.2956454827938557e-07, + "loss": 0.1471, + "step": 8395 + }, + { + "epoch": 2.8038069794623475, + "grad_norm": 0.506645320292903, + "learning_rate": 1.291253861387043e-07, + "loss": 0.1385, + "step": 8396 + }, + { + "epoch": 2.80414092502922, + "grad_norm": 0.5153043838572172, + "learning_rate": 1.28686959800986e-07, + "loss": 0.1547, + "step": 8397 + }, + { + "epoch": 2.804474870596093, + "grad_norm": 0.5433374471677337, + "learning_rate": 1.2824926933246106e-07, + "loss": 0.1577, + "step": 8398 + }, + { + "epoch": 2.804808816162965, + "grad_norm": 0.5741759511423757, + "learning_rate": 1.2781231479924606e-07, + "loss": 0.1671, + "step": 8399 + }, + { + "epoch": 2.805142761729838, + "grad_norm": 0.5348545400171368, + "learning_rate": 1.2737609626734927e-07, + "loss": 0.1571, + "step": 8400 + }, + { + "epoch": 2.8054767072967106, + "grad_norm": 0.5347605161338985, + "learning_rate": 1.269406138026663e-07, + "loss": 0.1603, + "step": 8401 + }, + { + "epoch": 2.8058106528635833, + "grad_norm": 0.5372346879839113, + "learning_rate": 1.2650586747098238e-07, + "loss": 0.1609, + "step": 8402 + }, + { + "epoch": 2.806144598430456, + "grad_norm": 0.5184156394531372, + "learning_rate": 1.2607185733797044e-07, + "loss": 0.1531, + "step": 8403 + }, + { + "epoch": 2.8064785439973283, + "grad_norm": 0.5193297902452196, + "learning_rate": 1.2563858346919365e-07, + "loss": 0.1495, + "step": 8404 + }, + { + "epoch": 2.806812489564201, + "grad_norm": 0.4735671716020055, + "learning_rate": 1.2520604593010189e-07, + "loss": 0.139, + "step": 8405 + }, + { + "epoch": 2.8071464351310738, + "grad_norm": 0.5452990496424134, + "learning_rate": 1.247742447860356e-07, + "loss": 0.1518, + "step": 8406 + }, + { + "epoch": 2.807480380697946, + "grad_norm": 0.5197495861568605, + "learning_rate": 1.2434318010222434e-07, + "loss": 0.1531, + "step": 8407 + }, + { + "epoch": 2.8078143262648187, + "grad_norm": 0.5682800571844647, + "learning_rate": 1.2391285194378433e-07, + "loss": 0.1558, + "step": 8408 + }, + { + "epoch": 2.8081482718316915, + "grad_norm": 0.5476986226433352, + "learning_rate": 1.2348326037572244e-07, + "loss": 0.1642, + "step": 8409 + }, + { + "epoch": 2.808482217398564, + "grad_norm": 0.5660502282746109, + "learning_rate": 1.2305440546293236e-07, + "loss": 0.1727, + "step": 8410 + }, + { + "epoch": 2.808816162965437, + "grad_norm": 0.4987505875655572, + "learning_rate": 1.2262628727019942e-07, + "loss": 0.1414, + "step": 8411 + }, + { + "epoch": 2.809150108532309, + "grad_norm": 0.5209194368951366, + "learning_rate": 1.221989058621942e-07, + "loss": 0.1512, + "step": 8412 + }, + { + "epoch": 2.809484054099182, + "grad_norm": 0.5773338493631209, + "learning_rate": 1.2177226130347886e-07, + "loss": 0.1533, + "step": 8413 + }, + { + "epoch": 2.8098179996660546, + "grad_norm": 0.49229443873769146, + "learning_rate": 1.21346353658503e-07, + "loss": 0.1488, + "step": 8414 + }, + { + "epoch": 2.810151945232927, + "grad_norm": 0.5239314850265931, + "learning_rate": 1.209211829916046e-07, + "loss": 0.1632, + "step": 8415 + }, + { + "epoch": 2.8104858907997996, + "grad_norm": 0.5733286509903617, + "learning_rate": 1.204967493670106e-07, + "loss": 0.1632, + "step": 8416 + }, + { + "epoch": 2.8108198363666723, + "grad_norm": 0.5649617767440496, + "learning_rate": 1.2007305284883696e-07, + "loss": 0.1524, + "step": 8417 + }, + { + "epoch": 2.8111537819335446, + "grad_norm": 0.5495402597096266, + "learning_rate": 1.1965009350108747e-07, + "loss": 0.1856, + "step": 8418 + }, + { + "epoch": 2.8114877275004173, + "grad_norm": 0.4862318982701236, + "learning_rate": 1.1922787138765656e-07, + "loss": 0.1439, + "step": 8419 + }, + { + "epoch": 2.81182167306729, + "grad_norm": 0.5452192079690045, + "learning_rate": 1.188063865723238e-07, + "loss": 0.1584, + "step": 8420 + }, + { + "epoch": 2.8121556186341627, + "grad_norm": 0.5197784434274771, + "learning_rate": 1.1838563911876155e-07, + "loss": 0.1568, + "step": 8421 + }, + { + "epoch": 2.8124895642010355, + "grad_norm": 0.5583439489780697, + "learning_rate": 1.1796562909052734e-07, + "loss": 0.1571, + "step": 8422 + }, + { + "epoch": 2.8128235097679077, + "grad_norm": 0.5495414718376356, + "learning_rate": 1.1754635655106928e-07, + "loss": 0.1638, + "step": 8423 + }, + { + "epoch": 2.8131574553347805, + "grad_norm": 0.5353757303775786, + "learning_rate": 1.1712782156372226e-07, + "loss": 0.1556, + "step": 8424 + }, + { + "epoch": 2.813491400901653, + "grad_norm": 0.5316180867730131, + "learning_rate": 1.167100241917124e-07, + "loss": 0.1542, + "step": 8425 + }, + { + "epoch": 2.8138253464685254, + "grad_norm": 0.5410694261321864, + "learning_rate": 1.1629296449815197e-07, + "loss": 0.1539, + "step": 8426 + }, + { + "epoch": 2.814159292035398, + "grad_norm": 0.5255062069267001, + "learning_rate": 1.1587664254604336e-07, + "loss": 0.1558, + "step": 8427 + }, + { + "epoch": 2.814493237602271, + "grad_norm": 0.5564039864164055, + "learning_rate": 1.1546105839827626e-07, + "loss": 0.1642, + "step": 8428 + }, + { + "epoch": 2.814827183169143, + "grad_norm": 0.5451419332138157, + "learning_rate": 1.150462121176299e-07, + "loss": 0.1605, + "step": 8429 + }, + { + "epoch": 2.815161128736016, + "grad_norm": 0.5608596813939348, + "learning_rate": 1.1463210376677192e-07, + "loss": 0.1678, + "step": 8430 + }, + { + "epoch": 2.8154950743028886, + "grad_norm": 0.5426405423185976, + "learning_rate": 1.1421873340825729e-07, + "loss": 0.1549, + "step": 8431 + }, + { + "epoch": 2.8158290198697613, + "grad_norm": 0.5354590434247339, + "learning_rate": 1.1380610110453217e-07, + "loss": 0.1545, + "step": 8432 + }, + { + "epoch": 2.816162965436634, + "grad_norm": 0.5214195383604076, + "learning_rate": 1.133942069179278e-07, + "loss": 0.1524, + "step": 8433 + }, + { + "epoch": 2.8164969110035063, + "grad_norm": 0.5846181741316001, + "learning_rate": 1.1298305091066664e-07, + "loss": 0.1571, + "step": 8434 + }, + { + "epoch": 2.816830856570379, + "grad_norm": 0.5684128730306873, + "learning_rate": 1.1257263314485844e-07, + "loss": 0.1588, + "step": 8435 + }, + { + "epoch": 2.8171648021372517, + "grad_norm": 0.5148921793629571, + "learning_rate": 1.1216295368250196e-07, + "loss": 0.1561, + "step": 8436 + }, + { + "epoch": 2.817498747704124, + "grad_norm": 0.5536026332038109, + "learning_rate": 1.1175401258548324e-07, + "loss": 0.1539, + "step": 8437 + }, + { + "epoch": 2.8178326932709967, + "grad_norm": 0.5272870234520881, + "learning_rate": 1.1134580991557842e-07, + "loss": 0.1623, + "step": 8438 + }, + { + "epoch": 2.8181666388378694, + "grad_norm": 0.49663883935703573, + "learning_rate": 1.1093834573445094e-07, + "loss": 0.1494, + "step": 8439 + }, + { + "epoch": 2.818500584404742, + "grad_norm": 0.5741127853098581, + "learning_rate": 1.1053162010365326e-07, + "loss": 0.1507, + "step": 8440 + }, + { + "epoch": 2.818834529971615, + "grad_norm": 0.5546891679590489, + "learning_rate": 1.1012563308462565e-07, + "loss": 0.1574, + "step": 8441 + }, + { + "epoch": 2.819168475538487, + "grad_norm": 0.5037622547843496, + "learning_rate": 1.0972038473869795e-07, + "loss": 0.151, + "step": 8442 + }, + { + "epoch": 2.81950242110536, + "grad_norm": 0.5486643889881713, + "learning_rate": 1.093158751270873e-07, + "loss": 0.1506, + "step": 8443 + }, + { + "epoch": 2.8198363666722326, + "grad_norm": 0.5590759169843443, + "learning_rate": 1.0891210431089983e-07, + "loss": 0.1634, + "step": 8444 + }, + { + "epoch": 2.820170312239105, + "grad_norm": 0.5382380812540658, + "learning_rate": 1.0850907235112895e-07, + "loss": 0.1579, + "step": 8445 + }, + { + "epoch": 2.8205042578059776, + "grad_norm": 0.5253716527065034, + "learning_rate": 1.0810677930865876e-07, + "loss": 0.1521, + "step": 8446 + }, + { + "epoch": 2.8208382033728503, + "grad_norm": 0.48555707327772146, + "learning_rate": 1.0770522524425898e-07, + "loss": 0.1417, + "step": 8447 + }, + { + "epoch": 2.8211721489397226, + "grad_norm": 0.5528063179335293, + "learning_rate": 1.0730441021859106e-07, + "loss": 0.1491, + "step": 8448 + }, + { + "epoch": 2.8215060945065953, + "grad_norm": 0.5496044182417962, + "learning_rate": 1.0690433429220049e-07, + "loss": 0.1506, + "step": 8449 + }, + { + "epoch": 2.821840040073468, + "grad_norm": 0.5346500000652207, + "learning_rate": 1.0650499752552557e-07, + "loss": 0.1539, + "step": 8450 + }, + { + "epoch": 2.8221739856403407, + "grad_norm": 0.5479349408608167, + "learning_rate": 1.0610639997888917e-07, + "loss": 0.1557, + "step": 8451 + }, + { + "epoch": 2.8225079312072134, + "grad_norm": 0.46744982444128813, + "learning_rate": 1.0570854171250478e-07, + "loss": 0.1374, + "step": 8452 + }, + { + "epoch": 2.8228418767740857, + "grad_norm": 0.5441636040790427, + "learning_rate": 1.0531142278647378e-07, + "loss": 0.153, + "step": 8453 + }, + { + "epoch": 2.8231758223409584, + "grad_norm": 0.5717208544928984, + "learning_rate": 1.0491504326078483e-07, + "loss": 0.1606, + "step": 8454 + }, + { + "epoch": 2.823509767907831, + "grad_norm": 0.5434976551693014, + "learning_rate": 1.0451940319531728e-07, + "loss": 0.1495, + "step": 8455 + }, + { + "epoch": 2.8238437134747034, + "grad_norm": 0.5563013385219229, + "learning_rate": 1.0412450264983609e-07, + "loss": 0.1582, + "step": 8456 + }, + { + "epoch": 2.824177659041576, + "grad_norm": 0.5133006701112091, + "learning_rate": 1.0373034168399521e-07, + "loss": 0.1468, + "step": 8457 + }, + { + "epoch": 2.824511604608449, + "grad_norm": 0.5368851278335857, + "learning_rate": 1.0333692035733867e-07, + "loss": 0.1577, + "step": 8458 + }, + { + "epoch": 2.8248455501753216, + "grad_norm": 0.5658419408433192, + "learning_rate": 1.0294423872929615e-07, + "loss": 0.1458, + "step": 8459 + }, + { + "epoch": 2.8251794957421943, + "grad_norm": 0.565524948708705, + "learning_rate": 1.0255229685918744e-07, + "loss": 0.1694, + "step": 8460 + }, + { + "epoch": 2.8255134413090666, + "grad_norm": 0.5525057994914118, + "learning_rate": 1.0216109480622017e-07, + "loss": 0.1674, + "step": 8461 + }, + { + "epoch": 2.8258473868759393, + "grad_norm": 0.7857522622361894, + "learning_rate": 1.0177063262948927e-07, + "loss": 0.1641, + "step": 8462 + }, + { + "epoch": 2.826181332442812, + "grad_norm": 0.5695931540924961, + "learning_rate": 1.0138091038797982e-07, + "loss": 0.1592, + "step": 8463 + }, + { + "epoch": 2.8265152780096843, + "grad_norm": 0.5376746632499131, + "learning_rate": 1.0099192814056247e-07, + "loss": 0.1663, + "step": 8464 + }, + { + "epoch": 2.826849223576557, + "grad_norm": 0.4698801948441204, + "learning_rate": 1.0060368594599856e-07, + "loss": 0.136, + "step": 8465 + }, + { + "epoch": 2.8271831691434297, + "grad_norm": 0.544981428758501, + "learning_rate": 1.002161838629362e-07, + "loss": 0.1525, + "step": 8466 + }, + { + "epoch": 2.827517114710302, + "grad_norm": 0.5350947441046976, + "learning_rate": 9.982942194991297e-08, + "loss": 0.1503, + "step": 8467 + }, + { + "epoch": 2.8278510602771747, + "grad_norm": 0.5659243272345094, + "learning_rate": 9.94434002653527e-08, + "loss": 0.1621, + "step": 8468 + }, + { + "epoch": 2.8281850058440474, + "grad_norm": 0.4893884162320132, + "learning_rate": 9.905811886756933e-08, + "loss": 0.1444, + "step": 8469 + }, + { + "epoch": 2.82851895141092, + "grad_norm": 0.5115181229873957, + "learning_rate": 9.867357781476294e-08, + "loss": 0.1536, + "step": 8470 + }, + { + "epoch": 2.828852896977793, + "grad_norm": 0.5310645008445086, + "learning_rate": 9.828977716502486e-08, + "loss": 0.1527, + "step": 8471 + }, + { + "epoch": 2.829186842544665, + "grad_norm": 0.5390370539150535, + "learning_rate": 9.790671697633092e-08, + "loss": 0.1607, + "step": 8472 + }, + { + "epoch": 2.829520788111538, + "grad_norm": 0.5542118402025401, + "learning_rate": 9.752439730654872e-08, + "loss": 0.1641, + "step": 8473 + }, + { + "epoch": 2.8298547336784106, + "grad_norm": 0.5423612496980644, + "learning_rate": 9.714281821343041e-08, + "loss": 0.1626, + "step": 8474 + }, + { + "epoch": 2.830188679245283, + "grad_norm": 0.5342496957180441, + "learning_rate": 9.676197975461876e-08, + "loss": 0.1582, + "step": 8475 + }, + { + "epoch": 2.8305226248121556, + "grad_norm": 0.49551719740241496, + "learning_rate": 9.638188198764387e-08, + "loss": 0.1514, + "step": 8476 + }, + { + "epoch": 2.8308565703790283, + "grad_norm": 0.514845882985583, + "learning_rate": 9.600252496992369e-08, + "loss": 0.1455, + "step": 8477 + }, + { + "epoch": 2.8311905159459005, + "grad_norm": 0.5246479501288792, + "learning_rate": 9.562390875876515e-08, + "loss": 0.1571, + "step": 8478 + }, + { + "epoch": 2.8315244615127733, + "grad_norm": 0.5151252059582621, + "learning_rate": 9.524603341136251e-08, + "loss": 0.15, + "step": 8479 + }, + { + "epoch": 2.831858407079646, + "grad_norm": 0.5659310966269068, + "learning_rate": 9.486889898479734e-08, + "loss": 0.156, + "step": 8480 + }, + { + "epoch": 2.8321923526465187, + "grad_norm": 0.5146682132991759, + "learning_rate": 9.449250553604184e-08, + "loss": 0.1505, + "step": 8481 + }, + { + "epoch": 2.8325262982133914, + "grad_norm": 0.4889795191225081, + "learning_rate": 9.41168531219533e-08, + "loss": 0.1508, + "step": 8482 + }, + { + "epoch": 2.8328602437802637, + "grad_norm": 0.56495527958627, + "learning_rate": 9.374194179927909e-08, + "loss": 0.1577, + "step": 8483 + }, + { + "epoch": 2.8331941893471364, + "grad_norm": 0.5605532903002794, + "learning_rate": 9.336777162465449e-08, + "loss": 0.1633, + "step": 8484 + }, + { + "epoch": 2.833528134914009, + "grad_norm": 0.4893798584627006, + "learning_rate": 9.299434265460095e-08, + "loss": 0.1394, + "step": 8485 + }, + { + "epoch": 2.8338620804808814, + "grad_norm": 0.5367020601261826, + "learning_rate": 9.262165494553055e-08, + "loss": 0.1521, + "step": 8486 + }, + { + "epoch": 2.834196026047754, + "grad_norm": 0.5127849134301988, + "learning_rate": 9.22497085537416e-08, + "loss": 0.1504, + "step": 8487 + }, + { + "epoch": 2.834529971614627, + "grad_norm": 0.5615166401386916, + "learning_rate": 9.187850353542082e-08, + "loss": 0.1581, + "step": 8488 + }, + { + "epoch": 2.8348639171814995, + "grad_norm": 0.5169398545404523, + "learning_rate": 9.150803994664337e-08, + "loss": 0.1505, + "step": 8489 + }, + { + "epoch": 2.8351978627483723, + "grad_norm": 0.5682736343819038, + "learning_rate": 9.113831784337279e-08, + "loss": 0.1531, + "step": 8490 + }, + { + "epoch": 2.8355318083152445, + "grad_norm": 0.5340266016385549, + "learning_rate": 9.076933728145832e-08, + "loss": 0.151, + "step": 8491 + }, + { + "epoch": 2.8358657538821173, + "grad_norm": 0.5585134485577232, + "learning_rate": 9.040109831664035e-08, + "loss": 0.1619, + "step": 8492 + }, + { + "epoch": 2.83619969944899, + "grad_norm": 0.5725765003377334, + "learning_rate": 9.003360100454495e-08, + "loss": 0.1592, + "step": 8493 + }, + { + "epoch": 2.8365336450158622, + "grad_norm": 0.534273889129343, + "learning_rate": 8.966684540068659e-08, + "loss": 0.1553, + "step": 8494 + }, + { + "epoch": 2.836867590582735, + "grad_norm": 0.5171157368775522, + "learning_rate": 8.930083156046931e-08, + "loss": 0.1517, + "step": 8495 + }, + { + "epoch": 2.8372015361496077, + "grad_norm": 0.5634182648212217, + "learning_rate": 8.893555953918276e-08, + "loss": 0.1567, + "step": 8496 + }, + { + "epoch": 2.83753548171648, + "grad_norm": 0.5264929981389637, + "learning_rate": 8.857102939200557e-08, + "loss": 0.1459, + "step": 8497 + }, + { + "epoch": 2.8378694272833527, + "grad_norm": 0.5735554438040473, + "learning_rate": 8.820724117400536e-08, + "loss": 0.1652, + "step": 8498 + }, + { + "epoch": 2.8382033728502254, + "grad_norm": 0.5039614742487958, + "learning_rate": 8.784419494013541e-08, + "loss": 0.1444, + "step": 8499 + }, + { + "epoch": 2.838537318417098, + "grad_norm": 0.5278294997838044, + "learning_rate": 8.74818907452385e-08, + "loss": 0.1614, + "step": 8500 + }, + { + "epoch": 2.838871263983971, + "grad_norm": 0.510269423877444, + "learning_rate": 8.712032864404529e-08, + "loss": 0.1497, + "step": 8501 + }, + { + "epoch": 2.839205209550843, + "grad_norm": 0.5285759359254864, + "learning_rate": 8.675950869117323e-08, + "loss": 0.1573, + "step": 8502 + }, + { + "epoch": 2.839539155117716, + "grad_norm": 0.5220521008233078, + "learning_rate": 8.639943094112868e-08, + "loss": 0.1458, + "step": 8503 + }, + { + "epoch": 2.8398731006845885, + "grad_norm": 0.5595841029538259, + "learning_rate": 8.604009544830705e-08, + "loss": 0.1601, + "step": 8504 + }, + { + "epoch": 2.840207046251461, + "grad_norm": 0.5506637831429321, + "learning_rate": 8.568150226698823e-08, + "loss": 0.1599, + "step": 8505 + }, + { + "epoch": 2.8405409918183335, + "grad_norm": 0.5610602819838691, + "learning_rate": 8.532365145134226e-08, + "loss": 0.1546, + "step": 8506 + }, + { + "epoch": 2.8408749373852062, + "grad_norm": 0.5818644312959905, + "learning_rate": 8.496654305542807e-08, + "loss": 0.1728, + "step": 8507 + }, + { + "epoch": 2.841208882952079, + "grad_norm": 0.5776511122322144, + "learning_rate": 8.461017713318976e-08, + "loss": 0.1654, + "step": 8508 + }, + { + "epoch": 2.8415428285189517, + "grad_norm": 0.5189470178377121, + "learning_rate": 8.425455373846147e-08, + "loss": 0.1497, + "step": 8509 + }, + { + "epoch": 2.841876774085824, + "grad_norm": 0.4788685040909344, + "learning_rate": 8.38996729249636e-08, + "loss": 0.1392, + "step": 8510 + }, + { + "epoch": 2.8422107196526967, + "grad_norm": 0.5386980557967677, + "learning_rate": 8.354553474630489e-08, + "loss": 0.1554, + "step": 8511 + }, + { + "epoch": 2.8425446652195694, + "grad_norm": 0.5621709868208091, + "learning_rate": 8.319213925598258e-08, + "loss": 0.1595, + "step": 8512 + }, + { + "epoch": 2.8428786107864417, + "grad_norm": 0.5321301128110231, + "learning_rate": 8.283948650738172e-08, + "loss": 0.147, + "step": 8513 + }, + { + "epoch": 2.8432125563533144, + "grad_norm": 0.521599077794686, + "learning_rate": 8.248757655377415e-08, + "loss": 0.1559, + "step": 8514 + }, + { + "epoch": 2.843546501920187, + "grad_norm": 0.5471021279762034, + "learning_rate": 8.213640944831957e-08, + "loss": 0.1635, + "step": 8515 + }, + { + "epoch": 2.8438804474870594, + "grad_norm": 0.5022501808867641, + "learning_rate": 8.178598524406667e-08, + "loss": 0.1464, + "step": 8516 + }, + { + "epoch": 2.844214393053932, + "grad_norm": 0.5173838285389819, + "learning_rate": 8.143630399395031e-08, + "loss": 0.1457, + "step": 8517 + }, + { + "epoch": 2.844548338620805, + "grad_norm": 0.5329604081610628, + "learning_rate": 8.108736575079434e-08, + "loss": 0.1615, + "step": 8518 + }, + { + "epoch": 2.8448822841876775, + "grad_norm": 0.49406524837234916, + "learning_rate": 8.073917056731106e-08, + "loss": 0.1471, + "step": 8519 + }, + { + "epoch": 2.8452162297545502, + "grad_norm": 0.503715390690022, + "learning_rate": 8.039171849609728e-08, + "loss": 0.1481, + "step": 8520 + }, + { + "epoch": 2.8455501753214225, + "grad_norm": 0.5202325840935615, + "learning_rate": 8.004500958964211e-08, + "loss": 0.1472, + "step": 8521 + }, + { + "epoch": 2.8458841208882952, + "grad_norm": 0.5296350657811596, + "learning_rate": 7.969904390031812e-08, + "loss": 0.1518, + "step": 8522 + }, + { + "epoch": 2.846218066455168, + "grad_norm": 0.5206405247107793, + "learning_rate": 7.935382148038794e-08, + "loss": 0.1583, + "step": 8523 + }, + { + "epoch": 2.8465520120220402, + "grad_norm": 0.5080001963788301, + "learning_rate": 7.900934238200265e-08, + "loss": 0.1458, + "step": 8524 + }, + { + "epoch": 2.846885957588913, + "grad_norm": 0.5533271921936429, + "learning_rate": 7.866560665719836e-08, + "loss": 0.1606, + "step": 8525 + }, + { + "epoch": 2.8472199031557857, + "grad_norm": 0.5516994173535836, + "learning_rate": 7.832261435790078e-08, + "loss": 0.1581, + "step": 8526 + }, + { + "epoch": 2.847553848722658, + "grad_norm": 0.603291791816102, + "learning_rate": 7.798036553592403e-08, + "loss": 0.1708, + "step": 8527 + }, + { + "epoch": 2.8478877942895306, + "grad_norm": 0.5226165249381783, + "learning_rate": 7.763886024296729e-08, + "loss": 0.1494, + "step": 8528 + }, + { + "epoch": 2.8482217398564034, + "grad_norm": 0.4984085875281068, + "learning_rate": 7.729809853061987e-08, + "loss": 0.1477, + "step": 8529 + }, + { + "epoch": 2.848555685423276, + "grad_norm": 0.5546006723134326, + "learning_rate": 7.69580804503578e-08, + "loss": 0.16, + "step": 8530 + }, + { + "epoch": 2.848889630990149, + "grad_norm": 0.4951006251440752, + "learning_rate": 7.661880605354444e-08, + "loss": 0.1459, + "step": 8531 + }, + { + "epoch": 2.849223576557021, + "grad_norm": 0.5475486717210286, + "learning_rate": 7.628027539143156e-08, + "loss": 0.155, + "step": 8532 + }, + { + "epoch": 2.849557522123894, + "grad_norm": 0.4853112230938831, + "learning_rate": 7.594248851515717e-08, + "loss": 0.1446, + "step": 8533 + }, + { + "epoch": 2.8498914676907665, + "grad_norm": 0.5277882995951618, + "learning_rate": 7.560544547574988e-08, + "loss": 0.157, + "step": 8534 + }, + { + "epoch": 2.850225413257639, + "grad_norm": 0.5129719043043828, + "learning_rate": 7.526914632412175e-08, + "loss": 0.1493, + "step": 8535 + }, + { + "epoch": 2.8505593588245115, + "grad_norm": 0.5254590538526176, + "learning_rate": 7.493359111107712e-08, + "loss": 0.1562, + "step": 8536 + }, + { + "epoch": 2.850893304391384, + "grad_norm": 0.5457808147998661, + "learning_rate": 7.459877988730325e-08, + "loss": 0.1568, + "step": 8537 + }, + { + "epoch": 2.851227249958257, + "grad_norm": 0.5225349529691248, + "learning_rate": 7.42647127033791e-08, + "loss": 0.1501, + "step": 8538 + }, + { + "epoch": 2.8515611955251297, + "grad_norm": 0.5583061635314059, + "learning_rate": 7.393138960976876e-08, + "loss": 0.1536, + "step": 8539 + }, + { + "epoch": 2.851895141092002, + "grad_norm": 0.5121547924256945, + "learning_rate": 7.359881065682473e-08, + "loss": 0.1438, + "step": 8540 + }, + { + "epoch": 2.8522290866588746, + "grad_norm": 0.49864090386106596, + "learning_rate": 7.32669758947857e-08, + "loss": 0.144, + "step": 8541 + }, + { + "epoch": 2.8525630322257474, + "grad_norm": 0.4893222689278706, + "learning_rate": 7.29358853737816e-08, + "loss": 0.146, + "step": 8542 + }, + { + "epoch": 2.8528969777926196, + "grad_norm": 0.5348409656064933, + "learning_rate": 7.260553914382573e-08, + "loss": 0.149, + "step": 8543 + }, + { + "epoch": 2.8532309233594924, + "grad_norm": 0.5435223373432201, + "learning_rate": 7.227593725482207e-08, + "loss": 0.1587, + "step": 8544 + }, + { + "epoch": 2.853564868926365, + "grad_norm": 0.5217911226961017, + "learning_rate": 7.194707975655912e-08, + "loss": 0.1444, + "step": 8545 + }, + { + "epoch": 2.8538988144932373, + "grad_norm": 0.6065711866534749, + "learning_rate": 7.161896669871605e-08, + "loss": 0.1735, + "step": 8546 + }, + { + "epoch": 2.85423276006011, + "grad_norm": 0.5330691052638803, + "learning_rate": 7.129159813085817e-08, + "loss": 0.158, + "step": 8547 + }, + { + "epoch": 2.854566705626983, + "grad_norm": 0.5350161629397456, + "learning_rate": 7.096497410243819e-08, + "loss": 0.1528, + "step": 8548 + }, + { + "epoch": 2.8549006511938555, + "grad_norm": 0.5560545581651952, + "learning_rate": 7.063909466279605e-08, + "loss": 0.1656, + "step": 8549 + }, + { + "epoch": 2.855234596760728, + "grad_norm": 0.585578949405154, + "learning_rate": 7.031395986116019e-08, + "loss": 0.1588, + "step": 8550 + }, + { + "epoch": 2.8555685423276005, + "grad_norm": 0.5165515533972673, + "learning_rate": 6.998956974664573e-08, + "loss": 0.1473, + "step": 8551 + }, + { + "epoch": 2.855902487894473, + "grad_norm": 0.5970286051761957, + "learning_rate": 6.966592436825514e-08, + "loss": 0.1683, + "step": 8552 + }, + { + "epoch": 2.856236433461346, + "grad_norm": 0.5173373849701015, + "learning_rate": 6.934302377488045e-08, + "loss": 0.1488, + "step": 8553 + }, + { + "epoch": 2.856570379028218, + "grad_norm": 0.5131753943094373, + "learning_rate": 6.902086801529817e-08, + "loss": 0.1519, + "step": 8554 + }, + { + "epoch": 2.856904324595091, + "grad_norm": 0.5156084894747482, + "learning_rate": 6.869945713817438e-08, + "loss": 0.1485, + "step": 8555 + }, + { + "epoch": 2.8572382701619636, + "grad_norm": 0.4849489841706419, + "learning_rate": 6.837879119206192e-08, + "loss": 0.1476, + "step": 8556 + }, + { + "epoch": 2.8575722157288364, + "grad_norm": 0.6184373363815887, + "learning_rate": 6.805887022540093e-08, + "loss": 0.1686, + "step": 8557 + }, + { + "epoch": 2.857906161295709, + "grad_norm": 0.5476203063313209, + "learning_rate": 6.773969428651883e-08, + "loss": 0.165, + "step": 8558 + }, + { + "epoch": 2.8582401068625813, + "grad_norm": 0.5373994473998241, + "learning_rate": 6.742126342363153e-08, + "loss": 0.1498, + "step": 8559 + }, + { + "epoch": 2.858574052429454, + "grad_norm": 0.5178688034759278, + "learning_rate": 6.710357768484165e-08, + "loss": 0.1571, + "step": 8560 + }, + { + "epoch": 2.8589079979963268, + "grad_norm": 0.5039114208119598, + "learning_rate": 6.67866371181397e-08, + "loss": 0.1535, + "step": 8561 + }, + { + "epoch": 2.859241943563199, + "grad_norm": 0.5332718448517993, + "learning_rate": 6.647044177140293e-08, + "loss": 0.1601, + "step": 8562 + }, + { + "epoch": 2.8595758891300718, + "grad_norm": 0.5102172652244273, + "learning_rate": 6.615499169239647e-08, + "loss": 0.1399, + "step": 8563 + }, + { + "epoch": 2.8599098346969445, + "grad_norm": 0.531015741793665, + "learning_rate": 6.584028692877164e-08, + "loss": 0.1588, + "step": 8564 + }, + { + "epoch": 2.8602437802638168, + "grad_norm": 0.523805491269051, + "learning_rate": 6.552632752807042e-08, + "loss": 0.1589, + "step": 8565 + }, + { + "epoch": 2.8605777258306895, + "grad_norm": 0.5326354340246765, + "learning_rate": 6.52131135377182e-08, + "loss": 0.1538, + "step": 8566 + }, + { + "epoch": 2.860911671397562, + "grad_norm": 0.511949106766765, + "learning_rate": 6.490064500503102e-08, + "loss": 0.1453, + "step": 8567 + }, + { + "epoch": 2.861245616964435, + "grad_norm": 0.5271829727987514, + "learning_rate": 6.458892197721e-08, + "loss": 0.1585, + "step": 8568 + }, + { + "epoch": 2.8615795625313076, + "grad_norm": 0.5430974193562058, + "learning_rate": 6.427794450134529e-08, + "loss": 0.1573, + "step": 8569 + }, + { + "epoch": 2.86191350809818, + "grad_norm": 0.5402436743016346, + "learning_rate": 6.396771262441259e-08, + "loss": 0.1587, + "step": 8570 + }, + { + "epoch": 2.8622474536650526, + "grad_norm": 0.5228404957057184, + "learning_rate": 6.365822639327724e-08, + "loss": 0.1527, + "step": 8571 + }, + { + "epoch": 2.8625813992319253, + "grad_norm": 0.5431852960693747, + "learning_rate": 6.334948585469014e-08, + "loss": 0.1516, + "step": 8572 + }, + { + "epoch": 2.8629153447987976, + "grad_norm": 0.5535152802323873, + "learning_rate": 6.304149105529067e-08, + "loss": 0.1597, + "step": 8573 + }, + { + "epoch": 2.8632492903656703, + "grad_norm": 0.503798876425636, + "learning_rate": 6.273424204160438e-08, + "loss": 0.1581, + "step": 8574 + }, + { + "epoch": 2.863583235932543, + "grad_norm": 0.4954582075562573, + "learning_rate": 6.242773886004583e-08, + "loss": 0.148, + "step": 8575 + }, + { + "epoch": 2.8639171814994153, + "grad_norm": 0.5227935230801231, + "learning_rate": 6.212198155691518e-08, + "loss": 0.1569, + "step": 8576 + }, + { + "epoch": 2.864251127066288, + "grad_norm": 0.5072845345431894, + "learning_rate": 6.181697017840049e-08, + "loss": 0.1568, + "step": 8577 + }, + { + "epoch": 2.8645850726331608, + "grad_norm": 0.5680395290888719, + "learning_rate": 6.151270477057825e-08, + "loss": 0.1665, + "step": 8578 + }, + { + "epoch": 2.8649190182000335, + "grad_norm": 0.5723718892695402, + "learning_rate": 6.120918537941001e-08, + "loss": 0.1689, + "step": 8579 + }, + { + "epoch": 2.865252963766906, + "grad_norm": 0.5543284280807889, + "learning_rate": 6.090641205074743e-08, + "loss": 0.164, + "step": 8580 + }, + { + "epoch": 2.8655869093337785, + "grad_norm": 0.5035318018400274, + "learning_rate": 6.060438483032671e-08, + "loss": 0.142, + "step": 8581 + }, + { + "epoch": 2.865920854900651, + "grad_norm": 0.5355651436216142, + "learning_rate": 6.030310376377302e-08, + "loss": 0.1612, + "step": 8582 + }, + { + "epoch": 2.866254800467524, + "grad_norm": 0.48977033513131546, + "learning_rate": 6.000256889659883e-08, + "loss": 0.144, + "step": 8583 + }, + { + "epoch": 2.866588746034396, + "grad_norm": 0.5436549484026989, + "learning_rate": 5.97027802742034e-08, + "loss": 0.168, + "step": 8584 + }, + { + "epoch": 2.866922691601269, + "grad_norm": 0.572267212305895, + "learning_rate": 5.940373794187326e-08, + "loss": 0.1636, + "step": 8585 + }, + { + "epoch": 2.8672566371681416, + "grad_norm": 0.5209482329083956, + "learning_rate": 5.910544194478174e-08, + "loss": 0.1509, + "step": 8586 + }, + { + "epoch": 2.8675905827350143, + "grad_norm": 0.5484505549507601, + "learning_rate": 5.880789232799e-08, + "loss": 0.1565, + "step": 8587 + }, + { + "epoch": 2.867924528301887, + "grad_norm": 0.5478787273088694, + "learning_rate": 5.851108913644765e-08, + "loss": 0.1641, + "step": 8588 + }, + { + "epoch": 2.8682584738687593, + "grad_norm": 0.5522406436362908, + "learning_rate": 5.821503241498882e-08, + "loss": 0.1636, + "step": 8589 + }, + { + "epoch": 2.868592419435632, + "grad_norm": 0.511907461284583, + "learning_rate": 5.791972220833719e-08, + "loss": 0.1552, + "step": 8590 + }, + { + "epoch": 2.8689263650025048, + "grad_norm": 0.5152899425702656, + "learning_rate": 5.762515856110262e-08, + "loss": 0.1517, + "step": 8591 + }, + { + "epoch": 2.869260310569377, + "grad_norm": 0.5258169726148847, + "learning_rate": 5.7331341517782855e-08, + "loss": 0.1528, + "step": 8592 + }, + { + "epoch": 2.8695942561362497, + "grad_norm": 0.4713808899737005, + "learning_rate": 5.703827112276128e-08, + "loss": 0.1416, + "step": 8593 + }, + { + "epoch": 2.8699282017031225, + "grad_norm": 0.5273185402336777, + "learning_rate": 5.674594742031081e-08, + "loss": 0.1601, + "step": 8594 + }, + { + "epoch": 2.8702621472699947, + "grad_norm": 0.5220436578594836, + "learning_rate": 5.6454370454589456e-08, + "loss": 0.1562, + "step": 8595 + }, + { + "epoch": 2.8705960928368675, + "grad_norm": 0.5221063380938278, + "learning_rate": 5.6163540269644215e-08, + "loss": 0.1518, + "step": 8596 + }, + { + "epoch": 2.87093003840374, + "grad_norm": 0.4974464271881414, + "learning_rate": 5.5873456909407706e-08, + "loss": 0.1505, + "step": 8597 + }, + { + "epoch": 2.871263983970613, + "grad_norm": 0.5046804081978551, + "learning_rate": 5.5584120417701005e-08, + "loss": 0.1511, + "step": 8598 + }, + { + "epoch": 2.8715979295374856, + "grad_norm": 0.5053148608639023, + "learning_rate": 5.529553083823136e-08, + "loss": 0.1516, + "step": 8599 + }, + { + "epoch": 2.871931875104358, + "grad_norm": 0.5777837178110677, + "learning_rate": 5.50076882145939e-08, + "loss": 0.1639, + "step": 8600 + }, + { + "epoch": 2.8722658206712306, + "grad_norm": 0.5620172653957106, + "learning_rate": 5.472059259027051e-08, + "loss": 0.1632, + "step": 8601 + }, + { + "epoch": 2.8725997662381033, + "grad_norm": 0.5347887245031796, + "learning_rate": 5.44342440086304e-08, + "loss": 0.1568, + "step": 8602 + }, + { + "epoch": 2.8729337118049756, + "grad_norm": 0.5236727280193502, + "learning_rate": 5.414864251293006e-08, + "loss": 0.1484, + "step": 8603 + }, + { + "epoch": 2.8732676573718483, + "grad_norm": 0.5322026712399329, + "learning_rate": 5.386378814631277e-08, + "loss": 0.1536, + "step": 8604 + }, + { + "epoch": 2.873601602938721, + "grad_norm": 0.534740394002748, + "learning_rate": 5.3579680951808545e-08, + "loss": 0.1604, + "step": 8605 + }, + { + "epoch": 2.8739355485055937, + "grad_norm": 0.5853877385693443, + "learning_rate": 5.329632097233639e-08, + "loss": 0.1597, + "step": 8606 + }, + { + "epoch": 2.8742694940724665, + "grad_norm": 0.5011755113078209, + "learning_rate": 5.3013708250700405e-08, + "loss": 0.1491, + "step": 8607 + }, + { + "epoch": 2.8746034396393387, + "grad_norm": 0.5460105318096763, + "learning_rate": 5.2731842829591984e-08, + "loss": 0.1603, + "step": 8608 + }, + { + "epoch": 2.8749373852062114, + "grad_norm": 0.5535251387041829, + "learning_rate": 5.2450724751592076e-08, + "loss": 0.1646, + "step": 8609 + }, + { + "epoch": 2.875271330773084, + "grad_norm": 0.5412106966495831, + "learning_rate": 5.217035405916449e-08, + "loss": 0.1524, + "step": 8610 + }, + { + "epoch": 2.8756052763399564, + "grad_norm": 0.48812577783849903, + "learning_rate": 5.1890730794664227e-08, + "loss": 0.1438, + "step": 8611 + }, + { + "epoch": 2.875939221906829, + "grad_norm": 0.5630739912686819, + "learning_rate": 5.161185500033139e-08, + "loss": 0.1621, + "step": 8612 + }, + { + "epoch": 2.876273167473702, + "grad_norm": 0.5637930034961525, + "learning_rate": 5.1333726718293396e-08, + "loss": 0.1584, + "step": 8613 + }, + { + "epoch": 2.876607113040574, + "grad_norm": 0.5413995620609211, + "learning_rate": 5.105634599056386e-08, + "loss": 0.1517, + "step": 8614 + }, + { + "epoch": 2.876941058607447, + "grad_norm": 0.5284011452682914, + "learning_rate": 5.077971285904593e-08, + "loss": 0.1532, + "step": 8615 + }, + { + "epoch": 2.8772750041743196, + "grad_norm": 0.5373211068578061, + "learning_rate": 5.050382736552728e-08, + "loss": 0.1567, + "step": 8616 + }, + { + "epoch": 2.8776089497411923, + "grad_norm": 0.5164966070831943, + "learning_rate": 5.022868955168403e-08, + "loss": 0.1498, + "step": 8617 + }, + { + "epoch": 2.877942895308065, + "grad_norm": 0.5016749655968777, + "learning_rate": 4.995429945907848e-08, + "loss": 0.148, + "step": 8618 + }, + { + "epoch": 2.8782768408749373, + "grad_norm": 0.5350771758808319, + "learning_rate": 4.968065712916137e-08, + "loss": 0.1571, + "step": 8619 + }, + { + "epoch": 2.87861078644181, + "grad_norm": 0.5315077919152822, + "learning_rate": 4.940776260326907e-08, + "loss": 0.149, + "step": 8620 + }, + { + "epoch": 2.8789447320086827, + "grad_norm": 0.530178375587157, + "learning_rate": 4.913561592262528e-08, + "loss": 0.1555, + "step": 8621 + }, + { + "epoch": 2.879278677575555, + "grad_norm": 0.5217846163655511, + "learning_rate": 4.886421712834155e-08, + "loss": 0.1571, + "step": 8622 + }, + { + "epoch": 2.8796126231424277, + "grad_norm": 0.5115490799371355, + "learning_rate": 4.859356626141509e-08, + "loss": 0.1491, + "step": 8623 + }, + { + "epoch": 2.8799465687093004, + "grad_norm": 0.5516763395859796, + "learning_rate": 4.8323663362732084e-08, + "loss": 0.166, + "step": 8624 + }, + { + "epoch": 2.8802805142761727, + "grad_norm": 0.5599558340991891, + "learning_rate": 4.8054508473063253e-08, + "loss": 0.1685, + "step": 8625 + }, + { + "epoch": 2.8806144598430454, + "grad_norm": 0.4982081440561017, + "learning_rate": 4.778610163306885e-08, + "loss": 0.1551, + "step": 8626 + }, + { + "epoch": 2.880948405409918, + "grad_norm": 0.5606710703278645, + "learning_rate": 4.751844288329366e-08, + "loss": 0.1567, + "step": 8627 + }, + { + "epoch": 2.881282350976791, + "grad_norm": 0.5087763751751658, + "learning_rate": 4.72515322641709e-08, + "loss": 0.1464, + "step": 8628 + }, + { + "epoch": 2.8816162965436636, + "grad_norm": 0.552549417686956, + "learning_rate": 4.6985369816021644e-08, + "loss": 0.1565, + "step": 8629 + }, + { + "epoch": 2.881950242110536, + "grad_norm": 0.49592917632561173, + "learning_rate": 4.6719955579052064e-08, + "loss": 0.1448, + "step": 8630 + }, + { + "epoch": 2.8822841876774086, + "grad_norm": 0.5255353569803617, + "learning_rate": 4.6455289593355656e-08, + "loss": 0.1557, + "step": 8631 + }, + { + "epoch": 2.8826181332442813, + "grad_norm": 0.5534574457295879, + "learning_rate": 4.619137189891432e-08, + "loss": 0.163, + "step": 8632 + }, + { + "epoch": 2.8829520788111536, + "grad_norm": 0.5397072465178986, + "learning_rate": 4.5928202535595044e-08, + "loss": 0.157, + "step": 8633 + }, + { + "epoch": 2.8832860243780263, + "grad_norm": 0.549349296130443, + "learning_rate": 4.5665781543153266e-08, + "loss": 0.1645, + "step": 8634 + }, + { + "epoch": 2.883619969944899, + "grad_norm": 0.5420717576627341, + "learning_rate": 4.54041089612306e-08, + "loss": 0.1656, + "step": 8635 + }, + { + "epoch": 2.8839539155117717, + "grad_norm": 0.5363732368933584, + "learning_rate": 4.514318482935598e-08, + "loss": 0.1617, + "step": 8636 + }, + { + "epoch": 2.8842878610786444, + "grad_norm": 0.5350097682625145, + "learning_rate": 4.488300918694455e-08, + "loss": 0.1562, + "step": 8637 + }, + { + "epoch": 2.8846218066455167, + "grad_norm": 0.5246389948891459, + "learning_rate": 4.4623582073299864e-08, + "loss": 0.158, + "step": 8638 + }, + { + "epoch": 2.8849557522123894, + "grad_norm": 0.5436712211339741, + "learning_rate": 4.4364903527610026e-08, + "loss": 0.1545, + "step": 8639 + }, + { + "epoch": 2.885289697779262, + "grad_norm": 0.5009484631759727, + "learning_rate": 4.410697358895211e-08, + "loss": 0.1464, + "step": 8640 + }, + { + "epoch": 2.8856236433461344, + "grad_norm": 0.5290374848110081, + "learning_rate": 4.384979229628994e-08, + "loss": 0.1482, + "step": 8641 + }, + { + "epoch": 2.885957588913007, + "grad_norm": 0.5504796169207347, + "learning_rate": 4.359335968847356e-08, + "loss": 0.1583, + "step": 8642 + }, + { + "epoch": 2.88629153447988, + "grad_norm": 0.5589515588770203, + "learning_rate": 4.333767580423976e-08, + "loss": 0.1672, + "step": 8643 + }, + { + "epoch": 2.886625480046752, + "grad_norm": 0.5394714452979373, + "learning_rate": 4.3082740682213186e-08, + "loss": 0.1581, + "step": 8644 + }, + { + "epoch": 2.886959425613625, + "grad_norm": 0.5311277472998632, + "learning_rate": 4.2828554360904165e-08, + "loss": 0.16, + "step": 8645 + }, + { + "epoch": 2.8872933711804976, + "grad_norm": 0.5214855770881811, + "learning_rate": 4.25751168787103e-08, + "loss": 0.1584, + "step": 8646 + }, + { + "epoch": 2.8876273167473703, + "grad_norm": 0.5196864326489907, + "learning_rate": 4.2322428273917635e-08, + "loss": 0.1505, + "step": 8647 + }, + { + "epoch": 2.887961262314243, + "grad_norm": 0.5395236365751814, + "learning_rate": 4.2070488584696754e-08, + "loss": 0.1583, + "step": 8648 + }, + { + "epoch": 2.8882952078811153, + "grad_norm": 0.5281187130550312, + "learning_rate": 4.18192978491061e-08, + "loss": 0.157, + "step": 8649 + }, + { + "epoch": 2.888629153447988, + "grad_norm": 0.5335230980635554, + "learning_rate": 4.1568856105091424e-08, + "loss": 0.1605, + "step": 8650 + }, + { + "epoch": 2.8889630990148607, + "grad_norm": 0.5585441771953708, + "learning_rate": 4.1319163390484693e-08, + "loss": 0.1614, + "step": 8651 + }, + { + "epoch": 2.889297044581733, + "grad_norm": 0.5469020513513515, + "learning_rate": 4.107021974300407e-08, + "loss": 0.1586, + "step": 8652 + }, + { + "epoch": 2.8896309901486057, + "grad_norm": 0.507051026666806, + "learning_rate": 4.082202520025724e-08, + "loss": 0.151, + "step": 8653 + }, + { + "epoch": 2.8899649357154784, + "grad_norm": 0.4997794435735754, + "learning_rate": 4.0574579799735335e-08, + "loss": 0.1458, + "step": 8654 + }, + { + "epoch": 2.890298881282351, + "grad_norm": 0.530000754000752, + "learning_rate": 4.0327883578819006e-08, + "loss": 0.1558, + "step": 8655 + }, + { + "epoch": 2.890632826849224, + "grad_norm": 0.5068361461273592, + "learning_rate": 4.008193657477399e-08, + "loss": 0.1485, + "step": 8656 + }, + { + "epoch": 2.890966772416096, + "grad_norm": 0.5529734401920643, + "learning_rate": 3.9836738824753364e-08, + "loss": 0.1637, + "step": 8657 + }, + { + "epoch": 2.891300717982969, + "grad_norm": 0.5096699557010229, + "learning_rate": 3.959229036579748e-08, + "loss": 0.1483, + "step": 8658 + }, + { + "epoch": 2.8916346635498416, + "grad_norm": 0.4798650693812542, + "learning_rate": 3.9348591234832926e-08, + "loss": 0.1448, + "step": 8659 + }, + { + "epoch": 2.891968609116714, + "grad_norm": 0.4990085154143496, + "learning_rate": 3.9105641468673574e-08, + "loss": 0.1561, + "step": 8660 + }, + { + "epoch": 2.8923025546835865, + "grad_norm": 0.5224750928551145, + "learning_rate": 3.886344110402007e-08, + "loss": 0.1531, + "step": 8661 + }, + { + "epoch": 2.8926365002504593, + "grad_norm": 0.5338496892667479, + "learning_rate": 3.862199017745871e-08, + "loss": 0.1585, + "step": 8662 + }, + { + "epoch": 2.8929704458173315, + "grad_norm": 0.5134775255661097, + "learning_rate": 3.838128872546421e-08, + "loss": 0.1547, + "step": 8663 + }, + { + "epoch": 2.8933043913842043, + "grad_norm": 0.5331918577725364, + "learning_rate": 3.814133678439691e-08, + "loss": 0.1587, + "step": 8664 + }, + { + "epoch": 2.893638336951077, + "grad_norm": 0.517842146198162, + "learning_rate": 3.790213439050561e-08, + "loss": 0.1533, + "step": 8665 + }, + { + "epoch": 2.8939722825179497, + "grad_norm": 0.5189498284952392, + "learning_rate": 3.766368157992306e-08, + "loss": 0.1523, + "step": 8666 + }, + { + "epoch": 2.8943062280848224, + "grad_norm": 0.49860727440804087, + "learning_rate": 3.7425978388671014e-08, + "loss": 0.1468, + "step": 8667 + }, + { + "epoch": 2.8946401736516947, + "grad_norm": 0.5297330753233082, + "learning_rate": 3.718902485265741e-08, + "loss": 0.1577, + "step": 8668 + }, + { + "epoch": 2.8949741192185674, + "grad_norm": 0.5334255049213774, + "learning_rate": 3.6952821007676943e-08, + "loss": 0.1537, + "step": 8669 + }, + { + "epoch": 2.89530806478544, + "grad_norm": 0.5289833683098253, + "learning_rate": 3.671736688941108e-08, + "loss": 0.152, + "step": 8670 + }, + { + "epoch": 2.8956420103523124, + "grad_norm": 0.4915608352001022, + "learning_rate": 3.6482662533426914e-08, + "loss": 0.1436, + "step": 8671 + }, + { + "epoch": 2.895975955919185, + "grad_norm": 0.5486892456552841, + "learning_rate": 3.6248707975181096e-08, + "loss": 0.1543, + "step": 8672 + }, + { + "epoch": 2.896309901486058, + "grad_norm": 0.5503337548743263, + "learning_rate": 3.601550325001313e-08, + "loss": 0.1567, + "step": 8673 + }, + { + "epoch": 2.89664384705293, + "grad_norm": 0.5034194587207542, + "learning_rate": 3.578304839315316e-08, + "loss": 0.1445, + "step": 8674 + }, + { + "epoch": 2.896977792619803, + "grad_norm": 0.5146376386794662, + "learning_rate": 3.5551343439715336e-08, + "loss": 0.1441, + "step": 8675 + }, + { + "epoch": 2.8973117381866755, + "grad_norm": 0.5207461435783136, + "learning_rate": 3.5320388424701644e-08, + "loss": 0.1455, + "step": 8676 + }, + { + "epoch": 2.8976456837535483, + "grad_norm": 0.5822846711151547, + "learning_rate": 3.50901833830003e-08, + "loss": 0.1609, + "step": 8677 + }, + { + "epoch": 2.897979629320421, + "grad_norm": 0.4881700169042898, + "learning_rate": 3.4860728349386807e-08, + "loss": 0.151, + "step": 8678 + }, + { + "epoch": 2.8983135748872932, + "grad_norm": 0.5621883536149694, + "learning_rate": 3.4632023358522894e-08, + "loss": 0.1619, + "step": 8679 + }, + { + "epoch": 2.898647520454166, + "grad_norm": 0.5110749883128612, + "learning_rate": 3.440406844495758e-08, + "loss": 0.1549, + "step": 8680 + }, + { + "epoch": 2.8989814660210387, + "grad_norm": 0.4713601733022798, + "learning_rate": 3.4176863643125e-08, + "loss": 0.1366, + "step": 8681 + }, + { + "epoch": 2.899315411587911, + "grad_norm": 0.5227456137198871, + "learning_rate": 3.395040898734825e-08, + "loss": 0.1552, + "step": 8682 + }, + { + "epoch": 2.8996493571547837, + "grad_norm": 0.6136173309903541, + "learning_rate": 3.372470451183496e-08, + "loss": 0.1657, + "step": 8683 + }, + { + "epoch": 2.8999833027216564, + "grad_norm": 0.5297325697192652, + "learning_rate": 3.349975025068175e-08, + "loss": 0.1527, + "step": 8684 + }, + { + "epoch": 2.900317248288529, + "grad_norm": 0.6288686454375647, + "learning_rate": 3.327554623786977e-08, + "loss": 0.161, + "step": 8685 + }, + { + "epoch": 2.900651193855402, + "grad_norm": 0.5269797630905554, + "learning_rate": 3.305209250726804e-08, + "loss": 0.1501, + "step": 8686 + }, + { + "epoch": 2.900985139422274, + "grad_norm": 0.5554879926954489, + "learning_rate": 3.282938909263122e-08, + "loss": 0.1624, + "step": 8687 + }, + { + "epoch": 2.901319084989147, + "grad_norm": 0.5208615726777216, + "learning_rate": 3.2607436027601854e-08, + "loss": 0.1534, + "step": 8688 + }, + { + "epoch": 2.9016530305560195, + "grad_norm": 0.5420289935437654, + "learning_rate": 3.238623334570812e-08, + "loss": 0.1535, + "step": 8689 + }, + { + "epoch": 2.901986976122892, + "grad_norm": 0.538732737361838, + "learning_rate": 3.2165781080366054e-08, + "loss": 0.1569, + "step": 8690 + }, + { + "epoch": 2.9023209216897645, + "grad_norm": 0.5313485948505864, + "learning_rate": 3.194607926487681e-08, + "loss": 0.1586, + "step": 8691 + }, + { + "epoch": 2.9026548672566372, + "grad_norm": 0.5356772127350496, + "learning_rate": 3.1727127932429936e-08, + "loss": 0.1568, + "step": 8692 + }, + { + "epoch": 2.9029888128235095, + "grad_norm": 0.5169393673994584, + "learning_rate": 3.150892711609899e-08, + "loss": 0.1603, + "step": 8693 + }, + { + "epoch": 2.9033227583903822, + "grad_norm": 0.4894913655189434, + "learning_rate": 3.129147684884704e-08, + "loss": 0.1385, + "step": 8694 + }, + { + "epoch": 2.903656703957255, + "grad_norm": 0.53281078755593, + "learning_rate": 3.107477716352225e-08, + "loss": 0.1507, + "step": 8695 + }, + { + "epoch": 2.9039906495241277, + "grad_norm": 0.5359916182265713, + "learning_rate": 3.0858828092859564e-08, + "loss": 0.1701, + "step": 8696 + }, + { + "epoch": 2.9043245950910004, + "grad_norm": 0.5262814413620647, + "learning_rate": 3.0643629669480644e-08, + "loss": 0.1534, + "step": 8697 + }, + { + "epoch": 2.9046585406578727, + "grad_norm": 0.5497355759559763, + "learning_rate": 3.042918192589395e-08, + "loss": 0.1596, + "step": 8698 + }, + { + "epoch": 2.9049924862247454, + "grad_norm": 0.5412939441164044, + "learning_rate": 3.021548489449355e-08, + "loss": 0.1588, + "step": 8699 + }, + { + "epoch": 2.905326431791618, + "grad_norm": 0.49003677617408303, + "learning_rate": 3.000253860756197e-08, + "loss": 0.1559, + "step": 8700 + }, + { + "epoch": 2.9056603773584904, + "grad_norm": 0.5473785424325217, + "learning_rate": 2.979034309726625e-08, + "loss": 0.1618, + "step": 8701 + }, + { + "epoch": 2.905994322925363, + "grad_norm": 0.5097771881556972, + "learning_rate": 2.9578898395661858e-08, + "loss": 0.1488, + "step": 8702 + }, + { + "epoch": 2.906328268492236, + "grad_norm": 0.5149654857058888, + "learning_rate": 2.9368204534689916e-08, + "loss": 0.1468, + "step": 8703 + }, + { + "epoch": 2.9066622140591085, + "grad_norm": 0.5259625454544891, + "learning_rate": 2.915826154617718e-08, + "loss": 0.1545, + "step": 8704 + }, + { + "epoch": 2.9069961596259812, + "grad_norm": 0.48948710540364265, + "learning_rate": 2.8949069461839952e-08, + "loss": 0.1401, + "step": 8705 + }, + { + "epoch": 2.9073301051928535, + "grad_norm": 0.5531487170068914, + "learning_rate": 2.8740628313276842e-08, + "loss": 0.1606, + "step": 8706 + }, + { + "epoch": 2.9076640507597262, + "grad_norm": 0.5032223065794879, + "learning_rate": 2.853293813197766e-08, + "loss": 0.1505, + "step": 8707 + }, + { + "epoch": 2.907997996326599, + "grad_norm": 0.5328715714346113, + "learning_rate": 2.8325998949314536e-08, + "loss": 0.1536, + "step": 8708 + }, + { + "epoch": 2.908331941893471, + "grad_norm": 0.6225983310360557, + "learning_rate": 2.811981079654913e-08, + "loss": 0.1652, + "step": 8709 + }, + { + "epoch": 2.908665887460344, + "grad_norm": 0.5526958652803692, + "learning_rate": 2.7914373704827634e-08, + "loss": 0.1601, + "step": 8710 + }, + { + "epoch": 2.9089998330272167, + "grad_norm": 0.5632023342482809, + "learning_rate": 2.7709687705185227e-08, + "loss": 0.1673, + "step": 8711 + }, + { + "epoch": 2.909333778594089, + "grad_norm": 0.5155594811296534, + "learning_rate": 2.7505752828541065e-08, + "loss": 0.1518, + "step": 8712 + }, + { + "epoch": 2.9096677241609616, + "grad_norm": 0.5339483867535059, + "learning_rate": 2.730256910570217e-08, + "loss": 0.1641, + "step": 8713 + }, + { + "epoch": 2.9100016697278344, + "grad_norm": 0.5461665859120574, + "learning_rate": 2.7100136567361767e-08, + "loss": 0.1576, + "step": 8714 + }, + { + "epoch": 2.910335615294707, + "grad_norm": 0.5336837655705691, + "learning_rate": 2.689845524409984e-08, + "loss": 0.1493, + "step": 8715 + }, + { + "epoch": 2.91066956086158, + "grad_norm": 0.4644299519054088, + "learning_rate": 2.6697525166382575e-08, + "loss": 0.1365, + "step": 8716 + }, + { + "epoch": 2.911003506428452, + "grad_norm": 0.5243383754514328, + "learning_rate": 2.649734636456236e-08, + "loss": 0.1487, + "step": 8717 + }, + { + "epoch": 2.911337451995325, + "grad_norm": 0.566255311577455, + "learning_rate": 2.629791886888e-08, + "loss": 0.1657, + "step": 8718 + }, + { + "epoch": 2.9116713975621975, + "grad_norm": 0.5151497217026064, + "learning_rate": 2.6099242709459737e-08, + "loss": 0.1535, + "step": 8719 + }, + { + "epoch": 2.91200534312907, + "grad_norm": 0.5286692237715818, + "learning_rate": 2.5901317916314783e-08, + "loss": 0.1574, + "step": 8720 + }, + { + "epoch": 2.9123392886959425, + "grad_norm": 0.5286053495921429, + "learning_rate": 2.5704144519344e-08, + "loss": 0.1609, + "step": 8721 + }, + { + "epoch": 2.912673234262815, + "grad_norm": 0.5265287526525314, + "learning_rate": 2.5507722548332446e-08, + "loss": 0.151, + "step": 8722 + }, + { + "epoch": 2.9130071798296875, + "grad_norm": 0.5179557694097449, + "learning_rate": 2.5312052032952505e-08, + "loss": 0.1517, + "step": 8723 + }, + { + "epoch": 2.91334112539656, + "grad_norm": 0.5090029191612081, + "learning_rate": 2.5117133002762196e-08, + "loss": 0.1455, + "step": 8724 + }, + { + "epoch": 2.913675070963433, + "grad_norm": 0.5325582522074706, + "learning_rate": 2.492296548720574e-08, + "loss": 0.1581, + "step": 8725 + }, + { + "epoch": 2.9140090165303056, + "grad_norm": 0.5281192943466827, + "learning_rate": 2.4729549515615235e-08, + "loss": 0.1561, + "step": 8726 + }, + { + "epoch": 2.9143429620971784, + "grad_norm": 0.5038006994679268, + "learning_rate": 2.453688511720842e-08, + "loss": 0.1418, + "step": 8727 + }, + { + "epoch": 2.9146769076640506, + "grad_norm": 0.5549235611646274, + "learning_rate": 2.4344972321089234e-08, + "loss": 0.1599, + "step": 8728 + }, + { + "epoch": 2.9150108532309233, + "grad_norm": 0.5302992496522636, + "learning_rate": 2.415381115624782e-08, + "loss": 0.1574, + "step": 8729 + }, + { + "epoch": 2.915344798797796, + "grad_norm": 0.5110893643807364, + "learning_rate": 2.3963401651562747e-08, + "loss": 0.1544, + "step": 8730 + }, + { + "epoch": 2.9156787443646683, + "grad_norm": 0.48901569574272535, + "learning_rate": 2.3773743835796558e-08, + "loss": 0.148, + "step": 8731 + }, + { + "epoch": 2.916012689931541, + "grad_norm": 0.5057738968119625, + "learning_rate": 2.358483773759912e-08, + "loss": 0.148, + "step": 8732 + }, + { + "epoch": 2.9163466354984138, + "grad_norm": 0.5454906371257616, + "learning_rate": 2.33966833855076e-08, + "loss": 0.1649, + "step": 8733 + }, + { + "epoch": 2.9166805810652865, + "grad_norm": 0.5538324525215251, + "learning_rate": 2.320928080794482e-08, + "loss": 0.1628, + "step": 8734 + }, + { + "epoch": 2.917014526632159, + "grad_norm": 0.5046291060510665, + "learning_rate": 2.3022630033219807e-08, + "loss": 0.1389, + "step": 8735 + }, + { + "epoch": 2.9173484721990315, + "grad_norm": 0.5542770135094989, + "learning_rate": 2.2836731089528886e-08, + "loss": 0.1655, + "step": 8736 + }, + { + "epoch": 2.917682417765904, + "grad_norm": 0.5699978406392153, + "learning_rate": 2.2651584004953485e-08, + "loss": 0.1642, + "step": 8737 + }, + { + "epoch": 2.918016363332777, + "grad_norm": 0.5293384260135789, + "learning_rate": 2.2467188807462902e-08, + "loss": 0.1564, + "step": 8738 + }, + { + "epoch": 2.918350308899649, + "grad_norm": 0.551306712890341, + "learning_rate": 2.2283545524912075e-08, + "loss": 0.1614, + "step": 8739 + }, + { + "epoch": 2.918684254466522, + "grad_norm": 0.5530682179532525, + "learning_rate": 2.210065418504215e-08, + "loss": 0.1504, + "step": 8740 + }, + { + "epoch": 2.9190182000333946, + "grad_norm": 0.5133652618900586, + "learning_rate": 2.1918514815481572e-08, + "loss": 0.1513, + "step": 8741 + }, + { + "epoch": 2.919352145600267, + "grad_norm": 0.48003189542915814, + "learning_rate": 2.17371274437439e-08, + "loss": 0.139, + "step": 8742 + }, + { + "epoch": 2.9196860911671396, + "grad_norm": 0.5691839116317102, + "learning_rate": 2.155649209723054e-08, + "loss": 0.1588, + "step": 8743 + }, + { + "epoch": 2.9200200367340123, + "grad_norm": 0.5663322722634725, + "learning_rate": 2.137660880322856e-08, + "loss": 0.1551, + "step": 8744 + }, + { + "epoch": 2.920353982300885, + "grad_norm": 0.522117785518442, + "learning_rate": 2.1197477588910666e-08, + "loss": 0.1534, + "step": 8745 + }, + { + "epoch": 2.9206879278677578, + "grad_norm": 0.5412269304337229, + "learning_rate": 2.101909848133743e-08, + "loss": 0.1605, + "step": 8746 + }, + { + "epoch": 2.92102187343463, + "grad_norm": 0.5590537588726184, + "learning_rate": 2.0841471507455635e-08, + "loss": 0.1613, + "step": 8747 + }, + { + "epoch": 2.9213558190015028, + "grad_norm": 0.5697624900762938, + "learning_rate": 2.0664596694096596e-08, + "loss": 0.1606, + "step": 8748 + }, + { + "epoch": 2.9216897645683755, + "grad_norm": 0.5509931109216466, + "learning_rate": 2.0488474067980045e-08, + "loss": 0.1608, + "step": 8749 + }, + { + "epoch": 2.9220237101352478, + "grad_norm": 0.5328571613716071, + "learning_rate": 2.0313103655711373e-08, + "loss": 0.1549, + "step": 8750 + }, + { + "epoch": 2.9223576557021205, + "grad_norm": 0.513781721288196, + "learning_rate": 2.0138485483782723e-08, + "loss": 0.1522, + "step": 8751 + }, + { + "epoch": 2.922691601268993, + "grad_norm": 0.5293720468326389, + "learning_rate": 1.996461957857132e-08, + "loss": 0.1501, + "step": 8752 + }, + { + "epoch": 2.923025546835866, + "grad_norm": 0.5633355764295512, + "learning_rate": 1.9791505966342273e-08, + "loss": 0.1624, + "step": 8753 + }, + { + "epoch": 2.9233594924027386, + "grad_norm": 0.5170586045028958, + "learning_rate": 1.9619144673246325e-08, + "loss": 0.1503, + "step": 8754 + }, + { + "epoch": 2.923693437969611, + "grad_norm": 0.5323221698384542, + "learning_rate": 1.9447535725320987e-08, + "loss": 0.1494, + "step": 8755 + }, + { + "epoch": 2.9240273835364836, + "grad_norm": 0.5356685252034754, + "learning_rate": 1.9276679148488854e-08, + "loss": 0.1478, + "step": 8756 + }, + { + "epoch": 2.9243613291033563, + "grad_norm": 0.5517955598586819, + "learning_rate": 1.9106574968560943e-08, + "loss": 0.1537, + "step": 8757 + }, + { + "epoch": 2.9246952746702286, + "grad_norm": 0.5070202914078584, + "learning_rate": 1.8937223211232257e-08, + "loss": 0.1442, + "step": 8758 + }, + { + "epoch": 2.9250292202371013, + "grad_norm": 0.5321126462836772, + "learning_rate": 1.876862390208678e-08, + "loss": 0.1546, + "step": 8759 + }, + { + "epoch": 2.925363165803974, + "grad_norm": 0.5484493215810594, + "learning_rate": 1.8600777066593023e-08, + "loss": 0.1554, + "step": 8760 + }, + { + "epoch": 2.9256971113708463, + "grad_norm": 0.5291482520966377, + "learning_rate": 1.8433682730105706e-08, + "loss": 0.1558, + "step": 8761 + }, + { + "epoch": 2.926031056937719, + "grad_norm": 0.5176849375689132, + "learning_rate": 1.8267340917866306e-08, + "loss": 0.1506, + "step": 8762 + }, + { + "epoch": 2.9263650025045918, + "grad_norm": 0.5205766280424573, + "learning_rate": 1.8101751655003053e-08, + "loss": 0.1494, + "step": 8763 + }, + { + "epoch": 2.9266989480714645, + "grad_norm": 0.5050130872096809, + "learning_rate": 1.793691496653094e-08, + "loss": 0.1546, + "step": 8764 + }, + { + "epoch": 2.927032893638337, + "grad_norm": 0.5509444172234437, + "learning_rate": 1.7772830877348933e-08, + "loss": 0.159, + "step": 8765 + }, + { + "epoch": 2.9273668392052095, + "grad_norm": 0.5237455644993183, + "learning_rate": 1.760949941224499e-08, + "loss": 0.1498, + "step": 8766 + }, + { + "epoch": 2.927700784772082, + "grad_norm": 0.5474546675198331, + "learning_rate": 1.7446920595892147e-08, + "loss": 0.1599, + "step": 8767 + }, + { + "epoch": 2.928034730338955, + "grad_norm": 0.5292033875636737, + "learning_rate": 1.7285094452849095e-08, + "loss": 0.1528, + "step": 8768 + }, + { + "epoch": 2.928368675905827, + "grad_norm": 0.5326258288778125, + "learning_rate": 1.7124021007562385e-08, + "loss": 0.1541, + "step": 8769 + }, + { + "epoch": 2.9287026214727, + "grad_norm": 0.505259236701369, + "learning_rate": 1.696370028436367e-08, + "loss": 0.1504, + "step": 8770 + }, + { + "epoch": 2.9290365670395726, + "grad_norm": 0.5295968720784965, + "learning_rate": 1.6804132307471354e-08, + "loss": 0.1554, + "step": 8771 + }, + { + "epoch": 2.929370512606445, + "grad_norm": 0.5855530439478099, + "learning_rate": 1.6645317100990044e-08, + "loss": 0.1655, + "step": 8772 + }, + { + "epoch": 2.9297044581733176, + "grad_norm": 0.5341122406061138, + "learning_rate": 1.6487254688910546e-08, + "loss": 0.1496, + "step": 8773 + }, + { + "epoch": 2.9300384037401903, + "grad_norm": 0.4966688117422215, + "learning_rate": 1.6329945095110435e-08, + "loss": 0.1437, + "step": 8774 + }, + { + "epoch": 2.930372349307063, + "grad_norm": 0.5169723744277109, + "learning_rate": 1.6173388343352915e-08, + "loss": 0.154, + "step": 8775 + }, + { + "epoch": 2.9307062948739357, + "grad_norm": 0.5287153064523404, + "learning_rate": 1.601758445728796e-08, + "loss": 0.146, + "step": 8776 + }, + { + "epoch": 2.931040240440808, + "grad_norm": 0.5645974728665406, + "learning_rate": 1.586253346045119e-08, + "loss": 0.1636, + "step": 8777 + }, + { + "epoch": 2.9313741860076807, + "grad_norm": 0.5405173803625365, + "learning_rate": 1.570823537626498e-08, + "loss": 0.1619, + "step": 8778 + }, + { + "epoch": 2.9317081315745535, + "grad_norm": 0.47878908052664143, + "learning_rate": 1.5554690228037905e-08, + "loss": 0.1496, + "step": 8779 + }, + { + "epoch": 2.9320420771414257, + "grad_norm": 0.5136540347637661, + "learning_rate": 1.5401898038964748e-08, + "loss": 0.1505, + "step": 8780 + }, + { + "epoch": 2.9323760227082984, + "grad_norm": 0.5066164231201058, + "learning_rate": 1.5249858832126486e-08, + "loss": 0.1515, + "step": 8781 + }, + { + "epoch": 2.932709968275171, + "grad_norm": 0.5494650012983797, + "learning_rate": 1.5098572630491414e-08, + "loss": 0.1557, + "step": 8782 + }, + { + "epoch": 2.933043913842044, + "grad_norm": 0.5356394443781805, + "learning_rate": 1.4948039456911256e-08, + "loss": 0.1598, + "step": 8783 + }, + { + "epoch": 2.9333778594089166, + "grad_norm": 0.579231974043971, + "learning_rate": 1.4798259334127263e-08, + "loss": 0.1622, + "step": 8784 + }, + { + "epoch": 2.933711804975789, + "grad_norm": 0.549857407176255, + "learning_rate": 1.4649232284765225e-08, + "loss": 0.1566, + "step": 8785 + }, + { + "epoch": 2.9340457505426616, + "grad_norm": 0.4940294824980054, + "learning_rate": 1.4500958331337134e-08, + "loss": 0.1479, + "step": 8786 + }, + { + "epoch": 2.9343796961095343, + "grad_norm": 0.5158826721565871, + "learning_rate": 1.435343749624174e-08, + "loss": 0.1515, + "step": 8787 + }, + { + "epoch": 2.9347136416764066, + "grad_norm": 0.5551253298998934, + "learning_rate": 1.420666980176344e-08, + "loss": 0.1695, + "step": 8788 + }, + { + "epoch": 2.9350475872432793, + "grad_norm": 0.5605107676992664, + "learning_rate": 1.4060655270073387e-08, + "loss": 0.1479, + "step": 8789 + }, + { + "epoch": 2.935381532810152, + "grad_norm": 0.535586571303752, + "learning_rate": 1.3915393923228936e-08, + "loss": 0.1575, + "step": 8790 + }, + { + "epoch": 2.9357154783770243, + "grad_norm": 0.4790608505081332, + "learning_rate": 1.3770885783173649e-08, + "loss": 0.1492, + "step": 8791 + }, + { + "epoch": 2.936049423943897, + "grad_norm": 0.5189679586079701, + "learning_rate": 1.3627130871737282e-08, + "loss": 0.1468, + "step": 8792 + }, + { + "epoch": 2.9363833695107697, + "grad_norm": 0.5060738164366134, + "learning_rate": 1.3484129210635243e-08, + "loss": 0.1486, + "step": 8793 + }, + { + "epoch": 2.9367173150776424, + "grad_norm": 0.5236931826293424, + "learning_rate": 1.3341880821469699e-08, + "loss": 0.1517, + "step": 8794 + }, + { + "epoch": 2.937051260644515, + "grad_norm": 0.5122034983251454, + "learning_rate": 1.3200385725729014e-08, + "loss": 0.146, + "step": 8795 + }, + { + "epoch": 2.9373852062113874, + "grad_norm": 0.5644397660384538, + "learning_rate": 1.3059643944787759e-08, + "loss": 0.1627, + "step": 8796 + }, + { + "epoch": 2.93771915177826, + "grad_norm": 0.5436746008349438, + "learning_rate": 1.2919655499906703e-08, + "loss": 0.1595, + "step": 8797 + }, + { + "epoch": 2.938053097345133, + "grad_norm": 0.5430037231902027, + "learning_rate": 1.2780420412232263e-08, + "loss": 0.1577, + "step": 8798 + }, + { + "epoch": 2.938387042912005, + "grad_norm": 0.5761697034083223, + "learning_rate": 1.2641938702798174e-08, + "loss": 0.1583, + "step": 8799 + }, + { + "epoch": 2.938720988478878, + "grad_norm": 0.47887126004228137, + "learning_rate": 1.2504210392523808e-08, + "loss": 0.1374, + "step": 8800 + }, + { + "epoch": 2.9390549340457506, + "grad_norm": 0.5759489993737833, + "learning_rate": 1.2367235502214192e-08, + "loss": 0.1603, + "step": 8801 + }, + { + "epoch": 2.9393888796126233, + "grad_norm": 0.5153572086231668, + "learning_rate": 1.2231014052560553e-08, + "loss": 0.1525, + "step": 8802 + }, + { + "epoch": 2.939722825179496, + "grad_norm": 0.518999090589087, + "learning_rate": 1.2095546064141982e-08, + "loss": 0.16, + "step": 8803 + }, + { + "epoch": 2.9400567707463683, + "grad_norm": 0.54913576945233, + "learning_rate": 1.196083155742156e-08, + "loss": 0.159, + "step": 8804 + }, + { + "epoch": 2.940390716313241, + "grad_norm": 0.5129445655764181, + "learning_rate": 1.1826870552749669e-08, + "loss": 0.1531, + "step": 8805 + }, + { + "epoch": 2.9407246618801137, + "grad_norm": 0.5302553484489443, + "learning_rate": 1.169366307036346e-08, + "loss": 0.1511, + "step": 8806 + }, + { + "epoch": 2.941058607446986, + "grad_norm": 0.5156122133349168, + "learning_rate": 1.1561209130384055e-08, + "loss": 0.145, + "step": 8807 + }, + { + "epoch": 2.9413925530138587, + "grad_norm": 0.5127356121469071, + "learning_rate": 1.1429508752821561e-08, + "loss": 0.1478, + "step": 8808 + }, + { + "epoch": 2.9417264985807314, + "grad_norm": 0.522416127983697, + "learning_rate": 1.1298561957570065e-08, + "loss": 0.156, + "step": 8809 + }, + { + "epoch": 2.9420604441476037, + "grad_norm": 0.5832906496004197, + "learning_rate": 1.1168368764410408e-08, + "loss": 0.1676, + "step": 8810 + }, + { + "epoch": 2.9423943897144764, + "grad_norm": 0.5065280786514194, + "learning_rate": 1.103892919301075e-08, + "loss": 0.1494, + "step": 8811 + }, + { + "epoch": 2.942728335281349, + "grad_norm": 0.4869535167280794, + "learning_rate": 1.0910243262923781e-08, + "loss": 0.146, + "step": 8812 + }, + { + "epoch": 2.943062280848222, + "grad_norm": 0.5054299642033462, + "learning_rate": 1.0782310993589506e-08, + "loss": 0.1485, + "step": 8813 + }, + { + "epoch": 2.9433962264150946, + "grad_norm": 0.48002626148076355, + "learning_rate": 1.0655132404333024e-08, + "loss": 0.1442, + "step": 8814 + }, + { + "epoch": 2.943730171981967, + "grad_norm": 0.5758905705157378, + "learning_rate": 1.0528707514366743e-08, + "loss": 0.168, + "step": 8815 + }, + { + "epoch": 2.9440641175488396, + "grad_norm": 0.5102727690819755, + "learning_rate": 1.0403036342787609e-08, + "loss": 0.1505, + "step": 8816 + }, + { + "epoch": 2.9443980631157123, + "grad_norm": 0.48725259548600885, + "learning_rate": 1.0278118908580992e-08, + "loss": 0.1423, + "step": 8817 + }, + { + "epoch": 2.9447320086825846, + "grad_norm": 0.5339536559315551, + "learning_rate": 1.0153955230616241e-08, + "loss": 0.1593, + "step": 8818 + }, + { + "epoch": 2.9450659542494573, + "grad_norm": 0.5337907365193634, + "learning_rate": 1.0030545327650576e-08, + "loss": 0.1554, + "step": 8819 + }, + { + "epoch": 2.94539989981633, + "grad_norm": 0.5371286184405106, + "learning_rate": 9.907889218325751e-09, + "loss": 0.162, + "step": 8820 + }, + { + "epoch": 2.9457338453832023, + "grad_norm": 0.5106944556580268, + "learning_rate": 9.78598692117083e-09, + "loss": 0.1527, + "step": 8821 + }, + { + "epoch": 2.946067790950075, + "grad_norm": 0.4891694550107267, + "learning_rate": 9.664838454599978e-09, + "loss": 0.1417, + "step": 8822 + }, + { + "epoch": 2.9464017365169477, + "grad_norm": 0.5067859025666626, + "learning_rate": 9.544443836914664e-09, + "loss": 0.1561, + "step": 8823 + }, + { + "epoch": 2.9467356820838204, + "grad_norm": 0.6033702392881961, + "learning_rate": 9.42480308630256e-09, + "loss": 0.1666, + "step": 8824 + }, + { + "epoch": 2.947069627650693, + "grad_norm": 0.5274076271656751, + "learning_rate": 9.30591622083532e-09, + "loss": 0.1581, + "step": 8825 + }, + { + "epoch": 2.9474035732175654, + "grad_norm": 0.6016450274515811, + "learning_rate": 9.187783258473027e-09, + "loss": 0.1543, + "step": 8826 + }, + { + "epoch": 2.947737518784438, + "grad_norm": 0.629833846802068, + "learning_rate": 9.070404217061402e-09, + "loss": 0.1559, + "step": 8827 + }, + { + "epoch": 2.948071464351311, + "grad_norm": 0.49603332101775693, + "learning_rate": 8.953779114331262e-09, + "loss": 0.1417, + "step": 8828 + }, + { + "epoch": 2.948405409918183, + "grad_norm": 0.5690481056431702, + "learning_rate": 8.837907967900183e-09, + "loss": 0.1615, + "step": 8829 + }, + { + "epoch": 2.948739355485056, + "grad_norm": 0.5286074793185255, + "learning_rate": 8.722790795272495e-09, + "loss": 0.1498, + "step": 8830 + }, + { + "epoch": 2.9490733010519286, + "grad_norm": 0.5394688875104848, + "learning_rate": 8.608427613837622e-09, + "loss": 0.1599, + "step": 8831 + }, + { + "epoch": 2.9494072466188013, + "grad_norm": 0.5232591082172079, + "learning_rate": 8.494818440871189e-09, + "loss": 0.1518, + "step": 8832 + }, + { + "epoch": 2.949741192185674, + "grad_norm": 0.5323106596009022, + "learning_rate": 8.381963293535577e-09, + "loss": 0.1489, + "step": 8833 + }, + { + "epoch": 2.9500751377525463, + "grad_norm": 0.5059971519222234, + "learning_rate": 8.269862188879374e-09, + "loss": 0.1531, + "step": 8834 + }, + { + "epoch": 2.950409083319419, + "grad_norm": 0.5337561349373425, + "learning_rate": 8.158515143835698e-09, + "loss": 0.1498, + "step": 8835 + }, + { + "epoch": 2.9507430288862917, + "grad_norm": 0.5203134409775217, + "learning_rate": 8.047922175225542e-09, + "loss": 0.1616, + "step": 8836 + }, + { + "epoch": 2.951076974453164, + "grad_norm": 0.5669691571198704, + "learning_rate": 7.938083299754984e-09, + "loss": 0.1622, + "step": 8837 + }, + { + "epoch": 2.9514109200200367, + "grad_norm": 0.5413633005491444, + "learning_rate": 7.828998534016308e-09, + "loss": 0.1566, + "step": 8838 + }, + { + "epoch": 2.9517448655869094, + "grad_norm": 0.5485356620391724, + "learning_rate": 7.720667894488554e-09, + "loss": 0.1603, + "step": 8839 + }, + { + "epoch": 2.9520788111537817, + "grad_norm": 0.5412801017017537, + "learning_rate": 7.613091397535855e-09, + "loss": 0.1529, + "step": 8840 + }, + { + "epoch": 2.9524127567206544, + "grad_norm": 0.5082343717572196, + "learning_rate": 7.506269059409654e-09, + "loss": 0.1489, + "step": 8841 + }, + { + "epoch": 2.952746702287527, + "grad_norm": 0.5188298301684736, + "learning_rate": 7.400200896245935e-09, + "loss": 0.1548, + "step": 8842 + }, + { + "epoch": 2.9530806478544, + "grad_norm": 0.5401849776800605, + "learning_rate": 7.29488692406799e-09, + "loss": 0.1637, + "step": 8843 + }, + { + "epoch": 2.9534145934212725, + "grad_norm": 0.6017819588969197, + "learning_rate": 7.190327158784205e-09, + "loss": 0.17, + "step": 8844 + }, + { + "epoch": 2.953748538988145, + "grad_norm": 0.5050764086292866, + "learning_rate": 7.0865216161902785e-09, + "loss": 0.1495, + "step": 8845 + }, + { + "epoch": 2.9540824845550175, + "grad_norm": 0.5397065334704347, + "learning_rate": 6.983470311967e-09, + "loss": 0.1542, + "step": 8846 + }, + { + "epoch": 2.9544164301218903, + "grad_norm": 0.49879861658122404, + "learning_rate": 6.881173261680807e-09, + "loss": 0.1403, + "step": 8847 + }, + { + "epoch": 2.9547503756887625, + "grad_norm": 0.5275264970743618, + "learning_rate": 6.779630480786004e-09, + "loss": 0.1495, + "step": 8848 + }, + { + "epoch": 2.9550843212556352, + "grad_norm": 0.5011084154145502, + "learning_rate": 6.678841984621432e-09, + "loss": 0.1522, + "step": 8849 + }, + { + "epoch": 2.955418266822508, + "grad_norm": 0.5681686799273573, + "learning_rate": 6.578807788411579e-09, + "loss": 0.1636, + "step": 8850 + }, + { + "epoch": 2.9557522123893807, + "grad_norm": 0.531060734134599, + "learning_rate": 6.479527907268801e-09, + "loss": 0.1514, + "step": 8851 + }, + { + "epoch": 2.9560861579562534, + "grad_norm": 0.5219698663221229, + "learning_rate": 6.381002356189991e-09, + "loss": 0.1504, + "step": 8852 + }, + { + "epoch": 2.9564201035231257, + "grad_norm": 0.5363223857696832, + "learning_rate": 6.283231150058799e-09, + "loss": 0.156, + "step": 8853 + }, + { + "epoch": 2.9567540490899984, + "grad_norm": 0.5360700439648506, + "learning_rate": 6.186214303645077e-09, + "loss": 0.1574, + "step": 8854 + }, + { + "epoch": 2.957087994656871, + "grad_norm": 0.5512505946034206, + "learning_rate": 6.0899518316032135e-09, + "loss": 0.1598, + "step": 8855 + }, + { + "epoch": 2.9574219402237434, + "grad_norm": 0.5071274370861918, + "learning_rate": 5.99444374847602e-09, + "loss": 0.1502, + "step": 8856 + }, + { + "epoch": 2.957755885790616, + "grad_norm": 0.5092307827434925, + "learning_rate": 5.899690068690289e-09, + "loss": 0.1435, + "step": 8857 + }, + { + "epoch": 2.958089831357489, + "grad_norm": 0.49280916083565207, + "learning_rate": 5.805690806560127e-09, + "loss": 0.1444, + "step": 8858 + }, + { + "epoch": 2.958423776924361, + "grad_norm": 0.5065398927260356, + "learning_rate": 5.712445976285286e-09, + "loss": 0.1511, + "step": 8859 + }, + { + "epoch": 2.958757722491234, + "grad_norm": 0.5279817308600407, + "learning_rate": 5.619955591951165e-09, + "loss": 0.1616, + "step": 8860 + }, + { + "epoch": 2.9590916680581065, + "grad_norm": 0.5624691691259425, + "learning_rate": 5.528219667529921e-09, + "loss": 0.1622, + "step": 8861 + }, + { + "epoch": 2.9594256136249792, + "grad_norm": 0.5183889213776066, + "learning_rate": 5.437238216878804e-09, + "loss": 0.1538, + "step": 8862 + }, + { + "epoch": 2.959759559191852, + "grad_norm": 0.5406835519360788, + "learning_rate": 5.347011253741819e-09, + "loss": 0.1587, + "step": 8863 + }, + { + "epoch": 2.9600935047587242, + "grad_norm": 0.49802577951748117, + "learning_rate": 5.257538791749173e-09, + "loss": 0.1481, + "step": 8864 + }, + { + "epoch": 2.960427450325597, + "grad_norm": 0.5261683688675758, + "learning_rate": 5.168820844416167e-09, + "loss": 0.1565, + "step": 8865 + }, + { + "epoch": 2.9607613958924697, + "grad_norm": 0.5114224521968415, + "learning_rate": 5.080857425145413e-09, + "loss": 0.1521, + "step": 8866 + }, + { + "epoch": 2.961095341459342, + "grad_norm": 0.5208384086955586, + "learning_rate": 4.993648547224062e-09, + "loss": 0.1551, + "step": 8867 + }, + { + "epoch": 2.9614292870262147, + "grad_norm": 0.5482296679225996, + "learning_rate": 4.907194223826572e-09, + "loss": 0.1543, + "step": 8868 + }, + { + "epoch": 2.9617632325930874, + "grad_norm": 0.5106259843517691, + "learning_rate": 4.8214944680125e-09, + "loss": 0.1499, + "step": 8869 + }, + { + "epoch": 2.9620971781599597, + "grad_norm": 0.5261301025915729, + "learning_rate": 4.736549292728154e-09, + "loss": 0.1586, + "step": 8870 + }, + { + "epoch": 2.9624311237268324, + "grad_norm": 0.5858625891292126, + "learning_rate": 4.652358710805494e-09, + "loss": 0.1639, + "step": 8871 + }, + { + "epoch": 2.962765069293705, + "grad_norm": 0.5259863655766891, + "learning_rate": 4.5689227349626775e-09, + "loss": 0.1551, + "step": 8872 + }, + { + "epoch": 2.963099014860578, + "grad_norm": 0.5277440908987092, + "learning_rate": 4.486241377802958e-09, + "loss": 0.1527, + "step": 8873 + }, + { + "epoch": 2.9634329604274505, + "grad_norm": 0.5230736899290115, + "learning_rate": 4.404314651816344e-09, + "loss": 0.1591, + "step": 8874 + }, + { + "epoch": 2.963766905994323, + "grad_norm": 0.5575061010473056, + "learning_rate": 4.323142569379602e-09, + "loss": 0.1604, + "step": 8875 + }, + { + "epoch": 2.9641008515611955, + "grad_norm": 0.5401053105604766, + "learning_rate": 4.242725142754589e-09, + "loss": 0.1464, + "step": 8876 + }, + { + "epoch": 2.9644347971280682, + "grad_norm": 0.5475423256849493, + "learning_rate": 4.163062384088812e-09, + "loss": 0.1624, + "step": 8877 + }, + { + "epoch": 2.9647687426949405, + "grad_norm": 0.5765524254562182, + "learning_rate": 4.0841543054165324e-09, + "loss": 0.1742, + "step": 8878 + }, + { + "epoch": 2.9651026882618132, + "grad_norm": 0.5045226252131164, + "learning_rate": 4.006000918658215e-09, + "loss": 0.1519, + "step": 8879 + }, + { + "epoch": 2.965436633828686, + "grad_norm": 0.5294171058153377, + "learning_rate": 3.928602235618861e-09, + "loss": 0.146, + "step": 8880 + }, + { + "epoch": 2.9657705793955587, + "grad_norm": 0.5557587243982196, + "learning_rate": 3.851958267990785e-09, + "loss": 0.16, + "step": 8881 + }, + { + "epoch": 2.9661045249624314, + "grad_norm": 0.516710441466049, + "learning_rate": 3.776069027352503e-09, + "loss": 0.1487, + "step": 8882 + }, + { + "epoch": 2.9664384705293037, + "grad_norm": 0.5615655633903573, + "learning_rate": 3.700934525167621e-09, + "loss": 0.1602, + "step": 8883 + }, + { + "epoch": 2.9667724160961764, + "grad_norm": 0.5047773838898094, + "learning_rate": 3.626554772786506e-09, + "loss": 0.1444, + "step": 8884 + }, + { + "epoch": 2.967106361663049, + "grad_norm": 0.515974634919327, + "learning_rate": 3.5529297814440587e-09, + "loss": 0.1498, + "step": 8885 + }, + { + "epoch": 2.9674403072299214, + "grad_norm": 0.536967546601452, + "learning_rate": 3.4800595622630497e-09, + "loss": 0.1507, + "step": 8886 + }, + { + "epoch": 2.967774252796794, + "grad_norm": 0.5415943444305699, + "learning_rate": 3.407944126251339e-09, + "loss": 0.1511, + "step": 8887 + }, + { + "epoch": 2.968108198363667, + "grad_norm": 0.5237385427929732, + "learning_rate": 3.336583484301881e-09, + "loss": 0.1463, + "step": 8888 + }, + { + "epoch": 2.968442143930539, + "grad_norm": 0.5448904790540473, + "learning_rate": 3.2659776471960505e-09, + "loss": 0.1659, + "step": 8889 + }, + { + "epoch": 2.968776089497412, + "grad_norm": 0.47703613603288947, + "learning_rate": 3.19612662559865e-09, + "loss": 0.1421, + "step": 8890 + }, + { + "epoch": 2.9691100350642845, + "grad_norm": 0.5685768307596133, + "learning_rate": 3.1270304300617947e-09, + "loss": 0.1746, + "step": 8891 + }, + { + "epoch": 2.969443980631157, + "grad_norm": 0.5260057299774317, + "learning_rate": 3.0586890710232465e-09, + "loss": 0.1579, + "step": 8892 + }, + { + "epoch": 2.96977792619803, + "grad_norm": 0.48821693615311545, + "learning_rate": 2.9911025588069685e-09, + "loss": 0.1446, + "step": 8893 + }, + { + "epoch": 2.970111871764902, + "grad_norm": 0.4848027115717547, + "learning_rate": 2.9242709036225723e-09, + "loss": 0.1406, + "step": 8894 + }, + { + "epoch": 2.970445817331775, + "grad_norm": 0.5188283228711802, + "learning_rate": 2.858194115565871e-09, + "loss": 0.151, + "step": 8895 + }, + { + "epoch": 2.9707797628986476, + "grad_norm": 0.5553633231710993, + "learning_rate": 2.7928722046177692e-09, + "loss": 0.1645, + "step": 8896 + }, + { + "epoch": 2.97111370846552, + "grad_norm": 0.510796810430822, + "learning_rate": 2.7283051806470394e-09, + "loss": 0.1477, + "step": 8897 + }, + { + "epoch": 2.9714476540323926, + "grad_norm": 0.565538056069778, + "learning_rate": 2.664493053406436e-09, + "loss": 0.1614, + "step": 8898 + }, + { + "epoch": 2.9717815995992654, + "grad_norm": 0.5381306694870884, + "learning_rate": 2.6014358325360256e-09, + "loss": 0.1585, + "step": 8899 + }, + { + "epoch": 2.972115545166138, + "grad_norm": 0.5257043175764273, + "learning_rate": 2.5391335275609665e-09, + "loss": 0.1528, + "step": 8900 + }, + { + "epoch": 2.972449490733011, + "grad_norm": 0.5353702583181583, + "learning_rate": 2.4775861478937293e-09, + "loss": 0.1523, + "step": 8901 + }, + { + "epoch": 2.972783436299883, + "grad_norm": 0.508839273196237, + "learning_rate": 2.416793702830211e-09, + "loss": 0.1445, + "step": 8902 + }, + { + "epoch": 2.973117381866756, + "grad_norm": 0.5087754840730636, + "learning_rate": 2.3567562015547328e-09, + "loss": 0.1427, + "step": 8903 + }, + { + "epoch": 2.9734513274336285, + "grad_norm": 0.524446864735963, + "learning_rate": 2.297473653136706e-09, + "loss": 0.1457, + "step": 8904 + }, + { + "epoch": 2.9737852730005008, + "grad_norm": 0.5317021238776584, + "learning_rate": 2.2389460665317443e-09, + "loss": 0.1547, + "step": 8905 + }, + { + "epoch": 2.9741192185673735, + "grad_norm": 0.5407645623936888, + "learning_rate": 2.1811734505799985e-09, + "loss": 0.1641, + "step": 8906 + }, + { + "epoch": 2.974453164134246, + "grad_norm": 0.5124507419550098, + "learning_rate": 2.1241558140100426e-09, + "loss": 0.1574, + "step": 8907 + }, + { + "epoch": 2.9747871097011185, + "grad_norm": 0.5441593837282322, + "learning_rate": 2.0678931654344314e-09, + "loss": 0.158, + "step": 8908 + }, + { + "epoch": 2.975121055267991, + "grad_norm": 0.5132818149239089, + "learning_rate": 2.012385513351922e-09, + "loss": 0.15, + "step": 8909 + }, + { + "epoch": 2.975455000834864, + "grad_norm": 0.5508960448044162, + "learning_rate": 1.9576328661480293e-09, + "loss": 0.1608, + "step": 8910 + }, + { + "epoch": 2.9757889464017366, + "grad_norm": 0.5478676465623783, + "learning_rate": 1.9036352320939146e-09, + "loss": 0.1563, + "step": 8911 + }, + { + "epoch": 2.9761228919686094, + "grad_norm": 0.5743289834297103, + "learning_rate": 1.850392619345831e-09, + "loss": 0.169, + "step": 8912 + }, + { + "epoch": 2.9764568375354816, + "grad_norm": 0.540526385238044, + "learning_rate": 1.7979050359479e-09, + "loss": 0.1572, + "step": 8913 + }, + { + "epoch": 2.9767907831023543, + "grad_norm": 0.5118515001947324, + "learning_rate": 1.746172489828224e-09, + "loss": 0.1496, + "step": 8914 + }, + { + "epoch": 2.977124728669227, + "grad_norm": 0.5295962054993424, + "learning_rate": 1.6951949888016627e-09, + "loss": 0.1488, + "step": 8915 + }, + { + "epoch": 2.9774586742360993, + "grad_norm": 0.5556596310017836, + "learning_rate": 1.6449725405687234e-09, + "loss": 0.1555, + "step": 8916 + }, + { + "epoch": 2.977792619802972, + "grad_norm": 0.568570152916496, + "learning_rate": 1.59550515271667e-09, + "loss": 0.1674, + "step": 8917 + }, + { + "epoch": 2.9781265653698448, + "grad_norm": 0.5121218556638442, + "learning_rate": 1.5467928327178582e-09, + "loss": 0.1512, + "step": 8918 + }, + { + "epoch": 2.978460510936717, + "grad_norm": 0.545092074861627, + "learning_rate": 1.498835587930847e-09, + "loss": 0.1579, + "step": 8919 + }, + { + "epoch": 2.9787944565035898, + "grad_norm": 0.5238858042064578, + "learning_rate": 1.4516334256003962e-09, + "loss": 0.1558, + "step": 8920 + }, + { + "epoch": 2.9791284020704625, + "grad_norm": 0.5058952627939988, + "learning_rate": 1.4051863528563581e-09, + "loss": 0.1447, + "step": 8921 + }, + { + "epoch": 2.979462347637335, + "grad_norm": 0.48109655383730054, + "learning_rate": 1.3594943767158974e-09, + "loss": 0.1414, + "step": 8922 + }, + { + "epoch": 2.979796293204208, + "grad_norm": 0.5352658964999676, + "learning_rate": 1.3145575040801605e-09, + "loss": 0.1466, + "step": 8923 + }, + { + "epoch": 2.98013023877108, + "grad_norm": 0.5336342161222689, + "learning_rate": 1.2703757417387164e-09, + "loss": 0.1514, + "step": 8924 + }, + { + "epoch": 2.980464184337953, + "grad_norm": 0.526133531148846, + "learning_rate": 1.2269490963651154e-09, + "loss": 0.154, + "step": 8925 + }, + { + "epoch": 2.9807981299048256, + "grad_norm": 0.5280976403546764, + "learning_rate": 1.1842775745196655e-09, + "loss": 0.1564, + "step": 8926 + }, + { + "epoch": 2.981132075471698, + "grad_norm": 0.49864255780159394, + "learning_rate": 1.1423611826477665e-09, + "loss": 0.153, + "step": 8927 + }, + { + "epoch": 2.9814660210385706, + "grad_norm": 0.4990574803432966, + "learning_rate": 1.1011999270821305e-09, + "loss": 0.1519, + "step": 8928 + }, + { + "epoch": 2.9817999666054433, + "grad_norm": 0.5206535132804454, + "learning_rate": 1.0607938140400064e-09, + "loss": 0.1481, + "step": 8929 + }, + { + "epoch": 2.982133912172316, + "grad_norm": 0.5203593281225992, + "learning_rate": 1.0211428496259557e-09, + "loss": 0.1585, + "step": 8930 + }, + { + "epoch": 2.9824678577391888, + "grad_norm": 0.511652837070169, + "learning_rate": 9.822470398296312e-10, + "loss": 0.1445, + "step": 8931 + }, + { + "epoch": 2.982801803306061, + "grad_norm": 0.5299622217401528, + "learning_rate": 9.441063905257785e-10, + "loss": 0.1523, + "step": 8932 + }, + { + "epoch": 2.9831357488729338, + "grad_norm": 0.5267298343073171, + "learning_rate": 9.067209074770101e-10, + "loss": 0.1556, + "step": 8933 + }, + { + "epoch": 2.9834696944398065, + "grad_norm": 0.5149660634700207, + "learning_rate": 8.700905963304751e-10, + "loss": 0.1552, + "step": 8934 + }, + { + "epoch": 2.9838036400066787, + "grad_norm": 0.6143707298607329, + "learning_rate": 8.342154626195254e-10, + "loss": 0.1573, + "step": 8935 + }, + { + "epoch": 2.9841375855735515, + "grad_norm": 0.5435314943549732, + "learning_rate": 7.990955117631594e-10, + "loss": 0.1528, + "step": 8936 + }, + { + "epoch": 2.984471531140424, + "grad_norm": 0.5484366140014967, + "learning_rate": 7.647307490676881e-10, + "loss": 0.1598, + "step": 8937 + }, + { + "epoch": 2.9848054767072965, + "grad_norm": 0.55852719855818, + "learning_rate": 7.311211797234041e-10, + "loss": 0.1551, + "step": 8938 + }, + { + "epoch": 2.985139422274169, + "grad_norm": 0.557832342106755, + "learning_rate": 6.982668088079126e-10, + "loss": 0.1673, + "step": 8939 + }, + { + "epoch": 2.985473367841042, + "grad_norm": 0.5086856936416371, + "learning_rate": 6.661676412844653e-10, + "loss": 0.1518, + "step": 8940 + }, + { + "epoch": 2.9858073134079146, + "grad_norm": 0.5437075245689668, + "learning_rate": 6.348236820008513e-10, + "loss": 0.1728, + "step": 8941 + }, + { + "epoch": 2.9861412589747873, + "grad_norm": 0.5162048454653639, + "learning_rate": 6.042349356932819e-10, + "loss": 0.1616, + "step": 8942 + }, + { + "epoch": 2.9864752045416596, + "grad_norm": 0.539657063914861, + "learning_rate": 5.744014069819503e-10, + "loss": 0.155, + "step": 8943 + }, + { + "epoch": 2.9868091501085323, + "grad_norm": 0.5276528948415139, + "learning_rate": 5.453231003732518e-10, + "loss": 0.1614, + "step": 8944 + }, + { + "epoch": 2.987143095675405, + "grad_norm": 0.5228415015085567, + "learning_rate": 5.170000202608938e-10, + "loss": 0.1488, + "step": 8945 + }, + { + "epoch": 2.9874770412422773, + "grad_norm": 0.5328035030755964, + "learning_rate": 4.894321709220106e-10, + "loss": 0.1496, + "step": 8946 + }, + { + "epoch": 2.98781098680915, + "grad_norm": 0.5211736551909211, + "learning_rate": 4.626195565221592e-10, + "loss": 0.156, + "step": 8947 + }, + { + "epoch": 2.9881449323760227, + "grad_norm": 0.5664070819139843, + "learning_rate": 4.365621811108778e-10, + "loss": 0.162, + "step": 8948 + }, + { + "epoch": 2.9884788779428955, + "grad_norm": 0.5453801039366839, + "learning_rate": 4.112600486250173e-10, + "loss": 0.1543, + "step": 8949 + }, + { + "epoch": 2.988812823509768, + "grad_norm": 0.5163009694570976, + "learning_rate": 3.867131628865206e-10, + "loss": 0.15, + "step": 8950 + }, + { + "epoch": 2.9891467690766405, + "grad_norm": 0.5144527921379415, + "learning_rate": 3.629215276035325e-10, + "loss": 0.1479, + "step": 8951 + }, + { + "epoch": 2.989480714643513, + "grad_norm": 0.4984509452806607, + "learning_rate": 3.3988514637040003e-10, + "loss": 0.1503, + "step": 8952 + }, + { + "epoch": 2.989814660210386, + "grad_norm": 0.530959054033469, + "learning_rate": 3.176040226660071e-10, + "loss": 0.1607, + "step": 8953 + }, + { + "epoch": 2.990148605777258, + "grad_norm": 0.5323475806559018, + "learning_rate": 2.960781598576601e-10, + "loss": 0.1558, + "step": 8954 + }, + { + "epoch": 2.990482551344131, + "grad_norm": 0.525452993656753, + "learning_rate": 2.7530756119609204e-10, + "loss": 0.1534, + "step": 8955 + }, + { + "epoch": 2.9908164969110036, + "grad_norm": 0.5141292907473098, + "learning_rate": 2.5529222981879323e-10, + "loss": 0.1516, + "step": 8956 + }, + { + "epoch": 2.991150442477876, + "grad_norm": 0.5160401121615004, + "learning_rate": 2.360321687500111e-10, + "loss": 0.1578, + "step": 8957 + }, + { + "epoch": 2.9914843880447486, + "grad_norm": 0.5226041005544119, + "learning_rate": 2.175273808985301e-10, + "loss": 0.1587, + "step": 8958 + }, + { + "epoch": 2.9918183336116213, + "grad_norm": 0.5481442045964605, + "learning_rate": 1.9977786906044683e-10, + "loss": 0.1589, + "step": 8959 + }, + { + "epoch": 2.992152279178494, + "grad_norm": 0.5130316124877334, + "learning_rate": 1.827836359163948e-10, + "loss": 0.1543, + "step": 8960 + }, + { + "epoch": 2.9924862247453667, + "grad_norm": 0.49218211497168046, + "learning_rate": 1.665446840343199e-10, + "loss": 0.1487, + "step": 8961 + }, + { + "epoch": 2.992820170312239, + "grad_norm": 0.5173019376331516, + "learning_rate": 1.5106101586614963e-10, + "loss": 0.1496, + "step": 8962 + }, + { + "epoch": 2.9931541158791117, + "grad_norm": 0.5519389841712395, + "learning_rate": 1.3633263375223414e-10, + "loss": 0.1646, + "step": 8963 + }, + { + "epoch": 2.9934880614459844, + "grad_norm": 0.49634411519978117, + "learning_rate": 1.223595399163502e-10, + "loss": 0.1522, + "step": 8964 + }, + { + "epoch": 2.9938220070128567, + "grad_norm": 0.5190963445321033, + "learning_rate": 1.091417364695868e-10, + "loss": 0.1518, + "step": 8965 + }, + { + "epoch": 2.9941559525797294, + "grad_norm": 0.5046318641135656, + "learning_rate": 9.667922540868013e-11, + "loss": 0.1506, + "step": 8966 + }, + { + "epoch": 2.994489898146602, + "grad_norm": 0.5811234054996668, + "learning_rate": 8.49720086165684e-11, + "loss": 0.1624, + "step": 8967 + }, + { + "epoch": 2.9948238437134744, + "grad_norm": 0.5187060796598542, + "learning_rate": 7.40200878618369e-11, + "loss": 0.1494, + "step": 8968 + }, + { + "epoch": 2.995157789280347, + "grad_norm": 0.5210939621866848, + "learning_rate": 6.382346479816282e-11, + "loss": 0.1562, + "step": 8969 + }, + { + "epoch": 2.99549173484722, + "grad_norm": 0.5147922038095906, + "learning_rate": 5.438214096653571e-11, + "loss": 0.1591, + "step": 8970 + }, + { + "epoch": 2.9958256804140926, + "grad_norm": 0.5091227792055766, + "learning_rate": 4.569611779248195e-11, + "loss": 0.1474, + "step": 8971 + }, + { + "epoch": 2.9961596259809653, + "grad_norm": 0.52462660718752, + "learning_rate": 3.776539658939538e-11, + "loss": 0.1587, + "step": 8972 + }, + { + "epoch": 2.9964935715478376, + "grad_norm": 0.5151458507703925, + "learning_rate": 3.0589978553541286e-11, + "loss": 0.1616, + "step": 8973 + }, + { + "epoch": 2.9968275171147103, + "grad_norm": 0.5064544199820976, + "learning_rate": 2.416986477071781e-11, + "loss": 0.1483, + "step": 8974 + }, + { + "epoch": 2.997161462681583, + "grad_norm": 0.5835195691906906, + "learning_rate": 1.850505620903942e-11, + "loss": 0.1629, + "step": 8975 + }, + { + "epoch": 2.9974954082484553, + "grad_norm": 0.5602503239016194, + "learning_rate": 1.3595553725598287e-11, + "loss": 0.1666, + "step": 8976 + }, + { + "epoch": 2.997829353815328, + "grad_norm": 0.5032549204664997, + "learning_rate": 9.441358061468286e-12, + "loss": 0.156, + "step": 8977 + }, + { + "epoch": 2.9981632993822007, + "grad_norm": 0.5091638685400584, + "learning_rate": 6.042469843925425e-12, + "loss": 0.1564, + "step": 8978 + }, + { + "epoch": 2.9984972449490734, + "grad_norm": 0.5291218098615191, + "learning_rate": 3.398889586447851e-12, + "loss": 0.1521, + "step": 8979 + }, + { + "epoch": 2.998831190515946, + "grad_norm": 0.5183009155912531, + "learning_rate": 1.5106176892709656e-12, + "loss": 0.1503, + "step": 8980 + }, + { + "epoch": 2.9991651360828184, + "grad_norm": 0.5279125117346501, + "learning_rate": 3.7765443661186283e-13, + "loss": 0.1586, + "step": 8981 + }, + { + "epoch": 2.999499081649691, + "grad_norm": 0.4923848154444246, + "learning_rate": 0.0, + "loss": 0.1483, + "step": 8982 + }, + { + "epoch": 2.999499081649691, + "eval_loss": 0.21598096191883087, + "eval_runtime": 183.8798, + "eval_samples_per_second": 109.708, + "eval_steps_per_second": 1.719, + "step": 8982 + }, + { + "epoch": 2.999499081649691, + "step": 8982, + "total_flos": 2.8091411711025807e+18, + "train_loss": 0.2020581066641609, + "train_runtime": 42907.438, + "train_samples_per_second": 26.797, + "train_steps_per_second": 0.209 + } + ], + "logging_steps": 1, + "max_steps": 8982, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.8091411711025807e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}