diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,32642 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 46554, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 2.1480431327061047e-05, + "grad_norm": 26.65201832627396, + "learning_rate": 4.291845493562232e-08, + "loss": 2.0281, + "step": 1 + }, + { + "epoch": 0.0002148043132706105, + "grad_norm": 24.545838847916603, + "learning_rate": 4.291845493562232e-07, + "loss": 1.9707, + "step": 10 + }, + { + "epoch": 0.000429608626541221, + "grad_norm": 6.925669336747074, + "learning_rate": 8.583690987124464e-07, + "loss": 1.7361, + "step": 20 + }, + { + "epoch": 0.0006444129398118315, + "grad_norm": 3.563926800405336, + "learning_rate": 1.2875536480686696e-06, + "loss": 1.5248, + "step": 30 + }, + { + "epoch": 0.000859217253082442, + "grad_norm": 2.7917255910937255, + "learning_rate": 1.7167381974248929e-06, + "loss": 1.3065, + "step": 40 + }, + { + "epoch": 0.0010740215663530524, + "grad_norm": 1.3127546793240747, + "learning_rate": 2.145922746781116e-06, + "loss": 1.2055, + "step": 50 + }, + { + "epoch": 0.001288825879623663, + "grad_norm": 1.1166152222590124, + "learning_rate": 2.575107296137339e-06, + "loss": 1.1276, + "step": 60 + }, + { + "epoch": 0.0015036301928942734, + "grad_norm": 1.1067088125847406, + "learning_rate": 3.004291845493563e-06, + "loss": 1.111, + "step": 70 + }, + { + "epoch": 0.001718434506164884, + "grad_norm": 1.0213495251064038, + "learning_rate": 3.4334763948497858e-06, + "loss": 1.0633, + "step": 80 + }, + { + "epoch": 0.0019332388194354942, + "grad_norm": 1.4698974676460892, + "learning_rate": 3.862660944206009e-06, + "loss": 1.0363, + "step": 90 + }, + { + "epoch": 0.002148043132706105, + "grad_norm": 1.1069539300208826, + "learning_rate": 4.291845493562232e-06, + "loss": 1.0595, + "step": 100 + }, + { + "epoch": 0.002362847445976715, + "grad_norm": 1.0428861367575017, + "learning_rate": 4.721030042918455e-06, + "loss": 1.0544, + "step": 110 + }, + { + "epoch": 0.002577651759247326, + "grad_norm": 1.0155126146726958, + "learning_rate": 5.150214592274678e-06, + "loss": 1.0247, + "step": 120 + }, + { + "epoch": 0.002792456072517936, + "grad_norm": 1.0457307269517822, + "learning_rate": 5.579399141630901e-06, + "loss": 1.0231, + "step": 130 + }, + { + "epoch": 0.003007260385788547, + "grad_norm": 0.9747222232804185, + "learning_rate": 6.008583690987126e-06, + "loss": 1.0112, + "step": 140 + }, + { + "epoch": 0.003222064699059157, + "grad_norm": 1.0485056497719716, + "learning_rate": 6.437768240343349e-06, + "loss": 0.9848, + "step": 150 + }, + { + "epoch": 0.003436869012329768, + "grad_norm": 1.015067491069871, + "learning_rate": 6.8669527896995715e-06, + "loss": 1.0101, + "step": 160 + }, + { + "epoch": 0.003651673325600378, + "grad_norm": 0.9810404079330955, + "learning_rate": 7.296137339055794e-06, + "loss": 0.981, + "step": 170 + }, + { + "epoch": 0.0038664776388709883, + "grad_norm": 1.018895365287239, + "learning_rate": 7.725321888412017e-06, + "loss": 0.9754, + "step": 180 + }, + { + "epoch": 0.004081281952141599, + "grad_norm": 1.079857560560485, + "learning_rate": 8.154506437768241e-06, + "loss": 0.9996, + "step": 190 + }, + { + "epoch": 0.00429608626541221, + "grad_norm": 0.9967859001475669, + "learning_rate": 8.583690987124465e-06, + "loss": 0.9952, + "step": 200 + }, + { + "epoch": 
0.00451089057868282, + "grad_norm": 1.115201831741858, + "learning_rate": 9.012875536480687e-06, + "loss": 0.9742, + "step": 210 + }, + { + "epoch": 0.00472569489195343, + "grad_norm": 0.9523509165576094, + "learning_rate": 9.44206008583691e-06, + "loss": 0.9831, + "step": 220 + }, + { + "epoch": 0.004940499205224041, + "grad_norm": 1.005436826590897, + "learning_rate": 9.871244635193133e-06, + "loss": 0.9773, + "step": 230 + }, + { + "epoch": 0.005155303518494652, + "grad_norm": 0.9901616620650135, + "learning_rate": 1.0300429184549356e-05, + "loss": 0.9781, + "step": 240 + }, + { + "epoch": 0.0053701078317652615, + "grad_norm": 1.0638485393442876, + "learning_rate": 1.072961373390558e-05, + "loss": 0.96, + "step": 250 + }, + { + "epoch": 0.005584912145035872, + "grad_norm": 0.9964178960246284, + "learning_rate": 1.1158798283261802e-05, + "loss": 0.9742, + "step": 260 + }, + { + "epoch": 0.005799716458306483, + "grad_norm": 1.0370493768439413, + "learning_rate": 1.1587982832618028e-05, + "loss": 0.9602, + "step": 270 + }, + { + "epoch": 0.006014520771577094, + "grad_norm": 0.9796931302576042, + "learning_rate": 1.2017167381974251e-05, + "loss": 0.9632, + "step": 280 + }, + { + "epoch": 0.0062293250848477035, + "grad_norm": 1.031038089029062, + "learning_rate": 1.2446351931330473e-05, + "loss": 0.9593, + "step": 290 + }, + { + "epoch": 0.006444129398118314, + "grad_norm": 0.9515929962341028, + "learning_rate": 1.2875536480686697e-05, + "loss": 0.949, + "step": 300 + }, + { + "epoch": 0.006658933711388925, + "grad_norm": 0.9520559803996252, + "learning_rate": 1.330472103004292e-05, + "loss": 0.9457, + "step": 310 + }, + { + "epoch": 0.006873738024659536, + "grad_norm": 1.3890037359682255, + "learning_rate": 1.3733905579399143e-05, + "loss": 0.9584, + "step": 320 + }, + { + "epoch": 0.007088542337930145, + "grad_norm": 0.9169319166076477, + "learning_rate": 1.4163090128755365e-05, + "loss": 0.9464, + "step": 330 + }, + { + "epoch": 0.007303346651200756, + "grad_norm": 0.9772519097125956, + "learning_rate": 1.4592274678111589e-05, + "loss": 0.9614, + "step": 340 + }, + { + "epoch": 0.007518150964471367, + "grad_norm": 1.1706529511376937, + "learning_rate": 1.5021459227467811e-05, + "loss": 0.9553, + "step": 350 + }, + { + "epoch": 0.007732955277741977, + "grad_norm": 0.9436703190905599, + "learning_rate": 1.5450643776824035e-05, + "loss": 0.9417, + "step": 360 + }, + { + "epoch": 0.007947759591012588, + "grad_norm": 0.8941917268650862, + "learning_rate": 1.587982832618026e-05, + "loss": 0.9382, + "step": 370 + }, + { + "epoch": 0.008162563904283198, + "grad_norm": 0.9682494688789302, + "learning_rate": 1.6309012875536482e-05, + "loss": 0.9636, + "step": 380 + }, + { + "epoch": 0.008377368217553808, + "grad_norm": 0.968118858319082, + "learning_rate": 1.6738197424892706e-05, + "loss": 0.9406, + "step": 390 + }, + { + "epoch": 0.00859217253082442, + "grad_norm": 0.9304992691801565, + "learning_rate": 1.716738197424893e-05, + "loss": 0.9467, + "step": 400 + }, + { + "epoch": 0.00880697684409503, + "grad_norm": 0.8846995721706429, + "learning_rate": 1.7596566523605153e-05, + "loss": 0.9552, + "step": 410 + }, + { + "epoch": 0.00902178115736564, + "grad_norm": 1.0342299734011187, + "learning_rate": 1.8025751072961374e-05, + "loss": 0.9485, + "step": 420 + }, + { + "epoch": 0.00923658547063625, + "grad_norm": 1.051909712027358, + "learning_rate": 1.8454935622317597e-05, + "loss": 0.9635, + "step": 430 + }, + { + "epoch": 0.00945138978390686, + "grad_norm": 1.0942808627738883, + "learning_rate": 
1.888412017167382e-05, + "loss": 0.9421, + "step": 440 + }, + { + "epoch": 0.009666194097177472, + "grad_norm": 0.8646650135642147, + "learning_rate": 1.9313304721030045e-05, + "loss": 0.977, + "step": 450 + }, + { + "epoch": 0.009880998410448082, + "grad_norm": 0.9215653231258697, + "learning_rate": 1.9742489270386265e-05, + "loss": 0.9571, + "step": 460 + }, + { + "epoch": 0.010095802723718692, + "grad_norm": 0.9199665564587306, + "learning_rate": 1.999999962828165e-05, + "loss": 0.9356, + "step": 470 + }, + { + "epoch": 0.010310607036989303, + "grad_norm": 0.9425113636574227, + "learning_rate": 1.9999995446450485e-05, + "loss": 0.9624, + "step": 480 + }, + { + "epoch": 0.010525411350259913, + "grad_norm": 0.9337828020549753, + "learning_rate": 1.999998661814216e-05, + "loss": 0.9468, + "step": 490 + }, + { + "epoch": 0.010740215663530523, + "grad_norm": 0.927989701189411, + "learning_rate": 1.9999973143360788e-05, + "loss": 0.9396, + "step": 500 + }, + { + "epoch": 0.010955019976801135, + "grad_norm": 0.9324290313999429, + "learning_rate": 1.999995502211262e-05, + "loss": 0.9563, + "step": 510 + }, + { + "epoch": 0.011169824290071744, + "grad_norm": 0.9434102759361815, + "learning_rate": 1.9999932254406077e-05, + "loss": 0.9609, + "step": 520 + }, + { + "epoch": 0.011384628603342354, + "grad_norm": 0.8724473565087025, + "learning_rate": 1.9999904840251743e-05, + "loss": 0.9427, + "step": 530 + }, + { + "epoch": 0.011599432916612966, + "grad_norm": 0.8942612386723021, + "learning_rate": 1.9999872779662347e-05, + "loss": 0.9245, + "step": 540 + }, + { + "epoch": 0.011814237229883576, + "grad_norm": 0.9928300421934061, + "learning_rate": 1.9999836072652794e-05, + "loss": 0.932, + "step": 550 + }, + { + "epoch": 0.012029041543154187, + "grad_norm": 0.8947389195896635, + "learning_rate": 1.999979471924014e-05, + "loss": 0.945, + "step": 560 + }, + { + "epoch": 0.012243845856424797, + "grad_norm": 0.8728388436746601, + "learning_rate": 1.9999748719443594e-05, + "loss": 0.9508, + "step": 570 + }, + { + "epoch": 0.012458650169695407, + "grad_norm": 1.0931390263840937, + "learning_rate": 1.9999698073284534e-05, + "loss": 0.9334, + "step": 580 + }, + { + "epoch": 0.012673454482966019, + "grad_norm": 0.8867248563676206, + "learning_rate": 1.9999642780786486e-05, + "loss": 0.9352, + "step": 590 + }, + { + "epoch": 0.012888258796236628, + "grad_norm": 0.8378993692657936, + "learning_rate": 1.9999582841975152e-05, + "loss": 0.9211, + "step": 600 + }, + { + "epoch": 0.013103063109507238, + "grad_norm": 0.944765913779485, + "learning_rate": 1.9999518256878377e-05, + "loss": 0.9455, + "step": 610 + }, + { + "epoch": 0.01331786742277785, + "grad_norm": 1.110589470050176, + "learning_rate": 1.999944902552617e-05, + "loss": 0.9312, + "step": 620 + }, + { + "epoch": 0.01353267173604846, + "grad_norm": 0.93740820263563, + "learning_rate": 1.9999375147950698e-05, + "loss": 0.9476, + "step": 630 + }, + { + "epoch": 0.013747476049319071, + "grad_norm": 0.8693919640628258, + "learning_rate": 1.999929662418629e-05, + "loss": 0.9397, + "step": 640 + }, + { + "epoch": 0.013962280362589681, + "grad_norm": 0.821330072078906, + "learning_rate": 1.9999213454269433e-05, + "loss": 0.9387, + "step": 650 + }, + { + "epoch": 0.01417708467586029, + "grad_norm": 0.8706987017404653, + "learning_rate": 1.999912563823877e-05, + "loss": 0.9293, + "step": 660 + }, + { + "epoch": 0.014391888989130902, + "grad_norm": 0.8517170283338585, + "learning_rate": 1.9999033176135104e-05, + "loss": 0.9359, + "step": 670 + }, + { + "epoch": 
0.014606693302401512, + "grad_norm": 0.8579178853579097, + "learning_rate": 1.99989360680014e-05, + "loss": 0.9063, + "step": 680 + }, + { + "epoch": 0.014821497615672122, + "grad_norm": 0.8866957779531295, + "learning_rate": 1.999883431388278e-05, + "loss": 0.9354, + "step": 690 + }, + { + "epoch": 0.015036301928942734, + "grad_norm": 0.8870565536110462, + "learning_rate": 1.9998727913826516e-05, + "loss": 0.9366, + "step": 700 + }, + { + "epoch": 0.015251106242213344, + "grad_norm": 0.8987989639943288, + "learning_rate": 1.9998616867882052e-05, + "loss": 0.9369, + "step": 710 + }, + { + "epoch": 0.015465910555483953, + "grad_norm": 0.8110762813161089, + "learning_rate": 1.9998501176100987e-05, + "loss": 0.9267, + "step": 720 + }, + { + "epoch": 0.015680714868754563, + "grad_norm": 0.8393357992971544, + "learning_rate": 1.9998380838537075e-05, + "loss": 0.9275, + "step": 730 + }, + { + "epoch": 0.015895519182025176, + "grad_norm": 0.9524547482458253, + "learning_rate": 1.999825585524623e-05, + "loss": 0.9252, + "step": 740 + }, + { + "epoch": 0.016110323495295786, + "grad_norm": 0.8155389716366475, + "learning_rate": 1.9998126226286528e-05, + "loss": 0.9147, + "step": 750 + }, + { + "epoch": 0.016325127808566396, + "grad_norm": 0.8231072792576088, + "learning_rate": 1.9997991951718196e-05, + "loss": 0.9276, + "step": 760 + }, + { + "epoch": 0.016539932121837006, + "grad_norm": 0.8125297140263947, + "learning_rate": 1.9997853031603628e-05, + "loss": 0.9345, + "step": 770 + }, + { + "epoch": 0.016754736435107616, + "grad_norm": 0.9347609633412168, + "learning_rate": 1.9997709466007374e-05, + "loss": 0.9139, + "step": 780 + }, + { + "epoch": 0.01696954074837823, + "grad_norm": 0.8167131053353975, + "learning_rate": 1.9997561254996138e-05, + "loss": 0.9258, + "step": 790 + }, + { + "epoch": 0.01718434506164884, + "grad_norm": 0.8141997003853896, + "learning_rate": 1.9997408398638785e-05, + "loss": 0.9169, + "step": 800 + }, + { + "epoch": 0.01739914937491945, + "grad_norm": 0.8020228216788807, + "learning_rate": 1.9997250897006345e-05, + "loss": 0.9114, + "step": 810 + }, + { + "epoch": 0.01761395368819006, + "grad_norm": 0.7977580905585953, + "learning_rate": 1.9997088750171995e-05, + "loss": 0.9243, + "step": 820 + }, + { + "epoch": 0.01782875800146067, + "grad_norm": 0.8518531375273793, + "learning_rate": 1.999692195821108e-05, + "loss": 0.9191, + "step": 830 + }, + { + "epoch": 0.01804356231473128, + "grad_norm": 0.8006041023630638, + "learning_rate": 1.99967505212011e-05, + "loss": 0.9152, + "step": 840 + }, + { + "epoch": 0.01825836662800189, + "grad_norm": 0.793159791876333, + "learning_rate": 1.9996574439221708e-05, + "loss": 0.9235, + "step": 850 + }, + { + "epoch": 0.0184731709412725, + "grad_norm": 0.798911293614745, + "learning_rate": 1.9996393712354724e-05, + "loss": 0.907, + "step": 860 + }, + { + "epoch": 0.01868797525454311, + "grad_norm": 0.7876487421020302, + "learning_rate": 1.9996208340684124e-05, + "loss": 0.9073, + "step": 870 + }, + { + "epoch": 0.01890277956781372, + "grad_norm": 0.7656899260637665, + "learning_rate": 1.9996018324296036e-05, + "loss": 0.9191, + "step": 880 + }, + { + "epoch": 0.01911758388108433, + "grad_norm": 0.8200150022936012, + "learning_rate": 1.9995823663278753e-05, + "loss": 0.9144, + "step": 890 + }, + { + "epoch": 0.019332388194354944, + "grad_norm": 0.8730487914144835, + "learning_rate": 1.9995624357722726e-05, + "loss": 0.9239, + "step": 900 + }, + { + "epoch": 0.019547192507625554, + "grad_norm": 0.8281237932100032, + "learning_rate": 
1.9995420407720556e-05, + "loss": 0.9205, + "step": 910 + }, + { + "epoch": 0.019761996820896164, + "grad_norm": 0.8232205349274369, + "learning_rate": 1.9995211813367014e-05, + "loss": 0.919, + "step": 920 + }, + { + "epoch": 0.019976801134166774, + "grad_norm": 0.8147366319849692, + "learning_rate": 1.9994998574759023e-05, + "loss": 0.907, + "step": 930 + }, + { + "epoch": 0.020191605447437384, + "grad_norm": 0.7976853731174323, + "learning_rate": 1.9994780691995658e-05, + "loss": 0.9112, + "step": 940 + }, + { + "epoch": 0.020406409760707993, + "grad_norm": 0.8274499776650804, + "learning_rate": 1.9994558165178165e-05, + "loss": 0.9228, + "step": 950 + }, + { + "epoch": 0.020621214073978607, + "grad_norm": 0.824749615058281, + "learning_rate": 1.9994330994409932e-05, + "loss": 0.9331, + "step": 960 + }, + { + "epoch": 0.020836018387249217, + "grad_norm": 0.7904631449154701, + "learning_rate": 1.999409917979652e-05, + "loss": 0.91, + "step": 970 + }, + { + "epoch": 0.021050822700519826, + "grad_norm": 0.8284138971940564, + "learning_rate": 1.999386272144564e-05, + "loss": 0.9163, + "step": 980 + }, + { + "epoch": 0.021265627013790436, + "grad_norm": 0.8529599065323894, + "learning_rate": 1.9993621619467163e-05, + "loss": 0.9098, + "step": 990 + }, + { + "epoch": 0.021480431327061046, + "grad_norm": 0.8034510770514283, + "learning_rate": 1.999337587397311e-05, + "loss": 0.9221, + "step": 1000 + }, + { + "epoch": 0.02169523564033166, + "grad_norm": 0.7866973908662558, + "learning_rate": 1.9993125485077675e-05, + "loss": 0.9284, + "step": 1010 + }, + { + "epoch": 0.02191003995360227, + "grad_norm": 0.8772012374367091, + "learning_rate": 1.9992870452897195e-05, + "loss": 0.9145, + "step": 1020 + }, + { + "epoch": 0.02212484426687288, + "grad_norm": 0.7704037718521081, + "learning_rate": 1.9992610777550174e-05, + "loss": 0.9022, + "step": 1030 + }, + { + "epoch": 0.02233964858014349, + "grad_norm": 0.8139905784316902, + "learning_rate": 1.999234645915727e-05, + "loss": 0.899, + "step": 1040 + }, + { + "epoch": 0.0225544528934141, + "grad_norm": 0.8035022366722004, + "learning_rate": 1.999207749784129e-05, + "loss": 0.9119, + "step": 1050 + }, + { + "epoch": 0.02276925720668471, + "grad_norm": 0.8018634228953854, + "learning_rate": 1.9991803893727213e-05, + "loss": 0.913, + "step": 1060 + }, + { + "epoch": 0.022984061519955322, + "grad_norm": 0.8762806370784909, + "learning_rate": 1.9991525646942167e-05, + "loss": 0.8927, + "step": 1070 + }, + { + "epoch": 0.023198865833225932, + "grad_norm": 0.8434215687788776, + "learning_rate": 1.999124275761544e-05, + "loss": 0.9035, + "step": 1080 + }, + { + "epoch": 0.02341367014649654, + "grad_norm": 0.8323692540509068, + "learning_rate": 1.9990955225878475e-05, + "loss": 0.9184, + "step": 1090 + }, + { + "epoch": 0.02362847445976715, + "grad_norm": 0.8062273577144896, + "learning_rate": 1.9990663051864875e-05, + "loss": 0.9158, + "step": 1100 + }, + { + "epoch": 0.02384327877303776, + "grad_norm": 0.7741118350909906, + "learning_rate": 1.9990366235710392e-05, + "loss": 0.8994, + "step": 1110 + }, + { + "epoch": 0.024058083086308375, + "grad_norm": 0.8409438928193937, + "learning_rate": 1.9990064777552947e-05, + "loss": 0.9039, + "step": 1120 + }, + { + "epoch": 0.024272887399578984, + "grad_norm": 0.8040510324600977, + "learning_rate": 1.9989758677532613e-05, + "loss": 0.9153, + "step": 1130 + }, + { + "epoch": 0.024487691712849594, + "grad_norm": 0.7860147216474672, + "learning_rate": 1.9989447935791616e-05, + "loss": 0.9111, + "step": 1140 + }, + { + 
"epoch": 0.024702496026120204, + "grad_norm": 0.7955991962712436, + "learning_rate": 1.9989132552474335e-05, + "loss": 0.9148, + "step": 1150 + }, + { + "epoch": 0.024917300339390814, + "grad_norm": 1.305671067059314, + "learning_rate": 1.9988812527727324e-05, + "loss": 0.9093, + "step": 1160 + }, + { + "epoch": 0.025132104652661424, + "grad_norm": 0.7217672239649101, + "learning_rate": 1.9988487861699277e-05, + "loss": 0.9105, + "step": 1170 + }, + { + "epoch": 0.025346908965932037, + "grad_norm": 6.271674767342428, + "learning_rate": 1.9988158554541047e-05, + "loss": 0.9047, + "step": 1180 + }, + { + "epoch": 0.025561713279202647, + "grad_norm": 0.8349965377102029, + "learning_rate": 1.9987824606405647e-05, + "loss": 0.914, + "step": 1190 + }, + { + "epoch": 0.025776517592473257, + "grad_norm": 0.8221862837050625, + "learning_rate": 1.9987486017448245e-05, + "loss": 0.9111, + "step": 1200 + }, + { + "epoch": 0.025991321905743867, + "grad_norm": 0.7579519216571478, + "learning_rate": 1.998714278782617e-05, + "loss": 0.9148, + "step": 1210 + }, + { + "epoch": 0.026206126219014476, + "grad_norm": 1.220350327147385, + "learning_rate": 1.9986794917698894e-05, + "loss": 0.9209, + "step": 1220 + }, + { + "epoch": 0.02642093053228509, + "grad_norm": 0.765028183579955, + "learning_rate": 1.9986442407228064e-05, + "loss": 0.908, + "step": 1230 + }, + { + "epoch": 0.0266357348455557, + "grad_norm": 0.7816681258072063, + "learning_rate": 1.998608525657747e-05, + "loss": 0.9077, + "step": 1240 + }, + { + "epoch": 0.02685053915882631, + "grad_norm": 0.8249985245317925, + "learning_rate": 1.9985723465913058e-05, + "loss": 0.9021, + "step": 1250 + }, + { + "epoch": 0.02706534347209692, + "grad_norm": 0.7570103217659379, + "learning_rate": 1.9985357035402937e-05, + "loss": 0.8967, + "step": 1260 + }, + { + "epoch": 0.02728014778536753, + "grad_norm": 0.7550746594070511, + "learning_rate": 1.998498596521737e-05, + "loss": 0.9088, + "step": 1270 + }, + { + "epoch": 0.027494952098638142, + "grad_norm": 0.74593111439136, + "learning_rate": 1.998461025552876e-05, + "loss": 0.914, + "step": 1280 + }, + { + "epoch": 0.027709756411908752, + "grad_norm": 0.7538206481587928, + "learning_rate": 1.99842299065117e-05, + "loss": 0.892, + "step": 1290 + }, + { + "epoch": 0.027924560725179362, + "grad_norm": 0.7808275060158067, + "learning_rate": 1.9983844918342907e-05, + "loss": 0.9034, + "step": 1300 + }, + { + "epoch": 0.028139365038449972, + "grad_norm": 0.7539145866225501, + "learning_rate": 1.9983455291201267e-05, + "loss": 0.8867, + "step": 1310 + }, + { + "epoch": 0.02835416935172058, + "grad_norm": 0.8633072463687512, + "learning_rate": 1.9983061025267815e-05, + "loss": 0.9009, + "step": 1320 + }, + { + "epoch": 0.02856897366499119, + "grad_norm": 0.7733521101210006, + "learning_rate": 1.9982662120725753e-05, + "loss": 0.912, + "step": 1330 + }, + { + "epoch": 0.028783777978261805, + "grad_norm": 0.7673039038449831, + "learning_rate": 1.998225857776043e-05, + "loss": 0.8992, + "step": 1340 + }, + { + "epoch": 0.028998582291532415, + "grad_norm": 0.7714006517358715, + "learning_rate": 1.9981850396559348e-05, + "loss": 0.8889, + "step": 1350 + }, + { + "epoch": 0.029213386604803025, + "grad_norm": 0.8178218009457255, + "learning_rate": 1.9981437577312167e-05, + "loss": 0.9063, + "step": 1360 + }, + { + "epoch": 0.029428190918073634, + "grad_norm": 0.7794714859641063, + "learning_rate": 1.998102012021071e-05, + "loss": 0.9023, + "step": 1370 + }, + { + "epoch": 0.029642995231344244, + "grad_norm": 
0.8496853423209075, + "learning_rate": 1.998059802544894e-05, + "loss": 0.8999, + "step": 1380 + }, + { + "epoch": 0.029857799544614858, + "grad_norm": 0.7714878599883224, + "learning_rate": 1.9980171293222982e-05, + "loss": 0.8896, + "step": 1390 + }, + { + "epoch": 0.030072603857885467, + "grad_norm": 0.7795267501311487, + "learning_rate": 1.9979739923731125e-05, + "loss": 0.9123, + "step": 1400 + }, + { + "epoch": 0.030287408171156077, + "grad_norm": 0.7765636777336408, + "learning_rate": 1.9979303917173793e-05, + "loss": 0.8967, + "step": 1410 + }, + { + "epoch": 0.030502212484426687, + "grad_norm": 0.7854049103534458, + "learning_rate": 1.9978863273753583e-05, + "loss": 0.8921, + "step": 1420 + }, + { + "epoch": 0.030717016797697297, + "grad_norm": 0.7525748519317893, + "learning_rate": 1.9978417993675236e-05, + "loss": 0.9078, + "step": 1430 + }, + { + "epoch": 0.030931821110967907, + "grad_norm": 0.76321986852774, + "learning_rate": 1.997796807714565e-05, + "loss": 0.9121, + "step": 1440 + }, + { + "epoch": 0.03114662542423852, + "grad_norm": 0.7619696286583564, + "learning_rate": 1.9977513524373878e-05, + "loss": 0.8986, + "step": 1450 + }, + { + "epoch": 0.031361429737509126, + "grad_norm": 0.8025471522668453, + "learning_rate": 1.997705433557113e-05, + "loss": 0.8827, + "step": 1460 + }, + { + "epoch": 0.031576234050779736, + "grad_norm": 0.7763997567061249, + "learning_rate": 1.9976590510950766e-05, + "loss": 0.9019, + "step": 1470 + }, + { + "epoch": 0.03179103836405035, + "grad_norm": 0.7413217078674147, + "learning_rate": 1.99761220507283e-05, + "loss": 0.8864, + "step": 1480 + }, + { + "epoch": 0.03200584267732096, + "grad_norm": 0.7457795342330259, + "learning_rate": 1.99756489551214e-05, + "loss": 0.8927, + "step": 1490 + }, + { + "epoch": 0.03222064699059157, + "grad_norm": 0.7177239737436911, + "learning_rate": 1.997517122434989e-05, + "loss": 0.8664, + "step": 1500 + }, + { + "epoch": 0.03243545130386218, + "grad_norm": 0.729910325595943, + "learning_rate": 1.9974688858635748e-05, + "loss": 0.8872, + "step": 1510 + }, + { + "epoch": 0.03265025561713279, + "grad_norm": 0.734760164934118, + "learning_rate": 1.9974201858203098e-05, + "loss": 0.902, + "step": 1520 + }, + { + "epoch": 0.0328650599304034, + "grad_norm": 0.7602546032074973, + "learning_rate": 1.9973710223278236e-05, + "loss": 0.8948, + "step": 1530 + }, + { + "epoch": 0.03307986424367401, + "grad_norm": 0.7562657004832819, + "learning_rate": 1.9973213954089586e-05, + "loss": 0.8869, + "step": 1540 + }, + { + "epoch": 0.03329466855694462, + "grad_norm": 0.7774738334729859, + "learning_rate": 1.9972713050867745e-05, + "loss": 0.9045, + "step": 1550 + }, + { + "epoch": 0.03350947287021523, + "grad_norm": 0.7681125250896116, + "learning_rate": 1.997220751384546e-05, + "loss": 0.9013, + "step": 1560 + }, + { + "epoch": 0.03372427718348584, + "grad_norm": 0.7339330432465896, + "learning_rate": 1.997169734325762e-05, + "loss": 0.9007, + "step": 1570 + }, + { + "epoch": 0.03393908149675646, + "grad_norm": 0.7065402914181205, + "learning_rate": 1.9971182539341284e-05, + "loss": 0.8787, + "step": 1580 + }, + { + "epoch": 0.03415388581002707, + "grad_norm": 0.7801128948603702, + "learning_rate": 1.9970663102335645e-05, + "loss": 0.9026, + "step": 1590 + }, + { + "epoch": 0.03436869012329768, + "grad_norm": 0.788227685604599, + "learning_rate": 1.997013903248206e-05, + "loss": 0.8949, + "step": 1600 + }, + { + "epoch": 0.03458349443656829, + "grad_norm": 0.7478323289786412, + "learning_rate": 1.9969610330024043e-05, + 
"loss": 0.8807, + "step": 1610 + }, + { + "epoch": 0.0347982987498389, + "grad_norm": 0.7594500146074927, + "learning_rate": 1.996907699520725e-05, + "loss": 0.897, + "step": 1620 + }, + { + "epoch": 0.03501310306310951, + "grad_norm": 0.7557561178768684, + "learning_rate": 1.9968539028279493e-05, + "loss": 0.8817, + "step": 1630 + }, + { + "epoch": 0.03522790737638012, + "grad_norm": 0.7705545022363263, + "learning_rate": 1.9967996429490738e-05, + "loss": 0.8975, + "step": 1640 + }, + { + "epoch": 0.03544271168965073, + "grad_norm": 0.7586938534256935, + "learning_rate": 1.9967449199093104e-05, + "loss": 0.9046, + "step": 1650 + }, + { + "epoch": 0.03565751600292134, + "grad_norm": 0.7431761255095913, + "learning_rate": 1.996689733734086e-05, + "loss": 0.8873, + "step": 1660 + }, + { + "epoch": 0.03587232031619195, + "grad_norm": 0.7208377192401944, + "learning_rate": 1.9966340844490427e-05, + "loss": 0.8785, + "step": 1670 + }, + { + "epoch": 0.03608712462946256, + "grad_norm": 0.7506584753315972, + "learning_rate": 1.9965779720800383e-05, + "loss": 0.8684, + "step": 1680 + }, + { + "epoch": 0.03630192894273317, + "grad_norm": 0.7391526908534235, + "learning_rate": 1.9965213966531443e-05, + "loss": 0.8925, + "step": 1690 + }, + { + "epoch": 0.03651673325600378, + "grad_norm": 0.7088294334006218, + "learning_rate": 1.996464358194649e-05, + "loss": 0.8812, + "step": 1700 + }, + { + "epoch": 0.03673153756927439, + "grad_norm": 0.7199397102089583, + "learning_rate": 1.9964068567310552e-05, + "loss": 0.9021, + "step": 1710 + }, + { + "epoch": 0.036946341882545, + "grad_norm": 0.7386582371472061, + "learning_rate": 1.996348892289081e-05, + "loss": 0.8847, + "step": 1720 + }, + { + "epoch": 0.03716114619581561, + "grad_norm": 0.7508464385857595, + "learning_rate": 1.9962904648956585e-05, + "loss": 0.8753, + "step": 1730 + }, + { + "epoch": 0.03737595050908622, + "grad_norm": 0.7470256078534874, + "learning_rate": 1.9962315745779372e-05, + "loss": 0.8963, + "step": 1740 + }, + { + "epoch": 0.03759075482235683, + "grad_norm": 0.7418761066889206, + "learning_rate": 1.9961722213632794e-05, + "loss": 0.8963, + "step": 1750 + }, + { + "epoch": 0.03780555913562744, + "grad_norm": 0.7616668970329441, + "learning_rate": 1.9961124052792636e-05, + "loss": 0.8795, + "step": 1760 + }, + { + "epoch": 0.03802036344889805, + "grad_norm": 0.8233002415920407, + "learning_rate": 1.996052126353684e-05, + "loss": 0.8716, + "step": 1770 + }, + { + "epoch": 0.03823516776216866, + "grad_norm": 0.7383776636172839, + "learning_rate": 1.995991384614548e-05, + "loss": 0.8813, + "step": 1780 + }, + { + "epoch": 0.03844997207543927, + "grad_norm": 0.7101146565285336, + "learning_rate": 1.9959301800900795e-05, + "loss": 0.8744, + "step": 1790 + }, + { + "epoch": 0.03866477638870989, + "grad_norm": 0.7797901881755945, + "learning_rate": 1.9958685128087175e-05, + "loss": 0.8739, + "step": 1800 + }, + { + "epoch": 0.0388795807019805, + "grad_norm": 0.7607914365027031, + "learning_rate": 1.995806382799115e-05, + "loss": 0.8769, + "step": 1810 + }, + { + "epoch": 0.03909438501525111, + "grad_norm": 0.7186167152089645, + "learning_rate": 1.9957437900901408e-05, + "loss": 0.8728, + "step": 1820 + }, + { + "epoch": 0.03930918932852172, + "grad_norm": 0.7500460414609829, + "learning_rate": 1.9956807347108787e-05, + "loss": 0.886, + "step": 1830 + }, + { + "epoch": 0.03952399364179233, + "grad_norm": 0.8448420531394487, + "learning_rate": 1.9956172166906267e-05, + "loss": 0.876, + "step": 1840 + }, + { + "epoch": 0.03973879795506294, 
+ "grad_norm": 0.7616239143421761, + "learning_rate": 1.9955532360588986e-05, + "loss": 0.8731, + "step": 1850 + }, + { + "epoch": 0.03995360226833355, + "grad_norm": 0.7516220625045015, + "learning_rate": 1.9954887928454232e-05, + "loss": 0.8895, + "step": 1860 + }, + { + "epoch": 0.04016840658160416, + "grad_norm": 0.6940486511611167, + "learning_rate": 1.9954238870801434e-05, + "loss": 0.8816, + "step": 1870 + }, + { + "epoch": 0.04038321089487477, + "grad_norm": 0.7207106268872732, + "learning_rate": 1.995358518793218e-05, + "loss": 0.8786, + "step": 1880 + }, + { + "epoch": 0.04059801520814538, + "grad_norm": 0.7003859763965479, + "learning_rate": 1.99529268801502e-05, + "loss": 0.8909, + "step": 1890 + }, + { + "epoch": 0.04081281952141599, + "grad_norm": 0.7143855349325927, + "learning_rate": 1.995226394776137e-05, + "loss": 0.876, + "step": 1900 + }, + { + "epoch": 0.041027623834686604, + "grad_norm": 1.027903124400268, + "learning_rate": 1.9951596391073732e-05, + "loss": 0.8845, + "step": 1910 + }, + { + "epoch": 0.041242428147957214, + "grad_norm": 0.7561863103739813, + "learning_rate": 1.9950924210397453e-05, + "loss": 0.8805, + "step": 1920 + }, + { + "epoch": 0.04145723246122782, + "grad_norm": 0.7070451516725674, + "learning_rate": 1.9950247406044867e-05, + "loss": 0.8866, + "step": 1930 + }, + { + "epoch": 0.04167203677449843, + "grad_norm": 0.7414269087308623, + "learning_rate": 1.9949565978330447e-05, + "loss": 0.8967, + "step": 1940 + }, + { + "epoch": 0.04188684108776904, + "grad_norm": 0.7282024119626752, + "learning_rate": 1.9948879927570823e-05, + "loss": 0.883, + "step": 1950 + }, + { + "epoch": 0.04210164540103965, + "grad_norm": 0.7460528091826555, + "learning_rate": 1.9948189254084758e-05, + "loss": 0.8701, + "step": 1960 + }, + { + "epoch": 0.04231644971431026, + "grad_norm": 0.6963758847916436, + "learning_rate": 1.9947493958193176e-05, + "loss": 0.8748, + "step": 1970 + }, + { + "epoch": 0.04253125402758087, + "grad_norm": 0.7069516648127419, + "learning_rate": 1.9946794040219147e-05, + "loss": 0.8832, + "step": 1980 + }, + { + "epoch": 0.04274605834085148, + "grad_norm": 0.746440584689567, + "learning_rate": 1.9946089500487888e-05, + "loss": 0.8733, + "step": 1990 + }, + { + "epoch": 0.04296086265412209, + "grad_norm": 0.7146001602504596, + "learning_rate": 1.994538033932675e-05, + "loss": 0.88, + "step": 2000 + }, + { + "epoch": 0.0431756669673927, + "grad_norm": 0.8314467011125141, + "learning_rate": 1.9944666557065257e-05, + "loss": 0.8876, + "step": 2010 + }, + { + "epoch": 0.04339047128066332, + "grad_norm": 0.8015245377512796, + "learning_rate": 1.994394815403506e-05, + "loss": 0.9035, + "step": 2020 + }, + { + "epoch": 0.04360527559393393, + "grad_norm": 0.7222397980014357, + "learning_rate": 1.9943225130569967e-05, + "loss": 0.8846, + "step": 2030 + }, + { + "epoch": 0.04382007990720454, + "grad_norm": 0.9345503125138953, + "learning_rate": 1.9942497487005922e-05, + "loss": 0.875, + "step": 2040 + }, + { + "epoch": 0.04403488422047515, + "grad_norm": 0.777701186759307, + "learning_rate": 1.9941765223681032e-05, + "loss": 0.8886, + "step": 2050 + }, + { + "epoch": 0.04424968853374576, + "grad_norm": 0.7503022644965333, + "learning_rate": 1.9941028340935536e-05, + "loss": 0.8579, + "step": 2060 + }, + { + "epoch": 0.04446449284701637, + "grad_norm": 0.719946551023971, + "learning_rate": 1.9940286839111832e-05, + "loss": 0.8802, + "step": 2070 + }, + { + "epoch": 0.04467929716028698, + "grad_norm": 0.749093812327845, + "learning_rate": 
1.9939540718554445e-05, + "loss": 0.8834, + "step": 2080 + }, + { + "epoch": 0.04489410147355759, + "grad_norm": 0.7422208424583762, + "learning_rate": 1.993878997961007e-05, + "loss": 0.8853, + "step": 2090 + }, + { + "epoch": 0.0451089057868282, + "grad_norm": 0.7374573183269133, + "learning_rate": 1.993803462262753e-05, + "loss": 0.891, + "step": 2100 + }, + { + "epoch": 0.04532371010009881, + "grad_norm": 0.7350782974678935, + "learning_rate": 1.99372746479578e-05, + "loss": 0.9003, + "step": 2110 + }, + { + "epoch": 0.04553851441336942, + "grad_norm": 0.7281319667850538, + "learning_rate": 1.9936510055954002e-05, + "loss": 0.8683, + "step": 2120 + }, + { + "epoch": 0.045753318726640034, + "grad_norm": 0.7965856691084933, + "learning_rate": 1.9935740846971404e-05, + "loss": 0.8818, + "step": 2130 + }, + { + "epoch": 0.045968123039910644, + "grad_norm": 0.7049379609150542, + "learning_rate": 1.9934967021367417e-05, + "loss": 0.8873, + "step": 2140 + }, + { + "epoch": 0.046182927353181254, + "grad_norm": 0.6866133102632582, + "learning_rate": 1.9934188579501596e-05, + "loss": 0.8799, + "step": 2150 + }, + { + "epoch": 0.046397731666451864, + "grad_norm": 0.7086840811762911, + "learning_rate": 1.993340552173564e-05, + "loss": 0.8724, + "step": 2160 + }, + { + "epoch": 0.04661253597972247, + "grad_norm": 0.7205449804498884, + "learning_rate": 1.99326178484334e-05, + "loss": 0.8906, + "step": 2170 + }, + { + "epoch": 0.04682734029299308, + "grad_norm": 0.7271300532378566, + "learning_rate": 1.9931825559960867e-05, + "loss": 0.8739, + "step": 2180 + }, + { + "epoch": 0.04704214460626369, + "grad_norm": 0.782395233795208, + "learning_rate": 1.993102865668617e-05, + "loss": 0.8766, + "step": 2190 + }, + { + "epoch": 0.0472569489195343, + "grad_norm": 0.7344948505203559, + "learning_rate": 1.9930227138979595e-05, + "loss": 0.8825, + "step": 2200 + }, + { + "epoch": 0.04747175323280491, + "grad_norm": 0.6708701924983489, + "learning_rate": 1.992942100721356e-05, + "loss": 0.8659, + "step": 2210 + }, + { + "epoch": 0.04768655754607552, + "grad_norm": 0.6843127757880155, + "learning_rate": 1.992861026176264e-05, + "loss": 0.8689, + "step": 2220 + }, + { + "epoch": 0.04790136185934613, + "grad_norm": 0.7601120607749401, + "learning_rate": 1.992779490300354e-05, + "loss": 0.8809, + "step": 2230 + }, + { + "epoch": 0.04811616617261675, + "grad_norm": 0.7771578485534071, + "learning_rate": 1.9926974931315114e-05, + "loss": 0.874, + "step": 2240 + }, + { + "epoch": 0.04833097048588736, + "grad_norm": 0.6677106942558194, + "learning_rate": 1.9926150347078363e-05, + "loss": 0.8715, + "step": 2250 + }, + { + "epoch": 0.04854577479915797, + "grad_norm": 0.6825562101263428, + "learning_rate": 1.9925321150676426e-05, + "loss": 0.8747, + "step": 2260 + }, + { + "epoch": 0.04876057911242858, + "grad_norm": 0.7870531733277122, + "learning_rate": 1.992448734249459e-05, + "loss": 0.876, + "step": 2270 + }, + { + "epoch": 0.04897538342569919, + "grad_norm": 0.7487545797908844, + "learning_rate": 1.9923648922920284e-05, + "loss": 0.8842, + "step": 2280 + }, + { + "epoch": 0.0491901877389698, + "grad_norm": 0.7615635965125641, + "learning_rate": 1.9922805892343073e-05, + "loss": 0.8758, + "step": 2290 + }, + { + "epoch": 0.04940499205224041, + "grad_norm": 0.7376131287808475, + "learning_rate": 1.9921958251154673e-05, + "loss": 0.8767, + "step": 2300 + }, + { + "epoch": 0.04961979636551102, + "grad_norm": 0.7011021964691581, + "learning_rate": 1.9921105999748937e-05, + "loss": 0.8785, + "step": 2310 + }, + { + 
"epoch": 0.04983460067878163, + "grad_norm": 0.6991405501900602, + "learning_rate": 1.9920249138521864e-05, + "loss": 0.8659, + "step": 2320 + }, + { + "epoch": 0.05004940499205224, + "grad_norm": 0.8709849609023845, + "learning_rate": 1.9919387667871586e-05, + "loss": 0.8831, + "step": 2330 + }, + { + "epoch": 0.05026420930532285, + "grad_norm": 0.7150713210870077, + "learning_rate": 1.9918521588198395e-05, + "loss": 0.8758, + "step": 2340 + }, + { + "epoch": 0.050479013618593464, + "grad_norm": 0.7286077482741432, + "learning_rate": 1.9917650899904704e-05, + "loss": 0.8807, + "step": 2350 + }, + { + "epoch": 0.050693817931864074, + "grad_norm": 0.7213279597207197, + "learning_rate": 1.9916775603395078e-05, + "loss": 0.8884, + "step": 2360 + }, + { + "epoch": 0.050908622245134684, + "grad_norm": 0.7135061250388653, + "learning_rate": 1.9915895699076224e-05, + "loss": 0.8759, + "step": 2370 + }, + { + "epoch": 0.051123426558405294, + "grad_norm": 0.7313831738364597, + "learning_rate": 1.9915011187356988e-05, + "loss": 0.857, + "step": 2380 + }, + { + "epoch": 0.051338230871675904, + "grad_norm": 0.6876477115175337, + "learning_rate": 1.991412206864835e-05, + "loss": 0.8572, + "step": 2390 + }, + { + "epoch": 0.051553035184946513, + "grad_norm": 0.6945309755588273, + "learning_rate": 1.9913228343363448e-05, + "loss": 0.8652, + "step": 2400 + }, + { + "epoch": 0.05176783949821712, + "grad_norm": 0.704316576659832, + "learning_rate": 1.991233001191754e-05, + "loss": 0.8601, + "step": 2410 + }, + { + "epoch": 0.05198264381148773, + "grad_norm": 0.6985006179988886, + "learning_rate": 1.9911427074728043e-05, + "loss": 0.8859, + "step": 2420 + }, + { + "epoch": 0.05219744812475834, + "grad_norm": 0.7524066390625885, + "learning_rate": 1.9910519532214498e-05, + "loss": 0.8892, + "step": 2430 + }, + { + "epoch": 0.05241225243802895, + "grad_norm": 0.6901836096027437, + "learning_rate": 1.990960738479859e-05, + "loss": 0.8846, + "step": 2440 + }, + { + "epoch": 0.05262705675129956, + "grad_norm": 0.6849358351981636, + "learning_rate": 1.9908690632904157e-05, + "loss": 0.8634, + "step": 2450 + }, + { + "epoch": 0.05284186106457018, + "grad_norm": 0.765461828081668, + "learning_rate": 1.9907769276957156e-05, + "loss": 0.8746, + "step": 2460 + }, + { + "epoch": 0.05305666537784079, + "grad_norm": 0.7679215224483907, + "learning_rate": 1.9906843317385696e-05, + "loss": 0.8717, + "step": 2470 + }, + { + "epoch": 0.0532714696911114, + "grad_norm": 0.7102215793244879, + "learning_rate": 1.9905912754620028e-05, + "loss": 0.8768, + "step": 2480 + }, + { + "epoch": 0.05348627400438201, + "grad_norm": 0.7328716303001253, + "learning_rate": 1.9904977589092526e-05, + "loss": 0.8762, + "step": 2490 + }, + { + "epoch": 0.05370107831765262, + "grad_norm": 0.6873393080849327, + "learning_rate": 1.990403782123772e-05, + "loss": 0.8666, + "step": 2500 + }, + { + "epoch": 0.05391588263092323, + "grad_norm": 0.7147008262903141, + "learning_rate": 1.990309345149227e-05, + "loss": 0.8702, + "step": 2510 + }, + { + "epoch": 0.05413068694419384, + "grad_norm": 0.7258515311880707, + "learning_rate": 1.990214448029497e-05, + "loss": 0.861, + "step": 2520 + }, + { + "epoch": 0.05434549125746445, + "grad_norm": 0.6864484714517523, + "learning_rate": 1.9901190908086768e-05, + "loss": 0.8607, + "step": 2530 + }, + { + "epoch": 0.05456029557073506, + "grad_norm": 0.7219498116874772, + "learning_rate": 1.9900232735310732e-05, + "loss": 0.8636, + "step": 2540 + }, + { + "epoch": 0.05477509988400567, + "grad_norm": 
0.7132672252604229, + "learning_rate": 1.9899269962412075e-05, + "loss": 0.8785, + "step": 2550 + }, + { + "epoch": 0.054989904197276285, + "grad_norm": 0.6983739180648069, + "learning_rate": 1.989830258983815e-05, + "loss": 0.8688, + "step": 2560 + }, + { + "epoch": 0.055204708510546895, + "grad_norm": 0.7061201979674048, + "learning_rate": 1.9897330618038443e-05, + "loss": 0.8771, + "step": 2570 + }, + { + "epoch": 0.055419512823817504, + "grad_norm": 0.7219202198100194, + "learning_rate": 1.9896354047464578e-05, + "loss": 0.8875, + "step": 2580 + }, + { + "epoch": 0.055634317137088114, + "grad_norm": 0.8868550476280311, + "learning_rate": 1.9895372878570314e-05, + "loss": 0.8623, + "step": 2590 + }, + { + "epoch": 0.055849121450358724, + "grad_norm": 0.7475858890792529, + "learning_rate": 1.989438711181156e-05, + "loss": 0.852, + "step": 2600 + }, + { + "epoch": 0.056063925763629334, + "grad_norm": 0.6861827188596874, + "learning_rate": 1.9893396747646337e-05, + "loss": 0.8919, + "step": 2610 + }, + { + "epoch": 0.056278730076899944, + "grad_norm": 0.6806996838562623, + "learning_rate": 1.9892401786534827e-05, + "loss": 0.8581, + "step": 2620 + }, + { + "epoch": 0.056493534390170554, + "grad_norm": 0.6852549300278078, + "learning_rate": 1.989140222893933e-05, + "loss": 0.8697, + "step": 2630 + }, + { + "epoch": 0.05670833870344116, + "grad_norm": 0.6657649574580968, + "learning_rate": 1.989039807532429e-05, + "loss": 0.8807, + "step": 2640 + }, + { + "epoch": 0.05692314301671177, + "grad_norm": 0.6905459875216616, + "learning_rate": 1.988938932615628e-05, + "loss": 0.8673, + "step": 2650 + }, + { + "epoch": 0.05713794732998238, + "grad_norm": 0.7020389616663805, + "learning_rate": 1.9888375981904024e-05, + "loss": 0.8641, + "step": 2660 + }, + { + "epoch": 0.057352751643253, + "grad_norm": 0.6916187469122672, + "learning_rate": 1.988735804303836e-05, + "loss": 0.8621, + "step": 2670 + }, + { + "epoch": 0.05756755595652361, + "grad_norm": 0.7314594201824384, + "learning_rate": 1.988633551003228e-05, + "loss": 0.879, + "step": 2680 + }, + { + "epoch": 0.05778236026979422, + "grad_norm": 0.746171027813674, + "learning_rate": 1.9885308383360894e-05, + "loss": 0.8707, + "step": 2690 + }, + { + "epoch": 0.05799716458306483, + "grad_norm": 0.6634180978048528, + "learning_rate": 1.988427666350146e-05, + "loss": 0.8598, + "step": 2700 + }, + { + "epoch": 0.05821196889633544, + "grad_norm": 0.6453064675883513, + "learning_rate": 1.988324035093336e-05, + "loss": 0.8683, + "step": 2710 + }, + { + "epoch": 0.05842677320960605, + "grad_norm": 0.6912271165294316, + "learning_rate": 1.9882199446138116e-05, + "loss": 0.8746, + "step": 2720 + }, + { + "epoch": 0.05864157752287666, + "grad_norm": 0.7643854365177722, + "learning_rate": 1.9881153949599384e-05, + "loss": 0.8744, + "step": 2730 + }, + { + "epoch": 0.05885638183614727, + "grad_norm": 0.6767199632505446, + "learning_rate": 1.988010386180295e-05, + "loss": 0.8516, + "step": 2740 + }, + { + "epoch": 0.05907118614941788, + "grad_norm": 0.7108698181797346, + "learning_rate": 1.9879049183236735e-05, + "loss": 0.8695, + "step": 2750 + }, + { + "epoch": 0.05928599046268849, + "grad_norm": 0.7253238005438297, + "learning_rate": 1.9877989914390794e-05, + "loss": 0.8555, + "step": 2760 + }, + { + "epoch": 0.0595007947759591, + "grad_norm": 0.6453538421152363, + "learning_rate": 1.9876926055757316e-05, + "loss": 0.8633, + "step": 2770 + }, + { + "epoch": 0.059715599089229715, + "grad_norm": 0.7205543033570071, + "learning_rate": 1.9875857607830616e-05, + 
"loss": 0.8684, + "step": 2780 + }, + { + "epoch": 0.059930403402500325, + "grad_norm": 0.756612819272997, + "learning_rate": 1.987478457110715e-05, + "loss": 0.8544, + "step": 2790 + }, + { + "epoch": 0.060145207715770935, + "grad_norm": 0.7043547954742904, + "learning_rate": 1.9873706946085504e-05, + "loss": 0.8622, + "step": 2800 + }, + { + "epoch": 0.060360012029041545, + "grad_norm": 0.7593440635743444, + "learning_rate": 1.9872624733266386e-05, + "loss": 0.8834, + "step": 2810 + }, + { + "epoch": 0.060574816342312154, + "grad_norm": 0.6863494468953989, + "learning_rate": 1.9871537933152653e-05, + "loss": 0.8633, + "step": 2820 + }, + { + "epoch": 0.060789620655582764, + "grad_norm": 0.7138487939483891, + "learning_rate": 1.987044654624928e-05, + "loss": 0.8594, + "step": 2830 + }, + { + "epoch": 0.061004424968853374, + "grad_norm": 0.6981415113485295, + "learning_rate": 1.9869350573063376e-05, + "loss": 0.8677, + "step": 2840 + }, + { + "epoch": 0.061219229282123984, + "grad_norm": 0.7075330565366764, + "learning_rate": 1.9868250014104187e-05, + "loss": 0.8682, + "step": 2850 + }, + { + "epoch": 0.061434033595394594, + "grad_norm": 0.714624614534178, + "learning_rate": 1.9867144869883083e-05, + "loss": 0.8628, + "step": 2860 + }, + { + "epoch": 0.061648837908665204, + "grad_norm": 0.7137186386058576, + "learning_rate": 1.9866035140913568e-05, + "loss": 0.8521, + "step": 2870 + }, + { + "epoch": 0.06186364222193581, + "grad_norm": 0.6880470582910433, + "learning_rate": 1.9864920827711273e-05, + "loss": 0.8615, + "step": 2880 + }, + { + "epoch": 0.06207844653520643, + "grad_norm": 0.6631751007947058, + "learning_rate": 1.9863801930793966e-05, + "loss": 0.8684, + "step": 2890 + }, + { + "epoch": 0.06229325084847704, + "grad_norm": 0.680905465307777, + "learning_rate": 1.9862678450681537e-05, + "loss": 0.8573, + "step": 2900 + }, + { + "epoch": 0.06250805516174765, + "grad_norm": 0.722687320949144, + "learning_rate": 1.9861550387896007e-05, + "loss": 0.8573, + "step": 2910 + }, + { + "epoch": 0.06272285947501825, + "grad_norm": 0.707058027911016, + "learning_rate": 1.986041774296153e-05, + "loss": 0.8623, + "step": 2920 + }, + { + "epoch": 0.06293766378828887, + "grad_norm": 0.6766536125382012, + "learning_rate": 1.9859280516404387e-05, + "loss": 0.862, + "step": 2930 + }, + { + "epoch": 0.06315246810155947, + "grad_norm": 0.7142428076724684, + "learning_rate": 1.985813870875299e-05, + "loss": 0.8769, + "step": 2940 + }, + { + "epoch": 0.06336727241483009, + "grad_norm": 0.6712747602182263, + "learning_rate": 1.9856992320537872e-05, + "loss": 0.8651, + "step": 2950 + }, + { + "epoch": 0.0635820767281007, + "grad_norm": 0.7333177718791057, + "learning_rate": 1.9855841352291705e-05, + "loss": 0.8496, + "step": 2960 + }, + { + "epoch": 0.06379688104137131, + "grad_norm": 0.6738403800576495, + "learning_rate": 1.9854685804549282e-05, + "loss": 0.8679, + "step": 2970 + }, + { + "epoch": 0.06401168535464193, + "grad_norm": 0.6926446949090426, + "learning_rate": 1.985352567784753e-05, + "loss": 0.8737, + "step": 2980 + }, + { + "epoch": 0.06422648966791253, + "grad_norm": 0.6819388635508017, + "learning_rate": 1.985236097272549e-05, + "loss": 0.8433, + "step": 2990 + }, + { + "epoch": 0.06444129398118315, + "grad_norm": 0.6578041236132836, + "learning_rate": 1.985119168972435e-05, + "loss": 0.8419, + "step": 3000 + }, + { + "epoch": 0.06465609829445375, + "grad_norm": 0.6561607204214681, + "learning_rate": 1.9850017829387406e-05, + "loss": 0.8539, + "step": 3010 + }, + { + "epoch": 
0.06487090260772437, + "grad_norm": 0.693338421429239, + "learning_rate": 1.98488393922601e-05, + "loss": 0.8482, + "step": 3020 + }, + { + "epoch": 0.06508570692099497, + "grad_norm": 0.7241505791029577, + "learning_rate": 1.984765637888998e-05, + "loss": 0.8506, + "step": 3030 + }, + { + "epoch": 0.06530051123426558, + "grad_norm": 0.6726045381693205, + "learning_rate": 1.9846468789826737e-05, + "loss": 0.8605, + "step": 3040 + }, + { + "epoch": 0.0655153155475362, + "grad_norm": 0.7093889099822586, + "learning_rate": 1.984527662562218e-05, + "loss": 0.8651, + "step": 3050 + }, + { + "epoch": 0.0657301198608068, + "grad_norm": 0.6510030697876009, + "learning_rate": 1.9844079886830246e-05, + "loss": 0.8641, + "step": 3060 + }, + { + "epoch": 0.06594492417407742, + "grad_norm": 0.6560638118957208, + "learning_rate": 1.9842878574007e-05, + "loss": 0.874, + "step": 3070 + }, + { + "epoch": 0.06615972848734802, + "grad_norm": 0.7134647721402186, + "learning_rate": 1.9841672687710624e-05, + "loss": 0.8528, + "step": 3080 + }, + { + "epoch": 0.06637453280061864, + "grad_norm": 0.6922835771738086, + "learning_rate": 1.9840462228501432e-05, + "loss": 0.8609, + "step": 3090 + }, + { + "epoch": 0.06658933711388924, + "grad_norm": 0.6573245546694103, + "learning_rate": 1.9839247196941862e-05, + "loss": 0.8644, + "step": 3100 + }, + { + "epoch": 0.06680414142715986, + "grad_norm": 0.6864765270164866, + "learning_rate": 1.9838027593596477e-05, + "loss": 0.8633, + "step": 3110 + }, + { + "epoch": 0.06701894574043046, + "grad_norm": 0.6508340450389928, + "learning_rate": 1.9836803419031964e-05, + "loss": 0.8486, + "step": 3120 + }, + { + "epoch": 0.06723375005370108, + "grad_norm": 0.6712340814966278, + "learning_rate": 1.983557467381713e-05, + "loss": 0.8576, + "step": 3130 + }, + { + "epoch": 0.06744855436697168, + "grad_norm": 0.6946743953968081, + "learning_rate": 1.9834341358522914e-05, + "loss": 0.8615, + "step": 3140 + }, + { + "epoch": 0.0676633586802423, + "grad_norm": 0.6631456581489633, + "learning_rate": 1.9833103473722366e-05, + "loss": 0.8616, + "step": 3150 + }, + { + "epoch": 0.06787816299351292, + "grad_norm": 0.7015972542735149, + "learning_rate": 1.983186101999067e-05, + "loss": 0.8616, + "step": 3160 + }, + { + "epoch": 0.06809296730678352, + "grad_norm": 0.6897254667864308, + "learning_rate": 1.9830613997905136e-05, + "loss": 0.865, + "step": 3170 + }, + { + "epoch": 0.06830777162005414, + "grad_norm": 0.6577402763346296, + "learning_rate": 1.982936240804518e-05, + "loss": 0.8544, + "step": 3180 + }, + { + "epoch": 0.06852257593332474, + "grad_norm": 0.6890291309672536, + "learning_rate": 1.9828106250992353e-05, + "loss": 0.8645, + "step": 3190 + }, + { + "epoch": 0.06873738024659536, + "grad_norm": 0.703215862838126, + "learning_rate": 1.982684552733033e-05, + "loss": 0.8728, + "step": 3200 + }, + { + "epoch": 0.06895218455986596, + "grad_norm": 0.6827501388608269, + "learning_rate": 1.9825580237644903e-05, + "loss": 0.8602, + "step": 3210 + }, + { + "epoch": 0.06916698887313658, + "grad_norm": 0.697650000738595, + "learning_rate": 1.9824310382523982e-05, + "loss": 0.8726, + "step": 3220 + }, + { + "epoch": 0.06938179318640718, + "grad_norm": 0.6721960819494287, + "learning_rate": 1.9823035962557608e-05, + "loss": 0.8605, + "step": 3230 + }, + { + "epoch": 0.0695965974996778, + "grad_norm": 0.7154662455928354, + "learning_rate": 1.9821756978337935e-05, + "loss": 0.8643, + "step": 3240 + }, + { + "epoch": 0.0698114018129484, + "grad_norm": 0.681240118026375, + "learning_rate": 
1.9820473430459237e-05, + "loss": 0.8621, + "step": 3250 + }, + { + "epoch": 0.07002620612621901, + "grad_norm": 0.6580326580613558, + "learning_rate": 1.9819185319517915e-05, + "loss": 0.8466, + "step": 3260 + }, + { + "epoch": 0.07024101043948963, + "grad_norm": 0.6710767809283256, + "learning_rate": 1.981789264611249e-05, + "loss": 0.8659, + "step": 3270 + }, + { + "epoch": 0.07045581475276023, + "grad_norm": 0.6593001195695262, + "learning_rate": 1.9816595410843596e-05, + "loss": 0.8663, + "step": 3280 + }, + { + "epoch": 0.07067061906603085, + "grad_norm": 0.6628941061395422, + "learning_rate": 1.981529361431399e-05, + "loss": 0.8766, + "step": 3290 + }, + { + "epoch": 0.07088542337930145, + "grad_norm": 0.6706542522918096, + "learning_rate": 1.9813987257128552e-05, + "loss": 0.8515, + "step": 3300 + }, + { + "epoch": 0.07110022769257207, + "grad_norm": 0.7081735675645245, + "learning_rate": 1.981267633989428e-05, + "loss": 0.8536, + "step": 3310 + }, + { + "epoch": 0.07131503200584267, + "grad_norm": 0.6631068395510924, + "learning_rate": 1.981136086322028e-05, + "loss": 0.8595, + "step": 3320 + }, + { + "epoch": 0.07152983631911329, + "grad_norm": 0.6815244729783259, + "learning_rate": 1.98100408277178e-05, + "loss": 0.8452, + "step": 3330 + }, + { + "epoch": 0.0717446406323839, + "grad_norm": 0.6708767643583214, + "learning_rate": 1.9808716234000176e-05, + "loss": 0.8646, + "step": 3340 + }, + { + "epoch": 0.07195944494565451, + "grad_norm": 0.7070823636265635, + "learning_rate": 1.9807387082682888e-05, + "loss": 0.8596, + "step": 3350 + }, + { + "epoch": 0.07217424925892511, + "grad_norm": 0.6385205053779432, + "learning_rate": 1.980605337438352e-05, + "loss": 0.8596, + "step": 3360 + }, + { + "epoch": 0.07238905357219573, + "grad_norm": 0.6900045332944192, + "learning_rate": 1.9804715109721773e-05, + "loss": 0.8455, + "step": 3370 + }, + { + "epoch": 0.07260385788546635, + "grad_norm": 0.6952726462768112, + "learning_rate": 1.980337228931948e-05, + "loss": 0.854, + "step": 3380 + }, + { + "epoch": 0.07281866219873695, + "grad_norm": 0.6316666106551074, + "learning_rate": 1.9802024913800567e-05, + "loss": 0.8607, + "step": 3390 + }, + { + "epoch": 0.07303346651200757, + "grad_norm": 0.6782909648023765, + "learning_rate": 1.9800672983791097e-05, + "loss": 0.8578, + "step": 3400 + }, + { + "epoch": 0.07324827082527817, + "grad_norm": 0.703136553111246, + "learning_rate": 1.979931649991924e-05, + "loss": 0.8537, + "step": 3410 + }, + { + "epoch": 0.07346307513854879, + "grad_norm": 0.6766152461075413, + "learning_rate": 1.9797955462815285e-05, + "loss": 0.8551, + "step": 3420 + }, + { + "epoch": 0.07367787945181939, + "grad_norm": 0.6548199817442408, + "learning_rate": 1.9796589873111627e-05, + "loss": 0.848, + "step": 3430 + }, + { + "epoch": 0.07389268376509, + "grad_norm": 0.6699380907258793, + "learning_rate": 1.9795219731442798e-05, + "loss": 0.8518, + "step": 3440 + }, + { + "epoch": 0.07410748807836061, + "grad_norm": 0.646613126593096, + "learning_rate": 1.979384503844542e-05, + "loss": 0.8503, + "step": 3450 + }, + { + "epoch": 0.07432229239163123, + "grad_norm": 0.6657451032225497, + "learning_rate": 1.9792465794758246e-05, + "loss": 0.865, + "step": 3460 + }, + { + "epoch": 0.07453709670490183, + "grad_norm": 0.6566855884425971, + "learning_rate": 1.9791082001022137e-05, + "loss": 0.843, + "step": 3470 + }, + { + "epoch": 0.07475190101817245, + "grad_norm": 0.68529511952354, + "learning_rate": 1.978969365788007e-05, + "loss": 0.8569, + "step": 3480 + }, + { + "epoch": 
0.07496670533144306, + "grad_norm": 0.694727034477339, + "learning_rate": 1.978830076597714e-05, + "loss": 0.8472, + "step": 3490 + }, + { + "epoch": 0.07518150964471366, + "grad_norm": 0.6631339825070333, + "learning_rate": 1.9786903325960545e-05, + "loss": 0.855, + "step": 3500 + }, + { + "epoch": 0.07539631395798428, + "grad_norm": 0.694444241956036, + "learning_rate": 1.9785501338479605e-05, + "loss": 0.8524, + "step": 3510 + }, + { + "epoch": 0.07561111827125488, + "grad_norm": 0.6945927280010864, + "learning_rate": 1.9784094804185755e-05, + "loss": 0.85, + "step": 3520 + }, + { + "epoch": 0.0758259225845255, + "grad_norm": 0.6672445340085226, + "learning_rate": 1.978268372373253e-05, + "loss": 0.8498, + "step": 3530 + }, + { + "epoch": 0.0760407268977961, + "grad_norm": 0.646988287501803, + "learning_rate": 1.978126809777559e-05, + "loss": 0.848, + "step": 3540 + }, + { + "epoch": 0.07625553121106672, + "grad_norm": 0.6911424046720633, + "learning_rate": 1.9779847926972703e-05, + "loss": 0.8652, + "step": 3550 + }, + { + "epoch": 0.07647033552433732, + "grad_norm": 0.6571056886382604, + "learning_rate": 1.977842321198375e-05, + "loss": 0.864, + "step": 3560 + }, + { + "epoch": 0.07668513983760794, + "grad_norm": 0.6668618449865177, + "learning_rate": 1.977699395347072e-05, + "loss": 0.8486, + "step": 3570 + }, + { + "epoch": 0.07689994415087854, + "grad_norm": 0.6575595446352954, + "learning_rate": 1.9775560152097713e-05, + "loss": 0.8534, + "step": 3580 + }, + { + "epoch": 0.07711474846414916, + "grad_norm": 0.658442535770228, + "learning_rate": 1.9774121808530944e-05, + "loss": 0.8554, + "step": 3590 + }, + { + "epoch": 0.07732955277741978, + "grad_norm": 0.6725808248232724, + "learning_rate": 1.977267892343874e-05, + "loss": 0.8524, + "step": 3600 + }, + { + "epoch": 0.07754435709069038, + "grad_norm": 0.6579027842938031, + "learning_rate": 1.9771231497491526e-05, + "loss": 0.8505, + "step": 3610 + }, + { + "epoch": 0.077759161403961, + "grad_norm": 0.6960301484489192, + "learning_rate": 1.976977953136185e-05, + "loss": 0.8602, + "step": 3620 + }, + { + "epoch": 0.0779739657172316, + "grad_norm": 0.6697741714896654, + "learning_rate": 1.9768323025724368e-05, + "loss": 0.8758, + "step": 3630 + }, + { + "epoch": 0.07818877003050222, + "grad_norm": 2.859419478862952, + "learning_rate": 1.9766861981255837e-05, + "loss": 0.864, + "step": 3640 + }, + { + "epoch": 0.07840357434377282, + "grad_norm": 0.6822863380857591, + "learning_rate": 1.9765396398635133e-05, + "loss": 0.8562, + "step": 3650 + }, + { + "epoch": 0.07861837865704344, + "grad_norm": 0.6500244246977851, + "learning_rate": 1.976392627854323e-05, + "loss": 0.8458, + "step": 3660 + }, + { + "epoch": 0.07883318297031404, + "grad_norm": 0.7383310944585931, + "learning_rate": 1.9762451621663225e-05, + "loss": 0.8435, + "step": 3670 + }, + { + "epoch": 0.07904798728358466, + "grad_norm": 0.692308880241196, + "learning_rate": 1.9760972428680304e-05, + "loss": 0.8506, + "step": 3680 + }, + { + "epoch": 0.07926279159685526, + "grad_norm": 0.6856631590673132, + "learning_rate": 1.975948870028178e-05, + "loss": 0.8686, + "step": 3690 + }, + { + "epoch": 0.07947759591012588, + "grad_norm": 0.6903089329869376, + "learning_rate": 1.9758000437157058e-05, + "loss": 0.8483, + "step": 3700 + }, + { + "epoch": 0.07969240022339649, + "grad_norm": 0.7078943762114845, + "learning_rate": 1.9756507639997663e-05, + "loss": 0.855, + "step": 3710 + }, + { + "epoch": 0.0799072045366671, + "grad_norm": 0.694778958880388, + "learning_rate": 
1.975501030949721e-05, + "loss": 0.8738, + "step": 3720 + }, + { + "epoch": 0.08012200884993771, + "grad_norm": 0.7025809779085943, + "learning_rate": 1.975350844635144e-05, + "loss": 0.8455, + "step": 3730 + }, + { + "epoch": 0.08033681316320831, + "grad_norm": 0.7689670952873736, + "learning_rate": 1.9752002051258187e-05, + "loss": 0.8553, + "step": 3740 + }, + { + "epoch": 0.08055161747647893, + "grad_norm": 0.6448632769568776, + "learning_rate": 1.9750491124917396e-05, + "loss": 0.847, + "step": 3750 + }, + { + "epoch": 0.08076642178974953, + "grad_norm": 0.6613202041245299, + "learning_rate": 1.9748975668031113e-05, + "loss": 0.8514, + "step": 3760 + }, + { + "epoch": 0.08098122610302015, + "grad_norm": 0.6795441486901408, + "learning_rate": 1.974745568130349e-05, + "loss": 0.8551, + "step": 3770 + }, + { + "epoch": 0.08119603041629075, + "grad_norm": 0.6665715977606939, + "learning_rate": 1.974593116544079e-05, + "loss": 0.8464, + "step": 3780 + }, + { + "epoch": 0.08141083472956137, + "grad_norm": 0.6572184682125283, + "learning_rate": 1.974440212115138e-05, + "loss": 0.8535, + "step": 3790 + }, + { + "epoch": 0.08162563904283197, + "grad_norm": 0.6552167641764182, + "learning_rate": 1.9742868549145716e-05, + "loss": 0.8432, + "step": 3800 + }, + { + "epoch": 0.08184044335610259, + "grad_norm": 0.6421489707631659, + "learning_rate": 1.9741330450136377e-05, + "loss": 0.849, + "step": 3810 + }, + { + "epoch": 0.08205524766937321, + "grad_norm": 0.6485917266184518, + "learning_rate": 1.9739787824838036e-05, + "loss": 0.838, + "step": 3820 + }, + { + "epoch": 0.08227005198264381, + "grad_norm": 0.6959055244885545, + "learning_rate": 1.9738240673967473e-05, + "loss": 0.8473, + "step": 3830 + }, + { + "epoch": 0.08248485629591443, + "grad_norm": 0.713164987686896, + "learning_rate": 1.9736688998243562e-05, + "loss": 0.8528, + "step": 3840 + }, + { + "epoch": 0.08269966060918503, + "grad_norm": 0.6580917396986089, + "learning_rate": 1.9735132798387294e-05, + "loss": 0.852, + "step": 3850 + }, + { + "epoch": 0.08291446492245565, + "grad_norm": 0.7027948387761466, + "learning_rate": 1.9733572075121746e-05, + "loss": 0.8447, + "step": 3860 + }, + { + "epoch": 0.08312926923572625, + "grad_norm": 0.6861004251959052, + "learning_rate": 1.973200682917211e-05, + "loss": 0.8526, + "step": 3870 + }, + { + "epoch": 0.08334407354899687, + "grad_norm": 0.6869412094000382, + "learning_rate": 1.9730437061265674e-05, + "loss": 0.8416, + "step": 3880 + }, + { + "epoch": 0.08355887786226747, + "grad_norm": 0.6269537456004645, + "learning_rate": 1.9728862772131822e-05, + "loss": 0.8487, + "step": 3890 + }, + { + "epoch": 0.08377368217553809, + "grad_norm": 0.6491403415344045, + "learning_rate": 1.9727283962502054e-05, + "loss": 0.8678, + "step": 3900 + }, + { + "epoch": 0.08398848648880869, + "grad_norm": 0.6995859598956509, + "learning_rate": 1.9725700633109955e-05, + "loss": 0.8433, + "step": 3910 + }, + { + "epoch": 0.0842032908020793, + "grad_norm": 0.6700202312183003, + "learning_rate": 1.9724112784691213e-05, + "loss": 0.847, + "step": 3920 + }, + { + "epoch": 0.08441809511534992, + "grad_norm": 0.6485370618480597, + "learning_rate": 1.9722520417983618e-05, + "loss": 0.8544, + "step": 3930 + }, + { + "epoch": 0.08463289942862053, + "grad_norm": 0.6592424429265452, + "learning_rate": 1.972092353372707e-05, + "loss": 0.837, + "step": 3940 + }, + { + "epoch": 0.08484770374189114, + "grad_norm": 0.6555177826724199, + "learning_rate": 1.9719322132663547e-05, + "loss": 0.8654, + "step": 3950 + }, + { + 
"epoch": 0.08506250805516175, + "grad_norm": 0.6836389096207424, + "learning_rate": 1.9717716215537145e-05, + "loss": 0.8626, + "step": 3960 + }, + { + "epoch": 0.08527731236843236, + "grad_norm": 0.6336266324638282, + "learning_rate": 1.971610578309404e-05, + "loss": 0.8488, + "step": 3970 + }, + { + "epoch": 0.08549211668170296, + "grad_norm": 0.6198253547328094, + "learning_rate": 1.9714490836082527e-05, + "loss": 0.8606, + "step": 3980 + }, + { + "epoch": 0.08570692099497358, + "grad_norm": 0.6537044949678656, + "learning_rate": 1.9712871375252983e-05, + "loss": 0.8457, + "step": 3990 + }, + { + "epoch": 0.08592172530824418, + "grad_norm": 0.6678736312065985, + "learning_rate": 1.9711247401357886e-05, + "loss": 0.835, + "step": 4000 + }, + { + "epoch": 0.0861365296215148, + "grad_norm": 0.6692336566643887, + "learning_rate": 1.970961891515181e-05, + "loss": 0.8566, + "step": 4010 + }, + { + "epoch": 0.0863513339347854, + "grad_norm": 0.6585733096564904, + "learning_rate": 1.9707985917391435e-05, + "loss": 0.8339, + "step": 4020 + }, + { + "epoch": 0.08656613824805602, + "grad_norm": 0.6303363377268884, + "learning_rate": 1.9706348408835522e-05, + "loss": 0.8593, + "step": 4030 + }, + { + "epoch": 0.08678094256132664, + "grad_norm": 0.6782552039693405, + "learning_rate": 1.9704706390244943e-05, + "loss": 0.8601, + "step": 4040 + }, + { + "epoch": 0.08699574687459724, + "grad_norm": 0.6324566209058814, + "learning_rate": 1.970305986238265e-05, + "loss": 0.8513, + "step": 4050 + }, + { + "epoch": 0.08721055118786786, + "grad_norm": 0.6534333164738351, + "learning_rate": 1.970140882601371e-05, + "loss": 0.8526, + "step": 4060 + }, + { + "epoch": 0.08742535550113846, + "grad_norm": 0.6319924406902253, + "learning_rate": 1.9699753281905266e-05, + "loss": 0.8432, + "step": 4070 + }, + { + "epoch": 0.08764015981440908, + "grad_norm": 0.6523877824199336, + "learning_rate": 1.9698093230826566e-05, + "loss": 0.8472, + "step": 4080 + }, + { + "epoch": 0.08785496412767968, + "grad_norm": 0.6485572561784627, + "learning_rate": 1.9696428673548948e-05, + "loss": 0.8581, + "step": 4090 + }, + { + "epoch": 0.0880697684409503, + "grad_norm": 0.6747875245775294, + "learning_rate": 1.969475961084584e-05, + "loss": 0.8268, + "step": 4100 + }, + { + "epoch": 0.0882845727542209, + "grad_norm": 0.6183565239941164, + "learning_rate": 1.9693086043492778e-05, + "loss": 0.8336, + "step": 4110 + }, + { + "epoch": 0.08849937706749152, + "grad_norm": 0.6659244460477072, + "learning_rate": 1.9691407972267377e-05, + "loss": 0.8718, + "step": 4120 + }, + { + "epoch": 0.08871418138076212, + "grad_norm": 0.6537327783323973, + "learning_rate": 1.968972539794935e-05, + "loss": 0.8369, + "step": 4130 + }, + { + "epoch": 0.08892898569403274, + "grad_norm": 0.7320247153435835, + "learning_rate": 1.96880383213205e-05, + "loss": 0.8539, + "step": 4140 + }, + { + "epoch": 0.08914379000730335, + "grad_norm": 0.6260363637735203, + "learning_rate": 1.9686346743164726e-05, + "loss": 0.847, + "step": 4150 + }, + { + "epoch": 0.08935859432057396, + "grad_norm": 0.65678546506254, + "learning_rate": 1.9684650664268016e-05, + "loss": 0.8428, + "step": 4160 + }, + { + "epoch": 0.08957339863384457, + "grad_norm": 0.6552849199787741, + "learning_rate": 1.9682950085418446e-05, + "loss": 0.8336, + "step": 4170 + }, + { + "epoch": 0.08978820294711518, + "grad_norm": 0.6302616145397817, + "learning_rate": 1.9681245007406192e-05, + "loss": 0.8452, + "step": 4180 + }, + { + "epoch": 0.09000300726038579, + "grad_norm": 0.664218834980882, + 
"learning_rate": 1.9679535431023512e-05, + "loss": 0.8528, + "step": 4190 + }, + { + "epoch": 0.0902178115736564, + "grad_norm": 0.6382635022742835, + "learning_rate": 1.9677821357064758e-05, + "loss": 0.837, + "step": 4200 + }, + { + "epoch": 0.09043261588692701, + "grad_norm": 0.6287924929580544, + "learning_rate": 1.967610278632637e-05, + "loss": 0.8456, + "step": 4210 + }, + { + "epoch": 0.09064742020019761, + "grad_norm": 0.6735082120990712, + "learning_rate": 1.9674379719606874e-05, + "loss": 0.8307, + "step": 4220 + }, + { + "epoch": 0.09086222451346823, + "grad_norm": 0.6377012982256299, + "learning_rate": 1.9672652157706897e-05, + "loss": 0.8483, + "step": 4230 + }, + { + "epoch": 0.09107702882673883, + "grad_norm": 0.6593721140765066, + "learning_rate": 1.9670920101429142e-05, + "loss": 0.8441, + "step": 4240 + }, + { + "epoch": 0.09129183314000945, + "grad_norm": 0.674319812375293, + "learning_rate": 1.9669183551578414e-05, + "loss": 0.8243, + "step": 4250 + }, + { + "epoch": 0.09150663745328007, + "grad_norm": 0.627013344868889, + "learning_rate": 1.9667442508961585e-05, + "loss": 0.8425, + "step": 4260 + }, + { + "epoch": 0.09172144176655067, + "grad_norm": 0.6958437594298889, + "learning_rate": 1.9665696974387633e-05, + "loss": 0.8526, + "step": 4270 + }, + { + "epoch": 0.09193624607982129, + "grad_norm": 0.6044037023198737, + "learning_rate": 1.966394694866762e-05, + "loss": 0.8533, + "step": 4280 + }, + { + "epoch": 0.09215105039309189, + "grad_norm": 0.6542299584474508, + "learning_rate": 1.9662192432614683e-05, + "loss": 0.8565, + "step": 4290 + }, + { + "epoch": 0.09236585470636251, + "grad_norm": 0.6390677549567899, + "learning_rate": 1.9660433427044064e-05, + "loss": 0.8494, + "step": 4300 + }, + { + "epoch": 0.09258065901963311, + "grad_norm": 0.679504215836474, + "learning_rate": 1.9658669932773072e-05, + "loss": 0.8587, + "step": 4310 + }, + { + "epoch": 0.09279546333290373, + "grad_norm": 0.6627368866357531, + "learning_rate": 1.965690195062112e-05, + "loss": 0.8355, + "step": 4320 + }, + { + "epoch": 0.09301026764617433, + "grad_norm": 0.6349256488418371, + "learning_rate": 1.9655129481409695e-05, + "loss": 0.8476, + "step": 4330 + }, + { + "epoch": 0.09322507195944495, + "grad_norm": 0.6397326363467064, + "learning_rate": 1.9653352525962363e-05, + "loss": 0.8424, + "step": 4340 + }, + { + "epoch": 0.09343987627271555, + "grad_norm": 0.6437203998695848, + "learning_rate": 1.9651571085104796e-05, + "loss": 0.8575, + "step": 4350 + }, + { + "epoch": 0.09365468058598617, + "grad_norm": 0.6296219759406441, + "learning_rate": 1.9649785159664723e-05, + "loss": 0.8444, + "step": 4360 + }, + { + "epoch": 0.09386948489925678, + "grad_norm": 0.6364710421208644, + "learning_rate": 1.964799475047198e-05, + "loss": 0.8409, + "step": 4370 + }, + { + "epoch": 0.09408428921252739, + "grad_norm": 0.664099321204506, + "learning_rate": 1.9646199858358475e-05, + "loss": 0.8537, + "step": 4380 + }, + { + "epoch": 0.094299093525798, + "grad_norm": 0.6145796569330872, + "learning_rate": 1.96444004841582e-05, + "loss": 0.835, + "step": 4390 + }, + { + "epoch": 0.0945138978390686, + "grad_norm": 0.6258749417882592, + "learning_rate": 1.964259662870723e-05, + "loss": 0.8423, + "step": 4400 + }, + { + "epoch": 0.09472870215233922, + "grad_norm": 0.6768124086471206, + "learning_rate": 1.9640788292843722e-05, + "loss": 0.8526, + "step": 4410 + }, + { + "epoch": 0.09494350646560983, + "grad_norm": 0.6679825227131061, + "learning_rate": 1.963897547740792e-05, + "loss": 0.8593, + "step": 4420 + 
}, + { + "epoch": 0.09515831077888044, + "grad_norm": 0.6708318644795358, + "learning_rate": 1.9637158183242138e-05, + "loss": 0.8403, + "step": 4430 + }, + { + "epoch": 0.09537311509215105, + "grad_norm": 0.6182453595533811, + "learning_rate": 1.9635336411190786e-05, + "loss": 0.8464, + "step": 4440 + }, + { + "epoch": 0.09558791940542166, + "grad_norm": 0.6518263259411956, + "learning_rate": 1.963351016210034e-05, + "loss": 0.8406, + "step": 4450 + }, + { + "epoch": 0.09580272371869226, + "grad_norm": 0.6695930906879859, + "learning_rate": 1.9631679436819363e-05, + "loss": 0.8461, + "step": 4460 + }, + { + "epoch": 0.09601752803196288, + "grad_norm": 0.6609509728832942, + "learning_rate": 1.9629844236198502e-05, + "loss": 0.8533, + "step": 4470 + }, + { + "epoch": 0.0962323323452335, + "grad_norm": 0.6657143200018495, + "learning_rate": 1.9628004561090474e-05, + "loss": 0.8357, + "step": 4480 + }, + { + "epoch": 0.0964471366585041, + "grad_norm": 0.6630542341667736, + "learning_rate": 1.9626160412350085e-05, + "loss": 0.8322, + "step": 4490 + }, + { + "epoch": 0.09666194097177472, + "grad_norm": 0.6314767253481299, + "learning_rate": 1.9624311790834217e-05, + "loss": 0.8385, + "step": 4500 + }, + { + "epoch": 0.09687674528504532, + "grad_norm": 0.6903306283162439, + "learning_rate": 1.962245869740182e-05, + "loss": 0.8557, + "step": 4510 + }, + { + "epoch": 0.09709154959831594, + "grad_norm": 0.669279047897614, + "learning_rate": 1.9620601132913936e-05, + "loss": 0.8552, + "step": 4520 + }, + { + "epoch": 0.09730635391158654, + "grad_norm": 0.6814182363619116, + "learning_rate": 1.9618739098233676e-05, + "loss": 0.8416, + "step": 4530 + }, + { + "epoch": 0.09752115822485716, + "grad_norm": 0.6462039533634677, + "learning_rate": 1.9616872594226232e-05, + "loss": 0.8443, + "step": 4540 + }, + { + "epoch": 0.09773596253812776, + "grad_norm": 0.6397436767943191, + "learning_rate": 1.9615001621758867e-05, + "loss": 0.8401, + "step": 4550 + }, + { + "epoch": 0.09795076685139838, + "grad_norm": 0.661032708042101, + "learning_rate": 1.9613126181700932e-05, + "loss": 0.8313, + "step": 4560 + }, + { + "epoch": 0.09816557116466898, + "grad_norm": 0.6481039328379562, + "learning_rate": 1.9611246274923844e-05, + "loss": 0.834, + "step": 4570 + }, + { + "epoch": 0.0983803754779396, + "grad_norm": 0.6747780419159984, + "learning_rate": 1.960936190230109e-05, + "loss": 0.8326, + "step": 4580 + }, + { + "epoch": 0.09859517979121021, + "grad_norm": 0.661376547720958, + "learning_rate": 1.960747306470825e-05, + "loss": 0.8294, + "step": 4590 + }, + { + "epoch": 0.09880998410448082, + "grad_norm": 0.650547641584137, + "learning_rate": 1.9605579763022966e-05, + "loss": 0.8426, + "step": 4600 + }, + { + "epoch": 0.09902478841775143, + "grad_norm": 0.6682948635311629, + "learning_rate": 1.960368199812495e-05, + "loss": 0.8419, + "step": 4610 + }, + { + "epoch": 0.09923959273102204, + "grad_norm": 0.6467278134601648, + "learning_rate": 1.9601779770896007e-05, + "loss": 0.8293, + "step": 4620 + }, + { + "epoch": 0.09945439704429265, + "grad_norm": 0.6325684184789404, + "learning_rate": 1.9599873082219992e-05, + "loss": 0.8338, + "step": 4630 + }, + { + "epoch": 0.09966920135756326, + "grad_norm": 0.6688907540062, + "learning_rate": 1.9597961932982845e-05, + "loss": 0.8363, + "step": 4640 + }, + { + "epoch": 0.09988400567083387, + "grad_norm": 0.654845343648715, + "learning_rate": 1.9596046324072586e-05, + "loss": 0.8416, + "step": 4650 + }, + { + "epoch": 0.10009880998410448, + "grad_norm": 0.6320139560546627, + 
"learning_rate": 1.9594126256379286e-05, + "loss": 0.863, + "step": 4660 + }, + { + "epoch": 0.10031361429737509, + "grad_norm": 0.6691994371938742, + "learning_rate": 1.9592201730795112e-05, + "loss": 0.8456, + "step": 4670 + }, + { + "epoch": 0.1005284186106457, + "grad_norm": 0.6606839474988468, + "learning_rate": 1.9590272748214283e-05, + "loss": 0.8302, + "step": 4680 + }, + { + "epoch": 0.10074322292391631, + "grad_norm": 0.6262068989715459, + "learning_rate": 1.9588339309533103e-05, + "loss": 0.8377, + "step": 4690 + }, + { + "epoch": 0.10095802723718693, + "grad_norm": 0.6243559896905192, + "learning_rate": 1.9586401415649935e-05, + "loss": 0.8396, + "step": 4700 + }, + { + "epoch": 0.10117283155045753, + "grad_norm": 0.6687990805968839, + "learning_rate": 1.958445906746522e-05, + "loss": 0.8354, + "step": 4710 + }, + { + "epoch": 0.10138763586372815, + "grad_norm": 0.6310170587812323, + "learning_rate": 1.9582512265881467e-05, + "loss": 0.8211, + "step": 4720 + }, + { + "epoch": 0.10160244017699875, + "grad_norm": 0.6388753086935843, + "learning_rate": 1.958056101180325e-05, + "loss": 0.8441, + "step": 4730 + }, + { + "epoch": 0.10181724449026937, + "grad_norm": 0.6638753058560826, + "learning_rate": 1.9578605306137216e-05, + "loss": 0.8358, + "step": 4740 + }, + { + "epoch": 0.10203204880353997, + "grad_norm": 0.6302752971289505, + "learning_rate": 1.9576645149792083e-05, + "loss": 0.8395, + "step": 4750 + }, + { + "epoch": 0.10224685311681059, + "grad_norm": 0.6458064089587059, + "learning_rate": 1.957468054367863e-05, + "loss": 0.8401, + "step": 4760 + }, + { + "epoch": 0.10246165743008119, + "grad_norm": 0.646382816972308, + "learning_rate": 1.957271148870971e-05, + "loss": 0.8435, + "step": 4770 + }, + { + "epoch": 0.10267646174335181, + "grad_norm": 0.6155538183074262, + "learning_rate": 1.9570737985800237e-05, + "loss": 0.8306, + "step": 4780 + }, + { + "epoch": 0.10289126605662241, + "grad_norm": 0.6367578606631397, + "learning_rate": 1.95687600358672e-05, + "loss": 0.8369, + "step": 4790 + }, + { + "epoch": 0.10310607036989303, + "grad_norm": 0.6404541919457326, + "learning_rate": 1.956677763982964e-05, + "loss": 0.8254, + "step": 4800 + }, + { + "epoch": 0.10332087468316364, + "grad_norm": 0.6215108376628148, + "learning_rate": 1.9564790798608682e-05, + "loss": 0.8584, + "step": 4810 + }, + { + "epoch": 0.10353567899643425, + "grad_norm": 0.6338570435141742, + "learning_rate": 1.9562799513127507e-05, + "loss": 0.8324, + "step": 4820 + }, + { + "epoch": 0.10375048330970486, + "grad_norm": 0.6019370576161179, + "learning_rate": 1.956080378431136e-05, + "loss": 0.8559, + "step": 4830 + }, + { + "epoch": 0.10396528762297547, + "grad_norm": 0.6297162423533607, + "learning_rate": 1.9558803613087548e-05, + "loss": 0.8462, + "step": 4840 + }, + { + "epoch": 0.10418009193624608, + "grad_norm": 0.6045449219846318, + "learning_rate": 1.9556799000385454e-05, + "loss": 0.8386, + "step": 4850 + }, + { + "epoch": 0.10439489624951669, + "grad_norm": 0.6549861670806117, + "learning_rate": 1.9554789947136508e-05, + "loss": 0.8306, + "step": 4860 + }, + { + "epoch": 0.1046097005627873, + "grad_norm": 0.6630879044217777, + "learning_rate": 1.9552776454274223e-05, + "loss": 0.8314, + "step": 4870 + }, + { + "epoch": 0.1048245048760579, + "grad_norm": 0.651950768006272, + "learning_rate": 1.955075852273416e-05, + "loss": 0.8385, + "step": 4880 + }, + { + "epoch": 0.10503930918932852, + "grad_norm": 0.6626536087969357, + "learning_rate": 1.9548736153453943e-05, + "loss": 0.8363, + "step": 4890 
+ }, + { + "epoch": 0.10525411350259913, + "grad_norm": 0.6886579304677174, + "learning_rate": 1.9546709347373265e-05, + "loss": 0.8499, + "step": 4900 + }, + { + "epoch": 0.10546891781586974, + "grad_norm": 0.6312997525023037, + "learning_rate": 1.954467810543388e-05, + "loss": 0.8235, + "step": 4910 + }, + { + "epoch": 0.10568372212914036, + "grad_norm": 0.6365142217446823, + "learning_rate": 1.954264242857959e-05, + "loss": 0.8356, + "step": 4920 + }, + { + "epoch": 0.10589852644241096, + "grad_norm": 0.6746025653030885, + "learning_rate": 1.954060231775628e-05, + "loss": 0.8528, + "step": 4930 + }, + { + "epoch": 0.10611333075568158, + "grad_norm": 0.6281429344046033, + "learning_rate": 1.9538557773911878e-05, + "loss": 0.8473, + "step": 4940 + }, + { + "epoch": 0.10632813506895218, + "grad_norm": 0.6505897210485625, + "learning_rate": 1.9536508797996377e-05, + "loss": 0.8377, + "step": 4950 + }, + { + "epoch": 0.1065429393822228, + "grad_norm": 0.6758994881928211, + "learning_rate": 1.953445539096183e-05, + "loss": 0.8544, + "step": 4960 + }, + { + "epoch": 0.1067577436954934, + "grad_norm": 0.6285712244694984, + "learning_rate": 1.953239755376235e-05, + "loss": 0.8544, + "step": 4970 + }, + { + "epoch": 0.10697254800876402, + "grad_norm": 0.6210838565177652, + "learning_rate": 1.9530335287354102e-05, + "loss": 0.8304, + "step": 4980 + }, + { + "epoch": 0.10718735232203462, + "grad_norm": 0.6140776582373115, + "learning_rate": 1.952826859269532e-05, + "loss": 0.836, + "step": 4990 + }, + { + "epoch": 0.10740215663530524, + "grad_norm": 0.6671326709180452, + "learning_rate": 1.9526197470746283e-05, + "loss": 0.8343, + "step": 5000 + }, + { + "epoch": 0.10761696094857584, + "grad_norm": 0.6597003360765735, + "learning_rate": 1.9524121922469338e-05, + "loss": 0.8652, + "step": 5010 + }, + { + "epoch": 0.10783176526184646, + "grad_norm": 0.5956926521621552, + "learning_rate": 1.952204194882888e-05, + "loss": 0.8262, + "step": 5020 + }, + { + "epoch": 0.10804656957511707, + "grad_norm": 0.7019664111760795, + "learning_rate": 1.9519957550791372e-05, + "loss": 0.8429, + "step": 5030 + }, + { + "epoch": 0.10826137388838768, + "grad_norm": 0.6304099214813407, + "learning_rate": 1.951786872932532e-05, + "loss": 0.8393, + "step": 5040 + }, + { + "epoch": 0.1084761782016583, + "grad_norm": 0.6463172161616285, + "learning_rate": 1.951577548540129e-05, + "loss": 0.8387, + "step": 5050 + }, + { + "epoch": 0.1086909825149289, + "grad_norm": 0.6502922763327477, + "learning_rate": 1.9513677819991905e-05, + "loss": 0.835, + "step": 5060 + }, + { + "epoch": 0.10890578682819951, + "grad_norm": 0.618888528634145, + "learning_rate": 1.951157573407184e-05, + "loss": 0.8559, + "step": 5070 + }, + { + "epoch": 0.10912059114147012, + "grad_norm": 0.6374773953158367, + "learning_rate": 1.9509469228617827e-05, + "loss": 0.8317, + "step": 5080 + }, + { + "epoch": 0.10933539545474073, + "grad_norm": 0.6303019884270492, + "learning_rate": 1.9507358304608644e-05, + "loss": 0.8303, + "step": 5090 + }, + { + "epoch": 0.10955019976801134, + "grad_norm": 0.6561252982950152, + "learning_rate": 1.9505242963025133e-05, + "loss": 0.8273, + "step": 5100 + }, + { + "epoch": 0.10976500408128195, + "grad_norm": 0.6290165189842113, + "learning_rate": 1.9503123204850184e-05, + "loss": 0.8421, + "step": 5110 + }, + { + "epoch": 0.10997980839455257, + "grad_norm": 0.617329955446763, + "learning_rate": 1.9500999031068734e-05, + "loss": 0.8232, + "step": 5120 + }, + { + "epoch": 0.11019461270782317, + "grad_norm": 0.6107026109474121, 
+ "learning_rate": 1.949887044266778e-05, + "loss": 0.835, + "step": 5130 + }, + { + "epoch": 0.11040941702109379, + "grad_norm": 0.6269967944152154, + "learning_rate": 1.949673744063636e-05, + "loss": 0.8305, + "step": 5140 + }, + { + "epoch": 0.11062422133436439, + "grad_norm": 0.6360564673651822, + "learning_rate": 1.949460002596557e-05, + "loss": 0.832, + "step": 5150 + }, + { + "epoch": 0.11083902564763501, + "grad_norm": 0.6328470911946484, + "learning_rate": 1.9492458199648564e-05, + "loss": 0.8273, + "step": 5160 + }, + { + "epoch": 0.11105382996090561, + "grad_norm": 0.880034167436241, + "learning_rate": 1.949031196268053e-05, + "loss": 0.8422, + "step": 5170 + }, + { + "epoch": 0.11126863427417623, + "grad_norm": 0.6718310412202478, + "learning_rate": 1.948816131605871e-05, + "loss": 0.8444, + "step": 5180 + }, + { + "epoch": 0.11148343858744683, + "grad_norm": 0.6443011347377299, + "learning_rate": 1.9486006260782406e-05, + "loss": 0.8365, + "step": 5190 + }, + { + "epoch": 0.11169824290071745, + "grad_norm": 0.6183046355489557, + "learning_rate": 1.948384679785295e-05, + "loss": 0.8545, + "step": 5200 + }, + { + "epoch": 0.11191304721398805, + "grad_norm": 0.6251393760951905, + "learning_rate": 1.9481682928273738e-05, + "loss": 0.8268, + "step": 5210 + }, + { + "epoch": 0.11212785152725867, + "grad_norm": 0.6344101557996115, + "learning_rate": 1.9479514653050212e-05, + "loss": 0.8404, + "step": 5220 + }, + { + "epoch": 0.11234265584052928, + "grad_norm": 0.5860944916243113, + "learning_rate": 1.9477341973189844e-05, + "loss": 0.8355, + "step": 5230 + }, + { + "epoch": 0.11255746015379989, + "grad_norm": 0.6186996808539741, + "learning_rate": 1.9475164889702175e-05, + "loss": 0.8479, + "step": 5240 + }, + { + "epoch": 0.1127722644670705, + "grad_norm": 0.6034601205701468, + "learning_rate": 1.9472983403598783e-05, + "loss": 0.8277, + "step": 5250 + }, + { + "epoch": 0.11298706878034111, + "grad_norm": 0.6440643260727591, + "learning_rate": 1.9470797515893286e-05, + "loss": 0.8376, + "step": 5260 + }, + { + "epoch": 0.11320187309361172, + "grad_norm": 0.6324744223203518, + "learning_rate": 1.946860722760135e-05, + "loss": 0.8311, + "step": 5270 + }, + { + "epoch": 0.11341667740688233, + "grad_norm": 0.618379973907413, + "learning_rate": 1.9466412539740697e-05, + "loss": 0.8287, + "step": 5280 + }, + { + "epoch": 0.11363148172015294, + "grad_norm": 0.6256736960626628, + "learning_rate": 1.9464213453331076e-05, + "loss": 0.8384, + "step": 5290 + }, + { + "epoch": 0.11384628603342355, + "grad_norm": 0.607811975191907, + "learning_rate": 1.9462009969394292e-05, + "loss": 0.8439, + "step": 5300 + }, + { + "epoch": 0.11406109034669416, + "grad_norm": 0.6214988694407934, + "learning_rate": 1.945980208895419e-05, + "loss": 0.8385, + "step": 5310 + }, + { + "epoch": 0.11427589465996477, + "grad_norm": 0.6260531019431299, + "learning_rate": 1.945758981303665e-05, + "loss": 0.8386, + "step": 5320 + }, + { + "epoch": 0.11449069897323538, + "grad_norm": 0.6510239330074841, + "learning_rate": 1.9455373142669615e-05, + "loss": 0.8155, + "step": 5330 + }, + { + "epoch": 0.114705503286506, + "grad_norm": 0.6114277545538802, + "learning_rate": 1.945315207888304e-05, + "loss": 0.8236, + "step": 5340 + }, + { + "epoch": 0.1149203075997766, + "grad_norm": 0.6290587853644128, + "learning_rate": 1.945092662270895e-05, + "loss": 0.8391, + "step": 5350 + }, + { + "epoch": 0.11513511191304722, + "grad_norm": 0.6252788540877645, + "learning_rate": 1.9448696775181393e-05, + "loss": 0.8302, + "step": 5360 + 
}, + { + "epoch": 0.11534991622631782, + "grad_norm": 0.6366366260996406, + "learning_rate": 1.9446462537336462e-05, + "loss": 0.832, + "step": 5370 + }, + { + "epoch": 0.11556472053958844, + "grad_norm": 0.6403035006648804, + "learning_rate": 1.9444223910212297e-05, + "loss": 0.8401, + "step": 5380 + }, + { + "epoch": 0.11577952485285904, + "grad_norm": 0.6471061463773111, + "learning_rate": 1.9441980894849068e-05, + "loss": 0.822, + "step": 5390 + }, + { + "epoch": 0.11599432916612966, + "grad_norm": 0.6485069683086401, + "learning_rate": 1.9439733492288986e-05, + "loss": 0.8327, + "step": 5400 + }, + { + "epoch": 0.11620913347940026, + "grad_norm": 0.6195424958899896, + "learning_rate": 1.9437481703576303e-05, + "loss": 0.827, + "step": 5410 + }, + { + "epoch": 0.11642393779267088, + "grad_norm": 0.6067271492911613, + "learning_rate": 1.9435225529757307e-05, + "loss": 0.8378, + "step": 5420 + }, + { + "epoch": 0.11663874210594148, + "grad_norm": 0.6333870629436046, + "learning_rate": 1.9432964971880325e-05, + "loss": 0.8274, + "step": 5430 + }, + { + "epoch": 0.1168535464192121, + "grad_norm": 0.6349621748512131, + "learning_rate": 1.9430700030995724e-05, + "loss": 0.8371, + "step": 5440 + }, + { + "epoch": 0.11706835073248271, + "grad_norm": 0.6132008334219502, + "learning_rate": 1.94284307081559e-05, + "loss": 0.8254, + "step": 5450 + }, + { + "epoch": 0.11728315504575332, + "grad_norm": 0.626720073182986, + "learning_rate": 1.942615700441529e-05, + "loss": 0.8296, + "step": 5460 + }, + { + "epoch": 0.11749795935902393, + "grad_norm": 0.6280587181806514, + "learning_rate": 1.9423878920830366e-05, + "loss": 0.8228, + "step": 5470 + }, + { + "epoch": 0.11771276367229454, + "grad_norm": 0.6514270560053967, + "learning_rate": 1.942159645845963e-05, + "loss": 0.8262, + "step": 5480 + }, + { + "epoch": 0.11792756798556515, + "grad_norm": 0.6130650794491808, + "learning_rate": 1.9419309618363637e-05, + "loss": 0.8379, + "step": 5490 + }, + { + "epoch": 0.11814237229883576, + "grad_norm": 0.637159008327422, + "learning_rate": 1.9417018401604947e-05, + "loss": 0.8282, + "step": 5500 + }, + { + "epoch": 0.11835717661210637, + "grad_norm": 0.6188423823463596, + "learning_rate": 1.9414722809248182e-05, + "loss": 0.8305, + "step": 5510 + }, + { + "epoch": 0.11857198092537698, + "grad_norm": 0.6245770269033344, + "learning_rate": 1.9412422842359976e-05, + "loss": 0.833, + "step": 5520 + }, + { + "epoch": 0.1187867852386476, + "grad_norm": 0.6365044259718952, + "learning_rate": 1.9410118502009003e-05, + "loss": 0.8388, + "step": 5530 + }, + { + "epoch": 0.1190015895519182, + "grad_norm": 0.6328160935271273, + "learning_rate": 1.9407809789265973e-05, + "loss": 0.845, + "step": 5540 + }, + { + "epoch": 0.11921639386518881, + "grad_norm": 0.630311910667844, + "learning_rate": 1.9405496705203628e-05, + "loss": 0.8345, + "step": 5550 + }, + { + "epoch": 0.11943119817845943, + "grad_norm": 0.638696910751977, + "learning_rate": 1.9403179250896733e-05, + "loss": 0.8416, + "step": 5560 + }, + { + "epoch": 0.11964600249173003, + "grad_norm": 0.6247248724186333, + "learning_rate": 1.940085742742209e-05, + "loss": 0.8331, + "step": 5570 + }, + { + "epoch": 0.11986080680500065, + "grad_norm": 0.6213341924856995, + "learning_rate": 1.9398531235858525e-05, + "loss": 0.8347, + "step": 5580 + }, + { + "epoch": 0.12007561111827125, + "grad_norm": 0.6121503915927947, + "learning_rate": 1.9396200677286907e-05, + "loss": 0.8323, + "step": 5590 + }, + { + "epoch": 0.12029041543154187, + "grad_norm": 0.6189647502053036, + 
"learning_rate": 1.939386575279012e-05, + "loss": 0.8457, + "step": 5600 + }, + { + "epoch": 0.12050521974481247, + "grad_norm": 0.6135493571505611, + "learning_rate": 1.939152646345308e-05, + "loss": 0.8368, + "step": 5610 + }, + { + "epoch": 0.12072002405808309, + "grad_norm": 0.6378689788750032, + "learning_rate": 1.9389182810362738e-05, + "loss": 0.8449, + "step": 5620 + }, + { + "epoch": 0.12093482837135369, + "grad_norm": 0.6294801432407113, + "learning_rate": 1.938683479460806e-05, + "loss": 0.8271, + "step": 5630 + }, + { + "epoch": 0.12114963268462431, + "grad_norm": 0.6328552710008448, + "learning_rate": 1.9384482417280056e-05, + "loss": 0.8262, + "step": 5640 + }, + { + "epoch": 0.12136443699789491, + "grad_norm": 0.6186605157387922, + "learning_rate": 1.9382125679471745e-05, + "loss": 0.8278, + "step": 5650 + }, + { + "epoch": 0.12157924131116553, + "grad_norm": 0.6334724170139352, + "learning_rate": 1.9379764582278185e-05, + "loss": 0.8284, + "step": 5660 + }, + { + "epoch": 0.12179404562443615, + "grad_norm": 0.6283387797940377, + "learning_rate": 1.9377399126796454e-05, + "loss": 0.8359, + "step": 5670 + }, + { + "epoch": 0.12200884993770675, + "grad_norm": 0.6055428009166745, + "learning_rate": 1.9375029314125658e-05, + "loss": 0.8267, + "step": 5680 + }, + { + "epoch": 0.12222365425097736, + "grad_norm": 0.6034275177420789, + "learning_rate": 1.9372655145366922e-05, + "loss": 0.8252, + "step": 5690 + }, + { + "epoch": 0.12243845856424797, + "grad_norm": 0.6100632464810285, + "learning_rate": 1.93702766216234e-05, + "loss": 0.8284, + "step": 5700 + }, + { + "epoch": 0.12265326287751858, + "grad_norm": 0.6654383228152203, + "learning_rate": 1.936789374400027e-05, + "loss": 0.8124, + "step": 5710 + }, + { + "epoch": 0.12286806719078919, + "grad_norm": 0.6202336240170405, + "learning_rate": 1.9365506513604725e-05, + "loss": 0.828, + "step": 5720 + }, + { + "epoch": 0.1230828715040598, + "grad_norm": 0.6181456688535198, + "learning_rate": 1.936311493154599e-05, + "loss": 0.8446, + "step": 5730 + }, + { + "epoch": 0.12329767581733041, + "grad_norm": 0.619325566527615, + "learning_rate": 1.9360718998935315e-05, + "loss": 0.835, + "step": 5740 + }, + { + "epoch": 0.12351248013060102, + "grad_norm": 0.6293306658393487, + "learning_rate": 1.9358318716885955e-05, + "loss": 0.8225, + "step": 5750 + }, + { + "epoch": 0.12372728444387163, + "grad_norm": 0.6231931933033569, + "learning_rate": 1.9355914086513205e-05, + "loss": 0.845, + "step": 5760 + }, + { + "epoch": 0.12394208875714224, + "grad_norm": 0.6298812123623225, + "learning_rate": 1.9353505108934363e-05, + "loss": 0.829, + "step": 5770 + }, + { + "epoch": 0.12415689307041286, + "grad_norm": 0.6200961201702063, + "learning_rate": 1.9351091785268762e-05, + "loss": 0.8269, + "step": 5780 + }, + { + "epoch": 0.12437169738368346, + "grad_norm": 0.6118430477057613, + "learning_rate": 1.9348674116637747e-05, + "loss": 0.8308, + "step": 5790 + }, + { + "epoch": 0.12458650169695408, + "grad_norm": 0.6127169022228522, + "learning_rate": 1.934625210416468e-05, + "loss": 0.8188, + "step": 5800 + }, + { + "epoch": 0.12480130601022468, + "grad_norm": 0.5858112550024495, + "learning_rate": 1.9343825748974946e-05, + "loss": 0.8225, + "step": 5810 + }, + { + "epoch": 0.1250161103234953, + "grad_norm": 0.6431362667704915, + "learning_rate": 1.9341395052195943e-05, + "loss": 0.8251, + "step": 5820 + }, + { + "epoch": 0.12523091463676592, + "grad_norm": 0.6177184812180734, + "learning_rate": 1.9338960014957094e-05, + "loss": 0.8117, + "step": 5830 
+ }, + { + "epoch": 0.1254457189500365, + "grad_norm": 7.261555237494985, + "learning_rate": 1.9336520638389828e-05, + "loss": 0.8424, + "step": 5840 + }, + { + "epoch": 0.12566052326330712, + "grad_norm": 0.6165137803342475, + "learning_rate": 1.9334076923627602e-05, + "loss": 0.8468, + "step": 5850 + }, + { + "epoch": 0.12587532757657774, + "grad_norm": 0.6652429602651283, + "learning_rate": 1.9331628871805882e-05, + "loss": 0.8241, + "step": 5860 + }, + { + "epoch": 0.12609013188984836, + "grad_norm": 0.5977417751851771, + "learning_rate": 1.9329176484062147e-05, + "loss": 0.815, + "step": 5870 + }, + { + "epoch": 0.12630493620311894, + "grad_norm": 0.6143551332297007, + "learning_rate": 1.9326719761535896e-05, + "loss": 0.829, + "step": 5880 + }, + { + "epoch": 0.12651974051638956, + "grad_norm": 0.5722771504606794, + "learning_rate": 1.932425870536864e-05, + "loss": 0.8276, + "step": 5890 + }, + { + "epoch": 0.12673454482966018, + "grad_norm": 0.6233204435595209, + "learning_rate": 1.9321793316703904e-05, + "loss": 0.8257, + "step": 5900 + }, + { + "epoch": 0.1269493491429308, + "grad_norm": 0.6060690392530359, + "learning_rate": 1.9319323596687226e-05, + "loss": 0.8361, + "step": 5910 + }, + { + "epoch": 0.1271641534562014, + "grad_norm": 0.6503234853479353, + "learning_rate": 1.9316849546466154e-05, + "loss": 0.8534, + "step": 5920 + }, + { + "epoch": 0.127378957769472, + "grad_norm": 0.6577873265981129, + "learning_rate": 1.931437116719025e-05, + "loss": 0.8263, + "step": 5930 + }, + { + "epoch": 0.12759376208274262, + "grad_norm": 0.6504948800648593, + "learning_rate": 1.9311888460011096e-05, + "loss": 0.8294, + "step": 5940 + }, + { + "epoch": 0.12780856639601323, + "grad_norm": 0.6317328887950812, + "learning_rate": 1.9309401426082263e-05, + "loss": 0.8313, + "step": 5950 + }, + { + "epoch": 0.12802337070928385, + "grad_norm": 0.6211031800695695, + "learning_rate": 1.9306910066559358e-05, + "loss": 0.8355, + "step": 5960 + }, + { + "epoch": 0.12823817502255444, + "grad_norm": 0.6055270936106261, + "learning_rate": 1.9304414382599977e-05, + "loss": 0.8246, + "step": 5970 + }, + { + "epoch": 0.12845297933582506, + "grad_norm": 0.5997807600765583, + "learning_rate": 1.9301914375363746e-05, + "loss": 0.8056, + "step": 5980 + }, + { + "epoch": 0.12866778364909567, + "grad_norm": 0.6211427083875907, + "learning_rate": 1.9299410046012277e-05, + "loss": 0.8315, + "step": 5990 + }, + { + "epoch": 0.1288825879623663, + "grad_norm": 0.5688864716592172, + "learning_rate": 1.9296901395709206e-05, + "loss": 0.8257, + "step": 6000 + }, + { + "epoch": 0.1290973922756369, + "grad_norm": 0.6012936384715789, + "learning_rate": 1.929438842562017e-05, + "loss": 0.8351, + "step": 6010 + }, + { + "epoch": 0.1293121965889075, + "grad_norm": 0.6005677088042237, + "learning_rate": 1.929187113691282e-05, + "loss": 0.8357, + "step": 6020 + }, + { + "epoch": 0.1295270009021781, + "grad_norm": 0.6160401614681437, + "learning_rate": 1.9289349530756804e-05, + "loss": 0.843, + "step": 6030 + }, + { + "epoch": 0.12974180521544873, + "grad_norm": 0.6124623308197319, + "learning_rate": 1.9286823608323785e-05, + "loss": 0.8233, + "step": 6040 + }, + { + "epoch": 0.12995660952871935, + "grad_norm": 0.6203159062875466, + "learning_rate": 1.9284293370787424e-05, + "loss": 0.8303, + "step": 6050 + }, + { + "epoch": 0.13017141384198994, + "grad_norm": 0.6107868080049285, + "learning_rate": 1.9281758819323393e-05, + "loss": 0.8236, + "step": 6060 + }, + { + "epoch": 0.13038621815526055, + "grad_norm": 0.6132700985840815, 
+ "learning_rate": 1.9279219955109366e-05, + "loss": 0.8307, + "step": 6070 + }, + { + "epoch": 0.13060102246853117, + "grad_norm": 0.6055289750075414, + "learning_rate": 1.927667677932502e-05, + "loss": 0.8127, + "step": 6080 + }, + { + "epoch": 0.1308158267818018, + "grad_norm": 0.6116418582015993, + "learning_rate": 1.9274129293152037e-05, + "loss": 0.832, + "step": 6090 + }, + { + "epoch": 0.1310306310950724, + "grad_norm": 0.6179783492037558, + "learning_rate": 1.92715774977741e-05, + "loss": 0.829, + "step": 6100 + }, + { + "epoch": 0.131245435408343, + "grad_norm": 0.5940118498029715, + "learning_rate": 1.9269021394376896e-05, + "loss": 0.8176, + "step": 6110 + }, + { + "epoch": 0.1314602397216136, + "grad_norm": 0.6031829297041618, + "learning_rate": 1.9266460984148116e-05, + "loss": 0.8279, + "step": 6120 + }, + { + "epoch": 0.13167504403488423, + "grad_norm": 0.5930057021061649, + "learning_rate": 1.9263896268277448e-05, + "loss": 0.8139, + "step": 6130 + }, + { + "epoch": 0.13188984834815484, + "grad_norm": 0.6095258155514939, + "learning_rate": 1.9261327247956575e-05, + "loss": 0.8163, + "step": 6140 + }, + { + "epoch": 0.13210465266142543, + "grad_norm": 0.600045191978913, + "learning_rate": 1.9258753924379196e-05, + "loss": 0.8167, + "step": 6150 + }, + { + "epoch": 0.13231945697469605, + "grad_norm": 0.6983897516026605, + "learning_rate": 1.9256176298740997e-05, + "loss": 0.828, + "step": 6160 + }, + { + "epoch": 0.13253426128796666, + "grad_norm": 0.6132138821976615, + "learning_rate": 1.925359437223967e-05, + "loss": 0.8201, + "step": 6170 + }, + { + "epoch": 0.13274906560123728, + "grad_norm": 0.6155911518334949, + "learning_rate": 1.9251008146074895e-05, + "loss": 0.822, + "step": 6180 + }, + { + "epoch": 0.13296386991450787, + "grad_norm": 1.3805735351829016, + "learning_rate": 1.924841762144836e-05, + "loss": 0.8297, + "step": 6190 + }, + { + "epoch": 0.1331786742277785, + "grad_norm": 0.7417405322736338, + "learning_rate": 1.924582279956375e-05, + "loss": 0.8178, + "step": 6200 + }, + { + "epoch": 0.1333934785410491, + "grad_norm": 0.6590382875105805, + "learning_rate": 1.9243223681626734e-05, + "loss": 0.8177, + "step": 6210 + }, + { + "epoch": 0.13360828285431972, + "grad_norm": 0.5947961735015622, + "learning_rate": 1.9240620268845e-05, + "loss": 0.8375, + "step": 6220 + }, + { + "epoch": 0.13382308716759034, + "grad_norm": 0.6151652104414234, + "learning_rate": 1.9238012562428204e-05, + "loss": 0.8136, + "step": 6230 + }, + { + "epoch": 0.13403789148086093, + "grad_norm": 0.6081687195694883, + "learning_rate": 1.9235400563588027e-05, + "loss": 0.8092, + "step": 6240 + }, + { + "epoch": 0.13425269579413154, + "grad_norm": 0.609223276457847, + "learning_rate": 1.923278427353812e-05, + "loss": 0.8176, + "step": 6250 + }, + { + "epoch": 0.13446750010740216, + "grad_norm": 0.6057832795531277, + "learning_rate": 1.9230163693494134e-05, + "loss": 0.8245, + "step": 6260 + }, + { + "epoch": 0.13468230442067278, + "grad_norm": 0.7218938650204069, + "learning_rate": 1.922753882467372e-05, + "loss": 0.829, + "step": 6270 + }, + { + "epoch": 0.13489710873394337, + "grad_norm": 0.5956376678659784, + "learning_rate": 1.9224909668296518e-05, + "loss": 0.8323, + "step": 6280 + }, + { + "epoch": 0.13511191304721398, + "grad_norm": 0.6055774590690349, + "learning_rate": 1.922227622558416e-05, + "loss": 0.8325, + "step": 6290 + }, + { + "epoch": 0.1353267173604846, + "grad_norm": 0.63594852369417, + "learning_rate": 1.9219638497760272e-05, + "loss": 0.8236, + "step": 6300 + }, + { + 
"epoch": 0.13554152167375522, + "grad_norm": 0.6615251477336661, + "learning_rate": 1.921699648605046e-05, + "loss": 0.8299, + "step": 6310 + }, + { + "epoch": 0.13575632598702583, + "grad_norm": 0.5939888969308088, + "learning_rate": 1.921435019168234e-05, + "loss": 0.8314, + "step": 6320 + }, + { + "epoch": 0.13597113030029642, + "grad_norm": 0.6224118503409543, + "learning_rate": 1.9211699615885505e-05, + "loss": 0.8105, + "step": 6330 + }, + { + "epoch": 0.13618593461356704, + "grad_norm": 0.6349901674353197, + "learning_rate": 1.920904475989153e-05, + "loss": 0.8267, + "step": 6340 + }, + { + "epoch": 0.13640073892683766, + "grad_norm": 0.6184766342855774, + "learning_rate": 1.9206385624934002e-05, + "loss": 0.8235, + "step": 6350 + }, + { + "epoch": 0.13661554324010827, + "grad_norm": 0.593593688668399, + "learning_rate": 1.9203722212248473e-05, + "loss": 0.8179, + "step": 6360 + }, + { + "epoch": 0.13683034755337886, + "grad_norm": 0.5904445744420688, + "learning_rate": 1.9201054523072497e-05, + "loss": 0.8278, + "step": 6370 + }, + { + "epoch": 0.13704515186664948, + "grad_norm": 0.6197705191484348, + "learning_rate": 1.9198382558645608e-05, + "loss": 0.8315, + "step": 6380 + }, + { + "epoch": 0.1372599561799201, + "grad_norm": 0.5970982729709278, + "learning_rate": 1.9195706320209326e-05, + "loss": 0.823, + "step": 6390 + }, + { + "epoch": 0.1374747604931907, + "grad_norm": 0.5860292711215872, + "learning_rate": 1.9193025809007164e-05, + "loss": 0.8029, + "step": 6400 + }, + { + "epoch": 0.1376895648064613, + "grad_norm": 0.5880366108161124, + "learning_rate": 1.919034102628462e-05, + "loss": 0.8206, + "step": 6410 + }, + { + "epoch": 0.13790436911973192, + "grad_norm": 0.6454586755047972, + "learning_rate": 1.918765197328916e-05, + "loss": 0.8252, + "step": 6420 + }, + { + "epoch": 0.13811917343300253, + "grad_norm": 0.5855160919576355, + "learning_rate": 1.918495865127026e-05, + "loss": 0.8177, + "step": 6430 + }, + { + "epoch": 0.13833397774627315, + "grad_norm": 0.568749463356773, + "learning_rate": 1.9182261061479357e-05, + "loss": 0.8168, + "step": 6440 + }, + { + "epoch": 0.13854878205954377, + "grad_norm": 0.578722846016812, + "learning_rate": 1.9179559205169885e-05, + "loss": 0.8283, + "step": 6450 + }, + { + "epoch": 0.13876358637281436, + "grad_norm": 0.5933587203351999, + "learning_rate": 1.9176853083597257e-05, + "loss": 0.8366, + "step": 6460 + }, + { + "epoch": 0.13897839068608497, + "grad_norm": 0.679493941830241, + "learning_rate": 1.9174142698018864e-05, + "loss": 0.8439, + "step": 6470 + }, + { + "epoch": 0.1391931949993556, + "grad_norm": 0.6027286672996572, + "learning_rate": 1.9171428049694082e-05, + "loss": 0.8181, + "step": 6480 + }, + { + "epoch": 0.1394079993126262, + "grad_norm": 0.6104128555070933, + "learning_rate": 1.916870913988427e-05, + "loss": 0.8276, + "step": 6490 + }, + { + "epoch": 0.1396228036258968, + "grad_norm": 0.6013008901028725, + "learning_rate": 1.9165985969852757e-05, + "loss": 0.8274, + "step": 6500 + }, + { + "epoch": 0.1398376079391674, + "grad_norm": 0.6122352927576057, + "learning_rate": 1.916325854086486e-05, + "loss": 0.824, + "step": 6510 + }, + { + "epoch": 0.14005241225243803, + "grad_norm": 0.6300545859500599, + "learning_rate": 1.916052685418788e-05, + "loss": 0.8308, + "step": 6520 + }, + { + "epoch": 0.14026721656570865, + "grad_norm": 0.583338621183778, + "learning_rate": 1.9157790911091082e-05, + "loss": 0.8186, + "step": 6530 + }, + { + "epoch": 0.14048202087897926, + "grad_norm": 0.5947321272096432, + 
"learning_rate": 1.9155050712845722e-05, + "loss": 0.8141, + "step": 6540 + }, + { + "epoch": 0.14069682519224985, + "grad_norm": 0.6508513713242783, + "learning_rate": 1.915230626072502e-05, + "loss": 0.8241, + "step": 6550 + }, + { + "epoch": 0.14091162950552047, + "grad_norm": 0.5902491680834664, + "learning_rate": 1.914955755600419e-05, + "loss": 0.812, + "step": 6560 + }, + { + "epoch": 0.14112643381879109, + "grad_norm": 0.6223882649259539, + "learning_rate": 1.91468045999604e-05, + "loss": 0.8205, + "step": 6570 + }, + { + "epoch": 0.1413412381320617, + "grad_norm": 0.6039015945397973, + "learning_rate": 1.9144047393872818e-05, + "loss": 0.8254, + "step": 6580 + }, + { + "epoch": 0.1415560424453323, + "grad_norm": 4.480885455321173, + "learning_rate": 1.9141285939022563e-05, + "loss": 0.8123, + "step": 6590 + }, + { + "epoch": 0.1417708467586029, + "grad_norm": 0.6004465875420536, + "learning_rate": 1.9138520236692747e-05, + "loss": 0.8316, + "step": 6600 + }, + { + "epoch": 0.14198565107187353, + "grad_norm": 0.5877543081764919, + "learning_rate": 1.9135750288168446e-05, + "loss": 0.8194, + "step": 6610 + }, + { + "epoch": 0.14220045538514414, + "grad_norm": 0.6060063198959242, + "learning_rate": 1.9132976094736707e-05, + "loss": 0.8122, + "step": 6620 + }, + { + "epoch": 0.14241525969841473, + "grad_norm": 0.6105433238582144, + "learning_rate": 1.9130197657686555e-05, + "loss": 0.8144, + "step": 6630 + }, + { + "epoch": 0.14263006401168535, + "grad_norm": 0.5854211970675489, + "learning_rate": 1.9127414978308987e-05, + "loss": 0.826, + "step": 6640 + }, + { + "epoch": 0.14284486832495596, + "grad_norm": 0.6132307714509498, + "learning_rate": 1.9124628057896972e-05, + "loss": 0.8096, + "step": 6650 + }, + { + "epoch": 0.14305967263822658, + "grad_norm": 0.6097804703131902, + "learning_rate": 1.912183689774544e-05, + "loss": 0.8137, + "step": 6660 + }, + { + "epoch": 0.1432744769514972, + "grad_norm": 0.5755382542472285, + "learning_rate": 1.91190414991513e-05, + "loss": 0.8191, + "step": 6670 + }, + { + "epoch": 0.1434892812647678, + "grad_norm": 0.6077737497744028, + "learning_rate": 1.9116241863413433e-05, + "loss": 0.8327, + "step": 6680 + }, + { + "epoch": 0.1437040855780384, + "grad_norm": 0.6330293724055359, + "learning_rate": 1.9113437991832678e-05, + "loss": 0.8159, + "step": 6690 + }, + { + "epoch": 0.14391888989130902, + "grad_norm": 0.6019952588522437, + "learning_rate": 1.911062988571185e-05, + "loss": 0.8103, + "step": 6700 + }, + { + "epoch": 0.14413369420457964, + "grad_norm": 0.6157593320641529, + "learning_rate": 1.9107817546355726e-05, + "loss": 0.8192, + "step": 6710 + }, + { + "epoch": 0.14434849851785023, + "grad_norm": 0.6262865372100354, + "learning_rate": 1.9105000975071062e-05, + "loss": 0.8102, + "step": 6720 + }, + { + "epoch": 0.14456330283112084, + "grad_norm": 0.598604944782719, + "learning_rate": 1.9102180173166565e-05, + "loss": 0.8178, + "step": 6730 + }, + { + "epoch": 0.14477810714439146, + "grad_norm": 0.6130319012297948, + "learning_rate": 1.909935514195292e-05, + "loss": 0.8146, + "step": 6740 + }, + { + "epoch": 0.14499291145766208, + "grad_norm": 0.6051680516635761, + "learning_rate": 1.9096525882742766e-05, + "loss": 0.8243, + "step": 6750 + }, + { + "epoch": 0.1452077157709327, + "grad_norm": 0.5945658988091994, + "learning_rate": 1.909369239685072e-05, + "loss": 0.8215, + "step": 6760 + }, + { + "epoch": 0.14542252008420328, + "grad_norm": 0.6014406118512295, + "learning_rate": 1.9090854685593344e-05, + "loss": 0.8152, + "step": 6770 + }, 
+ { + "epoch": 0.1456373243974739, + "grad_norm": 0.6069056436911239, + "learning_rate": 1.9088012750289185e-05, + "loss": 0.8089, + "step": 6780 + }, + { + "epoch": 0.14585212871074452, + "grad_norm": 0.6384017208108504, + "learning_rate": 1.908516659225874e-05, + "loss": 0.8185, + "step": 6790 + }, + { + "epoch": 0.14606693302401513, + "grad_norm": 0.6183552826731581, + "learning_rate": 1.9082316212824467e-05, + "loss": 0.829, + "step": 6800 + }, + { + "epoch": 0.14628173733728572, + "grad_norm": 0.6033726613256812, + "learning_rate": 1.9079461613310793e-05, + "loss": 0.8368, + "step": 6810 + }, + { + "epoch": 0.14649654165055634, + "grad_norm": 0.6767597702311462, + "learning_rate": 1.90766027950441e-05, + "loss": 0.8312, + "step": 6820 + }, + { + "epoch": 0.14671134596382696, + "grad_norm": 0.6047536333583156, + "learning_rate": 1.9073739759352728e-05, + "loss": 0.8334, + "step": 6830 + }, + { + "epoch": 0.14692615027709757, + "grad_norm": 0.5868050666630678, + "learning_rate": 1.9070872507566988e-05, + "loss": 0.8243, + "step": 6840 + }, + { + "epoch": 0.14714095459036816, + "grad_norm": 0.5987664541670297, + "learning_rate": 1.906800104101914e-05, + "loss": 0.8259, + "step": 6850 + }, + { + "epoch": 0.14735575890363878, + "grad_norm": 0.6171104544958446, + "learning_rate": 1.9065125361043403e-05, + "loss": 0.8265, + "step": 6860 + }, + { + "epoch": 0.1475705632169094, + "grad_norm": 0.5898285999163112, + "learning_rate": 1.9062245468975958e-05, + "loss": 0.8208, + "step": 6870 + }, + { + "epoch": 0.14778536753018, + "grad_norm": 0.6178554712481548, + "learning_rate": 1.9059361366154937e-05, + "loss": 0.8269, + "step": 6880 + }, + { + "epoch": 0.14800017184345063, + "grad_norm": 0.5673710580373017, + "learning_rate": 1.9056473053920436e-05, + "loss": 0.8027, + "step": 6890 + }, + { + "epoch": 0.14821497615672122, + "grad_norm": 0.599452012891556, + "learning_rate": 1.9053580533614502e-05, + "loss": 0.841, + "step": 6900 + }, + { + "epoch": 0.14842978046999183, + "grad_norm": 0.5874536149399996, + "learning_rate": 1.905068380658114e-05, + "loss": 0.805, + "step": 6910 + }, + { + "epoch": 0.14864458478326245, + "grad_norm": 0.5970361855515987, + "learning_rate": 1.9047782874166308e-05, + "loss": 0.8067, + "step": 6920 + }, + { + "epoch": 0.14885938909653307, + "grad_norm": 0.6177964172907193, + "learning_rate": 1.9044877737717916e-05, + "loss": 0.8372, + "step": 6930 + }, + { + "epoch": 0.14907419340980366, + "grad_norm": 0.5961958626534212, + "learning_rate": 1.904196839858583e-05, + "loss": 0.8336, + "step": 6940 + }, + { + "epoch": 0.14928899772307427, + "grad_norm": 0.6016838781672733, + "learning_rate": 1.9039054858121872e-05, + "loss": 0.8138, + "step": 6950 + }, + { + "epoch": 0.1495038020363449, + "grad_norm": 0.5772199742586428, + "learning_rate": 1.903613711767981e-05, + "loss": 0.8061, + "step": 6960 + }, + { + "epoch": 0.1497186063496155, + "grad_norm": 0.6091085341764657, + "learning_rate": 1.9033215178615363e-05, + "loss": 0.8292, + "step": 6970 + }, + { + "epoch": 0.14993341066288612, + "grad_norm": 0.6111170798001496, + "learning_rate": 1.903028904228621e-05, + "loss": 0.8123, + "step": 6980 + }, + { + "epoch": 0.1501482149761567, + "grad_norm": 0.641212096959498, + "learning_rate": 1.902735871005197e-05, + "loss": 0.8281, + "step": 6990 + }, + { + "epoch": 0.15036301928942733, + "grad_norm": 0.601677168468839, + "learning_rate": 1.9024424183274216e-05, + "loss": 0.8302, + "step": 7000 + }, + { + "epoch": 0.15057782360269795, + "grad_norm": 0.6001519478847499, + 
"learning_rate": 1.9021485463316468e-05, + "loss": 0.8328, + "step": 7010 + }, + { + "epoch": 0.15079262791596856, + "grad_norm": 0.590756165100162, + "learning_rate": 1.9018542551544205e-05, + "loss": 0.8283, + "step": 7020 + }, + { + "epoch": 0.15100743222923915, + "grad_norm": 0.5930250026896874, + "learning_rate": 1.9015595449324837e-05, + "loss": 0.8211, + "step": 7030 + }, + { + "epoch": 0.15122223654250977, + "grad_norm": 0.6342243168511045, + "learning_rate": 1.9012644158027727e-05, + "loss": 0.8136, + "step": 7040 + }, + { + "epoch": 0.15143704085578039, + "grad_norm": 0.6288449639482769, + "learning_rate": 1.900968867902419e-05, + "loss": 0.8161, + "step": 7050 + }, + { + "epoch": 0.151651845169051, + "grad_norm": 0.5885895763623495, + "learning_rate": 1.9006729013687488e-05, + "loss": 0.823, + "step": 7060 + }, + { + "epoch": 0.1518666494823216, + "grad_norm": 0.5945227181413594, + "learning_rate": 1.900376516339282e-05, + "loss": 0.8184, + "step": 7070 + }, + { + "epoch": 0.1520814537955922, + "grad_norm": 0.6038107671354757, + "learning_rate": 1.9000797129517326e-05, + "loss": 0.8338, + "step": 7080 + }, + { + "epoch": 0.15229625810886283, + "grad_norm": 0.6179750378509373, + "learning_rate": 1.8997824913440102e-05, + "loss": 0.819, + "step": 7090 + }, + { + "epoch": 0.15251106242213344, + "grad_norm": 0.580594427759617, + "learning_rate": 1.8994848516542187e-05, + "loss": 0.8116, + "step": 7100 + }, + { + "epoch": 0.15272586673540406, + "grad_norm": 0.6629610839791461, + "learning_rate": 1.899186794020655e-05, + "loss": 0.8164, + "step": 7110 + }, + { + "epoch": 0.15294067104867465, + "grad_norm": 0.630038582706391, + "learning_rate": 1.898888318581811e-05, + "loss": 0.8197, + "step": 7120 + }, + { + "epoch": 0.15315547536194526, + "grad_norm": 0.6060433882973041, + "learning_rate": 1.8985894254763734e-05, + "loss": 0.8345, + "step": 7130 + }, + { + "epoch": 0.15337027967521588, + "grad_norm": 0.5763806285603041, + "learning_rate": 1.8982901148432214e-05, + "loss": 0.8195, + "step": 7140 + }, + { + "epoch": 0.1535850839884865, + "grad_norm": 0.5858697966874623, + "learning_rate": 1.89799038682143e-05, + "loss": 0.8108, + "step": 7150 + }, + { + "epoch": 0.1537998883017571, + "grad_norm": 0.6192381232542434, + "learning_rate": 1.8976902415502664e-05, + "loss": 0.8096, + "step": 7160 + }, + { + "epoch": 0.1540146926150277, + "grad_norm": 0.5775749857070134, + "learning_rate": 1.8973896791691925e-05, + "loss": 0.8152, + "step": 7170 + }, + { + "epoch": 0.15422949692829832, + "grad_norm": 0.6222213814962546, + "learning_rate": 1.8970886998178648e-05, + "loss": 0.8409, + "step": 7180 + }, + { + "epoch": 0.15444430124156894, + "grad_norm": 0.6088269913390629, + "learning_rate": 1.8967873036361316e-05, + "loss": 0.8327, + "step": 7190 + }, + { + "epoch": 0.15465910555483955, + "grad_norm": 0.5930210210719912, + "learning_rate": 1.896485490764037e-05, + "loss": 0.8115, + "step": 7200 + }, + { + "epoch": 0.15487390986811014, + "grad_norm": 0.6000706146439299, + "learning_rate": 1.8961832613418173e-05, + "loss": 0.8238, + "step": 7210 + }, + { + "epoch": 0.15508871418138076, + "grad_norm": 0.6006800299840814, + "learning_rate": 1.895880615509903e-05, + "loss": 0.8145, + "step": 7220 + }, + { + "epoch": 0.15530351849465138, + "grad_norm": 0.6451495489221977, + "learning_rate": 1.8955775534089168e-05, + "loss": 0.8222, + "step": 7230 + }, + { + "epoch": 0.155518322807922, + "grad_norm": 0.5789768463565536, + "learning_rate": 1.8952740751796776e-05, + "loss": 0.8025, + "step": 7240 + }, + 
{ + "epoch": 0.15573312712119258, + "grad_norm": 0.5853169790059398, + "learning_rate": 1.8949701809631945e-05, + "loss": 0.8204, + "step": 7250 + }, + { + "epoch": 0.1559479314344632, + "grad_norm": 0.5695304385263977, + "learning_rate": 1.894665870900672e-05, + "loss": 0.8125, + "step": 7260 + }, + { + "epoch": 0.15616273574773382, + "grad_norm": 0.6049599632857573, + "learning_rate": 1.8943611451335075e-05, + "loss": 0.8107, + "step": 7270 + }, + { + "epoch": 0.15637754006100443, + "grad_norm": 0.573259462488128, + "learning_rate": 1.89405600380329e-05, + "loss": 0.8135, + "step": 7280 + }, + { + "epoch": 0.15659234437427502, + "grad_norm": 0.5940723497985662, + "learning_rate": 1.893750447051804e-05, + "loss": 0.808, + "step": 7290 + }, + { + "epoch": 0.15680714868754564, + "grad_norm": 0.641894313123313, + "learning_rate": 1.893444475021025e-05, + "loss": 0.8177, + "step": 7300 + }, + { + "epoch": 0.15702195300081626, + "grad_norm": 0.5858730208906369, + "learning_rate": 1.8931380878531228e-05, + "loss": 0.8286, + "step": 7310 + }, + { + "epoch": 0.15723675731408687, + "grad_norm": 0.582476355104475, + "learning_rate": 1.892831285690459e-05, + "loss": 0.8257, + "step": 7320 + }, + { + "epoch": 0.1574515616273575, + "grad_norm": 0.5832307163787261, + "learning_rate": 1.892524068675589e-05, + "loss": 0.8007, + "step": 7330 + }, + { + "epoch": 0.15766636594062808, + "grad_norm": 0.5790768352438785, + "learning_rate": 1.892216436951261e-05, + "loss": 0.7953, + "step": 7340 + }, + { + "epoch": 0.1578811702538987, + "grad_norm": 0.633690146863657, + "learning_rate": 1.8919083906604144e-05, + "loss": 0.8205, + "step": 7350 + }, + { + "epoch": 0.1580959745671693, + "grad_norm": 0.617991296431158, + "learning_rate": 1.891599929946183e-05, + "loss": 0.8098, + "step": 7360 + }, + { + "epoch": 0.15831077888043993, + "grad_norm": 0.5974548816213079, + "learning_rate": 1.8912910549518924e-05, + "loss": 0.8079, + "step": 7370 + }, + { + "epoch": 0.15852558319371052, + "grad_norm": 0.6020174180448247, + "learning_rate": 1.89098176582106e-05, + "loss": 0.8177, + "step": 7380 + }, + { + "epoch": 0.15874038750698113, + "grad_norm": 0.5899662948515287, + "learning_rate": 1.8906720626973975e-05, + "loss": 0.8175, + "step": 7390 + }, + { + "epoch": 0.15895519182025175, + "grad_norm": 0.5872850170519595, + "learning_rate": 1.8903619457248072e-05, + "loss": 0.8005, + "step": 7400 + }, + { + "epoch": 0.15916999613352237, + "grad_norm": 0.5852430036799516, + "learning_rate": 1.890051415047384e-05, + "loss": 0.8219, + "step": 7410 + }, + { + "epoch": 0.15938480044679298, + "grad_norm": 0.5917524465188728, + "learning_rate": 1.8897404708094154e-05, + "loss": 0.8072, + "step": 7420 + }, + { + "epoch": 0.15959960476006357, + "grad_norm": 0.5508421705343692, + "learning_rate": 1.8894291131553817e-05, + "loss": 0.8198, + "step": 7430 + }, + { + "epoch": 0.1598144090733342, + "grad_norm": 0.5852065142165715, + "learning_rate": 1.889117342229954e-05, + "loss": 0.8194, + "step": 7440 + }, + { + "epoch": 0.1600292133866048, + "grad_norm": 0.5801695841837192, + "learning_rate": 1.8888051581779964e-05, + "loss": 0.8203, + "step": 7450 + }, + { + "epoch": 0.16024401769987542, + "grad_norm": 0.602952041149204, + "learning_rate": 1.888492561144564e-05, + "loss": 0.8059, + "step": 7460 + }, + { + "epoch": 0.160458822013146, + "grad_norm": 0.6431449708685523, + "learning_rate": 1.8881795512749046e-05, + "loss": 0.8326, + "step": 7470 + }, + { + "epoch": 0.16067362632641663, + "grad_norm": 0.6196861452682948, + "learning_rate": 
1.887866128714458e-05, + "loss": 0.8457, + "step": 7480 + }, + { + "epoch": 0.16088843063968725, + "grad_norm": 0.5649025093930304, + "learning_rate": 1.887552293608855e-05, + "loss": 0.8113, + "step": 7490 + }, + { + "epoch": 0.16110323495295786, + "grad_norm": 0.63666785388803, + "learning_rate": 1.8872380461039184e-05, + "loss": 0.8264, + "step": 7500 + }, + { + "epoch": 0.16131803926622845, + "grad_norm": 0.6005193455583923, + "learning_rate": 1.8869233863456627e-05, + "loss": 0.819, + "step": 7510 + }, + { + "epoch": 0.16153284357949907, + "grad_norm": 0.6077387623232092, + "learning_rate": 1.8866083144802938e-05, + "loss": 0.8185, + "step": 7520 + }, + { + "epoch": 0.16174764789276969, + "grad_norm": 0.5938972114256622, + "learning_rate": 1.8862928306542093e-05, + "loss": 0.8225, + "step": 7530 + }, + { + "epoch": 0.1619624522060403, + "grad_norm": 0.6156548912606762, + "learning_rate": 1.8859769350139982e-05, + "loss": 0.8263, + "step": 7540 + }, + { + "epoch": 0.16217725651931092, + "grad_norm": 0.6056374183349044, + "learning_rate": 1.8856606277064407e-05, + "loss": 0.8198, + "step": 7550 + }, + { + "epoch": 0.1623920608325815, + "grad_norm": 0.5663370767459714, + "learning_rate": 1.8853439088785084e-05, + "loss": 0.8159, + "step": 7560 + }, + { + "epoch": 0.16260686514585213, + "grad_norm": 0.5775902577797735, + "learning_rate": 1.885026778677364e-05, + "loss": 0.8139, + "step": 7570 + }, + { + "epoch": 0.16282166945912274, + "grad_norm": 0.591831820384521, + "learning_rate": 1.884709237250361e-05, + "loss": 0.8176, + "step": 7580 + }, + { + "epoch": 0.16303647377239336, + "grad_norm": 0.5987459436284528, + "learning_rate": 1.884391284745045e-05, + "loss": 0.8246, + "step": 7590 + }, + { + "epoch": 0.16325127808566395, + "grad_norm": 0.5985317736090139, + "learning_rate": 1.8840729213091514e-05, + "loss": 0.8211, + "step": 7600 + }, + { + "epoch": 0.16346608239893456, + "grad_norm": 0.5967857623504043, + "learning_rate": 1.8837541470906076e-05, + "loss": 0.8068, + "step": 7610 + }, + { + "epoch": 0.16368088671220518, + "grad_norm": 0.5809065765322244, + "learning_rate": 1.883434962237531e-05, + "loss": 0.8008, + "step": 7620 + }, + { + "epoch": 0.1638956910254758, + "grad_norm": 0.5851551177907041, + "learning_rate": 1.8831153668982304e-05, + "loss": 0.8204, + "step": 7630 + }, + { + "epoch": 0.16411049533874641, + "grad_norm": 0.5917914776651239, + "learning_rate": 1.882795361221205e-05, + "loss": 0.8209, + "step": 7640 + }, + { + "epoch": 0.164325299652017, + "grad_norm": 0.6224876855421713, + "learning_rate": 1.882474945355145e-05, + "loss": 0.7925, + "step": 7650 + }, + { + "epoch": 0.16454010396528762, + "grad_norm": 0.5603554031670195, + "learning_rate": 1.8821541194489307e-05, + "loss": 0.8085, + "step": 7660 + }, + { + "epoch": 0.16475490827855824, + "grad_norm": 0.5489520906362242, + "learning_rate": 1.8818328836516334e-05, + "loss": 0.8161, + "step": 7670 + }, + { + "epoch": 0.16496971259182885, + "grad_norm": 0.6169881588767844, + "learning_rate": 1.8815112381125146e-05, + "loss": 0.8297, + "step": 7680 + }, + { + "epoch": 0.16518451690509944, + "grad_norm": 0.5901587326877425, + "learning_rate": 1.8811891829810257e-05, + "loss": 0.8172, + "step": 7690 + }, + { + "epoch": 0.16539932121837006, + "grad_norm": 0.561361319802584, + "learning_rate": 1.8808667184068098e-05, + "loss": 0.8172, + "step": 7700 + }, + { + "epoch": 0.16561412553164068, + "grad_norm": 0.5859380687978089, + "learning_rate": 1.880543844539699e-05, + "loss": 0.8145, + "step": 7710 + }, + { + "epoch": 
0.1658289298449113, + "grad_norm": 0.5693560253720011, + "learning_rate": 1.880220561529716e-05, + "loss": 0.8259, + "step": 7720 + }, + { + "epoch": 0.16604373415818188, + "grad_norm": 0.6279685202690973, + "learning_rate": 1.8798968695270735e-05, + "loss": 0.8073, + "step": 7730 + }, + { + "epoch": 0.1662585384714525, + "grad_norm": 0.5868609499267986, + "learning_rate": 1.879572768682174e-05, + "loss": 0.8045, + "step": 7740 + }, + { + "epoch": 0.16647334278472312, + "grad_norm": 0.5949413009843553, + "learning_rate": 1.8792482591456115e-05, + "loss": 0.821, + "step": 7750 + }, + { + "epoch": 0.16668814709799373, + "grad_norm": 0.5734770401983357, + "learning_rate": 1.8789233410681675e-05, + "loss": 0.8037, + "step": 7760 + }, + { + "epoch": 0.16690295141126435, + "grad_norm": 0.6165809896599992, + "learning_rate": 1.8785980146008146e-05, + "loss": 0.8229, + "step": 7770 + }, + { + "epoch": 0.16711775572453494, + "grad_norm": 0.5747676549015432, + "learning_rate": 1.8782722798947154e-05, + "loss": 0.8145, + "step": 7780 + }, + { + "epoch": 0.16733256003780556, + "grad_norm": 0.587504015936523, + "learning_rate": 1.8779461371012224e-05, + "loss": 0.8096, + "step": 7790 + }, + { + "epoch": 0.16754736435107617, + "grad_norm": 0.6174090349249043, + "learning_rate": 1.877619586371876e-05, + "loss": 0.8142, + "step": 7800 + }, + { + "epoch": 0.1677621686643468, + "grad_norm": 0.5847105333987745, + "learning_rate": 1.8772926278584077e-05, + "loss": 0.8247, + "step": 7810 + }, + { + "epoch": 0.16797697297761738, + "grad_norm": 0.588578056608747, + "learning_rate": 1.8769652617127388e-05, + "loss": 0.8043, + "step": 7820 + }, + { + "epoch": 0.168191777290888, + "grad_norm": 0.577693735778559, + "learning_rate": 1.8766374880869785e-05, + "loss": 0.8224, + "step": 7830 + }, + { + "epoch": 0.1684065816041586, + "grad_norm": 0.5827121728285275, + "learning_rate": 1.8763093071334263e-05, + "loss": 0.8016, + "step": 7840 + }, + { + "epoch": 0.16862138591742923, + "grad_norm": 0.5897150773152798, + "learning_rate": 1.8759807190045715e-05, + "loss": 0.8078, + "step": 7850 + }, + { + "epoch": 0.16883619023069985, + "grad_norm": 0.5759666753347583, + "learning_rate": 1.8756517238530904e-05, + "loss": 0.8072, + "step": 7860 + }, + { + "epoch": 0.16905099454397043, + "grad_norm": 0.5818077020634499, + "learning_rate": 1.8753223218318515e-05, + "loss": 0.8117, + "step": 7870 + }, + { + "epoch": 0.16926579885724105, + "grad_norm": 0.6076398421069771, + "learning_rate": 1.87499251309391e-05, + "loss": 0.8033, + "step": 7880 + }, + { + "epoch": 0.16948060317051167, + "grad_norm": 0.5810094500721327, + "learning_rate": 1.874662297792511e-05, + "loss": 0.8123, + "step": 7890 + }, + { + "epoch": 0.16969540748378228, + "grad_norm": 0.6001843216635706, + "learning_rate": 1.874331676081088e-05, + "loss": 0.8239, + "step": 7900 + }, + { + "epoch": 0.16991021179705287, + "grad_norm": 0.6000314594312535, + "learning_rate": 1.874000648113264e-05, + "loss": 0.8276, + "step": 7910 + }, + { + "epoch": 0.1701250161103235, + "grad_norm": 0.5742460502628838, + "learning_rate": 1.8736692140428506e-05, + "loss": 0.8053, + "step": 7920 + }, + { + "epoch": 0.1703398204235941, + "grad_norm": 0.5832371696394795, + "learning_rate": 1.873337374023848e-05, + "loss": 0.8116, + "step": 7930 + }, + { + "epoch": 0.17055462473686472, + "grad_norm": 0.5845557501110522, + "learning_rate": 1.8730051282104446e-05, + "loss": 0.8221, + "step": 7940 + }, + { + "epoch": 0.1707694290501353, + "grad_norm": 0.6092794931268248, + "learning_rate": 
1.8726724767570178e-05, + "loss": 0.8034, + "step": 7950 + }, + { + "epoch": 0.17098423336340593, + "grad_norm": 0.5941748660659438, + "learning_rate": 1.8723394198181333e-05, + "loss": 0.7995, + "step": 7960 + }, + { + "epoch": 0.17119903767667655, + "grad_norm": 0.5742787477493947, + "learning_rate": 1.8720059575485462e-05, + "loss": 0.8093, + "step": 7970 + }, + { + "epoch": 0.17141384198994716, + "grad_norm": 0.5862196213050433, + "learning_rate": 1.8716720901031983e-05, + "loss": 0.82, + "step": 7980 + }, + { + "epoch": 0.17162864630321778, + "grad_norm": 0.595590568398061, + "learning_rate": 1.8713378176372198e-05, + "loss": 0.8208, + "step": 7990 + }, + { + "epoch": 0.17184345061648837, + "grad_norm": 0.6014060853390284, + "learning_rate": 1.871003140305931e-05, + "loss": 0.8243, + "step": 8000 + }, + { + "epoch": 0.17205825492975899, + "grad_norm": 0.5885951381362904, + "learning_rate": 1.870668058264838e-05, + "loss": 0.8184, + "step": 8010 + }, + { + "epoch": 0.1722730592430296, + "grad_norm": 0.5675767542282781, + "learning_rate": 1.870332571669637e-05, + "loss": 0.8353, + "step": 8020 + }, + { + "epoch": 0.17248786355630022, + "grad_norm": 0.5885374361788073, + "learning_rate": 1.86999668067621e-05, + "loss": 0.8067, + "step": 8030 + }, + { + "epoch": 0.1727026678695708, + "grad_norm": 0.5725047507144028, + "learning_rate": 1.869660385440629e-05, + "loss": 0.809, + "step": 8040 + }, + { + "epoch": 0.17291747218284143, + "grad_norm": 0.5669767417801742, + "learning_rate": 1.8693236861191524e-05, + "loss": 0.7987, + "step": 8050 + }, + { + "epoch": 0.17313227649611204, + "grad_norm": 0.6015883976560725, + "learning_rate": 1.8689865828682266e-05, + "loss": 0.8249, + "step": 8060 + }, + { + "epoch": 0.17334708080938266, + "grad_norm": 0.5761246694287212, + "learning_rate": 1.868649075844487e-05, + "loss": 0.8161, + "step": 8070 + }, + { + "epoch": 0.17356188512265328, + "grad_norm": 0.5969047559623099, + "learning_rate": 1.8683111652047543e-05, + "loss": 0.8361, + "step": 8080 + }, + { + "epoch": 0.17377668943592386, + "grad_norm": 0.5929471728979538, + "learning_rate": 1.8679728511060385e-05, + "loss": 0.8157, + "step": 8090 + }, + { + "epoch": 0.17399149374919448, + "grad_norm": 0.5709547291713754, + "learning_rate": 1.8676341337055367e-05, + "loss": 0.7972, + "step": 8100 + }, + { + "epoch": 0.1742062980624651, + "grad_norm": 0.5792794867839971, + "learning_rate": 1.867295013160633e-05, + "loss": 0.8236, + "step": 8110 + }, + { + "epoch": 0.17442110237573571, + "grad_norm": 0.580543068320473, + "learning_rate": 1.866955489628899e-05, + "loss": 0.8322, + "step": 8120 + }, + { + "epoch": 0.1746359066890063, + "grad_norm": 0.5795792932547941, + "learning_rate": 1.8666155632680938e-05, + "loss": 0.8128, + "step": 8130 + }, + { + "epoch": 0.17485071100227692, + "grad_norm": 0.5800071115326852, + "learning_rate": 1.8662752342361633e-05, + "loss": 0.8164, + "step": 8140 + }, + { + "epoch": 0.17506551531554754, + "grad_norm": 0.5845246581742584, + "learning_rate": 1.865934502691241e-05, + "loss": 0.8117, + "step": 8150 + }, + { + "epoch": 0.17528031962881815, + "grad_norm": 0.5894935518444326, + "learning_rate": 1.865593368791647e-05, + "loss": 0.8063, + "step": 8160 + }, + { + "epoch": 0.17549512394208877, + "grad_norm": 0.5877616770415447, + "learning_rate": 1.8652518326958882e-05, + "loss": 0.8112, + "step": 8170 + }, + { + "epoch": 0.17570992825535936, + "grad_norm": 0.6072757728168394, + "learning_rate": 1.8649098945626588e-05, + "loss": 0.8029, + "step": 8180 + }, + { + "epoch": 
0.17592473256862998, + "grad_norm": 0.6010925475932754, + "learning_rate": 1.8645675545508397e-05, + "loss": 0.8162, + "step": 8190 + }, + { + "epoch": 0.1761395368819006, + "grad_norm": 0.5718419491303891, + "learning_rate": 1.864224812819498e-05, + "loss": 0.8124, + "step": 8200 + }, + { + "epoch": 0.1763543411951712, + "grad_norm": 0.5527383739528845, + "learning_rate": 1.863881669527889e-05, + "loss": 0.808, + "step": 8210 + }, + { + "epoch": 0.1765691455084418, + "grad_norm": 0.573823793660002, + "learning_rate": 1.8635381248354525e-05, + "loss": 0.8089, + "step": 8220 + }, + { + "epoch": 0.17678394982171242, + "grad_norm": 0.5638340296138096, + "learning_rate": 1.8631941789018162e-05, + "loss": 0.7961, + "step": 8230 + }, + { + "epoch": 0.17699875413498303, + "grad_norm": 0.5646934644189842, + "learning_rate": 1.862849831886794e-05, + "loss": 0.8187, + "step": 8240 + }, + { + "epoch": 0.17721355844825365, + "grad_norm": 0.5729549283565889, + "learning_rate": 1.862505083950386e-05, + "loss": 0.7946, + "step": 8250 + }, + { + "epoch": 0.17742836276152424, + "grad_norm": 0.5697299724643171, + "learning_rate": 1.8621599352527783e-05, + "loss": 0.7989, + "step": 8260 + }, + { + "epoch": 0.17764316707479486, + "grad_norm": 0.5742935431980787, + "learning_rate": 1.8618143859543436e-05, + "loss": 0.8068, + "step": 8270 + }, + { + "epoch": 0.17785797138806547, + "grad_norm": 0.6022944664204131, + "learning_rate": 1.861468436215641e-05, + "loss": 0.8316, + "step": 8280 + }, + { + "epoch": 0.1780727757013361, + "grad_norm": 0.6181521941813161, + "learning_rate": 1.861122086197415e-05, + "loss": 0.8297, + "step": 8290 + }, + { + "epoch": 0.1782875800146067, + "grad_norm": 0.5854718908960673, + "learning_rate": 1.8607753360605968e-05, + "loss": 0.8139, + "step": 8300 + }, + { + "epoch": 0.1785023843278773, + "grad_norm": 0.5677926977124813, + "learning_rate": 1.8604281859663027e-05, + "loss": 0.8067, + "step": 8310 + }, + { + "epoch": 0.1787171886411479, + "grad_norm": 1.2822660953314349, + "learning_rate": 1.8600806360758355e-05, + "loss": 0.7901, + "step": 8320 + }, + { + "epoch": 0.17893199295441853, + "grad_norm": 0.561871053213264, + "learning_rate": 1.8597326865506838e-05, + "loss": 0.8073, + "step": 8330 + }, + { + "epoch": 0.17914679726768915, + "grad_norm": 0.5900981652272886, + "learning_rate": 1.8593843375525205e-05, + "loss": 0.8019, + "step": 8340 + }, + { + "epoch": 0.17936160158095973, + "grad_norm": 0.6033017106283523, + "learning_rate": 1.8590355892432068e-05, + "loss": 0.8129, + "step": 8350 + }, + { + "epoch": 0.17957640589423035, + "grad_norm": 0.5755568064961115, + "learning_rate": 1.858686441784787e-05, + "loss": 0.825, + "step": 8360 + }, + { + "epoch": 0.17979121020750097, + "grad_norm": 0.5837961244746128, + "learning_rate": 1.8583368953394917e-05, + "loss": 0.8152, + "step": 8370 + }, + { + "epoch": 0.18000601452077158, + "grad_norm": 0.5830801987993122, + "learning_rate": 1.857986950069737e-05, + "loss": 0.7978, + "step": 8380 + }, + { + "epoch": 0.1802208188340422, + "grad_norm": 0.6071324798915776, + "learning_rate": 1.8576366061381246e-05, + "loss": 0.8173, + "step": 8390 + }, + { + "epoch": 0.1804356231473128, + "grad_norm": 0.5787242342053142, + "learning_rate": 1.857285863707441e-05, + "loss": 0.8266, + "step": 8400 + }, + { + "epoch": 0.1806504274605834, + "grad_norm": 0.5830266332748674, + "learning_rate": 1.8569347229406575e-05, + "loss": 0.8196, + "step": 8410 + }, + { + "epoch": 0.18086523177385402, + "grad_norm": 0.5826514559974909, + "learning_rate": 
1.8565831840009312e-05, + "loss": 0.7978, + "step": 8420 + }, + { + "epoch": 0.18108003608712464, + "grad_norm": 0.5647693782228277, + "learning_rate": 1.856231247051604e-05, + "loss": 0.8152, + "step": 8430 + }, + { + "epoch": 0.18129484040039523, + "grad_norm": 0.6171161964315975, + "learning_rate": 1.8558789122562024e-05, + "loss": 0.805, + "step": 8440 + }, + { + "epoch": 0.18150964471366585, + "grad_norm": 0.574924762582272, + "learning_rate": 1.8555261797784387e-05, + "loss": 0.825, + "step": 8450 + }, + { + "epoch": 0.18172444902693646, + "grad_norm": 0.5849907741058937, + "learning_rate": 1.8551730497822086e-05, + "loss": 0.813, + "step": 8460 + }, + { + "epoch": 0.18193925334020708, + "grad_norm": 0.6006674682429172, + "learning_rate": 1.8548195224315933e-05, + "loss": 0.8058, + "step": 8470 + }, + { + "epoch": 0.18215405765347767, + "grad_norm": 0.5694938669138162, + "learning_rate": 1.854465597890859e-05, + "loss": 0.8128, + "step": 8480 + }, + { + "epoch": 0.18236886196674829, + "grad_norm": 0.5800620824875963, + "learning_rate": 1.8541112763244554e-05, + "loss": 0.8, + "step": 8490 + }, + { + "epoch": 0.1825836662800189, + "grad_norm": 0.6003822667618681, + "learning_rate": 1.8537565578970182e-05, + "loss": 0.8125, + "step": 8500 + }, + { + "epoch": 0.18279847059328952, + "grad_norm": 0.5805075835998008, + "learning_rate": 1.8534014427733655e-05, + "loss": 0.814, + "step": 8510 + }, + { + "epoch": 0.18301327490656014, + "grad_norm": 0.608140890949641, + "learning_rate": 1.8530459311185017e-05, + "loss": 0.8074, + "step": 8520 + }, + { + "epoch": 0.18322807921983073, + "grad_norm": 0.5939316987741534, + "learning_rate": 1.852690023097614e-05, + "loss": 0.8083, + "step": 8530 + }, + { + "epoch": 0.18344288353310134, + "grad_norm": 0.5634001132722806, + "learning_rate": 1.8523337188760747e-05, + "loss": 0.814, + "step": 8540 + }, + { + "epoch": 0.18365768784637196, + "grad_norm": 0.6076400115964997, + "learning_rate": 1.8519770186194393e-05, + "loss": 0.8131, + "step": 8550 + }, + { + "epoch": 0.18387249215964258, + "grad_norm": 0.5804582235616338, + "learning_rate": 1.851619922493448e-05, + "loss": 0.8026, + "step": 8560 + }, + { + "epoch": 0.18408729647291316, + "grad_norm": 0.5516633970987541, + "learning_rate": 1.8512624306640254e-05, + "loss": 0.8008, + "step": 8570 + }, + { + "epoch": 0.18430210078618378, + "grad_norm": 0.5977108054692924, + "learning_rate": 1.850904543297278e-05, + "loss": 0.8093, + "step": 8580 + }, + { + "epoch": 0.1845169050994544, + "grad_norm": 0.5751584053301783, + "learning_rate": 1.850546260559499e-05, + "loss": 0.8069, + "step": 8590 + }, + { + "epoch": 0.18473170941272501, + "grad_norm": 0.5683030910307522, + "learning_rate": 1.8501875826171632e-05, + "loss": 0.8088, + "step": 8600 + }, + { + "epoch": 0.18494651372599563, + "grad_norm": 0.5981914621652948, + "learning_rate": 1.8498285096369287e-05, + "loss": 0.815, + "step": 8610 + }, + { + "epoch": 0.18516131803926622, + "grad_norm": 0.5987116882619, + "learning_rate": 1.849469041785639e-05, + "loss": 0.8247, + "step": 8620 + }, + { + "epoch": 0.18537612235253684, + "grad_norm": 0.598340119849919, + "learning_rate": 1.8491091792303203e-05, + "loss": 0.7891, + "step": 8630 + }, + { + "epoch": 0.18559092666580745, + "grad_norm": 0.5874259069859069, + "learning_rate": 1.8487489221381807e-05, + "loss": 0.7963, + "step": 8640 + }, + { + "epoch": 0.18580573097907807, + "grad_norm": 0.60338871634641, + "learning_rate": 1.848388270676614e-05, + "loss": 0.8176, + "step": 8650 + }, + { + "epoch": 
0.18602053529234866, + "grad_norm": 0.6382989451380708, + "learning_rate": 1.848027225013196e-05, + "loss": 0.8034, + "step": 8660 + }, + { + "epoch": 0.18623533960561928, + "grad_norm": 0.6100978833111902, + "learning_rate": 1.8476657853156854e-05, + "loss": 0.8295, + "step": 8670 + }, + { + "epoch": 0.1864501439188899, + "grad_norm": 0.5847210050409527, + "learning_rate": 1.8473039517520245e-05, + "loss": 0.8093, + "step": 8680 + }, + { + "epoch": 0.1866649482321605, + "grad_norm": 0.5720679964643604, + "learning_rate": 1.846941724490339e-05, + "loss": 0.7988, + "step": 8690 + }, + { + "epoch": 0.1868797525454311, + "grad_norm": 0.6251350978467913, + "learning_rate": 1.8465791036989367e-05, + "loss": 0.7998, + "step": 8700 + }, + { + "epoch": 0.18709455685870172, + "grad_norm": 0.5765117453954964, + "learning_rate": 1.8462160895463087e-05, + "loss": 0.8121, + "step": 8710 + }, + { + "epoch": 0.18730936117197233, + "grad_norm": 0.5652462786030213, + "learning_rate": 1.8458526822011283e-05, + "loss": 0.8102, + "step": 8720 + }, + { + "epoch": 0.18752416548524295, + "grad_norm": 0.5618404540951464, + "learning_rate": 1.8454888818322527e-05, + "loss": 0.7841, + "step": 8730 + }, + { + "epoch": 0.18773896979851357, + "grad_norm": 0.5715079105467191, + "learning_rate": 1.8451246886087207e-05, + "loss": 0.8019, + "step": 8740 + }, + { + "epoch": 0.18795377411178416, + "grad_norm": 0.5786067429900265, + "learning_rate": 1.8447601026997534e-05, + "loss": 0.8185, + "step": 8750 + }, + { + "epoch": 0.18816857842505477, + "grad_norm": 0.5618942537054571, + "learning_rate": 1.8443951242747558e-05, + "loss": 0.8122, + "step": 8760 + }, + { + "epoch": 0.1883833827383254, + "grad_norm": 0.5602005480295104, + "learning_rate": 1.8440297535033137e-05, + "loss": 0.8044, + "step": 8770 + }, + { + "epoch": 0.188598187051596, + "grad_norm": 0.5526135225587552, + "learning_rate": 1.8436639905551966e-05, + "loss": 0.8132, + "step": 8780 + }, + { + "epoch": 0.1888129913648666, + "grad_norm": 0.5708051150082964, + "learning_rate": 1.8432978356003544e-05, + "loss": 0.7984, + "step": 8790 + }, + { + "epoch": 0.1890277956781372, + "grad_norm": 0.5845241545642078, + "learning_rate": 1.842931288808921e-05, + "loss": 0.8215, + "step": 8800 + }, + { + "epoch": 0.18924259999140783, + "grad_norm": 0.5747369107161527, + "learning_rate": 1.8425643503512115e-05, + "loss": 0.8027, + "step": 8810 + }, + { + "epoch": 0.18945740430467845, + "grad_norm": 0.5726755466246907, + "learning_rate": 1.842197020397723e-05, + "loss": 0.7891, + "step": 8820 + }, + { + "epoch": 0.18967220861794906, + "grad_norm": 0.5757944229048639, + "learning_rate": 1.841829299119135e-05, + "loss": 0.8064, + "step": 8830 + }, + { + "epoch": 0.18988701293121965, + "grad_norm": 0.5911903584887567, + "learning_rate": 1.8414611866863078e-05, + "loss": 0.8239, + "step": 8840 + }, + { + "epoch": 0.19010181724449027, + "grad_norm": 0.5784649631481444, + "learning_rate": 1.8410926832702842e-05, + "loss": 0.8074, + "step": 8850 + }, + { + "epoch": 0.19031662155776088, + "grad_norm": 0.5930932814488833, + "learning_rate": 1.8407237890422888e-05, + "loss": 0.8226, + "step": 8860 + }, + { + "epoch": 0.1905314258710315, + "grad_norm": 0.5579023925128831, + "learning_rate": 1.8403545041737275e-05, + "loss": 0.8008, + "step": 8870 + }, + { + "epoch": 0.1907462301843021, + "grad_norm": 0.5988868685379616, + "learning_rate": 1.8399848288361878e-05, + "loss": 0.8202, + "step": 8880 + }, + { + "epoch": 0.1909610344975727, + "grad_norm": 0.5633872393293222, + "learning_rate": 
1.8396147632014383e-05, + "loss": 0.7865, + "step": 8890 + }, + { + "epoch": 0.19117583881084332, + "grad_norm": 0.579775369575011, + "learning_rate": 1.8392443074414292e-05, + "loss": 0.8081, + "step": 8900 + }, + { + "epoch": 0.19139064312411394, + "grad_norm": 0.5761284131901169, + "learning_rate": 1.8388734617282926e-05, + "loss": 0.7957, + "step": 8910 + }, + { + "epoch": 0.19160544743738453, + "grad_norm": 0.5790288426152821, + "learning_rate": 1.8385022262343405e-05, + "loss": 0.8016, + "step": 8920 + }, + { + "epoch": 0.19182025175065515, + "grad_norm": 0.5835056469508209, + "learning_rate": 1.8381306011320673e-05, + "loss": 0.8192, + "step": 8930 + }, + { + "epoch": 0.19203505606392576, + "grad_norm": 0.5600680666644086, + "learning_rate": 1.837758586594147e-05, + "loss": 0.8041, + "step": 8940 + }, + { + "epoch": 0.19224986037719638, + "grad_norm": 0.6130002119585894, + "learning_rate": 1.8373861827934364e-05, + "loss": 0.8173, + "step": 8950 + }, + { + "epoch": 0.192464664690467, + "grad_norm": 0.5673926783038583, + "learning_rate": 1.8370133899029717e-05, + "loss": 0.8119, + "step": 8960 + }, + { + "epoch": 0.19267946900373759, + "grad_norm": 0.5917000843066154, + "learning_rate": 1.83664020809597e-05, + "loss": 0.8172, + "step": 8970 + }, + { + "epoch": 0.1928942733170082, + "grad_norm": 0.5733143994810274, + "learning_rate": 1.83626663754583e-05, + "loss": 0.8032, + "step": 8980 + }, + { + "epoch": 0.19310907763027882, + "grad_norm": 0.5592658911509888, + "learning_rate": 1.8358926784261303e-05, + "loss": 0.7981, + "step": 8990 + }, + { + "epoch": 0.19332388194354944, + "grad_norm": 0.562962728797357, + "learning_rate": 1.8355183309106297e-05, + "loss": 0.8156, + "step": 9000 + }, + { + "epoch": 0.19353868625682003, + "grad_norm": 0.569803487699369, + "learning_rate": 1.8351435951732692e-05, + "loss": 0.8001, + "step": 9010 + }, + { + "epoch": 0.19375349057009064, + "grad_norm": 0.5789202611510278, + "learning_rate": 1.8347684713881675e-05, + "loss": 0.8116, + "step": 9020 + }, + { + "epoch": 0.19396829488336126, + "grad_norm": 0.5768841779522369, + "learning_rate": 1.8343929597296265e-05, + "loss": 0.8075, + "step": 9030 + }, + { + "epoch": 0.19418309919663188, + "grad_norm": 0.5643884618141694, + "learning_rate": 1.8340170603721258e-05, + "loss": 0.8005, + "step": 9040 + }, + { + "epoch": 0.1943979035099025, + "grad_norm": 0.6040260449303918, + "learning_rate": 1.8336407734903266e-05, + "loss": 0.8037, + "step": 9050 + }, + { + "epoch": 0.19461270782317308, + "grad_norm": 0.547010947358847, + "learning_rate": 1.83326409925907e-05, + "loss": 0.8086, + "step": 9060 + }, + { + "epoch": 0.1948275121364437, + "grad_norm": 0.5791718088895571, + "learning_rate": 1.8328870378533774e-05, + "loss": 0.8221, + "step": 9070 + }, + { + "epoch": 0.19504231644971431, + "grad_norm": 0.5929678639479692, + "learning_rate": 1.8325095894484487e-05, + "loss": 0.8037, + "step": 9080 + }, + { + "epoch": 0.19525712076298493, + "grad_norm": 0.5837325290517947, + "learning_rate": 1.8321317542196645e-05, + "loss": 0.8214, + "step": 9090 + }, + { + "epoch": 0.19547192507625552, + "grad_norm": 0.6021341746404556, + "learning_rate": 1.831753532342586e-05, + "loss": 0.808, + "step": 9100 + }, + { + "epoch": 0.19568672938952614, + "grad_norm": 0.5851595859581337, + "learning_rate": 1.8313749239929527e-05, + "loss": 0.8027, + "step": 9110 + }, + { + "epoch": 0.19590153370279675, + "grad_norm": 0.6429053021754854, + "learning_rate": 1.830995929346684e-05, + "loss": 0.8165, + "step": 9120 + }, + { + "epoch": 
0.19611633801606737, + "grad_norm": 0.6030014532201955, + "learning_rate": 1.830616548579879e-05, + "loss": 0.8246, + "step": 9130 + }, + { + "epoch": 0.19633114232933796, + "grad_norm": 0.6323373448001347, + "learning_rate": 1.8302367818688168e-05, + "loss": 0.8079, + "step": 9140 + }, + { + "epoch": 0.19654594664260858, + "grad_norm": 0.5846336022969297, + "learning_rate": 1.8298566293899543e-05, + "loss": 0.8269, + "step": 9150 + }, + { + "epoch": 0.1967607509558792, + "grad_norm": 0.5607765403444699, + "learning_rate": 1.8294760913199296e-05, + "loss": 0.7932, + "step": 9160 + }, + { + "epoch": 0.1969755552691498, + "grad_norm": 0.5738811351652273, + "learning_rate": 1.8290951678355583e-05, + "loss": 0.8121, + "step": 9170 + }, + { + "epoch": 0.19719035958242043, + "grad_norm": 0.5893344214505519, + "learning_rate": 1.8287138591138355e-05, + "loss": 0.8227, + "step": 9180 + }, + { + "epoch": 0.19740516389569102, + "grad_norm": 0.5849184704039714, + "learning_rate": 1.828332165331936e-05, + "loss": 0.8037, + "step": 9190 + }, + { + "epoch": 0.19761996820896163, + "grad_norm": 0.5844113484112203, + "learning_rate": 1.8279500866672124e-05, + "loss": 0.814, + "step": 9200 + }, + { + "epoch": 0.19783477252223225, + "grad_norm": 0.5652668990930761, + "learning_rate": 1.8275676232971977e-05, + "loss": 0.8119, + "step": 9210 + }, + { + "epoch": 0.19804957683550287, + "grad_norm": 0.597730632354087, + "learning_rate": 1.827184775399602e-05, + "loss": 0.8053, + "step": 9220 + }, + { + "epoch": 0.19826438114877346, + "grad_norm": 0.5888675343237012, + "learning_rate": 1.8268015431523147e-05, + "loss": 0.801, + "step": 9230 + }, + { + "epoch": 0.19847918546204407, + "grad_norm": 0.5663470763977005, + "learning_rate": 1.8264179267334043e-05, + "loss": 0.8135, + "step": 9240 + }, + { + "epoch": 0.1986939897753147, + "grad_norm": 0.5533581191534693, + "learning_rate": 1.8260339263211174e-05, + "loss": 0.79, + "step": 9250 + }, + { + "epoch": 0.1989087940885853, + "grad_norm": 0.5979441715439602, + "learning_rate": 1.8256495420938786e-05, + "loss": 0.7929, + "step": 9260 + }, + { + "epoch": 0.19912359840185592, + "grad_norm": 0.5825141443552068, + "learning_rate": 1.8252647742302914e-05, + "loss": 0.8056, + "step": 9270 + }, + { + "epoch": 0.1993384027151265, + "grad_norm": 0.5757431007161539, + "learning_rate": 1.8248796229091373e-05, + "loss": 0.8138, + "step": 9280 + }, + { + "epoch": 0.19955320702839713, + "grad_norm": 0.6259334303275133, + "learning_rate": 1.8244940883093767e-05, + "loss": 0.813, + "step": 9290 + }, + { + "epoch": 0.19976801134166774, + "grad_norm": 0.549857706300012, + "learning_rate": 1.824108170610147e-05, + "loss": 0.8077, + "step": 9300 + }, + { + "epoch": 0.19998281565493836, + "grad_norm": 0.56236031168535, + "learning_rate": 1.8237218699907635e-05, + "loss": 0.8045, + "step": 9310 + }, + { + "epoch": 0.20019761996820895, + "grad_norm": 0.6015575274536221, + "learning_rate": 1.8233351866307206e-05, + "loss": 0.8061, + "step": 9320 + }, + { + "epoch": 0.20041242428147957, + "grad_norm": 0.6085710236514931, + "learning_rate": 1.8229481207096896e-05, + "loss": 0.8018, + "step": 9330 + }, + { + "epoch": 0.20062722859475018, + "grad_norm": 0.597465062355765, + "learning_rate": 1.8225606724075205e-05, + "loss": 0.8122, + "step": 9340 + }, + { + "epoch": 0.2008420329080208, + "grad_norm": 0.5653695698060156, + "learning_rate": 1.82217284190424e-05, + "loss": 0.7998, + "step": 9350 + }, + { + "epoch": 0.2010568372212914, + "grad_norm": 0.5803499948562468, + "learning_rate": 
1.8217846293800523e-05, + "loss": 0.8171, + "step": 9360 + }, + { + "epoch": 0.201271641534562, + "grad_norm": 0.5914980494379427, + "learning_rate": 1.82139603501534e-05, + "loss": 0.8078, + "step": 9370 + }, + { + "epoch": 0.20148644584783262, + "grad_norm": 0.5753469591052403, + "learning_rate": 1.8210070589906628e-05, + "loss": 0.8144, + "step": 9380 + }, + { + "epoch": 0.20170125016110324, + "grad_norm": 0.5917347916307549, + "learning_rate": 1.8206177014867572e-05, + "loss": 0.8078, + "step": 9390 + }, + { + "epoch": 0.20191605447437386, + "grad_norm": 0.5524089908557045, + "learning_rate": 1.8202279626845377e-05, + "loss": 0.8028, + "step": 9400 + }, + { + "epoch": 0.20213085878764445, + "grad_norm": 0.5529213134018454, + "learning_rate": 1.8198378427650955e-05, + "loss": 0.7994, + "step": 9410 + }, + { + "epoch": 0.20234566310091506, + "grad_norm": 0.567892445973469, + "learning_rate": 1.8194473419096987e-05, + "loss": 0.8094, + "step": 9420 + }, + { + "epoch": 0.20256046741418568, + "grad_norm": 0.5858778724872428, + "learning_rate": 1.819056460299793e-05, + "loss": 0.7983, + "step": 9430 + }, + { + "epoch": 0.2027752717274563, + "grad_norm": 0.573018383200498, + "learning_rate": 1.818665198117001e-05, + "loss": 0.8113, + "step": 9440 + }, + { + "epoch": 0.20299007604072689, + "grad_norm": 0.5815406945956942, + "learning_rate": 1.8182735555431213e-05, + "loss": 0.8131, + "step": 9450 + }, + { + "epoch": 0.2032048803539975, + "grad_norm": 0.5712680885146894, + "learning_rate": 1.8178815327601306e-05, + "loss": 0.8, + "step": 9460 + }, + { + "epoch": 0.20341968466726812, + "grad_norm": 0.7090531423336768, + "learning_rate": 1.8174891299501807e-05, + "loss": 0.8117, + "step": 9470 + }, + { + "epoch": 0.20363448898053874, + "grad_norm": 0.6104073346551036, + "learning_rate": 1.8170963472956006e-05, + "loss": 0.8203, + "step": 9480 + }, + { + "epoch": 0.20384929329380935, + "grad_norm": 0.5768361886322763, + "learning_rate": 1.816703184978897e-05, + "loss": 0.8079, + "step": 9490 + }, + { + "epoch": 0.20406409760707994, + "grad_norm": 0.5351105176871881, + "learning_rate": 1.816309643182751e-05, + "loss": 0.7961, + "step": 9500 + }, + { + "epoch": 0.20427890192035056, + "grad_norm": 0.5636835938070731, + "learning_rate": 1.8159157220900216e-05, + "loss": 0.7832, + "step": 9510 + }, + { + "epoch": 0.20449370623362118, + "grad_norm": 0.5730516941042331, + "learning_rate": 1.815521421883743e-05, + "loss": 0.7937, + "step": 9520 + }, + { + "epoch": 0.2047085105468918, + "grad_norm": 0.5733890782084365, + "learning_rate": 1.815126742747126e-05, + "loss": 0.8011, + "step": 9530 + }, + { + "epoch": 0.20492331486016238, + "grad_norm": 0.5616802718550227, + "learning_rate": 1.8147316848635576e-05, + "loss": 0.8038, + "step": 9540 + }, + { + "epoch": 0.205138119173433, + "grad_norm": 0.5889286620725945, + "learning_rate": 1.8143362484166004e-05, + "loss": 0.8021, + "step": 9550 + }, + { + "epoch": 0.20535292348670361, + "grad_norm": 0.5585779556583447, + "learning_rate": 1.8139404335899937e-05, + "loss": 0.8058, + "step": 9560 + }, + { + "epoch": 0.20556772779997423, + "grad_norm": 0.571453117672738, + "learning_rate": 1.813544240567651e-05, + "loss": 0.804, + "step": 9570 + }, + { + "epoch": 0.20578253211324482, + "grad_norm": 0.5654185920520682, + "learning_rate": 1.8131476695336636e-05, + "loss": 0.802, + "step": 9580 + }, + { + "epoch": 0.20599733642651544, + "grad_norm": 0.5844027397396927, + "learning_rate": 1.8127507206722973e-05, + "loss": 0.8019, + "step": 9590 + }, + { + "epoch": 
0.20621214073978605, + "grad_norm": 0.5689638203458927, + "learning_rate": 1.8123533941679928e-05, + "loss": 0.8094, + "step": 9600 + }, + { + "epoch": 0.20642694505305667, + "grad_norm": 0.577234661880537, + "learning_rate": 1.8119556902053678e-05, + "loss": 0.8246, + "step": 9610 + }, + { + "epoch": 0.2066417493663273, + "grad_norm": 0.5763434534447465, + "learning_rate": 1.811557608969214e-05, + "loss": 0.8158, + "step": 9620 + }, + { + "epoch": 0.20685655367959788, + "grad_norm": 0.5778688135687731, + "learning_rate": 1.8111591506444997e-05, + "loss": 0.7955, + "step": 9630 + }, + { + "epoch": 0.2070713579928685, + "grad_norm": 0.5813830588006104, + "learning_rate": 1.810760315416367e-05, + "loss": 0.7891, + "step": 9640 + }, + { + "epoch": 0.2072861623061391, + "grad_norm": 0.576252297984537, + "learning_rate": 1.8103611034701348e-05, + "loss": 0.8085, + "step": 9650 + }, + { + "epoch": 0.20750096661940973, + "grad_norm": 0.5837430318834449, + "learning_rate": 1.8099615149912953e-05, + "loss": 0.807, + "step": 9660 + }, + { + "epoch": 0.20771577093268032, + "grad_norm": 0.5890346965782849, + "learning_rate": 1.8095615501655166e-05, + "loss": 0.7944, + "step": 9670 + }, + { + "epoch": 0.20793057524595093, + "grad_norm": 0.5852131021460842, + "learning_rate": 1.8091612091786416e-05, + "loss": 0.8088, + "step": 9680 + }, + { + "epoch": 0.20814537955922155, + "grad_norm": 0.5589031639536682, + "learning_rate": 1.8087604922166884e-05, + "loss": 0.8121, + "step": 9690 + }, + { + "epoch": 0.20836018387249217, + "grad_norm": 0.5837838090958873, + "learning_rate": 1.8083593994658483e-05, + "loss": 0.8031, + "step": 9700 + }, + { + "epoch": 0.20857498818576278, + "grad_norm": 0.5759528081913161, + "learning_rate": 1.807957931112489e-05, + "loss": 0.7909, + "step": 9710 + }, + { + "epoch": 0.20878979249903337, + "grad_norm": 0.6143282377901799, + "learning_rate": 1.807556087343152e-05, + "loss": 0.8115, + "step": 9720 + }, + { + "epoch": 0.209004596812304, + "grad_norm": 0.578534081939633, + "learning_rate": 1.8071538683445524e-05, + "loss": 0.8073, + "step": 9730 + }, + { + "epoch": 0.2092194011255746, + "grad_norm": 0.5990944031783112, + "learning_rate": 1.806751274303581e-05, + "loss": 0.8145, + "step": 9740 + }, + { + "epoch": 0.20943420543884522, + "grad_norm": 0.5374932915323056, + "learning_rate": 1.8063483054073026e-05, + "loss": 0.8186, + "step": 9750 + }, + { + "epoch": 0.2096490097521158, + "grad_norm": 0.573621801810147, + "learning_rate": 1.805944961842955e-05, + "loss": 0.7858, + "step": 9760 + }, + { + "epoch": 0.20986381406538643, + "grad_norm": 0.5667771093016414, + "learning_rate": 1.8055412437979517e-05, + "loss": 0.815, + "step": 9770 + }, + { + "epoch": 0.21007861837865704, + "grad_norm": 0.5777392591556257, + "learning_rate": 1.805137151459879e-05, + "loss": 0.8099, + "step": 9780 + }, + { + "epoch": 0.21029342269192766, + "grad_norm": 0.5716970868389603, + "learning_rate": 1.804732685016498e-05, + "loss": 0.7963, + "step": 9790 + }, + { + "epoch": 0.21050822700519825, + "grad_norm": 0.5784937064403081, + "learning_rate": 1.8043278446557424e-05, + "loss": 0.7985, + "step": 9800 + }, + { + "epoch": 0.21072303131846887, + "grad_norm": 0.5883045627604655, + "learning_rate": 1.8039226305657212e-05, + "loss": 0.8074, + "step": 9810 + }, + { + "epoch": 0.21093783563173948, + "grad_norm": 0.5514729284429919, + "learning_rate": 1.803517042934716e-05, + "loss": 0.8004, + "step": 9820 + }, + { + "epoch": 0.2111526399450101, + "grad_norm": 0.5816019682238576, + "learning_rate": 
1.8031110819511825e-05, + "loss": 0.8012, + "step": 9830 + }, + { + "epoch": 0.21136744425828072, + "grad_norm": 0.5870070746603354, + "learning_rate": 1.8027047478037495e-05, + "loss": 0.8057, + "step": 9840 + }, + { + "epoch": 0.2115822485715513, + "grad_norm": 0.6076535851376632, + "learning_rate": 1.802298040681219e-05, + "loss": 0.8269, + "step": 9850 + }, + { + "epoch": 0.21179705288482192, + "grad_norm": 0.5748747580649357, + "learning_rate": 1.801890960772567e-05, + "loss": 0.8158, + "step": 9860 + }, + { + "epoch": 0.21201185719809254, + "grad_norm": 0.5786976018669034, + "learning_rate": 1.801483508266942e-05, + "loss": 0.7988, + "step": 9870 + }, + { + "epoch": 0.21222666151136316, + "grad_norm": 0.540488437271146, + "learning_rate": 1.8010756833536663e-05, + "loss": 0.8049, + "step": 9880 + }, + { + "epoch": 0.21244146582463375, + "grad_norm": 0.6221560289316503, + "learning_rate": 1.800667486222235e-05, + "loss": 0.7987, + "step": 9890 + }, + { + "epoch": 0.21265627013790436, + "grad_norm": 0.5961921503665486, + "learning_rate": 1.8002589170623155e-05, + "loss": 0.8264, + "step": 9900 + }, + { + "epoch": 0.21287107445117498, + "grad_norm": 0.5741672373117042, + "learning_rate": 1.7998499760637492e-05, + "loss": 0.798, + "step": 9910 + }, + { + "epoch": 0.2130858787644456, + "grad_norm": 0.5615151845280278, + "learning_rate": 1.7994406634165492e-05, + "loss": 0.8067, + "step": 9920 + }, + { + "epoch": 0.2133006830777162, + "grad_norm": 0.5836885054107523, + "learning_rate": 1.799030979310902e-05, + "loss": 0.7914, + "step": 9930 + }, + { + "epoch": 0.2135154873909868, + "grad_norm": 0.5764046639257419, + "learning_rate": 1.7986209239371665e-05, + "loss": 0.7975, + "step": 9940 + }, + { + "epoch": 0.21373029170425742, + "grad_norm": 0.5848553181136704, + "learning_rate": 1.798210497485874e-05, + "loss": 0.802, + "step": 9950 + }, + { + "epoch": 0.21394509601752804, + "grad_norm": 0.5739518953663529, + "learning_rate": 1.7977997001477285e-05, + "loss": 0.8157, + "step": 9960 + }, + { + "epoch": 0.21415990033079865, + "grad_norm": 0.5929025643332227, + "learning_rate": 1.797388532113606e-05, + "loss": 0.7859, + "step": 9970 + }, + { + "epoch": 0.21437470464406924, + "grad_norm": 0.5661521287114709, + "learning_rate": 1.7969769935745544e-05, + "loss": 0.7999, + "step": 9980 + }, + { + "epoch": 0.21458950895733986, + "grad_norm": 0.5422378316949167, + "learning_rate": 1.796565084721795e-05, + "loss": 0.7842, + "step": 9990 + }, + { + "epoch": 0.21480431327061048, + "grad_norm": 0.6043847597224838, + "learning_rate": 1.79615280574672e-05, + "loss": 0.8107, + "step": 10000 + }, + { + "epoch": 0.2150191175838811, + "grad_norm": 0.588943789547453, + "learning_rate": 1.795740156840894e-05, + "loss": 0.8095, + "step": 10010 + }, + { + "epoch": 0.21523392189715168, + "grad_norm": 0.5773036210661655, + "learning_rate": 1.7953271381960536e-05, + "loss": 0.8043, + "step": 10020 + }, + { + "epoch": 0.2154487262104223, + "grad_norm": 0.5758207275091537, + "learning_rate": 1.794913750004107e-05, + "loss": 0.793, + "step": 10030 + }, + { + "epoch": 0.21566353052369291, + "grad_norm": 0.5566484791876923, + "learning_rate": 1.7944999924571345e-05, + "loss": 0.7877, + "step": 10040 + }, + { + "epoch": 0.21587833483696353, + "grad_norm": 0.5695977896565292, + "learning_rate": 1.7940858657473867e-05, + "loss": 0.7956, + "step": 10050 + }, + { + "epoch": 0.21609313915023415, + "grad_norm": 0.5792438417645663, + "learning_rate": 1.7936713700672874e-05, + "loss": 0.7905, + "step": 10060 + }, + { + 
"epoch": 0.21630794346350474, + "grad_norm": 0.5702659128860479, + "learning_rate": 1.7932565056094312e-05, + "loss": 0.8087, + "step": 10070 + }, + { + "epoch": 0.21652274777677535, + "grad_norm": 0.6188318728871035, + "learning_rate": 1.7928412725665844e-05, + "loss": 0.8039, + "step": 10080 + }, + { + "epoch": 0.21673755209004597, + "grad_norm": 0.5606872536463133, + "learning_rate": 1.792425671131683e-05, + "loss": 0.7881, + "step": 10090 + }, + { + "epoch": 0.2169523564033166, + "grad_norm": 0.5722706317917913, + "learning_rate": 1.792009701497836e-05, + "loss": 0.7996, + "step": 10100 + }, + { + "epoch": 0.21716716071658718, + "grad_norm": 0.7107815425462004, + "learning_rate": 1.791593363858323e-05, + "loss": 0.8148, + "step": 10110 + }, + { + "epoch": 0.2173819650298578, + "grad_norm": 0.5883477981610786, + "learning_rate": 1.7911766584065945e-05, + "loss": 0.7909, + "step": 10120 + }, + { + "epoch": 0.2175967693431284, + "grad_norm": 0.5798116807331893, + "learning_rate": 1.7907595853362713e-05, + "loss": 0.7983, + "step": 10130 + }, + { + "epoch": 0.21781157365639903, + "grad_norm": 0.5609531537994755, + "learning_rate": 1.790342144841146e-05, + "loss": 0.7961, + "step": 10140 + }, + { + "epoch": 0.21802637796966964, + "grad_norm": 0.6517122093198296, + "learning_rate": 1.7899243371151813e-05, + "loss": 0.7861, + "step": 10150 + }, + { + "epoch": 0.21824118228294023, + "grad_norm": 0.5516309036102172, + "learning_rate": 1.7895061623525104e-05, + "loss": 0.8067, + "step": 10160 + }, + { + "epoch": 0.21845598659621085, + "grad_norm": 0.5613471363551992, + "learning_rate": 1.789087620747438e-05, + "loss": 0.7923, + "step": 10170 + }, + { + "epoch": 0.21867079090948147, + "grad_norm": 0.9188778029201942, + "learning_rate": 1.788668712494438e-05, + "loss": 0.8096, + "step": 10180 + }, + { + "epoch": 0.21888559522275208, + "grad_norm": 0.5617547681562008, + "learning_rate": 1.7882494377881558e-05, + "loss": 0.7964, + "step": 10190 + }, + { + "epoch": 0.21910039953602267, + "grad_norm": 0.61534749263247, + "learning_rate": 1.7878297968234055e-05, + "loss": 0.7946, + "step": 10200 + }, + { + "epoch": 0.2193152038492933, + "grad_norm": 0.5782211052832341, + "learning_rate": 1.7874097897951737e-05, + "loss": 0.7873, + "step": 10210 + }, + { + "epoch": 0.2195300081625639, + "grad_norm": 0.5514543385854074, + "learning_rate": 1.786989416898615e-05, + "loss": 0.7975, + "step": 10220 + }, + { + "epoch": 0.21974481247583452, + "grad_norm": 0.5480749170558831, + "learning_rate": 1.786568678329055e-05, + "loss": 0.8018, + "step": 10230 + }, + { + "epoch": 0.21995961678910514, + "grad_norm": 0.5783322467533664, + "learning_rate": 1.7861475742819885e-05, + "loss": 0.7848, + "step": 10240 + }, + { + "epoch": 0.22017442110237573, + "grad_norm": 0.5737996720368215, + "learning_rate": 1.7857261049530817e-05, + "loss": 0.7953, + "step": 10250 + }, + { + "epoch": 0.22038922541564634, + "grad_norm": 0.5763402239431955, + "learning_rate": 1.7853042705381684e-05, + "loss": 0.7851, + "step": 10260 + }, + { + "epoch": 0.22060402972891696, + "grad_norm": 0.5731754369723199, + "learning_rate": 1.7848820712332542e-05, + "loss": 0.8126, + "step": 10270 + }, + { + "epoch": 0.22081883404218758, + "grad_norm": 0.5772583690128964, + "learning_rate": 1.784459507234512e-05, + "loss": 0.82, + "step": 10280 + }, + { + "epoch": 0.22103363835545817, + "grad_norm": 0.5411569999708086, + "learning_rate": 1.7840365787382858e-05, + "loss": 0.7931, + "step": 10290 + }, + { + "epoch": 0.22124844266872878, + "grad_norm": 
0.5653187339476065, + "learning_rate": 1.7836132859410885e-05, + "loss": 0.8067, + "step": 10300 + }, + { + "epoch": 0.2214632469819994, + "grad_norm": 0.565092919613001, + "learning_rate": 1.783189629039602e-05, + "loss": 0.7973, + "step": 10310 + }, + { + "epoch": 0.22167805129527002, + "grad_norm": 0.5520905710174285, + "learning_rate": 1.782765608230678e-05, + "loss": 0.8074, + "step": 10320 + }, + { + "epoch": 0.2218928556085406, + "grad_norm": 0.5499770181938648, + "learning_rate": 1.782341223711336e-05, + "loss": 0.8047, + "step": 10330 + }, + { + "epoch": 0.22210765992181122, + "grad_norm": 0.5770398453057336, + "learning_rate": 1.7819164756787667e-05, + "loss": 0.8024, + "step": 10340 + }, + { + "epoch": 0.22232246423508184, + "grad_norm": 0.6005922987428989, + "learning_rate": 1.781491364330327e-05, + "loss": 0.8023, + "step": 10350 + }, + { + "epoch": 0.22253726854835246, + "grad_norm": 0.5692299117115992, + "learning_rate": 1.7810658898635455e-05, + "loss": 0.7944, + "step": 10360 + }, + { + "epoch": 0.22275207286162307, + "grad_norm": 0.5657236504416966, + "learning_rate": 1.780640052476117e-05, + "loss": 0.8146, + "step": 10370 + }, + { + "epoch": 0.22296687717489366, + "grad_norm": 0.5561699306834719, + "learning_rate": 1.780213852365906e-05, + "loss": 0.785, + "step": 10380 + }, + { + "epoch": 0.22318168148816428, + "grad_norm": 0.7081144347430791, + "learning_rate": 1.779787289730946e-05, + "loss": 0.8007, + "step": 10390 + }, + { + "epoch": 0.2233964858014349, + "grad_norm": 0.5625449988527748, + "learning_rate": 1.779360364769438e-05, + "loss": 0.7903, + "step": 10400 + }, + { + "epoch": 0.2236112901147055, + "grad_norm": 0.5443207117710023, + "learning_rate": 1.7789330776797515e-05, + "loss": 0.8029, + "step": 10410 + }, + { + "epoch": 0.2238260944279761, + "grad_norm": 0.5625349389867544, + "learning_rate": 1.7785054286604254e-05, + "loss": 0.8169, + "step": 10420 + }, + { + "epoch": 0.22404089874124672, + "grad_norm": 0.5476014487186761, + "learning_rate": 1.7780774179101654e-05, + "loss": 0.7917, + "step": 10430 + }, + { + "epoch": 0.22425570305451734, + "grad_norm": 0.5800827416013181, + "learning_rate": 1.7776490456278462e-05, + "loss": 0.8073, + "step": 10440 + }, + { + "epoch": 0.22447050736778795, + "grad_norm": 0.5542113664451737, + "learning_rate": 1.7772203120125095e-05, + "loss": 0.7947, + "step": 10450 + }, + { + "epoch": 0.22468531168105857, + "grad_norm": 0.5404026820286727, + "learning_rate": 1.776791217263366e-05, + "loss": 0.8024, + "step": 10460 + }, + { + "epoch": 0.22490011599432916, + "grad_norm": 0.5538986415644934, + "learning_rate": 1.7763617615797934e-05, + "loss": 0.7828, + "step": 10470 + }, + { + "epoch": 0.22511492030759978, + "grad_norm": 0.5585745280123449, + "learning_rate": 1.7759319451613376e-05, + "loss": 0.801, + "step": 10480 + }, + { + "epoch": 0.2253297246208704, + "grad_norm": 0.5622099418484012, + "learning_rate": 1.7755017682077118e-05, + "loss": 0.7945, + "step": 10490 + }, + { + "epoch": 0.225544528934141, + "grad_norm": 0.5370985490496776, + "learning_rate": 1.7750712309187967e-05, + "loss": 0.7863, + "step": 10500 + }, + { + "epoch": 0.2257593332474116, + "grad_norm": 0.5372310056648028, + "learning_rate": 1.7746403334946407e-05, + "loss": 0.7946, + "step": 10510 + }, + { + "epoch": 0.22597413756068221, + "grad_norm": 0.556076402437103, + "learning_rate": 1.7742090761354596e-05, + "loss": 0.7881, + "step": 10520 + }, + { + "epoch": 0.22618894187395283, + "grad_norm": 0.5580749453853192, + "learning_rate": 
1.7737774590416358e-05, + "loss": 0.7938, + "step": 10530 + }, + { + "epoch": 0.22640374618722345, + "grad_norm": 0.5957795192444436, + "learning_rate": 1.7733454824137196e-05, + "loss": 0.7955, + "step": 10540 + }, + { + "epoch": 0.22661855050049404, + "grad_norm": 0.5712477046163247, + "learning_rate": 1.772913146452428e-05, + "loss": 0.8038, + "step": 10550 + }, + { + "epoch": 0.22683335481376465, + "grad_norm": 0.548160058046596, + "learning_rate": 1.7724804513586448e-05, + "loss": 0.7939, + "step": 10560 + }, + { + "epoch": 0.22704815912703527, + "grad_norm": 0.5369592474615921, + "learning_rate": 1.7720473973334213e-05, + "loss": 0.7993, + "step": 10570 + }, + { + "epoch": 0.2272629634403059, + "grad_norm": 0.5423611002909383, + "learning_rate": 1.7716139845779746e-05, + "loss": 0.8045, + "step": 10580 + }, + { + "epoch": 0.2274777677535765, + "grad_norm": 0.5550989146015426, + "learning_rate": 1.7711802132936896e-05, + "loss": 0.7892, + "step": 10590 + }, + { + "epoch": 0.2276925720668471, + "grad_norm": 0.5703653518043215, + "learning_rate": 1.770746083682117e-05, + "loss": 0.8055, + "step": 10600 + }, + { + "epoch": 0.2279073763801177, + "grad_norm": 0.5615344431225333, + "learning_rate": 1.7703115959449738e-05, + "loss": 0.7979, + "step": 10610 + }, + { + "epoch": 0.22812218069338833, + "grad_norm": 0.553731045220489, + "learning_rate": 1.7698767502841445e-05, + "loss": 0.811, + "step": 10620 + }, + { + "epoch": 0.22833698500665894, + "grad_norm": 0.5501876227065152, + "learning_rate": 1.769441546901679e-05, + "loss": 0.7964, + "step": 10630 + }, + { + "epoch": 0.22855178931992953, + "grad_norm": 0.5412824361191674, + "learning_rate": 1.7690059859997935e-05, + "loss": 0.7859, + "step": 10640 + }, + { + "epoch": 0.22876659363320015, + "grad_norm": 0.5751057669815162, + "learning_rate": 1.7685700677808703e-05, + "loss": 0.8021, + "step": 10650 + }, + { + "epoch": 0.22898139794647077, + "grad_norm": 0.5413904254802253, + "learning_rate": 1.7681337924474585e-05, + "loss": 0.7911, + "step": 10660 + }, + { + "epoch": 0.22919620225974138, + "grad_norm": 0.5443592022724559, + "learning_rate": 1.7676971602022722e-05, + "loss": 0.804, + "step": 10670 + }, + { + "epoch": 0.229411006573012, + "grad_norm": 0.5963768922972459, + "learning_rate": 1.7672601712481916e-05, + "loss": 0.7838, + "step": 10680 + }, + { + "epoch": 0.2296258108862826, + "grad_norm": 0.5814546759073268, + "learning_rate": 1.7668228257882628e-05, + "loss": 0.7934, + "step": 10690 + }, + { + "epoch": 0.2298406151995532, + "grad_norm": 0.5513099385655584, + "learning_rate": 1.7663851240256973e-05, + "loss": 0.7901, + "step": 10700 + }, + { + "epoch": 0.23005541951282382, + "grad_norm": 0.5531620819292025, + "learning_rate": 1.7659470661638727e-05, + "loss": 0.8093, + "step": 10710 + }, + { + "epoch": 0.23027022382609444, + "grad_norm": 0.5656118475573507, + "learning_rate": 1.7655086524063314e-05, + "loss": 0.793, + "step": 10720 + }, + { + "epoch": 0.23048502813936503, + "grad_norm": 0.5575833476944946, + "learning_rate": 1.765069882956781e-05, + "loss": 0.7976, + "step": 10730 + }, + { + "epoch": 0.23069983245263564, + "grad_norm": 0.5564123290228586, + "learning_rate": 1.764630758019096e-05, + "loss": 0.8022, + "step": 10740 + }, + { + "epoch": 0.23091463676590626, + "grad_norm": 0.5534405369439489, + "learning_rate": 1.7641912777973136e-05, + "loss": 0.8079, + "step": 10750 + }, + { + "epoch": 0.23112944107917688, + "grad_norm": 0.5473270189074221, + "learning_rate": 1.7637514424956386e-05, + "loss": 0.7938, + "step": 
10760 + }, + { + "epoch": 0.23134424539244747, + "grad_norm": 0.5357674731887498, + "learning_rate": 1.7633112523184383e-05, + "loss": 0.7846, + "step": 10770 + }, + { + "epoch": 0.23155904970571808, + "grad_norm": 0.5552201424591323, + "learning_rate": 1.762870707470247e-05, + "loss": 0.7803, + "step": 10780 + }, + { + "epoch": 0.2317738540189887, + "grad_norm": 0.558152601578086, + "learning_rate": 1.7624298081557626e-05, + "loss": 0.7902, + "step": 10790 + }, + { + "epoch": 0.23198865833225932, + "grad_norm": 0.5595014924866345, + "learning_rate": 1.7619885545798486e-05, + "loss": 0.7891, + "step": 10800 + }, + { + "epoch": 0.23220346264552993, + "grad_norm": 0.5495730986900741, + "learning_rate": 1.7615469469475315e-05, + "loss": 0.8026, + "step": 10810 + }, + { + "epoch": 0.23241826695880052, + "grad_norm": 0.5760660001958988, + "learning_rate": 1.7611049854640044e-05, + "loss": 0.8077, + "step": 10820 + }, + { + "epoch": 0.23263307127207114, + "grad_norm": 0.5550120424472271, + "learning_rate": 1.7606626703346235e-05, + "loss": 0.7965, + "step": 10830 + }, + { + "epoch": 0.23284787558534176, + "grad_norm": 0.5534917812163986, + "learning_rate": 1.7602200017649093e-05, + "loss": 0.7862, + "step": 10840 + }, + { + "epoch": 0.23306267989861237, + "grad_norm": 0.567697162536355, + "learning_rate": 1.759776979960547e-05, + "loss": 0.7882, + "step": 10850 + }, + { + "epoch": 0.23327748421188296, + "grad_norm": 0.5544878261968137, + "learning_rate": 1.7593336051273857e-05, + "loss": 0.7894, + "step": 10860 + }, + { + "epoch": 0.23349228852515358, + "grad_norm": 0.5522547428302197, + "learning_rate": 1.7588898774714387e-05, + "loss": 0.792, + "step": 10870 + }, + { + "epoch": 0.2337070928384242, + "grad_norm": 0.5561521408221043, + "learning_rate": 1.7584457971988836e-05, + "loss": 0.7984, + "step": 10880 + }, + { + "epoch": 0.2339218971516948, + "grad_norm": 0.5701789587662502, + "learning_rate": 1.75800136451606e-05, + "loss": 0.7891, + "step": 10890 + }, + { + "epoch": 0.23413670146496543, + "grad_norm": 0.5512069625729566, + "learning_rate": 1.7575565796294745e-05, + "loss": 0.8033, + "step": 10900 + }, + { + "epoch": 0.23435150577823602, + "grad_norm": 0.5671925282466402, + "learning_rate": 1.7571114427457942e-05, + "loss": 0.7914, + "step": 10910 + }, + { + "epoch": 0.23456631009150664, + "grad_norm": 0.5987674977748154, + "learning_rate": 1.7566659540718512e-05, + "loss": 0.8081, + "step": 10920 + }, + { + "epoch": 0.23478111440477725, + "grad_norm": 0.5576016505511755, + "learning_rate": 1.7562201138146407e-05, + "loss": 0.7881, + "step": 10930 + }, + { + "epoch": 0.23499591871804787, + "grad_norm": 0.5606224459108454, + "learning_rate": 1.7557739221813226e-05, + "loss": 0.7999, + "step": 10940 + }, + { + "epoch": 0.23521072303131846, + "grad_norm": 0.5778821827576899, + "learning_rate": 1.7553273793792176e-05, + "loss": 0.8156, + "step": 10950 + }, + { + "epoch": 0.23542552734458908, + "grad_norm": 0.5512352918888542, + "learning_rate": 1.7548804856158113e-05, + "loss": 0.7987, + "step": 10960 + }, + { + "epoch": 0.2356403316578597, + "grad_norm": 0.5566762408624399, + "learning_rate": 1.7544332410987523e-05, + "loss": 0.7976, + "step": 10970 + }, + { + "epoch": 0.2358551359711303, + "grad_norm": 0.5645656364576122, + "learning_rate": 1.7539856460358515e-05, + "loss": 0.7869, + "step": 10980 + }, + { + "epoch": 0.2360699402844009, + "grad_norm": 0.5445877178654857, + "learning_rate": 1.753537700635083e-05, + "loss": 0.7869, + "step": 10990 + }, + { + "epoch": 0.23628474459767151, + 
"grad_norm": 0.5590715894295356, + "learning_rate": 1.753089405104584e-05, + "loss": 0.8091, + "step": 11000 + }, + { + "epoch": 0.23649954891094213, + "grad_norm": 0.5671948914432249, + "learning_rate": 1.7526407596526536e-05, + "loss": 0.8074, + "step": 11010 + }, + { + "epoch": 0.23671435322421275, + "grad_norm": 0.5623507718873683, + "learning_rate": 1.7521917644877546e-05, + "loss": 0.7854, + "step": 11020 + }, + { + "epoch": 0.23692915753748336, + "grad_norm": 0.5932218934601358, + "learning_rate": 1.7517424198185108e-05, + "loss": 0.8094, + "step": 11030 + }, + { + "epoch": 0.23714396185075395, + "grad_norm": 0.6112761900916727, + "learning_rate": 1.7512927258537105e-05, + "loss": 0.7942, + "step": 11040 + }, + { + "epoch": 0.23735876616402457, + "grad_norm": 0.566759308520504, + "learning_rate": 1.750842682802302e-05, + "loss": 0.7797, + "step": 11050 + }, + { + "epoch": 0.2375735704772952, + "grad_norm": 0.536263668019581, + "learning_rate": 1.7503922908733972e-05, + "loss": 0.7899, + "step": 11060 + }, + { + "epoch": 0.2377883747905658, + "grad_norm": 0.5570104505775965, + "learning_rate": 1.74994155027627e-05, + "loss": 0.7971, + "step": 11070 + }, + { + "epoch": 0.2380031791038364, + "grad_norm": 0.5609858905324858, + "learning_rate": 1.7494904612203557e-05, + "loss": 0.7866, + "step": 11080 + }, + { + "epoch": 0.238217983417107, + "grad_norm": 0.5571287784622291, + "learning_rate": 1.7490390239152523e-05, + "loss": 0.7964, + "step": 11090 + }, + { + "epoch": 0.23843278773037763, + "grad_norm": 0.5618669494447492, + "learning_rate": 1.748587238570719e-05, + "loss": 0.8074, + "step": 11100 + }, + { + "epoch": 0.23864759204364824, + "grad_norm": 0.5955821944742522, + "learning_rate": 1.748135105396677e-05, + "loss": 0.8106, + "step": 11110 + }, + { + "epoch": 0.23886239635691886, + "grad_norm": 0.5507627007193192, + "learning_rate": 1.747682624603209e-05, + "loss": 0.7828, + "step": 11120 + }, + { + "epoch": 0.23907720067018945, + "grad_norm": 0.5861560768919388, + "learning_rate": 1.747229796400559e-05, + "loss": 0.8241, + "step": 11130 + }, + { + "epoch": 0.23929200498346007, + "grad_norm": 0.5718220707403837, + "learning_rate": 1.7467766209991332e-05, + "loss": 0.794, + "step": 11140 + }, + { + "epoch": 0.23950680929673068, + "grad_norm": 0.6034932739087517, + "learning_rate": 1.7463230986094982e-05, + "loss": 0.801, + "step": 11150 + }, + { + "epoch": 0.2397216136100013, + "grad_norm": 0.5581638199123323, + "learning_rate": 1.7458692294423825e-05, + "loss": 0.8061, + "step": 11160 + }, + { + "epoch": 0.2399364179232719, + "grad_norm": 0.5395247610317755, + "learning_rate": 1.7454150137086753e-05, + "loss": 0.7864, + "step": 11170 + }, + { + "epoch": 0.2401512222365425, + "grad_norm": 0.5931900048885603, + "learning_rate": 1.7449604516194266e-05, + "loss": 0.7947, + "step": 11180 + }, + { + "epoch": 0.24036602654981312, + "grad_norm": 0.5574150201939502, + "learning_rate": 1.7445055433858485e-05, + "loss": 0.7992, + "step": 11190 + }, + { + "epoch": 0.24058083086308374, + "grad_norm": 0.558048596510596, + "learning_rate": 1.744050289219313e-05, + "loss": 0.8036, + "step": 11200 + }, + { + "epoch": 0.24079563517635433, + "grad_norm": 0.5544503116923134, + "learning_rate": 1.743594689331353e-05, + "loss": 0.7892, + "step": 11210 + }, + { + "epoch": 0.24101043948962494, + "grad_norm": 0.5913966197755747, + "learning_rate": 1.7431387439336615e-05, + "loss": 0.7882, + "step": 11220 + }, + { + "epoch": 0.24122524380289556, + "grad_norm": 0.5609044365761472, + "learning_rate": 
1.7426824532380932e-05, + "loss": 0.8006, + "step": 11230 + }, + { + "epoch": 0.24144004811616618, + "grad_norm": 0.5598501767608501, + "learning_rate": 1.7422258174566625e-05, + "loss": 0.8034, + "step": 11240 + }, + { + "epoch": 0.2416548524294368, + "grad_norm": 0.5598467174087662, + "learning_rate": 1.741768836801544e-05, + "loss": 0.808, + "step": 11250 + }, + { + "epoch": 0.24186965674270738, + "grad_norm": 0.5571089293330055, + "learning_rate": 1.741311511485073e-05, + "loss": 0.7828, + "step": 11260 + }, + { + "epoch": 0.242084461055978, + "grad_norm": 0.5858030780897929, + "learning_rate": 1.7408538417197447e-05, + "loss": 0.7881, + "step": 11270 + }, + { + "epoch": 0.24229926536924862, + "grad_norm": 0.5664884004513725, + "learning_rate": 1.7403958277182143e-05, + "loss": 0.792, + "step": 11280 + }, + { + "epoch": 0.24251406968251923, + "grad_norm": 0.5523972315190862, + "learning_rate": 1.7399374696932977e-05, + "loss": 0.795, + "step": 11290 + }, + { + "epoch": 0.24272887399578982, + "grad_norm": 0.5801081531529861, + "learning_rate": 1.7394787678579693e-05, + "loss": 0.8056, + "step": 11300 + }, + { + "epoch": 0.24294367830906044, + "grad_norm": 0.5851376223053917, + "learning_rate": 1.739019722425364e-05, + "loss": 0.797, + "step": 11310 + }, + { + "epoch": 0.24315848262233106, + "grad_norm": 0.5826239567427932, + "learning_rate": 1.7385603336087766e-05, + "loss": 0.7967, + "step": 11320 + }, + { + "epoch": 0.24337328693560167, + "grad_norm": 0.5671104979630887, + "learning_rate": 1.738100601621661e-05, + "loss": 0.8042, + "step": 11330 + }, + { + "epoch": 0.2435880912488723, + "grad_norm": 0.5526309623916124, + "learning_rate": 1.737640526677631e-05, + "loss": 0.7974, + "step": 11340 + }, + { + "epoch": 0.24380289556214288, + "grad_norm": 0.5952220328485525, + "learning_rate": 1.737180108990459e-05, + "loss": 0.7988, + "step": 11350 + }, + { + "epoch": 0.2440176998754135, + "grad_norm": 0.5576900075243258, + "learning_rate": 1.736719348774077e-05, + "loss": 0.792, + "step": 11360 + }, + { + "epoch": 0.2442325041886841, + "grad_norm": 0.5646590624675798, + "learning_rate": 1.7362582462425775e-05, + "loss": 0.7918, + "step": 11370 + }, + { + "epoch": 0.24444730850195473, + "grad_norm": 0.5650066999862957, + "learning_rate": 1.7357968016102094e-05, + "loss": 0.8026, + "step": 11380 + }, + { + "epoch": 0.24466211281522532, + "grad_norm": 0.5675843616193625, + "learning_rate": 1.7353350150913826e-05, + "loss": 0.7983, + "step": 11390 + }, + { + "epoch": 0.24487691712849594, + "grad_norm": 0.516651808309515, + "learning_rate": 1.734872886900665e-05, + "loss": 0.7707, + "step": 11400 + }, + { + "epoch": 0.24509172144176655, + "grad_norm": 0.5480548050237413, + "learning_rate": 1.7344104172527845e-05, + "loss": 0.7901, + "step": 11410 + }, + { + "epoch": 0.24530652575503717, + "grad_norm": 0.5467792895993273, + "learning_rate": 1.7339476063626252e-05, + "loss": 0.805, + "step": 11420 + }, + { + "epoch": 0.24552133006830776, + "grad_norm": 0.5675774870807921, + "learning_rate": 1.733484454445232e-05, + "loss": 0.7789, + "step": 11430 + }, + { + "epoch": 0.24573613438157837, + "grad_norm": 0.5430330356783232, + "learning_rate": 1.7330209617158075e-05, + "loss": 0.7893, + "step": 11440 + }, + { + "epoch": 0.245950938694849, + "grad_norm": 0.5408803807446219, + "learning_rate": 1.7325571283897126e-05, + "loss": 0.7897, + "step": 11450 + }, + { + "epoch": 0.2461657430081196, + "grad_norm": 0.5536255557245513, + "learning_rate": 1.7320929546824662e-05, + "loss": 0.793, + "step": 11460 + 
}, + { + "epoch": 0.24638054732139023, + "grad_norm": 0.5669589146457302, + "learning_rate": 1.731628440809746e-05, + "loss": 0.7802, + "step": 11470 + }, + { + "epoch": 0.24659535163466081, + "grad_norm": 0.5574302178274012, + "learning_rate": 1.731163586987387e-05, + "loss": 0.7847, + "step": 11480 + }, + { + "epoch": 0.24681015594793143, + "grad_norm": 0.5563586377765385, + "learning_rate": 1.7306983934313833e-05, + "loss": 0.7973, + "step": 11490 + }, + { + "epoch": 0.24702496026120205, + "grad_norm": 0.5541682433080475, + "learning_rate": 1.730232860357885e-05, + "loss": 0.7972, + "step": 11500 + }, + { + "epoch": 0.24723976457447266, + "grad_norm": 0.5487057694995032, + "learning_rate": 1.7297669879832025e-05, + "loss": 0.783, + "step": 11510 + }, + { + "epoch": 0.24745456888774325, + "grad_norm": 0.5407120966552204, + "learning_rate": 1.7293007765238012e-05, + "loss": 0.7858, + "step": 11520 + }, + { + "epoch": 0.24766937320101387, + "grad_norm": 0.6189329723987206, + "learning_rate": 1.728834226196306e-05, + "loss": 0.7928, + "step": 11530 + }, + { + "epoch": 0.2478841775142845, + "grad_norm": 0.5365545625639055, + "learning_rate": 1.728367337217498e-05, + "loss": 0.7922, + "step": 11540 + }, + { + "epoch": 0.2480989818275551, + "grad_norm": 0.5541237894376929, + "learning_rate": 1.7279001098043164e-05, + "loss": 0.7864, + "step": 11550 + }, + { + "epoch": 0.24831378614082572, + "grad_norm": 0.5518328452834319, + "learning_rate": 1.7274325441738578e-05, + "loss": 0.7993, + "step": 11560 + }, + { + "epoch": 0.2485285904540963, + "grad_norm": 0.5476209039060521, + "learning_rate": 1.7269646405433754e-05, + "loss": 0.7845, + "step": 11570 + }, + { + "epoch": 0.24874339476736693, + "grad_norm": 0.5374788890443903, + "learning_rate": 1.7264963991302798e-05, + "loss": 0.7736, + "step": 11580 + }, + { + "epoch": 0.24895819908063754, + "grad_norm": 0.5528441478075925, + "learning_rate": 1.7260278201521377e-05, + "loss": 0.8082, + "step": 11590 + }, + { + "epoch": 0.24917300339390816, + "grad_norm": 0.5517627198236814, + "learning_rate": 1.725558903826674e-05, + "loss": 0.7964, + "step": 11600 + }, + { + "epoch": 0.24938780770717875, + "grad_norm": 0.526708104255714, + "learning_rate": 1.7250896503717697e-05, + "loss": 0.7846, + "step": 11610 + }, + { + "epoch": 0.24960261202044937, + "grad_norm": 0.5677760683591515, + "learning_rate": 1.724620060005462e-05, + "loss": 0.8056, + "step": 11620 + }, + { + "epoch": 0.24981741633371998, + "grad_norm": 0.5445293154923263, + "learning_rate": 1.724150132945946e-05, + "loss": 0.7925, + "step": 11630 + }, + { + "epoch": 0.2500322206469906, + "grad_norm": 0.5253609554392292, + "learning_rate": 1.723679869411571e-05, + "loss": 0.795, + "step": 11640 + }, + { + "epoch": 0.2502470249602612, + "grad_norm": 0.5717202718646365, + "learning_rate": 1.723209269620845e-05, + "loss": 0.776, + "step": 11650 + }, + { + "epoch": 0.25046182927353183, + "grad_norm": 0.5795012398020939, + "learning_rate": 1.722738333792431e-05, + "loss": 0.7879, + "step": 11660 + }, + { + "epoch": 0.2506766335868024, + "grad_norm": 0.5572184542777283, + "learning_rate": 1.722267062145148e-05, + "loss": 0.7822, + "step": 11670 + }, + { + "epoch": 0.250891437900073, + "grad_norm": 0.5617363405871534, + "learning_rate": 1.721795454897972e-05, + "loss": 0.7905, + "step": 11680 + }, + { + "epoch": 0.25110624221334366, + "grad_norm": 0.5445190221346468, + "learning_rate": 1.7213235122700334e-05, + "loss": 0.8082, + "step": 11690 + }, + { + "epoch": 0.25132104652661424, + "grad_norm": 
0.5395663294268565, + "learning_rate": 1.7208512344806204e-05, + "loss": 0.776, + "step": 11700 + }, + { + "epoch": 0.2515358508398849, + "grad_norm": 0.5454264355323872, + "learning_rate": 1.7203786217491757e-05, + "loss": 0.7828, + "step": 11710 + }, + { + "epoch": 0.2517506551531555, + "grad_norm": 0.5778897096941694, + "learning_rate": 1.7199056742952973e-05, + "loss": 0.7861, + "step": 11720 + }, + { + "epoch": 0.25196545946642607, + "grad_norm": 0.5436108360977332, + "learning_rate": 1.7194323923387396e-05, + "loss": 0.7887, + "step": 11730 + }, + { + "epoch": 0.2521802637796967, + "grad_norm": 0.5480998527967708, + "learning_rate": 1.718958776099412e-05, + "loss": 0.8009, + "step": 11740 + }, + { + "epoch": 0.2523950680929673, + "grad_norm": 0.5553968599080923, + "learning_rate": 1.718484825797379e-05, + "loss": 0.7953, + "step": 11750 + }, + { + "epoch": 0.2526098724062379, + "grad_norm": 0.5633121625376932, + "learning_rate": 1.7180105416528613e-05, + "loss": 0.7836, + "step": 11760 + }, + { + "epoch": 0.25282467671950853, + "grad_norm": 0.5706756900855683, + "learning_rate": 1.7175359238862335e-05, + "loss": 0.7963, + "step": 11770 + }, + { + "epoch": 0.2530394810327791, + "grad_norm": 0.5594666956751811, + "learning_rate": 1.7170609727180258e-05, + "loss": 0.7881, + "step": 11780 + }, + { + "epoch": 0.25325428534604977, + "grad_norm": 0.5537400526042484, + "learning_rate": 1.7165856883689237e-05, + "loss": 0.7862, + "step": 11790 + }, + { + "epoch": 0.25346908965932036, + "grad_norm": 0.5561566681420613, + "learning_rate": 1.7161100710597668e-05, + "loss": 0.7856, + "step": 11800 + }, + { + "epoch": 0.25368389397259095, + "grad_norm": 0.575034466737522, + "learning_rate": 1.71563412101155e-05, + "loss": 0.7872, + "step": 11810 + }, + { + "epoch": 0.2538986982858616, + "grad_norm": 0.5674721717746797, + "learning_rate": 1.7151578384454218e-05, + "loss": 0.8035, + "step": 11820 + }, + { + "epoch": 0.2541135025991322, + "grad_norm": 0.5471441442602709, + "learning_rate": 1.714681223582686e-05, + "loss": 0.7835, + "step": 11830 + }, + { + "epoch": 0.2543283069124028, + "grad_norm": 0.5349338344027432, + "learning_rate": 1.7142042766448016e-05, + "loss": 0.7704, + "step": 11840 + }, + { + "epoch": 0.2545431112256734, + "grad_norm": 0.5289756077207555, + "learning_rate": 1.7137269978533804e-05, + "loss": 0.7881, + "step": 11850 + }, + { + "epoch": 0.254757915538944, + "grad_norm": 0.5654604443984577, + "learning_rate": 1.713249387430189e-05, + "loss": 0.7873, + "step": 11860 + }, + { + "epoch": 0.25497271985221465, + "grad_norm": 0.5479301372643116, + "learning_rate": 1.7127714455971476e-05, + "loss": 0.7853, + "step": 11870 + }, + { + "epoch": 0.25518752416548524, + "grad_norm": 0.5642448334289781, + "learning_rate": 1.7122931725763318e-05, + "loss": 0.7955, + "step": 11880 + }, + { + "epoch": 0.2554023284787559, + "grad_norm": 0.5561951319477092, + "learning_rate": 1.71181456858997e-05, + "loss": 0.7998, + "step": 11890 + }, + { + "epoch": 0.25561713279202647, + "grad_norm": 0.5551743423950759, + "learning_rate": 1.711335633860444e-05, + "loss": 0.7868, + "step": 11900 + }, + { + "epoch": 0.25583193710529706, + "grad_norm": 0.575663087645707, + "learning_rate": 1.71085636861029e-05, + "loss": 0.7946, + "step": 11910 + }, + { + "epoch": 0.2560467414185677, + "grad_norm": 0.5705702380544423, + "learning_rate": 1.710376773062198e-05, + "loss": 0.8077, + "step": 11920 + }, + { + "epoch": 0.2562615457318383, + "grad_norm": 0.5517950323778168, + "learning_rate": 1.709896847439011e-05, + 
"loss": 0.7798, + "step": 11930 + }, + { + "epoch": 0.2564763500451089, + "grad_norm": 0.5654242254525084, + "learning_rate": 1.709416591963725e-05, + "loss": 0.7854, + "step": 11940 + }, + { + "epoch": 0.2566911543583795, + "grad_norm": 0.556721343917251, + "learning_rate": 1.7089360068594903e-05, + "loss": 0.7845, + "step": 11950 + }, + { + "epoch": 0.2569059586716501, + "grad_norm": 0.5647470558063254, + "learning_rate": 1.7084550923496094e-05, + "loss": 0.7827, + "step": 11960 + }, + { + "epoch": 0.25712076298492076, + "grad_norm": 0.5580955635924464, + "learning_rate": 1.7079738486575382e-05, + "loss": 0.7857, + "step": 11970 + }, + { + "epoch": 0.25733556729819135, + "grad_norm": 0.5655207694822407, + "learning_rate": 1.7074922760068855e-05, + "loss": 0.7751, + "step": 11980 + }, + { + "epoch": 0.25755037161146194, + "grad_norm": 0.5700568764310818, + "learning_rate": 1.7070103746214135e-05, + "loss": 0.796, + "step": 11990 + }, + { + "epoch": 0.2577651759247326, + "grad_norm": 0.530803254919797, + "learning_rate": 1.7065281447250363e-05, + "loss": 0.7799, + "step": 12000 + }, + { + "epoch": 0.25797998023800317, + "grad_norm": 0.559034831132991, + "learning_rate": 1.706045586541821e-05, + "loss": 0.7957, + "step": 12010 + }, + { + "epoch": 0.2581947845512738, + "grad_norm": 0.5597546125165273, + "learning_rate": 1.7055627002959872e-05, + "loss": 0.7845, + "step": 12020 + }, + { + "epoch": 0.2584095888645444, + "grad_norm": 0.6058840929391112, + "learning_rate": 1.7050794862119078e-05, + "loss": 0.7895, + "step": 12030 + }, + { + "epoch": 0.258624393177815, + "grad_norm": 0.5651994162157234, + "learning_rate": 1.704595944514106e-05, + "loss": 0.8003, + "step": 12040 + }, + { + "epoch": 0.25883919749108564, + "grad_norm": 0.5596861708905858, + "learning_rate": 1.7041120754272594e-05, + "loss": 0.8011, + "step": 12050 + }, + { + "epoch": 0.2590540018043562, + "grad_norm": 0.5505706888687052, + "learning_rate": 1.7036278791761965e-05, + "loss": 0.791, + "step": 12060 + }, + { + "epoch": 0.2592688061176268, + "grad_norm": 0.5545931326572373, + "learning_rate": 1.7031433559858977e-05, + "loss": 0.7804, + "step": 12070 + }, + { + "epoch": 0.25948361043089746, + "grad_norm": 3.102401423632908, + "learning_rate": 1.702658506081496e-05, + "loss": 0.8037, + "step": 12080 + }, + { + "epoch": 0.25969841474416805, + "grad_norm": 0.5450387286613136, + "learning_rate": 1.7021733296882758e-05, + "loss": 0.8026, + "step": 12090 + }, + { + "epoch": 0.2599132190574387, + "grad_norm": 0.541274551933326, + "learning_rate": 1.7016878270316738e-05, + "loss": 0.7963, + "step": 12100 + }, + { + "epoch": 0.2601280233707093, + "grad_norm": 0.5316129158445448, + "learning_rate": 1.701201998337277e-05, + "loss": 0.7797, + "step": 12110 + }, + { + "epoch": 0.26034282768397987, + "grad_norm": 0.5332911250510366, + "learning_rate": 1.700715843830825e-05, + "loss": 0.788, + "step": 12120 + }, + { + "epoch": 0.2605576319972505, + "grad_norm": 0.5385176920383756, + "learning_rate": 1.700229363738209e-05, + "loss": 0.7766, + "step": 12130 + }, + { + "epoch": 0.2607724363105211, + "grad_norm": 0.5598906242851724, + "learning_rate": 1.6997425582854704e-05, + "loss": 0.777, + "step": 12140 + }, + { + "epoch": 0.26098724062379175, + "grad_norm": 0.5487776354269147, + "learning_rate": 1.6992554276988022e-05, + "loss": 0.7737, + "step": 12150 + }, + { + "epoch": 0.26120204493706234, + "grad_norm": 0.5330929016698871, + "learning_rate": 1.6987679722045493e-05, + "loss": 0.7907, + "step": 12160 + }, + { + "epoch": 
0.2614168492503329, + "grad_norm": 0.5382294593191113, + "learning_rate": 1.6982801920292063e-05, + "loss": 0.792, + "step": 12170 + }, + { + "epoch": 0.2616316535636036, + "grad_norm": 0.5458332411357165, + "learning_rate": 1.6977920873994196e-05, + "loss": 0.8109, + "step": 12180 + }, + { + "epoch": 0.26184645787687416, + "grad_norm": 0.5588172559612458, + "learning_rate": 1.697303658541986e-05, + "loss": 0.7976, + "step": 12190 + }, + { + "epoch": 0.2620612621901448, + "grad_norm": 0.5291579474271565, + "learning_rate": 1.6968149056838525e-05, + "loss": 0.7796, + "step": 12200 + }, + { + "epoch": 0.2622760665034154, + "grad_norm": 0.5477937857011975, + "learning_rate": 1.6963258290521173e-05, + "loss": 0.7831, + "step": 12210 + }, + { + "epoch": 0.262490870816686, + "grad_norm": 0.5311847854529211, + "learning_rate": 1.6958364288740293e-05, + "loss": 0.7584, + "step": 12220 + }, + { + "epoch": 0.26270567512995663, + "grad_norm": 0.5360611480627258, + "learning_rate": 1.6953467053769864e-05, + "loss": 0.7846, + "step": 12230 + }, + { + "epoch": 0.2629204794432272, + "grad_norm": 0.5704482862900305, + "learning_rate": 1.6948566587885388e-05, + "loss": 0.8022, + "step": 12240 + }, + { + "epoch": 0.2631352837564978, + "grad_norm": 0.5362588968839999, + "learning_rate": 1.6943662893363845e-05, + "loss": 0.7916, + "step": 12250 + }, + { + "epoch": 0.26335008806976845, + "grad_norm": 0.5675401491595801, + "learning_rate": 1.6938755972483732e-05, + "loss": 0.7959, + "step": 12260 + }, + { + "epoch": 0.26356489238303904, + "grad_norm": 0.5540442999120004, + "learning_rate": 1.6933845827525037e-05, + "loss": 0.7906, + "step": 12270 + }, + { + "epoch": 0.2637796966963097, + "grad_norm": 0.5414329042014562, + "learning_rate": 1.6928932460769254e-05, + "loss": 0.7819, + "step": 12280 + }, + { + "epoch": 0.2639945010095803, + "grad_norm": 0.5410748013073313, + "learning_rate": 1.6924015874499363e-05, + "loss": 0.7669, + "step": 12290 + }, + { + "epoch": 0.26420930532285086, + "grad_norm": 0.5472743067105443, + "learning_rate": 1.691909607099985e-05, + "loss": 0.7821, + "step": 12300 + }, + { + "epoch": 0.2644241096361215, + "grad_norm": 0.5475456006044419, + "learning_rate": 1.6914173052556688e-05, + "loss": 0.7811, + "step": 12310 + }, + { + "epoch": 0.2646389139493921, + "grad_norm": 0.5507328978978161, + "learning_rate": 1.6909246821457346e-05, + "loss": 0.7774, + "step": 12320 + }, + { + "epoch": 0.26485371826266274, + "grad_norm": 0.5599737303976604, + "learning_rate": 1.690431737999079e-05, + "loss": 0.7921, + "step": 12330 + }, + { + "epoch": 0.26506852257593333, + "grad_norm": 0.5280366450787144, + "learning_rate": 1.6899384730447477e-05, + "loss": 0.7847, + "step": 12340 + }, + { + "epoch": 0.2652833268892039, + "grad_norm": 0.5435632022539839, + "learning_rate": 1.689444887511935e-05, + "loss": 0.7854, + "step": 12350 + }, + { + "epoch": 0.26549813120247456, + "grad_norm": 0.5647493787301932, + "learning_rate": 1.6889509816299844e-05, + "loss": 0.7899, + "step": 12360 + }, + { + "epoch": 0.26571293551574515, + "grad_norm": 0.5610876101815535, + "learning_rate": 1.688456755628388e-05, + "loss": 0.7874, + "step": 12370 + }, + { + "epoch": 0.26592773982901574, + "grad_norm": 0.5498086456378252, + "learning_rate": 1.6879622097367874e-05, + "loss": 0.7941, + "step": 12380 + }, + { + "epoch": 0.2661425441422864, + "grad_norm": 0.5622568962892611, + "learning_rate": 1.6874673441849715e-05, + "loss": 0.7856, + "step": 12390 + }, + { + "epoch": 0.266357348455557, + "grad_norm": 0.5279820352190503, + 
"learning_rate": 1.6869721592028792e-05, + "loss": 0.7965, + "step": 12400 + }, + { + "epoch": 0.2665721527688276, + "grad_norm": 0.5316923893826371, + "learning_rate": 1.6864766550205977e-05, + "loss": 0.8056, + "step": 12410 + }, + { + "epoch": 0.2667869570820982, + "grad_norm": 0.5847422188711642, + "learning_rate": 1.6859808318683606e-05, + "loss": 0.7845, + "step": 12420 + }, + { + "epoch": 0.2670017613953688, + "grad_norm": 0.5183382490604992, + "learning_rate": 1.685484689976552e-05, + "loss": 0.7954, + "step": 12430 + }, + { + "epoch": 0.26721656570863944, + "grad_norm": 0.548534083313305, + "learning_rate": 1.6849882295757037e-05, + "loss": 0.7834, + "step": 12440 + }, + { + "epoch": 0.26743137002191003, + "grad_norm": 0.555857167141612, + "learning_rate": 1.6844914508964937e-05, + "loss": 0.7885, + "step": 12450 + }, + { + "epoch": 0.2676461743351807, + "grad_norm": 0.5609532665687568, + "learning_rate": 1.6839943541697498e-05, + "loss": 0.7804, + "step": 12460 + }, + { + "epoch": 0.26786097864845126, + "grad_norm": 0.5601428417687324, + "learning_rate": 1.6834969396264472e-05, + "loss": 0.7858, + "step": 12470 + }, + { + "epoch": 0.26807578296172185, + "grad_norm": 0.5476848469778967, + "learning_rate": 1.682999207497708e-05, + "loss": 0.7938, + "step": 12480 + }, + { + "epoch": 0.2682905872749925, + "grad_norm": 0.5638917271864747, + "learning_rate": 1.6825011580148033e-05, + "loss": 0.7837, + "step": 12490 + }, + { + "epoch": 0.2685053915882631, + "grad_norm": 0.5894522331907057, + "learning_rate": 1.6820027914091497e-05, + "loss": 0.7844, + "step": 12500 + }, + { + "epoch": 0.2687201959015337, + "grad_norm": 0.5610666708837045, + "learning_rate": 1.681504107912313e-05, + "loss": 0.7843, + "step": 12510 + }, + { + "epoch": 0.2689350002148043, + "grad_norm": 0.5446880622878415, + "learning_rate": 1.681005107756005e-05, + "loss": 0.7921, + "step": 12520 + }, + { + "epoch": 0.2691498045280749, + "grad_norm": 0.542926637375099, + "learning_rate": 1.6805057911720852e-05, + "loss": 0.7906, + "step": 12530 + }, + { + "epoch": 0.26936460884134555, + "grad_norm": 0.5750689040548754, + "learning_rate": 1.6800061583925603e-05, + "loss": 0.785, + "step": 12540 + }, + { + "epoch": 0.26957941315461614, + "grad_norm": 0.5144723555135201, + "learning_rate": 1.679506209649583e-05, + "loss": 0.7801, + "step": 12550 + }, + { + "epoch": 0.26979421746788673, + "grad_norm": 0.5368301892072017, + "learning_rate": 1.6790059451754545e-05, + "loss": 0.7764, + "step": 12560 + }, + { + "epoch": 0.2700090217811574, + "grad_norm": 0.5642425304345098, + "learning_rate": 1.6785053652026204e-05, + "loss": 0.8061, + "step": 12570 + }, + { + "epoch": 0.27022382609442797, + "grad_norm": 0.5539198208515579, + "learning_rate": 1.678004469963675e-05, + "loss": 0.7863, + "step": 12580 + }, + { + "epoch": 0.2704386304076986, + "grad_norm": 0.5512816690327085, + "learning_rate": 1.6775032596913576e-05, + "loss": 0.7911, + "step": 12590 + }, + { + "epoch": 0.2706534347209692, + "grad_norm": 0.5815790405385224, + "learning_rate": 1.677001734618555e-05, + "loss": 0.7852, + "step": 12600 + }, + { + "epoch": 0.2708682390342398, + "grad_norm": 0.5612088205707622, + "learning_rate": 1.6764998949783e-05, + "loss": 0.7828, + "step": 12610 + }, + { + "epoch": 0.27108304334751043, + "grad_norm": 0.530172464989036, + "learning_rate": 1.675997741003771e-05, + "loss": 0.7698, + "step": 12620 + }, + { + "epoch": 0.271297847660781, + "grad_norm": 0.552005993228374, + "learning_rate": 1.6754952729282926e-05, + "loss": 0.7745, + 
"step": 12630 + }, + { + "epoch": 0.27151265197405167, + "grad_norm": 0.5435122213473302, + "learning_rate": 1.6749924909853355e-05, + "loss": 0.7854, + "step": 12640 + }, + { + "epoch": 0.27172745628732226, + "grad_norm": 0.5530374063509907, + "learning_rate": 1.6744893954085173e-05, + "loss": 0.7958, + "step": 12650 + }, + { + "epoch": 0.27194226060059284, + "grad_norm": 0.5611802591773057, + "learning_rate": 1.673985986431599e-05, + "loss": 0.7966, + "step": 12660 + }, + { + "epoch": 0.2721570649138635, + "grad_norm": 0.5568671271875136, + "learning_rate": 1.6734822642884893e-05, + "loss": 0.7764, + "step": 12670 + }, + { + "epoch": 0.2723718692271341, + "grad_norm": 0.5572155300702621, + "learning_rate": 1.6729782292132415e-05, + "loss": 0.7757, + "step": 12680 + }, + { + "epoch": 0.27258667354040467, + "grad_norm": 0.5615138514243179, + "learning_rate": 1.6724738814400545e-05, + "loss": 0.798, + "step": 12690 + }, + { + "epoch": 0.2728014778536753, + "grad_norm": 0.5339476128169535, + "learning_rate": 1.671969221203272e-05, + "loss": 0.7828, + "step": 12700 + }, + { + "epoch": 0.2730162821669459, + "grad_norm": 0.5438064983467911, + "learning_rate": 1.671464248737384e-05, + "loss": 0.7792, + "step": 12710 + }, + { + "epoch": 0.27323108648021655, + "grad_norm": 0.5349876744589488, + "learning_rate": 1.6709589642770247e-05, + "loss": 0.7855, + "step": 12720 + }, + { + "epoch": 0.27344589079348713, + "grad_norm": 0.553863217138617, + "learning_rate": 1.6704533680569732e-05, + "loss": 0.7868, + "step": 12730 + }, + { + "epoch": 0.2736606951067577, + "grad_norm": 0.5390862263215795, + "learning_rate": 1.669947460312154e-05, + "loss": 0.7754, + "step": 12740 + }, + { + "epoch": 0.27387549942002837, + "grad_norm": 0.583549319445325, + "learning_rate": 1.669441241277636e-05, + "loss": 0.7917, + "step": 12750 + }, + { + "epoch": 0.27409030373329896, + "grad_norm": 0.5293298289589918, + "learning_rate": 1.668934711188633e-05, + "loss": 0.7909, + "step": 12760 + }, + { + "epoch": 0.2743051080465696, + "grad_norm": 0.569877867106919, + "learning_rate": 1.6684278702805024e-05, + "loss": 0.7767, + "step": 12770 + }, + { + "epoch": 0.2745199123598402, + "grad_norm": 0.5773582415344543, + "learning_rate": 1.667920718788748e-05, + "loss": 0.7856, + "step": 12780 + }, + { + "epoch": 0.2747347166731108, + "grad_norm": 0.5208586202102348, + "learning_rate": 1.667413256949016e-05, + "loss": 0.7805, + "step": 12790 + }, + { + "epoch": 0.2749495209863814, + "grad_norm": 0.5353938318257099, + "learning_rate": 1.666905484997097e-05, + "loss": 0.7876, + "step": 12800 + }, + { + "epoch": 0.275164325299652, + "grad_norm": 0.5424525339376945, + "learning_rate": 1.6663974031689272e-05, + "loss": 0.803, + "step": 12810 + }, + { + "epoch": 0.2753791296129226, + "grad_norm": 0.5505344864467518, + "learning_rate": 1.665889011700585e-05, + "loss": 0.7839, + "step": 12820 + }, + { + "epoch": 0.27559393392619325, + "grad_norm": 0.5688231819530734, + "learning_rate": 1.665380310828294e-05, + "loss": 0.8044, + "step": 12830 + }, + { + "epoch": 0.27580873823946384, + "grad_norm": 0.5423707638785459, + "learning_rate": 1.6648713007884207e-05, + "loss": 0.7813, + "step": 12840 + }, + { + "epoch": 0.2760235425527345, + "grad_norm": 0.5622431891874606, + "learning_rate": 1.6643619818174755e-05, + "loss": 0.792, + "step": 12850 + }, + { + "epoch": 0.27623834686600507, + "grad_norm": 0.5348635272783925, + "learning_rate": 1.6638523541521126e-05, + "loss": 0.8017, + "step": 12860 + }, + { + "epoch": 0.27645315117927566, + 
"grad_norm": 0.5539184848623694, + "learning_rate": 1.6633424180291293e-05, + "loss": 0.7935, + "step": 12870 + }, + { + "epoch": 0.2766679554925463, + "grad_norm": 0.531258076725924, + "learning_rate": 1.6628321736854664e-05, + "loss": 0.7771, + "step": 12880 + }, + { + "epoch": 0.2768827598058169, + "grad_norm": 0.5429903770907508, + "learning_rate": 1.662321621358208e-05, + "loss": 0.7682, + "step": 12890 + }, + { + "epoch": 0.27709756411908754, + "grad_norm": 0.5338561005284871, + "learning_rate": 1.6618107612845812e-05, + "loss": 0.7562, + "step": 12900 + }, + { + "epoch": 0.2773123684323581, + "grad_norm": 0.563699713244769, + "learning_rate": 1.6612995937019557e-05, + "loss": 0.7831, + "step": 12910 + }, + { + "epoch": 0.2775271727456287, + "grad_norm": 0.5974035990189168, + "learning_rate": 1.6607881188478446e-05, + "loss": 0.793, + "step": 12920 + }, + { + "epoch": 0.27774197705889936, + "grad_norm": 0.5786108959234898, + "learning_rate": 1.6602763369599037e-05, + "loss": 0.7701, + "step": 12930 + }, + { + "epoch": 0.27795678137216995, + "grad_norm": 0.5244598945283411, + "learning_rate": 1.659764248275932e-05, + "loss": 0.7752, + "step": 12940 + }, + { + "epoch": 0.27817158568544054, + "grad_norm": 0.5392263527270091, + "learning_rate": 1.6592518530338692e-05, + "loss": 0.789, + "step": 12950 + }, + { + "epoch": 0.2783863899987112, + "grad_norm": 0.5572545049836365, + "learning_rate": 1.6587391514718e-05, + "loss": 0.8154, + "step": 12960 + }, + { + "epoch": 0.27860119431198177, + "grad_norm": 0.5549009307377286, + "learning_rate": 1.6582261438279488e-05, + "loss": 0.7904, + "step": 12970 + }, + { + "epoch": 0.2788159986252524, + "grad_norm": 0.5736525203694448, + "learning_rate": 1.6577128303406843e-05, + "loss": 0.7838, + "step": 12980 + }, + { + "epoch": 0.279030802938523, + "grad_norm": 0.5422061227803314, + "learning_rate": 1.657199211248517e-05, + "loss": 0.7775, + "step": 12990 + }, + { + "epoch": 0.2792456072517936, + "grad_norm": 0.5552087862142333, + "learning_rate": 1.656685286790098e-05, + "loss": 0.7918, + "step": 13000 + }, + { + "epoch": 0.27946041156506424, + "grad_norm": 0.5769438755666713, + "learning_rate": 1.656171057204222e-05, + "loss": 0.773, + "step": 13010 + }, + { + "epoch": 0.2796752158783348, + "grad_norm": 0.544310206877259, + "learning_rate": 1.655656522729824e-05, + "loss": 0.787, + "step": 13020 + }, + { + "epoch": 0.27989002019160547, + "grad_norm": 0.5380675260235818, + "learning_rate": 1.6551416836059818e-05, + "loss": 0.7874, + "step": 13030 + }, + { + "epoch": 0.28010482450487606, + "grad_norm": 0.5340600751027904, + "learning_rate": 1.6546265400719143e-05, + "loss": 0.7858, + "step": 13040 + }, + { + "epoch": 0.28031962881814665, + "grad_norm": 0.547403375047864, + "learning_rate": 1.6541110923669816e-05, + "loss": 0.7847, + "step": 13050 + }, + { + "epoch": 0.2805344331314173, + "grad_norm": 0.533402580926284, + "learning_rate": 1.653595340730686e-05, + "loss": 0.8004, + "step": 13060 + }, + { + "epoch": 0.2807492374446879, + "grad_norm": 0.5426741999184683, + "learning_rate": 1.65307928540267e-05, + "loss": 0.7797, + "step": 13070 + }, + { + "epoch": 0.2809640417579585, + "grad_norm": 0.5342611404015724, + "learning_rate": 1.6525629266227177e-05, + "loss": 0.7787, + "step": 13080 + }, + { + "epoch": 0.2811788460712291, + "grad_norm": 1.7861965467084178, + "learning_rate": 1.6520462646307543e-05, + "loss": 0.8042, + "step": 13090 + }, + { + "epoch": 0.2813936503844997, + "grad_norm": 0.5453771703119942, + "learning_rate": 
1.6515292996668452e-05, + "loss": 0.7896, + "step": 13100 + }, + { + "epoch": 0.28160845469777035, + "grad_norm": 0.5393581439017914, + "learning_rate": 1.6510120319711974e-05, + "loss": 0.7636, + "step": 13110 + }, + { + "epoch": 0.28182325901104094, + "grad_norm": 0.566923191821844, + "learning_rate": 1.650494461784159e-05, + "loss": 0.7824, + "step": 13120 + }, + { + "epoch": 0.2820380633243115, + "grad_norm": 0.5909033333766904, + "learning_rate": 1.649976589346217e-05, + "loss": 0.7791, + "step": 13130 + }, + { + "epoch": 0.28225286763758217, + "grad_norm": 0.5459678033195983, + "learning_rate": 1.6494584148979996e-05, + "loss": 0.7861, + "step": 13140 + }, + { + "epoch": 0.28246767195085276, + "grad_norm": 0.5763291162585454, + "learning_rate": 1.648939938680276e-05, + "loss": 0.7864, + "step": 13150 + }, + { + "epoch": 0.2826824762641234, + "grad_norm": 0.5341209524759871, + "learning_rate": 1.6484211609339555e-05, + "loss": 0.7803, + "step": 13160 + }, + { + "epoch": 0.282897280577394, + "grad_norm": 0.5736855699742509, + "learning_rate": 1.647902081900086e-05, + "loss": 0.7897, + "step": 13170 + }, + { + "epoch": 0.2831120848906646, + "grad_norm": 0.5531932895913767, + "learning_rate": 1.6473827018198573e-05, + "loss": 0.7928, + "step": 13180 + }, + { + "epoch": 0.28332688920393523, + "grad_norm": 0.5640369977192191, + "learning_rate": 1.6468630209345984e-05, + "loss": 0.8076, + "step": 13190 + }, + { + "epoch": 0.2835416935172058, + "grad_norm": 0.5609123963099327, + "learning_rate": 1.6463430394857772e-05, + "loss": 0.7912, + "step": 13200 + }, + { + "epoch": 0.28375649783047646, + "grad_norm": 0.5557233716140605, + "learning_rate": 1.6458227577150024e-05, + "loss": 0.788, + "step": 13210 + }, + { + "epoch": 0.28397130214374705, + "grad_norm": 0.5662855198896333, + "learning_rate": 1.6453021758640218e-05, + "loss": 0.7884, + "step": 13220 + }, + { + "epoch": 0.28418610645701764, + "grad_norm": 0.5412381516020387, + "learning_rate": 1.644781294174723e-05, + "loss": 0.7956, + "step": 13230 + }, + { + "epoch": 0.2844009107702883, + "grad_norm": 0.5385533947113791, + "learning_rate": 1.6442601128891323e-05, + "loss": 0.7864, + "step": 13240 + }, + { + "epoch": 0.2846157150835589, + "grad_norm": 0.531510421245019, + "learning_rate": 1.6437386322494155e-05, + "loss": 0.7705, + "step": 13250 + }, + { + "epoch": 0.28483051939682946, + "grad_norm": 0.5426033523747722, + "learning_rate": 1.6432168524978777e-05, + "loss": 0.7787, + "step": 13260 + }, + { + "epoch": 0.2850453237101001, + "grad_norm": 0.5623236256089558, + "learning_rate": 1.6426947738769623e-05, + "loss": 0.7813, + "step": 13270 + }, + { + "epoch": 0.2852601280233707, + "grad_norm": 0.5394725869592929, + "learning_rate": 1.6421723966292525e-05, + "loss": 0.7738, + "step": 13280 + }, + { + "epoch": 0.28547493233664134, + "grad_norm": 0.5465469510807215, + "learning_rate": 1.6416497209974697e-05, + "loss": 0.7557, + "step": 13290 + }, + { + "epoch": 0.28568973664991193, + "grad_norm": 0.5371708781271813, + "learning_rate": 1.641126747224474e-05, + "loss": 0.7871, + "step": 13300 + }, + { + "epoch": 0.2859045409631825, + "grad_norm": 0.5400513136671761, + "learning_rate": 1.640603475553264e-05, + "loss": 0.7835, + "step": 13310 + }, + { + "epoch": 0.28611934527645316, + "grad_norm": 0.5394221231389933, + "learning_rate": 1.640079906226977e-05, + "loss": 0.765, + "step": 13320 + }, + { + "epoch": 0.28633414958972375, + "grad_norm": 0.5222441443064099, + "learning_rate": 1.6395560394888888e-05, + "loss": 0.7757, + "step": 13330 
+ }, + { + "epoch": 0.2865489539029944, + "grad_norm": 0.5606502765218887, + "learning_rate": 1.6390318755824118e-05, + "loss": 0.7868, + "step": 13340 + }, + { + "epoch": 0.286763758216265, + "grad_norm": 0.609652591588806, + "learning_rate": 1.6385074147510987e-05, + "loss": 0.7809, + "step": 13350 + }, + { + "epoch": 0.2869785625295356, + "grad_norm": 0.526740208124346, + "learning_rate": 1.6379826572386386e-05, + "loss": 0.7898, + "step": 13360 + }, + { + "epoch": 0.2871933668428062, + "grad_norm": 0.5305922224130004, + "learning_rate": 1.6374576032888594e-05, + "loss": 0.7807, + "step": 13370 + }, + { + "epoch": 0.2874081711560768, + "grad_norm": 0.5540486723971066, + "learning_rate": 1.6369322531457263e-05, + "loss": 0.7788, + "step": 13380 + }, + { + "epoch": 0.2876229754693474, + "grad_norm": 0.52462212158824, + "learning_rate": 1.6364066070533414e-05, + "loss": 0.7933, + "step": 13390 + }, + { + "epoch": 0.28783777978261804, + "grad_norm": 0.531247543108683, + "learning_rate": 1.635880665255946e-05, + "loss": 0.776, + "step": 13400 + }, + { + "epoch": 0.28805258409588863, + "grad_norm": 0.5732783680452849, + "learning_rate": 1.6353544279979177e-05, + "loss": 0.7931, + "step": 13410 + }, + { + "epoch": 0.2882673884091593, + "grad_norm": 0.542974827627677, + "learning_rate": 1.634827895523771e-05, + "loss": 0.7782, + "step": 13420 + }, + { + "epoch": 0.28848219272242986, + "grad_norm": 0.5876492011137021, + "learning_rate": 1.6343010680781586e-05, + "loss": 0.7774, + "step": 13430 + }, + { + "epoch": 0.28869699703570045, + "grad_norm": 0.5333223558241071, + "learning_rate": 1.63377394590587e-05, + "loss": 0.7908, + "step": 13440 + }, + { + "epoch": 0.2889118013489711, + "grad_norm": 0.5435404704513007, + "learning_rate": 1.6332465292518306e-05, + "loss": 0.784, + "step": 13450 + }, + { + "epoch": 0.2891266056622417, + "grad_norm": 0.5372220581463112, + "learning_rate": 1.6327188183611043e-05, + "loss": 0.7877, + "step": 13460 + }, + { + "epoch": 0.28934140997551233, + "grad_norm": 0.5517875641980315, + "learning_rate": 1.6321908134788904e-05, + "loss": 0.7755, + "step": 13470 + }, + { + "epoch": 0.2895562142887829, + "grad_norm": 0.5624183224646342, + "learning_rate": 1.6316625148505253e-05, + "loss": 0.7697, + "step": 13480 + }, + { + "epoch": 0.2897710186020535, + "grad_norm": 0.5190475908632274, + "learning_rate": 1.631133922721482e-05, + "loss": 0.7768, + "step": 13490 + }, + { + "epoch": 0.28998582291532415, + "grad_norm": 0.5513772019738882, + "learning_rate": 1.6306050373373698e-05, + "loss": 0.7742, + "step": 13500 + }, + { + "epoch": 0.29020062722859474, + "grad_norm": 0.552607695073294, + "learning_rate": 1.6300758589439342e-05, + "loss": 0.787, + "step": 13510 + }, + { + "epoch": 0.2904154315418654, + "grad_norm": 0.5521440595481499, + "learning_rate": 1.6295463877870566e-05, + "loss": 0.7812, + "step": 13520 + }, + { + "epoch": 0.290630235855136, + "grad_norm": 0.541378746109039, + "learning_rate": 1.6290166241127545e-05, + "loss": 0.7813, + "step": 13530 + }, + { + "epoch": 0.29084504016840657, + "grad_norm": 0.5837702840940312, + "learning_rate": 1.6284865681671826e-05, + "loss": 0.7875, + "step": 13540 + }, + { + "epoch": 0.2910598444816772, + "grad_norm": 0.531456315705228, + "learning_rate": 1.6279562201966287e-05, + "loss": 0.7744, + "step": 13550 + }, + { + "epoch": 0.2912746487949478, + "grad_norm": 0.5409752871762843, + "learning_rate": 1.627425580447519e-05, + "loss": 0.7848, + "step": 13560 + }, + { + "epoch": 0.2914894531082184, + "grad_norm": 
0.5685739203049746, + "learning_rate": 1.626894649166414e-05, + "loss": 0.7902, + "step": 13570 + }, + { + "epoch": 0.29170425742148903, + "grad_norm": 0.5290562340681636, + "learning_rate": 1.6263634266000093e-05, + "loss": 0.7748, + "step": 13580 + }, + { + "epoch": 0.2919190617347596, + "grad_norm": 0.5526529473322866, + "learning_rate": 1.6258319129951366e-05, + "loss": 0.7911, + "step": 13590 + }, + { + "epoch": 0.29213386604803027, + "grad_norm": 0.5379467854648743, + "learning_rate": 1.6253001085987635e-05, + "loss": 0.7837, + "step": 13600 + }, + { + "epoch": 0.29234867036130086, + "grad_norm": 0.5629511775533564, + "learning_rate": 1.6247680136579904e-05, + "loss": 0.7904, + "step": 13610 + }, + { + "epoch": 0.29256347467457144, + "grad_norm": 0.5637137982010608, + "learning_rate": 1.624235628420055e-05, + "loss": 0.7789, + "step": 13620 + }, + { + "epoch": 0.2927782789878421, + "grad_norm": 0.5552139861154454, + "learning_rate": 1.6237029531323286e-05, + "loss": 0.7766, + "step": 13630 + }, + { + "epoch": 0.2929930833011127, + "grad_norm": 0.5311375889317766, + "learning_rate": 1.6231699880423182e-05, + "loss": 0.7878, + "step": 13640 + }, + { + "epoch": 0.2932078876143833, + "grad_norm": 0.5438973923343646, + "learning_rate": 1.6226367333976642e-05, + "loss": 0.7665, + "step": 13650 + }, + { + "epoch": 0.2934226919276539, + "grad_norm": 0.5471337589971532, + "learning_rate": 1.6221031894461426e-05, + "loss": 0.7712, + "step": 13660 + }, + { + "epoch": 0.2936374962409245, + "grad_norm": 0.5478152109462807, + "learning_rate": 1.6215693564356635e-05, + "loss": 0.7734, + "step": 13670 + }, + { + "epoch": 0.29385230055419514, + "grad_norm": 0.5523511877357635, + "learning_rate": 1.6210352346142713e-05, + "loss": 0.7882, + "step": 13680 + }, + { + "epoch": 0.29406710486746573, + "grad_norm": 0.5405593748173753, + "learning_rate": 1.6205008242301445e-05, + "loss": 0.7967, + "step": 13690 + }, + { + "epoch": 0.2942819091807363, + "grad_norm": 0.5393904185913311, + "learning_rate": 1.619966125531596e-05, + "loss": 0.7975, + "step": 13700 + }, + { + "epoch": 0.29449671349400697, + "grad_norm": 0.5459167139610831, + "learning_rate": 1.6194311387670726e-05, + "loss": 0.7802, + "step": 13710 + }, + { + "epoch": 0.29471151780727756, + "grad_norm": 0.5734671415245739, + "learning_rate": 1.618895864185154e-05, + "loss": 0.7805, + "step": 13720 + }, + { + "epoch": 0.2949263221205482, + "grad_norm": 0.5200409133048494, + "learning_rate": 1.6183603020345552e-05, + "loss": 0.7684, + "step": 13730 + }, + { + "epoch": 0.2951411264338188, + "grad_norm": 0.532032766630498, + "learning_rate": 1.6178244525641238e-05, + "loss": 0.7607, + "step": 13740 + }, + { + "epoch": 0.2953559307470894, + "grad_norm": 0.5450386213659163, + "learning_rate": 1.617288316022841e-05, + "loss": 0.7853, + "step": 13750 + }, + { + "epoch": 0.29557073506036, + "grad_norm": 0.5412583231864979, + "learning_rate": 1.6167518926598215e-05, + "loss": 0.7878, + "step": 13760 + }, + { + "epoch": 0.2957855393736306, + "grad_norm": 0.5495103389055368, + "learning_rate": 1.616215182724314e-05, + "loss": 0.7768, + "step": 13770 + }, + { + "epoch": 0.29600034368690126, + "grad_norm": 0.5735150374820255, + "learning_rate": 1.6156781864656984e-05, + "loss": 0.7826, + "step": 13780 + }, + { + "epoch": 0.29621514800017185, + "grad_norm": 0.535148317937703, + "learning_rate": 1.6151409041334903e-05, + "loss": 0.7871, + "step": 13790 + }, + { + "epoch": 0.29642995231344244, + "grad_norm": 0.5237498381145003, + "learning_rate": 
1.6146033359773356e-05, + "loss": 0.7759, + "step": 13800 + }, + { + "epoch": 0.2966447566267131, + "grad_norm": 0.5342689644277192, + "learning_rate": 1.614065482247015e-05, + "loss": 0.7736, + "step": 13810 + }, + { + "epoch": 0.29685956093998367, + "grad_norm": 0.5322183986533855, + "learning_rate": 1.6135273431924408e-05, + "loss": 0.7913, + "step": 13820 + }, + { + "epoch": 0.29707436525325426, + "grad_norm": 0.5536125865349972, + "learning_rate": 1.6129889190636582e-05, + "loss": 0.7836, + "step": 13830 + }, + { + "epoch": 0.2972891695665249, + "grad_norm": 0.5320481447634432, + "learning_rate": 1.6124502101108453e-05, + "loss": 0.7802, + "step": 13840 + }, + { + "epoch": 0.2975039738797955, + "grad_norm": 0.5337884474675023, + "learning_rate": 1.6119112165843114e-05, + "loss": 0.7711, + "step": 13850 + }, + { + "epoch": 0.29771877819306614, + "grad_norm": 0.5448873311868312, + "learning_rate": 1.6113719387344992e-05, + "loss": 0.7763, + "step": 13860 + }, + { + "epoch": 0.2979335825063367, + "grad_norm": 0.5390758922033408, + "learning_rate": 1.6108323768119827e-05, + "loss": 0.7739, + "step": 13870 + }, + { + "epoch": 0.2981483868196073, + "grad_norm": 0.5365705766236541, + "learning_rate": 1.6102925310674686e-05, + "loss": 0.7847, + "step": 13880 + }, + { + "epoch": 0.29836319113287796, + "grad_norm": 0.5590376051193626, + "learning_rate": 1.6097524017517948e-05, + "loss": 0.7854, + "step": 13890 + }, + { + "epoch": 0.29857799544614855, + "grad_norm": 0.5309793859058803, + "learning_rate": 1.609211989115932e-05, + "loss": 0.7692, + "step": 13900 + }, + { + "epoch": 0.2987927997594192, + "grad_norm": 0.5308210455003953, + "learning_rate": 1.6086712934109804e-05, + "loss": 0.785, + "step": 13910 + }, + { + "epoch": 0.2990076040726898, + "grad_norm": 0.5490327798250314, + "learning_rate": 1.608130314888174e-05, + "loss": 0.7735, + "step": 13920 + }, + { + "epoch": 0.29922240838596037, + "grad_norm": 0.5343320632700467, + "learning_rate": 1.6075890537988778e-05, + "loss": 0.7903, + "step": 13930 + }, + { + "epoch": 0.299437212699231, + "grad_norm": 0.5479636639748584, + "learning_rate": 1.6070475103945867e-05, + "loss": 0.7889, + "step": 13940 + }, + { + "epoch": 0.2996520170125016, + "grad_norm": 0.5527723222499173, + "learning_rate": 1.6065056849269286e-05, + "loss": 0.7697, + "step": 13950 + }, + { + "epoch": 0.29986682132577225, + "grad_norm": 0.5576773833757296, + "learning_rate": 1.6059635776476607e-05, + "loss": 0.7796, + "step": 13960 + }, + { + "epoch": 0.30008162563904284, + "grad_norm": 0.540017123946042, + "learning_rate": 1.6054211888086726e-05, + "loss": 0.7789, + "step": 13970 + }, + { + "epoch": 0.3002964299523134, + "grad_norm": 0.5382534676965747, + "learning_rate": 1.604878518661984e-05, + "loss": 0.7864, + "step": 13980 + }, + { + "epoch": 0.30051123426558407, + "grad_norm": 0.5620971765430572, + "learning_rate": 1.6043355674597456e-05, + "loss": 0.7724, + "step": 13990 + }, + { + "epoch": 0.30072603857885466, + "grad_norm": 0.5512920688522195, + "learning_rate": 1.603792335454238e-05, + "loss": 0.7686, + "step": 14000 + }, + { + "epoch": 0.30094084289212525, + "grad_norm": 0.5681571893413786, + "learning_rate": 1.603248822897874e-05, + "loss": 0.7796, + "step": 14010 + }, + { + "epoch": 0.3011556472053959, + "grad_norm": 0.5466486772113822, + "learning_rate": 1.6027050300431945e-05, + "loss": 0.7809, + "step": 14020 + }, + { + "epoch": 0.3013704515186665, + "grad_norm": 0.5235657333916433, + "learning_rate": 1.6021609571428718e-05, + "loss": 0.7868, + "step": 
14030 + }, + { + "epoch": 0.3015852558319371, + "grad_norm": 0.5456016921796866, + "learning_rate": 1.6016166044497085e-05, + "loss": 0.7682, + "step": 14040 + }, + { + "epoch": 0.3018000601452077, + "grad_norm": 0.5352992818298888, + "learning_rate": 1.6010719722166373e-05, + "loss": 0.7809, + "step": 14050 + }, + { + "epoch": 0.3020148644584783, + "grad_norm": 0.5282682341595342, + "learning_rate": 1.6005270606967197e-05, + "loss": 0.7844, + "step": 14060 + }, + { + "epoch": 0.30222966877174895, + "grad_norm": 0.5139635209270281, + "learning_rate": 1.5999818701431485e-05, + "loss": 0.7667, + "step": 14070 + }, + { + "epoch": 0.30244447308501954, + "grad_norm": 0.5381885807038028, + "learning_rate": 1.599436400809245e-05, + "loss": 0.7911, + "step": 14080 + }, + { + "epoch": 0.3026592773982902, + "grad_norm": 0.5155734950185481, + "learning_rate": 1.59889065294846e-05, + "loss": 0.7656, + "step": 14090 + }, + { + "epoch": 0.30287408171156077, + "grad_norm": 0.5564242549239296, + "learning_rate": 1.5983446268143746e-05, + "loss": 0.7822, + "step": 14100 + }, + { + "epoch": 0.30308888602483136, + "grad_norm": 0.5187487100642706, + "learning_rate": 1.5977983226606987e-05, + "loss": 0.7908, + "step": 14110 + }, + { + "epoch": 0.303303690338102, + "grad_norm": 0.5642387470463246, + "learning_rate": 1.597251740741271e-05, + "loss": 0.7885, + "step": 14120 + }, + { + "epoch": 0.3035184946513726, + "grad_norm": 0.536223279687209, + "learning_rate": 1.59670488131006e-05, + "loss": 0.7718, + "step": 14130 + }, + { + "epoch": 0.3037332989646432, + "grad_norm": 0.5600475408070319, + "learning_rate": 1.5961577446211627e-05, + "loss": 0.7776, + "step": 14140 + }, + { + "epoch": 0.30394810327791383, + "grad_norm": 0.5437843190566551, + "learning_rate": 1.5956103309288053e-05, + "loss": 0.7685, + "step": 14150 + }, + { + "epoch": 0.3041629075911844, + "grad_norm": 0.5628863721101852, + "learning_rate": 1.5950626404873418e-05, + "loss": 0.7918, + "step": 14160 + }, + { + "epoch": 0.30437771190445506, + "grad_norm": 0.5149513964803338, + "learning_rate": 1.594514673551256e-05, + "loss": 0.7751, + "step": 14170 + }, + { + "epoch": 0.30459251621772565, + "grad_norm": 0.5563402316389365, + "learning_rate": 1.5939664303751596e-05, + "loss": 0.761, + "step": 14180 + }, + { + "epoch": 0.30480732053099624, + "grad_norm": 0.55820340770972, + "learning_rate": 1.5934179112137923e-05, + "loss": 0.8045, + "step": 14190 + }, + { + "epoch": 0.3050221248442669, + "grad_norm": 0.5265505329965681, + "learning_rate": 1.5928691163220228e-05, + "loss": 0.7685, + "step": 14200 + }, + { + "epoch": 0.3052369291575375, + "grad_norm": 0.5362485863281781, + "learning_rate": 1.592320045954847e-05, + "loss": 0.7765, + "step": 14210 + }, + { + "epoch": 0.3054517334708081, + "grad_norm": 0.5529028127823579, + "learning_rate": 1.5917707003673895e-05, + "loss": 0.7899, + "step": 14220 + }, + { + "epoch": 0.3056665377840787, + "grad_norm": 0.5546518668601578, + "learning_rate": 1.591221079814903e-05, + "loss": 0.7863, + "step": 14230 + }, + { + "epoch": 0.3058813420973493, + "grad_norm": 0.5542533511738308, + "learning_rate": 1.590671184552767e-05, + "loss": 0.7713, + "step": 14240 + }, + { + "epoch": 0.30609614641061994, + "grad_norm": 0.5234620656060907, + "learning_rate": 1.5901210148364895e-05, + "loss": 0.7673, + "step": 14250 + }, + { + "epoch": 0.30631095072389053, + "grad_norm": 0.5687517220863817, + "learning_rate": 1.5895705709217056e-05, + "loss": 0.796, + "step": 14260 + }, + { + "epoch": 0.3065257550371612, + "grad_norm": 
0.5396488135156106, + "learning_rate": 1.589019853064178e-05, + "loss": 0.7718, + "step": 14270 + }, + { + "epoch": 0.30674055935043176, + "grad_norm": 0.538832149688794, + "learning_rate": 1.5884688615197964e-05, + "loss": 0.7881, + "step": 14280 + }, + { + "epoch": 0.30695536366370235, + "grad_norm": 0.5560141103067452, + "learning_rate": 1.5879175965445782e-05, + "loss": 0.7714, + "step": 14290 + }, + { + "epoch": 0.307170167976973, + "grad_norm": 0.5565327133097089, + "learning_rate": 1.5873660583946673e-05, + "loss": 0.781, + "step": 14300 + }, + { + "epoch": 0.3073849722902436, + "grad_norm": 0.561419912796845, + "learning_rate": 1.5868142473263352e-05, + "loss": 0.7683, + "step": 14310 + }, + { + "epoch": 0.3075997766035142, + "grad_norm": 0.5469680337173222, + "learning_rate": 1.5862621635959788e-05, + "loss": 0.7952, + "step": 14320 + }, + { + "epoch": 0.3078145809167848, + "grad_norm": 0.6073248751313165, + "learning_rate": 1.5857098074601236e-05, + "loss": 0.7978, + "step": 14330 + }, + { + "epoch": 0.3080293852300554, + "grad_norm": 0.5435528930279098, + "learning_rate": 1.5851571791754205e-05, + "loss": 0.7731, + "step": 14340 + }, + { + "epoch": 0.30824418954332605, + "grad_norm": 0.5201227548391855, + "learning_rate": 1.584604278998647e-05, + "loss": 0.7774, + "step": 14350 + }, + { + "epoch": 0.30845899385659664, + "grad_norm": 0.5494541529358633, + "learning_rate": 1.5840511071867065e-05, + "loss": 0.7668, + "step": 14360 + }, + { + "epoch": 0.30867379816986723, + "grad_norm": 0.5795936450755371, + "learning_rate": 1.58349766399663e-05, + "loss": 0.7727, + "step": 14370 + }, + { + "epoch": 0.3088886024831379, + "grad_norm": 0.569527328062543, + "learning_rate": 1.5829439496855735e-05, + "loss": 0.7942, + "step": 14380 + }, + { + "epoch": 0.30910340679640846, + "grad_norm": 0.5347014125524017, + "learning_rate": 1.582389964510819e-05, + "loss": 0.763, + "step": 14390 + }, + { + "epoch": 0.3093182111096791, + "grad_norm": 0.5292356683082766, + "learning_rate": 1.5818357087297746e-05, + "loss": 0.7733, + "step": 14400 + }, + { + "epoch": 0.3095330154229497, + "grad_norm": 0.5564018120086363, + "learning_rate": 1.5812811825999742e-05, + "loss": 0.7832, + "step": 14410 + }, + { + "epoch": 0.3097478197362203, + "grad_norm": 0.5406172696967337, + "learning_rate": 1.580726386379077e-05, + "loss": 0.7755, + "step": 14420 + }, + { + "epoch": 0.30996262404949093, + "grad_norm": 0.5461638054581808, + "learning_rate": 1.5801713203248683e-05, + "loss": 0.7625, + "step": 14430 + }, + { + "epoch": 0.3101774283627615, + "grad_norm": 0.5454736211561317, + "learning_rate": 1.5796159846952578e-05, + "loss": 0.7815, + "step": 14440 + }, + { + "epoch": 0.3103922326760321, + "grad_norm": 0.5474695031583966, + "learning_rate": 1.579060379748282e-05, + "loss": 0.793, + "step": 14450 + }, + { + "epoch": 0.31060703698930275, + "grad_norm": 0.5312856498486378, + "learning_rate": 1.5785045057421006e-05, + "loss": 0.7645, + "step": 14460 + }, + { + "epoch": 0.31082184130257334, + "grad_norm": 0.5165648723128465, + "learning_rate": 1.5779483629349997e-05, + "loss": 0.7595, + "step": 14470 + }, + { + "epoch": 0.311036645615844, + "grad_norm": 0.5559441987275184, + "learning_rate": 1.57739195158539e-05, + "loss": 0.7839, + "step": 14480 + }, + { + "epoch": 0.3112514499291146, + "grad_norm": 0.5769441262612243, + "learning_rate": 1.5768352719518068e-05, + "loss": 0.7738, + "step": 14490 + }, + { + "epoch": 0.31146625424238517, + "grad_norm": 0.5473900907836559, + "learning_rate": 1.57627832429291e-05, + 
"loss": 0.7865, + "step": 14500 + }, + { + "epoch": 0.3116810585556558, + "grad_norm": 0.529556628584659, + "learning_rate": 1.5757211088674845e-05, + "loss": 0.7745, + "step": 14510 + }, + { + "epoch": 0.3118958628689264, + "grad_norm": 0.5361987622170035, + "learning_rate": 1.575163625934439e-05, + "loss": 0.7777, + "step": 14520 + }, + { + "epoch": 0.31211066718219704, + "grad_norm": 0.5262241261547033, + "learning_rate": 1.574605875752807e-05, + "loss": 0.778, + "step": 14530 + }, + { + "epoch": 0.31232547149546763, + "grad_norm": 0.5232473606874927, + "learning_rate": 1.5740478585817455e-05, + "loss": 0.7606, + "step": 14540 + }, + { + "epoch": 0.3125402758087382, + "grad_norm": 0.5302585690675944, + "learning_rate": 1.573489574680537e-05, + "loss": 0.7738, + "step": 14550 + }, + { + "epoch": 0.31275508012200887, + "grad_norm": 0.5338970296170283, + "learning_rate": 1.5729310243085858e-05, + "loss": 0.7769, + "step": 14560 + }, + { + "epoch": 0.31296988443527946, + "grad_norm": 0.5337153071748394, + "learning_rate": 1.5723722077254216e-05, + "loss": 0.7738, + "step": 14570 + }, + { + "epoch": 0.31318468874855004, + "grad_norm": 0.5408069906876969, + "learning_rate": 1.5718131251906978e-05, + "loss": 0.7837, + "step": 14580 + }, + { + "epoch": 0.3133994930618207, + "grad_norm": 0.5317712434488392, + "learning_rate": 1.5712537769641905e-05, + "loss": 0.7823, + "step": 14590 + }, + { + "epoch": 0.3136142973750913, + "grad_norm": 0.5312973847806638, + "learning_rate": 1.5706941633058e-05, + "loss": 0.7712, + "step": 14600 + }, + { + "epoch": 0.3138291016883619, + "grad_norm": 0.5465034121886095, + "learning_rate": 1.5701342844755492e-05, + "loss": 0.7767, + "step": 14610 + }, + { + "epoch": 0.3140439060016325, + "grad_norm": 0.5301591392732491, + "learning_rate": 1.5695741407335852e-05, + "loss": 0.7629, + "step": 14620 + }, + { + "epoch": 0.3142587103149031, + "grad_norm": 0.5474541814548766, + "learning_rate": 1.569013732340177e-05, + "loss": 0.7772, + "step": 14630 + }, + { + "epoch": 0.31447351462817374, + "grad_norm": 0.5168423966046714, + "learning_rate": 1.5684530595557174e-05, + "loss": 0.7595, + "step": 14640 + }, + { + "epoch": 0.31468831894144433, + "grad_norm": 0.5322937512665464, + "learning_rate": 1.5678921226407222e-05, + "loss": 0.7806, + "step": 14650 + }, + { + "epoch": 0.314903123254715, + "grad_norm": 0.5418401693011264, + "learning_rate": 1.567330921855829e-05, + "loss": 0.7729, + "step": 14660 + }, + { + "epoch": 0.31511792756798557, + "grad_norm": 0.5301368247497126, + "learning_rate": 1.566769457461799e-05, + "loss": 0.7683, + "step": 14670 + }, + { + "epoch": 0.31533273188125616, + "grad_norm": 0.5273233080040064, + "learning_rate": 1.5662077297195154e-05, + "loss": 0.7711, + "step": 14680 + }, + { + "epoch": 0.3155475361945268, + "grad_norm": 0.5162157276165048, + "learning_rate": 1.565645738889984e-05, + "loss": 0.7688, + "step": 14690 + }, + { + "epoch": 0.3157623405077974, + "grad_norm": 0.5571046212802432, + "learning_rate": 1.5650834852343324e-05, + "loss": 0.7767, + "step": 14700 + }, + { + "epoch": 0.31597714482106803, + "grad_norm": 0.5174571005771366, + "learning_rate": 1.5645209690138107e-05, + "loss": 0.7717, + "step": 14710 + }, + { + "epoch": 0.3161919491343386, + "grad_norm": 0.5250948275688863, + "learning_rate": 1.5639581904897908e-05, + "loss": 0.7875, + "step": 14720 + }, + { + "epoch": 0.3164067534476092, + "grad_norm": 0.5449756805014006, + "learning_rate": 1.5633951499237667e-05, + "loss": 0.7702, + "step": 14730 + }, + { + "epoch": 
0.31662155776087986, + "grad_norm": 0.5249159478890357, + "learning_rate": 1.5628318475773538e-05, + "loss": 0.7704, + "step": 14740 + }, + { + "epoch": 0.31683636207415045, + "grad_norm": 0.5363351915331944, + "learning_rate": 1.5622682837122895e-05, + "loss": 0.7795, + "step": 14750 + }, + { + "epoch": 0.31705116638742104, + "grad_norm": 0.5366684931300841, + "learning_rate": 1.5617044585904328e-05, + "loss": 0.7788, + "step": 14760 + }, + { + "epoch": 0.3172659707006917, + "grad_norm": 0.5176930213448244, + "learning_rate": 1.5611403724737635e-05, + "loss": 0.7729, + "step": 14770 + }, + { + "epoch": 0.31748077501396227, + "grad_norm": 0.53731576710967, + "learning_rate": 1.5605760256243834e-05, + "loss": 0.7673, + "step": 14780 + }, + { + "epoch": 0.3176955793272329, + "grad_norm": 0.5530960087868758, + "learning_rate": 1.560011418304515e-05, + "loss": 0.7719, + "step": 14790 + }, + { + "epoch": 0.3179103836405035, + "grad_norm": 0.5325204498819602, + "learning_rate": 1.559446550776502e-05, + "loss": 0.7777, + "step": 14800 + }, + { + "epoch": 0.3181251879537741, + "grad_norm": 0.5199725220262916, + "learning_rate": 1.558881423302808e-05, + "loss": 0.7752, + "step": 14810 + }, + { + "epoch": 0.31833999226704474, + "grad_norm": 0.5370286678853574, + "learning_rate": 1.5583160361460198e-05, + "loss": 0.7681, + "step": 14820 + }, + { + "epoch": 0.3185547965803153, + "grad_norm": 0.5257072462311427, + "learning_rate": 1.5577503895688427e-05, + "loss": 0.7619, + "step": 14830 + }, + { + "epoch": 0.31876960089358597, + "grad_norm": 0.5343487750052494, + "learning_rate": 1.557184483834103e-05, + "loss": 0.7703, + "step": 14840 + }, + { + "epoch": 0.31898440520685656, + "grad_norm": 0.5251831277823487, + "learning_rate": 1.5566183192047476e-05, + "loss": 0.7682, + "step": 14850 + }, + { + "epoch": 0.31919920952012715, + "grad_norm": 0.5458229759413251, + "learning_rate": 1.556051895943844e-05, + "loss": 0.7716, + "step": 14860 + }, + { + "epoch": 0.3194140138333978, + "grad_norm": 0.5272161429140526, + "learning_rate": 1.5554852143145794e-05, + "loss": 0.7759, + "step": 14870 + }, + { + "epoch": 0.3196288181466684, + "grad_norm": 0.5344254727616319, + "learning_rate": 1.5549182745802614e-05, + "loss": 0.7777, + "step": 14880 + }, + { + "epoch": 0.31984362245993897, + "grad_norm": 0.5548010647048678, + "learning_rate": 1.5543510770043177e-05, + "loss": 0.769, + "step": 14890 + }, + { + "epoch": 0.3200584267732096, + "grad_norm": 0.5412970475079941, + "learning_rate": 1.553783621850295e-05, + "loss": 0.7703, + "step": 14900 + }, + { + "epoch": 0.3202732310864802, + "grad_norm": 0.5351285928852882, + "learning_rate": 1.55321590938186e-05, + "loss": 0.7715, + "step": 14910 + }, + { + "epoch": 0.32048803539975085, + "grad_norm": 0.5223639221501771, + "learning_rate": 1.5526479398627997e-05, + "loss": 0.767, + "step": 14920 + }, + { + "epoch": 0.32070283971302144, + "grad_norm": 0.5335867628748313, + "learning_rate": 1.5520797135570192e-05, + "loss": 0.7703, + "step": 14930 + }, + { + "epoch": 0.320917644026292, + "grad_norm": 0.5441683074016587, + "learning_rate": 1.5515112307285446e-05, + "loss": 0.7753, + "step": 14940 + }, + { + "epoch": 0.32113244833956267, + "grad_norm": 0.5601042025539121, + "learning_rate": 1.5509424916415198e-05, + "loss": 0.7777, + "step": 14950 + }, + { + "epoch": 0.32134725265283326, + "grad_norm": 0.538515402379057, + "learning_rate": 1.5503734965602078e-05, + "loss": 0.7808, + "step": 14960 + }, + { + "epoch": 0.3215620569661039, + "grad_norm": 0.5436748120429375, + 
"learning_rate": 1.5498042457489916e-05, + "loss": 0.782, + "step": 14970 + }, + { + "epoch": 0.3217768612793745, + "grad_norm": 0.5158949131886207, + "learning_rate": 1.5492347394723726e-05, + "loss": 0.7631, + "step": 14980 + }, + { + "epoch": 0.3219916655926451, + "grad_norm": 0.5328439457730709, + "learning_rate": 1.5486649779949695e-05, + "loss": 0.7774, + "step": 14990 + }, + { + "epoch": 0.3222064699059157, + "grad_norm": 0.5433132241965292, + "learning_rate": 1.5480949615815223e-05, + "loss": 0.7834, + "step": 15000 + }, + { + "epoch": 0.3224212742191863, + "grad_norm": 0.5470052170551871, + "learning_rate": 1.547524690496887e-05, + "loss": 0.7727, + "step": 15010 + }, + { + "epoch": 0.3226360785324569, + "grad_norm": 0.5466443586756153, + "learning_rate": 1.5469541650060396e-05, + "loss": 0.7803, + "step": 15020 + }, + { + "epoch": 0.32285088284572755, + "grad_norm": 0.5351421951971927, + "learning_rate": 1.5463833853740723e-05, + "loss": 0.7955, + "step": 15030 + }, + { + "epoch": 0.32306568715899814, + "grad_norm": 0.5236071642893849, + "learning_rate": 1.5458123518661984e-05, + "loss": 0.774, + "step": 15040 + }, + { + "epoch": 0.3232804914722688, + "grad_norm": 0.5201113478881045, + "learning_rate": 1.5452410647477462e-05, + "loss": 0.7765, + "step": 15050 + }, + { + "epoch": 0.32349529578553937, + "grad_norm": 0.551550727831456, + "learning_rate": 1.544669524284163e-05, + "loss": 0.7749, + "step": 15060 + }, + { + "epoch": 0.32371010009880996, + "grad_norm": 0.5396886090448376, + "learning_rate": 1.5440977307410148e-05, + "loss": 0.7748, + "step": 15070 + }, + { + "epoch": 0.3239249044120806, + "grad_norm": 0.5490265215897929, + "learning_rate": 1.5435256843839837e-05, + "loss": 0.7782, + "step": 15080 + }, + { + "epoch": 0.3241397087253512, + "grad_norm": 0.5680139354556848, + "learning_rate": 1.5429533854788698e-05, + "loss": 0.769, + "step": 15090 + }, + { + "epoch": 0.32435451303862184, + "grad_norm": 0.5724567681997974, + "learning_rate": 1.542380834291591e-05, + "loss": 0.7743, + "step": 15100 + }, + { + "epoch": 0.32456931735189243, + "grad_norm": 0.5551606358878742, + "learning_rate": 1.5418080310881816e-05, + "loss": 0.7874, + "step": 15110 + }, + { + "epoch": 0.324784121665163, + "grad_norm": 0.5273126015719424, + "learning_rate": 1.5412349761347937e-05, + "loss": 0.7681, + "step": 15120 + }, + { + "epoch": 0.32499892597843366, + "grad_norm": 0.5347341788619094, + "learning_rate": 1.5406616696976957e-05, + "loss": 0.7616, + "step": 15130 + }, + { + "epoch": 0.32521373029170425, + "grad_norm": 0.5239352669850086, + "learning_rate": 1.540088112043274e-05, + "loss": 0.7626, + "step": 15140 + }, + { + "epoch": 0.3254285346049749, + "grad_norm": 0.5303037869360442, + "learning_rate": 1.53951430343803e-05, + "loss": 0.7695, + "step": 15150 + }, + { + "epoch": 0.3256433389182455, + "grad_norm": 0.5396447269485543, + "learning_rate": 1.5389402441485835e-05, + "loss": 0.7534, + "step": 15160 + }, + { + "epoch": 0.3258581432315161, + "grad_norm": 0.5288994376628046, + "learning_rate": 1.5383659344416696e-05, + "loss": 0.7661, + "step": 15170 + }, + { + "epoch": 0.3260729475447867, + "grad_norm": 0.5366154432296876, + "learning_rate": 1.5377913745841404e-05, + "loss": 0.7796, + "step": 15180 + }, + { + "epoch": 0.3262877518580573, + "grad_norm": 0.5221692922283262, + "learning_rate": 1.5372165648429633e-05, + "loss": 0.7726, + "step": 15190 + }, + { + "epoch": 0.3265025561713279, + "grad_norm": 0.5422324565519508, + "learning_rate": 1.536641505485223e-05, + "loss": 0.7633, + 
"step": 15200 + }, + { + "epoch": 0.32671736048459854, + "grad_norm": 0.5408071342677044, + "learning_rate": 1.5360661967781194e-05, + "loss": 0.7689, + "step": 15210 + }, + { + "epoch": 0.32693216479786913, + "grad_norm": 0.5454745974106339, + "learning_rate": 1.5354906389889686e-05, + "loss": 0.7741, + "step": 15220 + }, + { + "epoch": 0.3271469691111398, + "grad_norm": 0.5218591885366504, + "learning_rate": 1.534914832385203e-05, + "loss": 0.7909, + "step": 15230 + }, + { + "epoch": 0.32736177342441036, + "grad_norm": 0.5519162115511386, + "learning_rate": 1.5343387772343687e-05, + "loss": 0.7809, + "step": 15240 + }, + { + "epoch": 0.32757657773768095, + "grad_norm": 0.5327394461244905, + "learning_rate": 1.533762473804129e-05, + "loss": 0.773, + "step": 15250 + }, + { + "epoch": 0.3277913820509516, + "grad_norm": 0.5584463854444557, + "learning_rate": 1.5331859223622623e-05, + "loss": 0.7735, + "step": 15260 + }, + { + "epoch": 0.3280061863642222, + "grad_norm": 0.5390984045147804, + "learning_rate": 1.532609123176662e-05, + "loss": 0.7757, + "step": 15270 + }, + { + "epoch": 0.32822099067749283, + "grad_norm": 0.543837161370247, + "learning_rate": 1.5320320765153367e-05, + "loss": 0.762, + "step": 15280 + }, + { + "epoch": 0.3284357949907634, + "grad_norm": 0.5352816574606879, + "learning_rate": 1.53145478264641e-05, + "loss": 0.7693, + "step": 15290 + }, + { + "epoch": 0.328650599304034, + "grad_norm": 0.5605556044481591, + "learning_rate": 1.5308772418381196e-05, + "loss": 0.7623, + "step": 15300 + }, + { + "epoch": 0.32886540361730465, + "grad_norm": 0.5446921802277784, + "learning_rate": 1.5302994543588194e-05, + "loss": 0.774, + "step": 15310 + }, + { + "epoch": 0.32908020793057524, + "grad_norm": 0.5151829148535916, + "learning_rate": 1.529721420476977e-05, + "loss": 0.7826, + "step": 15320 + }, + { + "epoch": 0.32929501224384583, + "grad_norm": 0.5223909324949548, + "learning_rate": 1.5291431404611744e-05, + "loss": 0.7593, + "step": 15330 + }, + { + "epoch": 0.3295098165571165, + "grad_norm": 0.5267824527110152, + "learning_rate": 1.5285646145801086e-05, + "loss": 0.7665, + "step": 15340 + }, + { + "epoch": 0.32972462087038706, + "grad_norm": 0.5466419782290777, + "learning_rate": 1.5279858431025903e-05, + "loss": 0.7704, + "step": 15350 + }, + { + "epoch": 0.3299394251836577, + "grad_norm": 0.5557083967709183, + "learning_rate": 1.527406826297544e-05, + "loss": 0.7661, + "step": 15360 + }, + { + "epoch": 0.3301542294969283, + "grad_norm": 0.5337371028909507, + "learning_rate": 1.526827564434009e-05, + "loss": 0.7604, + "step": 15370 + }, + { + "epoch": 0.3303690338101989, + "grad_norm": 0.5703493813264273, + "learning_rate": 1.5262480577811386e-05, + "loss": 0.7672, + "step": 15380 + }, + { + "epoch": 0.33058383812346953, + "grad_norm": 0.5472412709147122, + "learning_rate": 1.5256683066081986e-05, + "loss": 0.7709, + "step": 15390 + }, + { + "epoch": 0.3307986424367401, + "grad_norm": 0.5457023754017395, + "learning_rate": 1.5250883111845697e-05, + "loss": 0.787, + "step": 15400 + }, + { + "epoch": 0.33101344675001076, + "grad_norm": 0.5416955760823877, + "learning_rate": 1.5245080717797454e-05, + "loss": 0.759, + "step": 15410 + }, + { + "epoch": 0.33122825106328135, + "grad_norm": 0.5138493444141494, + "learning_rate": 1.5239275886633326e-05, + "loss": 0.7735, + "step": 15420 + }, + { + "epoch": 0.33144305537655194, + "grad_norm": 0.5211626703386022, + "learning_rate": 1.5233468621050518e-05, + "loss": 0.7784, + "step": 15430 + }, + { + "epoch": 0.3316578596898226, + 
"grad_norm": 0.5275918206293356, + "learning_rate": 1.5227658923747364e-05, + "loss": 0.7833, + "step": 15440 + }, + { + "epoch": 0.3318726640030932, + "grad_norm": 0.6206866088264538, + "learning_rate": 1.5221846797423325e-05, + "loss": 0.7708, + "step": 15450 + }, + { + "epoch": 0.33208746831636377, + "grad_norm": 0.5261380809398264, + "learning_rate": 1.5216032244778993e-05, + "loss": 0.7782, + "step": 15460 + }, + { + "epoch": 0.3323022726296344, + "grad_norm": 0.541782059211599, + "learning_rate": 1.5210215268516093e-05, + "loss": 0.7608, + "step": 15470 + }, + { + "epoch": 0.332517076942905, + "grad_norm": 0.5240473732673129, + "learning_rate": 1.520439587133747e-05, + "loss": 0.7726, + "step": 15480 + }, + { + "epoch": 0.33273188125617564, + "grad_norm": 0.5191076944218657, + "learning_rate": 1.5198574055947091e-05, + "loss": 0.7642, + "step": 15490 + }, + { + "epoch": 0.33294668556944623, + "grad_norm": 0.539873653252131, + "learning_rate": 1.5192749825050052e-05, + "loss": 0.7636, + "step": 15500 + }, + { + "epoch": 0.3331614898827168, + "grad_norm": 0.531660226924968, + "learning_rate": 1.518692318135257e-05, + "loss": 0.7796, + "step": 15510 + }, + { + "epoch": 0.33337629419598747, + "grad_norm": 0.5173762361506008, + "learning_rate": 1.5181094127561982e-05, + "loss": 0.7615, + "step": 15520 + }, + { + "epoch": 0.33359109850925805, + "grad_norm": 0.5114268089145959, + "learning_rate": 1.5175262666386745e-05, + "loss": 0.7559, + "step": 15530 + }, + { + "epoch": 0.3338059028225287, + "grad_norm": 0.5271733156368176, + "learning_rate": 1.5169428800536441e-05, + "loss": 0.7711, + "step": 15540 + }, + { + "epoch": 0.3340207071357993, + "grad_norm": 0.5135401558436904, + "learning_rate": 1.5163592532721763e-05, + "loss": 0.7776, + "step": 15550 + }, + { + "epoch": 0.3342355114490699, + "grad_norm": 0.53726076093529, + "learning_rate": 1.515775386565451e-05, + "loss": 0.7594, + "step": 15560 + }, + { + "epoch": 0.3344503157623405, + "grad_norm": 0.5184174928971732, + "learning_rate": 1.515191280204762e-05, + "loss": 0.7774, + "step": 15570 + }, + { + "epoch": 0.3346651200756111, + "grad_norm": 0.5225627226289906, + "learning_rate": 1.514606934461512e-05, + "loss": 0.7653, + "step": 15580 + }, + { + "epoch": 0.33487992438888176, + "grad_norm": 0.521185908731502, + "learning_rate": 1.5140223496072168e-05, + "loss": 0.7702, + "step": 15590 + }, + { + "epoch": 0.33509472870215234, + "grad_norm": 0.5605456313685063, + "learning_rate": 1.5134375259135024e-05, + "loss": 0.7725, + "step": 15600 + }, + { + "epoch": 0.33530953301542293, + "grad_norm": 0.5504337164442501, + "learning_rate": 1.5128524636521058e-05, + "loss": 0.7502, + "step": 15610 + }, + { + "epoch": 0.3355243373286936, + "grad_norm": 0.5304219453129593, + "learning_rate": 1.5122671630948748e-05, + "loss": 0.7731, + "step": 15620 + }, + { + "epoch": 0.33573914164196417, + "grad_norm": 0.5405431462257487, + "learning_rate": 1.5116816245137684e-05, + "loss": 0.7787, + "step": 15630 + }, + { + "epoch": 0.33595394595523476, + "grad_norm": 0.540626596116053, + "learning_rate": 1.5110958481808558e-05, + "loss": 0.7707, + "step": 15640 + }, + { + "epoch": 0.3361687502685054, + "grad_norm": 0.5107167602298724, + "learning_rate": 1.5105098343683166e-05, + "loss": 0.7853, + "step": 15650 + }, + { + "epoch": 0.336383554581776, + "grad_norm": 0.5466449645523599, + "learning_rate": 1.5099235833484411e-05, + "loss": 0.7756, + "step": 15660 + }, + { + "epoch": 0.33659835889504663, + "grad_norm": 0.5123157159149067, + "learning_rate": 
1.5093370953936298e-05, + "loss": 0.7635, + "step": 15670 + }, + { + "epoch": 0.3368131632083172, + "grad_norm": 0.5561787006388946, + "learning_rate": 1.5087503707763925e-05, + "loss": 0.7777, + "step": 15680 + }, + { + "epoch": 0.3370279675215878, + "grad_norm": 0.5423194514303097, + "learning_rate": 1.5081634097693498e-05, + "loss": 0.7709, + "step": 15690 + }, + { + "epoch": 0.33724277183485846, + "grad_norm": 0.5131134543368485, + "learning_rate": 1.5075762126452324e-05, + "loss": 0.7536, + "step": 15700 + }, + { + "epoch": 0.33745757614812905, + "grad_norm": 0.5263701017681474, + "learning_rate": 1.5069887796768798e-05, + "loss": 0.7646, + "step": 15710 + }, + { + "epoch": 0.3376723804613997, + "grad_norm": 0.5335909937404967, + "learning_rate": 1.5064011111372417e-05, + "loss": 0.774, + "step": 15720 + }, + { + "epoch": 0.3378871847746703, + "grad_norm": 0.5205555068375117, + "learning_rate": 1.5058132072993767e-05, + "loss": 0.7729, + "step": 15730 + }, + { + "epoch": 0.33810198908794087, + "grad_norm": 0.529162741413939, + "learning_rate": 1.5052250684364535e-05, + "loss": 0.7823, + "step": 15740 + }, + { + "epoch": 0.3383167934012115, + "grad_norm": 0.5335725684320926, + "learning_rate": 1.5046366948217495e-05, + "loss": 0.7881, + "step": 15750 + }, + { + "epoch": 0.3385315977144821, + "grad_norm": 0.5422608148576844, + "learning_rate": 1.5040480867286511e-05, + "loss": 0.7723, + "step": 15760 + }, + { + "epoch": 0.3387464020277527, + "grad_norm": 0.525108601464528, + "learning_rate": 1.5034592444306541e-05, + "loss": 0.7661, + "step": 15770 + }, + { + "epoch": 0.33896120634102334, + "grad_norm": 0.5212632664656071, + "learning_rate": 1.5028701682013626e-05, + "loss": 0.7722, + "step": 15780 + }, + { + "epoch": 0.3391760106542939, + "grad_norm": 0.523147333913358, + "learning_rate": 1.50228085831449e-05, + "loss": 0.766, + "step": 15790 + }, + { + "epoch": 0.33939081496756457, + "grad_norm": 0.5155900387298158, + "learning_rate": 1.5016913150438575e-05, + "loss": 0.7544, + "step": 15800 + }, + { + "epoch": 0.33960561928083516, + "grad_norm": 2.57094670571956, + "learning_rate": 1.5011015386633954e-05, + "loss": 0.7738, + "step": 15810 + }, + { + "epoch": 0.33982042359410575, + "grad_norm": 0.5163335640756341, + "learning_rate": 1.5005115294471422e-05, + "loss": 0.755, + "step": 15820 + }, + { + "epoch": 0.3400352279073764, + "grad_norm": 0.5597200416389281, + "learning_rate": 1.499921287669244e-05, + "loss": 0.7905, + "step": 15830 + }, + { + "epoch": 0.340250032220647, + "grad_norm": 0.5137889894949109, + "learning_rate": 1.4993308136039557e-05, + "loss": 0.7632, + "step": 15840 + }, + { + "epoch": 0.3404648365339176, + "grad_norm": 0.5245555303836951, + "learning_rate": 1.4987401075256398e-05, + "loss": 0.7573, + "step": 15850 + }, + { + "epoch": 0.3406796408471882, + "grad_norm": 0.5466373460073314, + "learning_rate": 1.498149169708767e-05, + "loss": 0.7738, + "step": 15860 + }, + { + "epoch": 0.3408944451604588, + "grad_norm": 0.5229866422016913, + "learning_rate": 1.4975580004279149e-05, + "loss": 0.7583, + "step": 15870 + }, + { + "epoch": 0.34110924947372945, + "grad_norm": 0.5114015357336622, + "learning_rate": 1.4969665999577693e-05, + "loss": 0.7522, + "step": 15880 + }, + { + "epoch": 0.34132405378700004, + "grad_norm": 0.5125004960033485, + "learning_rate": 1.4963749685731231e-05, + "loss": 0.7739, + "step": 15890 + }, + { + "epoch": 0.3415388581002706, + "grad_norm": 0.5365066975799684, + "learning_rate": 1.4957831065488763e-05, + "loss": 0.7767, + "step": 15900 + }, 
+ { + "epoch": 0.34175366241354127, + "grad_norm": 0.5224206864785478, + "learning_rate": 1.495191014160037e-05, + "loss": 0.7736, + "step": 15910 + }, + { + "epoch": 0.34196846672681186, + "grad_norm": 0.5138735687941413, + "learning_rate": 1.4945986916817194e-05, + "loss": 0.7558, + "step": 15920 + }, + { + "epoch": 0.3421832710400825, + "grad_norm": 0.5156382777636157, + "learning_rate": 1.494006139389145e-05, + "loss": 0.755, + "step": 15930 + }, + { + "epoch": 0.3423980753533531, + "grad_norm": 0.5173997349464476, + "learning_rate": 1.4934133575576418e-05, + "loss": 0.7644, + "step": 15940 + }, + { + "epoch": 0.3426128796666237, + "grad_norm": 0.5773347686325697, + "learning_rate": 1.4928203464626446e-05, + "loss": 0.7803, + "step": 15950 + }, + { + "epoch": 0.3428276839798943, + "grad_norm": 0.5238727000577228, + "learning_rate": 1.4922271063796946e-05, + "loss": 0.7688, + "step": 15960 + }, + { + "epoch": 0.3430424882931649, + "grad_norm": 0.49544133793048006, + "learning_rate": 1.4916336375844402e-05, + "loss": 0.765, + "step": 15970 + }, + { + "epoch": 0.34325729260643556, + "grad_norm": 0.5559128491079445, + "learning_rate": 1.4910399403526355e-05, + "loss": 0.7643, + "step": 15980 + }, + { + "epoch": 0.34347209691970615, + "grad_norm": 0.5224474824510437, + "learning_rate": 1.4904460149601399e-05, + "loss": 0.7605, + "step": 15990 + }, + { + "epoch": 0.34368690123297674, + "grad_norm": 0.5084160847363332, + "learning_rate": 1.48985186168292e-05, + "loss": 0.7525, + "step": 16000 + }, + { + "epoch": 0.3439017055462474, + "grad_norm": 0.5178973898547171, + "learning_rate": 1.489257480797048e-05, + "loss": 0.7623, + "step": 16010 + }, + { + "epoch": 0.34411650985951797, + "grad_norm": 0.5480129624627523, + "learning_rate": 1.4886628725787017e-05, + "loss": 0.7831, + "step": 16020 + }, + { + "epoch": 0.3443313141727886, + "grad_norm": 0.524153874187991, + "learning_rate": 1.4880680373041646e-05, + "loss": 0.7659, + "step": 16030 + }, + { + "epoch": 0.3445461184860592, + "grad_norm": 0.5488165779868663, + "learning_rate": 1.4874729752498256e-05, + "loss": 0.7508, + "step": 16040 + }, + { + "epoch": 0.3447609227993298, + "grad_norm": 0.6146964896934437, + "learning_rate": 1.4868776866921792e-05, + "loss": 0.7708, + "step": 16050 + }, + { + "epoch": 0.34497572711260044, + "grad_norm": 0.5367282750533525, + "learning_rate": 1.4862821719078246e-05, + "loss": 0.7647, + "step": 16060 + }, + { + "epoch": 0.34519053142587103, + "grad_norm": 0.5370082911271288, + "learning_rate": 1.4856864311734667e-05, + "loss": 0.7716, + "step": 16070 + }, + { + "epoch": 0.3454053357391416, + "grad_norm": 0.5210318070953477, + "learning_rate": 1.4850904647659155e-05, + "loss": 0.7562, + "step": 16080 + }, + { + "epoch": 0.34562014005241226, + "grad_norm": 0.5426172156914827, + "learning_rate": 1.4844942729620853e-05, + "loss": 0.7517, + "step": 16090 + }, + { + "epoch": 0.34583494436568285, + "grad_norm": 0.5353756178515557, + "learning_rate": 1.4838978560389952e-05, + "loss": 0.777, + "step": 16100 + }, + { + "epoch": 0.3460497486789535, + "grad_norm": 0.5458690566171005, + "learning_rate": 1.4833012142737696e-05, + "loss": 0.7596, + "step": 16110 + }, + { + "epoch": 0.3462645529922241, + "grad_norm": 0.5222605704575792, + "learning_rate": 1.4827043479436362e-05, + "loss": 0.7641, + "step": 16120 + }, + { + "epoch": 0.3464793573054947, + "grad_norm": 0.517515304030216, + "learning_rate": 1.4821072573259283e-05, + "loss": 0.7735, + "step": 16130 + }, + { + "epoch": 0.3466941616187653, + "grad_norm": 
0.5619458732612833, + "learning_rate": 1.4815099426980825e-05, + "loss": 0.7646, + "step": 16140 + }, + { + "epoch": 0.3469089659320359, + "grad_norm": 0.5308436572477332, + "learning_rate": 1.4809124043376399e-05, + "loss": 0.7767, + "step": 16150 + }, + { + "epoch": 0.34712377024530655, + "grad_norm": 0.5135551609727562, + "learning_rate": 1.4803146425222457e-05, + "loss": 0.7611, + "step": 16160 + }, + { + "epoch": 0.34733857455857714, + "grad_norm": 0.5330663686858607, + "learning_rate": 1.479716657529648e-05, + "loss": 0.7702, + "step": 16170 + }, + { + "epoch": 0.34755337887184773, + "grad_norm": 0.5359601127061994, + "learning_rate": 1.4791184496377e-05, + "loss": 0.776, + "step": 16180 + }, + { + "epoch": 0.3477681831851184, + "grad_norm": 0.5140711185809557, + "learning_rate": 1.4785200191243574e-05, + "loss": 0.7595, + "step": 16190 + }, + { + "epoch": 0.34798298749838896, + "grad_norm": 0.5406658844636193, + "learning_rate": 1.4779213662676797e-05, + "loss": 0.7729, + "step": 16200 + }, + { + "epoch": 0.34819779181165955, + "grad_norm": 0.5104834415117768, + "learning_rate": 1.4773224913458298e-05, + "loss": 0.7811, + "step": 16210 + }, + { + "epoch": 0.3484125961249302, + "grad_norm": 0.5595639311867889, + "learning_rate": 1.4767233946370735e-05, + "loss": 0.7691, + "step": 16220 + }, + { + "epoch": 0.3486274004382008, + "grad_norm": 0.5453224484229146, + "learning_rate": 1.4761240764197804e-05, + "loss": 0.7711, + "step": 16230 + }, + { + "epoch": 0.34884220475147143, + "grad_norm": 0.5533371724805077, + "learning_rate": 1.4755245369724219e-05, + "loss": 0.7647, + "step": 16240 + }, + { + "epoch": 0.349057009064742, + "grad_norm": 0.5029137026412714, + "learning_rate": 1.4749247765735727e-05, + "loss": 0.7625, + "step": 16250 + }, + { + "epoch": 0.3492718133780126, + "grad_norm": 0.5407424760271811, + "learning_rate": 1.474324795501911e-05, + "loss": 0.7777, + "step": 16260 + }, + { + "epoch": 0.34948661769128325, + "grad_norm": 0.5360642655145464, + "learning_rate": 1.4737245940362158e-05, + "loss": 0.7644, + "step": 16270 + }, + { + "epoch": 0.34970142200455384, + "grad_norm": 0.5431989168889423, + "learning_rate": 1.47312417245537e-05, + "loss": 0.7725, + "step": 16280 + }, + { + "epoch": 0.3499162263178245, + "grad_norm": 0.5212803701855959, + "learning_rate": 1.472523531038358e-05, + "loss": 0.7503, + "step": 16290 + }, + { + "epoch": 0.3501310306310951, + "grad_norm": 0.5375323551611733, + "learning_rate": 1.471922670064267e-05, + "loss": 0.7599, + "step": 16300 + }, + { + "epoch": 0.35034583494436566, + "grad_norm": 0.5239870812277655, + "learning_rate": 1.4713215898122857e-05, + "loss": 0.7863, + "step": 16310 + }, + { + "epoch": 0.3505606392576363, + "grad_norm": 0.5180937115766086, + "learning_rate": 1.4707202905617042e-05, + "loss": 0.7689, + "step": 16320 + }, + { + "epoch": 0.3507754435709069, + "grad_norm": 0.5417292224014176, + "learning_rate": 1.4701187725919157e-05, + "loss": 0.7527, + "step": 16330 + }, + { + "epoch": 0.35099024788417754, + "grad_norm": 0.5357164040418824, + "learning_rate": 1.4695170361824136e-05, + "loss": 0.7642, + "step": 16340 + }, + { + "epoch": 0.35120505219744813, + "grad_norm": 0.5303332337613179, + "learning_rate": 1.468915081612794e-05, + "loss": 0.7566, + "step": 16350 + }, + { + "epoch": 0.3514198565107187, + "grad_norm": 0.5382188260863363, + "learning_rate": 1.468312909162754e-05, + "loss": 0.7672, + "step": 16360 + }, + { + "epoch": 0.35163466082398936, + "grad_norm": 0.5385128353648729, + "learning_rate": 
1.467710519112091e-05, + "loss": 0.7809, + "step": 16370 + }, + { + "epoch": 0.35184946513725995, + "grad_norm": 0.5310058917606305, + "learning_rate": 1.467107911740705e-05, + "loss": 0.7654, + "step": 16380 + }, + { + "epoch": 0.35206426945053054, + "grad_norm": 0.5031221816084535, + "learning_rate": 1.4665050873285957e-05, + "loss": 0.7507, + "step": 16390 + }, + { + "epoch": 0.3522790737638012, + "grad_norm": 0.54140785390428, + "learning_rate": 1.4659020461558649e-05, + "loss": 0.7545, + "step": 16400 + }, + { + "epoch": 0.3524938780770718, + "grad_norm": 0.5370610309213006, + "learning_rate": 1.4652987885027142e-05, + "loss": 0.7658, + "step": 16410 + }, + { + "epoch": 0.3527086823903424, + "grad_norm": 0.5490842521616487, + "learning_rate": 1.4646953146494454e-05, + "loss": 0.7606, + "step": 16420 + }, + { + "epoch": 0.352923486703613, + "grad_norm": 0.5131178943599031, + "learning_rate": 1.4640916248764621e-05, + "loss": 0.7738, + "step": 16430 + }, + { + "epoch": 0.3531382910168836, + "grad_norm": 0.5328970310455506, + "learning_rate": 1.4634877194642672e-05, + "loss": 0.7774, + "step": 16440 + }, + { + "epoch": 0.35335309533015424, + "grad_norm": 0.5137803015241591, + "learning_rate": 1.462883598693464e-05, + "loss": 0.7728, + "step": 16450 + }, + { + "epoch": 0.35356789964342483, + "grad_norm": 0.5199243192373794, + "learning_rate": 1.4622792628447562e-05, + "loss": 0.77, + "step": 16460 + }, + { + "epoch": 0.3537827039566955, + "grad_norm": 0.5425545285150617, + "learning_rate": 1.4616747121989474e-05, + "loss": 0.777, + "step": 16470 + }, + { + "epoch": 0.35399750826996607, + "grad_norm": 0.5395199010657613, + "learning_rate": 1.4610699470369401e-05, + "loss": 0.7688, + "step": 16480 + }, + { + "epoch": 0.35421231258323665, + "grad_norm": 0.5475850285247497, + "learning_rate": 1.4604649676397377e-05, + "loss": 0.769, + "step": 16490 + }, + { + "epoch": 0.3544271168965073, + "grad_norm": 0.5170429327520523, + "learning_rate": 1.4598597742884429e-05, + "loss": 0.7693, + "step": 16500 + }, + { + "epoch": 0.3546419212097779, + "grad_norm": 0.5402757575811128, + "learning_rate": 1.4592543672642567e-05, + "loss": 0.7663, + "step": 16510 + }, + { + "epoch": 0.3548567255230485, + "grad_norm": 0.536139194934942, + "learning_rate": 1.4586487468484809e-05, + "loss": 0.777, + "step": 16520 + }, + { + "epoch": 0.3550715298363191, + "grad_norm": 0.5276298531695289, + "learning_rate": 1.4580429133225153e-05, + "loss": 0.7669, + "step": 16530 + }, + { + "epoch": 0.3552863341495897, + "grad_norm": 0.5245685397779214, + "learning_rate": 1.4574368669678598e-05, + "loss": 0.7905, + "step": 16540 + }, + { + "epoch": 0.35550113846286036, + "grad_norm": 0.5281075002913366, + "learning_rate": 1.4568306080661118e-05, + "loss": 0.778, + "step": 16550 + }, + { + "epoch": 0.35571594277613094, + "grad_norm": 0.5113618186605529, + "learning_rate": 1.4562241368989691e-05, + "loss": 0.7669, + "step": 16560 + }, + { + "epoch": 0.35593074708940153, + "grad_norm": 0.506708948383745, + "learning_rate": 1.4556174537482267e-05, + "loss": 0.7476, + "step": 16570 + }, + { + "epoch": 0.3561455514026722, + "grad_norm": 0.5508043321880731, + "learning_rate": 1.4550105588957789e-05, + "loss": 0.7609, + "step": 16580 + }, + { + "epoch": 0.35636035571594277, + "grad_norm": 1.4314950449809294, + "learning_rate": 1.4544034526236174e-05, + "loss": 0.7623, + "step": 16590 + }, + { + "epoch": 0.3565751600292134, + "grad_norm": 0.5244372331321259, + "learning_rate": 1.453796135213834e-05, + "loss": 0.7669, + "step": 16600 + }, 
+ { + "epoch": 0.356789964342484, + "grad_norm": 0.5224396045025858, + "learning_rate": 1.4531886069486169e-05, + "loss": 0.7605, + "step": 16610 + }, + { + "epoch": 0.3570047686557546, + "grad_norm": 0.5070138150623654, + "learning_rate": 1.4525808681102533e-05, + "loss": 0.7668, + "step": 16620 + }, + { + "epoch": 0.35721957296902523, + "grad_norm": 0.5288742137487699, + "learning_rate": 1.4519729189811271e-05, + "loss": 0.7554, + "step": 16630 + }, + { + "epoch": 0.3574343772822958, + "grad_norm": 0.5677338154821888, + "learning_rate": 1.4513647598437208e-05, + "loss": 0.7609, + "step": 16640 + }, + { + "epoch": 0.3576491815955664, + "grad_norm": 0.5357651788636495, + "learning_rate": 1.4507563909806146e-05, + "loss": 0.7742, + "step": 16650 + }, + { + "epoch": 0.35786398590883706, + "grad_norm": 0.5179083963651603, + "learning_rate": 1.4501478126744855e-05, + "loss": 0.7726, + "step": 16660 + }, + { + "epoch": 0.35807879022210765, + "grad_norm": 0.5295056672371503, + "learning_rate": 1.4495390252081082e-05, + "loss": 0.7458, + "step": 16670 + }, + { + "epoch": 0.3582935945353783, + "grad_norm": 0.5214891906282768, + "learning_rate": 1.448930028864355e-05, + "loss": 0.7619, + "step": 16680 + }, + { + "epoch": 0.3585083988486489, + "grad_norm": 0.5578244478041603, + "learning_rate": 1.4483208239261943e-05, + "loss": 0.7703, + "step": 16690 + }, + { + "epoch": 0.35872320316191947, + "grad_norm": 0.5376124885496097, + "learning_rate": 1.4477114106766921e-05, + "loss": 0.7546, + "step": 16700 + }, + { + "epoch": 0.3589380074751901, + "grad_norm": 0.5177377132348334, + "learning_rate": 1.4471017893990107e-05, + "loss": 0.7662, + "step": 16710 + }, + { + "epoch": 0.3591528117884607, + "grad_norm": 0.5134062584490465, + "learning_rate": 1.4464919603764097e-05, + "loss": 0.7667, + "step": 16720 + }, + { + "epoch": 0.35936761610173135, + "grad_norm": 0.5054504951310723, + "learning_rate": 1.4458819238922446e-05, + "loss": 0.7467, + "step": 16730 + }, + { + "epoch": 0.35958242041500194, + "grad_norm": 0.5344939536393053, + "learning_rate": 1.445271680229968e-05, + "loss": 0.7705, + "step": 16740 + }, + { + "epoch": 0.3597972247282725, + "grad_norm": 0.5470998293542991, + "learning_rate": 1.4446612296731282e-05, + "loss": 0.7564, + "step": 16750 + }, + { + "epoch": 0.36001202904154317, + "grad_norm": 0.5743146214103152, + "learning_rate": 1.4440505725053693e-05, + "loss": 0.7623, + "step": 16760 + }, + { + "epoch": 0.36022683335481376, + "grad_norm": 0.5333319615482879, + "learning_rate": 1.4434397090104324e-05, + "loss": 0.7714, + "step": 16770 + }, + { + "epoch": 0.3604416376680844, + "grad_norm": 0.5202228507285751, + "learning_rate": 1.442828639472154e-05, + "loss": 0.7515, + "step": 16780 + }, + { + "epoch": 0.360656441981355, + "grad_norm": 0.5277143639598636, + "learning_rate": 1.4422173641744663e-05, + "loss": 0.7633, + "step": 16790 + }, + { + "epoch": 0.3608712462946256, + "grad_norm": 0.5468528449571223, + "learning_rate": 1.4416058834013967e-05, + "loss": 0.7629, + "step": 16800 + }, + { + "epoch": 0.3610860506078962, + "grad_norm": 0.5389600692166813, + "learning_rate": 1.440994197437069e-05, + "loss": 0.7681, + "step": 16810 + }, + { + "epoch": 0.3613008549211668, + "grad_norm": 0.5335455178234667, + "learning_rate": 1.4403823065657012e-05, + "loss": 0.7607, + "step": 16820 + }, + { + "epoch": 0.3615156592344374, + "grad_norm": 0.5400771216289987, + "learning_rate": 1.4397702110716076e-05, + "loss": 0.7659, + "step": 16830 + }, + { + "epoch": 0.36173046354770805, + "grad_norm": 
0.5602481821229619, + "learning_rate": 1.4391579112391969e-05, + "loss": 0.7765, + "step": 16840 + }, + { + "epoch": 0.36194526786097864, + "grad_norm": 0.5272902803012298, + "learning_rate": 1.4385454073529731e-05, + "loss": 0.7649, + "step": 16850 + }, + { + "epoch": 0.3621600721742493, + "grad_norm": 0.5187058176818714, + "learning_rate": 1.4379326996975347e-05, + "loss": 0.7648, + "step": 16860 + }, + { + "epoch": 0.36237487648751987, + "grad_norm": 0.5331452093710046, + "learning_rate": 1.4373197885575752e-05, + "loss": 0.7713, + "step": 16870 + }, + { + "epoch": 0.36258968080079046, + "grad_norm": 0.5302443268053321, + "learning_rate": 1.4367066742178824e-05, + "loss": 0.7568, + "step": 16880 + }, + { + "epoch": 0.3628044851140611, + "grad_norm": 0.5114903800320675, + "learning_rate": 1.4360933569633386e-05, + "loss": 0.7532, + "step": 16890 + }, + { + "epoch": 0.3630192894273317, + "grad_norm": 0.5240676752585789, + "learning_rate": 1.4354798370789204e-05, + "loss": 0.7726, + "step": 16900 + }, + { + "epoch": 0.36323409374060234, + "grad_norm": 0.5477577408032249, + "learning_rate": 1.4348661148496985e-05, + "loss": 0.773, + "step": 16910 + }, + { + "epoch": 0.3634488980538729, + "grad_norm": 0.5276323223923417, + "learning_rate": 1.4342521905608377e-05, + "loss": 0.7604, + "step": 16920 + }, + { + "epoch": 0.3636637023671435, + "grad_norm": 0.5170851911406245, + "learning_rate": 1.4336380644975964e-05, + "loss": 0.7491, + "step": 16930 + }, + { + "epoch": 0.36387850668041416, + "grad_norm": 0.5356549892783816, + "learning_rate": 1.433023736945328e-05, + "loss": 0.7657, + "step": 16940 + }, + { + "epoch": 0.36409331099368475, + "grad_norm": 0.5428984566871172, + "learning_rate": 1.4324092081894771e-05, + "loss": 0.7661, + "step": 16950 + }, + { + "epoch": 0.36430811530695534, + "grad_norm": 0.5324713712005715, + "learning_rate": 1.4317944785155841e-05, + "loss": 0.7628, + "step": 16960 + }, + { + "epoch": 0.364522919620226, + "grad_norm": 0.5162388083041777, + "learning_rate": 1.4311795482092819e-05, + "loss": 0.7667, + "step": 16970 + }, + { + "epoch": 0.36473772393349657, + "grad_norm": 0.5143878924802234, + "learning_rate": 1.430564417556296e-05, + "loss": 0.7567, + "step": 16980 + }, + { + "epoch": 0.3649525282467672, + "grad_norm": 0.5152886688089272, + "learning_rate": 1.429949086842446e-05, + "loss": 0.7579, + "step": 16990 + }, + { + "epoch": 0.3651673325600378, + "grad_norm": 0.5232658905035079, + "learning_rate": 1.4293335563536444e-05, + "loss": 0.773, + "step": 17000 + }, + { + "epoch": 0.3653821368733084, + "grad_norm": 0.5351086541332937, + "learning_rate": 1.4287178263758954e-05, + "loss": 0.7743, + "step": 17010 + }, + { + "epoch": 0.36559694118657904, + "grad_norm": 0.534471003893465, + "learning_rate": 1.4281018971952968e-05, + "loss": 0.7766, + "step": 17020 + }, + { + "epoch": 0.3658117454998496, + "grad_norm": 0.5285689768103836, + "learning_rate": 1.4274857690980393e-05, + "loss": 0.7609, + "step": 17030 + }, + { + "epoch": 0.36602654981312027, + "grad_norm": 0.5247525596746352, + "learning_rate": 1.426869442370405e-05, + "loss": 0.7584, + "step": 17040 + }, + { + "epoch": 0.36624135412639086, + "grad_norm": 0.5343814259733137, + "learning_rate": 1.4262529172987694e-05, + "loss": 0.7715, + "step": 17050 + }, + { + "epoch": 0.36645615843966145, + "grad_norm": 1.5564508451969874, + "learning_rate": 1.4256361941695994e-05, + "loss": 0.7641, + "step": 17060 + }, + { + "epoch": 0.3666709627529321, + "grad_norm": 0.5184034090359657, + "learning_rate": 
1.4250192732694539e-05, + "loss": 0.7544, + "step": 17070 + }, + { + "epoch": 0.3668857670662027, + "grad_norm": 0.9518622956465803, + "learning_rate": 1.4244021548849838e-05, + "loss": 0.7681, + "step": 17080 + }, + { + "epoch": 0.3671005713794733, + "grad_norm": 0.5468495559267383, + "learning_rate": 1.4237848393029321e-05, + "loss": 0.761, + "step": 17090 + }, + { + "epoch": 0.3673153756927439, + "grad_norm": 0.5213501404226146, + "learning_rate": 1.4231673268101334e-05, + "loss": 0.7667, + "step": 17100 + }, + { + "epoch": 0.3675301800060145, + "grad_norm": 0.5088259296767228, + "learning_rate": 1.4225496176935135e-05, + "loss": 0.7781, + "step": 17110 + }, + { + "epoch": 0.36774498431928515, + "grad_norm": 0.5236538699356668, + "learning_rate": 1.4219317122400895e-05, + "loss": 0.771, + "step": 17120 + }, + { + "epoch": 0.36795978863255574, + "grad_norm": 0.5211467299731635, + "learning_rate": 1.4213136107369701e-05, + "loss": 0.7793, + "step": 17130 + }, + { + "epoch": 0.36817459294582633, + "grad_norm": 0.5472140162284628, + "learning_rate": 1.4206953134713546e-05, + "loss": 0.7692, + "step": 17140 + }, + { + "epoch": 0.368389397259097, + "grad_norm": 0.5189807730044983, + "learning_rate": 1.4200768207305338e-05, + "loss": 0.7814, + "step": 17150 + }, + { + "epoch": 0.36860420157236756, + "grad_norm": 0.5355282252957151, + "learning_rate": 1.4194581328018887e-05, + "loss": 0.7701, + "step": 17160 + }, + { + "epoch": 0.3688190058856382, + "grad_norm": 0.5178079624135835, + "learning_rate": 1.418839249972892e-05, + "loss": 0.7692, + "step": 17170 + }, + { + "epoch": 0.3690338101989088, + "grad_norm": 0.5294877879343505, + "learning_rate": 1.4182201725311056e-05, + "loss": 0.7493, + "step": 17180 + }, + { + "epoch": 0.3692486145121794, + "grad_norm": 0.506889416207276, + "learning_rate": 1.417600900764183e-05, + "loss": 0.7575, + "step": 17190 + }, + { + "epoch": 0.36946341882545003, + "grad_norm": 0.5285238726260074, + "learning_rate": 1.4169814349598676e-05, + "loss": 0.761, + "step": 17200 + }, + { + "epoch": 0.3696782231387206, + "grad_norm": 0.5263966505400081, + "learning_rate": 1.4163617754059927e-05, + "loss": 0.7574, + "step": 17210 + }, + { + "epoch": 0.36989302745199126, + "grad_norm": 0.5444318728567634, + "learning_rate": 1.4157419223904816e-05, + "loss": 0.7836, + "step": 17220 + }, + { + "epoch": 0.37010783176526185, + "grad_norm": 0.504161618679252, + "learning_rate": 1.4151218762013486e-05, + "loss": 0.7667, + "step": 17230 + }, + { + "epoch": 0.37032263607853244, + "grad_norm": 0.5367134980237321, + "learning_rate": 1.4145016371266958e-05, + "loss": 0.7658, + "step": 17240 + }, + { + "epoch": 0.3705374403918031, + "grad_norm": 0.5437915470695406, + "learning_rate": 1.4138812054547164e-05, + "loss": 0.7642, + "step": 17250 + }, + { + "epoch": 0.3707522447050737, + "grad_norm": 0.5250865582583174, + "learning_rate": 1.413260581473693e-05, + "loss": 0.7629, + "step": 17260 + }, + { + "epoch": 0.37096704901834426, + "grad_norm": 0.5219131992470888, + "learning_rate": 1.412639765471997e-05, + "loss": 0.7492, + "step": 17270 + }, + { + "epoch": 0.3711818533316149, + "grad_norm": 0.5434175810188079, + "learning_rate": 1.4120187577380893e-05, + "loss": 0.7721, + "step": 17280 + }, + { + "epoch": 0.3713966576448855, + "grad_norm": 0.5297492710247524, + "learning_rate": 1.4113975585605197e-05, + "loss": 0.7627, + "step": 17290 + }, + { + "epoch": 0.37161146195815614, + "grad_norm": 0.5172129186389717, + "learning_rate": 1.4107761682279273e-05, + "loss": 0.7824, + "step": 17300 
+ }, + { + "epoch": 0.37182626627142673, + "grad_norm": 0.5530724301179245, + "learning_rate": 1.4101545870290397e-05, + "loss": 0.7623, + "step": 17310 + }, + { + "epoch": 0.3720410705846973, + "grad_norm": 0.5228165330027325, + "learning_rate": 1.4095328152526734e-05, + "loss": 0.7603, + "step": 17320 + }, + { + "epoch": 0.37225587489796796, + "grad_norm": 0.5134940227470199, + "learning_rate": 1.4089108531877336e-05, + "loss": 0.7396, + "step": 17330 + }, + { + "epoch": 0.37247067921123855, + "grad_norm": 0.5259963794438499, + "learning_rate": 1.4082887011232131e-05, + "loss": 0.7694, + "step": 17340 + }, + { + "epoch": 0.3726854835245092, + "grad_norm": 0.5189340620745447, + "learning_rate": 1.4076663593481943e-05, + "loss": 0.7759, + "step": 17350 + }, + { + "epoch": 0.3729002878377798, + "grad_norm": 0.5035877483790274, + "learning_rate": 1.4070438281518462e-05, + "loss": 0.7516, + "step": 17360 + }, + { + "epoch": 0.3731150921510504, + "grad_norm": 0.5274597389686024, + "learning_rate": 1.406421107823427e-05, + "loss": 0.7705, + "step": 17370 + }, + { + "epoch": 0.373329896464321, + "grad_norm": 0.5817576244957883, + "learning_rate": 1.405798198652283e-05, + "loss": 0.7664, + "step": 17380 + }, + { + "epoch": 0.3735447007775916, + "grad_norm": 0.5517499638677766, + "learning_rate": 1.405175100927847e-05, + "loss": 0.775, + "step": 17390 + }, + { + "epoch": 0.3737595050908622, + "grad_norm": 0.5527733297913258, + "learning_rate": 1.40455181493964e-05, + "loss": 0.7621, + "step": 17400 + }, + { + "epoch": 0.37397430940413284, + "grad_norm": 0.5363364768618227, + "learning_rate": 1.4039283409772706e-05, + "loss": 0.755, + "step": 17410 + }, + { + "epoch": 0.37418911371740343, + "grad_norm": 0.5375291344235176, + "learning_rate": 1.4033046793304348e-05, + "loss": 0.7594, + "step": 17420 + }, + { + "epoch": 0.3744039180306741, + "grad_norm": 0.5402648263612099, + "learning_rate": 1.402680830288916e-05, + "loss": 0.7605, + "step": 17430 + }, + { + "epoch": 0.37461872234394467, + "grad_norm": 0.538129594884827, + "learning_rate": 1.4020567941425837e-05, + "loss": 0.7753, + "step": 17440 + }, + { + "epoch": 0.37483352665721525, + "grad_norm": 0.5261936791371908, + "learning_rate": 1.4014325711813958e-05, + "loss": 0.7571, + "step": 17450 + }, + { + "epoch": 0.3750483309704859, + "grad_norm": 0.534378842346243, + "learning_rate": 1.4008081616953954e-05, + "loss": 0.7813, + "step": 17460 + }, + { + "epoch": 0.3752631352837565, + "grad_norm": 0.5320321584083502, + "learning_rate": 1.4001835659747137e-05, + "loss": 0.7602, + "step": 17470 + }, + { + "epoch": 0.37547793959702713, + "grad_norm": 0.5315203643805748, + "learning_rate": 1.3995587843095675e-05, + "loss": 0.7575, + "step": 17480 + }, + { + "epoch": 0.3756927439102977, + "grad_norm": 0.5301936608790958, + "learning_rate": 1.3989338169902604e-05, + "loss": 0.7676, + "step": 17490 + }, + { + "epoch": 0.3759075482235683, + "grad_norm": 0.5577865205877498, + "learning_rate": 1.398308664307182e-05, + "loss": 0.7614, + "step": 17500 + }, + { + "epoch": 0.37612235253683896, + "grad_norm": 0.5179881999767708, + "learning_rate": 1.3976833265508085e-05, + "loss": 0.7601, + "step": 17510 + }, + { + "epoch": 0.37633715685010954, + "grad_norm": 0.5208749334999229, + "learning_rate": 1.3970578040117013e-05, + "loss": 0.7525, + "step": 17520 + }, + { + "epoch": 0.37655196116338013, + "grad_norm": 0.5352630545243395, + "learning_rate": 1.3964320969805085e-05, + "loss": 0.7647, + "step": 17530 + }, + { + "epoch": 0.3767667654766508, + "grad_norm": 
0.5193667340738525, + "learning_rate": 1.3958062057479638e-05, + "loss": 0.7527, + "step": 17540 + }, + { + "epoch": 0.37698156978992137, + "grad_norm": 0.5361696072953572, + "learning_rate": 1.395180130604886e-05, + "loss": 0.7672, + "step": 17550 + }, + { + "epoch": 0.377196374103192, + "grad_norm": 0.5518592474145112, + "learning_rate": 1.3945538718421797e-05, + "loss": 0.769, + "step": 17560 + }, + { + "epoch": 0.3774111784164626, + "grad_norm": 0.5148045343692099, + "learning_rate": 1.3939274297508343e-05, + "loss": 0.7542, + "step": 17570 + }, + { + "epoch": 0.3776259827297332, + "grad_norm": 0.5228184172172592, + "learning_rate": 1.3933008046219256e-05, + "loss": 0.7722, + "step": 17580 + }, + { + "epoch": 0.37784078704300383, + "grad_norm": 0.5606699310701568, + "learning_rate": 1.392673996746613e-05, + "loss": 0.7608, + "step": 17590 + }, + { + "epoch": 0.3780555913562744, + "grad_norm": 0.5155322885195631, + "learning_rate": 1.3920470064161417e-05, + "loss": 0.7648, + "step": 17600 + }, + { + "epoch": 0.37827039566954507, + "grad_norm": 0.5295832405444447, + "learning_rate": 1.3914198339218417e-05, + "loss": 0.7529, + "step": 17610 + }, + { + "epoch": 0.37848519998281566, + "grad_norm": 0.5379568307902901, + "learning_rate": 1.3907924795551269e-05, + "loss": 0.7638, + "step": 17620 + }, + { + "epoch": 0.37870000429608625, + "grad_norm": 0.5202511460977277, + "learning_rate": 1.3901649436074967e-05, + "loss": 0.747, + "step": 17630 + }, + { + "epoch": 0.3789148086093569, + "grad_norm": 0.5082094007910867, + "learning_rate": 1.3895372263705342e-05, + "loss": 0.7553, + "step": 17640 + }, + { + "epoch": 0.3791296129226275, + "grad_norm": 0.5019905535533106, + "learning_rate": 1.3889093281359068e-05, + "loss": 0.7723, + "step": 17650 + }, + { + "epoch": 0.3793444172358981, + "grad_norm": 0.5289787148328216, + "learning_rate": 1.388281249195366e-05, + "loss": 0.7538, + "step": 17660 + }, + { + "epoch": 0.3795592215491687, + "grad_norm": 0.5138445337600223, + "learning_rate": 1.3876529898407479e-05, + "loss": 0.7703, + "step": 17670 + }, + { + "epoch": 0.3797740258624393, + "grad_norm": 0.4950419489829235, + "learning_rate": 1.3870245503639715e-05, + "loss": 0.7576, + "step": 17680 + }, + { + "epoch": 0.37998883017570995, + "grad_norm": 0.5254250563222473, + "learning_rate": 1.3863959310570398e-05, + "loss": 0.7667, + "step": 17690 + }, + { + "epoch": 0.38020363448898054, + "grad_norm": 0.520664787706605, + "learning_rate": 1.38576713221204e-05, + "loss": 0.7635, + "step": 17700 + }, + { + "epoch": 0.3804184388022511, + "grad_norm": 0.5313493460126173, + "learning_rate": 1.3851381541211418e-05, + "loss": 0.7751, + "step": 17710 + }, + { + "epoch": 0.38063324311552177, + "grad_norm": 0.5398410905297825, + "learning_rate": 1.384508997076598e-05, + "loss": 0.7624, + "step": 17720 + }, + { + "epoch": 0.38084804742879236, + "grad_norm": 0.5263114278035196, + "learning_rate": 1.3838796613707462e-05, + "loss": 0.7613, + "step": 17730 + }, + { + "epoch": 0.381062851742063, + "grad_norm": 0.5283488770743783, + "learning_rate": 1.3832501472960051e-05, + "loss": 0.7544, + "step": 17740 + }, + { + "epoch": 0.3812776560553336, + "grad_norm": 0.543513524216455, + "learning_rate": 1.3826204551448777e-05, + "loss": 0.7504, + "step": 17750 + }, + { + "epoch": 0.3814924603686042, + "grad_norm": 0.5214420491236335, + "learning_rate": 1.3819905852099492e-05, + "loss": 0.7651, + "step": 17760 + }, + { + "epoch": 0.3817072646818748, + "grad_norm": 0.5184770230999961, + "learning_rate": 
1.3813605377838866e-05, + "loss": 0.752, + "step": 17770 + }, + { + "epoch": 0.3819220689951454, + "grad_norm": 0.5283519182814274, + "learning_rate": 1.3807303131594407e-05, + "loss": 0.7547, + "step": 17780 + }, + { + "epoch": 0.38213687330841606, + "grad_norm": 0.5385698106036354, + "learning_rate": 1.380099911629444e-05, + "loss": 0.7593, + "step": 17790 + }, + { + "epoch": 0.38235167762168665, + "grad_norm": 0.5402749618995843, + "learning_rate": 1.379469333486811e-05, + "loss": 0.7516, + "step": 17800 + }, + { + "epoch": 0.38256648193495724, + "grad_norm": 0.5247063198184665, + "learning_rate": 1.378838579024539e-05, + "loss": 0.7652, + "step": 17810 + }, + { + "epoch": 0.3827812862482279, + "grad_norm": 0.5378877903900204, + "learning_rate": 1.3782076485357062e-05, + "loss": 0.7635, + "step": 17820 + }, + { + "epoch": 0.38299609056149847, + "grad_norm": 0.5338589078746822, + "learning_rate": 1.377576542313474e-05, + "loss": 0.7582, + "step": 17830 + }, + { + "epoch": 0.38321089487476906, + "grad_norm": 0.5103257254373668, + "learning_rate": 1.3769452606510837e-05, + "loss": 0.76, + "step": 17840 + }, + { + "epoch": 0.3834256991880397, + "grad_norm": 0.5109691457275021, + "learning_rate": 1.3763138038418592e-05, + "loss": 0.7556, + "step": 17850 + }, + { + "epoch": 0.3836405035013103, + "grad_norm": 0.5358067276980497, + "learning_rate": 1.375682172179206e-05, + "loss": 0.7708, + "step": 17860 + }, + { + "epoch": 0.38385530781458094, + "grad_norm": 0.518350530032854, + "learning_rate": 1.37505036595661e-05, + "loss": 0.7784, + "step": 17870 + }, + { + "epoch": 0.3840701121278515, + "grad_norm": 0.5255617991273894, + "learning_rate": 1.374418385467639e-05, + "loss": 0.751, + "step": 17880 + }, + { + "epoch": 0.3842849164411221, + "grad_norm": 0.5491809745075493, + "learning_rate": 1.3737862310059413e-05, + "loss": 0.7485, + "step": 17890 + }, + { + "epoch": 0.38449972075439276, + "grad_norm": 0.5437186078897788, + "learning_rate": 1.373153902865246e-05, + "loss": 0.7716, + "step": 17900 + }, + { + "epoch": 0.38471452506766335, + "grad_norm": 0.553356354641266, + "learning_rate": 1.372521401339363e-05, + "loss": 0.7578, + "step": 17910 + }, + { + "epoch": 0.384929329380934, + "grad_norm": 0.531549500667387, + "learning_rate": 1.3718887267221835e-05, + "loss": 0.759, + "step": 17920 + }, + { + "epoch": 0.3851441336942046, + "grad_norm": 0.5164949440136289, + "learning_rate": 1.3712558793076777e-05, + "loss": 0.7706, + "step": 17930 + }, + { + "epoch": 0.38535893800747517, + "grad_norm": 0.508363864448989, + "learning_rate": 1.3706228593898971e-05, + "loss": 0.761, + "step": 17940 + }, + { + "epoch": 0.3855737423207458, + "grad_norm": 0.5645367042304027, + "learning_rate": 1.369989667262973e-05, + "loss": 0.7529, + "step": 17950 + }, + { + "epoch": 0.3857885466340164, + "grad_norm": 0.5276474618348226, + "learning_rate": 1.3693563032211173e-05, + "loss": 0.7619, + "step": 17960 + }, + { + "epoch": 0.386003350947287, + "grad_norm": 0.5219439134743928, + "learning_rate": 1.3687227675586205e-05, + "loss": 0.7602, + "step": 17970 + }, + { + "epoch": 0.38621815526055764, + "grad_norm": 0.4996278152443501, + "learning_rate": 1.3680890605698543e-05, + "loss": 0.7644, + "step": 17980 + }, + { + "epoch": 0.3864329595738282, + "grad_norm": 0.522998743172047, + "learning_rate": 1.3674551825492688e-05, + "loss": 0.7701, + "step": 17990 + }, + { + "epoch": 0.38664776388709887, + "grad_norm": 0.5374567407341325, + "learning_rate": 1.3668211337913944e-05, + "loss": 0.7746, + "step": 18000 + }, + { + 
"epoch": 0.38686256820036946, + "grad_norm": 0.5196875770806691, + "learning_rate": 1.3661869145908407e-05, + "loss": 0.7478, + "step": 18010 + }, + { + "epoch": 0.38707737251364005, + "grad_norm": 0.5292498035951604, + "learning_rate": 1.365552525242296e-05, + "loss": 0.7645, + "step": 18020 + }, + { + "epoch": 0.3872921768269107, + "grad_norm": 0.5248382854490489, + "learning_rate": 1.3649179660405282e-05, + "loss": 0.7602, + "step": 18030 + }, + { + "epoch": 0.3875069811401813, + "grad_norm": 0.5313710119749431, + "learning_rate": 1.364283237280384e-05, + "loss": 0.7786, + "step": 18040 + }, + { + "epoch": 0.38772178545345193, + "grad_norm": 0.5307887108904168, + "learning_rate": 1.3636483392567884e-05, + "loss": 0.7738, + "step": 18050 + }, + { + "epoch": 0.3879365897667225, + "grad_norm": 0.540751829854641, + "learning_rate": 1.363013272264746e-05, + "loss": 0.7731, + "step": 18060 + }, + { + "epoch": 0.3881513940799931, + "grad_norm": 0.5156037186616597, + "learning_rate": 1.3623780365993389e-05, + "loss": 0.7552, + "step": 18070 + }, + { + "epoch": 0.38836619839326375, + "grad_norm": 0.5141088635988575, + "learning_rate": 1.3617426325557283e-05, + "loss": 0.7547, + "step": 18080 + }, + { + "epoch": 0.38858100270653434, + "grad_norm": 0.5318673688677644, + "learning_rate": 1.3611070604291535e-05, + "loss": 0.7689, + "step": 18090 + }, + { + "epoch": 0.388795807019805, + "grad_norm": 0.5436642226772909, + "learning_rate": 1.360471320514931e-05, + "loss": 0.7598, + "step": 18100 + }, + { + "epoch": 0.3890106113330756, + "grad_norm": 0.5438843302772419, + "learning_rate": 1.359835413108457e-05, + "loss": 0.7589, + "step": 18110 + }, + { + "epoch": 0.38922541564634616, + "grad_norm": 0.5257581273173912, + "learning_rate": 1.359199338505204e-05, + "loss": 0.7497, + "step": 18120 + }, + { + "epoch": 0.3894402199596168, + "grad_norm": 0.5190745673753684, + "learning_rate": 1.3585630970007228e-05, + "loss": 0.757, + "step": 18130 + }, + { + "epoch": 0.3896550242728874, + "grad_norm": 0.5299254643622509, + "learning_rate": 1.3579266888906422e-05, + "loss": 0.7564, + "step": 18140 + }, + { + "epoch": 0.389869828586158, + "grad_norm": 0.5305677893941887, + "learning_rate": 1.3572901144706675e-05, + "loss": 0.743, + "step": 18150 + }, + { + "epoch": 0.39008463289942863, + "grad_norm": 0.5173856069530872, + "learning_rate": 1.3566533740365812e-05, + "loss": 0.7552, + "step": 18160 + }, + { + "epoch": 0.3902994372126992, + "grad_norm": 0.5148950374087616, + "learning_rate": 1.3560164678842442e-05, + "loss": 0.7568, + "step": 18170 + }, + { + "epoch": 0.39051424152596986, + "grad_norm": 0.5407560251354756, + "learning_rate": 1.355379396309593e-05, + "loss": 0.7546, + "step": 18180 + }, + { + "epoch": 0.39072904583924045, + "grad_norm": 0.5124304727359418, + "learning_rate": 1.3547421596086425e-05, + "loss": 0.7565, + "step": 18190 + }, + { + "epoch": 0.39094385015251104, + "grad_norm": 0.5431987417174345, + "learning_rate": 1.3541047580774827e-05, + "loss": 0.756, + "step": 18200 + }, + { + "epoch": 0.3911586544657817, + "grad_norm": 0.5073011287145005, + "learning_rate": 1.3534671920122809e-05, + "loss": 0.7464, + "step": 18210 + }, + { + "epoch": 0.3913734587790523, + "grad_norm": 0.5325329739414308, + "learning_rate": 1.3528294617092807e-05, + "loss": 0.7605, + "step": 18220 + }, + { + "epoch": 0.3915882630923229, + "grad_norm": 0.5369634007500615, + "learning_rate": 1.3521915674648027e-05, + "loss": 0.7621, + "step": 18230 + }, + { + "epoch": 0.3918030674055935, + "grad_norm": 
0.5167864671283865, + "learning_rate": 1.351553509575243e-05, + "loss": 0.7719, + "step": 18240 + }, + { + "epoch": 0.3920178717188641, + "grad_norm": 0.5135322995315039, + "learning_rate": 1.3509152883370738e-05, + "loss": 0.7687, + "step": 18250 + }, + { + "epoch": 0.39223267603213474, + "grad_norm": 0.5320096412814102, + "learning_rate": 1.3502769040468428e-05, + "loss": 0.7483, + "step": 18260 + }, + { + "epoch": 0.39244748034540533, + "grad_norm": 0.5413014307472465, + "learning_rate": 1.3496383570011749e-05, + "loss": 0.7631, + "step": 18270 + }, + { + "epoch": 0.3926622846586759, + "grad_norm": 0.5335344669447194, + "learning_rate": 1.3489996474967688e-05, + "loss": 0.7688, + "step": 18280 + }, + { + "epoch": 0.39287708897194656, + "grad_norm": 0.5155463587590748, + "learning_rate": 1.3483607758304e-05, + "loss": 0.7603, + "step": 18290 + }, + { + "epoch": 0.39309189328521715, + "grad_norm": 0.5144797987402185, + "learning_rate": 1.3477217422989186e-05, + "loss": 0.7516, + "step": 18300 + }, + { + "epoch": 0.3933066975984878, + "grad_norm": 0.5138996863954206, + "learning_rate": 1.3470825471992508e-05, + "loss": 0.743, + "step": 18310 + }, + { + "epoch": 0.3935215019117584, + "grad_norm": 0.5084404683155206, + "learning_rate": 1.3464431908283966e-05, + "loss": 0.748, + "step": 18320 + }, + { + "epoch": 0.393736306225029, + "grad_norm": 0.5145070791156562, + "learning_rate": 1.3458036734834317e-05, + "loss": 0.7624, + "step": 18330 + }, + { + "epoch": 0.3939511105382996, + "grad_norm": 0.5228286492426271, + "learning_rate": 1.345163995461507e-05, + "loss": 0.7453, + "step": 18340 + }, + { + "epoch": 0.3941659148515702, + "grad_norm": 0.5234868717582319, + "learning_rate": 1.3445241570598471e-05, + "loss": 0.7755, + "step": 18350 + }, + { + "epoch": 0.39438071916484085, + "grad_norm": 0.5157437356324394, + "learning_rate": 1.3438841585757518e-05, + "loss": 0.7495, + "step": 18360 + }, + { + "epoch": 0.39459552347811144, + "grad_norm": 0.5390531614993764, + "learning_rate": 1.3432440003065949e-05, + "loss": 0.76, + "step": 18370 + }, + { + "epoch": 0.39481032779138203, + "grad_norm": 0.5384002860850948, + "learning_rate": 1.3426036825498248e-05, + "loss": 0.7666, + "step": 18380 + }, + { + "epoch": 0.3950251321046527, + "grad_norm": 0.5121988305761532, + "learning_rate": 1.3419632056029637e-05, + "loss": 0.7597, + "step": 18390 + }, + { + "epoch": 0.39523993641792327, + "grad_norm": 0.5104755486106719, + "learning_rate": 1.3413225697636079e-05, + "loss": 0.7483, + "step": 18400 + }, + { + "epoch": 0.3954547407311939, + "grad_norm": 0.5032637298232104, + "learning_rate": 1.3406817753294277e-05, + "loss": 0.7607, + "step": 18410 + }, + { + "epoch": 0.3956695450444645, + "grad_norm": 0.510737282878729, + "learning_rate": 1.3400408225981666e-05, + "loss": 0.7487, + "step": 18420 + }, + { + "epoch": 0.3958843493577351, + "grad_norm": 0.5527046441176208, + "learning_rate": 1.339399711867642e-05, + "loss": 0.76, + "step": 18430 + }, + { + "epoch": 0.39609915367100573, + "grad_norm": 0.5226806691957405, + "learning_rate": 1.338758443435745e-05, + "loss": 0.7537, + "step": 18440 + }, + { + "epoch": 0.3963139579842763, + "grad_norm": 0.5235233310915026, + "learning_rate": 1.3381170176004393e-05, + "loss": 0.7559, + "step": 18450 + }, + { + "epoch": 0.3965287622975469, + "grad_norm": 0.5168608712748513, + "learning_rate": 1.3374754346597622e-05, + "loss": 0.7523, + "step": 18460 + }, + { + "epoch": 0.39674356661081756, + "grad_norm": 0.5506026541978102, + "learning_rate": 1.3368336949118241e-05, 
+ "loss": 0.7521, + "step": 18470 + }, + { + "epoch": 0.39695837092408814, + "grad_norm": 0.5268087236613826, + "learning_rate": 1.3361917986548076e-05, + "loss": 0.7693, + "step": 18480 + }, + { + "epoch": 0.3971731752373588, + "grad_norm": 0.5299142550170114, + "learning_rate": 1.3355497461869686e-05, + "loss": 0.762, + "step": 18490 + }, + { + "epoch": 0.3973879795506294, + "grad_norm": 0.49923625368152, + "learning_rate": 1.3349075378066358e-05, + "loss": 0.7538, + "step": 18500 + }, + { + "epoch": 0.39760278386389997, + "grad_norm": 0.526770181891027, + "learning_rate": 1.3342651738122096e-05, + "loss": 0.7692, + "step": 18510 + }, + { + "epoch": 0.3978175881771706, + "grad_norm": 0.5278097868657259, + "learning_rate": 1.3336226545021633e-05, + "loss": 0.7674, + "step": 18520 + }, + { + "epoch": 0.3980323924904412, + "grad_norm": 0.5253722249102108, + "learning_rate": 1.3329799801750425e-05, + "loss": 0.768, + "step": 18530 + }, + { + "epoch": 0.39824719680371184, + "grad_norm": 0.536105570764583, + "learning_rate": 1.3323371511294636e-05, + "loss": 0.762, + "step": 18540 + }, + { + "epoch": 0.39846200111698243, + "grad_norm": 0.5193573833212372, + "learning_rate": 1.3316941676641162e-05, + "loss": 0.7589, + "step": 18550 + }, + { + "epoch": 0.398676805430253, + "grad_norm": 0.5070127139643907, + "learning_rate": 1.3310510300777615e-05, + "loss": 0.7628, + "step": 18560 + }, + { + "epoch": 0.39889160974352367, + "grad_norm": 0.5191238299803989, + "learning_rate": 1.3304077386692318e-05, + "loss": 0.751, + "step": 18570 + }, + { + "epoch": 0.39910641405679426, + "grad_norm": 0.5205364024981556, + "learning_rate": 1.3297642937374313e-05, + "loss": 0.7545, + "step": 18580 + }, + { + "epoch": 0.39932121837006485, + "grad_norm": 0.5187329471387797, + "learning_rate": 1.3291206955813354e-05, + "loss": 0.7464, + "step": 18590 + }, + { + "epoch": 0.3995360226833355, + "grad_norm": 0.5049021637761884, + "learning_rate": 1.3284769444999901e-05, + "loss": 0.7595, + "step": 18600 + }, + { + "epoch": 0.3997508269966061, + "grad_norm": 0.5029990992742273, + "learning_rate": 1.3278330407925135e-05, + "loss": 0.7543, + "step": 18610 + }, + { + "epoch": 0.3999656313098767, + "grad_norm": 0.5173702160515916, + "learning_rate": 1.327188984758094e-05, + "loss": 0.7619, + "step": 18620 + }, + { + "epoch": 0.4001804356231473, + "grad_norm": 0.5192562549904551, + "learning_rate": 1.3265447766959911e-05, + "loss": 0.7561, + "step": 18630 + }, + { + "epoch": 0.4003952399364179, + "grad_norm": 0.5312951137431535, + "learning_rate": 1.3259004169055346e-05, + "loss": 0.7694, + "step": 18640 + }, + { + "epoch": 0.40061004424968855, + "grad_norm": 0.5266364154510266, + "learning_rate": 1.3252559056861246e-05, + "loss": 0.7543, + "step": 18650 + }, + { + "epoch": 0.40082484856295914, + "grad_norm": 0.533316964707828, + "learning_rate": 1.3246112433372322e-05, + "loss": 0.7554, + "step": 18660 + }, + { + "epoch": 0.4010396528762298, + "grad_norm": 0.5482477597760441, + "learning_rate": 1.3239664301583988e-05, + "loss": 0.7656, + "step": 18670 + }, + { + "epoch": 0.40125445718950037, + "grad_norm": 0.5206451326269249, + "learning_rate": 1.3233214664492349e-05, + "loss": 0.7543, + "step": 18680 + }, + { + "epoch": 0.40146926150277096, + "grad_norm": 0.5418016042557484, + "learning_rate": 1.3226763525094217e-05, + "loss": 0.7611, + "step": 18690 + }, + { + "epoch": 0.4016840658160416, + "grad_norm": 0.5558615173058707, + "learning_rate": 1.3220310886387103e-05, + "loss": 0.7627, + "step": 18700 + }, + { + "epoch": 
0.4018988701293122, + "grad_norm": 0.5143709640231149, + "learning_rate": 1.3213856751369207e-05, + "loss": 0.7714, + "step": 18710 + }, + { + "epoch": 0.4021136744425828, + "grad_norm": 0.5376677743737073, + "learning_rate": 1.3207401123039438e-05, + "loss": 0.7691, + "step": 18720 + }, + { + "epoch": 0.4023284787558534, + "grad_norm": 0.5399761390877215, + "learning_rate": 1.3200944004397383e-05, + "loss": 0.7456, + "step": 18730 + }, + { + "epoch": 0.402543283069124, + "grad_norm": 0.5249199592562105, + "learning_rate": 1.3194485398443332e-05, + "loss": 0.7505, + "step": 18740 + }, + { + "epoch": 0.40275808738239466, + "grad_norm": 0.5155915016561029, + "learning_rate": 1.318802530817826e-05, + "loss": 0.7306, + "step": 18750 + }, + { + "epoch": 0.40297289169566525, + "grad_norm": 0.5318691830490915, + "learning_rate": 1.3181563736603837e-05, + "loss": 0.748, + "step": 18760 + }, + { + "epoch": 0.40318769600893584, + "grad_norm": 0.515075633683763, + "learning_rate": 1.317510068672242e-05, + "loss": 0.7597, + "step": 18770 + }, + { + "epoch": 0.4034025003222065, + "grad_norm": 0.5179116497827865, + "learning_rate": 1.316863616153705e-05, + "loss": 0.7613, + "step": 18780 + }, + { + "epoch": 0.40361730463547707, + "grad_norm": 0.5332150745577201, + "learning_rate": 1.3162170164051456e-05, + "loss": 0.7521, + "step": 18790 + }, + { + "epoch": 0.4038321089487477, + "grad_norm": 0.5253427224156337, + "learning_rate": 1.3155702697270047e-05, + "loss": 0.7733, + "step": 18800 + }, + { + "epoch": 0.4040469132620183, + "grad_norm": 0.49768785272010546, + "learning_rate": 1.3149233764197922e-05, + "loss": 0.7566, + "step": 18810 + }, + { + "epoch": 0.4042617175752889, + "grad_norm": 0.5267923414911647, + "learning_rate": 1.3142763367840857e-05, + "loss": 0.7667, + "step": 18820 + }, + { + "epoch": 0.40447652188855954, + "grad_norm": 0.5282610764687665, + "learning_rate": 1.3136291511205306e-05, + "loss": 0.755, + "step": 18830 + }, + { + "epoch": 0.4046913262018301, + "grad_norm": 0.5223892713619213, + "learning_rate": 1.3129818197298405e-05, + "loss": 0.7738, + "step": 18840 + }, + { + "epoch": 0.40490613051510077, + "grad_norm": 0.5096764777578232, + "learning_rate": 1.3123343429127968e-05, + "loss": 0.7588, + "step": 18850 + }, + { + "epoch": 0.40512093482837136, + "grad_norm": 0.5467891192906261, + "learning_rate": 1.3116867209702479e-05, + "loss": 0.7592, + "step": 18860 + }, + { + "epoch": 0.40533573914164195, + "grad_norm": 1.134506804779343, + "learning_rate": 1.3110389542031102e-05, + "loss": 0.7478, + "step": 18870 + }, + { + "epoch": 0.4055505434549126, + "grad_norm": 0.5097103865358088, + "learning_rate": 1.310391042912367e-05, + "loss": 0.7512, + "step": 18880 + }, + { + "epoch": 0.4057653477681832, + "grad_norm": 0.5393124410906558, + "learning_rate": 1.3097429873990693e-05, + "loss": 0.7485, + "step": 18890 + }, + { + "epoch": 0.40598015208145377, + "grad_norm": 0.5190274805087902, + "learning_rate": 1.3090947879643344e-05, + "loss": 0.7406, + "step": 18900 + }, + { + "epoch": 0.4061949563947244, + "grad_norm": 0.5440017480426464, + "learning_rate": 1.308446444909347e-05, + "loss": 0.7711, + "step": 18910 + }, + { + "epoch": 0.406409760707995, + "grad_norm": 0.5154904534059127, + "learning_rate": 1.3077979585353582e-05, + "loss": 0.75, + "step": 18920 + }, + { + "epoch": 0.40662456502126565, + "grad_norm": 0.5278477316199867, + "learning_rate": 1.307149329143686e-05, + "loss": 0.7612, + "step": 18930 + }, + { + "epoch": 0.40683936933453624, + "grad_norm": 0.5322812965577661, + 
"learning_rate": 1.3065005570357148e-05, + "loss": 0.7624, + "step": 18940 + }, + { + "epoch": 0.4070541736478068, + "grad_norm": 0.5160265527970386, + "learning_rate": 1.305851642512895e-05, + "loss": 0.7472, + "step": 18950 + }, + { + "epoch": 0.40726897796107747, + "grad_norm": 0.5318911272420563, + "learning_rate": 1.3052025858767435e-05, + "loss": 0.7703, + "step": 18960 + }, + { + "epoch": 0.40748378227434806, + "grad_norm": 0.5055369294794778, + "learning_rate": 1.3045533874288429e-05, + "loss": 0.7505, + "step": 18970 + }, + { + "epoch": 0.4076985865876187, + "grad_norm": 0.5298819562145214, + "learning_rate": 1.3039040474708422e-05, + "loss": 0.7645, + "step": 18980 + }, + { + "epoch": 0.4079133909008893, + "grad_norm": 0.552924739228477, + "learning_rate": 1.3032545663044558e-05, + "loss": 0.7734, + "step": 18990 + }, + { + "epoch": 0.4081281952141599, + "grad_norm": 0.5346123293209584, + "learning_rate": 1.3026049442314636e-05, + "loss": 0.7553, + "step": 19000 + }, + { + "epoch": 0.40834299952743053, + "grad_norm": 0.5246739179110647, + "learning_rate": 1.3019551815537117e-05, + "loss": 0.7505, + "step": 19010 + }, + { + "epoch": 0.4085578038407011, + "grad_norm": 0.5338166445740993, + "learning_rate": 1.3013052785731102e-05, + "loss": 0.7414, + "step": 19020 + }, + { + "epoch": 0.4087726081539717, + "grad_norm": 0.5109686263652257, + "learning_rate": 1.3006552355916355e-05, + "loss": 0.7477, + "step": 19030 + }, + { + "epoch": 0.40898741246724235, + "grad_norm": 0.5045191677684051, + "learning_rate": 1.300005052911329e-05, + "loss": 0.7546, + "step": 19040 + }, + { + "epoch": 0.40920221678051294, + "grad_norm": 0.5374579208608851, + "learning_rate": 1.2993547308342965e-05, + "loss": 0.7639, + "step": 19050 + }, + { + "epoch": 0.4094170210937836, + "grad_norm": 0.5316853043110944, + "learning_rate": 1.2987042696627085e-05, + "loss": 0.7595, + "step": 19060 + }, + { + "epoch": 0.4096318254070542, + "grad_norm": 0.5278988439756973, + "learning_rate": 1.298053669698801e-05, + "loss": 0.735, + "step": 19070 + }, + { + "epoch": 0.40984662972032476, + "grad_norm": 0.5654598465285191, + "learning_rate": 1.2974029312448737e-05, + "loss": 0.7684, + "step": 19080 + }, + { + "epoch": 0.4100614340335954, + "grad_norm": 0.5162837541623254, + "learning_rate": 1.2967520546032905e-05, + "loss": 0.7579, + "step": 19090 + }, + { + "epoch": 0.410276238346866, + "grad_norm": 0.5224389671190344, + "learning_rate": 1.2961010400764807e-05, + "loss": 0.7513, + "step": 19100 + }, + { + "epoch": 0.41049104266013664, + "grad_norm": 0.5176219223001366, + "learning_rate": 1.2954498879669361e-05, + "loss": 0.7638, + "step": 19110 + }, + { + "epoch": 0.41070584697340723, + "grad_norm": 0.534813023204064, + "learning_rate": 1.2947985985772137e-05, + "loss": 0.762, + "step": 19120 + }, + { + "epoch": 0.4109206512866778, + "grad_norm": 0.5272695180224509, + "learning_rate": 1.2941471722099332e-05, + "loss": 0.7592, + "step": 19130 + }, + { + "epoch": 0.41113545559994846, + "grad_norm": 0.5378612978101573, + "learning_rate": 1.2934956091677788e-05, + "loss": 0.7554, + "step": 19140 + }, + { + "epoch": 0.41135025991321905, + "grad_norm": 0.5258722653546618, + "learning_rate": 1.2928439097534981e-05, + "loss": 0.7635, + "step": 19150 + }, + { + "epoch": 0.41156506422648964, + "grad_norm": 0.5351602497774721, + "learning_rate": 1.2921920742699019e-05, + "loss": 0.7483, + "step": 19160 + }, + { + "epoch": 0.4117798685397603, + "grad_norm": 0.5367356185774689, + "learning_rate": 1.291540103019864e-05, + "loss": 
0.755, + "step": 19170 + }, + { + "epoch": 0.4119946728530309, + "grad_norm": 0.5255579867546255, + "learning_rate": 1.2908879963063212e-05, + "loss": 0.7603, + "step": 19180 + }, + { + "epoch": 0.4122094771663015, + "grad_norm": 0.5058299224660839, + "learning_rate": 1.2902357544322741e-05, + "loss": 0.7623, + "step": 19190 + }, + { + "epoch": 0.4124242814795721, + "grad_norm": 0.5257377107858072, + "learning_rate": 1.289583377700785e-05, + "loss": 0.7433, + "step": 19200 + }, + { + "epoch": 0.4126390857928427, + "grad_norm": 0.5202414578374462, + "learning_rate": 1.28893086641498e-05, + "loss": 0.7522, + "step": 19210 + }, + { + "epoch": 0.41285389010611334, + "grad_norm": 0.514742134271601, + "learning_rate": 1.288278220878047e-05, + "loss": 0.7478, + "step": 19220 + }, + { + "epoch": 0.41306869441938393, + "grad_norm": 0.498665233670508, + "learning_rate": 1.287625441393236e-05, + "loss": 0.7732, + "step": 19230 + }, + { + "epoch": 0.4132834987326546, + "grad_norm": 0.5152993923389213, + "learning_rate": 1.2869725282638596e-05, + "loss": 0.7389, + "step": 19240 + }, + { + "epoch": 0.41349830304592516, + "grad_norm": 0.5369968074928185, + "learning_rate": 1.286319481793293e-05, + "loss": 0.7559, + "step": 19250 + }, + { + "epoch": 0.41371310735919575, + "grad_norm": 0.5168991961504903, + "learning_rate": 1.2856663022849724e-05, + "loss": 0.7425, + "step": 19260 + }, + { + "epoch": 0.4139279116724664, + "grad_norm": 0.5226064009439039, + "learning_rate": 1.2850129900423972e-05, + "loss": 0.7476, + "step": 19270 + }, + { + "epoch": 0.414142715985737, + "grad_norm": 0.5338894520277626, + "learning_rate": 1.2843595453691262e-05, + "loss": 0.7701, + "step": 19280 + }, + { + "epoch": 0.41435752029900763, + "grad_norm": 0.5143229406464946, + "learning_rate": 1.2837059685687823e-05, + "loss": 0.7565, + "step": 19290 + }, + { + "epoch": 0.4145723246122782, + "grad_norm": 0.5162616207638232, + "learning_rate": 1.2830522599450479e-05, + "loss": 0.765, + "step": 19300 + }, + { + "epoch": 0.4147871289255488, + "grad_norm": 0.5006245141202402, + "learning_rate": 1.2823984198016676e-05, + "loss": 0.7403, + "step": 19310 + }, + { + "epoch": 0.41500193323881945, + "grad_norm": 0.5284695404789536, + "learning_rate": 1.281744448442447e-05, + "loss": 0.7545, + "step": 19320 + }, + { + "epoch": 0.41521673755209004, + "grad_norm": 0.5214124159977402, + "learning_rate": 1.2810903461712524e-05, + "loss": 0.761, + "step": 19330 + }, + { + "epoch": 0.41543154186536063, + "grad_norm": 0.5254239866013033, + "learning_rate": 1.2804361132920114e-05, + "loss": 0.7446, + "step": 19340 + }, + { + "epoch": 0.4156463461786313, + "grad_norm": 0.5278037952638479, + "learning_rate": 1.2797817501087113e-05, + "loss": 0.7572, + "step": 19350 + }, + { + "epoch": 0.41586115049190187, + "grad_norm": 0.5367094234076616, + "learning_rate": 1.2791272569254009e-05, + "loss": 0.7462, + "step": 19360 + }, + { + "epoch": 0.4160759548051725, + "grad_norm": 0.5252418860683364, + "learning_rate": 1.2784726340461892e-05, + "loss": 0.7693, + "step": 19370 + }, + { + "epoch": 0.4162907591184431, + "grad_norm": 0.5055520616899003, + "learning_rate": 1.2778178817752454e-05, + "loss": 0.7481, + "step": 19380 + }, + { + "epoch": 0.4165055634317137, + "grad_norm": 0.5204456122832675, + "learning_rate": 1.2771630004167985e-05, + "loss": 0.7419, + "step": 19390 + }, + { + "epoch": 0.41672036774498433, + "grad_norm": 0.5217514946799076, + "learning_rate": 1.2765079902751381e-05, + "loss": 0.7459, + "step": 19400 + }, + { + "epoch": 
0.4169351720582549, + "grad_norm": 0.53197721768162, + "learning_rate": 1.275852851654613e-05, + "loss": 0.7451, + "step": 19410 + }, + { + "epoch": 0.41714997637152557, + "grad_norm": 0.5345894761871923, + "learning_rate": 1.2751975848596324e-05, + "loss": 0.7545, + "step": 19420 + }, + { + "epoch": 0.41736478068479615, + "grad_norm": 0.5172473442706328, + "learning_rate": 1.274542190194664e-05, + "loss": 0.7633, + "step": 19430 + }, + { + "epoch": 0.41757958499806674, + "grad_norm": 0.5325296172080332, + "learning_rate": 1.2738866679642365e-05, + "loss": 0.749, + "step": 19440 + }, + { + "epoch": 0.4177943893113374, + "grad_norm": 0.5270837129930179, + "learning_rate": 1.2732310184729362e-05, + "loss": 0.7563, + "step": 19450 + }, + { + "epoch": 0.418009193624608, + "grad_norm": 0.514258180577775, + "learning_rate": 1.2725752420254094e-05, + "loss": 0.7591, + "step": 19460 + }, + { + "epoch": 0.41822399793787857, + "grad_norm": 0.5187969540802417, + "learning_rate": 1.2719193389263613e-05, + "loss": 0.7482, + "step": 19470 + }, + { + "epoch": 0.4184388022511492, + "grad_norm": 0.5303861691448711, + "learning_rate": 1.2712633094805561e-05, + "loss": 0.7547, + "step": 19480 + }, + { + "epoch": 0.4186536065644198, + "grad_norm": 0.5105008679720247, + "learning_rate": 1.2706071539928166e-05, + "loss": 0.7591, + "step": 19490 + }, + { + "epoch": 0.41886841087769044, + "grad_norm": 0.5277024751757646, + "learning_rate": 1.2699508727680238e-05, + "loss": 0.7563, + "step": 19500 + }, + { + "epoch": 0.41908321519096103, + "grad_norm": 0.5341955327167359, + "learning_rate": 1.2692944661111176e-05, + "loss": 0.7616, + "step": 19510 + }, + { + "epoch": 0.4192980195042316, + "grad_norm": 0.5321207219044839, + "learning_rate": 1.2686379343270956e-05, + "loss": 0.7487, + "step": 19520 + }, + { + "epoch": 0.41951282381750227, + "grad_norm": 0.5028973933951222, + "learning_rate": 1.2679812777210142e-05, + "loss": 0.7589, + "step": 19530 + }, + { + "epoch": 0.41972762813077286, + "grad_norm": 0.5333649392140662, + "learning_rate": 1.2673244965979881e-05, + "loss": 0.7499, + "step": 19540 + }, + { + "epoch": 0.4199424324440435, + "grad_norm": 0.8809161047806207, + "learning_rate": 1.2666675912631885e-05, + "loss": 0.7431, + "step": 19550 + }, + { + "epoch": 0.4201572367573141, + "grad_norm": 0.547365448998943, + "learning_rate": 1.2660105620218452e-05, + "loss": 0.7516, + "step": 19560 + }, + { + "epoch": 0.4203720410705847, + "grad_norm": 0.5278558806116719, + "learning_rate": 1.2653534091792459e-05, + "loss": 0.7572, + "step": 19570 + }, + { + "epoch": 0.4205868453838553, + "grad_norm": 0.5242417730519813, + "learning_rate": 1.2646961330407349e-05, + "loss": 0.7561, + "step": 19580 + }, + { + "epoch": 0.4208016496971259, + "grad_norm": 0.5145264148757996, + "learning_rate": 1.264038733911714e-05, + "loss": 0.7526, + "step": 19590 + }, + { + "epoch": 0.4210164540103965, + "grad_norm": 0.5226054611704246, + "learning_rate": 1.2633812120976432e-05, + "loss": 0.7585, + "step": 19600 + }, + { + "epoch": 0.42123125832366715, + "grad_norm": 0.5094656107452357, + "learning_rate": 1.262723567904038e-05, + "loss": 0.7431, + "step": 19610 + }, + { + "epoch": 0.42144606263693773, + "grad_norm": 0.5208004120644633, + "learning_rate": 1.2620658016364713e-05, + "loss": 0.7481, + "step": 19620 + }, + { + "epoch": 0.4216608669502084, + "grad_norm": 0.5368286486289077, + "learning_rate": 1.2614079136005732e-05, + "loss": 0.7659, + "step": 19630 + }, + { + "epoch": 0.42187567126347897, + "grad_norm": 0.5470952446979304, + 
"learning_rate": 1.26074990410203e-05, + "loss": 0.7511, + "step": 19640 + }, + { + "epoch": 0.42209047557674956, + "grad_norm": 0.5132946022336456, + "learning_rate": 1.2600917734465843e-05, + "loss": 0.7725, + "step": 19650 + }, + { + "epoch": 0.4223052798900202, + "grad_norm": 0.5597310989128308, + "learning_rate": 1.2594335219400349e-05, + "loss": 0.7431, + "step": 19660 + }, + { + "epoch": 0.4225200842032908, + "grad_norm": 0.5260208167606223, + "learning_rate": 1.2587751498882376e-05, + "loss": 0.7533, + "step": 19670 + }, + { + "epoch": 0.42273488851656144, + "grad_norm": 0.5029129520256707, + "learning_rate": 1.2581166575971031e-05, + "loss": 0.7506, + "step": 19680 + }, + { + "epoch": 0.422949692829832, + "grad_norm": 0.5202725382006363, + "learning_rate": 1.2574580453725987e-05, + "loss": 0.7541, + "step": 19690 + }, + { + "epoch": 0.4231644971431026, + "grad_norm": 0.5134917201548623, + "learning_rate": 1.256799313520747e-05, + "loss": 0.7564, + "step": 19700 + }, + { + "epoch": 0.42337930145637326, + "grad_norm": 0.5354376298898013, + "learning_rate": 1.256140462347627e-05, + "loss": 0.7594, + "step": 19710 + }, + { + "epoch": 0.42359410576964385, + "grad_norm": 0.5211973403768663, + "learning_rate": 1.2554814921593716e-05, + "loss": 0.7584, + "step": 19720 + }, + { + "epoch": 0.4238089100829145, + "grad_norm": 0.5520144836915775, + "learning_rate": 1.2548224032621707e-05, + "loss": 0.7486, + "step": 19730 + }, + { + "epoch": 0.4240237143961851, + "grad_norm": 0.4998308570756764, + "learning_rate": 1.2541631959622684e-05, + "loss": 0.7462, + "step": 19740 + }, + { + "epoch": 0.42423851870945567, + "grad_norm": 0.496745741982448, + "learning_rate": 1.2535038705659637e-05, + "loss": 0.7557, + "step": 19750 + }, + { + "epoch": 0.4244533230227263, + "grad_norm": 0.5179608442643614, + "learning_rate": 1.2528444273796113e-05, + "loss": 0.7559, + "step": 19760 + }, + { + "epoch": 0.4246681273359969, + "grad_norm": 0.5237889562247441, + "learning_rate": 1.2521848667096196e-05, + "loss": 0.759, + "step": 19770 + }, + { + "epoch": 0.4248829316492675, + "grad_norm": 0.5448250284811663, + "learning_rate": 1.251525188862452e-05, + "loss": 0.7612, + "step": 19780 + }, + { + "epoch": 0.42509773596253814, + "grad_norm": 0.5060894163492381, + "learning_rate": 1.250865394144627e-05, + "loss": 0.7593, + "step": 19790 + }, + { + "epoch": 0.4253125402758087, + "grad_norm": 0.49819573581917886, + "learning_rate": 1.2502054828627168e-05, + "loss": 0.7429, + "step": 19800 + }, + { + "epoch": 0.42552734458907937, + "grad_norm": 0.5213140703162686, + "learning_rate": 1.2495454553233473e-05, + "loss": 0.7549, + "step": 19810 + }, + { + "epoch": 0.42574214890234996, + "grad_norm": 0.5205619160973527, + "learning_rate": 1.2488853118331993e-05, + "loss": 0.7381, + "step": 19820 + }, + { + "epoch": 0.42595695321562055, + "grad_norm": 0.5411772964173832, + "learning_rate": 1.2482250526990072e-05, + "loss": 0.7563, + "step": 19830 + }, + { + "epoch": 0.4261717575288912, + "grad_norm": 0.5070043547298757, + "learning_rate": 1.2475646782275588e-05, + "loss": 0.7345, + "step": 19840 + }, + { + "epoch": 0.4263865618421618, + "grad_norm": 0.5446352761437885, + "learning_rate": 1.2469041887256955e-05, + "loss": 0.7575, + "step": 19850 + }, + { + "epoch": 0.4266013661554324, + "grad_norm": 0.5071987431688498, + "learning_rate": 1.2462435845003131e-05, + "loss": 0.7446, + "step": 19860 + }, + { + "epoch": 0.426816170468703, + "grad_norm": 0.5207497470969883, + "learning_rate": 1.2455828658583595e-05, + "loss": 0.7656, 
+ "step": 19870 + }, + { + "epoch": 0.4270309747819736, + "grad_norm": 0.5117446569849975, + "learning_rate": 1.2449220331068363e-05, + "loss": 0.7582, + "step": 19880 + }, + { + "epoch": 0.42724577909524425, + "grad_norm": 0.5264990840912481, + "learning_rate": 1.2442610865527977e-05, + "loss": 0.7442, + "step": 19890 + }, + { + "epoch": 0.42746058340851484, + "grad_norm": 0.5258670236773598, + "learning_rate": 1.2436000265033518e-05, + "loss": 0.7531, + "step": 19900 + }, + { + "epoch": 0.4276753877217854, + "grad_norm": 0.538439943387158, + "learning_rate": 1.2429388532656586e-05, + "loss": 0.764, + "step": 19910 + }, + { + "epoch": 0.42789019203505607, + "grad_norm": 0.5177931014614214, + "learning_rate": 1.242277567146931e-05, + "loss": 0.7528, + "step": 19920 + }, + { + "epoch": 0.42810499634832666, + "grad_norm": 0.5194066108047999, + "learning_rate": 1.2416161684544337e-05, + "loss": 0.7478, + "step": 19930 + }, + { + "epoch": 0.4283198006615973, + "grad_norm": 0.5223576429675216, + "learning_rate": 1.2409546574954844e-05, + "loss": 0.7436, + "step": 19940 + }, + { + "epoch": 0.4285346049748679, + "grad_norm": 0.5331311220294771, + "learning_rate": 1.2402930345774533e-05, + "loss": 0.7626, + "step": 19950 + }, + { + "epoch": 0.4287494092881385, + "grad_norm": 0.5321622654012298, + "learning_rate": 1.2396313000077613e-05, + "loss": 0.7528, + "step": 19960 + }, + { + "epoch": 0.42896421360140913, + "grad_norm": 0.5213440299088972, + "learning_rate": 1.2389694540938828e-05, + "loss": 0.7607, + "step": 19970 + }, + { + "epoch": 0.4291790179146797, + "grad_norm": 0.5451420730102323, + "learning_rate": 1.238307497143343e-05, + "loss": 0.7514, + "step": 19980 + }, + { + "epoch": 0.42939382222795036, + "grad_norm": 0.5396602450861735, + "learning_rate": 1.2376454294637182e-05, + "loss": 0.754, + "step": 19990 + }, + { + "epoch": 0.42960862654122095, + "grad_norm": 0.5294320493844773, + "learning_rate": 1.2369832513626372e-05, + "loss": 0.7417, + "step": 20000 + }, + { + "epoch": 0.42982343085449154, + "grad_norm": 0.5229398240238433, + "learning_rate": 1.2363209631477797e-05, + "loss": 0.7516, + "step": 20010 + }, + { + "epoch": 0.4300382351677622, + "grad_norm": 0.5169446602861949, + "learning_rate": 1.2356585651268765e-05, + "loss": 0.7475, + "step": 20020 + }, + { + "epoch": 0.4302530394810328, + "grad_norm": 0.5471875007778104, + "learning_rate": 1.2349960576077097e-05, + "loss": 0.7589, + "step": 20030 + }, + { + "epoch": 0.43046784379430336, + "grad_norm": 0.5125110507107423, + "learning_rate": 1.2343334408981116e-05, + "loss": 0.7572, + "step": 20040 + }, + { + "epoch": 0.430682648107574, + "grad_norm": 0.5036197114282005, + "learning_rate": 1.2336707153059662e-05, + "loss": 0.7507, + "step": 20050 + }, + { + "epoch": 0.4308974524208446, + "grad_norm": 0.5169855576103094, + "learning_rate": 1.2330078811392068e-05, + "loss": 0.7567, + "step": 20060 + }, + { + "epoch": 0.43111225673411524, + "grad_norm": 0.5229836121047303, + "learning_rate": 1.2323449387058185e-05, + "loss": 0.7433, + "step": 20070 + }, + { + "epoch": 0.43132706104738583, + "grad_norm": 0.5272331493503679, + "learning_rate": 1.2316818883138362e-05, + "loss": 0.7618, + "step": 20080 + }, + { + "epoch": 0.4315418653606564, + "grad_norm": 0.5127750810913362, + "learning_rate": 1.2310187302713448e-05, + "loss": 0.7557, + "step": 20090 + }, + { + "epoch": 0.43175666967392706, + "grad_norm": 0.5046998338723229, + "learning_rate": 1.2303554648864791e-05, + "loss": 0.7406, + "step": 20100 + }, + { + "epoch": 
0.43197147398719765, + "grad_norm": 0.5256594356973203, + "learning_rate": 1.229692092467424e-05, + "loss": 0.7516, + "step": 20110 + }, + { + "epoch": 0.4321862783004683, + "grad_norm": 0.5171058923668201, + "learning_rate": 1.2290286133224146e-05, + "loss": 0.7513, + "step": 20120 + }, + { + "epoch": 0.4324010826137389, + "grad_norm": 0.5104680411343472, + "learning_rate": 1.2283650277597343e-05, + "loss": 0.7378, + "step": 20130 + }, + { + "epoch": 0.4326158869270095, + "grad_norm": 0.5133259060250314, + "learning_rate": 1.2277013360877174e-05, + "loss": 0.7497, + "step": 20140 + }, + { + "epoch": 0.4328306912402801, + "grad_norm": 0.5068993271066068, + "learning_rate": 1.2270375386147466e-05, + "loss": 0.7562, + "step": 20150 + }, + { + "epoch": 0.4330454955535507, + "grad_norm": 0.5411116105681395, + "learning_rate": 1.2263736356492541e-05, + "loss": 0.7513, + "step": 20160 + }, + { + "epoch": 0.43326029986682135, + "grad_norm": 0.49754370415948745, + "learning_rate": 1.2257096274997209e-05, + "loss": 0.7427, + "step": 20170 + }, + { + "epoch": 0.43347510418009194, + "grad_norm": 0.5163307527085924, + "learning_rate": 1.2250455144746776e-05, + "loss": 0.7383, + "step": 20180 + }, + { + "epoch": 0.43368990849336253, + "grad_norm": 0.5302499060158602, + "learning_rate": 1.2243812968827021e-05, + "loss": 0.7304, + "step": 20190 + }, + { + "epoch": 0.4339047128066332, + "grad_norm": 0.5089583346915815, + "learning_rate": 1.223716975032422e-05, + "loss": 0.7482, + "step": 20200 + }, + { + "epoch": 0.43411951711990376, + "grad_norm": 0.5302791890282693, + "learning_rate": 1.2230525492325133e-05, + "loss": 0.7587, + "step": 20210 + }, + { + "epoch": 0.43433432143317435, + "grad_norm": 0.5183200687604282, + "learning_rate": 1.2223880197916997e-05, + "loss": 0.7385, + "step": 20220 + }, + { + "epoch": 0.434549125746445, + "grad_norm": 0.5283129948501645, + "learning_rate": 1.2217233870187539e-05, + "loss": 0.7618, + "step": 20230 + }, + { + "epoch": 0.4347639300597156, + "grad_norm": 0.5334500046605607, + "learning_rate": 1.221058651222496e-05, + "loss": 0.7574, + "step": 20240 + }, + { + "epoch": 0.43497873437298623, + "grad_norm": 0.5397388701861974, + "learning_rate": 1.220393812711794e-05, + "loss": 0.7544, + "step": 20250 + }, + { + "epoch": 0.4351935386862568, + "grad_norm": 0.5557994197082688, + "learning_rate": 1.2197288717955636e-05, + "loss": 0.7515, + "step": 20260 + }, + { + "epoch": 0.4354083429995274, + "grad_norm": 0.5214757763867583, + "learning_rate": 1.2190638287827683e-05, + "loss": 0.7562, + "step": 20270 + }, + { + "epoch": 0.43562314731279805, + "grad_norm": 0.5363420017553637, + "learning_rate": 1.2183986839824191e-05, + "loss": 0.7552, + "step": 20280 + }, + { + "epoch": 0.43583795162606864, + "grad_norm": 0.5173925675212045, + "learning_rate": 1.217733437703574e-05, + "loss": 0.7454, + "step": 20290 + }, + { + "epoch": 0.4360527559393393, + "grad_norm": 0.5052520562415731, + "learning_rate": 1.2170680902553388e-05, + "loss": 0.7335, + "step": 20300 + }, + { + "epoch": 0.4362675602526099, + "grad_norm": 0.5214947525173893, + "learning_rate": 1.216402641946865e-05, + "loss": 0.7533, + "step": 20310 + }, + { + "epoch": 0.43648236456588047, + "grad_norm": 0.5066915832021032, + "learning_rate": 1.2157370930873521e-05, + "loss": 0.7466, + "step": 20320 + }, + { + "epoch": 0.4366971688791511, + "grad_norm": 0.544927092725932, + "learning_rate": 1.2150714439860463e-05, + "loss": 0.7422, + "step": 20330 + }, + { + "epoch": 0.4369119731924217, + "grad_norm": 0.5232539415746384, 
+ "learning_rate": 1.2144056949522396e-05, + "loss": 0.7456, + "step": 20340 + }, + { + "epoch": 0.4371267775056923, + "grad_norm": 0.5358017048774394, + "learning_rate": 1.2137398462952714e-05, + "loss": 0.761, + "step": 20350 + }, + { + "epoch": 0.43734158181896293, + "grad_norm": 0.514296276827805, + "learning_rate": 1.2130738983245265e-05, + "loss": 0.7406, + "step": 20360 + }, + { + "epoch": 0.4375563861322335, + "grad_norm": 0.5274917168112747, + "learning_rate": 1.2124078513494366e-05, + "loss": 0.7498, + "step": 20370 + }, + { + "epoch": 0.43777119044550417, + "grad_norm": 0.5091641073845611, + "learning_rate": 1.2117417056794787e-05, + "loss": 0.7669, + "step": 20380 + }, + { + "epoch": 0.43798599475877475, + "grad_norm": 0.5238531386173619, + "learning_rate": 1.211075461624176e-05, + "loss": 0.7454, + "step": 20390 + }, + { + "epoch": 0.43820079907204534, + "grad_norm": 0.5263012775160619, + "learning_rate": 1.2104091194930976e-05, + "loss": 0.7546, + "step": 20400 + }, + { + "epoch": 0.438415603385316, + "grad_norm": 0.5206973036533034, + "learning_rate": 1.2097426795958582e-05, + "loss": 0.7633, + "step": 20410 + }, + { + "epoch": 0.4386304076985866, + "grad_norm": 0.5329927051305835, + "learning_rate": 1.2090761422421173e-05, + "loss": 0.7492, + "step": 20420 + }, + { + "epoch": 0.4388452120118572, + "grad_norm": 0.5248347095576246, + "learning_rate": 1.2084095077415802e-05, + "loss": 0.7525, + "step": 20430 + }, + { + "epoch": 0.4390600163251278, + "grad_norm": 0.5763619430364404, + "learning_rate": 1.2077427764039976e-05, + "loss": 0.763, + "step": 20440 + }, + { + "epoch": 0.4392748206383984, + "grad_norm": 0.5277721502587376, + "learning_rate": 1.2070759485391642e-05, + "loss": 0.7728, + "step": 20450 + }, + { + "epoch": 0.43948962495166904, + "grad_norm": 0.5349602899110175, + "learning_rate": 1.206409024456921e-05, + "loss": 0.7529, + "step": 20460 + }, + { + "epoch": 0.43970442926493963, + "grad_norm": 0.5322814748856364, + "learning_rate": 1.2057420044671517e-05, + "loss": 0.7403, + "step": 20470 + }, + { + "epoch": 0.4399192335782103, + "grad_norm": 0.5118893319232871, + "learning_rate": 1.205074888879787e-05, + "loss": 0.7409, + "step": 20480 + }, + { + "epoch": 0.44013403789148087, + "grad_norm": 0.5455136550456627, + "learning_rate": 1.2044076780048e-05, + "loss": 0.7471, + "step": 20490 + }, + { + "epoch": 0.44034884220475146, + "grad_norm": 0.5299308092265677, + "learning_rate": 1.2037403721522095e-05, + "loss": 0.7577, + "step": 20500 + }, + { + "epoch": 0.4405636465180221, + "grad_norm": 0.5121296470133871, + "learning_rate": 1.2030729716320768e-05, + "loss": 0.747, + "step": 20510 + }, + { + "epoch": 0.4407784508312927, + "grad_norm": 0.5386378414121707, + "learning_rate": 1.2024054767545092e-05, + "loss": 0.7534, + "step": 20520 + }, + { + "epoch": 0.4409932551445633, + "grad_norm": 0.5433674923886992, + "learning_rate": 1.2017378878296562e-05, + "loss": 0.7503, + "step": 20530 + }, + { + "epoch": 0.4412080594578339, + "grad_norm": 0.5016141269307198, + "learning_rate": 1.2010702051677118e-05, + "loss": 0.7502, + "step": 20540 + }, + { + "epoch": 0.4414228637711045, + "grad_norm": 0.504046182876863, + "learning_rate": 1.2004024290789133e-05, + "loss": 0.7516, + "step": 20550 + }, + { + "epoch": 0.44163766808437516, + "grad_norm": 0.5226732086112348, + "learning_rate": 1.1997345598735418e-05, + "loss": 0.7494, + "step": 20560 + }, + { + "epoch": 0.44185247239764575, + "grad_norm": 0.5114919367543062, + "learning_rate": 1.1990665978619207e-05, + "loss": 0.7402, 
+ "step": 20570 + }, + { + "epoch": 0.44206727671091633, + "grad_norm": 0.5440210462668759, + "learning_rate": 1.1983985433544176e-05, + "loss": 0.7533, + "step": 20580 + }, + { + "epoch": 0.442282081024187, + "grad_norm": 0.4942048477137153, + "learning_rate": 1.1977303966614426e-05, + "loss": 0.7581, + "step": 20590 + }, + { + "epoch": 0.44249688533745757, + "grad_norm": 0.5320405941370843, + "learning_rate": 1.1970621580934487e-05, + "loss": 0.7624, + "step": 20600 + }, + { + "epoch": 0.4427116896507282, + "grad_norm": 0.5185257384736662, + "learning_rate": 1.1963938279609313e-05, + "loss": 0.7406, + "step": 20610 + }, + { + "epoch": 0.4429264939639988, + "grad_norm": 0.5263134282281284, + "learning_rate": 1.195725406574429e-05, + "loss": 0.7621, + "step": 20620 + }, + { + "epoch": 0.4431412982772694, + "grad_norm": 0.5089836853097602, + "learning_rate": 1.1950568942445225e-05, + "loss": 0.7457, + "step": 20630 + }, + { + "epoch": 0.44335610259054004, + "grad_norm": 0.5201727085179321, + "learning_rate": 1.1943882912818339e-05, + "loss": 0.7349, + "step": 20640 + }, + { + "epoch": 0.4435709069038106, + "grad_norm": 0.5595981699934405, + "learning_rate": 1.1937195979970289e-05, + "loss": 0.7588, + "step": 20650 + }, + { + "epoch": 0.4437857112170812, + "grad_norm": 0.5276981692403241, + "learning_rate": 1.193050814700814e-05, + "loss": 0.7736, + "step": 20660 + }, + { + "epoch": 0.44400051553035186, + "grad_norm": 0.5236268232985871, + "learning_rate": 1.1923819417039383e-05, + "loss": 0.7595, + "step": 20670 + }, + { + "epoch": 0.44421531984362245, + "grad_norm": 0.49905338520313625, + "learning_rate": 1.1917129793171924e-05, + "loss": 0.7413, + "step": 20680 + }, + { + "epoch": 0.4444301241568931, + "grad_norm": 0.5164080638755711, + "learning_rate": 1.1910439278514081e-05, + "loss": 0.752, + "step": 20690 + }, + { + "epoch": 0.4446449284701637, + "grad_norm": 0.49930489760055635, + "learning_rate": 1.1903747876174583e-05, + "loss": 0.7514, + "step": 20700 + }, + { + "epoch": 0.44485973278343427, + "grad_norm": 0.5300452626534115, + "learning_rate": 1.1897055589262583e-05, + "loss": 0.7396, + "step": 20710 + }, + { + "epoch": 0.4450745370967049, + "grad_norm": 0.5321479894953371, + "learning_rate": 1.1890362420887634e-05, + "loss": 0.7508, + "step": 20720 + }, + { + "epoch": 0.4452893414099755, + "grad_norm": 0.5308819147481129, + "learning_rate": 1.1883668374159705e-05, + "loss": 0.7538, + "step": 20730 + }, + { + "epoch": 0.44550414572324615, + "grad_norm": 0.5069966537674646, + "learning_rate": 1.1876973452189172e-05, + "loss": 0.7629, + "step": 20740 + }, + { + "epoch": 0.44571895003651674, + "grad_norm": 0.5585367021939927, + "learning_rate": 1.1870277658086813e-05, + "loss": 0.7432, + "step": 20750 + }, + { + "epoch": 0.4459337543497873, + "grad_norm": 0.5223674054406362, + "learning_rate": 1.1863580994963817e-05, + "loss": 0.7439, + "step": 20760 + }, + { + "epoch": 0.44614855866305797, + "grad_norm": 0.5141402859358744, + "learning_rate": 1.1856883465931772e-05, + "loss": 0.746, + "step": 20770 + }, + { + "epoch": 0.44636336297632856, + "grad_norm": 0.5461400382474692, + "learning_rate": 1.1850185074102675e-05, + "loss": 0.7681, + "step": 20780 + }, + { + "epoch": 0.44657816728959915, + "grad_norm": 0.5190276865079356, + "learning_rate": 1.1843485822588923e-05, + "loss": 0.746, + "step": 20790 + }, + { + "epoch": 0.4467929716028698, + "grad_norm": 0.5149818719484064, + "learning_rate": 1.18367857145033e-05, + "loss": 0.7511, + "step": 20800 + }, + { + "epoch": 
0.4470077759161404, + "grad_norm": 0.5038594272188172, + "learning_rate": 1.1830084752959002e-05, + "loss": 0.7601, + "step": 20810 + }, + { + "epoch": 0.447222580229411, + "grad_norm": 0.5057367274782396, + "learning_rate": 1.1823382941069618e-05, + "loss": 0.7504, + "step": 20820 + }, + { + "epoch": 0.4474373845426816, + "grad_norm": 0.5278200169568622, + "learning_rate": 1.1816680281949132e-05, + "loss": 0.734, + "step": 20830 + }, + { + "epoch": 0.4476521888559522, + "grad_norm": 0.4932889583978271, + "learning_rate": 1.1809976778711915e-05, + "loss": 0.7427, + "step": 20840 + }, + { + "epoch": 0.44786699316922285, + "grad_norm": 0.5254976142469129, + "learning_rate": 1.180327243447274e-05, + "loss": 0.7495, + "step": 20850 + }, + { + "epoch": 0.44808179748249344, + "grad_norm": 0.5258601024362219, + "learning_rate": 1.1796567252346766e-05, + "loss": 0.751, + "step": 20860 + }, + { + "epoch": 0.4482966017957641, + "grad_norm": 0.5361224414625771, + "learning_rate": 1.1789861235449542e-05, + "loss": 0.7354, + "step": 20870 + }, + { + "epoch": 0.44851140610903467, + "grad_norm": 0.508201235290661, + "learning_rate": 1.1783154386897008e-05, + "loss": 0.7453, + "step": 20880 + }, + { + "epoch": 0.44872621042230526, + "grad_norm": 0.5054092342839664, + "learning_rate": 1.1776446709805482e-05, + "loss": 0.7541, + "step": 20890 + }, + { + "epoch": 0.4489410147355759, + "grad_norm": 0.5178562956293307, + "learning_rate": 1.1769738207291674e-05, + "loss": 0.7564, + "step": 20900 + }, + { + "epoch": 0.4491558190488465, + "grad_norm": 0.5160643686715897, + "learning_rate": 1.1763028882472678e-05, + "loss": 0.7482, + "step": 20910 + }, + { + "epoch": 0.44937062336211714, + "grad_norm": 0.5200060584249153, + "learning_rate": 1.1756318738465963e-05, + "loss": 0.7432, + "step": 20920 + }, + { + "epoch": 0.4495854276753877, + "grad_norm": 0.5059948102109856, + "learning_rate": 1.1749607778389386e-05, + "loss": 0.7501, + "step": 20930 + }, + { + "epoch": 0.4498002319886583, + "grad_norm": 0.5186746353091075, + "learning_rate": 1.1742896005361186e-05, + "loss": 0.7333, + "step": 20940 + }, + { + "epoch": 0.45001503630192896, + "grad_norm": 0.5600841006515612, + "learning_rate": 1.173618342249997e-05, + "loss": 0.7478, + "step": 20950 + }, + { + "epoch": 0.45022984061519955, + "grad_norm": 0.5766313324253101, + "learning_rate": 1.1729470032924721e-05, + "loss": 0.7432, + "step": 20960 + }, + { + "epoch": 0.45044464492847014, + "grad_norm": 0.5166006082552747, + "learning_rate": 1.1722755839754807e-05, + "loss": 0.7473, + "step": 20970 + }, + { + "epoch": 0.4506594492417408, + "grad_norm": 0.49578934781253853, + "learning_rate": 1.1716040846109965e-05, + "loss": 0.7479, + "step": 20980 + }, + { + "epoch": 0.4508742535550114, + "grad_norm": 0.5016517079821076, + "learning_rate": 1.1709325055110296e-05, + "loss": 0.7349, + "step": 20990 + }, + { + "epoch": 0.451089057868282, + "grad_norm": 0.5210845740181136, + "learning_rate": 1.1702608469876288e-05, + "loss": 0.7338, + "step": 21000 + }, + { + "epoch": 0.4513038621815526, + "grad_norm": 0.5096278925929334, + "learning_rate": 1.1695891093528785e-05, + "loss": 0.7563, + "step": 21010 + }, + { + "epoch": 0.4515186664948232, + "grad_norm": 0.5086441029859898, + "learning_rate": 1.1689172929188997e-05, + "loss": 0.748, + "step": 21020 + }, + { + "epoch": 0.45173347080809384, + "grad_norm": 0.5387202509164648, + "learning_rate": 1.1682453979978507e-05, + "loss": 0.7562, + "step": 21030 + }, + { + "epoch": 0.45194827512136443, + "grad_norm": 0.5167975539045492, 
+ "learning_rate": 1.1675734249019264e-05, + "loss": 0.7459, + "step": 21040 + }, + { + "epoch": 0.4521630794346351, + "grad_norm": 0.5094712446059221, + "learning_rate": 1.1669013739433576e-05, + "loss": 0.7372, + "step": 21050 + }, + { + "epoch": 0.45237788374790566, + "grad_norm": 0.5467898326634167, + "learning_rate": 1.1662292454344116e-05, + "loss": 0.7407, + "step": 21060 + }, + { + "epoch": 0.45259268806117625, + "grad_norm": 0.4989509970894457, + "learning_rate": 1.1655570396873911e-05, + "loss": 0.7469, + "step": 21070 + }, + { + "epoch": 0.4528074923744469, + "grad_norm": 0.5500057146231173, + "learning_rate": 1.1648847570146353e-05, + "loss": 0.7385, + "step": 21080 + }, + { + "epoch": 0.4530222966877175, + "grad_norm": 0.5119213457731346, + "learning_rate": 1.1642123977285187e-05, + "loss": 0.7535, + "step": 21090 + }, + { + "epoch": 0.4532371010009881, + "grad_norm": 0.5347652595873134, + "learning_rate": 1.163539962141452e-05, + "loss": 0.7514, + "step": 21100 + }, + { + "epoch": 0.4534519053142587, + "grad_norm": 0.5181244057297967, + "learning_rate": 1.162867450565881e-05, + "loss": 0.7448, + "step": 21110 + }, + { + "epoch": 0.4536667096275293, + "grad_norm": 0.5097669753456439, + "learning_rate": 1.1621948633142863e-05, + "loss": 0.7512, + "step": 21120 + }, + { + "epoch": 0.45388151394079995, + "grad_norm": 0.521551942346145, + "learning_rate": 1.161522200699185e-05, + "loss": 0.7335, + "step": 21130 + }, + { + "epoch": 0.45409631825407054, + "grad_norm": 0.5183982050773465, + "learning_rate": 1.1608494630331278e-05, + "loss": 0.7389, + "step": 21140 + }, + { + "epoch": 0.45431112256734113, + "grad_norm": 0.5005973180005574, + "learning_rate": 1.1601766506287009e-05, + "loss": 0.7244, + "step": 21150 + }, + { + "epoch": 0.4545259268806118, + "grad_norm": 0.5510297098838621, + "learning_rate": 1.1595037637985254e-05, + "loss": 0.7363, + "step": 21160 + }, + { + "epoch": 0.45474073119388236, + "grad_norm": 0.5172681651878976, + "learning_rate": 1.1588308028552567e-05, + "loss": 0.7469, + "step": 21170 + }, + { + "epoch": 0.454955535507153, + "grad_norm": 0.5101036668126001, + "learning_rate": 1.1581577681115844e-05, + "loss": 0.7403, + "step": 21180 + }, + { + "epoch": 0.4551703398204236, + "grad_norm": 0.5128096662612829, + "learning_rate": 1.1574846598802331e-05, + "loss": 0.7546, + "step": 21190 + }, + { + "epoch": 0.4553851441336942, + "grad_norm": 0.500612279234514, + "learning_rate": 1.1568114784739612e-05, + "loss": 0.748, + "step": 21200 + }, + { + "epoch": 0.45559994844696483, + "grad_norm": 0.5376520924082702, + "learning_rate": 1.1561382242055607e-05, + "loss": 0.7583, + "step": 21210 + }, + { + "epoch": 0.4558147527602354, + "grad_norm": 0.536096912372685, + "learning_rate": 1.1554648973878582e-05, + "loss": 0.7516, + "step": 21220 + }, + { + "epoch": 0.456029557073506, + "grad_norm": 0.5174865347901014, + "learning_rate": 1.154791498333713e-05, + "loss": 0.7433, + "step": 21230 + }, + { + "epoch": 0.45624436138677665, + "grad_norm": 0.5344982319266313, + "learning_rate": 1.154118027356019e-05, + "loss": 0.7348, + "step": 21240 + }, + { + "epoch": 0.45645916570004724, + "grad_norm": 0.49211553724020624, + "learning_rate": 1.153444484767703e-05, + "loss": 0.7431, + "step": 21250 + }, + { + "epoch": 0.4566739700133179, + "grad_norm": 0.5309412807127188, + "learning_rate": 1.1527708708817255e-05, + "loss": 0.7592, + "step": 21260 + }, + { + "epoch": 0.4568887743265885, + "grad_norm": 0.5266934403871943, + "learning_rate": 1.1520971860110795e-05, + "loss": 0.7467, 
+ "step": 21270 + }, + { + "epoch": 0.45710357863985907, + "grad_norm": 0.5113799966183405, + "learning_rate": 1.1514234304687912e-05, + "loss": 0.7492, + "step": 21280 + }, + { + "epoch": 0.4573183829531297, + "grad_norm": 0.5265597304769074, + "learning_rate": 1.1507496045679196e-05, + "loss": 0.7577, + "step": 21290 + }, + { + "epoch": 0.4575331872664003, + "grad_norm": 0.5011207398210541, + "learning_rate": 1.1500757086215568e-05, + "loss": 0.7358, + "step": 21300 + }, + { + "epoch": 0.45774799157967094, + "grad_norm": 0.5197831944252082, + "learning_rate": 1.1494017429428271e-05, + "loss": 0.7402, + "step": 21310 + }, + { + "epoch": 0.45796279589294153, + "grad_norm": 0.5051983633873826, + "learning_rate": 1.1487277078448873e-05, + "loss": 0.7386, + "step": 21320 + }, + { + "epoch": 0.4581776002062121, + "grad_norm": 0.5177898532161163, + "learning_rate": 1.1480536036409262e-05, + "loss": 0.74, + "step": 21330 + }, + { + "epoch": 0.45839240451948277, + "grad_norm": 0.5150685367159163, + "learning_rate": 1.1473794306441652e-05, + "loss": 0.7379, + "step": 21340 + }, + { + "epoch": 0.45860720883275335, + "grad_norm": 0.5074160095947531, + "learning_rate": 1.1467051891678568e-05, + "loss": 0.7314, + "step": 21350 + }, + { + "epoch": 0.458822013146024, + "grad_norm": 0.5145616285881711, + "learning_rate": 1.1460308795252867e-05, + "loss": 0.7399, + "step": 21360 + }, + { + "epoch": 0.4590368174592946, + "grad_norm": 0.5160410203461876, + "learning_rate": 1.145356502029771e-05, + "loss": 0.7495, + "step": 21370 + }, + { + "epoch": 0.4592516217725652, + "grad_norm": 0.5315723943002935, + "learning_rate": 1.1446820569946581e-05, + "loss": 0.75, + "step": 21380 + }, + { + "epoch": 0.4594664260858358, + "grad_norm": 0.5106116756563384, + "learning_rate": 1.1440075447333274e-05, + "loss": 0.7417, + "step": 21390 + }, + { + "epoch": 0.4596812303991064, + "grad_norm": 0.5122130397254849, + "learning_rate": 1.1433329655591894e-05, + "loss": 0.7449, + "step": 21400 + }, + { + "epoch": 0.459896034712377, + "grad_norm": 0.5054378223152889, + "learning_rate": 1.1426583197856858e-05, + "loss": 0.7418, + "step": 21410 + }, + { + "epoch": 0.46011083902564764, + "grad_norm": 0.5244487700127819, + "learning_rate": 1.1419836077262899e-05, + "loss": 0.7419, + "step": 21420 + }, + { + "epoch": 0.46032564333891823, + "grad_norm": 0.5252805536342786, + "learning_rate": 1.1413088296945048e-05, + "loss": 0.7474, + "step": 21430 + }, + { + "epoch": 0.4605404476521889, + "grad_norm": 0.5098516964749148, + "learning_rate": 1.1406339860038648e-05, + "loss": 0.7566, + "step": 21440 + }, + { + "epoch": 0.46075525196545947, + "grad_norm": 0.53055505223306, + "learning_rate": 1.139959076967935e-05, + "loss": 0.7448, + "step": 21450 + }, + { + "epoch": 0.46097005627873006, + "grad_norm": 0.5135560710510545, + "learning_rate": 1.1392841029003102e-05, + "loss": 0.74, + "step": 21460 + }, + { + "epoch": 0.4611848605920007, + "grad_norm": 0.5214337220451569, + "learning_rate": 1.1386090641146152e-05, + "loss": 0.745, + "step": 21470 + }, + { + "epoch": 0.4613996649052713, + "grad_norm": 0.5375224268150335, + "learning_rate": 1.137933960924506e-05, + "loss": 0.7442, + "step": 21480 + }, + { + "epoch": 0.46161446921854193, + "grad_norm": 0.5139666564481621, + "learning_rate": 1.1372587936436683e-05, + "loss": 0.7419, + "step": 21490 + }, + { + "epoch": 0.4618292735318125, + "grad_norm": 0.5559740898339312, + "learning_rate": 1.1365835625858162e-05, + "loss": 0.7544, + "step": 21500 + }, + { + "epoch": 0.4620440778450831, + 
"grad_norm": 0.49830043641658905, + "learning_rate": 1.1359082680646952e-05, + "loss": 0.7341, + "step": 21510 + }, + { + "epoch": 0.46225888215835376, + "grad_norm": 0.5125992039187067, + "learning_rate": 1.1352329103940788e-05, + "loss": 0.7559, + "step": 21520 + }, + { + "epoch": 0.46247368647162435, + "grad_norm": 0.5194082716862066, + "learning_rate": 1.1345574898877707e-05, + "loss": 0.7439, + "step": 21530 + }, + { + "epoch": 0.46268849078489493, + "grad_norm": 0.5147396997044464, + "learning_rate": 1.1338820068596044e-05, + "loss": 0.7511, + "step": 21540 + }, + { + "epoch": 0.4629032950981656, + "grad_norm": 0.5392671660722685, + "learning_rate": 1.1332064616234407e-05, + "loss": 0.7653, + "step": 21550 + }, + { + "epoch": 0.46311809941143617, + "grad_norm": 0.5201496726087629, + "learning_rate": 1.1325308544931706e-05, + "loss": 0.7381, + "step": 21560 + }, + { + "epoch": 0.4633329037247068, + "grad_norm": 0.5180175465707748, + "learning_rate": 1.131855185782714e-05, + "loss": 0.7391, + "step": 21570 + }, + { + "epoch": 0.4635477080379774, + "grad_norm": 0.5336962808784633, + "learning_rate": 1.1311794558060186e-05, + "loss": 0.7469, + "step": 21580 + }, + { + "epoch": 0.463762512351248, + "grad_norm": 0.49650035979570856, + "learning_rate": 1.1305036648770608e-05, + "loss": 0.732, + "step": 21590 + }, + { + "epoch": 0.46397731666451864, + "grad_norm": 0.5281758487503815, + "learning_rate": 1.129827813309846e-05, + "loss": 0.7528, + "step": 21600 + }, + { + "epoch": 0.4641921209777892, + "grad_norm": 0.5234542589841056, + "learning_rate": 1.1291519014184062e-05, + "loss": 0.7247, + "step": 21610 + }, + { + "epoch": 0.46440692529105987, + "grad_norm": 0.517114689702638, + "learning_rate": 1.1284759295168035e-05, + "loss": 0.7511, + "step": 21620 + }, + { + "epoch": 0.46462172960433046, + "grad_norm": 0.5054831713408943, + "learning_rate": 1.1277998979191262e-05, + "loss": 0.7348, + "step": 21630 + }, + { + "epoch": 0.46483653391760105, + "grad_norm": 0.5169749307688332, + "learning_rate": 1.1271238069394916e-05, + "loss": 0.7517, + "step": 21640 + }, + { + "epoch": 0.4650513382308717, + "grad_norm": 0.5287808498540633, + "learning_rate": 1.1264476568920434e-05, + "loss": 0.7437, + "step": 21650 + }, + { + "epoch": 0.4652661425441423, + "grad_norm": 0.5133827362263325, + "learning_rate": 1.1257714480909538e-05, + "loss": 0.7256, + "step": 21660 + }, + { + "epoch": 0.46548094685741287, + "grad_norm": 0.4986667333468926, + "learning_rate": 1.125095180850421e-05, + "loss": 0.733, + "step": 21670 + }, + { + "epoch": 0.4656957511706835, + "grad_norm": 0.5041449057994531, + "learning_rate": 1.1244188554846722e-05, + "loss": 0.739, + "step": 21680 + }, + { + "epoch": 0.4659105554839541, + "grad_norm": 0.5091046285922776, + "learning_rate": 1.1237424723079597e-05, + "loss": 0.7352, + "step": 21690 + }, + { + "epoch": 0.46612535979722475, + "grad_norm": 0.5135885820320195, + "learning_rate": 1.1230660316345643e-05, + "loss": 0.7329, + "step": 21700 + }, + { + "epoch": 0.46634016411049534, + "grad_norm": 0.5200190247573033, + "learning_rate": 1.1223895337787924e-05, + "loss": 0.743, + "step": 21710 + }, + { + "epoch": 0.4665549684237659, + "grad_norm": 0.5353216172156325, + "learning_rate": 1.121712979054977e-05, + "loss": 0.7483, + "step": 21720 + }, + { + "epoch": 0.46676977273703657, + "grad_norm": 0.5082835412156764, + "learning_rate": 1.1210363677774782e-05, + "loss": 0.7464, + "step": 21730 + }, + { + "epoch": 0.46698457705030716, + "grad_norm": 0.5280559944508908, + "learning_rate": 
1.1203597002606821e-05, + "loss": 0.7537, + "step": 21740 + }, + { + "epoch": 0.4671993813635778, + "grad_norm": 0.5260182340412108, + "learning_rate": 1.1196829768190008e-05, + "loss": 0.7474, + "step": 21750 + }, + { + "epoch": 0.4674141856768484, + "grad_norm": 0.5318302840617868, + "learning_rate": 1.1190061977668723e-05, + "loss": 0.752, + "step": 21760 + }, + { + "epoch": 0.467628989990119, + "grad_norm": 0.5202925069615598, + "learning_rate": 1.1183293634187609e-05, + "loss": 0.7395, + "step": 21770 + }, + { + "epoch": 0.4678437943033896, + "grad_norm": 0.5420558362059565, + "learning_rate": 1.1176524740891558e-05, + "loss": 0.746, + "step": 21780 + }, + { + "epoch": 0.4680585986166602, + "grad_norm": 0.5209989958223947, + "learning_rate": 1.1169755300925723e-05, + "loss": 0.7528, + "step": 21790 + }, + { + "epoch": 0.46827340292993086, + "grad_norm": 0.5323335344870374, + "learning_rate": 1.1162985317435514e-05, + "loss": 0.7419, + "step": 21800 + }, + { + "epoch": 0.46848820724320145, + "grad_norm": 0.5268811768109316, + "learning_rate": 1.1156214793566591e-05, + "loss": 0.7413, + "step": 21810 + }, + { + "epoch": 0.46870301155647204, + "grad_norm": 0.5526549348147461, + "learning_rate": 1.1149443732464858e-05, + "loss": 0.7423, + "step": 21820 + }, + { + "epoch": 0.4689178158697427, + "grad_norm": 0.5428671229053159, + "learning_rate": 1.1142672137276478e-05, + "loss": 0.7478, + "step": 21830 + }, + { + "epoch": 0.46913262018301327, + "grad_norm": 0.513399124764161, + "learning_rate": 1.1135900011147858e-05, + "loss": 0.7479, + "step": 21840 + }, + { + "epoch": 0.46934742449628386, + "grad_norm": 0.536835542346075, + "learning_rate": 1.1129127357225648e-05, + "loss": 0.7415, + "step": 21850 + }, + { + "epoch": 0.4695622288095545, + "grad_norm": 0.517525876955086, + "learning_rate": 1.1122354178656756e-05, + "loss": 0.7454, + "step": 21860 + }, + { + "epoch": 0.4697770331228251, + "grad_norm": 0.5016054637504687, + "learning_rate": 1.111558047858832e-05, + "loss": 0.7571, + "step": 21870 + }, + { + "epoch": 0.46999183743609574, + "grad_norm": 0.5114598800325706, + "learning_rate": 1.1108806260167727e-05, + "loss": 0.728, + "step": 21880 + }, + { + "epoch": 0.4702066417493663, + "grad_norm": 0.5092042135062025, + "learning_rate": 1.1102031526542605e-05, + "loss": 0.7356, + "step": 21890 + }, + { + "epoch": 0.4704214460626369, + "grad_norm": 0.5223633189785332, + "learning_rate": 1.109525628086082e-05, + "loss": 0.7477, + "step": 21900 + }, + { + "epoch": 0.47063625037590756, + "grad_norm": 0.5137541232678928, + "learning_rate": 1.1088480526270472e-05, + "loss": 0.7384, + "step": 21910 + }, + { + "epoch": 0.47085105468917815, + "grad_norm": 0.5268764811281718, + "learning_rate": 1.1081704265919904e-05, + "loss": 0.7434, + "step": 21920 + }, + { + "epoch": 0.4710658590024488, + "grad_norm": 0.5214178282194654, + "learning_rate": 1.1074927502957688e-05, + "loss": 0.747, + "step": 21930 + }, + { + "epoch": 0.4712806633157194, + "grad_norm": 0.5150008031537309, + "learning_rate": 1.1068150240532637e-05, + "loss": 0.7427, + "step": 21940 + }, + { + "epoch": 0.47149546762899, + "grad_norm": 0.5155181176356146, + "learning_rate": 1.1061372481793793e-05, + "loss": 0.735, + "step": 21950 + }, + { + "epoch": 0.4717102719422606, + "grad_norm": 0.528603646425843, + "learning_rate": 1.1054594229890425e-05, + "loss": 0.7206, + "step": 21960 + }, + { + "epoch": 0.4719250762555312, + "grad_norm": 0.5119688474719511, + "learning_rate": 1.1047815487972034e-05, + "loss": 0.7467, + "step": 21970 + }, + 
{ + "epoch": 0.4721398805688018, + "grad_norm": 0.5252997267209152, + "learning_rate": 1.104103625918835e-05, + "loss": 0.7466, + "step": 21980 + }, + { + "epoch": 0.47235468488207244, + "grad_norm": 0.5257720369841751, + "learning_rate": 1.1034256546689321e-05, + "loss": 0.7479, + "step": 21990 + }, + { + "epoch": 0.47256948919534303, + "grad_norm": 0.536029357234098, + "learning_rate": 1.1027476353625132e-05, + "loss": 0.7447, + "step": 22000 + }, + { + "epoch": 0.4727842935086137, + "grad_norm": 0.5050633427695835, + "learning_rate": 1.1020695683146188e-05, + "loss": 0.7504, + "step": 22010 + }, + { + "epoch": 0.47299909782188426, + "grad_norm": 0.514261629466442, + "learning_rate": 1.101391453840311e-05, + "loss": 0.7397, + "step": 22020 + }, + { + "epoch": 0.47321390213515485, + "grad_norm": 0.5198813281021253, + "learning_rate": 1.1007132922546743e-05, + "loss": 0.7506, + "step": 22030 + }, + { + "epoch": 0.4734287064484255, + "grad_norm": 0.5181411136523617, + "learning_rate": 1.100035083872815e-05, + "loss": 0.746, + "step": 22040 + }, + { + "epoch": 0.4736435107616961, + "grad_norm": 0.5147323905400899, + "learning_rate": 1.0993568290098615e-05, + "loss": 0.7356, + "step": 22050 + }, + { + "epoch": 0.47385831507496673, + "grad_norm": 0.526477837321041, + "learning_rate": 1.0986785279809631e-05, + "loss": 0.7412, + "step": 22060 + }, + { + "epoch": 0.4740731193882373, + "grad_norm": 0.5313979721453921, + "learning_rate": 1.0980001811012915e-05, + "loss": 0.7336, + "step": 22070 + }, + { + "epoch": 0.4742879237015079, + "grad_norm": 0.5170598479629712, + "learning_rate": 1.0973217886860387e-05, + "loss": 0.7238, + "step": 22080 + }, + { + "epoch": 0.47450272801477855, + "grad_norm": 0.5221293133998406, + "learning_rate": 1.0966433510504188e-05, + "loss": 0.7344, + "step": 22090 + }, + { + "epoch": 0.47471753232804914, + "grad_norm": 0.5190509203550013, + "learning_rate": 1.0959648685096657e-05, + "loss": 0.7536, + "step": 22100 + }, + { + "epoch": 0.47493233664131973, + "grad_norm": 0.5103636926313977, + "learning_rate": 1.0952863413790355e-05, + "loss": 0.7493, + "step": 22110 + }, + { + "epoch": 0.4751471409545904, + "grad_norm": 0.5017701329508603, + "learning_rate": 1.0946077699738045e-05, + "loss": 0.7218, + "step": 22120 + }, + { + "epoch": 0.47536194526786096, + "grad_norm": 0.5334534885238904, + "learning_rate": 1.093929154609269e-05, + "loss": 0.7355, + "step": 22130 + }, + { + "epoch": 0.4755767495811316, + "grad_norm": 0.5347297096552461, + "learning_rate": 1.0932504956007468e-05, + "loss": 0.7414, + "step": 22140 + }, + { + "epoch": 0.4757915538944022, + "grad_norm": 0.5361415400554347, + "learning_rate": 1.0925717932635751e-05, + "loss": 0.7429, + "step": 22150 + }, + { + "epoch": 0.4760063582076728, + "grad_norm": 0.5276150896460355, + "learning_rate": 1.0918930479131114e-05, + "loss": 0.7343, + "step": 22160 + }, + { + "epoch": 0.47622116252094343, + "grad_norm": 0.5242096273960569, + "learning_rate": 1.0912142598647332e-05, + "loss": 0.7352, + "step": 22170 + }, + { + "epoch": 0.476435966834214, + "grad_norm": 0.5124773180856481, + "learning_rate": 1.0905354294338384e-05, + "loss": 0.7458, + "step": 22180 + }, + { + "epoch": 0.47665077114748466, + "grad_norm": 0.5009918414284297, + "learning_rate": 1.089856556935844e-05, + "loss": 0.7338, + "step": 22190 + }, + { + "epoch": 0.47686557546075525, + "grad_norm": 0.5108245418067267, + "learning_rate": 1.0891776426861868e-05, + "loss": 0.7421, + "step": 22200 + }, + { + "epoch": 0.47708037977402584, + "grad_norm": 
0.509138834615506, + "learning_rate": 1.0884986870003229e-05, + "loss": 0.7431, + "step": 22210 + }, + { + "epoch": 0.4772951840872965, + "grad_norm": 0.5272156276514611, + "learning_rate": 1.0878196901937272e-05, + "loss": 0.7522, + "step": 22220 + }, + { + "epoch": 0.4775099884005671, + "grad_norm": 0.5053807372710165, + "learning_rate": 1.0871406525818947e-05, + "loss": 0.735, + "step": 22230 + }, + { + "epoch": 0.4777247927138377, + "grad_norm": 0.5166733531977794, + "learning_rate": 1.0864615744803382e-05, + "loss": 0.7278, + "step": 22240 + }, + { + "epoch": 0.4779395970271083, + "grad_norm": 0.5117417119746009, + "learning_rate": 1.0857824562045907e-05, + "loss": 0.7521, + "step": 22250 + }, + { + "epoch": 0.4781544013403789, + "grad_norm": 0.5256764124583501, + "learning_rate": 1.0851032980702025e-05, + "loss": 0.7392, + "step": 22260 + }, + { + "epoch": 0.47836920565364954, + "grad_norm": 0.5169468372261654, + "learning_rate": 1.0844241003927433e-05, + "loss": 0.7442, + "step": 22270 + }, + { + "epoch": 0.47858400996692013, + "grad_norm": 0.5075813885039875, + "learning_rate": 1.0837448634878011e-05, + "loss": 0.7498, + "step": 22280 + }, + { + "epoch": 0.4787988142801907, + "grad_norm": 0.49867951346910183, + "learning_rate": 1.0830655876709817e-05, + "loss": 0.7405, + "step": 22290 + }, + { + "epoch": 0.47901361859346137, + "grad_norm": 0.549353416061091, + "learning_rate": 1.0823862732579088e-05, + "loss": 0.7356, + "step": 22300 + }, + { + "epoch": 0.47922842290673195, + "grad_norm": 0.5233565980832593, + "learning_rate": 1.081706920564225e-05, + "loss": 0.7393, + "step": 22310 + }, + { + "epoch": 0.4794432272200026, + "grad_norm": 0.5166833901430437, + "learning_rate": 1.0810275299055899e-05, + "loss": 0.7399, + "step": 22320 + }, + { + "epoch": 0.4796580315332732, + "grad_norm": 0.5156376883464526, + "learning_rate": 1.0803481015976809e-05, + "loss": 0.7356, + "step": 22330 + }, + { + "epoch": 0.4798728358465438, + "grad_norm": 0.5361312416973215, + "learning_rate": 1.0796686359561931e-05, + "loss": 0.7575, + "step": 22340 + }, + { + "epoch": 0.4800876401598144, + "grad_norm": 0.52020980112163, + "learning_rate": 1.0789891332968387e-05, + "loss": 0.7236, + "step": 22350 + }, + { + "epoch": 0.480302444473085, + "grad_norm": 0.5250129443878427, + "learning_rate": 1.0783095939353474e-05, + "loss": 0.7297, + "step": 22360 + }, + { + "epoch": 0.48051724878635566, + "grad_norm": 0.5134420144953245, + "learning_rate": 1.0776300181874654e-05, + "loss": 0.7355, + "step": 22370 + }, + { + "epoch": 0.48073205309962624, + "grad_norm": 0.5291422486339019, + "learning_rate": 1.0769504063689564e-05, + "loss": 0.7422, + "step": 22380 + }, + { + "epoch": 0.48094685741289683, + "grad_norm": 0.5115283701598856, + "learning_rate": 1.0762707587956005e-05, + "loss": 0.7521, + "step": 22390 + }, + { + "epoch": 0.4811616617261675, + "grad_norm": 0.5148143798916653, + "learning_rate": 1.0755910757831949e-05, + "loss": 0.7351, + "step": 22400 + }, + { + "epoch": 0.48137646603943807, + "grad_norm": 0.5291173579659362, + "learning_rate": 1.0749113576475525e-05, + "loss": 0.7431, + "step": 22410 + }, + { + "epoch": 0.48159127035270866, + "grad_norm": 0.5391435806265806, + "learning_rate": 1.0742316047045029e-05, + "loss": 0.7435, + "step": 22420 + }, + { + "epoch": 0.4818060746659793, + "grad_norm": 0.5092459862659836, + "learning_rate": 1.0735518172698922e-05, + "loss": 0.7368, + "step": 22430 + }, + { + "epoch": 0.4820208789792499, + "grad_norm": 0.5267935560236964, + "learning_rate": 
1.0728719956595818e-05, + "loss": 0.7469, + "step": 22440 + }, + { + "epoch": 0.48223568329252053, + "grad_norm": 0.5176688281994023, + "learning_rate": 1.0721921401894498e-05, + "loss": 0.7501, + "step": 22450 + }, + { + "epoch": 0.4824504876057911, + "grad_norm": 0.5137388029359484, + "learning_rate": 1.0715122511753897e-05, + "loss": 0.7425, + "step": 22460 + }, + { + "epoch": 0.4826652919190617, + "grad_norm": 0.5159930397277978, + "learning_rate": 1.0708323289333102e-05, + "loss": 0.7635, + "step": 22470 + }, + { + "epoch": 0.48288009623233236, + "grad_norm": 0.5256807433908982, + "learning_rate": 1.0701523737791359e-05, + "loss": 0.7319, + "step": 22480 + }, + { + "epoch": 0.48309490054560295, + "grad_norm": 0.5210214722370281, + "learning_rate": 1.0694723860288063e-05, + "loss": 0.7492, + "step": 22490 + }, + { + "epoch": 0.4833097048588736, + "grad_norm": 0.5231744475769452, + "learning_rate": 1.0687923659982766e-05, + "loss": 0.735, + "step": 22500 + }, + { + "epoch": 0.4835245091721442, + "grad_norm": 0.5011667972879604, + "learning_rate": 1.068112314003517e-05, + "loss": 0.7355, + "step": 22510 + }, + { + "epoch": 0.48373931348541477, + "grad_norm": 0.5265184060707572, + "learning_rate": 1.0674322303605115e-05, + "loss": 0.7395, + "step": 22520 + }, + { + "epoch": 0.4839541177986854, + "grad_norm": 0.5125129868588316, + "learning_rate": 1.0667521153852603e-05, + "loss": 0.7268, + "step": 22530 + }, + { + "epoch": 0.484168922111956, + "grad_norm": 0.5220283396876785, + "learning_rate": 1.0660719693937766e-05, + "loss": 0.7373, + "step": 22540 + }, + { + "epoch": 0.48438372642522665, + "grad_norm": 0.5091857205706148, + "learning_rate": 1.0653917927020894e-05, + "loss": 0.7402, + "step": 22550 + }, + { + "epoch": 0.48459853073849724, + "grad_norm": 0.5107328961554215, + "learning_rate": 1.0647115856262413e-05, + "loss": 0.7306, + "step": 22560 + }, + { + "epoch": 0.4848133350517678, + "grad_norm": 0.5133105974860892, + "learning_rate": 1.0640313484822893e-05, + "loss": 0.7324, + "step": 22570 + }, + { + "epoch": 0.48502813936503847, + "grad_norm": 0.529832381078322, + "learning_rate": 1.0633510815863036e-05, + "loss": 0.744, + "step": 22580 + }, + { + "epoch": 0.48524294367830906, + "grad_norm": 0.5223357765212729, + "learning_rate": 1.0626707852543695e-05, + "loss": 0.7363, + "step": 22590 + }, + { + "epoch": 0.48545774799157965, + "grad_norm": 0.5300543063286008, + "learning_rate": 1.0619904598025846e-05, + "loss": 0.7604, + "step": 22600 + }, + { + "epoch": 0.4856725523048503, + "grad_norm": 0.5379619059578166, + "learning_rate": 1.0613101055470612e-05, + "loss": 0.7536, + "step": 22610 + }, + { + "epoch": 0.4858873566181209, + "grad_norm": 0.5133418467154645, + "learning_rate": 1.0606297228039244e-05, + "loss": 0.7463, + "step": 22620 + }, + { + "epoch": 0.4861021609313915, + "grad_norm": 0.506231778810294, + "learning_rate": 1.0599493118893122e-05, + "loss": 0.7364, + "step": 22630 + }, + { + "epoch": 0.4863169652446621, + "grad_norm": 0.5019381915489841, + "learning_rate": 1.0592688731193768e-05, + "loss": 0.7333, + "step": 22640 + }, + { + "epoch": 0.4865317695579327, + "grad_norm": 0.5133937862155273, + "learning_rate": 1.0585884068102824e-05, + "loss": 0.7392, + "step": 22650 + }, + { + "epoch": 0.48674657387120335, + "grad_norm": 0.5225027361367924, + "learning_rate": 1.0579079132782061e-05, + "loss": 0.7376, + "step": 22660 + }, + { + "epoch": 0.48696137818447394, + "grad_norm": 0.5266639720499878, + "learning_rate": 1.0572273928393379e-05, + "loss": 0.7254, + "step": 
22670 + }, + { + "epoch": 0.4871761824977446, + "grad_norm": 0.5509818361008318, + "learning_rate": 1.0565468458098806e-05, + "loss": 0.7393, + "step": 22680 + }, + { + "epoch": 0.48739098681101517, + "grad_norm": 0.5237573163610023, + "learning_rate": 1.0558662725060483e-05, + "loss": 0.7359, + "step": 22690 + }, + { + "epoch": 0.48760579112428576, + "grad_norm": 0.5107223127670096, + "learning_rate": 1.0551856732440681e-05, + "loss": 0.7585, + "step": 22700 + }, + { + "epoch": 0.4878205954375564, + "grad_norm": 0.5059159746679568, + "learning_rate": 1.0545050483401793e-05, + "loss": 0.7401, + "step": 22710 + }, + { + "epoch": 0.488035399750827, + "grad_norm": 0.5200289327198926, + "learning_rate": 1.0538243981106331e-05, + "loss": 0.7498, + "step": 22720 + }, + { + "epoch": 0.4882502040640976, + "grad_norm": 0.5189637119559256, + "learning_rate": 1.0531437228716914e-05, + "loss": 0.7482, + "step": 22730 + }, + { + "epoch": 0.4884650083773682, + "grad_norm": 0.49679955732243036, + "learning_rate": 1.0524630229396295e-05, + "loss": 0.7446, + "step": 22740 + }, + { + "epoch": 0.4886798126906388, + "grad_norm": 0.5207941387326519, + "learning_rate": 1.0517822986307325e-05, + "loss": 0.7624, + "step": 22750 + }, + { + "epoch": 0.48889461700390946, + "grad_norm": 0.5286403629591689, + "learning_rate": 1.0511015502612975e-05, + "loss": 0.7414, + "step": 22760 + }, + { + "epoch": 0.48910942131718005, + "grad_norm": 0.5120125001043839, + "learning_rate": 1.0504207781476334e-05, + "loss": 0.7366, + "step": 22770 + }, + { + "epoch": 0.48932422563045064, + "grad_norm": 0.5179656319122231, + "learning_rate": 1.0497399826060596e-05, + "loss": 0.7418, + "step": 22780 + }, + { + "epoch": 0.4895390299437213, + "grad_norm": 0.5366945858487391, + "learning_rate": 1.0490591639529055e-05, + "loss": 0.7357, + "step": 22790 + }, + { + "epoch": 0.48975383425699187, + "grad_norm": 0.5154119510735722, + "learning_rate": 1.0483783225045126e-05, + "loss": 0.7219, + "step": 22800 + }, + { + "epoch": 0.4899686385702625, + "grad_norm": 0.518597726407149, + "learning_rate": 1.0476974585772323e-05, + "loss": 0.741, + "step": 22810 + }, + { + "epoch": 0.4901834428835331, + "grad_norm": 0.5140931629957429, + "learning_rate": 1.047016572487427e-05, + "loss": 0.7578, + "step": 22820 + }, + { + "epoch": 0.4903982471968037, + "grad_norm": 0.5104752969998968, + "learning_rate": 1.0463356645514687e-05, + "loss": 0.7416, + "step": 22830 + }, + { + "epoch": 0.49061305151007434, + "grad_norm": 0.5313548540682782, + "learning_rate": 1.0456547350857397e-05, + "loss": 0.7472, + "step": 22840 + }, + { + "epoch": 0.4908278558233449, + "grad_norm": 0.5104717883570301, + "learning_rate": 1.0449737844066332e-05, + "loss": 0.7415, + "step": 22850 + }, + { + "epoch": 0.4910426601366155, + "grad_norm": 0.5321738566400499, + "learning_rate": 1.0442928128305504e-05, + "loss": 0.7307, + "step": 22860 + }, + { + "epoch": 0.49125746444988616, + "grad_norm": 0.5180000313964818, + "learning_rate": 1.0436118206739044e-05, + "loss": 0.7319, + "step": 22870 + }, + { + "epoch": 0.49147226876315675, + "grad_norm": 0.5269259655740001, + "learning_rate": 1.0429308082531157e-05, + "loss": 0.7297, + "step": 22880 + }, + { + "epoch": 0.4916870730764274, + "grad_norm": 0.5162542621179522, + "learning_rate": 1.0422497758846166e-05, + "loss": 0.7303, + "step": 22890 + }, + { + "epoch": 0.491901877389698, + "grad_norm": 0.9082353493403971, + "learning_rate": 1.0415687238848465e-05, + "loss": 0.7393, + "step": 22900 + }, + { + "epoch": 0.4921166817029686, + 
"grad_norm": 0.5249523721707448, + "learning_rate": 1.040887652570255e-05, + "loss": 0.7359, + "step": 22910 + }, + { + "epoch": 0.4923314860162392, + "grad_norm": 0.5310314571240732, + "learning_rate": 1.0402065622573003e-05, + "loss": 0.7473, + "step": 22920 + }, + { + "epoch": 0.4925462903295098, + "grad_norm": 0.5152700502053869, + "learning_rate": 1.03952545326245e-05, + "loss": 0.7386, + "step": 22930 + }, + { + "epoch": 0.49276109464278045, + "grad_norm": 0.5137486693952514, + "learning_rate": 1.0388443259021794e-05, + "loss": 0.7416, + "step": 22940 + }, + { + "epoch": 0.49297589895605104, + "grad_norm": 0.5147003203872301, + "learning_rate": 1.0381631804929737e-05, + "loss": 0.7558, + "step": 22950 + }, + { + "epoch": 0.49319070326932163, + "grad_norm": 0.5150451603396607, + "learning_rate": 1.0374820173513252e-05, + "loss": 0.7323, + "step": 22960 + }, + { + "epoch": 0.4934055075825923, + "grad_norm": 0.4998638447851373, + "learning_rate": 1.0368008367937348e-05, + "loss": 0.7394, + "step": 22970 + }, + { + "epoch": 0.49362031189586286, + "grad_norm": 0.5065272447150604, + "learning_rate": 1.036119639136712e-05, + "loss": 0.7547, + "step": 22980 + }, + { + "epoch": 0.4938351162091335, + "grad_norm": 0.5106286499648831, + "learning_rate": 1.035438424696774e-05, + "loss": 0.7497, + "step": 22990 + }, + { + "epoch": 0.4940499205224041, + "grad_norm": 0.5101651671094659, + "learning_rate": 1.0347571937904452e-05, + "loss": 0.7363, + "step": 23000 + }, + { + "epoch": 0.4942647248356747, + "grad_norm": 0.5243897522248346, + "learning_rate": 1.0340759467342582e-05, + "loss": 0.7444, + "step": 23010 + }, + { + "epoch": 0.49447952914894533, + "grad_norm": 0.5402515262236681, + "learning_rate": 1.0333946838447533e-05, + "loss": 0.7315, + "step": 23020 + }, + { + "epoch": 0.4946943334622159, + "grad_norm": 0.5051561251334392, + "learning_rate": 1.0327134054384778e-05, + "loss": 0.7332, + "step": 23030 + }, + { + "epoch": 0.4949091377754865, + "grad_norm": 0.5685316863277247, + "learning_rate": 1.0320321118319865e-05, + "loss": 0.7501, + "step": 23040 + }, + { + "epoch": 0.49512394208875715, + "grad_norm": 0.5057771401661005, + "learning_rate": 1.0313508033418405e-05, + "loss": 0.7315, + "step": 23050 + }, + { + "epoch": 0.49533874640202774, + "grad_norm": 0.5152345284853551, + "learning_rate": 1.0306694802846089e-05, + "loss": 0.7327, + "step": 23060 + }, + { + "epoch": 0.4955535507152984, + "grad_norm": 0.5021280830392914, + "learning_rate": 1.0299881429768669e-05, + "loss": 0.7376, + "step": 23070 + }, + { + "epoch": 0.495768355028569, + "grad_norm": 0.5249378964815303, + "learning_rate": 1.029306791735196e-05, + "loss": 0.7422, + "step": 23080 + }, + { + "epoch": 0.49598315934183956, + "grad_norm": 0.5032927828015555, + "learning_rate": 1.0286254268761853e-05, + "loss": 0.7412, + "step": 23090 + }, + { + "epoch": 0.4961979636551102, + "grad_norm": 0.5120900752083057, + "learning_rate": 1.0279440487164296e-05, + "loss": 0.7411, + "step": 23100 + }, + { + "epoch": 0.4964127679683808, + "grad_norm": 0.5194032778917269, + "learning_rate": 1.0272626575725296e-05, + "loss": 0.7422, + "step": 23110 + }, + { + "epoch": 0.49662757228165144, + "grad_norm": 0.5051279599131995, + "learning_rate": 1.0265812537610918e-05, + "loss": 0.734, + "step": 23120 + }, + { + "epoch": 0.49684237659492203, + "grad_norm": 0.524066136680626, + "learning_rate": 1.0258998375987297e-05, + "loss": 0.7471, + "step": 23130 + }, + { + "epoch": 0.4970571809081926, + "grad_norm": 0.5212407578128498, + "learning_rate": 
1.025218409402062e-05, + "loss": 0.7234, + "step": 23140 + }, + { + "epoch": 0.49727198522146326, + "grad_norm": 0.5294461859743091, + "learning_rate": 1.0245369694877121e-05, + "loss": 0.7226, + "step": 23150 + }, + { + "epoch": 0.49748678953473385, + "grad_norm": 0.5223634925273634, + "learning_rate": 1.0238555181723108e-05, + "loss": 0.7453, + "step": 23160 + }, + { + "epoch": 0.49770159384800444, + "grad_norm": 0.5233047547217023, + "learning_rate": 1.0231740557724922e-05, + "loss": 0.7343, + "step": 23170 + }, + { + "epoch": 0.4979163981612751, + "grad_norm": 0.507862147559393, + "learning_rate": 1.0224925826048966e-05, + "loss": 0.7274, + "step": 23180 + }, + { + "epoch": 0.4981312024745457, + "grad_norm": 0.5155495724786578, + "learning_rate": 1.0218110989861691e-05, + "loss": 0.7443, + "step": 23190 + }, + { + "epoch": 0.4983460067878163, + "grad_norm": 0.5206048961869586, + "learning_rate": 1.0211296052329596e-05, + "loss": 0.7242, + "step": 23200 + }, + { + "epoch": 0.4985608111010869, + "grad_norm": 0.5119701323767093, + "learning_rate": 1.020448101661923e-05, + "loss": 0.7229, + "step": 23210 + }, + { + "epoch": 0.4987756154143575, + "grad_norm": 0.5253915545306624, + "learning_rate": 1.0197665885897184e-05, + "loss": 0.7243, + "step": 23220 + }, + { + "epoch": 0.49899041972762814, + "grad_norm": 0.4909170913393957, + "learning_rate": 1.0190850663330093e-05, + "loss": 0.7365, + "step": 23230 + }, + { + "epoch": 0.49920522404089873, + "grad_norm": 0.513781488832539, + "learning_rate": 1.0184035352084635e-05, + "loss": 0.7399, + "step": 23240 + }, + { + "epoch": 0.4994200283541694, + "grad_norm": 0.5186220905358521, + "learning_rate": 1.0177219955327533e-05, + "loss": 0.7416, + "step": 23250 + }, + { + "epoch": 0.49963483266743997, + "grad_norm": 0.5273590434779399, + "learning_rate": 1.0170404476225546e-05, + "loss": 0.7388, + "step": 23260 + }, + { + "epoch": 0.49984963698071055, + "grad_norm": 0.5270725741076642, + "learning_rate": 1.0163588917945472e-05, + "loss": 0.7465, + "step": 23270 + }, + { + "epoch": 0.5000644412939812, + "grad_norm": 0.5115666946192222, + "learning_rate": 1.0156773283654146e-05, + "loss": 0.7514, + "step": 23280 + }, + { + "epoch": 0.5002792456072518, + "grad_norm": 0.5140916715485198, + "learning_rate": 1.0149957576518444e-05, + "loss": 0.743, + "step": 23290 + }, + { + "epoch": 0.5004940499205224, + "grad_norm": 0.5049649532600137, + "learning_rate": 1.0143141799705259e-05, + "loss": 0.7492, + "step": 23300 + }, + { + "epoch": 0.500708854233793, + "grad_norm": 0.5148345236098256, + "learning_rate": 1.0136325956381535e-05, + "loss": 0.754, + "step": 23310 + }, + { + "epoch": 0.5009236585470637, + "grad_norm": 0.5081943498814092, + "learning_rate": 1.0129510049714238e-05, + "loss": 0.7464, + "step": 23320 + }, + { + "epoch": 0.5011384628603343, + "grad_norm": 0.5263481032653132, + "learning_rate": 1.0122694082870365e-05, + "loss": 0.7269, + "step": 23330 + }, + { + "epoch": 0.5013532671736048, + "grad_norm": 0.49404517879695686, + "learning_rate": 1.0115878059016942e-05, + "loss": 0.7431, + "step": 23340 + }, + { + "epoch": 0.5015680714868754, + "grad_norm": 0.5058326845907175, + "learning_rate": 1.0109061981321015e-05, + "loss": 0.7396, + "step": 23350 + }, + { + "epoch": 0.501782875800146, + "grad_norm": 0.5264138844155276, + "learning_rate": 1.0102245852949668e-05, + "loss": 0.7262, + "step": 23360 + }, + { + "epoch": 0.5019976801134167, + "grad_norm": 0.528096691604748, + "learning_rate": 1.0095429677069997e-05, + "loss": 0.7305, + "step": 23370 + 
}, + { + "epoch": 0.5022124844266873, + "grad_norm": 0.5116467023890083, + "learning_rate": 1.0088613456849125e-05, + "loss": 0.7339, + "step": 23380 + }, + { + "epoch": 0.5024272887399579, + "grad_norm": 0.4984116595301803, + "learning_rate": 1.0081797195454193e-05, + "loss": 0.734, + "step": 23390 + }, + { + "epoch": 0.5026420930532285, + "grad_norm": 0.5172883195495444, + "learning_rate": 1.0074980896052361e-05, + "loss": 0.7202, + "step": 23400 + }, + { + "epoch": 0.5028568973664991, + "grad_norm": 0.5069474447864731, + "learning_rate": 1.0068164561810814e-05, + "loss": 0.7261, + "step": 23410 + }, + { + "epoch": 0.5030717016797698, + "grad_norm": 0.5339797128974847, + "learning_rate": 1.0061348195896745e-05, + "loss": 0.756, + "step": 23420 + }, + { + "epoch": 0.5032865059930404, + "grad_norm": 0.5143777722194492, + "learning_rate": 1.0054531801477364e-05, + "loss": 0.7355, + "step": 23430 + }, + { + "epoch": 0.503501310306311, + "grad_norm": 0.5374619747017549, + "learning_rate": 1.0047715381719893e-05, + "loss": 0.7168, + "step": 23440 + }, + { + "epoch": 0.5037161146195815, + "grad_norm": 0.5141935837309727, + "learning_rate": 1.004089893979157e-05, + "loss": 0.7423, + "step": 23450 + }, + { + "epoch": 0.5039309189328521, + "grad_norm": 0.5097667863863926, + "learning_rate": 1.003408247885964e-05, + "loss": 0.7273, + "step": 23460 + }, + { + "epoch": 0.5041457232461228, + "grad_norm": 0.7070241417378597, + "learning_rate": 1.0027266002091353e-05, + "loss": 0.751, + "step": 23470 + }, + { + "epoch": 0.5043605275593934, + "grad_norm": 0.5200900402474737, + "learning_rate": 1.0020449512653978e-05, + "loss": 0.7479, + "step": 23480 + }, + { + "epoch": 0.504575331872664, + "grad_norm": 0.502245321750959, + "learning_rate": 1.001363301371478e-05, + "loss": 0.7344, + "step": 23490 + }, + { + "epoch": 0.5047901361859346, + "grad_norm": 0.5102853378974559, + "learning_rate": 1.0006816508441028e-05, + "loss": 0.7261, + "step": 23500 + }, + { + "epoch": 0.5050049404992052, + "grad_norm": 0.5139914043526608, + "learning_rate": 1e-05, + "loss": 0.7387, + "step": 23510 + }, + { + "epoch": 0.5052197448124758, + "grad_norm": 0.5138651494123287, + "learning_rate": 9.993183491558975e-06, + "loss": 0.7448, + "step": 23520 + }, + { + "epoch": 0.5054345491257465, + "grad_norm": 0.5138929240424788, + "learning_rate": 9.986366986285222e-06, + "loss": 0.7249, + "step": 23530 + }, + { + "epoch": 0.5056493534390171, + "grad_norm": 0.5184056795842351, + "learning_rate": 9.979550487346024e-06, + "loss": 0.74, + "step": 23540 + }, + { + "epoch": 0.5058641577522877, + "grad_norm": 0.5485760882777143, + "learning_rate": 9.972733997908648e-06, + "loss": 0.7288, + "step": 23550 + }, + { + "epoch": 0.5060789620655582, + "grad_norm": 0.5106653968189862, + "learning_rate": 9.965917521140365e-06, + "loss": 0.7364, + "step": 23560 + }, + { + "epoch": 0.5062937663788288, + "grad_norm": 0.49612689415307143, + "learning_rate": 9.95910106020843e-06, + "loss": 0.7324, + "step": 23570 + }, + { + "epoch": 0.5065085706920995, + "grad_norm": 0.5297725980627347, + "learning_rate": 9.952284618280108e-06, + "loss": 0.7332, + "step": 23580 + }, + { + "epoch": 0.5067233750053701, + "grad_norm": 0.5170669007337249, + "learning_rate": 9.94546819852264e-06, + "loss": 0.7483, + "step": 23590 + }, + { + "epoch": 0.5069381793186407, + "grad_norm": 0.5038468226015915, + "learning_rate": 9.938651804103257e-06, + "loss": 0.7398, + "step": 23600 + }, + { + "epoch": 0.5071529836319113, + "grad_norm": 0.5087783532039669, + "learning_rate": 
9.93183543818919e-06, + "loss": 0.7325, + "step": 23610 + }, + { + "epoch": 0.5073677879451819, + "grad_norm": 0.5160070069541578, + "learning_rate": 9.925019103947639e-06, + "loss": 0.7325, + "step": 23620 + }, + { + "epoch": 0.5075825922584526, + "grad_norm": 0.5075365995460227, + "learning_rate": 9.91820280454581e-06, + "loss": 0.7553, + "step": 23630 + }, + { + "epoch": 0.5077973965717232, + "grad_norm": 0.5267021157619796, + "learning_rate": 9.91138654315088e-06, + "loss": 0.7298, + "step": 23640 + }, + { + "epoch": 0.5080122008849938, + "grad_norm": 0.5186830564197705, + "learning_rate": 9.904570322930006e-06, + "loss": 0.7556, + "step": 23650 + }, + { + "epoch": 0.5082270051982644, + "grad_norm": 0.5210004208221275, + "learning_rate": 9.897754147050335e-06, + "loss": 0.7376, + "step": 23660 + }, + { + "epoch": 0.508441809511535, + "grad_norm": 0.48685588400494934, + "learning_rate": 9.890938018678985e-06, + "loss": 0.7347, + "step": 23670 + }, + { + "epoch": 0.5086566138248056, + "grad_norm": 0.5539745463393094, + "learning_rate": 9.884121940983062e-06, + "loss": 0.7515, + "step": 23680 + }, + { + "epoch": 0.5088714181380762, + "grad_norm": 0.514283104621466, + "learning_rate": 9.877305917129636e-06, + "loss": 0.7456, + "step": 23690 + }, + { + "epoch": 0.5090862224513468, + "grad_norm": 0.5050582130270673, + "learning_rate": 9.870489950285765e-06, + "loss": 0.7217, + "step": 23700 + }, + { + "epoch": 0.5093010267646174, + "grad_norm": 0.5153338859894899, + "learning_rate": 9.86367404361847e-06, + "loss": 0.7345, + "step": 23710 + }, + { + "epoch": 0.509515831077888, + "grad_norm": 0.5292429611522316, + "learning_rate": 9.856858200294742e-06, + "loss": 0.7455, + "step": 23720 + }, + { + "epoch": 0.5097306353911587, + "grad_norm": 0.535174907505527, + "learning_rate": 9.850042423481561e-06, + "loss": 0.7321, + "step": 23730 + }, + { + "epoch": 0.5099454397044293, + "grad_norm": 0.5137588499191782, + "learning_rate": 9.843226716345852e-06, + "loss": 0.7283, + "step": 23740 + }, + { + "epoch": 0.5101602440176999, + "grad_norm": 0.5261263905458244, + "learning_rate": 9.83641108205453e-06, + "loss": 0.7356, + "step": 23750 + }, + { + "epoch": 0.5103750483309705, + "grad_norm": 0.5057280317999486, + "learning_rate": 9.829595523774456e-06, + "loss": 0.7346, + "step": 23760 + }, + { + "epoch": 0.5105898526442411, + "grad_norm": 0.5366500418456487, + "learning_rate": 9.82278004467247e-06, + "loss": 0.7353, + "step": 23770 + }, + { + "epoch": 0.5108046569575118, + "grad_norm": 0.5064327805142302, + "learning_rate": 9.81596464791537e-06, + "loss": 0.7224, + "step": 23780 + }, + { + "epoch": 0.5110194612707823, + "grad_norm": 0.48473823245015357, + "learning_rate": 9.80914933666991e-06, + "loss": 0.7302, + "step": 23790 + }, + { + "epoch": 0.5112342655840529, + "grad_norm": 0.530367936435279, + "learning_rate": 9.802334114102821e-06, + "loss": 0.723, + "step": 23800 + }, + { + "epoch": 0.5114490698973235, + "grad_norm": 0.53191867394961, + "learning_rate": 9.795518983380771e-06, + "loss": 0.7322, + "step": 23810 + }, + { + "epoch": 0.5116638742105941, + "grad_norm": 0.5145552336460878, + "learning_rate": 9.788703947670407e-06, + "loss": 0.7174, + "step": 23820 + }, + { + "epoch": 0.5118786785238647, + "grad_norm": 0.5160752379900866, + "learning_rate": 9.781889010138315e-06, + "loss": 0.7371, + "step": 23830 + }, + { + "epoch": 0.5120934828371354, + "grad_norm": 0.5215561353383581, + "learning_rate": 9.775074173951038e-06, + "loss": 0.7386, + "step": 23840 + }, + { + "epoch": 0.512308287150406, 
+ "grad_norm": 0.546987216601412, + "learning_rate": 9.768259442275083e-06, + "loss": 0.7252, + "step": 23850 + }, + { + "epoch": 0.5125230914636766, + "grad_norm": 0.5250432835086648, + "learning_rate": 9.761444818276895e-06, + "loss": 0.7326, + "step": 23860 + }, + { + "epoch": 0.5127378957769472, + "grad_norm": 0.5201485746022573, + "learning_rate": 9.75463030512288e-06, + "loss": 0.7361, + "step": 23870 + }, + { + "epoch": 0.5129527000902178, + "grad_norm": 0.5161112017032182, + "learning_rate": 9.747815905979382e-06, + "loss": 0.7175, + "step": 23880 + }, + { + "epoch": 0.5131675044034885, + "grad_norm": 0.5194134872899566, + "learning_rate": 9.741001624012706e-06, + "loss": 0.7306, + "step": 23890 + }, + { + "epoch": 0.513382308716759, + "grad_norm": 0.501372178188598, + "learning_rate": 9.734187462389086e-06, + "loss": 0.7359, + "step": 23900 + }, + { + "epoch": 0.5135971130300296, + "grad_norm": 0.5021971430887777, + "learning_rate": 9.72737342427471e-06, + "loss": 0.7423, + "step": 23910 + }, + { + "epoch": 0.5138119173433002, + "grad_norm": 0.501620537070542, + "learning_rate": 9.720559512835708e-06, + "loss": 0.7262, + "step": 23920 + }, + { + "epoch": 0.5140267216565708, + "grad_norm": 0.5022999836926363, + "learning_rate": 9.713745731238147e-06, + "loss": 0.7331, + "step": 23930 + }, + { + "epoch": 0.5142415259698415, + "grad_norm": 0.48851644845547765, + "learning_rate": 9.706932082648043e-06, + "loss": 0.7392, + "step": 23940 + }, + { + "epoch": 0.5144563302831121, + "grad_norm": 0.5066291795790192, + "learning_rate": 9.700118570231333e-06, + "loss": 0.7338, + "step": 23950 + }, + { + "epoch": 0.5146711345963827, + "grad_norm": 0.5589181767659236, + "learning_rate": 9.693305197153914e-06, + "loss": 0.7272, + "step": 23960 + }, + { + "epoch": 0.5148859389096533, + "grad_norm": 0.5241863149299513, + "learning_rate": 9.686491966581598e-06, + "loss": 0.7308, + "step": 23970 + }, + { + "epoch": 0.5151007432229239, + "grad_norm": 0.540530773434582, + "learning_rate": 9.679678881680138e-06, + "loss": 0.7351, + "step": 23980 + }, + { + "epoch": 0.5153155475361946, + "grad_norm": 0.5390573047706596, + "learning_rate": 9.672865945615225e-06, + "loss": 0.7376, + "step": 23990 + }, + { + "epoch": 0.5155303518494652, + "grad_norm": 0.5375969313392924, + "learning_rate": 9.666053161552467e-06, + "loss": 0.7415, + "step": 24000 + }, + { + "epoch": 0.5157451561627358, + "grad_norm": 0.5327773082890134, + "learning_rate": 9.65924053265742e-06, + "loss": 0.7533, + "step": 24010 + }, + { + "epoch": 0.5159599604760063, + "grad_norm": 0.5150496428189917, + "learning_rate": 9.652428062095553e-06, + "loss": 0.7342, + "step": 24020 + }, + { + "epoch": 0.5161747647892769, + "grad_norm": 0.5334047289102489, + "learning_rate": 9.645615753032264e-06, + "loss": 0.7336, + "step": 24030 + }, + { + "epoch": 0.5163895691025476, + "grad_norm": 0.4961872565305141, + "learning_rate": 9.638803608632883e-06, + "loss": 0.7264, + "step": 24040 + }, + { + "epoch": 0.5166043734158182, + "grad_norm": 0.4911939322989355, + "learning_rate": 9.631991632062652e-06, + "loss": 0.7257, + "step": 24050 + }, + { + "epoch": 0.5168191777290888, + "grad_norm": 0.5200979968551847, + "learning_rate": 9.625179826486752e-06, + "loss": 0.7361, + "step": 24060 + }, + { + "epoch": 0.5170339820423594, + "grad_norm": 0.5127022024427715, + "learning_rate": 9.618368195070265e-06, + "loss": 0.7463, + "step": 24070 + }, + { + "epoch": 0.51724878635563, + "grad_norm": 0.5185873242375817, + "learning_rate": 9.611556740978208e-06, + "loss": 
0.741, + "step": 24080 + }, + { + "epoch": 0.5174635906689007, + "grad_norm": 0.5115026188640582, + "learning_rate": 9.604745467375507e-06, + "loss": 0.727, + "step": 24090 + }, + { + "epoch": 0.5176783949821713, + "grad_norm": 0.5114651615343405, + "learning_rate": 9.597934377427e-06, + "loss": 0.7401, + "step": 24100 + }, + { + "epoch": 0.5178931992954419, + "grad_norm": 0.5155760878770381, + "learning_rate": 9.591123474297456e-06, + "loss": 0.7316, + "step": 24110 + }, + { + "epoch": 0.5181080036087125, + "grad_norm": 0.5207602050012875, + "learning_rate": 9.584312761151537e-06, + "loss": 0.7303, + "step": 24120 + }, + { + "epoch": 0.518322807921983, + "grad_norm": 0.5101795182054067, + "learning_rate": 9.577502241153836e-06, + "loss": 0.7541, + "step": 24130 + }, + { + "epoch": 0.5185376122352536, + "grad_norm": 0.5031850419195654, + "learning_rate": 9.570691917468841e-06, + "loss": 0.7178, + "step": 24140 + }, + { + "epoch": 0.5187524165485243, + "grad_norm": 0.5059459890339639, + "learning_rate": 9.563881793260961e-06, + "loss": 0.74, + "step": 24150 + }, + { + "epoch": 0.5189672208617949, + "grad_norm": 0.5252032527071196, + "learning_rate": 9.5570718716945e-06, + "loss": 0.724, + "step": 24160 + }, + { + "epoch": 0.5191820251750655, + "grad_norm": 0.49701627155963785, + "learning_rate": 9.55026215593367e-06, + "loss": 0.7352, + "step": 24170 + }, + { + "epoch": 0.5193968294883361, + "grad_norm": 0.5011050975997907, + "learning_rate": 9.543452649142605e-06, + "loss": 0.7114, + "step": 24180 + }, + { + "epoch": 0.5196116338016067, + "grad_norm": 0.5166863959564101, + "learning_rate": 9.536643354485315e-06, + "loss": 0.7287, + "step": 24190 + }, + { + "epoch": 0.5198264381148774, + "grad_norm": 0.5320535495138456, + "learning_rate": 9.529834275125733e-06, + "loss": 0.7318, + "step": 24200 + }, + { + "epoch": 0.520041242428148, + "grad_norm": 0.5427362518912576, + "learning_rate": 9.52302541422768e-06, + "loss": 0.7383, + "step": 24210 + }, + { + "epoch": 0.5202560467414186, + "grad_norm": 0.518569825254274, + "learning_rate": 9.516216774954876e-06, + "loss": 0.7255, + "step": 24220 + }, + { + "epoch": 0.5204708510546892, + "grad_norm": 0.5023215208325698, + "learning_rate": 9.50940836047095e-06, + "loss": 0.7217, + "step": 24230 + }, + { + "epoch": 0.5206856553679597, + "grad_norm": 0.5280315511540243, + "learning_rate": 9.50260017393941e-06, + "loss": 0.7396, + "step": 24240 + }, + { + "epoch": 0.5209004596812304, + "grad_norm": 0.5262868684024251, + "learning_rate": 9.495792218523668e-06, + "loss": 0.7404, + "step": 24250 + }, + { + "epoch": 0.521115263994501, + "grad_norm": 0.5399291263932274, + "learning_rate": 9.488984497387023e-06, + "loss": 0.7301, + "step": 24260 + }, + { + "epoch": 0.5213300683077716, + "grad_norm": 0.5090315167154729, + "learning_rate": 9.482177013692678e-06, + "loss": 0.7246, + "step": 24270 + }, + { + "epoch": 0.5215448726210422, + "grad_norm": 0.5056887738360717, + "learning_rate": 9.47536977060371e-06, + "loss": 0.7354, + "step": 24280 + }, + { + "epoch": 0.5217596769343128, + "grad_norm": 0.5576273098938848, + "learning_rate": 9.468562771283088e-06, + "loss": 0.7494, + "step": 24290 + }, + { + "epoch": 0.5219744812475835, + "grad_norm": 0.5192891146116362, + "learning_rate": 9.461756018893674e-06, + "loss": 0.7231, + "step": 24300 + }, + { + "epoch": 0.5221892855608541, + "grad_norm": 0.5176441563361027, + "learning_rate": 9.454949516598207e-06, + "loss": 0.7508, + "step": 24310 + }, + { + "epoch": 0.5224040898741247, + "grad_norm": 0.5440570683474872, 
+ "learning_rate": 9.448143267559322e-06, + "loss": 0.7358, + "step": 24320 + }, + { + "epoch": 0.5226188941873953, + "grad_norm": 0.5095332879054673, + "learning_rate": 9.441337274939519e-06, + "loss": 0.7388, + "step": 24330 + }, + { + "epoch": 0.5228336985006659, + "grad_norm": 0.5097526753015743, + "learning_rate": 9.434531541901197e-06, + "loss": 0.7306, + "step": 24340 + }, + { + "epoch": 0.5230485028139366, + "grad_norm": 0.5232789277714874, + "learning_rate": 9.427726071606623e-06, + "loss": 0.7242, + "step": 24350 + }, + { + "epoch": 0.5232633071272071, + "grad_norm": 0.5054634510771875, + "learning_rate": 9.42092086721794e-06, + "loss": 0.7254, + "step": 24360 + }, + { + "epoch": 0.5234781114404777, + "grad_norm": 0.532869087530748, + "learning_rate": 9.41411593189718e-06, + "loss": 0.7344, + "step": 24370 + }, + { + "epoch": 0.5236929157537483, + "grad_norm": 0.5096100205368526, + "learning_rate": 9.407311268806232e-06, + "loss": 0.7313, + "step": 24380 + }, + { + "epoch": 0.5239077200670189, + "grad_norm": 0.5296911724090648, + "learning_rate": 9.40050688110688e-06, + "loss": 0.7265, + "step": 24390 + }, + { + "epoch": 0.5241225243802896, + "grad_norm": 0.5213949606620313, + "learning_rate": 9.393702771960763e-06, + "loss": 0.7306, + "step": 24400 + }, + { + "epoch": 0.5243373286935602, + "grad_norm": 0.518401803230462, + "learning_rate": 9.386898944529392e-06, + "loss": 0.7404, + "step": 24410 + }, + { + "epoch": 0.5245521330068308, + "grad_norm": 0.519834518279471, + "learning_rate": 9.380095401974159e-06, + "loss": 0.7409, + "step": 24420 + }, + { + "epoch": 0.5247669373201014, + "grad_norm": 0.5243292050356617, + "learning_rate": 9.373292147456309e-06, + "loss": 0.7264, + "step": 24430 + }, + { + "epoch": 0.524981741633372, + "grad_norm": 0.5058736452889017, + "learning_rate": 9.366489184136966e-06, + "loss": 0.7207, + "step": 24440 + }, + { + "epoch": 0.5251965459466426, + "grad_norm": 0.5213210419850763, + "learning_rate": 9.35968651517711e-06, + "loss": 0.746, + "step": 24450 + }, + { + "epoch": 0.5254113502599133, + "grad_norm": 0.4984379629009062, + "learning_rate": 9.35288414373759e-06, + "loss": 0.7372, + "step": 24460 + }, + { + "epoch": 0.5256261545731838, + "grad_norm": 0.524707210819013, + "learning_rate": 9.346082072979111e-06, + "loss": 0.7375, + "step": 24470 + }, + { + "epoch": 0.5258409588864544, + "grad_norm": 0.5200185641061693, + "learning_rate": 9.339280306062237e-06, + "loss": 0.7366, + "step": 24480 + }, + { + "epoch": 0.526055763199725, + "grad_norm": 0.5277055511224572, + "learning_rate": 9.332478846147404e-06, + "loss": 0.7346, + "step": 24490 + }, + { + "epoch": 0.5262705675129956, + "grad_norm": 0.5580648886270715, + "learning_rate": 9.325677696394887e-06, + "loss": 0.7513, + "step": 24500 + }, + { + "epoch": 0.5264853718262663, + "grad_norm": 0.4991201127455899, + "learning_rate": 9.318876859964832e-06, + "loss": 0.738, + "step": 24510 + }, + { + "epoch": 0.5267001761395369, + "grad_norm": 0.5098501461011234, + "learning_rate": 9.312076340017232e-06, + "loss": 0.7287, + "step": 24520 + }, + { + "epoch": 0.5269149804528075, + "grad_norm": 0.6021201034831747, + "learning_rate": 9.30527613971194e-06, + "loss": 0.7499, + "step": 24530 + }, + { + "epoch": 0.5271297847660781, + "grad_norm": 0.5252128416172374, + "learning_rate": 9.298476262208646e-06, + "loss": 0.7326, + "step": 24540 + }, + { + "epoch": 0.5273445890793487, + "grad_norm": 0.49998760334312775, + "learning_rate": 9.2916767106669e-06, + "loss": 0.7307, + "step": 24550 + }, + { + "epoch": 
0.5275593933926194, + "grad_norm": 0.5125373956927092, + "learning_rate": 9.284877488246105e-06, + "loss": 0.7298, + "step": 24560 + }, + { + "epoch": 0.52777419770589, + "grad_norm": 0.5080343316486924, + "learning_rate": 9.278078598105502e-06, + "loss": 0.7345, + "step": 24570 + }, + { + "epoch": 0.5279890020191605, + "grad_norm": 0.5174244580842269, + "learning_rate": 9.271280043404185e-06, + "loss": 0.7332, + "step": 24580 + }, + { + "epoch": 0.5282038063324311, + "grad_norm": 0.5058297456693376, + "learning_rate": 9.264481827301083e-06, + "loss": 0.734, + "step": 24590 + }, + { + "epoch": 0.5284186106457017, + "grad_norm": 0.5292500345044558, + "learning_rate": 9.257683952954973e-06, + "loss": 0.7409, + "step": 24600 + }, + { + "epoch": 0.5286334149589724, + "grad_norm": 0.5225245379527742, + "learning_rate": 9.25088642352448e-06, + "loss": 0.7321, + "step": 24610 + }, + { + "epoch": 0.528848219272243, + "grad_norm": 0.5224232644015462, + "learning_rate": 9.244089242168055e-06, + "loss": 0.7172, + "step": 24620 + }, + { + "epoch": 0.5290630235855136, + "grad_norm": 0.5062021193853431, + "learning_rate": 9.237292412043997e-06, + "loss": 0.736, + "step": 24630 + }, + { + "epoch": 0.5292778278987842, + "grad_norm": 0.5295018228622798, + "learning_rate": 9.230495936310436e-06, + "loss": 0.7307, + "step": 24640 + }, + { + "epoch": 0.5294926322120548, + "grad_norm": 0.5267263552657305, + "learning_rate": 9.223699818125348e-06, + "loss": 0.7345, + "step": 24650 + }, + { + "epoch": 0.5297074365253255, + "grad_norm": 0.5311126333449129, + "learning_rate": 9.21690406064653e-06, + "loss": 0.7405, + "step": 24660 + }, + { + "epoch": 0.5299222408385961, + "grad_norm": 0.5153915101075538, + "learning_rate": 9.210108667031616e-06, + "loss": 0.7391, + "step": 24670 + }, + { + "epoch": 0.5301370451518667, + "grad_norm": 0.5151283148764161, + "learning_rate": 9.203313640438074e-06, + "loss": 0.7148, + "step": 24680 + }, + { + "epoch": 0.5303518494651372, + "grad_norm": 0.5089214087459585, + "learning_rate": 9.196518984023191e-06, + "loss": 0.7539, + "step": 24690 + }, + { + "epoch": 0.5305666537784078, + "grad_norm": 0.5127764385700229, + "learning_rate": 9.189724700944104e-06, + "loss": 0.734, + "step": 24700 + }, + { + "epoch": 0.5307814580916784, + "grad_norm": 0.4988269906652733, + "learning_rate": 9.182930794357749e-06, + "loss": 0.7217, + "step": 24710 + }, + { + "epoch": 0.5309962624049491, + "grad_norm": 0.5289065899803795, + "learning_rate": 9.176137267420913e-06, + "loss": 0.7228, + "step": 24720 + }, + { + "epoch": 0.5312110667182197, + "grad_norm": 0.520505526330622, + "learning_rate": 9.169344123290186e-06, + "loss": 0.7126, + "step": 24730 + }, + { + "epoch": 0.5314258710314903, + "grad_norm": 0.4998804860251492, + "learning_rate": 9.16255136512199e-06, + "loss": 0.7288, + "step": 24740 + }, + { + "epoch": 0.5316406753447609, + "grad_norm": 0.505080866954583, + "learning_rate": 9.155758996072568e-06, + "loss": 0.7338, + "step": 24750 + }, + { + "epoch": 0.5318554796580315, + "grad_norm": 0.4987180793184345, + "learning_rate": 9.148967019297973e-06, + "loss": 0.7137, + "step": 24760 + }, + { + "epoch": 0.5320702839713022, + "grad_norm": 0.5197695735697442, + "learning_rate": 9.142175437954095e-06, + "loss": 0.7395, + "step": 24770 + }, + { + "epoch": 0.5322850882845728, + "grad_norm": 0.5291148961767198, + "learning_rate": 9.13538425519662e-06, + "loss": 0.7339, + "step": 24780 + }, + { + "epoch": 0.5324998925978434, + "grad_norm": 0.5129546756881223, + "learning_rate": 
9.128593474181058e-06, + "loss": 0.7253, + "step": 24790 + }, + { + "epoch": 0.532714696911114, + "grad_norm": 0.5032558888699209, + "learning_rate": 9.121803098062732e-06, + "loss": 0.728, + "step": 24800 + }, + { + "epoch": 0.5329295012243845, + "grad_norm": 0.5159297096786016, + "learning_rate": 9.115013129996774e-06, + "loss": 0.7179, + "step": 24810 + }, + { + "epoch": 0.5331443055376552, + "grad_norm": 0.5167687353856899, + "learning_rate": 9.108223573138133e-06, + "loss": 0.7252, + "step": 24820 + }, + { + "epoch": 0.5333591098509258, + "grad_norm": 0.5259491525566676, + "learning_rate": 9.101434430641561e-06, + "loss": 0.7201, + "step": 24830 + }, + { + "epoch": 0.5335739141641964, + "grad_norm": 0.5236412827950238, + "learning_rate": 9.09464570566162e-06, + "loss": 0.7237, + "step": 24840 + }, + { + "epoch": 0.533788718477467, + "grad_norm": 0.5181715919819297, + "learning_rate": 9.087857401352673e-06, + "loss": 0.7275, + "step": 24850 + }, + { + "epoch": 0.5340035227907376, + "grad_norm": 0.5281987861982105, + "learning_rate": 9.081069520868891e-06, + "loss": 0.7338, + "step": 24860 + }, + { + "epoch": 0.5342183271040083, + "grad_norm": 0.5075187850648184, + "learning_rate": 9.074282067364254e-06, + "loss": 0.7287, + "step": 24870 + }, + { + "epoch": 0.5344331314172789, + "grad_norm": 0.505210251407526, + "learning_rate": 9.067495043992532e-06, + "loss": 0.7448, + "step": 24880 + }, + { + "epoch": 0.5346479357305495, + "grad_norm": 0.5045253045306024, + "learning_rate": 9.060708453907312e-06, + "loss": 0.7124, + "step": 24890 + }, + { + "epoch": 0.5348627400438201, + "grad_norm": 0.5332241555538618, + "learning_rate": 9.053922300261957e-06, + "loss": 0.7295, + "step": 24900 + }, + { + "epoch": 0.5350775443570907, + "grad_norm": 0.5242064871975776, + "learning_rate": 9.047136586209646e-06, + "loss": 0.7305, + "step": 24910 + }, + { + "epoch": 0.5352923486703614, + "grad_norm": 0.5213752485486699, + "learning_rate": 9.040351314903346e-06, + "loss": 0.7344, + "step": 24920 + }, + { + "epoch": 0.5355071529836319, + "grad_norm": 0.5279860659235984, + "learning_rate": 9.033566489495815e-06, + "loss": 0.7149, + "step": 24930 + }, + { + "epoch": 0.5357219572969025, + "grad_norm": 0.5153059474379569, + "learning_rate": 9.026782113139614e-06, + "loss": 0.7272, + "step": 24940 + }, + { + "epoch": 0.5359367616101731, + "grad_norm": 0.502633555256086, + "learning_rate": 9.019998188987087e-06, + "loss": 0.7358, + "step": 24950 + }, + { + "epoch": 0.5361515659234437, + "grad_norm": 0.5227824018715104, + "learning_rate": 9.01321472019037e-06, + "loss": 0.7322, + "step": 24960 + }, + { + "epoch": 0.5363663702367144, + "grad_norm": 0.5096066611505108, + "learning_rate": 9.006431709901385e-06, + "loss": 0.7291, + "step": 24970 + }, + { + "epoch": 0.536581174549985, + "grad_norm": 0.5050380085409679, + "learning_rate": 8.999649161271851e-06, + "loss": 0.7383, + "step": 24980 + }, + { + "epoch": 0.5367959788632556, + "grad_norm": 0.5098544768515563, + "learning_rate": 8.99286707745326e-06, + "loss": 0.7296, + "step": 24990 + }, + { + "epoch": 0.5370107831765262, + "grad_norm": 0.49596489778805924, + "learning_rate": 8.986085461596892e-06, + "loss": 0.7448, + "step": 25000 + }, + { + "epoch": 0.5372255874897968, + "grad_norm": 0.524177361015968, + "learning_rate": 8.979304316853816e-06, + "loss": 0.7223, + "step": 25010 + }, + { + "epoch": 0.5374403918030674, + "grad_norm": 0.5275256334675215, + "learning_rate": 8.972523646374868e-06, + "loss": 0.7155, + "step": 25020 + }, + { + "epoch": 
0.537655196116338, + "grad_norm": 0.5230415273183412, + "learning_rate": 8.965743453310682e-06, + "loss": 0.7244, + "step": 25030 + }, + { + "epoch": 0.5378700004296086, + "grad_norm": 0.5056528573977891, + "learning_rate": 8.958963740811657e-06, + "loss": 0.7373, + "step": 25040 + }, + { + "epoch": 0.5380848047428792, + "grad_norm": 0.5129477293931691, + "learning_rate": 8.952184512027971e-06, + "loss": 0.7409, + "step": 25050 + }, + { + "epoch": 0.5382996090561498, + "grad_norm": 0.5254219239381808, + "learning_rate": 8.94540577010958e-06, + "loss": 0.7289, + "step": 25060 + }, + { + "epoch": 0.5385144133694204, + "grad_norm": 0.5137037649048417, + "learning_rate": 8.938627518206207e-06, + "loss": 0.7346, + "step": 25070 + }, + { + "epoch": 0.5387292176826911, + "grad_norm": 0.501166734367294, + "learning_rate": 8.931849759467364e-06, + "loss": 0.7376, + "step": 25080 + }, + { + "epoch": 0.5389440219959617, + "grad_norm": 0.5158686563553051, + "learning_rate": 8.925072497042312e-06, + "loss": 0.7246, + "step": 25090 + }, + { + "epoch": 0.5391588263092323, + "grad_norm": 0.5064258891872856, + "learning_rate": 8.9182957340801e-06, + "loss": 0.7224, + "step": 25100 + }, + { + "epoch": 0.5393736306225029, + "grad_norm": 0.5109514152684812, + "learning_rate": 8.911519473729533e-06, + "loss": 0.7258, + "step": 25110 + }, + { + "epoch": 0.5395884349357735, + "grad_norm": 0.4965294744044547, + "learning_rate": 8.904743719139184e-06, + "loss": 0.7231, + "step": 25120 + }, + { + "epoch": 0.5398032392490442, + "grad_norm": 0.5163978388155885, + "learning_rate": 8.897968473457397e-06, + "loss": 0.7213, + "step": 25130 + }, + { + "epoch": 0.5400180435623148, + "grad_norm": 0.5179202460764463, + "learning_rate": 8.89119373983227e-06, + "loss": 0.7333, + "step": 25140 + }, + { + "epoch": 0.5402328478755853, + "grad_norm": 0.5267218520730645, + "learning_rate": 8.884419521411681e-06, + "loss": 0.7271, + "step": 25150 + }, + { + "epoch": 0.5404476521888559, + "grad_norm": 0.49970760568396944, + "learning_rate": 8.877645821343245e-06, + "loss": 0.7134, + "step": 25160 + }, + { + "epoch": 0.5406624565021265, + "grad_norm": 0.5110857110324059, + "learning_rate": 8.870872642774354e-06, + "loss": 0.7371, + "step": 25170 + }, + { + "epoch": 0.5408772608153972, + "grad_norm": 0.524857212027166, + "learning_rate": 8.864099988852149e-06, + "loss": 0.7303, + "step": 25180 + }, + { + "epoch": 0.5410920651286678, + "grad_norm": 0.5056157960738001, + "learning_rate": 8.857327862723524e-06, + "loss": 0.7163, + "step": 25190 + }, + { + "epoch": 0.5413068694419384, + "grad_norm": 0.4958979488976298, + "learning_rate": 8.850556267535146e-06, + "loss": 0.7364, + "step": 25200 + }, + { + "epoch": 0.541521673755209, + "grad_norm": 0.5132188143017179, + "learning_rate": 8.843785206433412e-06, + "loss": 0.7335, + "step": 25210 + }, + { + "epoch": 0.5417364780684796, + "grad_norm": 0.4994335318852988, + "learning_rate": 8.837014682564487e-06, + "loss": 0.7257, + "step": 25220 + }, + { + "epoch": 0.5419512823817503, + "grad_norm": 0.5384327760658743, + "learning_rate": 8.83024469907428e-06, + "loss": 0.7351, + "step": 25230 + }, + { + "epoch": 0.5421660866950209, + "grad_norm": 0.5261027642763866, + "learning_rate": 8.823475259108445e-06, + "loss": 0.7356, + "step": 25240 + }, + { + "epoch": 0.5423808910082915, + "grad_norm": 0.5060595175869693, + "learning_rate": 8.816706365812396e-06, + "loss": 0.7136, + "step": 25250 + }, + { + "epoch": 0.542595695321562, + "grad_norm": 0.5089315623590392, + "learning_rate": 
8.809938022331279e-06, + "loss": 0.7582, + "step": 25260 + }, + { + "epoch": 0.5428104996348326, + "grad_norm": 0.5517010005843618, + "learning_rate": 8.803170231809997e-06, + "loss": 0.7264, + "step": 25270 + }, + { + "epoch": 0.5430253039481033, + "grad_norm": 0.5192946990016879, + "learning_rate": 8.796402997393179e-06, + "loss": 0.7338, + "step": 25280 + }, + { + "epoch": 0.5432401082613739, + "grad_norm": 0.5141174201291905, + "learning_rate": 8.789636322225221e-06, + "loss": 0.7301, + "step": 25290 + }, + { + "epoch": 0.5434549125746445, + "grad_norm": 0.5156454974533952, + "learning_rate": 8.782870209450234e-06, + "loss": 0.7386, + "step": 25300 + }, + { + "epoch": 0.5436697168879151, + "grad_norm": 0.5399434222135454, + "learning_rate": 8.776104662212077e-06, + "loss": 0.7297, + "step": 25310 + }, + { + "epoch": 0.5438845212011857, + "grad_norm": 0.5080337339754446, + "learning_rate": 8.769339683654358e-06, + "loss": 0.7268, + "step": 25320 + }, + { + "epoch": 0.5440993255144563, + "grad_norm": 0.5211965166488158, + "learning_rate": 8.762575276920403e-06, + "loss": 0.72, + "step": 25330 + }, + { + "epoch": 0.544314129827727, + "grad_norm": 0.511040069798665, + "learning_rate": 8.755811445153282e-06, + "loss": 0.7245, + "step": 25340 + }, + { + "epoch": 0.5445289341409976, + "grad_norm": 0.5496977731743303, + "learning_rate": 8.749048191495787e-06, + "loss": 0.7362, + "step": 25350 + }, + { + "epoch": 0.5447437384542682, + "grad_norm": 0.497836078774839, + "learning_rate": 8.742285519090465e-06, + "loss": 0.7369, + "step": 25360 + }, + { + "epoch": 0.5449585427675387, + "grad_norm": 0.4958506383207596, + "learning_rate": 8.735523431079567e-06, + "loss": 0.6986, + "step": 25370 + }, + { + "epoch": 0.5451733470808093, + "grad_norm": 0.5023433708525002, + "learning_rate": 8.728761930605086e-06, + "loss": 0.7225, + "step": 25380 + }, + { + "epoch": 0.54538815139408, + "grad_norm": 0.5115175514759246, + "learning_rate": 8.72200102080874e-06, + "loss": 0.7311, + "step": 25390 + }, + { + "epoch": 0.5456029557073506, + "grad_norm": 0.49400314000603524, + "learning_rate": 8.715240704831965e-06, + "loss": 0.711, + "step": 25400 + }, + { + "epoch": 0.5458177600206212, + "grad_norm": 0.5281302488810649, + "learning_rate": 8.70848098581594e-06, + "loss": 0.7307, + "step": 25410 + }, + { + "epoch": 0.5460325643338918, + "grad_norm": 0.5253108140644606, + "learning_rate": 8.701721866901548e-06, + "loss": 0.7542, + "step": 25420 + }, + { + "epoch": 0.5462473686471624, + "grad_norm": 0.49677506907339725, + "learning_rate": 8.694963351229395e-06, + "loss": 0.7346, + "step": 25430 + }, + { + "epoch": 0.5464621729604331, + "grad_norm": 0.49156308079799516, + "learning_rate": 8.68820544193982e-06, + "loss": 0.7301, + "step": 25440 + }, + { + "epoch": 0.5466769772737037, + "grad_norm": 0.5054408047803833, + "learning_rate": 8.681448142172862e-06, + "loss": 0.717, + "step": 25450 + }, + { + "epoch": 0.5468917815869743, + "grad_norm": 0.5250496149552367, + "learning_rate": 8.674691455068296e-06, + "loss": 0.7208, + "step": 25460 + }, + { + "epoch": 0.5471065859002449, + "grad_norm": 3.4136357360926253, + "learning_rate": 8.667935383765595e-06, + "loss": 0.735, + "step": 25470 + }, + { + "epoch": 0.5473213902135154, + "grad_norm": 0.4871497116063486, + "learning_rate": 8.66117993140396e-06, + "loss": 0.7342, + "step": 25480 + }, + { + "epoch": 0.5475361945267861, + "grad_norm": 0.5209915171759771, + "learning_rate": 8.654425101122296e-06, + "loss": 0.7336, + "step": 25490 + }, + { + "epoch": 
0.5477509988400567, + "grad_norm": 0.5156635252595837, + "learning_rate": 8.647670896059216e-06, + "loss": 0.7293, + "step": 25500 + }, + { + "epoch": 0.5479658031533273, + "grad_norm": 0.5152409120879327, + "learning_rate": 8.640917319353055e-06, + "loss": 0.7273, + "step": 25510 + }, + { + "epoch": 0.5481806074665979, + "grad_norm": 0.5416207574425208, + "learning_rate": 8.634164374141838e-06, + "loss": 0.7258, + "step": 25520 + }, + { + "epoch": 0.5483954117798685, + "grad_norm": 0.515769700298451, + "learning_rate": 8.62741206356332e-06, + "loss": 0.7196, + "step": 25530 + }, + { + "epoch": 0.5486102160931392, + "grad_norm": 0.5161072492397152, + "learning_rate": 8.62066039075494e-06, + "loss": 0.7146, + "step": 25540 + }, + { + "epoch": 0.5488250204064098, + "grad_norm": 0.5188545967485997, + "learning_rate": 8.61390935885385e-06, + "loss": 0.7343, + "step": 25550 + }, + { + "epoch": 0.5490398247196804, + "grad_norm": 0.5220576998409723, + "learning_rate": 8.607158970996905e-06, + "loss": 0.7225, + "step": 25560 + }, + { + "epoch": 0.549254629032951, + "grad_norm": 0.4933476797650889, + "learning_rate": 8.600409230320652e-06, + "loss": 0.727, + "step": 25570 + }, + { + "epoch": 0.5494694333462216, + "grad_norm": 0.5360731957654817, + "learning_rate": 8.593660139961354e-06, + "loss": 0.7304, + "step": 25580 + }, + { + "epoch": 0.5496842376594921, + "grad_norm": 0.4967871791391136, + "learning_rate": 8.586911703054953e-06, + "loss": 0.718, + "step": 25590 + }, + { + "epoch": 0.5498990419727628, + "grad_norm": 0.5094344264185746, + "learning_rate": 8.580163922737107e-06, + "loss": 0.7351, + "step": 25600 + }, + { + "epoch": 0.5501138462860334, + "grad_norm": 0.4998333921079445, + "learning_rate": 8.573416802143147e-06, + "loss": 0.7362, + "step": 25610 + }, + { + "epoch": 0.550328650599304, + "grad_norm": 0.49839738637776776, + "learning_rate": 8.56667034440811e-06, + "loss": 0.7343, + "step": 25620 + }, + { + "epoch": 0.5505434549125746, + "grad_norm": 0.5122812659780278, + "learning_rate": 8.559924552666731e-06, + "loss": 0.7299, + "step": 25630 + }, + { + "epoch": 0.5507582592258452, + "grad_norm": 0.510657449522192, + "learning_rate": 8.55317943005342e-06, + "loss": 0.7323, + "step": 25640 + }, + { + "epoch": 0.5509730635391159, + "grad_norm": 0.5029028813355643, + "learning_rate": 8.546434979702293e-06, + "loss": 0.7287, + "step": 25650 + }, + { + "epoch": 0.5511878678523865, + "grad_norm": 0.5228476287215734, + "learning_rate": 8.539691204747134e-06, + "loss": 0.7321, + "step": 25660 + }, + { + "epoch": 0.5514026721656571, + "grad_norm": 0.5359812041522654, + "learning_rate": 8.532948108321433e-06, + "loss": 0.742, + "step": 25670 + }, + { + "epoch": 0.5516174764789277, + "grad_norm": 0.5130595618709932, + "learning_rate": 8.526205693558353e-06, + "loss": 0.7319, + "step": 25680 + }, + { + "epoch": 0.5518322807921983, + "grad_norm": 0.4996255988319911, + "learning_rate": 8.51946396359074e-06, + "loss": 0.7407, + "step": 25690 + }, + { + "epoch": 0.552047085105469, + "grad_norm": 0.49407356190346213, + "learning_rate": 8.51272292155113e-06, + "loss": 0.717, + "step": 25700 + }, + { + "epoch": 0.5522618894187395, + "grad_norm": 0.4898737936647392, + "learning_rate": 8.50598257057173e-06, + "loss": 0.747, + "step": 25710 + }, + { + "epoch": 0.5524766937320101, + "grad_norm": 0.5212613411297291, + "learning_rate": 8.499242913784436e-06, + "loss": 0.733, + "step": 25720 + }, + { + "epoch": 0.5526914980452807, + "grad_norm": 0.5428952625628243, + "learning_rate": 8.492503954320804e-06, + 
"loss": 0.7235, + "step": 25730 + }, + { + "epoch": 0.5529063023585513, + "grad_norm": 0.5179866204385108, + "learning_rate": 8.485765695312091e-06, + "loss": 0.7353, + "step": 25740 + }, + { + "epoch": 0.553121106671822, + "grad_norm": 0.5143508651285503, + "learning_rate": 8.479028139889209e-06, + "loss": 0.7092, + "step": 25750 + }, + { + "epoch": 0.5533359109850926, + "grad_norm": 0.4899586729640197, + "learning_rate": 8.472291291182747e-06, + "loss": 0.7214, + "step": 25760 + }, + { + "epoch": 0.5535507152983632, + "grad_norm": 0.5104765609615125, + "learning_rate": 8.465555152322971e-06, + "loss": 0.7343, + "step": 25770 + }, + { + "epoch": 0.5537655196116338, + "grad_norm": 0.4936010581482795, + "learning_rate": 8.45881972643981e-06, + "loss": 0.7201, + "step": 25780 + }, + { + "epoch": 0.5539803239249044, + "grad_norm": 0.5103205277286559, + "learning_rate": 8.452085016662873e-06, + "loss": 0.7249, + "step": 25790 + }, + { + "epoch": 0.5541951282381751, + "grad_norm": 0.5337997235376728, + "learning_rate": 8.445351026121425e-06, + "loss": 0.723, + "step": 25800 + }, + { + "epoch": 0.5544099325514457, + "grad_norm": 0.5330888068677769, + "learning_rate": 8.438617757944396e-06, + "loss": 0.7368, + "step": 25810 + }, + { + "epoch": 0.5546247368647163, + "grad_norm": 0.507422198237814, + "learning_rate": 8.431885215260393e-06, + "loss": 0.7277, + "step": 25820 + }, + { + "epoch": 0.5548395411779868, + "grad_norm": 0.5092852644195597, + "learning_rate": 8.42515340119767e-06, + "loss": 0.7141, + "step": 25830 + }, + { + "epoch": 0.5550543454912574, + "grad_norm": 0.4983369620376665, + "learning_rate": 8.418422318884158e-06, + "loss": 0.7213, + "step": 25840 + }, + { + "epoch": 0.5552691498045281, + "grad_norm": 0.5057768818854864, + "learning_rate": 8.411691971447437e-06, + "loss": 0.7204, + "step": 25850 + }, + { + "epoch": 0.5554839541177987, + "grad_norm": 0.4957062726709344, + "learning_rate": 8.40496236201475e-06, + "loss": 0.7239, + "step": 25860 + }, + { + "epoch": 0.5556987584310693, + "grad_norm": 0.5098362148369251, + "learning_rate": 8.398233493712997e-06, + "loss": 0.7229, + "step": 25870 + }, + { + "epoch": 0.5559135627443399, + "grad_norm": 0.540586305639984, + "learning_rate": 8.391505369668725e-06, + "loss": 0.7223, + "step": 25880 + }, + { + "epoch": 0.5561283670576105, + "grad_norm": 0.5164869278284776, + "learning_rate": 8.384777993008154e-06, + "loss": 0.7241, + "step": 25890 + }, + { + "epoch": 0.5563431713708811, + "grad_norm": 0.5178900728279536, + "learning_rate": 8.378051366857137e-06, + "loss": 0.7324, + "step": 25900 + }, + { + "epoch": 0.5565579756841518, + "grad_norm": 0.5093380002448458, + "learning_rate": 8.371325494341193e-06, + "loss": 0.7337, + "step": 25910 + }, + { + "epoch": 0.5567727799974224, + "grad_norm": 0.5228044379168946, + "learning_rate": 8.364600378585482e-06, + "loss": 0.7399, + "step": 25920 + }, + { + "epoch": 0.556987584310693, + "grad_norm": 0.5144671939481396, + "learning_rate": 8.357876022714816e-06, + "loss": 0.7254, + "step": 25930 + }, + { + "epoch": 0.5572023886239635, + "grad_norm": 0.4986034297019067, + "learning_rate": 8.351152429853653e-06, + "loss": 0.7106, + "step": 25940 + }, + { + "epoch": 0.5574171929372341, + "grad_norm": 0.5023603101794233, + "learning_rate": 8.34442960312609e-06, + "loss": 0.7226, + "step": 25950 + }, + { + "epoch": 0.5576319972505048, + "grad_norm": 0.5209318452676072, + "learning_rate": 8.337707545655886e-06, + "loss": 0.7238, + "step": 25960 + }, + { + "epoch": 0.5578468015637754, + "grad_norm": 
0.4926074970141638, + "learning_rate": 8.330986260566424e-06, + "loss": 0.7276, + "step": 25970 + }, + { + "epoch": 0.558061605877046, + "grad_norm": 0.4958510123483159, + "learning_rate": 8.324265750980738e-06, + "loss": 0.7148, + "step": 25980 + }, + { + "epoch": 0.5582764101903166, + "grad_norm": 0.5075764876809195, + "learning_rate": 8.317546020021498e-06, + "loss": 0.7155, + "step": 25990 + }, + { + "epoch": 0.5584912145035872, + "grad_norm": 0.5141742208103084, + "learning_rate": 8.310827070811008e-06, + "loss": 0.7385, + "step": 26000 + }, + { + "epoch": 0.5587060188168579, + "grad_norm": 0.5084465568638146, + "learning_rate": 8.30410890647122e-06, + "loss": 0.7202, + "step": 26010 + }, + { + "epoch": 0.5589208231301285, + "grad_norm": 0.502206471808609, + "learning_rate": 8.297391530123713e-06, + "loss": 0.7237, + "step": 26020 + }, + { + "epoch": 0.5591356274433991, + "grad_norm": 0.5086015295612804, + "learning_rate": 8.290674944889705e-06, + "loss": 0.7387, + "step": 26030 + }, + { + "epoch": 0.5593504317566697, + "grad_norm": 0.5252472355926682, + "learning_rate": 8.283959153890037e-06, + "loss": 0.728, + "step": 26040 + }, + { + "epoch": 0.5595652360699402, + "grad_norm": 0.4975594240387986, + "learning_rate": 8.277244160245196e-06, + "loss": 0.7213, + "step": 26050 + }, + { + "epoch": 0.5597800403832109, + "grad_norm": 0.4978146880109962, + "learning_rate": 8.270529967075284e-06, + "loss": 0.7173, + "step": 26060 + }, + { + "epoch": 0.5599948446964815, + "grad_norm": 0.5268438508793595, + "learning_rate": 8.263816577500034e-06, + "loss": 0.7371, + "step": 26070 + }, + { + "epoch": 0.5602096490097521, + "grad_norm": 0.5137983659248461, + "learning_rate": 8.257103994638817e-06, + "loss": 0.7197, + "step": 26080 + }, + { + "epoch": 0.5604244533230227, + "grad_norm": 0.5194520105890317, + "learning_rate": 8.250392221610612e-06, + "loss": 0.7297, + "step": 26090 + }, + { + "epoch": 0.5606392576362933, + "grad_norm": 0.5133858765119592, + "learning_rate": 8.24368126153404e-06, + "loss": 0.732, + "step": 26100 + }, + { + "epoch": 0.560854061949564, + "grad_norm": 0.5029432924404077, + "learning_rate": 8.236971117527324e-06, + "loss": 0.728, + "step": 26110 + }, + { + "epoch": 0.5610688662628346, + "grad_norm": 0.5081831178394076, + "learning_rate": 8.230261792708328e-06, + "loss": 0.7324, + "step": 26120 + }, + { + "epoch": 0.5612836705761052, + "grad_norm": 0.5046477194727244, + "learning_rate": 8.223553290194521e-06, + "loss": 0.7363, + "step": 26130 + }, + { + "epoch": 0.5614984748893758, + "grad_norm": 0.4956098306676567, + "learning_rate": 8.216845613102995e-06, + "loss": 0.7245, + "step": 26140 + }, + { + "epoch": 0.5617132792026464, + "grad_norm": 0.5157536272581615, + "learning_rate": 8.21013876455046e-06, + "loss": 0.7117, + "step": 26150 + }, + { + "epoch": 0.561928083515917, + "grad_norm": 0.5168564344065179, + "learning_rate": 8.203432747653234e-06, + "loss": 0.7245, + "step": 26160 + }, + { + "epoch": 0.5621428878291876, + "grad_norm": 0.505045763218236, + "learning_rate": 8.19672756552726e-06, + "loss": 0.7176, + "step": 26170 + }, + { + "epoch": 0.5623576921424582, + "grad_norm": 0.5239596568078339, + "learning_rate": 8.190023221288088e-06, + "loss": 0.7326, + "step": 26180 + }, + { + "epoch": 0.5625724964557288, + "grad_norm": 0.5302666154295604, + "learning_rate": 8.183319718050873e-06, + "loss": 0.7422, + "step": 26190 + }, + { + "epoch": 0.5627873007689994, + "grad_norm": 0.5003281299438962, + "learning_rate": 8.176617058930385e-06, + "loss": 0.7284, + "step": 
26200 + }, + { + "epoch": 0.56300210508227, + "grad_norm": 0.49200878286488725, + "learning_rate": 8.169915247040998e-06, + "loss": 0.7313, + "step": 26210 + }, + { + "epoch": 0.5632169093955407, + "grad_norm": 0.49815572715612766, + "learning_rate": 8.163214285496704e-06, + "loss": 0.7216, + "step": 26220 + }, + { + "epoch": 0.5634317137088113, + "grad_norm": 0.5142148330083207, + "learning_rate": 8.15651417741108e-06, + "loss": 0.7239, + "step": 26230 + }, + { + "epoch": 0.5636465180220819, + "grad_norm": 0.49075756148020155, + "learning_rate": 8.149814925897327e-06, + "loss": 0.7295, + "step": 26240 + }, + { + "epoch": 0.5638613223353525, + "grad_norm": 0.5246558400473214, + "learning_rate": 8.143116534068231e-06, + "loss": 0.726, + "step": 26250 + }, + { + "epoch": 0.564076126648623, + "grad_norm": 0.5246290515230206, + "learning_rate": 8.136419005036186e-06, + "loss": 0.7182, + "step": 26260 + }, + { + "epoch": 0.5642909309618938, + "grad_norm": 0.5036297463223443, + "learning_rate": 8.129722341913192e-06, + "loss": 0.7384, + "step": 26270 + }, + { + "epoch": 0.5645057352751643, + "grad_norm": 0.527968106908949, + "learning_rate": 8.12302654781083e-06, + "loss": 0.7215, + "step": 26280 + }, + { + "epoch": 0.5647205395884349, + "grad_norm": 0.5225876564331369, + "learning_rate": 8.116331625840297e-06, + "loss": 0.7244, + "step": 26290 + }, + { + "epoch": 0.5649353439017055, + "grad_norm": 0.5436618520471259, + "learning_rate": 8.109637579112368e-06, + "loss": 0.7398, + "step": 26300 + }, + { + "epoch": 0.5651501482149761, + "grad_norm": 0.5148458404115451, + "learning_rate": 8.102944410737422e-06, + "loss": 0.7273, + "step": 26310 + }, + { + "epoch": 0.5653649525282468, + "grad_norm": 0.5015994626598883, + "learning_rate": 8.096252123825422e-06, + "loss": 0.7157, + "step": 26320 + }, + { + "epoch": 0.5655797568415174, + "grad_norm": 0.505696959132355, + "learning_rate": 8.089560721485922e-06, + "loss": 0.7234, + "step": 26330 + }, + { + "epoch": 0.565794561154788, + "grad_norm": 0.5359793879908359, + "learning_rate": 8.082870206828078e-06, + "loss": 0.72, + "step": 26340 + }, + { + "epoch": 0.5660093654680586, + "grad_norm": 0.5110305690812829, + "learning_rate": 8.076180582960618e-06, + "loss": 0.7234, + "step": 26350 + }, + { + "epoch": 0.5662241697813292, + "grad_norm": 0.498824135773946, + "learning_rate": 8.069491852991861e-06, + "loss": 0.7137, + "step": 26360 + }, + { + "epoch": 0.5664389740945999, + "grad_norm": 0.5071066304511312, + "learning_rate": 8.062804020029716e-06, + "loss": 0.7257, + "step": 26370 + }, + { + "epoch": 0.5666537784078705, + "grad_norm": 0.5382565877388203, + "learning_rate": 8.056117087181663e-06, + "loss": 0.7233, + "step": 26380 + }, + { + "epoch": 0.566868582721141, + "grad_norm": 0.507293946375138, + "learning_rate": 8.04943105755478e-06, + "loss": 0.7283, + "step": 26390 + }, + { + "epoch": 0.5670833870344116, + "grad_norm": 0.5280130888890053, + "learning_rate": 8.04274593425571e-06, + "loss": 0.7317, + "step": 26400 + }, + { + "epoch": 0.5672981913476822, + "grad_norm": 0.518867511724086, + "learning_rate": 8.03606172039069e-06, + "loss": 0.7167, + "step": 26410 + }, + { + "epoch": 0.5675129956609529, + "grad_norm": 0.5060431523298652, + "learning_rate": 8.029378419065515e-06, + "loss": 0.717, + "step": 26420 + }, + { + "epoch": 0.5677277999742235, + "grad_norm": 0.5063699893804817, + "learning_rate": 8.022696033385576e-06, + "loss": 0.7272, + "step": 26430 + }, + { + "epoch": 0.5679426042874941, + "grad_norm": 0.5240786574435802, + 
"learning_rate": 8.016014566455827e-06, + "loss": 0.7104, + "step": 26440 + }, + { + "epoch": 0.5681574086007647, + "grad_norm": 0.5023416444895722, + "learning_rate": 8.009334021380797e-06, + "loss": 0.7271, + "step": 26450 + }, + { + "epoch": 0.5683722129140353, + "grad_norm": 0.5364383380095676, + "learning_rate": 8.002654401264587e-06, + "loss": 0.7263, + "step": 26460 + }, + { + "epoch": 0.568587017227306, + "grad_norm": 0.5281110306818434, + "learning_rate": 7.995975709210869e-06, + "loss": 0.7145, + "step": 26470 + }, + { + "epoch": 0.5688018215405766, + "grad_norm": 0.5172649393256681, + "learning_rate": 7.989297948322885e-06, + "loss": 0.7279, + "step": 26480 + }, + { + "epoch": 0.5690166258538472, + "grad_norm": 0.4984810683384183, + "learning_rate": 7.982621121703438e-06, + "loss": 0.7017, + "step": 26490 + }, + { + "epoch": 0.5692314301671177, + "grad_norm": 0.5013994336883097, + "learning_rate": 7.97594523245491e-06, + "loss": 0.7052, + "step": 26500 + }, + { + "epoch": 0.5694462344803883, + "grad_norm": 0.4962092438638248, + "learning_rate": 7.969270283679233e-06, + "loss": 0.7145, + "step": 26510 + }, + { + "epoch": 0.5696610387936589, + "grad_norm": 0.5113556248884656, + "learning_rate": 7.96259627847791e-06, + "loss": 0.7182, + "step": 26520 + }, + { + "epoch": 0.5698758431069296, + "grad_norm": 0.5049922401075675, + "learning_rate": 7.955923219952002e-06, + "loss": 0.7093, + "step": 26530 + }, + { + "epoch": 0.5700906474202002, + "grad_norm": 0.5033957581062982, + "learning_rate": 7.949251111202132e-06, + "loss": 0.7274, + "step": 26540 + }, + { + "epoch": 0.5703054517334708, + "grad_norm": 0.5356557064198388, + "learning_rate": 7.942579955328485e-06, + "loss": 0.7241, + "step": 26550 + }, + { + "epoch": 0.5705202560467414, + "grad_norm": 0.5345233979565142, + "learning_rate": 7.935909755430797e-06, + "loss": 0.7334, + "step": 26560 + }, + { + "epoch": 0.570735060360012, + "grad_norm": 0.49378966951960895, + "learning_rate": 7.92924051460836e-06, + "loss": 0.7388, + "step": 26570 + }, + { + "epoch": 0.5709498646732827, + "grad_norm": 0.5077813116094394, + "learning_rate": 7.92257223596003e-06, + "loss": 0.7226, + "step": 26580 + }, + { + "epoch": 0.5711646689865533, + "grad_norm": 0.5242154347144848, + "learning_rate": 7.9159049225842e-06, + "loss": 0.7293, + "step": 26590 + }, + { + "epoch": 0.5713794732998239, + "grad_norm": 0.4939088384409381, + "learning_rate": 7.90923857757883e-06, + "loss": 0.7351, + "step": 26600 + }, + { + "epoch": 0.5715942776130944, + "grad_norm": 0.49854560906783285, + "learning_rate": 7.902573204041422e-06, + "loss": 0.7112, + "step": 26610 + }, + { + "epoch": 0.571809081926365, + "grad_norm": 0.489407868294081, + "learning_rate": 7.895908805069026e-06, + "loss": 0.731, + "step": 26620 + }, + { + "epoch": 0.5720238862396357, + "grad_norm": 0.5063998256679312, + "learning_rate": 7.889245383758247e-06, + "loss": 0.733, + "step": 26630 + }, + { + "epoch": 0.5722386905529063, + "grad_norm": 0.5118782447758873, + "learning_rate": 7.882582943205218e-06, + "loss": 0.7172, + "step": 26640 + }, + { + "epoch": 0.5724534948661769, + "grad_norm": 0.4961115360495314, + "learning_rate": 7.87592148650564e-06, + "loss": 0.7183, + "step": 26650 + }, + { + "epoch": 0.5726682991794475, + "grad_norm": 0.5280800091056106, + "learning_rate": 7.869261016754736e-06, + "loss": 0.7295, + "step": 26660 + }, + { + "epoch": 0.5728831034927181, + "grad_norm": 0.5289805883203285, + "learning_rate": 7.862601537047289e-06, + "loss": 0.7265, + "step": 26670 + }, + { + "epoch": 
0.5730979078059888, + "grad_norm": 0.5193534190312329, + "learning_rate": 7.855943050477605e-06, + "loss": 0.7333, + "step": 26680 + }, + { + "epoch": 0.5733127121192594, + "grad_norm": 0.5290783598213893, + "learning_rate": 7.84928556013954e-06, + "loss": 0.7221, + "step": 26690 + }, + { + "epoch": 0.57352751643253, + "grad_norm": 0.5127746177226163, + "learning_rate": 7.842629069126484e-06, + "loss": 0.7388, + "step": 26700 + }, + { + "epoch": 0.5737423207458006, + "grad_norm": 0.512414892050149, + "learning_rate": 7.835973580531353e-06, + "loss": 0.7284, + "step": 26710 + }, + { + "epoch": 0.5739571250590711, + "grad_norm": 0.5029147333178203, + "learning_rate": 7.829319097446617e-06, + "loss": 0.716, + "step": 26720 + }, + { + "epoch": 0.5741719293723418, + "grad_norm": 0.520040331471592, + "learning_rate": 7.82266562296426e-06, + "loss": 0.7159, + "step": 26730 + }, + { + "epoch": 0.5743867336856124, + "grad_norm": 0.4978396177758656, + "learning_rate": 7.816013160175812e-06, + "loss": 0.7208, + "step": 26740 + }, + { + "epoch": 0.574601537998883, + "grad_norm": 0.538410253835665, + "learning_rate": 7.809361712172322e-06, + "loss": 0.7146, + "step": 26750 + }, + { + "epoch": 0.5748163423121536, + "grad_norm": 0.5071511006184085, + "learning_rate": 7.802711282044366e-06, + "loss": 0.7314, + "step": 26760 + }, + { + "epoch": 0.5750311466254242, + "grad_norm": 0.5026752106850176, + "learning_rate": 7.796061872882065e-06, + "loss": 0.7353, + "step": 26770 + }, + { + "epoch": 0.5752459509386948, + "grad_norm": 0.4928490982584089, + "learning_rate": 7.789413487775043e-06, + "loss": 0.7161, + "step": 26780 + }, + { + "epoch": 0.5754607552519655, + "grad_norm": 0.5054902551078443, + "learning_rate": 7.782766129812463e-06, + "loss": 0.7227, + "step": 26790 + }, + { + "epoch": 0.5756755595652361, + "grad_norm": 0.5106529312162003, + "learning_rate": 7.776119802083001e-06, + "loss": 0.7173, + "step": 26800 + }, + { + "epoch": 0.5758903638785067, + "grad_norm": 0.5157371052564345, + "learning_rate": 7.76947450767487e-06, + "loss": 0.7122, + "step": 26810 + }, + { + "epoch": 0.5761051681917773, + "grad_norm": 0.5284843076164706, + "learning_rate": 7.762830249675785e-06, + "loss": 0.7256, + "step": 26820 + }, + { + "epoch": 0.5763199725050479, + "grad_norm": 0.5456146407952996, + "learning_rate": 7.756187031172984e-06, + "loss": 0.7391, + "step": 26830 + }, + { + "epoch": 0.5765347768183186, + "grad_norm": 0.4917659110447439, + "learning_rate": 7.74954485525323e-06, + "loss": 0.7185, + "step": 26840 + }, + { + "epoch": 0.5767495811315891, + "grad_norm": 0.506243431890311, + "learning_rate": 7.74290372500279e-06, + "loss": 0.7367, + "step": 26850 + }, + { + "epoch": 0.5769643854448597, + "grad_norm": 0.506566764718463, + "learning_rate": 7.736263643507462e-06, + "loss": 0.7336, + "step": 26860 + }, + { + "epoch": 0.5771791897581303, + "grad_norm": 0.5188308701341517, + "learning_rate": 7.729624613852533e-06, + "loss": 0.7165, + "step": 26870 + }, + { + "epoch": 0.5773939940714009, + "grad_norm": 0.5149438496598627, + "learning_rate": 7.722986639122827e-06, + "loss": 0.7287, + "step": 26880 + }, + { + "epoch": 0.5776087983846716, + "grad_norm": 0.5235992368572671, + "learning_rate": 7.71634972240266e-06, + "loss": 0.7174, + "step": 26890 + }, + { + "epoch": 0.5778236026979422, + "grad_norm": 0.5113815635765164, + "learning_rate": 7.70971386677586e-06, + "loss": 0.7109, + "step": 26900 + }, + { + "epoch": 0.5780384070112128, + "grad_norm": 0.49355622034783786, + "learning_rate": 7.703079075325764e-06, 
+ "loss": 0.7103, + "step": 26910 + }, + { + "epoch": 0.5782532113244834, + "grad_norm": 0.519299512412156, + "learning_rate": 7.69644535113521e-06, + "loss": 0.7369, + "step": 26920 + }, + { + "epoch": 0.578468015637754, + "grad_norm": 0.5310388569132752, + "learning_rate": 7.689812697286555e-06, + "loss": 0.7188, + "step": 26930 + }, + { + "epoch": 0.5786828199510247, + "grad_norm": 0.5168701357967611, + "learning_rate": 7.683181116861642e-06, + "loss": 0.73, + "step": 26940 + }, + { + "epoch": 0.5788976242642953, + "grad_norm": 0.5042983692202927, + "learning_rate": 7.676550612941816e-06, + "loss": 0.7135, + "step": 26950 + }, + { + "epoch": 0.5791124285775658, + "grad_norm": 0.5177162051099963, + "learning_rate": 7.669921188607935e-06, + "loss": 0.7394, + "step": 26960 + }, + { + "epoch": 0.5793272328908364, + "grad_norm": 0.5119619648123758, + "learning_rate": 7.663292846940343e-06, + "loss": 0.7207, + "step": 26970 + }, + { + "epoch": 0.579542037204107, + "grad_norm": 0.5362452673286001, + "learning_rate": 7.656665591018887e-06, + "loss": 0.7143, + "step": 26980 + }, + { + "epoch": 0.5797568415173777, + "grad_norm": 0.5129247448122536, + "learning_rate": 7.650039423922905e-06, + "loss": 0.7224, + "step": 26990 + }, + { + "epoch": 0.5799716458306483, + "grad_norm": 0.5088798828391344, + "learning_rate": 7.643414348731237e-06, + "loss": 0.7148, + "step": 27000 + }, + { + "epoch": 0.5801864501439189, + "grad_norm": 0.5027910917195445, + "learning_rate": 7.636790368522208e-06, + "loss": 0.7219, + "step": 27010 + }, + { + "epoch": 0.5804012544571895, + "grad_norm": 0.5185537348120938, + "learning_rate": 7.630167486373632e-06, + "loss": 0.7221, + "step": 27020 + }, + { + "epoch": 0.5806160587704601, + "grad_norm": 0.5272726326291755, + "learning_rate": 7.623545705362822e-06, + "loss": 0.7175, + "step": 27030 + }, + { + "epoch": 0.5808308630837308, + "grad_norm": 0.5166485329784066, + "learning_rate": 7.616925028566575e-06, + "loss": 0.7341, + "step": 27040 + }, + { + "epoch": 0.5810456673970014, + "grad_norm": 0.5315842783462765, + "learning_rate": 7.6103054590611755e-06, + "loss": 0.7391, + "step": 27050 + }, + { + "epoch": 0.581260471710272, + "grad_norm": 0.5015083854889525, + "learning_rate": 7.603686999922386e-06, + "loss": 0.7144, + "step": 27060 + }, + { + "epoch": 0.5814752760235425, + "grad_norm": 0.5184207030120949, + "learning_rate": 7.597069654225471e-06, + "loss": 0.7248, + "step": 27070 + }, + { + "epoch": 0.5816900803368131, + "grad_norm": 0.5316867610900236, + "learning_rate": 7.590453425045159e-06, + "loss": 0.733, + "step": 27080 + }, + { + "epoch": 0.5819048846500837, + "grad_norm": 0.4965452979386377, + "learning_rate": 7.583838315455665e-06, + "loss": 0.7204, + "step": 27090 + }, + { + "epoch": 0.5821196889633544, + "grad_norm": 0.5297374352101749, + "learning_rate": 7.577224328530694e-06, + "loss": 0.7227, + "step": 27100 + }, + { + "epoch": 0.582334493276625, + "grad_norm": 0.5359023033266158, + "learning_rate": 7.570611467343414e-06, + "loss": 0.7341, + "step": 27110 + }, + { + "epoch": 0.5825492975898956, + "grad_norm": 0.5131959097511573, + "learning_rate": 7.563999734966483e-06, + "loss": 0.7137, + "step": 27120 + }, + { + "epoch": 0.5827641019031662, + "grad_norm": 0.5332592734188711, + "learning_rate": 7.557389134472021e-06, + "loss": 0.7339, + "step": 27130 + }, + { + "epoch": 0.5829789062164368, + "grad_norm": 0.4950524149977223, + "learning_rate": 7.550779668931641e-06, + "loss": 0.7278, + "step": 27140 + }, + { + "epoch": 0.5831937105297075, + "grad_norm": 
0.4979574277254489, + "learning_rate": 7.544171341416409e-06, + "loss": 0.7116, + "step": 27150 + }, + { + "epoch": 0.5834085148429781, + "grad_norm": 0.5184623277698175, + "learning_rate": 7.537564154996871e-06, + "loss": 0.7171, + "step": 27160 + }, + { + "epoch": 0.5836233191562487, + "grad_norm": 0.5028351179542662, + "learning_rate": 7.530958112743048e-06, + "loss": 0.7074, + "step": 27170 + }, + { + "epoch": 0.5838381234695192, + "grad_norm": 0.5140269218645958, + "learning_rate": 7.524353217724414e-06, + "loss": 0.7183, + "step": 27180 + }, + { + "epoch": 0.5840529277827898, + "grad_norm": 0.5116611764499623, + "learning_rate": 7.517749473009931e-06, + "loss": 0.731, + "step": 27190 + }, + { + "epoch": 0.5842677320960605, + "grad_norm": 0.5034040525475355, + "learning_rate": 7.51114688166801e-06, + "loss": 0.7026, + "step": 27200 + }, + { + "epoch": 0.5844825364093311, + "grad_norm": 0.5120240441103905, + "learning_rate": 7.50454544676653e-06, + "loss": 0.7298, + "step": 27210 + }, + { + "epoch": 0.5846973407226017, + "grad_norm": 0.5141277473304001, + "learning_rate": 7.497945171372838e-06, + "loss": 0.7169, + "step": 27220 + }, + { + "epoch": 0.5849121450358723, + "grad_norm": 0.5092099955268595, + "learning_rate": 7.4913460585537314e-06, + "loss": 0.7227, + "step": 27230 + }, + { + "epoch": 0.5851269493491429, + "grad_norm": 0.5394087392182072, + "learning_rate": 7.484748111375482e-06, + "loss": 0.7368, + "step": 27240 + }, + { + "epoch": 0.5853417536624136, + "grad_norm": 0.5296527582320034, + "learning_rate": 7.478151332903807e-06, + "loss": 0.7249, + "step": 27250 + }, + { + "epoch": 0.5855565579756842, + "grad_norm": 0.527906994069362, + "learning_rate": 7.4715557262038904e-06, + "loss": 0.702, + "step": 27260 + }, + { + "epoch": 0.5857713622889548, + "grad_norm": 0.49003860514458714, + "learning_rate": 7.464961294340366e-06, + "loss": 0.7216, + "step": 27270 + }, + { + "epoch": 0.5859861666022254, + "grad_norm": 0.5158001608175528, + "learning_rate": 7.45836804037732e-06, + "loss": 0.7076, + "step": 27280 + }, + { + "epoch": 0.586200970915496, + "grad_norm": 0.5189638533176454, + "learning_rate": 7.451775967378296e-06, + "loss": 0.7302, + "step": 27290 + }, + { + "epoch": 0.5864157752287666, + "grad_norm": 0.5114303128229795, + "learning_rate": 7.4451850784062825e-06, + "loss": 0.7195, + "step": 27300 + }, + { + "epoch": 0.5866305795420372, + "grad_norm": 0.5141598815151798, + "learning_rate": 7.438595376523734e-06, + "loss": 0.7259, + "step": 27310 + }, + { + "epoch": 0.5868453838553078, + "grad_norm": 0.5115551026689363, + "learning_rate": 7.432006864792529e-06, + "loss": 0.7363, + "step": 27320 + }, + { + "epoch": 0.5870601881685784, + "grad_norm": 0.5067452647479966, + "learning_rate": 7.4254195462740165e-06, + "loss": 0.7088, + "step": 27330 + }, + { + "epoch": 0.587274992481849, + "grad_norm": 0.5201913281991899, + "learning_rate": 7.418833424028974e-06, + "loss": 0.7268, + "step": 27340 + }, + { + "epoch": 0.5874897967951197, + "grad_norm": 0.5178789741191531, + "learning_rate": 7.412248501117627e-06, + "loss": 0.701, + "step": 27350 + }, + { + "epoch": 0.5877046011083903, + "grad_norm": 0.5059570164144281, + "learning_rate": 7.4056647805996526e-06, + "loss": 0.7159, + "step": 27360 + }, + { + "epoch": 0.5879194054216609, + "grad_norm": 0.5130596201558596, + "learning_rate": 7.399082265534161e-06, + "loss": 0.7208, + "step": 27370 + }, + { + "epoch": 0.5881342097349315, + "grad_norm": 0.5129021388577076, + "learning_rate": 7.392500958979705e-06, + "loss": 0.7268, + 
"step": 27380 + }, + { + "epoch": 0.5883490140482021, + "grad_norm": 0.49928892977313877, + "learning_rate": 7.385920863994273e-06, + "loss": 0.7291, + "step": 27390 + }, + { + "epoch": 0.5885638183614726, + "grad_norm": 0.5013211052439496, + "learning_rate": 7.3793419836352884e-06, + "loss": 0.7319, + "step": 27400 + }, + { + "epoch": 0.5887786226747433, + "grad_norm": 0.5333496221119072, + "learning_rate": 7.372764320959624e-06, + "loss": 0.7174, + "step": 27410 + }, + { + "epoch": 0.5889934269880139, + "grad_norm": 0.516097492110587, + "learning_rate": 7.366187879023572e-06, + "loss": 0.7144, + "step": 27420 + }, + { + "epoch": 0.5892082313012845, + "grad_norm": 0.5135516714260109, + "learning_rate": 7.359612660882862e-06, + "loss": 0.7107, + "step": 27430 + }, + { + "epoch": 0.5894230356145551, + "grad_norm": 0.5339129088339684, + "learning_rate": 7.353038669592654e-06, + "loss": 0.7238, + "step": 27440 + }, + { + "epoch": 0.5896378399278257, + "grad_norm": 0.4964474003475041, + "learning_rate": 7.346465908207545e-06, + "loss": 0.7176, + "step": 27450 + }, + { + "epoch": 0.5898526442410964, + "grad_norm": 0.501452498338148, + "learning_rate": 7.339894379781551e-06, + "loss": 0.7131, + "step": 27460 + }, + { + "epoch": 0.590067448554367, + "grad_norm": 0.5133436088736439, + "learning_rate": 7.333324087368117e-06, + "loss": 0.7215, + "step": 27470 + }, + { + "epoch": 0.5902822528676376, + "grad_norm": 0.5192593089145867, + "learning_rate": 7.326755034020122e-06, + "loss": 0.7162, + "step": 27480 + }, + { + "epoch": 0.5904970571809082, + "grad_norm": 0.5477084008394134, + "learning_rate": 7.320187222789856e-06, + "loss": 0.7042, + "step": 27490 + }, + { + "epoch": 0.5907118614941788, + "grad_norm": 0.5180536818105407, + "learning_rate": 7.3136206567290465e-06, + "loss": 0.7305, + "step": 27500 + }, + { + "epoch": 0.5909266658074495, + "grad_norm": 0.502178932682032, + "learning_rate": 7.307055338888826e-06, + "loss": 0.7139, + "step": 27510 + }, + { + "epoch": 0.59114147012072, + "grad_norm": 0.5157309880089097, + "learning_rate": 7.300491272319764e-06, + "loss": 0.7163, + "step": 27520 + }, + { + "epoch": 0.5913562744339906, + "grad_norm": 0.5148147550554781, + "learning_rate": 7.293928460071838e-06, + "loss": 0.7175, + "step": 27530 + }, + { + "epoch": 0.5915710787472612, + "grad_norm": 0.5082539143510494, + "learning_rate": 7.287366905194439e-06, + "loss": 0.7155, + "step": 27540 + }, + { + "epoch": 0.5917858830605318, + "grad_norm": 0.5141335882021415, + "learning_rate": 7.280806610736391e-06, + "loss": 0.7287, + "step": 27550 + }, + { + "epoch": 0.5920006873738025, + "grad_norm": 0.48870302456568654, + "learning_rate": 7.274247579745908e-06, + "loss": 0.717, + "step": 27560 + }, + { + "epoch": 0.5922154916870731, + "grad_norm": 0.514022070926401, + "learning_rate": 7.267689815270642e-06, + "loss": 0.7339, + "step": 27570 + }, + { + "epoch": 0.5924302960003437, + "grad_norm": 0.5135139016976253, + "learning_rate": 7.261133320357641e-06, + "loss": 0.7198, + "step": 27580 + }, + { + "epoch": 0.5926451003136143, + "grad_norm": 0.53429824873582, + "learning_rate": 7.254578098053362e-06, + "loss": 0.7349, + "step": 27590 + }, + { + "epoch": 0.5928599046268849, + "grad_norm": 0.5058250349786495, + "learning_rate": 7.248024151403682e-06, + "loss": 0.724, + "step": 27600 + }, + { + "epoch": 0.5930747089401556, + "grad_norm": 0.5206560440876598, + "learning_rate": 7.2414714834538725e-06, + "loss": 0.7337, + "step": 27610 + }, + { + "epoch": 0.5932895132534262, + "grad_norm": 
0.48980099237908875, + "learning_rate": 7.234920097248623e-06, + "loss": 0.7181, + "step": 27620 + }, + { + "epoch": 0.5935043175666967, + "grad_norm": 0.5115417079620694, + "learning_rate": 7.228369995832015e-06, + "loss": 0.7297, + "step": 27630 + }, + { + "epoch": 0.5937191218799673, + "grad_norm": 0.5016557907859813, + "learning_rate": 7.221821182247548e-06, + "loss": 0.732, + "step": 27640 + }, + { + "epoch": 0.5939339261932379, + "grad_norm": 0.49835395618153433, + "learning_rate": 7.215273659538114e-06, + "loss": 0.7145, + "step": 27650 + }, + { + "epoch": 0.5941487305065085, + "grad_norm": 0.5169175369018176, + "learning_rate": 7.2087274307459945e-06, + "loss": 0.7091, + "step": 27660 + }, + { + "epoch": 0.5943635348197792, + "grad_norm": 0.5068225266138379, + "learning_rate": 7.2021824989128915e-06, + "loss": 0.7318, + "step": 27670 + }, + { + "epoch": 0.5945783391330498, + "grad_norm": 0.5091346931786659, + "learning_rate": 7.195638867079889e-06, + "loss": 0.7203, + "step": 27680 + }, + { + "epoch": 0.5947931434463204, + "grad_norm": 0.5271726430775541, + "learning_rate": 7.1890965382874765e-06, + "loss": 0.7318, + "step": 27690 + }, + { + "epoch": 0.595007947759591, + "grad_norm": 0.5205082559060062, + "learning_rate": 7.182555515575531e-06, + "loss": 0.7221, + "step": 27700 + }, + { + "epoch": 0.5952227520728616, + "grad_norm": 0.5108228067074766, + "learning_rate": 7.176015801983326e-06, + "loss": 0.7186, + "step": 27710 + }, + { + "epoch": 0.5954375563861323, + "grad_norm": 0.5032605086696729, + "learning_rate": 7.169477400549525e-06, + "loss": 0.7159, + "step": 27720 + }, + { + "epoch": 0.5956523606994029, + "grad_norm": 0.5143877348374966, + "learning_rate": 7.16294031431218e-06, + "loss": 0.7161, + "step": 27730 + }, + { + "epoch": 0.5958671650126734, + "grad_norm": 0.5175753229468208, + "learning_rate": 7.156404546308741e-06, + "loss": 0.7241, + "step": 27740 + }, + { + "epoch": 0.596081969325944, + "grad_norm": 0.5160103076524589, + "learning_rate": 7.149870099576033e-06, + "loss": 0.722, + "step": 27750 + }, + { + "epoch": 0.5962967736392146, + "grad_norm": 0.5276236091636177, + "learning_rate": 7.143336977150278e-06, + "loss": 0.7271, + "step": 27760 + }, + { + "epoch": 0.5965115779524853, + "grad_norm": 0.5115582800303922, + "learning_rate": 7.136805182067074e-06, + "loss": 0.7276, + "step": 27770 + }, + { + "epoch": 0.5967263822657559, + "grad_norm": 0.5007489074574742, + "learning_rate": 7.130274717361405e-06, + "loss": 0.7101, + "step": 27780 + }, + { + "epoch": 0.5969411865790265, + "grad_norm": 0.5247645617356222, + "learning_rate": 7.123745586067645e-06, + "loss": 0.7214, + "step": 27790 + }, + { + "epoch": 0.5971559908922971, + "grad_norm": 0.509054996286859, + "learning_rate": 7.117217791219533e-06, + "loss": 0.7062, + "step": 27800 + }, + { + "epoch": 0.5973707952055677, + "grad_norm": 0.5146383947427272, + "learning_rate": 7.110691335850202e-06, + "loss": 0.727, + "step": 27810 + }, + { + "epoch": 0.5975855995188384, + "grad_norm": 0.5107471000601168, + "learning_rate": 7.1041662229921485e-06, + "loss": 0.7078, + "step": 27820 + }, + { + "epoch": 0.597800403832109, + "grad_norm": 0.48699390234221435, + "learning_rate": 7.097642455677261e-06, + "loss": 0.7127, + "step": 27830 + }, + { + "epoch": 0.5980152081453796, + "grad_norm": 0.48779003203616056, + "learning_rate": 7.091120036936791e-06, + "loss": 0.713, + "step": 27840 + }, + { + "epoch": 0.5982300124586502, + "grad_norm": 0.5073544067387802, + "learning_rate": 7.084598969801362e-06, + "loss": 0.7194, + 
"step": 27850 + }, + { + "epoch": 0.5984448167719207, + "grad_norm": 0.5143979161778222, + "learning_rate": 7.0780792573009835e-06, + "loss": 0.7171, + "step": 27860 + }, + { + "epoch": 0.5986596210851914, + "grad_norm": 0.5100260027061173, + "learning_rate": 7.07156090246502e-06, + "loss": 0.7219, + "step": 27870 + }, + { + "epoch": 0.598874425398462, + "grad_norm": 0.5080855464944665, + "learning_rate": 7.065043908322214e-06, + "loss": 0.7288, + "step": 27880 + }, + { + "epoch": 0.5990892297117326, + "grad_norm": 0.503233048103245, + "learning_rate": 7.058528277900669e-06, + "loss": 0.708, + "step": 27890 + }, + { + "epoch": 0.5993040340250032, + "grad_norm": 0.5262280681492298, + "learning_rate": 7.052014014227866e-06, + "loss": 0.7167, + "step": 27900 + }, + { + "epoch": 0.5995188383382738, + "grad_norm": 0.522581486548443, + "learning_rate": 7.045501120330642e-06, + "loss": 0.7272, + "step": 27910 + }, + { + "epoch": 0.5997336426515445, + "grad_norm": 0.5104789267173563, + "learning_rate": 7.0389895992351956e-06, + "loss": 0.7087, + "step": 27920 + }, + { + "epoch": 0.5999484469648151, + "grad_norm": 0.5146754901769843, + "learning_rate": 7.032479453967097e-06, + "loss": 0.728, + "step": 27930 + }, + { + "epoch": 0.6001632512780857, + "grad_norm": 0.5116496747974748, + "learning_rate": 7.0259706875512645e-06, + "loss": 0.7125, + "step": 27940 + }, + { + "epoch": 0.6003780555913563, + "grad_norm": 0.521148762194517, + "learning_rate": 7.019463303011993e-06, + "loss": 0.7174, + "step": 27950 + }, + { + "epoch": 0.6005928599046269, + "grad_norm": 0.5311474419836788, + "learning_rate": 7.012957303372918e-06, + "loss": 0.7239, + "step": 27960 + }, + { + "epoch": 0.6008076642178974, + "grad_norm": 0.5235689594460002, + "learning_rate": 7.006452691657039e-06, + "loss": 0.7212, + "step": 27970 + }, + { + "epoch": 0.6010224685311681, + "grad_norm": 0.5092035810607163, + "learning_rate": 6.999949470886715e-06, + "loss": 0.7092, + "step": 27980 + }, + { + "epoch": 0.6012372728444387, + "grad_norm": 0.4938851390556326, + "learning_rate": 6.9934476440836465e-06, + "loss": 0.7244, + "step": 27990 + }, + { + "epoch": 0.6014520771577093, + "grad_norm": 0.5309729653806932, + "learning_rate": 6.986947214268902e-06, + "loss": 0.7263, + "step": 28000 + }, + { + "epoch": 0.6016668814709799, + "grad_norm": 0.5203026110014208, + "learning_rate": 6.980448184462887e-06, + "loss": 0.707, + "step": 28010 + }, + { + "epoch": 0.6018816857842505, + "grad_norm": 0.508186230193396, + "learning_rate": 6.973950557685366e-06, + "loss": 0.7188, + "step": 28020 + }, + { + "epoch": 0.6020964900975212, + "grad_norm": 0.5101290046607262, + "learning_rate": 6.967454336955447e-06, + "loss": 0.7261, + "step": 28030 + }, + { + "epoch": 0.6023112944107918, + "grad_norm": 0.5004521118510085, + "learning_rate": 6.96095952529158e-06, + "loss": 0.7097, + "step": 28040 + }, + { + "epoch": 0.6025260987240624, + "grad_norm": 0.49710168230451124, + "learning_rate": 6.9544661257115745e-06, + "loss": 0.7275, + "step": 28050 + }, + { + "epoch": 0.602740903037333, + "grad_norm": 0.5324092271912054, + "learning_rate": 6.947974141232568e-06, + "loss": 0.7176, + "step": 28060 + }, + { + "epoch": 0.6029557073506036, + "grad_norm": 0.5254153398251872, + "learning_rate": 6.9414835748710525e-06, + "loss": 0.7329, + "step": 28070 + }, + { + "epoch": 0.6031705116638743, + "grad_norm": 0.5018438803534653, + "learning_rate": 6.934994429642854e-06, + "loss": 0.7114, + "step": 28080 + }, + { + "epoch": 0.6033853159771448, + "grad_norm": 
0.5110608431528573, + "learning_rate": 6.928506708563142e-06, + "loss": 0.7175, + "step": 28090 + }, + { + "epoch": 0.6036001202904154, + "grad_norm": 0.5110626161063511, + "learning_rate": 6.922020414646422e-06, + "loss": 0.7228, + "step": 28100 + }, + { + "epoch": 0.603814924603686, + "grad_norm": 0.5165958575854644, + "learning_rate": 6.915535550906532e-06, + "loss": 0.7282, + "step": 28110 + }, + { + "epoch": 0.6040297289169566, + "grad_norm": 0.5252321669011311, + "learning_rate": 6.909052120356659e-06, + "loss": 0.7218, + "step": 28120 + }, + { + "epoch": 0.6042445332302273, + "grad_norm": 0.5145699979700377, + "learning_rate": 6.902570126009309e-06, + "loss": 0.721, + "step": 28130 + }, + { + "epoch": 0.6044593375434979, + "grad_norm": 0.5257878272393476, + "learning_rate": 6.8960895708763335e-06, + "loss": 0.7236, + "step": 28140 + }, + { + "epoch": 0.6046741418567685, + "grad_norm": 0.5107902953084463, + "learning_rate": 6.8896104579689026e-06, + "loss": 0.7139, + "step": 28150 + }, + { + "epoch": 0.6048889461700391, + "grad_norm": 0.5187512853597482, + "learning_rate": 6.8831327902975245e-06, + "loss": 0.7211, + "step": 28160 + }, + { + "epoch": 0.6051037504833097, + "grad_norm": 0.5179836553200129, + "learning_rate": 6.876656570872036e-06, + "loss": 0.7157, + "step": 28170 + }, + { + "epoch": 0.6053185547965804, + "grad_norm": 0.4961138882817056, + "learning_rate": 6.870181802701596e-06, + "loss": 0.7144, + "step": 28180 + }, + { + "epoch": 0.605533359109851, + "grad_norm": 0.4994657394772925, + "learning_rate": 6.8637084887946985e-06, + "loss": 0.7154, + "step": 28190 + }, + { + "epoch": 0.6057481634231215, + "grad_norm": 0.5360515600802547, + "learning_rate": 6.857236632159146e-06, + "loss": 0.6994, + "step": 28200 + }, + { + "epoch": 0.6059629677363921, + "grad_norm": 0.49991257688614876, + "learning_rate": 6.85076623580208e-06, + "loss": 0.7249, + "step": 28210 + }, + { + "epoch": 0.6061777720496627, + "grad_norm": 0.5108166128653483, + "learning_rate": 6.8442973027299565e-06, + "loss": 0.7229, + "step": 28220 + }, + { + "epoch": 0.6063925763629334, + "grad_norm": 0.49773101558129873, + "learning_rate": 6.837829835948546e-06, + "loss": 0.7091, + "step": 28230 + }, + { + "epoch": 0.606607380676204, + "grad_norm": 0.505655201231906, + "learning_rate": 6.8313638384629525e-06, + "loss": 0.7206, + "step": 28240 + }, + { + "epoch": 0.6068221849894746, + "grad_norm": 0.5131719935934191, + "learning_rate": 6.824899313277582e-06, + "loss": 0.7252, + "step": 28250 + }, + { + "epoch": 0.6070369893027452, + "grad_norm": 0.503882580363197, + "learning_rate": 6.8184362633961655e-06, + "loss": 0.7305, + "step": 28260 + }, + { + "epoch": 0.6072517936160158, + "grad_norm": 0.5001026171381061, + "learning_rate": 6.811974691821741e-06, + "loss": 0.7131, + "step": 28270 + }, + { + "epoch": 0.6074665979292864, + "grad_norm": 0.5018092976471628, + "learning_rate": 6.805514601556671e-06, + "loss": 0.7179, + "step": 28280 + }, + { + "epoch": 0.6076814022425571, + "grad_norm": 0.507419770302612, + "learning_rate": 6.799055995602621e-06, + "loss": 0.7056, + "step": 28290 + }, + { + "epoch": 0.6078962065558277, + "grad_norm": 0.5171706639932914, + "learning_rate": 6.7925988769605656e-06, + "loss": 0.7265, + "step": 28300 + }, + { + "epoch": 0.6081110108690982, + "grad_norm": 0.5057274040421198, + "learning_rate": 6.786143248630795e-06, + "loss": 0.7055, + "step": 28310 + }, + { + "epoch": 0.6083258151823688, + "grad_norm": 0.5132979835569406, + "learning_rate": 6.7796891136129e-06, + "loss": 0.7117, + 
"step": 28320 + }, + { + "epoch": 0.6085406194956394, + "grad_norm": 0.5100482558341333, + "learning_rate": 6.773236474905786e-06, + "loss": 0.7099, + "step": 28330 + }, + { + "epoch": 0.6087554238089101, + "grad_norm": 0.5068203945434347, + "learning_rate": 6.7667853355076565e-06, + "loss": 0.7152, + "step": 28340 + }, + { + "epoch": 0.6089702281221807, + "grad_norm": 0.4922194906275137, + "learning_rate": 6.760335698416016e-06, + "loss": 0.7248, + "step": 28350 + }, + { + "epoch": 0.6091850324354513, + "grad_norm": 0.4940640802222499, + "learning_rate": 6.753887566627682e-06, + "loss": 0.7088, + "step": 28360 + }, + { + "epoch": 0.6093998367487219, + "grad_norm": 0.4917970774712095, + "learning_rate": 6.747440943138757e-06, + "loss": 0.7041, + "step": 28370 + }, + { + "epoch": 0.6096146410619925, + "grad_norm": 0.5157876999086878, + "learning_rate": 6.740995830944658e-06, + "loss": 0.7242, + "step": 28380 + }, + { + "epoch": 0.6098294453752632, + "grad_norm": 0.638677062933045, + "learning_rate": 6.7345522330400915e-06, + "loss": 0.7107, + "step": 28390 + }, + { + "epoch": 0.6100442496885338, + "grad_norm": 0.5291704161918205, + "learning_rate": 6.728110152419063e-06, + "loss": 0.7259, + "step": 28400 + }, + { + "epoch": 0.6102590540018044, + "grad_norm": 0.5435155538816592, + "learning_rate": 6.7216695920748694e-06, + "loss": 0.7292, + "step": 28410 + }, + { + "epoch": 0.610473858315075, + "grad_norm": 0.5249535872174969, + "learning_rate": 6.715230555000102e-06, + "loss": 0.7124, + "step": 28420 + }, + { + "epoch": 0.6106886626283455, + "grad_norm": 0.5168556551996696, + "learning_rate": 6.708793044186652e-06, + "loss": 0.7135, + "step": 28430 + }, + { + "epoch": 0.6109034669416162, + "grad_norm": 0.506513826242894, + "learning_rate": 6.702357062625689e-06, + "loss": 0.7171, + "step": 28440 + }, + { + "epoch": 0.6111182712548868, + "grad_norm": 0.5122231459063203, + "learning_rate": 6.695922613307684e-06, + "loss": 0.7366, + "step": 28450 + }, + { + "epoch": 0.6113330755681574, + "grad_norm": 0.5266734325137272, + "learning_rate": 6.689489699222387e-06, + "loss": 0.7177, + "step": 28460 + }, + { + "epoch": 0.611547879881428, + "grad_norm": 0.5249480955219177, + "learning_rate": 6.6830583233588406e-06, + "loss": 0.7177, + "step": 28470 + }, + { + "epoch": 0.6117626841946986, + "grad_norm": 0.536864147141546, + "learning_rate": 6.67662848870537e-06, + "loss": 0.7334, + "step": 28480 + }, + { + "epoch": 0.6119774885079693, + "grad_norm": 0.5017540321010265, + "learning_rate": 6.670200198249579e-06, + "loss": 0.7141, + "step": 28490 + }, + { + "epoch": 0.6121922928212399, + "grad_norm": 0.5022749037709945, + "learning_rate": 6.663773454978368e-06, + "loss": 0.7087, + "step": 28500 + }, + { + "epoch": 0.6124070971345105, + "grad_norm": 0.5079751963910859, + "learning_rate": 6.657348261877905e-06, + "loss": 0.7411, + "step": 28510 + }, + { + "epoch": 0.6126219014477811, + "grad_norm": 0.5184599582736278, + "learning_rate": 6.650924621933645e-06, + "loss": 0.7171, + "step": 28520 + }, + { + "epoch": 0.6128367057610516, + "grad_norm": 0.50121952483461, + "learning_rate": 6.644502538130317e-06, + "loss": 0.7209, + "step": 28530 + }, + { + "epoch": 0.6130515100743223, + "grad_norm": 0.515280894738279, + "learning_rate": 6.638082013451925e-06, + "loss": 0.7198, + "step": 28540 + }, + { + "epoch": 0.6132663143875929, + "grad_norm": 0.5136328125428188, + "learning_rate": 6.631663050881763e-06, + "loss": 0.7099, + "step": 28550 + }, + { + "epoch": 0.6134811187008635, + "grad_norm": 
0.5174719027017979, + "learning_rate": 6.625245653402379e-06, + "loss": 0.725, + "step": 28560 + }, + { + "epoch": 0.6136959230141341, + "grad_norm": 0.5250930188904968, + "learning_rate": 6.6188298239956105e-06, + "loss": 0.7072, + "step": 28570 + }, + { + "epoch": 0.6139107273274047, + "grad_norm": 0.5369921932935243, + "learning_rate": 6.612415565642552e-06, + "loss": 0.7229, + "step": 28580 + }, + { + "epoch": 0.6141255316406753, + "grad_norm": 0.5023867945450698, + "learning_rate": 6.606002881323581e-06, + "loss": 0.7225, + "step": 28590 + }, + { + "epoch": 0.614340335953946, + "grad_norm": 0.5107950386038079, + "learning_rate": 6.599591774018338e-06, + "loss": 0.7155, + "step": 28600 + }, + { + "epoch": 0.6145551402672166, + "grad_norm": 0.5003163460337205, + "learning_rate": 6.5931822467057275e-06, + "loss": 0.7168, + "step": 28610 + }, + { + "epoch": 0.6147699445804872, + "grad_norm": 0.5273078172630057, + "learning_rate": 6.5867743023639255e-06, + "loss": 0.7251, + "step": 28620 + }, + { + "epoch": 0.6149847488937578, + "grad_norm": 0.5126901132037992, + "learning_rate": 6.580367943970365e-06, + "loss": 0.7083, + "step": 28630 + }, + { + "epoch": 0.6151995532070283, + "grad_norm": 0.5155415180923669, + "learning_rate": 6.573963174501755e-06, + "loss": 0.7117, + "step": 28640 + }, + { + "epoch": 0.615414357520299, + "grad_norm": 0.514330215394583, + "learning_rate": 6.567559996934052e-06, + "loss": 0.7147, + "step": 28650 + }, + { + "epoch": 0.6156291618335696, + "grad_norm": 0.5245843260086881, + "learning_rate": 6.5611584142424845e-06, + "loss": 0.7109, + "step": 28660 + }, + { + "epoch": 0.6158439661468402, + "grad_norm": 0.5167536487547844, + "learning_rate": 6.554758429401532e-06, + "loss": 0.7216, + "step": 28670 + }, + { + "epoch": 0.6160587704601108, + "grad_norm": 0.5109351895706468, + "learning_rate": 6.548360045384933e-06, + "loss": 0.7056, + "step": 28680 + }, + { + "epoch": 0.6162735747733814, + "grad_norm": 0.5190795799887076, + "learning_rate": 6.541963265165686e-06, + "loss": 0.7242, + "step": 28690 + }, + { + "epoch": 0.6164883790866521, + "grad_norm": 0.5106655693293612, + "learning_rate": 6.535568091716036e-06, + "loss": 0.7168, + "step": 28700 + }, + { + "epoch": 0.6167031833999227, + "grad_norm": 0.5047058090402041, + "learning_rate": 6.529174528007496e-06, + "loss": 0.7412, + "step": 28710 + }, + { + "epoch": 0.6169179877131933, + "grad_norm": 0.5197711629668383, + "learning_rate": 6.522782577010815e-06, + "loss": 0.7214, + "step": 28720 + }, + { + "epoch": 0.6171327920264639, + "grad_norm": 0.4994718882123556, + "learning_rate": 6.516392241696004e-06, + "loss": 0.7331, + "step": 28730 + }, + { + "epoch": 0.6173475963397345, + "grad_norm": 0.5072482739785458, + "learning_rate": 6.5100035250323155e-06, + "loss": 0.7052, + "step": 28740 + }, + { + "epoch": 0.6175624006530052, + "grad_norm": 0.4899692225682852, + "learning_rate": 6.503616429988253e-06, + "loss": 0.6999, + "step": 28750 + }, + { + "epoch": 0.6177772049662758, + "grad_norm": 0.5047645583660254, + "learning_rate": 6.497230959531573e-06, + "loss": 0.7141, + "step": 28760 + }, + { + "epoch": 0.6179920092795463, + "grad_norm": 0.5144621325420531, + "learning_rate": 6.490847116629267e-06, + "loss": 0.721, + "step": 28770 + }, + { + "epoch": 0.6182068135928169, + "grad_norm": 0.5073195245109307, + "learning_rate": 6.484464904247573e-06, + "loss": 0.7095, + "step": 28780 + }, + { + "epoch": 0.6184216179060875, + "grad_norm": 0.5122140072591916, + "learning_rate": 6.4780843253519766e-06, + "loss": 0.7085, 
+ "step": 28790 + }, + { + "epoch": 0.6186364222193582, + "grad_norm": 0.5103737664691493, + "learning_rate": 6.471705382907194e-06, + "loss": 0.7023, + "step": 28800 + }, + { + "epoch": 0.6188512265326288, + "grad_norm": 0.49611210872837896, + "learning_rate": 6.465328079877196e-06, + "loss": 0.7027, + "step": 28810 + }, + { + "epoch": 0.6190660308458994, + "grad_norm": 0.5196971341780113, + "learning_rate": 6.458952419225175e-06, + "loss": 0.7147, + "step": 28820 + }, + { + "epoch": 0.61928083515917, + "grad_norm": 0.5200694337234366, + "learning_rate": 6.452578403913577e-06, + "loss": 0.7129, + "step": 28830 + }, + { + "epoch": 0.6194956394724406, + "grad_norm": 0.49974616265603755, + "learning_rate": 6.446206036904068e-06, + "loss": 0.7025, + "step": 28840 + }, + { + "epoch": 0.6197104437857112, + "grad_norm": 0.5036120598369263, + "learning_rate": 6.439835321157561e-06, + "loss": 0.7117, + "step": 28850 + }, + { + "epoch": 0.6199252480989819, + "grad_norm": 0.5312724368454428, + "learning_rate": 6.433466259634191e-06, + "loss": 0.7174, + "step": 28860 + }, + { + "epoch": 0.6201400524122525, + "grad_norm": 0.5424611207118475, + "learning_rate": 6.427098855293328e-06, + "loss": 0.7106, + "step": 28870 + }, + { + "epoch": 0.620354856725523, + "grad_norm": 0.5100284315892076, + "learning_rate": 6.42073311109358e-06, + "loss": 0.7226, + "step": 28880 + }, + { + "epoch": 0.6205696610387936, + "grad_norm": 0.5173898778049251, + "learning_rate": 6.414369029992771e-06, + "loss": 0.7311, + "step": 28890 + }, + { + "epoch": 0.6207844653520642, + "grad_norm": 0.5125989979068201, + "learning_rate": 6.408006614947963e-06, + "loss": 0.7136, + "step": 28900 + }, + { + "epoch": 0.6209992696653349, + "grad_norm": 0.5241560351725514, + "learning_rate": 6.401645868915434e-06, + "loss": 0.7309, + "step": 28910 + }, + { + "epoch": 0.6212140739786055, + "grad_norm": 0.5326577223760485, + "learning_rate": 6.39528679485069e-06, + "loss": 0.7222, + "step": 28920 + }, + { + "epoch": 0.6214288782918761, + "grad_norm": 0.5412010757518007, + "learning_rate": 6.388929395708469e-06, + "loss": 0.7275, + "step": 28930 + }, + { + "epoch": 0.6216436826051467, + "grad_norm": 0.501280757518304, + "learning_rate": 6.382573674442719e-06, + "loss": 0.7043, + "step": 28940 + }, + { + "epoch": 0.6218584869184173, + "grad_norm": 0.5035153061061383, + "learning_rate": 6.376219634006614e-06, + "loss": 0.72, + "step": 28950 + }, + { + "epoch": 0.622073291231688, + "grad_norm": 0.5006644086233506, + "learning_rate": 6.3698672773525414e-06, + "loss": 0.7375, + "step": 28960 + }, + { + "epoch": 0.6222880955449586, + "grad_norm": 0.5148002471276852, + "learning_rate": 6.363516607432116e-06, + "loss": 0.7029, + "step": 28970 + }, + { + "epoch": 0.6225028998582292, + "grad_norm": 0.5117306851220371, + "learning_rate": 6.357167627196164e-06, + "loss": 0.7268, + "step": 28980 + }, + { + "epoch": 0.6227177041714997, + "grad_norm": 0.5040793669298755, + "learning_rate": 6.35082033959472e-06, + "loss": 0.7366, + "step": 28990 + }, + { + "epoch": 0.6229325084847703, + "grad_norm": 0.511200471454493, + "learning_rate": 6.344474747577043e-06, + "loss": 0.7241, + "step": 29000 + }, + { + "epoch": 0.623147312798041, + "grad_norm": 0.5050786822698712, + "learning_rate": 6.338130854091595e-06, + "loss": 0.7216, + "step": 29010 + }, + { + "epoch": 0.6233621171113116, + "grad_norm": 0.5058176815929498, + "learning_rate": 6.331788662086058e-06, + "loss": 0.7058, + "step": 29020 + }, + { + "epoch": 0.6235769214245822, + "grad_norm": 0.4981789798742353, 
+ "learning_rate": 6.325448174507312e-06, + "loss": 0.7211, + "step": 29030 + }, + { + "epoch": 0.6237917257378528, + "grad_norm": 0.496475062930444, + "learning_rate": 6.319109394301459e-06, + "loss": 0.7246, + "step": 29040 + }, + { + "epoch": 0.6240065300511234, + "grad_norm": 0.5206633397961193, + "learning_rate": 6.312772324413798e-06, + "loss": 0.719, + "step": 29050 + }, + { + "epoch": 0.6242213343643941, + "grad_norm": 0.5147683511194359, + "learning_rate": 6.30643696778883e-06, + "loss": 0.7319, + "step": 29060 + }, + { + "epoch": 0.6244361386776647, + "grad_norm": 0.5013409604266713, + "learning_rate": 6.300103327370272e-06, + "loss": 0.7138, + "step": 29070 + }, + { + "epoch": 0.6246509429909353, + "grad_norm": 0.5448909474545818, + "learning_rate": 6.29377140610103e-06, + "loss": 0.7219, + "step": 29080 + }, + { + "epoch": 0.6248657473042059, + "grad_norm": 0.4980788653525116, + "learning_rate": 6.287441206923225e-06, + "loss": 0.7054, + "step": 29090 + }, + { + "epoch": 0.6250805516174764, + "grad_norm": 0.5063361287454743, + "learning_rate": 6.28111273277817e-06, + "loss": 0.7291, + "step": 29100 + }, + { + "epoch": 0.6252953559307471, + "grad_norm": 0.5079410093076961, + "learning_rate": 6.274785986606371e-06, + "loss": 0.7176, + "step": 29110 + }, + { + "epoch": 0.6255101602440177, + "grad_norm": 0.5169067311424639, + "learning_rate": 6.2684609713475454e-06, + "loss": 0.7153, + "step": 29120 + }, + { + "epoch": 0.6257249645572883, + "grad_norm": 0.5053854343865869, + "learning_rate": 6.26213768994059e-06, + "loss": 0.7257, + "step": 29130 + }, + { + "epoch": 0.6259397688705589, + "grad_norm": 0.5019853299444441, + "learning_rate": 6.255816145323613e-06, + "loss": 0.7243, + "step": 29140 + }, + { + "epoch": 0.6261545731838295, + "grad_norm": 0.5136425354507431, + "learning_rate": 6.249496340433903e-06, + "loss": 0.7133, + "step": 29150 + }, + { + "epoch": 0.6263693774971001, + "grad_norm": 0.4961378621057317, + "learning_rate": 6.243178278207944e-06, + "loss": 0.7201, + "step": 29160 + }, + { + "epoch": 0.6265841818103708, + "grad_norm": 0.500774286297422, + "learning_rate": 6.236861961581413e-06, + "loss": 0.7183, + "step": 29170 + }, + { + "epoch": 0.6267989861236414, + "grad_norm": 0.5360545274720105, + "learning_rate": 6.230547393489166e-06, + "loss": 0.7197, + "step": 29180 + }, + { + "epoch": 0.627013790436912, + "grad_norm": 0.5298466892431664, + "learning_rate": 6.224234576865264e-06, + "loss": 0.7137, + "step": 29190 + }, + { + "epoch": 0.6272285947501826, + "grad_norm": 0.5397675431496055, + "learning_rate": 6.217923514642938e-06, + "loss": 0.7302, + "step": 29200 + }, + { + "epoch": 0.6274433990634531, + "grad_norm": 0.5102301973810862, + "learning_rate": 6.211614209754615e-06, + "loss": 0.7001, + "step": 29210 + }, + { + "epoch": 0.6276582033767238, + "grad_norm": 0.49821596276059255, + "learning_rate": 6.205306665131892e-06, + "loss": 0.7234, + "step": 29220 + }, + { + "epoch": 0.6278730076899944, + "grad_norm": 0.5107970378794817, + "learning_rate": 6.199000883705563e-06, + "loss": 0.7074, + "step": 29230 + }, + { + "epoch": 0.628087812003265, + "grad_norm": 0.5262658842700451, + "learning_rate": 6.192696868405598e-06, + "loss": 0.7082, + "step": 29240 + }, + { + "epoch": 0.6283026163165356, + "grad_norm": 0.49640493413761044, + "learning_rate": 6.186394622161136e-06, + "loss": 0.7117, + "step": 29250 + }, + { + "epoch": 0.6285174206298062, + "grad_norm": 0.520113591142662, + "learning_rate": 6.1800941479005125e-06, + "loss": 0.7089, + "step": 29260 + }, + { + 
"epoch": 0.6287322249430769, + "grad_norm": 0.5152262365486174, + "learning_rate": 6.173795448551223e-06, + "loss": 0.7041, + "step": 29270 + }, + { + "epoch": 0.6289470292563475, + "grad_norm": 0.5044955000995295, + "learning_rate": 6.16749852703995e-06, + "loss": 0.7066, + "step": 29280 + }, + { + "epoch": 0.6291618335696181, + "grad_norm": 0.5039566573378436, + "learning_rate": 6.161203386292539e-06, + "loss": 0.7168, + "step": 29290 + }, + { + "epoch": 0.6293766378828887, + "grad_norm": 0.5132657302646363, + "learning_rate": 6.154910029234022e-06, + "loss": 0.7282, + "step": 29300 + }, + { + "epoch": 0.6295914421961593, + "grad_norm": 0.501215277645178, + "learning_rate": 6.148618458788589e-06, + "loss": 0.7125, + "step": 29310 + }, + { + "epoch": 0.62980624650943, + "grad_norm": 0.5171520975134748, + "learning_rate": 6.142328677879604e-06, + "loss": 0.7201, + "step": 29320 + }, + { + "epoch": 0.6300210508227005, + "grad_norm": 0.5093541985407058, + "learning_rate": 6.136040689429606e-06, + "loss": 0.7249, + "step": 29330 + }, + { + "epoch": 0.6302358551359711, + "grad_norm": 0.5038211596059813, + "learning_rate": 6.1297544963602874e-06, + "loss": 0.7116, + "step": 29340 + }, + { + "epoch": 0.6304506594492417, + "grad_norm": 0.499683261524144, + "learning_rate": 6.123470101592524e-06, + "loss": 0.7168, + "step": 29350 + }, + { + "epoch": 0.6306654637625123, + "grad_norm": 0.5524946516291463, + "learning_rate": 6.1171875080463425e-06, + "loss": 0.7087, + "step": 29360 + }, + { + "epoch": 0.630880268075783, + "grad_norm": 0.5018295390971191, + "learning_rate": 6.1109067186409365e-06, + "loss": 0.7219, + "step": 29370 + }, + { + "epoch": 0.6310950723890536, + "grad_norm": 0.53849226672192, + "learning_rate": 6.104627736294664e-06, + "loss": 0.7008, + "step": 29380 + }, + { + "epoch": 0.6313098767023242, + "grad_norm": 0.5134656847802564, + "learning_rate": 6.098350563925035e-06, + "loss": 0.7165, + "step": 29390 + }, + { + "epoch": 0.6315246810155948, + "grad_norm": 0.5395193132941271, + "learning_rate": 6.092075204448734e-06, + "loss": 0.7263, + "step": 29400 + }, + { + "epoch": 0.6317394853288654, + "grad_norm": 0.5291663702673787, + "learning_rate": 6.085801660781585e-06, + "loss": 0.7226, + "step": 29410 + }, + { + "epoch": 0.6319542896421361, + "grad_norm": 0.4920471964797305, + "learning_rate": 6.079529935838584e-06, + "loss": 0.7253, + "step": 29420 + }, + { + "epoch": 0.6321690939554067, + "grad_norm": 0.5091308822653934, + "learning_rate": 6.0732600325338745e-06, + "loss": 0.7084, + "step": 29430 + }, + { + "epoch": 0.6323838982686772, + "grad_norm": 0.5083144836724668, + "learning_rate": 6.066991953780748e-06, + "loss": 0.7096, + "step": 29440 + }, + { + "epoch": 0.6325987025819478, + "grad_norm": 0.5553675193548707, + "learning_rate": 6.06072570249166e-06, + "loss": 0.7189, + "step": 29450 + }, + { + "epoch": 0.6328135068952184, + "grad_norm": 0.5042084082023858, + "learning_rate": 6.054461281578206e-06, + "loss": 0.7076, + "step": 29460 + }, + { + "epoch": 0.633028311208489, + "grad_norm": 0.4889129315159301, + "learning_rate": 6.048198693951142e-06, + "loss": 0.7034, + "step": 29470 + }, + { + "epoch": 0.6332431155217597, + "grad_norm": 0.49120737847508095, + "learning_rate": 6.041937942520363e-06, + "loss": 0.7162, + "step": 29480 + }, + { + "epoch": 0.6334579198350303, + "grad_norm": 0.5106224481761457, + "learning_rate": 6.035679030194917e-06, + "loss": 0.7108, + "step": 29490 + }, + { + "epoch": 0.6336727241483009, + "grad_norm": 0.5142715172785339, + "learning_rate": 
6.02942195988299e-06, + "loss": 0.7182, + "step": 29500 + }, + { + "epoch": 0.6338875284615715, + "grad_norm": 0.5245286567422575, + "learning_rate": 6.023166734491919e-06, + "loss": 0.7069, + "step": 29510 + }, + { + "epoch": 0.6341023327748421, + "grad_norm": 0.49896046500592833, + "learning_rate": 6.0169133569281835e-06, + "loss": 0.7217, + "step": 29520 + }, + { + "epoch": 0.6343171370881128, + "grad_norm": 0.5076374052572614, + "learning_rate": 6.010661830097399e-06, + "loss": 0.7244, + "step": 29530 + }, + { + "epoch": 0.6345319414013834, + "grad_norm": 0.5036960875426778, + "learning_rate": 6.004412156904329e-06, + "loss": 0.7246, + "step": 29540 + }, + { + "epoch": 0.634746745714654, + "grad_norm": 0.5190186034362325, + "learning_rate": 5.9981643402528675e-06, + "loss": 0.7181, + "step": 29550 + }, + { + "epoch": 0.6349615500279245, + "grad_norm": 0.5185275006081458, + "learning_rate": 5.991918383046047e-06, + "loss": 0.7171, + "step": 29560 + }, + { + "epoch": 0.6351763543411951, + "grad_norm": 0.5092203362213314, + "learning_rate": 5.985674288186045e-06, + "loss": 0.7282, + "step": 29570 + }, + { + "epoch": 0.6353911586544658, + "grad_norm": 0.4997149128548273, + "learning_rate": 5.979432058574164e-06, + "loss": 0.7174, + "step": 29580 + }, + { + "epoch": 0.6356059629677364, + "grad_norm": 0.5068981922104722, + "learning_rate": 5.973191697110845e-06, + "loss": 0.7141, + "step": 29590 + }, + { + "epoch": 0.635820767281007, + "grad_norm": 0.5110915167748981, + "learning_rate": 5.966953206695653e-06, + "loss": 0.7102, + "step": 29600 + }, + { + "epoch": 0.6360355715942776, + "grad_norm": 0.511793837071635, + "learning_rate": 5.960716590227298e-06, + "loss": 0.7029, + "step": 29610 + }, + { + "epoch": 0.6362503759075482, + "grad_norm": 0.5010481752806605, + "learning_rate": 5.954481850603606e-06, + "loss": 0.7052, + "step": 29620 + }, + { + "epoch": 0.6364651802208189, + "grad_norm": 0.5184653587723411, + "learning_rate": 5.9482489907215324e-06, + "loss": 0.709, + "step": 29630 + }, + { + "epoch": 0.6366799845340895, + "grad_norm": 0.5186353614978536, + "learning_rate": 5.9420180134771735e-06, + "loss": 0.7256, + "step": 29640 + }, + { + "epoch": 0.6368947888473601, + "grad_norm": 0.5090791214108952, + "learning_rate": 5.935788921765728e-06, + "loss": 0.7125, + "step": 29650 + }, + { + "epoch": 0.6371095931606306, + "grad_norm": 0.5182097600656689, + "learning_rate": 5.92956171848154e-06, + "loss": 0.7095, + "step": 29660 + }, + { + "epoch": 0.6373243974739012, + "grad_norm": 0.49906621293560977, + "learning_rate": 5.923336406518059e-06, + "loss": 0.7179, + "step": 29670 + }, + { + "epoch": 0.6375392017871719, + "grad_norm": 0.49207042909075416, + "learning_rate": 5.91711298876787e-06, + "loss": 0.7141, + "step": 29680 + }, + { + "epoch": 0.6377540061004425, + "grad_norm": 0.491835738604546, + "learning_rate": 5.910891468122668e-06, + "loss": 0.7078, + "step": 29690 + }, + { + "epoch": 0.6379688104137131, + "grad_norm": 0.5078817311304347, + "learning_rate": 5.9046718474732666e-06, + "loss": 0.7072, + "step": 29700 + }, + { + "epoch": 0.6381836147269837, + "grad_norm": 0.5124871601736757, + "learning_rate": 5.898454129709606e-06, + "loss": 0.7202, + "step": 29710 + }, + { + "epoch": 0.6383984190402543, + "grad_norm": 0.5134905108899009, + "learning_rate": 5.8922383177207286e-06, + "loss": 0.7282, + "step": 29720 + }, + { + "epoch": 0.6386132233535249, + "grad_norm": 0.5155981047848882, + "learning_rate": 5.886024414394806e-06, + "loss": 0.7113, + "step": 29730 + }, + { + "epoch": 
0.6388280276667956, + "grad_norm": 0.5060176620986123, + "learning_rate": 5.879812422619111e-06, + "loss": 0.7152, + "step": 29740 + }, + { + "epoch": 0.6390428319800662, + "grad_norm": 0.5031393620149552, + "learning_rate": 5.873602345280033e-06, + "loss": 0.7171, + "step": 29750 + }, + { + "epoch": 0.6392576362933368, + "grad_norm": 0.5091217506585529, + "learning_rate": 5.8673941852630735e-06, + "loss": 0.7072, + "step": 29760 + }, + { + "epoch": 0.6394724406066074, + "grad_norm": 0.5124995237491698, + "learning_rate": 5.861187945452836e-06, + "loss": 0.7179, + "step": 29770 + }, + { + "epoch": 0.6396872449198779, + "grad_norm": 0.5111197620823217, + "learning_rate": 5.854983628733046e-06, + "loss": 0.7265, + "step": 29780 + }, + { + "epoch": 0.6399020492331486, + "grad_norm": 0.5121226878357348, + "learning_rate": 5.848781237986516e-06, + "loss": 0.7058, + "step": 29790 + }, + { + "epoch": 0.6401168535464192, + "grad_norm": 0.5029940756710116, + "learning_rate": 5.842580776095186e-06, + "loss": 0.7242, + "step": 29800 + }, + { + "epoch": 0.6403316578596898, + "grad_norm": 0.518317509648807, + "learning_rate": 5.8363822459400766e-06, + "loss": 0.7122, + "step": 29810 + }, + { + "epoch": 0.6405464621729604, + "grad_norm": 0.513796209625438, + "learning_rate": 5.830185650401327e-06, + "loss": 0.7109, + "step": 29820 + }, + { + "epoch": 0.640761266486231, + "grad_norm": 0.5065915184714491, + "learning_rate": 5.82399099235817e-06, + "loss": 0.7185, + "step": 29830 + }, + { + "epoch": 0.6409760707995017, + "grad_norm": 0.5383003131895742, + "learning_rate": 5.817798274688945e-06, + "loss": 0.7149, + "step": 29840 + }, + { + "epoch": 0.6411908751127723, + "grad_norm": 0.5013183139637445, + "learning_rate": 5.811607500271086e-06, + "loss": 0.7061, + "step": 29850 + }, + { + "epoch": 0.6414056794260429, + "grad_norm": 0.5084891027020223, + "learning_rate": 5.805418671981112e-06, + "loss": 0.7078, + "step": 29860 + }, + { + "epoch": 0.6416204837393135, + "grad_norm": 0.5358856173892373, + "learning_rate": 5.7992317926946666e-06, + "loss": 0.7059, + "step": 29870 + }, + { + "epoch": 0.641835288052584, + "grad_norm": 0.5118937595826556, + "learning_rate": 5.793046865286457e-06, + "loss": 0.7084, + "step": 29880 + }, + { + "epoch": 0.6420500923658548, + "grad_norm": 0.5289502239664226, + "learning_rate": 5.786863892630301e-06, + "loss": 0.7186, + "step": 29890 + }, + { + "epoch": 0.6422648966791253, + "grad_norm": 0.5030440160930806, + "learning_rate": 5.78068287759911e-06, + "loss": 0.7093, + "step": 29900 + }, + { + "epoch": 0.6424797009923959, + "grad_norm": 0.5013233963022401, + "learning_rate": 5.774503823064865e-06, + "loss": 0.7046, + "step": 29910 + }, + { + "epoch": 0.6426945053056665, + "grad_norm": 0.5218815308682272, + "learning_rate": 5.768326731898668e-06, + "loss": 0.7093, + "step": 29920 + }, + { + "epoch": 0.6429093096189371, + "grad_norm": 0.4911776039589, + "learning_rate": 5.762151606970681e-06, + "loss": 0.6985, + "step": 29930 + }, + { + "epoch": 0.6431241139322078, + "grad_norm": 0.4990229286965237, + "learning_rate": 5.755978451150165e-06, + "loss": 0.7039, + "step": 29940 + }, + { + "epoch": 0.6433389182454784, + "grad_norm": 0.5017215398112681, + "learning_rate": 5.749807267305469e-06, + "loss": 0.719, + "step": 29950 + }, + { + "epoch": 0.643553722558749, + "grad_norm": 0.49447190334501206, + "learning_rate": 5.743638058304009e-06, + "loss": 0.7096, + "step": 29960 + }, + { + "epoch": 0.6437685268720196, + "grad_norm": 0.4932132883026573, + "learning_rate": 
5.737470827012309e-06, + "loss": 0.719, + "step": 29970 + }, + { + "epoch": 0.6439833311852902, + "grad_norm": 0.4990600666743509, + "learning_rate": 5.731305576295951e-06, + "loss": 0.7119, + "step": 29980 + }, + { + "epoch": 0.6441981354985609, + "grad_norm": 0.49829438730309705, + "learning_rate": 5.72514230901961e-06, + "loss": 0.7111, + "step": 29990 + }, + { + "epoch": 0.6444129398118315, + "grad_norm": 0.5142379504720469, + "learning_rate": 5.7189810280470374e-06, + "loss": 0.7182, + "step": 30000 + }, + { + "epoch": 0.644627744125102, + "grad_norm": 0.5004279772636778, + "learning_rate": 5.71282173624105e-06, + "loss": 0.7123, + "step": 30010 + }, + { + "epoch": 0.6448425484383726, + "grad_norm": 0.4997893834727797, + "learning_rate": 5.706664436463563e-06, + "loss": 0.7069, + "step": 30020 + }, + { + "epoch": 0.6450573527516432, + "grad_norm": 0.5224573953092382, + "learning_rate": 5.700509131575538e-06, + "loss": 0.7315, + "step": 30030 + }, + { + "epoch": 0.6452721570649138, + "grad_norm": 0.5157124232819824, + "learning_rate": 5.6943558244370435e-06, + "loss": 0.7085, + "step": 30040 + }, + { + "epoch": 0.6454869613781845, + "grad_norm": 0.5098311956322868, + "learning_rate": 5.688204517907184e-06, + "loss": 0.698, + "step": 30050 + }, + { + "epoch": 0.6457017656914551, + "grad_norm": 0.5585768535083954, + "learning_rate": 5.6820552148441595e-06, + "loss": 0.721, + "step": 30060 + }, + { + "epoch": 0.6459165700047257, + "grad_norm": 0.5191401680283296, + "learning_rate": 5.675907918105233e-06, + "loss": 0.718, + "step": 30070 + }, + { + "epoch": 0.6461313743179963, + "grad_norm": 0.5154719384443771, + "learning_rate": 5.669762630546722e-06, + "loss": 0.7135, + "step": 30080 + }, + { + "epoch": 0.6463461786312669, + "grad_norm": 0.5161802110337572, + "learning_rate": 5.663619355024037e-06, + "loss": 0.7299, + "step": 30090 + }, + { + "epoch": 0.6465609829445376, + "grad_norm": 0.5251836486068403, + "learning_rate": 5.6574780943916265e-06, + "loss": 0.7139, + "step": 30100 + }, + { + "epoch": 0.6467757872578082, + "grad_norm": 0.5253745077383208, + "learning_rate": 5.651338851503017e-06, + "loss": 0.7081, + "step": 30110 + }, + { + "epoch": 0.6469905915710787, + "grad_norm": 0.499068951345218, + "learning_rate": 5.645201629210802e-06, + "loss": 0.7118, + "step": 30120 + }, + { + "epoch": 0.6472053958843493, + "grad_norm": 0.5212681373915045, + "learning_rate": 5.639066430366616e-06, + "loss": 0.7058, + "step": 30130 + }, + { + "epoch": 0.6474202001976199, + "grad_norm": 0.5192551695189365, + "learning_rate": 5.63293325782118e-06, + "loss": 0.7209, + "step": 30140 + }, + { + "epoch": 0.6476350045108906, + "grad_norm": 0.5009778176216481, + "learning_rate": 5.626802114424252e-06, + "loss": 0.7147, + "step": 30150 + }, + { + "epoch": 0.6478498088241612, + "grad_norm": 0.511657473010408, + "learning_rate": 5.620673003024654e-06, + "loss": 0.7111, + "step": 30160 + }, + { + "epoch": 0.6480646131374318, + "grad_norm": 0.4946400904637439, + "learning_rate": 5.614545926470272e-06, + "loss": 0.7039, + "step": 30170 + }, + { + "epoch": 0.6482794174507024, + "grad_norm": 0.5452566834347506, + "learning_rate": 5.608420887608033e-06, + "loss": 0.7124, + "step": 30180 + }, + { + "epoch": 0.648494221763973, + "grad_norm": 0.5187590204840195, + "learning_rate": 5.60229788928393e-06, + "loss": 0.7138, + "step": 30190 + }, + { + "epoch": 0.6487090260772437, + "grad_norm": 0.4961448077999401, + "learning_rate": 5.5961769343429885e-06, + "loss": 0.7088, + "step": 30200 + }, + { + "epoch": 
0.6489238303905143, + "grad_norm": 0.49664867664736395, + "learning_rate": 5.590058025629315e-06, + "loss": 0.7043, + "step": 30210 + }, + { + "epoch": 0.6491386347037849, + "grad_norm": 0.5089161076297286, + "learning_rate": 5.5839411659860355e-06, + "loss": 0.696, + "step": 30220 + }, + { + "epoch": 0.6493534390170554, + "grad_norm": 0.5007244262694044, + "learning_rate": 5.577826358255339e-06, + "loss": 0.7104, + "step": 30230 + }, + { + "epoch": 0.649568243330326, + "grad_norm": 0.5116267642012309, + "learning_rate": 5.57171360527846e-06, + "loss": 0.7176, + "step": 30240 + }, + { + "epoch": 0.6497830476435967, + "grad_norm": 0.5110539680425629, + "learning_rate": 5.5656029098956755e-06, + "loss": 0.7176, + "step": 30250 + }, + { + "epoch": 0.6499978519568673, + "grad_norm": 0.5087432238272799, + "learning_rate": 5.559494274946311e-06, + "loss": 0.7045, + "step": 30260 + }, + { + "epoch": 0.6502126562701379, + "grad_norm": 0.5049786677025178, + "learning_rate": 5.553387703268725e-06, + "loss": 0.7105, + "step": 30270 + }, + { + "epoch": 0.6504274605834085, + "grad_norm": 0.5099641743518565, + "learning_rate": 5.547283197700324e-06, + "loss": 0.7056, + "step": 30280 + }, + { + "epoch": 0.6506422648966791, + "grad_norm": 0.5143331023536819, + "learning_rate": 5.541180761077556e-06, + "loss": 0.6937, + "step": 30290 + }, + { + "epoch": 0.6508570692099498, + "grad_norm": 0.49683778878230767, + "learning_rate": 5.535080396235906e-06, + "loss": 0.7087, + "step": 30300 + }, + { + "epoch": 0.6510718735232204, + "grad_norm": 0.533544500826429, + "learning_rate": 5.528982106009899e-06, + "loss": 0.7144, + "step": 30310 + }, + { + "epoch": 0.651286677836491, + "grad_norm": 0.5151062048248995, + "learning_rate": 5.5228858932330845e-06, + "loss": 0.7009, + "step": 30320 + }, + { + "epoch": 0.6515014821497616, + "grad_norm": 0.5122586500692712, + "learning_rate": 5.5167917607380605e-06, + "loss": 0.7031, + "step": 30330 + }, + { + "epoch": 0.6517162864630321, + "grad_norm": 0.5212961303785209, + "learning_rate": 5.510699711356451e-06, + "loss": 0.6934, + "step": 30340 + }, + { + "epoch": 0.6519310907763027, + "grad_norm": 0.5100624104875204, + "learning_rate": 5.504609747918916e-06, + "loss": 0.7259, + "step": 30350 + }, + { + "epoch": 0.6521458950895734, + "grad_norm": 0.5072539914616855, + "learning_rate": 5.498521873255145e-06, + "loss": 0.7156, + "step": 30360 + }, + { + "epoch": 0.652360699402844, + "grad_norm": 0.5025326619318836, + "learning_rate": 5.4924360901938574e-06, + "loss": 0.6881, + "step": 30370 + }, + { + "epoch": 0.6525755037161146, + "grad_norm": 0.5194768705340874, + "learning_rate": 5.486352401562796e-06, + "loss": 0.6997, + "step": 30380 + }, + { + "epoch": 0.6527903080293852, + "grad_norm": 0.4926980584558694, + "learning_rate": 5.480270810188732e-06, + "loss": 0.7079, + "step": 30390 + }, + { + "epoch": 0.6530051123426558, + "grad_norm": 0.5301706061164216, + "learning_rate": 5.4741913188974705e-06, + "loss": 0.7148, + "step": 30400 + }, + { + "epoch": 0.6532199166559265, + "grad_norm": 0.5077273663914399, + "learning_rate": 5.468113930513831e-06, + "loss": 0.702, + "step": 30410 + }, + { + "epoch": 0.6534347209691971, + "grad_norm": 0.5113783715432562, + "learning_rate": 5.462038647861659e-06, + "loss": 0.7237, + "step": 30420 + }, + { + "epoch": 0.6536495252824677, + "grad_norm": 0.5350113300965483, + "learning_rate": 5.455965473763824e-06, + "loss": 0.7077, + "step": 30430 + }, + { + "epoch": 0.6538643295957383, + "grad_norm": 0.534802331833065, + "learning_rate": 
5.449894411042218e-06, + "loss": 0.7098, + "step": 30440 + }, + { + "epoch": 0.6540791339090088, + "grad_norm": 0.505768970566755, + "learning_rate": 5.4438254625177376e-06, + "loss": 0.7037, + "step": 30450 + }, + { + "epoch": 0.6542939382222795, + "grad_norm": 0.5243742881101912, + "learning_rate": 5.437758631010313e-06, + "loss": 0.7184, + "step": 30460 + }, + { + "epoch": 0.6545087425355501, + "grad_norm": 0.5174893952785108, + "learning_rate": 5.431693919338883e-06, + "loss": 0.7074, + "step": 30470 + }, + { + "epoch": 0.6547235468488207, + "grad_norm": 0.5018291601456127, + "learning_rate": 5.425631330321403e-06, + "loss": 0.7181, + "step": 30480 + }, + { + "epoch": 0.6549383511620913, + "grad_norm": 0.509935560750042, + "learning_rate": 5.41957086677485e-06, + "loss": 0.7272, + "step": 30490 + }, + { + "epoch": 0.6551531554753619, + "grad_norm": 0.5106281701236529, + "learning_rate": 5.413512531515195e-06, + "loss": 0.7181, + "step": 30500 + }, + { + "epoch": 0.6553679597886326, + "grad_norm": 0.5021707092734177, + "learning_rate": 5.407456327357437e-06, + "loss": 0.7122, + "step": 30510 + }, + { + "epoch": 0.6555827641019032, + "grad_norm": 0.5211648146663314, + "learning_rate": 5.401402257115576e-06, + "loss": 0.7042, + "step": 30520 + }, + { + "epoch": 0.6557975684151738, + "grad_norm": 0.5108248508895996, + "learning_rate": 5.395350323602624e-06, + "loss": 0.7027, + "step": 30530 + }, + { + "epoch": 0.6560123727284444, + "grad_norm": 0.504994079751055, + "learning_rate": 5.389300529630603e-06, + "loss": 0.7124, + "step": 30540 + }, + { + "epoch": 0.656227177041715, + "grad_norm": 0.5035319701169906, + "learning_rate": 5.383252878010528e-06, + "loss": 0.7117, + "step": 30550 + }, + { + "epoch": 0.6564419813549857, + "grad_norm": 0.5140239243917751, + "learning_rate": 5.3772073715524405e-06, + "loss": 0.7143, + "step": 30560 + }, + { + "epoch": 0.6566567856682562, + "grad_norm": 0.5014878071394002, + "learning_rate": 5.371164013065362e-06, + "loss": 0.7013, + "step": 30570 + }, + { + "epoch": 0.6568715899815268, + "grad_norm": 0.530273663858455, + "learning_rate": 5.365122805357331e-06, + "loss": 0.7137, + "step": 30580 + }, + { + "epoch": 0.6570863942947974, + "grad_norm": 0.5249382641477562, + "learning_rate": 5.3590837512353855e-06, + "loss": 0.7093, + "step": 30590 + }, + { + "epoch": 0.657301198608068, + "grad_norm": 0.5048796782346253, + "learning_rate": 5.353046853505548e-06, + "loss": 0.7091, + "step": 30600 + }, + { + "epoch": 0.6575160029213387, + "grad_norm": 0.5091027814069775, + "learning_rate": 5.347012114972865e-06, + "loss": 0.7028, + "step": 30610 + }, + { + "epoch": 0.6577308072346093, + "grad_norm": 0.5113428171167113, + "learning_rate": 5.34097953844135e-06, + "loss": 0.7142, + "step": 30620 + }, + { + "epoch": 0.6579456115478799, + "grad_norm": 0.4914977814112626, + "learning_rate": 5.334949126714044e-06, + "loss": 0.7137, + "step": 30630 + }, + { + "epoch": 0.6581604158611505, + "grad_norm": 0.4964243055566761, + "learning_rate": 5.328920882592953e-06, + "loss": 0.7087, + "step": 30640 + }, + { + "epoch": 0.6583752201744211, + "grad_norm": 0.5207847747861839, + "learning_rate": 5.322894808879091e-06, + "loss": 0.7134, + "step": 30650 + }, + { + "epoch": 0.6585900244876917, + "grad_norm": 0.5043669757937382, + "learning_rate": 5.3168709083724664e-06, + "loss": 0.7004, + "step": 30660 + }, + { + "epoch": 0.6588048288009624, + "grad_norm": 0.511841340255158, + "learning_rate": 5.310849183872059e-06, + "loss": 0.7081, + "step": 30670 + }, + { + "epoch": 
0.659019633114233, + "grad_norm": 0.5084802396159405, + "learning_rate": 5.304829638175866e-06, + "loss": 0.7027, + "step": 30680 + }, + { + "epoch": 0.6592344374275035, + "grad_norm": 0.5068077969636335, + "learning_rate": 5.298812274080847e-06, + "loss": 0.7068, + "step": 30690 + }, + { + "epoch": 0.6594492417407741, + "grad_norm": 0.5156077038006672, + "learning_rate": 5.29279709438296e-06, + "loss": 0.7155, + "step": 30700 + }, + { + "epoch": 0.6596640460540447, + "grad_norm": 0.5004198196593579, + "learning_rate": 5.286784101877149e-06, + "loss": 0.7049, + "step": 30710 + }, + { + "epoch": 0.6598788503673154, + "grad_norm": 0.5306581949971477, + "learning_rate": 5.2807732993573295e-06, + "loss": 0.7078, + "step": 30720 + }, + { + "epoch": 0.660093654680586, + "grad_norm": 0.5036574433980202, + "learning_rate": 5.2747646896164215e-06, + "loss": 0.6997, + "step": 30730 + }, + { + "epoch": 0.6603084589938566, + "grad_norm": 0.52149079735347, + "learning_rate": 5.268758275446303e-06, + "loss": 0.7228, + "step": 30740 + }, + { + "epoch": 0.6605232633071272, + "grad_norm": 0.5079017436216969, + "learning_rate": 5.262754059637845e-06, + "loss": 0.7057, + "step": 30750 + }, + { + "epoch": 0.6607380676203978, + "grad_norm": 0.5013233677412027, + "learning_rate": 5.2567520449808975e-06, + "loss": 0.7175, + "step": 30760 + }, + { + "epoch": 0.6609528719336685, + "grad_norm": 0.5172472025182714, + "learning_rate": 5.2507522342642725e-06, + "loss": 0.7074, + "step": 30770 + }, + { + "epoch": 0.6611676762469391, + "grad_norm": 0.5302279379741301, + "learning_rate": 5.244754630275786e-06, + "loss": 0.7046, + "step": 30780 + }, + { + "epoch": 0.6613824805602097, + "grad_norm": 0.5103884640220524, + "learning_rate": 5.2387592358021954e-06, + "loss": 0.7087, + "step": 30790 + }, + { + "epoch": 0.6615972848734802, + "grad_norm": 0.5266498719563615, + "learning_rate": 5.2327660536292655e-06, + "loss": 0.7208, + "step": 30800 + }, + { + "epoch": 0.6618120891867508, + "grad_norm": 0.547112136398287, + "learning_rate": 5.226775086541705e-06, + "loss": 0.7246, + "step": 30810 + }, + { + "epoch": 0.6620268935000215, + "grad_norm": 0.5134807747932422, + "learning_rate": 5.220786337323205e-06, + "loss": 0.7038, + "step": 30820 + }, + { + "epoch": 0.6622416978132921, + "grad_norm": 0.5069417485434651, + "learning_rate": 5.214799808756432e-06, + "loss": 0.7267, + "step": 30830 + }, + { + "epoch": 0.6624565021265627, + "grad_norm": 0.5112475490765994, + "learning_rate": 5.208815503623001e-06, + "loss": 0.7049, + "step": 30840 + }, + { + "epoch": 0.6626713064398333, + "grad_norm": 0.49798758139115207, + "learning_rate": 5.202833424703524e-06, + "loss": 0.7049, + "step": 30850 + }, + { + "epoch": 0.6628861107531039, + "grad_norm": 0.5208558474736136, + "learning_rate": 5.196853574777547e-06, + "loss": 0.7054, + "step": 30860 + }, + { + "epoch": 0.6631009150663746, + "grad_norm": 0.5143732263552888, + "learning_rate": 5.190875956623602e-06, + "loss": 0.7161, + "step": 30870 + }, + { + "epoch": 0.6633157193796452, + "grad_norm": 0.5072200840559442, + "learning_rate": 5.184900573019179e-06, + "loss": 0.7143, + "step": 30880 + }, + { + "epoch": 0.6635305236929158, + "grad_norm": 0.5148338042683931, + "learning_rate": 5.1789274267407174e-06, + "loss": 0.7197, + "step": 30890 + }, + { + "epoch": 0.6637453280061864, + "grad_norm": 0.5355543429844896, + "learning_rate": 5.172956520563641e-06, + "loss": 0.6953, + "step": 30900 + }, + { + "epoch": 0.6639601323194569, + "grad_norm": 0.5111517548781118, + "learning_rate": 
5.166987857262309e-06, + "loss": 0.704, + "step": 30910 + }, + { + "epoch": 0.6641749366327275, + "grad_norm": 0.5176946751784909, + "learning_rate": 5.16102143961005e-06, + "loss": 0.7041, + "step": 30920 + }, + { + "epoch": 0.6643897409459982, + "grad_norm": 0.5096883894063213, + "learning_rate": 5.155057270379149e-06, + "loss": 0.7065, + "step": 30930 + }, + { + "epoch": 0.6646045452592688, + "grad_norm": 0.5077111477162392, + "learning_rate": 5.149095352340847e-06, + "loss": 0.6996, + "step": 30940 + }, + { + "epoch": 0.6648193495725394, + "grad_norm": 0.49554593063940544, + "learning_rate": 5.143135688265337e-06, + "loss": 0.7097, + "step": 30950 + }, + { + "epoch": 0.66503415388581, + "grad_norm": 0.499479767542394, + "learning_rate": 5.137178280921759e-06, + "loss": 0.7176, + "step": 30960 + }, + { + "epoch": 0.6652489581990806, + "grad_norm": 0.49131352882866786, + "learning_rate": 5.131223133078213e-06, + "loss": 0.7166, + "step": 30970 + }, + { + "epoch": 0.6654637625123513, + "grad_norm": 0.5522986513731968, + "learning_rate": 5.125270247501746e-06, + "loss": 0.7076, + "step": 30980 + }, + { + "epoch": 0.6656785668256219, + "grad_norm": 0.5241166130212557, + "learning_rate": 5.119319626958354e-06, + "loss": 0.7071, + "step": 30990 + }, + { + "epoch": 0.6658933711388925, + "grad_norm": 0.5161672474322907, + "learning_rate": 5.113371274212984e-06, + "loss": 0.6892, + "step": 31000 + }, + { + "epoch": 0.666108175452163, + "grad_norm": 0.5019103502653095, + "learning_rate": 5.1074251920295194e-06, + "loss": 0.6991, + "step": 31010 + }, + { + "epoch": 0.6663229797654336, + "grad_norm": 0.5032297589781682, + "learning_rate": 5.1014813831708035e-06, + "loss": 0.7206, + "step": 31020 + }, + { + "epoch": 0.6665377840787043, + "grad_norm": 0.5002814476280841, + "learning_rate": 5.095539850398605e-06, + "loss": 0.7048, + "step": 31030 + }, + { + "epoch": 0.6667525883919749, + "grad_norm": 0.5156688113615284, + "learning_rate": 5.089600596473649e-06, + "loss": 0.7189, + "step": 31040 + }, + { + "epoch": 0.6669673927052455, + "grad_norm": 0.5188971013023798, + "learning_rate": 5.083663624155598e-06, + "loss": 0.7107, + "step": 31050 + }, + { + "epoch": 0.6671821970185161, + "grad_norm": 0.5203051062665321, + "learning_rate": 5.077728936203055e-06, + "loss": 0.7045, + "step": 31060 + }, + { + "epoch": 0.6673970013317867, + "grad_norm": 0.5041312268909838, + "learning_rate": 5.071796535373561e-06, + "loss": 0.7156, + "step": 31070 + }, + { + "epoch": 0.6676118056450574, + "grad_norm": 0.5159105151923982, + "learning_rate": 5.065866424423589e-06, + "loss": 0.6996, + "step": 31080 + }, + { + "epoch": 0.667826609958328, + "grad_norm": 0.5186673038935381, + "learning_rate": 5.059938606108554e-06, + "loss": 0.6947, + "step": 31090 + }, + { + "epoch": 0.6680414142715986, + "grad_norm": 0.5233119049957315, + "learning_rate": 5.054013083182808e-06, + "loss": 0.7172, + "step": 31100 + }, + { + "epoch": 0.6682562185848692, + "grad_norm": 0.5067736527978581, + "learning_rate": 5.048089858399632e-06, + "loss": 0.7021, + "step": 31110 + }, + { + "epoch": 0.6684710228981398, + "grad_norm": 0.5039632454745936, + "learning_rate": 5.042168934511237e-06, + "loss": 0.7059, + "step": 31120 + }, + { + "epoch": 0.6686858272114105, + "grad_norm": 0.5504075401326516, + "learning_rate": 5.036250314268775e-06, + "loss": 0.7125, + "step": 31130 + }, + { + "epoch": 0.668900631524681, + "grad_norm": 0.5140327017691517, + "learning_rate": 5.030334000422311e-06, + "loss": 0.716, + "step": 31140 + }, + { + "epoch": 
0.6691154358379516, + "grad_norm": 0.5174324057304772, + "learning_rate": 5.024419995720854e-06, + "loss": 0.7091, + "step": 31150 + }, + { + "epoch": 0.6693302401512222, + "grad_norm": 0.5105348539745065, + "learning_rate": 5.0185083029123326e-06, + "loss": 0.7188, + "step": 31160 + }, + { + "epoch": 0.6695450444644928, + "grad_norm": 0.5104216621284471, + "learning_rate": 5.012598924743603e-06, + "loss": 0.6973, + "step": 31170 + }, + { + "epoch": 0.6697598487777635, + "grad_norm": 0.5015920565585724, + "learning_rate": 5.006691863960448e-06, + "loss": 0.7, + "step": 31180 + }, + { + "epoch": 0.6699746530910341, + "grad_norm": 0.5246820583424576, + "learning_rate": 5.000787123307562e-06, + "loss": 0.7043, + "step": 31190 + }, + { + "epoch": 0.6701894574043047, + "grad_norm": 0.5157821738943842, + "learning_rate": 4.994884705528583e-06, + "loss": 0.7145, + "step": 31200 + }, + { + "epoch": 0.6704042617175753, + "grad_norm": 0.49631469998188016, + "learning_rate": 4.9889846133660495e-06, + "loss": 0.7218, + "step": 31210 + }, + { + "epoch": 0.6706190660308459, + "grad_norm": 0.49915868212535197, + "learning_rate": 4.983086849561427e-06, + "loss": 0.7047, + "step": 31220 + }, + { + "epoch": 0.6708338703441165, + "grad_norm": 0.5307081421883926, + "learning_rate": 4.977191416855102e-06, + "loss": 0.7166, + "step": 31230 + }, + { + "epoch": 0.6710486746573872, + "grad_norm": 0.5169884060557652, + "learning_rate": 4.971298317986374e-06, + "loss": 0.7257, + "step": 31240 + }, + { + "epoch": 0.6712634789706577, + "grad_norm": 0.5009977824029985, + "learning_rate": 4.965407555693464e-06, + "loss": 0.7004, + "step": 31250 + }, + { + "epoch": 0.6714782832839283, + "grad_norm": 0.5155270739896842, + "learning_rate": 4.9595191327134915e-06, + "loss": 0.7153, + "step": 31260 + }, + { + "epoch": 0.6716930875971989, + "grad_norm": 0.5044556594755597, + "learning_rate": 4.9536330517825085e-06, + "loss": 0.7048, + "step": 31270 + }, + { + "epoch": 0.6719078919104695, + "grad_norm": 0.5050373362709072, + "learning_rate": 4.947749315635467e-06, + "loss": 0.7176, + "step": 31280 + }, + { + "epoch": 0.6721226962237402, + "grad_norm": 0.4952372981882212, + "learning_rate": 4.9418679270062345e-06, + "loss": 0.7315, + "step": 31290 + }, + { + "epoch": 0.6723375005370108, + "grad_norm": 0.5075005573307158, + "learning_rate": 4.935988888627589e-06, + "loss": 0.6999, + "step": 31300 + }, + { + "epoch": 0.6725523048502814, + "grad_norm": 0.5237607258041448, + "learning_rate": 4.930112203231202e-06, + "loss": 0.7199, + "step": 31310 + }, + { + "epoch": 0.672767109163552, + "grad_norm": 0.5140569272272469, + "learning_rate": 4.924237873547678e-06, + "loss": 0.7062, + "step": 31320 + }, + { + "epoch": 0.6729819134768226, + "grad_norm": 0.5064496524470513, + "learning_rate": 4.9183659023065035e-06, + "loss": 0.7098, + "step": 31330 + }, + { + "epoch": 0.6731967177900933, + "grad_norm": 0.49850427047279267, + "learning_rate": 4.912496292236078e-06, + "loss": 0.7127, + "step": 31340 + }, + { + "epoch": 0.6734115221033639, + "grad_norm": 0.5129869200842382, + "learning_rate": 4.906629046063709e-06, + "loss": 0.6957, + "step": 31350 + }, + { + "epoch": 0.6736263264166344, + "grad_norm": 0.5037255148710768, + "learning_rate": 4.900764166515589e-06, + "loss": 0.7053, + "step": 31360 + }, + { + "epoch": 0.673841130729905, + "grad_norm": 0.5218708185856743, + "learning_rate": 4.894901656316837e-06, + "loss": 0.6969, + "step": 31370 + }, + { + "epoch": 0.6740559350431756, + "grad_norm": 0.5106451275587679, + "learning_rate": 
4.889041518191442e-06, + "loss": 0.7029, + "step": 31380 + }, + { + "epoch": 0.6742707393564463, + "grad_norm": 0.504620024778387, + "learning_rate": 4.883183754862319e-06, + "loss": 0.7108, + "step": 31390 + }, + { + "epoch": 0.6744855436697169, + "grad_norm": 0.500245116066161, + "learning_rate": 4.877328369051254e-06, + "loss": 0.7023, + "step": 31400 + }, + { + "epoch": 0.6747003479829875, + "grad_norm": 0.4984112969568294, + "learning_rate": 4.871475363478945e-06, + "loss": 0.6984, + "step": 31410 + }, + { + "epoch": 0.6749151522962581, + "grad_norm": 0.490340528966453, + "learning_rate": 4.865624740864981e-06, + "loss": 0.7171, + "step": 31420 + }, + { + "epoch": 0.6751299566095287, + "grad_norm": 0.5154565088297944, + "learning_rate": 4.859776503927831e-06, + "loss": 0.7254, + "step": 31430 + }, + { + "epoch": 0.6753447609227994, + "grad_norm": 0.5162711441600745, + "learning_rate": 4.8539306553848835e-06, + "loss": 0.7, + "step": 31440 + }, + { + "epoch": 0.67555956523607, + "grad_norm": 0.5296439320774667, + "learning_rate": 4.848087197952385e-06, + "loss": 0.7201, + "step": 31450 + }, + { + "epoch": 0.6757743695493406, + "grad_norm": 0.5211601804495921, + "learning_rate": 4.842246134345492e-06, + "loss": 0.7077, + "step": 31460 + }, + { + "epoch": 0.6759891738626111, + "grad_norm": 0.5204672709973052, + "learning_rate": 4.8364074672782445e-06, + "loss": 0.7025, + "step": 31470 + }, + { + "epoch": 0.6762039781758817, + "grad_norm": 0.5443914115894181, + "learning_rate": 4.8305711994635585e-06, + "loss": 0.7161, + "step": 31480 + }, + { + "epoch": 0.6764187824891524, + "grad_norm": 0.5398665273534337, + "learning_rate": 4.8247373336132565e-06, + "loss": 0.7152, + "step": 31490 + }, + { + "epoch": 0.676633586802423, + "grad_norm": 0.5205394432033161, + "learning_rate": 4.818905872438021e-06, + "loss": 0.7045, + "step": 31500 + }, + { + "epoch": 0.6768483911156936, + "grad_norm": 0.505861773994322, + "learning_rate": 4.813076818647434e-06, + "loss": 0.7176, + "step": 31510 + }, + { + "epoch": 0.6770631954289642, + "grad_norm": 0.5138045231439949, + "learning_rate": 4.807250174949955e-06, + "loss": 0.7092, + "step": 31520 + }, + { + "epoch": 0.6772779997422348, + "grad_norm": 0.5198137824432044, + "learning_rate": 4.801425944052911e-06, + "loss": 0.7115, + "step": 31530 + }, + { + "epoch": 0.6774928040555054, + "grad_norm": 0.5121939716704446, + "learning_rate": 4.795604128662536e-06, + "loss": 0.7001, + "step": 31540 + }, + { + "epoch": 0.6777076083687761, + "grad_norm": 0.5036727571961231, + "learning_rate": 4.78978473148391e-06, + "loss": 0.7005, + "step": 31550 + }, + { + "epoch": 0.6779224126820467, + "grad_norm": 0.4963339926656166, + "learning_rate": 4.783967755221008e-06, + "loss": 0.6981, + "step": 31560 + }, + { + "epoch": 0.6781372169953173, + "grad_norm": 0.5026469805657872, + "learning_rate": 4.778153202576678e-06, + "loss": 0.7061, + "step": 31570 + }, + { + "epoch": 0.6783520213085878, + "grad_norm": 0.4888239012176708, + "learning_rate": 4.772341076252639e-06, + "loss": 0.6928, + "step": 31580 + }, + { + "epoch": 0.6785668256218584, + "grad_norm": 0.5033386278198433, + "learning_rate": 4.766531378949487e-06, + "loss": 0.7066, + "step": 31590 + }, + { + "epoch": 0.6787816299351291, + "grad_norm": 0.5076861079010834, + "learning_rate": 4.7607241133666735e-06, + "loss": 0.7045, + "step": 31600 + }, + { + "epoch": 0.6789964342483997, + "grad_norm": 0.5368479007734174, + "learning_rate": 4.75491928220255e-06, + "loss": 0.7136, + "step": 31610 + }, + { + "epoch": 
0.6792112385616703, + "grad_norm": 0.5082100349533373, + "learning_rate": 4.749116888154306e-06, + "loss": 0.7046, + "step": 31620 + }, + { + "epoch": 0.6794260428749409, + "grad_norm": 0.5133784115796547, + "learning_rate": 4.743316933918016e-06, + "loss": 0.7035, + "step": 31630 + }, + { + "epoch": 0.6796408471882115, + "grad_norm": 0.5274702887841751, + "learning_rate": 4.737519422188617e-06, + "loss": 0.707, + "step": 31640 + }, + { + "epoch": 0.6798556515014822, + "grad_norm": 0.5053552682276936, + "learning_rate": 4.73172435565991e-06, + "loss": 0.7126, + "step": 31650 + }, + { + "epoch": 0.6800704558147528, + "grad_norm": 0.529992238236458, + "learning_rate": 4.725931737024565e-06, + "loss": 0.7124, + "step": 31660 + }, + { + "epoch": 0.6802852601280234, + "grad_norm": 0.5316204503933539, + "learning_rate": 4.720141568974104e-06, + "loss": 0.7188, + "step": 31670 + }, + { + "epoch": 0.680500064441294, + "grad_norm": 0.5054012212316678, + "learning_rate": 4.714353854198918e-06, + "loss": 0.7118, + "step": 31680 + }, + { + "epoch": 0.6807148687545646, + "grad_norm": 0.5143920728251262, + "learning_rate": 4.708568595388258e-06, + "loss": 0.7087, + "step": 31690 + }, + { + "epoch": 0.6809296730678353, + "grad_norm": 0.5322332430035215, + "learning_rate": 4.7027857952302315e-06, + "loss": 0.7123, + "step": 31700 + }, + { + "epoch": 0.6811444773811058, + "grad_norm": 0.520262064220361, + "learning_rate": 4.697005456411811e-06, + "loss": 0.7156, + "step": 31710 + }, + { + "epoch": 0.6813592816943764, + "grad_norm": 0.5095960914177793, + "learning_rate": 4.691227581618808e-06, + "loss": 0.7102, + "step": 31720 + }, + { + "epoch": 0.681574086007647, + "grad_norm": 0.509847182822789, + "learning_rate": 4.685452173535906e-06, + "loss": 0.7073, + "step": 31730 + }, + { + "epoch": 0.6817888903209176, + "grad_norm": 0.4928338256132167, + "learning_rate": 4.679679234846636e-06, + "loss": 0.7094, + "step": 31740 + }, + { + "epoch": 0.6820036946341883, + "grad_norm": 0.5281801273797183, + "learning_rate": 4.67390876823338e-06, + "loss": 0.7103, + "step": 31750 + }, + { + "epoch": 0.6822184989474589, + "grad_norm": 0.5147073369745232, + "learning_rate": 4.668140776377378e-06, + "loss": 0.7138, + "step": 31760 + }, + { + "epoch": 0.6824333032607295, + "grad_norm": 0.518041164686382, + "learning_rate": 4.66237526195871e-06, + "loss": 0.7056, + "step": 31770 + }, + { + "epoch": 0.6826481075740001, + "grad_norm": 0.5032736884933615, + "learning_rate": 4.656612227656318e-06, + "loss": 0.7005, + "step": 31780 + }, + { + "epoch": 0.6828629118872707, + "grad_norm": 0.5164912240008823, + "learning_rate": 4.650851676147976e-06, + "loss": 0.7112, + "step": 31790 + }, + { + "epoch": 0.6830777162005413, + "grad_norm": 0.5190164964456715, + "learning_rate": 4.645093610110314e-06, + "loss": 0.7031, + "step": 31800 + }, + { + "epoch": 0.683292520513812, + "grad_norm": 0.499193870377874, + "learning_rate": 4.639338032218806e-06, + "loss": 0.6953, + "step": 31810 + }, + { + "epoch": 0.6835073248270825, + "grad_norm": 0.5107643101481627, + "learning_rate": 4.633584945147771e-06, + "loss": 0.7058, + "step": 31820 + }, + { + "epoch": 0.6837221291403531, + "grad_norm": 0.4788958047068638, + "learning_rate": 4.627834351570368e-06, + "loss": 0.6999, + "step": 31830 + }, + { + "epoch": 0.6839369334536237, + "grad_norm": 0.5243752262151895, + "learning_rate": 4.622086254158602e-06, + "loss": 0.7093, + "step": 31840 + }, + { + "epoch": 0.6841517377668943, + "grad_norm": 0.5117247308515849, + "learning_rate": 
4.616340655583307e-06, + "loss": 0.7127, + "step": 31850 + }, + { + "epoch": 0.684366542080165, + "grad_norm": 0.5039057854230863, + "learning_rate": 4.610597558514167e-06, + "loss": 0.7155, + "step": 31860 + }, + { + "epoch": 0.6845813463934356, + "grad_norm": 0.5110748098732393, + "learning_rate": 4.6048569656197005e-06, + "loss": 0.7182, + "step": 31870 + }, + { + "epoch": 0.6847961507067062, + "grad_norm": 0.5033241610487543, + "learning_rate": 4.599118879567262e-06, + "loss": 0.7178, + "step": 31880 + }, + { + "epoch": 0.6850109550199768, + "grad_norm": 0.5330181219085199, + "learning_rate": 4.593383303023047e-06, + "loss": 0.6988, + "step": 31890 + }, + { + "epoch": 0.6852257593332474, + "grad_norm": 0.5146643640112155, + "learning_rate": 4.587650238652068e-06, + "loss": 0.7071, + "step": 31900 + }, + { + "epoch": 0.6854405636465181, + "grad_norm": 0.5558498206129705, + "learning_rate": 4.581919689118187e-06, + "loss": 0.7089, + "step": 31910 + }, + { + "epoch": 0.6856553679597887, + "grad_norm": 0.5051408069436808, + "learning_rate": 4.576191657084093e-06, + "loss": 0.7171, + "step": 31920 + }, + { + "epoch": 0.6858701722730592, + "grad_norm": 0.4947858066673929, + "learning_rate": 4.570466145211303e-06, + "loss": 0.705, + "step": 31930 + }, + { + "epoch": 0.6860849765863298, + "grad_norm": 0.49215658445409516, + "learning_rate": 4.564743156160167e-06, + "loss": 0.7004, + "step": 31940 + }, + { + "epoch": 0.6862997808996004, + "grad_norm": 0.5052799951057044, + "learning_rate": 4.559022692589851e-06, + "loss": 0.7048, + "step": 31950 + }, + { + "epoch": 0.6865145852128711, + "grad_norm": 0.49573841106926664, + "learning_rate": 4.553304757158372e-06, + "loss": 0.7074, + "step": 31960 + }, + { + "epoch": 0.6867293895261417, + "grad_norm": 0.5143793031370387, + "learning_rate": 4.547589352522542e-06, + "loss": 0.7039, + "step": 31970 + }, + { + "epoch": 0.6869441938394123, + "grad_norm": 0.504158519855662, + "learning_rate": 4.541876481338019e-06, + "loss": 0.7006, + "step": 31980 + }, + { + "epoch": 0.6871589981526829, + "grad_norm": 0.503671838515148, + "learning_rate": 4.536166146259276e-06, + "loss": 0.7147, + "step": 31990 + }, + { + "epoch": 0.6873738024659535, + "grad_norm": 0.5287913454757696, + "learning_rate": 4.530458349939606e-06, + "loss": 0.7243, + "step": 32000 + }, + { + "epoch": 0.6875886067792242, + "grad_norm": 0.5016494023049036, + "learning_rate": 4.524753095031133e-06, + "loss": 0.7028, + "step": 32010 + }, + { + "epoch": 0.6878034110924948, + "grad_norm": 0.5166519761915471, + "learning_rate": 4.519050384184775e-06, + "loss": 0.7129, + "step": 32020 + }, + { + "epoch": 0.6880182154057654, + "grad_norm": 0.5158977598310385, + "learning_rate": 4.513350220050305e-06, + "loss": 0.7071, + "step": 32030 + }, + { + "epoch": 0.6882330197190359, + "grad_norm": 0.5016269659965205, + "learning_rate": 4.507652605276278e-06, + "loss": 0.7047, + "step": 32040 + }, + { + "epoch": 0.6884478240323065, + "grad_norm": 0.5163690901069261, + "learning_rate": 4.501957542510085e-06, + "loss": 0.7117, + "step": 32050 + }, + { + "epoch": 0.6886626283455772, + "grad_norm": 0.5108399393202574, + "learning_rate": 4.4962650343979255e-06, + "loss": 0.7183, + "step": 32060 + }, + { + "epoch": 0.6888774326588478, + "grad_norm": 0.5084486965060432, + "learning_rate": 4.490575083584804e-06, + "loss": 0.7138, + "step": 32070 + }, + { + "epoch": 0.6890922369721184, + "grad_norm": 0.5218646619009035, + "learning_rate": 4.484887692714557e-06, + "loss": 0.7106, + "step": 32080 + }, + { + "epoch": 
0.689307041285389, + "grad_norm": 0.5201258534596843, + "learning_rate": 4.479202864429808e-06, + "loss": 0.6951, + "step": 32090 + }, + { + "epoch": 0.6895218455986596, + "grad_norm": 0.516396272076133, + "learning_rate": 4.473520601372006e-06, + "loss": 0.7136, + "step": 32100 + }, + { + "epoch": 0.6897366499119302, + "grad_norm": 0.5032875799939698, + "learning_rate": 4.467840906181403e-06, + "loss": 0.6944, + "step": 32110 + }, + { + "epoch": 0.6899514542252009, + "grad_norm": 0.49750240137602164, + "learning_rate": 4.462163781497051e-06, + "loss": 0.7029, + "step": 32120 + }, + { + "epoch": 0.6901662585384715, + "grad_norm": 0.5033581986528483, + "learning_rate": 4.456489229956826e-06, + "loss": 0.7043, + "step": 32130 + }, + { + "epoch": 0.6903810628517421, + "grad_norm": 0.5166768407944105, + "learning_rate": 4.450817254197386e-06, + "loss": 0.6929, + "step": 32140 + }, + { + "epoch": 0.6905958671650126, + "grad_norm": 0.4918195538890641, + "learning_rate": 4.4451478568542064e-06, + "loss": 0.7109, + "step": 32150 + }, + { + "epoch": 0.6908106714782832, + "grad_norm": 0.5145980685669729, + "learning_rate": 4.439481040561565e-06, + "loss": 0.7008, + "step": 32160 + }, + { + "epoch": 0.6910254757915539, + "grad_norm": 0.5170120361869788, + "learning_rate": 4.433816807952525e-06, + "loss": 0.7102, + "step": 32170 + }, + { + "epoch": 0.6912402801048245, + "grad_norm": 0.5390912485506946, + "learning_rate": 4.428155161658976e-06, + "loss": 0.6978, + "step": 32180 + }, + { + "epoch": 0.6914550844180951, + "grad_norm": 0.513671096300247, + "learning_rate": 4.422496104311574e-06, + "loss": 0.704, + "step": 32190 + }, + { + "epoch": 0.6916698887313657, + "grad_norm": 0.5512396747122822, + "learning_rate": 4.416839638539804e-06, + "loss": 0.7071, + "step": 32200 + }, + { + "epoch": 0.6918846930446363, + "grad_norm": 0.5039999780605813, + "learning_rate": 4.411185766971919e-06, + "loss": 0.7098, + "step": 32210 + }, + { + "epoch": 0.692099497357907, + "grad_norm": 0.5056673300275484, + "learning_rate": 4.4055344922349845e-06, + "loss": 0.7002, + "step": 32220 + }, + { + "epoch": 0.6923143016711776, + "grad_norm": 0.5228713724860102, + "learning_rate": 4.399885816954855e-06, + "loss": 0.7069, + "step": 32230 + }, + { + "epoch": 0.6925291059844482, + "grad_norm": 0.5154170548308558, + "learning_rate": 4.394239743756166e-06, + "loss": 0.7173, + "step": 32240 + }, + { + "epoch": 0.6927439102977188, + "grad_norm": 0.4932044034667852, + "learning_rate": 4.3885962752623675e-06, + "loss": 0.7, + "step": 32250 + }, + { + "epoch": 0.6929587146109893, + "grad_norm": 0.5291577432915663, + "learning_rate": 4.382955414095675e-06, + "loss": 0.707, + "step": 32260 + }, + { + "epoch": 0.69317351892426, + "grad_norm": 0.5062081138297674, + "learning_rate": 4.3773171628771075e-06, + "loss": 0.7053, + "step": 32270 + }, + { + "epoch": 0.6933883232375306, + "grad_norm": 0.5235760210430898, + "learning_rate": 4.371681524226469e-06, + "loss": 0.7203, + "step": 32280 + }, + { + "epoch": 0.6936031275508012, + "grad_norm": 0.5061927155930297, + "learning_rate": 4.366048500762335e-06, + "loss": 0.7112, + "step": 32290 + }, + { + "epoch": 0.6938179318640718, + "grad_norm": 0.5131390727212936, + "learning_rate": 4.360418095102097e-06, + "loss": 0.7095, + "step": 32300 + }, + { + "epoch": 0.6940327361773424, + "grad_norm": 0.5280050051289648, + "learning_rate": 4.354790309861897e-06, + "loss": 0.7088, + "step": 32310 + }, + { + "epoch": 0.6942475404906131, + "grad_norm": 0.5170240200200222, + "learning_rate": 
4.349165147656679e-06, + "loss": 0.6946, + "step": 32320 + }, + { + "epoch": 0.6944623448038837, + "grad_norm": 0.49194844383818315, + "learning_rate": 4.343542611100161e-06, + "loss": 0.7109, + "step": 32330 + }, + { + "epoch": 0.6946771491171543, + "grad_norm": 0.5021844766055354, + "learning_rate": 4.337922702804846e-06, + "loss": 0.6949, + "step": 32340 + }, + { + "epoch": 0.6948919534304249, + "grad_norm": 0.5099714451400136, + "learning_rate": 4.332305425382013e-06, + "loss": 0.707, + "step": 32350 + }, + { + "epoch": 0.6951067577436955, + "grad_norm": 0.5212733236473239, + "learning_rate": 4.326690781441711e-06, + "loss": 0.6997, + "step": 32360 + }, + { + "epoch": 0.6953215620569662, + "grad_norm": 0.4903110024798184, + "learning_rate": 4.3210787735927824e-06, + "loss": 0.7021, + "step": 32370 + }, + { + "epoch": 0.6955363663702367, + "grad_norm": 0.505886652482517, + "learning_rate": 4.315469404442829e-06, + "loss": 0.7, + "step": 32380 + }, + { + "epoch": 0.6957511706835073, + "grad_norm": 0.5348702621789376, + "learning_rate": 4.309862676598233e-06, + "loss": 0.6989, + "step": 32390 + }, + { + "epoch": 0.6959659749967779, + "grad_norm": 0.5053002509205807, + "learning_rate": 4.304258592664151e-06, + "loss": 0.705, + "step": 32400 + }, + { + "epoch": 0.6961807793100485, + "grad_norm": 0.49313377903105116, + "learning_rate": 4.298657155244508e-06, + "loss": 0.7126, + "step": 32410 + }, + { + "epoch": 0.6963955836233191, + "grad_norm": 0.49836073223251326, + "learning_rate": 4.293058366942004e-06, + "loss": 0.6942, + "step": 32420 + }, + { + "epoch": 0.6966103879365898, + "grad_norm": 0.5033826955151206, + "learning_rate": 4.287462230358097e-06, + "loss": 0.69, + "step": 32430 + }, + { + "epoch": 0.6968251922498604, + "grad_norm": 0.5124591714204908, + "learning_rate": 4.281868748093023e-06, + "loss": 0.7088, + "step": 32440 + }, + { + "epoch": 0.697039996563131, + "grad_norm": 0.5152063252453369, + "learning_rate": 4.276277922745784e-06, + "loss": 0.7235, + "step": 32450 + }, + { + "epoch": 0.6972548008764016, + "grad_norm": 0.5068795899169701, + "learning_rate": 4.2706897569141435e-06, + "loss": 0.6995, + "step": 32460 + }, + { + "epoch": 0.6974696051896722, + "grad_norm": 0.5194180550407479, + "learning_rate": 4.2651042531946364e-06, + "loss": 0.7132, + "step": 32470 + }, + { + "epoch": 0.6976844095029429, + "grad_norm": 0.5093344990238854, + "learning_rate": 4.259521414182547e-06, + "loss": 0.6991, + "step": 32480 + }, + { + "epoch": 0.6978992138162134, + "grad_norm": 0.5226727716813793, + "learning_rate": 4.253941242471934e-06, + "loss": 0.7052, + "step": 32490 + }, + { + "epoch": 0.698114018129484, + "grad_norm": 0.5101918234709942, + "learning_rate": 4.248363740655612e-06, + "loss": 0.709, + "step": 32500 + }, + { + "epoch": 0.6983288224427546, + "grad_norm": 0.5101101585718761, + "learning_rate": 4.242788911325156e-06, + "loss": 0.7108, + "step": 32510 + }, + { + "epoch": 0.6985436267560252, + "grad_norm": 0.4867970175259229, + "learning_rate": 4.2372167570709e-06, + "loss": 0.7042, + "step": 32520 + }, + { + "epoch": 0.6987584310692959, + "grad_norm": 0.5014423746270559, + "learning_rate": 4.231647280481936e-06, + "loss": 0.694, + "step": 32530 + }, + { + "epoch": 0.6989732353825665, + "grad_norm": 0.5146828954090973, + "learning_rate": 4.226080484146103e-06, + "loss": 0.7207, + "step": 32540 + }, + { + "epoch": 0.6991880396958371, + "grad_norm": 0.4901928041129853, + "learning_rate": 4.220516370650007e-06, + "loss": 0.6995, + "step": 32550 + }, + { + "epoch": 
0.6994028440091077, + "grad_norm": 0.5066794982082476, + "learning_rate": 4.214954942578997e-06, + "loss": 0.7031, + "step": 32560 + }, + { + "epoch": 0.6996176483223783, + "grad_norm": 0.5117861279536446, + "learning_rate": 4.209396202517183e-06, + "loss": 0.6918, + "step": 32570 + }, + { + "epoch": 0.699832452635649, + "grad_norm": 0.5064003948776196, + "learning_rate": 4.203840153047422e-06, + "loss": 0.706, + "step": 32580 + }, + { + "epoch": 0.7000472569489196, + "grad_norm": 0.5292023184958814, + "learning_rate": 4.198286796751318e-06, + "loss": 0.7005, + "step": 32590 + }, + { + "epoch": 0.7002620612621901, + "grad_norm": 0.5050282381222843, + "learning_rate": 4.1927361362092336e-06, + "loss": 0.699, + "step": 32600 + }, + { + "epoch": 0.7004768655754607, + "grad_norm": 0.5072730993957972, + "learning_rate": 4.187188174000262e-06, + "loss": 0.6993, + "step": 32610 + }, + { + "epoch": 0.7006916698887313, + "grad_norm": 0.5081741609904642, + "learning_rate": 4.181642912702256e-06, + "loss": 0.7046, + "step": 32620 + }, + { + "epoch": 0.700906474202002, + "grad_norm": 0.5132090809761299, + "learning_rate": 4.176100354891812e-06, + "loss": 0.6907, + "step": 32630 + }, + { + "epoch": 0.7011212785152726, + "grad_norm": 0.5114069928059646, + "learning_rate": 4.170560503144266e-06, + "loss": 0.7162, + "step": 32640 + }, + { + "epoch": 0.7013360828285432, + "grad_norm": 0.4963333550516421, + "learning_rate": 4.165023360033703e-06, + "loss": 0.706, + "step": 32650 + }, + { + "epoch": 0.7015508871418138, + "grad_norm": 0.5148865519289305, + "learning_rate": 4.159488928132938e-06, + "loss": 0.7147, + "step": 32660 + }, + { + "epoch": 0.7017656914550844, + "grad_norm": 0.5264557896427997, + "learning_rate": 4.153957210013535e-06, + "loss": 0.7022, + "step": 32670 + }, + { + "epoch": 0.7019804957683551, + "grad_norm": 0.5085093647122813, + "learning_rate": 4.148428208245799e-06, + "loss": 0.714, + "step": 32680 + }, + { + "epoch": 0.7021953000816257, + "grad_norm": 0.5323744427421241, + "learning_rate": 4.142901925398766e-06, + "loss": 0.6977, + "step": 32690 + }, + { + "epoch": 0.7024101043948963, + "grad_norm": 0.5166468220520238, + "learning_rate": 4.137378364040216e-06, + "loss": 0.6951, + "step": 32700 + }, + { + "epoch": 0.7026249087081669, + "grad_norm": 0.5004414110795155, + "learning_rate": 4.1318575267366515e-06, + "loss": 0.718, + "step": 32710 + }, + { + "epoch": 0.7028397130214374, + "grad_norm": 0.5144502686296928, + "learning_rate": 4.12633941605333e-06, + "loss": 0.6955, + "step": 32720 + }, + { + "epoch": 0.703054517334708, + "grad_norm": 0.503363662007366, + "learning_rate": 4.120824034554221e-06, + "loss": 0.709, + "step": 32730 + }, + { + "epoch": 0.7032693216479787, + "grad_norm": 0.532794569128917, + "learning_rate": 4.115311384802038e-06, + "loss": 0.7011, + "step": 32740 + }, + { + "epoch": 0.7034841259612493, + "grad_norm": 0.5310969014155835, + "learning_rate": 4.1098014693582265e-06, + "loss": 0.7008, + "step": 32750 + }, + { + "epoch": 0.7036989302745199, + "grad_norm": 0.4961846506249242, + "learning_rate": 4.104294290782946e-06, + "loss": 0.6958, + "step": 32760 + }, + { + "epoch": 0.7039137345877905, + "grad_norm": 0.5131064931445611, + "learning_rate": 4.09878985163511e-06, + "loss": 0.7031, + "step": 32770 + }, + { + "epoch": 0.7041285389010611, + "grad_norm": 0.501337109243204, + "learning_rate": 4.09328815447233e-06, + "loss": 0.7002, + "step": 32780 + }, + { + "epoch": 0.7043433432143318, + "grad_norm": 0.5010141549810861, + "learning_rate": 
4.0877892018509735e-06, + "loss": 0.6982, + "step": 32790 + }, + { + "epoch": 0.7045581475276024, + "grad_norm": 0.5190222646312463, + "learning_rate": 4.082292996326107e-06, + "loss": 0.6942, + "step": 32800 + }, + { + "epoch": 0.704772951840873, + "grad_norm": 0.5021762528824651, + "learning_rate": 4.076799540451532e-06, + "loss": 0.7036, + "step": 32810 + }, + { + "epoch": 0.7049877561541436, + "grad_norm": 0.5216162795537221, + "learning_rate": 4.071308836779778e-06, + "loss": 0.705, + "step": 32820 + }, + { + "epoch": 0.7052025604674141, + "grad_norm": 0.5081741693301844, + "learning_rate": 4.065820887862077e-06, + "loss": 0.7118, + "step": 32830 + }, + { + "epoch": 0.7054173647806848, + "grad_norm": 0.4991610018130501, + "learning_rate": 4.0603356962484075e-06, + "loss": 0.7004, + "step": 32840 + }, + { + "epoch": 0.7056321690939554, + "grad_norm": 0.5315742680425537, + "learning_rate": 4.054853264487442e-06, + "loss": 0.6897, + "step": 32850 + }, + { + "epoch": 0.705846973407226, + "grad_norm": 0.5012628613471524, + "learning_rate": 4.049373595126584e-06, + "loss": 0.6952, + "step": 32860 + }, + { + "epoch": 0.7060617777204966, + "grad_norm": 0.5112201352671399, + "learning_rate": 4.043896690711954e-06, + "loss": 0.7163, + "step": 32870 + }, + { + "epoch": 0.7062765820337672, + "grad_norm": 0.4955072270942802, + "learning_rate": 4.0384225537883735e-06, + "loss": 0.7037, + "step": 32880 + }, + { + "epoch": 0.7064913863470379, + "grad_norm": 0.5272007156898026, + "learning_rate": 4.032951186899404e-06, + "loss": 0.7045, + "step": 32890 + }, + { + "epoch": 0.7067061906603085, + "grad_norm": 0.5131148249079197, + "learning_rate": 4.027482592587294e-06, + "loss": 0.7132, + "step": 32900 + }, + { + "epoch": 0.7069209949735791, + "grad_norm": 0.50205423046405, + "learning_rate": 4.022016773393017e-06, + "loss": 0.7166, + "step": 32910 + }, + { + "epoch": 0.7071357992868497, + "grad_norm": 0.5022328702294467, + "learning_rate": 4.0165537318562595e-06, + "loss": 0.7065, + "step": 32920 + }, + { + "epoch": 0.7073506036001203, + "grad_norm": 0.5175065388500295, + "learning_rate": 4.011093470515402e-06, + "loss": 0.6979, + "step": 32930 + }, + { + "epoch": 0.707565407913391, + "grad_norm": 0.5212370076896278, + "learning_rate": 4.005635991907556e-06, + "loss": 0.7022, + "step": 32940 + }, + { + "epoch": 0.7077802122266615, + "grad_norm": 0.4941680464626519, + "learning_rate": 4.000181298568514e-06, + "loss": 0.6891, + "step": 32950 + }, + { + "epoch": 0.7079950165399321, + "grad_norm": 0.5160432814123912, + "learning_rate": 3.994729393032803e-06, + "loss": 0.6975, + "step": 32960 + }, + { + "epoch": 0.7082098208532027, + "grad_norm": 0.5090291581171473, + "learning_rate": 3.989280277833629e-06, + "loss": 0.7021, + "step": 32970 + }, + { + "epoch": 0.7084246251664733, + "grad_norm": 0.5048922334558574, + "learning_rate": 3.983833955502915e-06, + "loss": 0.6897, + "step": 32980 + }, + { + "epoch": 0.7086394294797439, + "grad_norm": 0.5065038482347751, + "learning_rate": 3.978390428571286e-06, + "loss": 0.6934, + "step": 32990 + }, + { + "epoch": 0.7088542337930146, + "grad_norm": 0.535448626077636, + "learning_rate": 3.972949699568057e-06, + "loss": 0.6893, + "step": 33000 + }, + { + "epoch": 0.7090690381062852, + "grad_norm": 0.5169501432061164, + "learning_rate": 3.967511771021264e-06, + "loss": 0.7029, + "step": 33010 + }, + { + "epoch": 0.7092838424195558, + "grad_norm": 0.511359264294529, + "learning_rate": 3.96207664545762e-06, + "loss": 0.7034, + "step": 33020 + }, + { + "epoch": 
0.7094986467328264, + "grad_norm": 0.4960055217526054, + "learning_rate": 3.956644325402547e-06, + "loss": 0.7024, + "step": 33030 + }, + { + "epoch": 0.709713451046097, + "grad_norm": 0.5274217499816306, + "learning_rate": 3.951214813380164e-06, + "loss": 0.7051, + "step": 33040 + }, + { + "epoch": 0.7099282553593677, + "grad_norm": 0.4966382776596969, + "learning_rate": 3.9457881119132745e-06, + "loss": 0.6994, + "step": 33050 + }, + { + "epoch": 0.7101430596726382, + "grad_norm": 0.5360848716987743, + "learning_rate": 3.940364223523398e-06, + "loss": 0.7146, + "step": 33060 + }, + { + "epoch": 0.7103578639859088, + "grad_norm": 0.5765957098247436, + "learning_rate": 3.934943150730719e-06, + "loss": 0.6882, + "step": 33070 + }, + { + "epoch": 0.7105726682991794, + "grad_norm": 0.5219156969848493, + "learning_rate": 3.9295248960541355e-06, + "loss": 0.7012, + "step": 33080 + }, + { + "epoch": 0.71078747261245, + "grad_norm": 0.5177328505359572, + "learning_rate": 3.924109462011225e-06, + "loss": 0.6934, + "step": 33090 + }, + { + "epoch": 0.7110022769257207, + "grad_norm": 0.5083853674745945, + "learning_rate": 3.91869685111826e-06, + "loss": 0.699, + "step": 33100 + }, + { + "epoch": 0.7112170812389913, + "grad_norm": 0.5161556089894507, + "learning_rate": 3.913287065890201e-06, + "loss": 0.6894, + "step": 33110 + }, + { + "epoch": 0.7114318855522619, + "grad_norm": 0.5176090611045527, + "learning_rate": 3.907880108840688e-06, + "loss": 0.7132, + "step": 33120 + }, + { + "epoch": 0.7116466898655325, + "grad_norm": 0.5048282675725968, + "learning_rate": 3.902475982482055e-06, + "loss": 0.7103, + "step": 33130 + }, + { + "epoch": 0.7118614941788031, + "grad_norm": 0.5184451927132917, + "learning_rate": 3.897074689325316e-06, + "loss": 0.6916, + "step": 33140 + }, + { + "epoch": 0.7120762984920738, + "grad_norm": 0.5010826412828934, + "learning_rate": 3.891676231880175e-06, + "loss": 0.6897, + "step": 33150 + }, + { + "epoch": 0.7122911028053444, + "grad_norm": 0.524146057682627, + "learning_rate": 3.8862806126550105e-06, + "loss": 0.7169, + "step": 33160 + }, + { + "epoch": 0.712505907118615, + "grad_norm": 0.5038515983066828, + "learning_rate": 3.8808878341568875e-06, + "loss": 0.7081, + "step": 33170 + }, + { + "epoch": 0.7127207114318855, + "grad_norm": 0.5255219224371609, + "learning_rate": 3.875497898891552e-06, + "loss": 0.7022, + "step": 33180 + }, + { + "epoch": 0.7129355157451561, + "grad_norm": 0.4985578239131686, + "learning_rate": 3.87011080936342e-06, + "loss": 0.7012, + "step": 33190 + }, + { + "epoch": 0.7131503200584268, + "grad_norm": 0.4924989128893002, + "learning_rate": 3.864726568075595e-06, + "loss": 0.6947, + "step": 33200 + }, + { + "epoch": 0.7133651243716974, + "grad_norm": 0.5309886497942049, + "learning_rate": 3.859345177529853e-06, + "loss": 0.7049, + "step": 33210 + }, + { + "epoch": 0.713579928684968, + "grad_norm": 0.5206730299572171, + "learning_rate": 3.8539666402266465e-06, + "loss": 0.6977, + "step": 33220 + }, + { + "epoch": 0.7137947329982386, + "grad_norm": 0.5158488974635094, + "learning_rate": 3.848590958665104e-06, + "loss": 0.7065, + "step": 33230 + }, + { + "epoch": 0.7140095373115092, + "grad_norm": 0.48965491649636267, + "learning_rate": 3.843218135343019e-06, + "loss": 0.7011, + "step": 33240 + }, + { + "epoch": 0.7142243416247799, + "grad_norm": 0.564742066719133, + "learning_rate": 3.837848172756865e-06, + "loss": 0.7058, + "step": 33250 + }, + { + "epoch": 0.7144391459380505, + "grad_norm": 0.5137654269820426, + "learning_rate": 
3.832481073401786e-06, + "loss": 0.7052, + "step": 33260 + }, + { + "epoch": 0.7146539502513211, + "grad_norm": 0.5180489971377459, + "learning_rate": 3.827116839771593e-06, + "loss": 0.7148, + "step": 33270 + }, + { + "epoch": 0.7148687545645916, + "grad_norm": 0.5058146041046021, + "learning_rate": 3.821755474358764e-06, + "loss": 0.6951, + "step": 33280 + }, + { + "epoch": 0.7150835588778622, + "grad_norm": 0.5048478043474748, + "learning_rate": 3.816396979654452e-06, + "loss": 0.708, + "step": 33290 + }, + { + "epoch": 0.7152983631911328, + "grad_norm": 0.5200290541035588, + "learning_rate": 3.8110413581484628e-06, + "loss": 0.7076, + "step": 33300 + }, + { + "epoch": 0.7155131675044035, + "grad_norm": 0.5041024765974936, + "learning_rate": 3.805688612329279e-06, + "loss": 0.7087, + "step": 33310 + }, + { + "epoch": 0.7157279718176741, + "grad_norm": 0.517113427361278, + "learning_rate": 3.800338744684041e-06, + "loss": 0.7035, + "step": 33320 + }, + { + "epoch": 0.7159427761309447, + "grad_norm": 0.5180046345306407, + "learning_rate": 3.794991757698555e-06, + "loss": 0.7112, + "step": 33330 + }, + { + "epoch": 0.7161575804442153, + "grad_norm": 0.5088817995846314, + "learning_rate": 3.7896476538572914e-06, + "loss": 0.7107, + "step": 33340 + }, + { + "epoch": 0.7163723847574859, + "grad_norm": 0.49205509109092965, + "learning_rate": 3.7843064356433656e-06, + "loss": 0.6988, + "step": 33350 + }, + { + "epoch": 0.7165871890707566, + "grad_norm": 0.5006056060538654, + "learning_rate": 3.7789681055385787e-06, + "loss": 0.7052, + "step": 33360 + }, + { + "epoch": 0.7168019933840272, + "grad_norm": 0.521153940432141, + "learning_rate": 3.7736326660233623e-06, + "loss": 0.7135, + "step": 33370 + }, + { + "epoch": 0.7170167976972978, + "grad_norm": 0.5069579531044478, + "learning_rate": 3.768300119576822e-06, + "loss": 0.7183, + "step": 33380 + }, + { + "epoch": 0.7172316020105683, + "grad_norm": 0.5215580306799487, + "learning_rate": 3.7629704686767144e-06, + "loss": 0.6942, + "step": 33390 + }, + { + "epoch": 0.7174464063238389, + "grad_norm": 0.5122954931151245, + "learning_rate": 3.7576437157994506e-06, + "loss": 0.6879, + "step": 33400 + }, + { + "epoch": 0.7176612106371096, + "grad_norm": 0.5464224896140304, + "learning_rate": 3.7523198634200985e-06, + "loss": 0.7155, + "step": 33410 + }, + { + "epoch": 0.7178760149503802, + "grad_norm": 0.4963871129860938, + "learning_rate": 3.7469989140123696e-06, + "loss": 0.6932, + "step": 33420 + }, + { + "epoch": 0.7180908192636508, + "grad_norm": 0.5220526668691016, + "learning_rate": 3.7416808700486328e-06, + "loss": 0.7096, + "step": 33430 + }, + { + "epoch": 0.7183056235769214, + "grad_norm": 0.5142411024190581, + "learning_rate": 3.7363657339999093e-06, + "loss": 0.6957, + "step": 33440 + }, + { + "epoch": 0.718520427890192, + "grad_norm": 0.5078089733725818, + "learning_rate": 3.7310535083358635e-06, + "loss": 0.7011, + "step": 33450 + }, + { + "epoch": 0.7187352322034627, + "grad_norm": 0.49558634222610376, + "learning_rate": 3.7257441955248153e-06, + "loss": 0.6895, + "step": 33460 + }, + { + "epoch": 0.7189500365167333, + "grad_norm": 0.48750240509688914, + "learning_rate": 3.7204377980337137e-06, + "loss": 0.718, + "step": 33470 + }, + { + "epoch": 0.7191648408300039, + "grad_norm": 0.5033127653562809, + "learning_rate": 3.7151343183281808e-06, + "loss": 0.7034, + "step": 33480 + }, + { + "epoch": 0.7193796451432745, + "grad_norm": 0.4924112060947027, + "learning_rate": 3.7098337588724565e-06, + "loss": 0.6955, + "step": 33490 + }, + { + 
"epoch": 0.719594449456545, + "grad_norm": 0.5143671743596542, + "learning_rate": 3.7045361221294375e-06, + "loss": 0.708, + "step": 33500 + }, + { + "epoch": 0.7198092537698157, + "grad_norm": 0.5082249361246125, + "learning_rate": 3.6992414105606644e-06, + "loss": 0.6782, + "step": 33510 + }, + { + "epoch": 0.7200240580830863, + "grad_norm": 0.5043808950755216, + "learning_rate": 3.693949626626302e-06, + "loss": 0.6979, + "step": 33520 + }, + { + "epoch": 0.7202388623963569, + "grad_norm": 0.5219484042509999, + "learning_rate": 3.688660772785183e-06, + "loss": 0.704, + "step": 33530 + }, + { + "epoch": 0.7204536667096275, + "grad_norm": 0.5449994389723714, + "learning_rate": 3.6833748514947465e-06, + "loss": 0.709, + "step": 33540 + }, + { + "epoch": 0.7206684710228981, + "grad_norm": 0.5187402619586201, + "learning_rate": 3.6780918652110984e-06, + "loss": 0.7063, + "step": 33550 + }, + { + "epoch": 0.7208832753361688, + "grad_norm": 0.5025397434112188, + "learning_rate": 3.672811816388959e-06, + "loss": 0.7125, + "step": 33560 + }, + { + "epoch": 0.7210980796494394, + "grad_norm": 0.4922234036325025, + "learning_rate": 3.6675347074816948e-06, + "loss": 0.699, + "step": 33570 + }, + { + "epoch": 0.72131288396271, + "grad_norm": 0.5049263733928765, + "learning_rate": 3.662260540941306e-06, + "loss": 0.7042, + "step": 33580 + }, + { + "epoch": 0.7215276882759806, + "grad_norm": 0.5209764065547637, + "learning_rate": 3.6569893192184123e-06, + "loss": 0.7002, + "step": 33590 + }, + { + "epoch": 0.7217424925892512, + "grad_norm": 0.5046889630269408, + "learning_rate": 3.6517210447622918e-06, + "loss": 0.7017, + "step": 33600 + }, + { + "epoch": 0.7219572969025218, + "grad_norm": 0.5059886025352888, + "learning_rate": 3.646455720020826e-06, + "loss": 0.6994, + "step": 33610 + }, + { + "epoch": 0.7221721012157925, + "grad_norm": 0.5098305495450107, + "learning_rate": 3.6411933474405402e-06, + "loss": 0.7073, + "step": 33620 + }, + { + "epoch": 0.722386905529063, + "grad_norm": 0.5002036914879331, + "learning_rate": 3.6359339294665897e-06, + "loss": 0.6901, + "step": 33630 + }, + { + "epoch": 0.7226017098423336, + "grad_norm": 0.523077232704163, + "learning_rate": 3.630677468542739e-06, + "loss": 0.7114, + "step": 33640 + }, + { + "epoch": 0.7228165141556042, + "grad_norm": 0.5040742343196016, + "learning_rate": 3.625423967111409e-06, + "loss": 0.6915, + "step": 33650 + }, + { + "epoch": 0.7230313184688748, + "grad_norm": 0.504689168487558, + "learning_rate": 3.6201734276136156e-06, + "loss": 0.7027, + "step": 33660 + }, + { + "epoch": 0.7232461227821455, + "grad_norm": 0.5360928374223011, + "learning_rate": 3.614925852489015e-06, + "loss": 0.7044, + "step": 33670 + }, + { + "epoch": 0.7234609270954161, + "grad_norm": 0.508286028640259, + "learning_rate": 3.6096812441758865e-06, + "loss": 0.7102, + "step": 33680 + }, + { + "epoch": 0.7236757314086867, + "grad_norm": 0.4950815691274723, + "learning_rate": 3.604439605111114e-06, + "loss": 0.7042, + "step": 33690 + }, + { + "epoch": 0.7238905357219573, + "grad_norm": 0.4990133005177096, + "learning_rate": 3.5992009377302306e-06, + "loss": 0.7092, + "step": 33700 + }, + { + "epoch": 0.7241053400352279, + "grad_norm": 0.5080144595958782, + "learning_rate": 3.5939652444673614e-06, + "loss": 0.7057, + "step": 33710 + }, + { + "epoch": 0.7243201443484986, + "grad_norm": 0.49504096828193644, + "learning_rate": 3.588732527755262e-06, + "loss": 0.6866, + "step": 33720 + }, + { + "epoch": 0.7245349486617692, + "grad_norm": 0.5010995541979635, + 
"learning_rate": 3.583502790025304e-06, + "loss": 0.7017, + "step": 33730 + }, + { + "epoch": 0.7247497529750397, + "grad_norm": 0.5259173065794601, + "learning_rate": 3.578276033707476e-06, + "loss": 0.7094, + "step": 33740 + }, + { + "epoch": 0.7249645572883103, + "grad_norm": 0.5119905092520929, + "learning_rate": 3.5730522612303808e-06, + "loss": 0.701, + "step": 33750 + }, + { + "epoch": 0.7251793616015809, + "grad_norm": 0.5647220761659497, + "learning_rate": 3.5678314750212253e-06, + "loss": 0.6946, + "step": 33760 + }, + { + "epoch": 0.7253941659148516, + "grad_norm": 0.5084711336782207, + "learning_rate": 3.5626136775058484e-06, + "loss": 0.7022, + "step": 33770 + }, + { + "epoch": 0.7256089702281222, + "grad_norm": 0.5259298296821393, + "learning_rate": 3.5573988711086793e-06, + "loss": 0.6953, + "step": 33780 + }, + { + "epoch": 0.7258237745413928, + "grad_norm": 0.4968324999154312, + "learning_rate": 3.552187058252772e-06, + "loss": 0.7083, + "step": 33790 + }, + { + "epoch": 0.7260385788546634, + "grad_norm": 0.5111234225003652, + "learning_rate": 3.546978241359785e-06, + "loss": 0.6995, + "step": 33800 + }, + { + "epoch": 0.726253383167934, + "grad_norm": 0.5321941233022512, + "learning_rate": 3.541772422849977e-06, + "loss": 0.6862, + "step": 33810 + }, + { + "epoch": 0.7264681874812047, + "grad_norm": 0.5446186196935507, + "learning_rate": 3.5365696051422337e-06, + "loss": 0.7064, + "step": 33820 + }, + { + "epoch": 0.7266829917944753, + "grad_norm": 0.5339112094968238, + "learning_rate": 3.531369790654022e-06, + "loss": 0.6939, + "step": 33830 + }, + { + "epoch": 0.7268977961077459, + "grad_norm": 0.5116370373744857, + "learning_rate": 3.526172981801429e-06, + "loss": 0.6848, + "step": 33840 + }, + { + "epoch": 0.7271126004210164, + "grad_norm": 0.49391991528376755, + "learning_rate": 3.5209791809991424e-06, + "loss": 0.6974, + "step": 33850 + }, + { + "epoch": 0.727327404734287, + "grad_norm": 0.4986794011059733, + "learning_rate": 3.5157883906604484e-06, + "loss": 0.6986, + "step": 33860 + }, + { + "epoch": 0.7275422090475576, + "grad_norm": 0.5365339519034793, + "learning_rate": 3.5106006131972425e-06, + "loss": 0.7011, + "step": 33870 + }, + { + "epoch": 0.7277570133608283, + "grad_norm": 0.505426889604584, + "learning_rate": 3.5054158510200077e-06, + "loss": 0.7272, + "step": 33880 + }, + { + "epoch": 0.7279718176740989, + "grad_norm": 0.534863544759657, + "learning_rate": 3.5002341065378352e-06, + "loss": 0.6975, + "step": 33890 + }, + { + "epoch": 0.7281866219873695, + "grad_norm": 0.5029648260679437, + "learning_rate": 3.4950553821584133e-06, + "loss": 0.6967, + "step": 33900 + }, + { + "epoch": 0.7284014263006401, + "grad_norm": 0.5014206921527354, + "learning_rate": 3.4898796802880253e-06, + "loss": 0.7089, + "step": 33910 + }, + { + "epoch": 0.7286162306139107, + "grad_norm": 0.5234721876994048, + "learning_rate": 3.484707003331549e-06, + "loss": 0.7085, + "step": 33920 + }, + { + "epoch": 0.7288310349271814, + "grad_norm": 0.5239292676381829, + "learning_rate": 3.4795373536924627e-06, + "loss": 0.6843, + "step": 33930 + }, + { + "epoch": 0.729045839240452, + "grad_norm": 0.4980484392742611, + "learning_rate": 3.474370733772827e-06, + "loss": 0.6965, + "step": 33940 + }, + { + "epoch": 0.7292606435537226, + "grad_norm": 0.5364410045950752, + "learning_rate": 3.4692071459733024e-06, + "loss": 0.6984, + "step": 33950 + }, + { + "epoch": 0.7294754478669931, + "grad_norm": 0.5079175468345598, + "learning_rate": 3.464046592693142e-06, + "loss": 0.6894, + "step": 
33960 + }, + { + "epoch": 0.7296902521802637, + "grad_norm": 0.5195456559419563, + "learning_rate": 3.4588890763301843e-06, + "loss": 0.7166, + "step": 33970 + }, + { + "epoch": 0.7299050564935344, + "grad_norm": 0.5069432370518213, + "learning_rate": 3.453734599280859e-06, + "loss": 0.7039, + "step": 33980 + }, + { + "epoch": 0.730119860806805, + "grad_norm": 0.5132347077274939, + "learning_rate": 3.4485831639401836e-06, + "loss": 0.7007, + "step": 33990 + }, + { + "epoch": 0.7303346651200756, + "grad_norm": 0.4969882977586769, + "learning_rate": 3.4434347727017645e-06, + "loss": 0.7043, + "step": 34000 + }, + { + "epoch": 0.7305494694333462, + "grad_norm": 0.519434920750127, + "learning_rate": 3.438289427957785e-06, + "loss": 0.7021, + "step": 34010 + }, + { + "epoch": 0.7307642737466168, + "grad_norm": 0.5195248274879436, + "learning_rate": 3.4331471320990216e-06, + "loss": 0.7106, + "step": 34020 + }, + { + "epoch": 0.7309790780598875, + "grad_norm": 0.5452676568255899, + "learning_rate": 3.4280078875148317e-06, + "loss": 0.689, + "step": 34030 + }, + { + "epoch": 0.7311938823731581, + "grad_norm": 0.5006895911589073, + "learning_rate": 3.4228716965931553e-06, + "loss": 0.7172, + "step": 34040 + }, + { + "epoch": 0.7314086866864287, + "grad_norm": 0.543855444481588, + "learning_rate": 3.417738561720515e-06, + "loss": 0.7082, + "step": 34050 + }, + { + "epoch": 0.7316234909996993, + "grad_norm": 0.5225704241808111, + "learning_rate": 3.4126084852820064e-06, + "loss": 0.7012, + "step": 34060 + }, + { + "epoch": 0.7318382953129698, + "grad_norm": 0.5439757755860316, + "learning_rate": 3.40748146966131e-06, + "loss": 0.6898, + "step": 34070 + }, + { + "epoch": 0.7320530996262405, + "grad_norm": 0.521232354304882, + "learning_rate": 3.402357517240684e-06, + "loss": 0.7246, + "step": 34080 + }, + { + "epoch": 0.7322679039395111, + "grad_norm": 0.504585628259099, + "learning_rate": 3.397236630400962e-06, + "loss": 0.6846, + "step": 34090 + }, + { + "epoch": 0.7324827082527817, + "grad_norm": 0.5105232285125065, + "learning_rate": 3.3921188115215574e-06, + "loss": 0.709, + "step": 34100 + }, + { + "epoch": 0.7326975125660523, + "grad_norm": 0.5118542121131522, + "learning_rate": 3.3870040629804445e-06, + "loss": 0.6957, + "step": 34110 + }, + { + "epoch": 0.7329123168793229, + "grad_norm": 0.494913168921892, + "learning_rate": 3.3818923871541932e-06, + "loss": 0.6846, + "step": 34120 + }, + { + "epoch": 0.7331271211925936, + "grad_norm": 0.5369888915842843, + "learning_rate": 3.376783786417922e-06, + "loss": 0.7195, + "step": 34130 + }, + { + "epoch": 0.7333419255058642, + "grad_norm": 0.49649984813451475, + "learning_rate": 3.371678263145337e-06, + "loss": 0.6934, + "step": 34140 + }, + { + "epoch": 0.7335567298191348, + "grad_norm": 0.5374808788911171, + "learning_rate": 3.3665758197087084e-06, + "loss": 0.7042, + "step": 34150 + }, + { + "epoch": 0.7337715341324054, + "grad_norm": 0.508279517829229, + "learning_rate": 3.361476458478875e-06, + "loss": 0.7006, + "step": 34160 + }, + { + "epoch": 0.733986338445676, + "grad_norm": 0.506858247543215, + "learning_rate": 3.3563801818252483e-06, + "loss": 0.6905, + "step": 34170 + }, + { + "epoch": 0.7342011427589465, + "grad_norm": 0.5233572520758975, + "learning_rate": 3.351286992115793e-06, + "loss": 0.7044, + "step": 34180 + }, + { + "epoch": 0.7344159470722172, + "grad_norm": 0.49926488863527185, + "learning_rate": 3.346196891717063e-06, + "loss": 0.6974, + "step": 34190 + }, + { + "epoch": 0.7346307513854878, + "grad_norm": 
0.5127254239928138, + "learning_rate": 3.3411098829941513e-06, + "loss": 0.6949, + "step": 34200 + }, + { + "epoch": 0.7348455556987584, + "grad_norm": 0.5157624607732113, + "learning_rate": 3.33602596831073e-06, + "loss": 0.7038, + "step": 34210 + }, + { + "epoch": 0.735060360012029, + "grad_norm": 0.5324610708151536, + "learning_rate": 3.3309451500290336e-06, + "loss": 0.7072, + "step": 34220 + }, + { + "epoch": 0.7352751643252996, + "grad_norm": 0.5185302310161645, + "learning_rate": 3.3258674305098435e-06, + "loss": 0.6948, + "step": 34230 + }, + { + "epoch": 0.7354899686385703, + "grad_norm": 0.4957613188241341, + "learning_rate": 3.3207928121125243e-06, + "loss": 0.7011, + "step": 34240 + }, + { + "epoch": 0.7357047729518409, + "grad_norm": 0.5441591292886901, + "learning_rate": 3.315721297194977e-06, + "loss": 0.705, + "step": 34250 + }, + { + "epoch": 0.7359195772651115, + "grad_norm": 0.5278633009999051, + "learning_rate": 3.3106528881136745e-06, + "loss": 0.7038, + "step": 34260 + }, + { + "epoch": 0.7361343815783821, + "grad_norm": 0.49493002519430035, + "learning_rate": 3.305587587223645e-06, + "loss": 0.7014, + "step": 34270 + }, + { + "epoch": 0.7363491858916527, + "grad_norm": 0.5164985719789421, + "learning_rate": 3.300525396878461e-06, + "loss": 0.6918, + "step": 34280 + }, + { + "epoch": 0.7365639902049234, + "grad_norm": 0.5385527870192864, + "learning_rate": 3.2954663194302715e-06, + "loss": 0.7056, + "step": 34290 + }, + { + "epoch": 0.736778794518194, + "grad_norm": 0.5222972879975742, + "learning_rate": 3.2904103572297565e-06, + "loss": 0.7012, + "step": 34300 + }, + { + "epoch": 0.7369935988314645, + "grad_norm": 0.515795075393324, + "learning_rate": 3.2853575126261606e-06, + "loss": 0.704, + "step": 34310 + }, + { + "epoch": 0.7372084031447351, + "grad_norm": 0.4972752236528448, + "learning_rate": 3.280307787967283e-06, + "loss": 0.6891, + "step": 34320 + }, + { + "epoch": 0.7374232074580057, + "grad_norm": 0.5276200292672564, + "learning_rate": 3.2752611855994566e-06, + "loss": 0.7069, + "step": 34330 + }, + { + "epoch": 0.7376380117712764, + "grad_norm": 0.5527465990583433, + "learning_rate": 3.270217707867588e-06, + "loss": 0.6867, + "step": 34340 + }, + { + "epoch": 0.737852816084547, + "grad_norm": 0.5098818638723858, + "learning_rate": 3.265177357115106e-06, + "loss": 0.6933, + "step": 34350 + }, + { + "epoch": 0.7380676203978176, + "grad_norm": 0.5001020449149964, + "learning_rate": 3.2601401356840112e-06, + "loss": 0.7017, + "step": 34360 + }, + { + "epoch": 0.7382824247110882, + "grad_norm": 0.5090868444518032, + "learning_rate": 3.2551060459148297e-06, + "loss": 0.6899, + "step": 34370 + }, + { + "epoch": 0.7384972290243588, + "grad_norm": 0.5291041455847213, + "learning_rate": 3.250075090146644e-06, + "loss": 0.6951, + "step": 34380 + }, + { + "epoch": 0.7387120333376295, + "grad_norm": 0.49906694116191236, + "learning_rate": 3.2450472707170786e-06, + "loss": 0.704, + "step": 34390 + }, + { + "epoch": 0.7389268376509001, + "grad_norm": 0.49374014306535213, + "learning_rate": 3.2400225899622917e-06, + "loss": 0.6808, + "step": 34400 + }, + { + "epoch": 0.7391416419641706, + "grad_norm": 0.5331437741952022, + "learning_rate": 3.235001050217003e-06, + "loss": 0.7108, + "step": 34410 + }, + { + "epoch": 0.7393564462774412, + "grad_norm": 0.5433540729594903, + "learning_rate": 3.2299826538144506e-06, + "loss": 0.7037, + "step": 34420 + }, + { + "epoch": 0.7395712505907118, + "grad_norm": 0.49990473998285656, + "learning_rate": 3.2249674030864254e-06, + 
"loss": 0.7034, + "step": 34430 + }, + { + "epoch": 0.7397860549039825, + "grad_norm": 0.5184986266564906, + "learning_rate": 3.2199553003632566e-06, + "loss": 0.7059, + "step": 34440 + }, + { + "epoch": 0.7400008592172531, + "grad_norm": 0.5255092855129455, + "learning_rate": 3.2149463479737974e-06, + "loss": 0.7078, + "step": 34450 + }, + { + "epoch": 0.7402156635305237, + "grad_norm": 0.5078685785943006, + "learning_rate": 3.2099405482454613e-06, + "loss": 0.6856, + "step": 34460 + }, + { + "epoch": 0.7404304678437943, + "grad_norm": 0.5001995241491086, + "learning_rate": 3.2049379035041718e-06, + "loss": 0.6947, + "step": 34470 + }, + { + "epoch": 0.7406452721570649, + "grad_norm": 0.5229199908697051, + "learning_rate": 3.199938416074401e-06, + "loss": 0.698, + "step": 34480 + }, + { + "epoch": 0.7408600764703355, + "grad_norm": 0.506611104007963, + "learning_rate": 3.1949420882791493e-06, + "loss": 0.7095, + "step": 34490 + }, + { + "epoch": 0.7410748807836062, + "grad_norm": 0.5063586727734837, + "learning_rate": 3.1899489224399514e-06, + "loss": 0.7033, + "step": 34500 + }, + { + "epoch": 0.7412896850968768, + "grad_norm": 0.5267235455499195, + "learning_rate": 3.184958920876874e-06, + "loss": 0.695, + "step": 34510 + }, + { + "epoch": 0.7415044894101473, + "grad_norm": 0.501781417708024, + "learning_rate": 3.1799720859085025e-06, + "loss": 0.6983, + "step": 34520 + }, + { + "epoch": 0.7417192937234179, + "grad_norm": 0.5127416336972987, + "learning_rate": 3.174988419851971e-06, + "loss": 0.6996, + "step": 34530 + }, + { + "epoch": 0.7419340980366885, + "grad_norm": 0.5238966491029383, + "learning_rate": 3.170007925022921e-06, + "loss": 0.7003, + "step": 34540 + }, + { + "epoch": 0.7421489023499592, + "grad_norm": 0.5027088254862997, + "learning_rate": 3.1650306037355303e-06, + "loss": 0.693, + "step": 34550 + }, + { + "epoch": 0.7423637066632298, + "grad_norm": 0.5038648172133018, + "learning_rate": 3.160056458302504e-06, + "loss": 0.6989, + "step": 34560 + }, + { + "epoch": 0.7425785109765004, + "grad_norm": 0.5125824985664078, + "learning_rate": 3.155085491035066e-06, + "loss": 0.7069, + "step": 34570 + }, + { + "epoch": 0.742793315289771, + "grad_norm": 0.5375010969807497, + "learning_rate": 3.1501177042429697e-06, + "loss": 0.7113, + "step": 34580 + }, + { + "epoch": 0.7430081196030416, + "grad_norm": 0.507552166036012, + "learning_rate": 3.1451531002344804e-06, + "loss": 0.7009, + "step": 34590 + }, + { + "epoch": 0.7432229239163123, + "grad_norm": 0.5130116795996187, + "learning_rate": 3.140191681316396e-06, + "loss": 0.7009, + "step": 34600 + }, + { + "epoch": 0.7434377282295829, + "grad_norm": 0.513751129751177, + "learning_rate": 3.1352334497940262e-06, + "loss": 0.7051, + "step": 34610 + }, + { + "epoch": 0.7436525325428535, + "grad_norm": 0.5088528949925636, + "learning_rate": 3.1302784079712067e-06, + "loss": 0.7024, + "step": 34620 + }, + { + "epoch": 0.743867336856124, + "grad_norm": 0.519047865808951, + "learning_rate": 3.1253265581502877e-06, + "loss": 0.6916, + "step": 34630 + }, + { + "epoch": 0.7440821411693946, + "grad_norm": 0.5261411126518063, + "learning_rate": 3.1203779026321313e-06, + "loss": 0.6956, + "step": 34640 + }, + { + "epoch": 0.7442969454826653, + "grad_norm": 0.5173606042960484, + "learning_rate": 3.115432443716123e-06, + "loss": 0.7191, + "step": 34650 + }, + { + "epoch": 0.7445117497959359, + "grad_norm": 0.5197079005508889, + "learning_rate": 3.110490183700159e-06, + "loss": 0.7082, + "step": 34660 + }, + { + "epoch": 0.7447265541092065, + 
"grad_norm": 0.5109626703024929, + "learning_rate": 3.1055511248806514e-06, + "loss": 0.6868, + "step": 34670 + }, + { + "epoch": 0.7449413584224771, + "grad_norm": 0.5194401187437783, + "learning_rate": 3.100615269552523e-06, + "loss": 0.6972, + "step": 34680 + }, + { + "epoch": 0.7451561627357477, + "grad_norm": 0.5061536798223207, + "learning_rate": 3.0956826200092114e-06, + "loss": 0.6966, + "step": 34690 + }, + { + "epoch": 0.7453709670490184, + "grad_norm": 0.5083820365394748, + "learning_rate": 3.090753178542657e-06, + "loss": 0.6783, + "step": 34700 + }, + { + "epoch": 0.745585771362289, + "grad_norm": 0.5077977495235433, + "learning_rate": 3.0858269474433165e-06, + "loss": 0.7079, + "step": 34710 + }, + { + "epoch": 0.7458005756755596, + "grad_norm": 0.512990160380994, + "learning_rate": 3.080903929000153e-06, + "loss": 0.6993, + "step": 34720 + }, + { + "epoch": 0.7460153799888302, + "grad_norm": 0.517593034507449, + "learning_rate": 3.0759841255006386e-06, + "loss": 0.7033, + "step": 34730 + }, + { + "epoch": 0.7462301843021008, + "grad_norm": 0.4897976460660666, + "learning_rate": 3.0710675392307477e-06, + "loss": 0.6933, + "step": 34740 + }, + { + "epoch": 0.7464449886153715, + "grad_norm": 0.4947523248312534, + "learning_rate": 3.066154172474962e-06, + "loss": 0.6946, + "step": 34750 + }, + { + "epoch": 0.746659792928642, + "grad_norm": 0.5091626362163222, + "learning_rate": 3.0612440275162727e-06, + "loss": 0.6867, + "step": 34760 + }, + { + "epoch": 0.7468745972419126, + "grad_norm": 0.5114642942026933, + "learning_rate": 3.056337106636159e-06, + "loss": 0.6996, + "step": 34770 + }, + { + "epoch": 0.7470894015551832, + "grad_norm": 0.5014471572550719, + "learning_rate": 3.0514334121146173e-06, + "loss": 0.7086, + "step": 34780 + }, + { + "epoch": 0.7473042058684538, + "grad_norm": 0.5084778446042698, + "learning_rate": 3.046532946230136e-06, + "loss": 0.6935, + "step": 34790 + }, + { + "epoch": 0.7475190101817244, + "grad_norm": 0.5229812004697073, + "learning_rate": 3.0416357112597108e-06, + "loss": 0.7039, + "step": 34800 + }, + { + "epoch": 0.7477338144949951, + "grad_norm": 0.5019941677521293, + "learning_rate": 3.0367417094788308e-06, + "loss": 0.7018, + "step": 34810 + }, + { + "epoch": 0.7479486188082657, + "grad_norm": 0.5255533916684315, + "learning_rate": 3.0318509431614794e-06, + "loss": 0.7103, + "step": 34820 + }, + { + "epoch": 0.7481634231215363, + "grad_norm": 0.5094742009361463, + "learning_rate": 3.026963414580145e-06, + "loss": 0.6916, + "step": 34830 + }, + { + "epoch": 0.7483782274348069, + "grad_norm": 0.503694503035299, + "learning_rate": 3.0220791260058057e-06, + "loss": 0.7062, + "step": 34840 + }, + { + "epoch": 0.7485930317480775, + "grad_norm": 0.5306897240771807, + "learning_rate": 3.017198079707937e-06, + "loss": 0.7022, + "step": 34850 + }, + { + "epoch": 0.7488078360613482, + "grad_norm": 0.5166101019632601, + "learning_rate": 3.01232027795451e-06, + "loss": 0.7041, + "step": 34860 + }, + { + "epoch": 0.7490226403746187, + "grad_norm": 0.5145609190625092, + "learning_rate": 3.0074457230119768e-06, + "loss": 0.6888, + "step": 34870 + }, + { + "epoch": 0.7492374446878893, + "grad_norm": 0.508976620646315, + "learning_rate": 3.0025744171452997e-06, + "loss": 0.7115, + "step": 34880 + }, + { + "epoch": 0.7494522490011599, + "grad_norm": 0.5253897144615868, + "learning_rate": 2.9977063626179128e-06, + "loss": 0.6893, + "step": 34890 + }, + { + "epoch": 0.7496670533144305, + "grad_norm": 0.5137649765422893, + "learning_rate": 
2.9928415616917505e-06, + "loss": 0.6867, + "step": 34900 + }, + { + "epoch": 0.7498818576277012, + "grad_norm": 0.49499574504817595, + "learning_rate": 2.9879800166272355e-06, + "loss": 0.7002, + "step": 34910 + }, + { + "epoch": 0.7500966619409718, + "grad_norm": 0.519084029719955, + "learning_rate": 2.983121729683265e-06, + "loss": 0.7104, + "step": 34920 + }, + { + "epoch": 0.7503114662542424, + "grad_norm": 0.4981518656316401, + "learning_rate": 2.9782667031172454e-06, + "loss": 0.7028, + "step": 34930 + }, + { + "epoch": 0.750526270567513, + "grad_norm": 0.51741442278905, + "learning_rate": 2.973414939185041e-06, + "loss": 0.6982, + "step": 34940 + }, + { + "epoch": 0.7507410748807836, + "grad_norm": 0.5060387675297806, + "learning_rate": 2.9685664401410277e-06, + "loss": 0.697, + "step": 34950 + }, + { + "epoch": 0.7509558791940543, + "grad_norm": 0.5048062525549186, + "learning_rate": 2.9637212082380395e-06, + "loss": 0.71, + "step": 34960 + }, + { + "epoch": 0.7511706835073249, + "grad_norm": 0.5212032772870495, + "learning_rate": 2.9588792457274075e-06, + "loss": 0.6943, + "step": 34970 + }, + { + "epoch": 0.7513854878205954, + "grad_norm": 0.5107973393581554, + "learning_rate": 2.9540405548589434e-06, + "loss": 0.7015, + "step": 34980 + }, + { + "epoch": 0.751600292133866, + "grad_norm": 0.49864860876835415, + "learning_rate": 2.949205137880924e-06, + "loss": 0.69, + "step": 34990 + }, + { + "epoch": 0.7518150964471366, + "grad_norm": 0.5307082939307299, + "learning_rate": 2.944372997040129e-06, + "loss": 0.7061, + "step": 35000 + }, + { + "epoch": 0.7520299007604073, + "grad_norm": 0.5193800817097175, + "learning_rate": 2.9395441345817932e-06, + "loss": 0.6902, + "step": 35010 + }, + { + "epoch": 0.7522447050736779, + "grad_norm": 0.5177658570066975, + "learning_rate": 2.9347185527496403e-06, + "loss": 0.6956, + "step": 35020 + }, + { + "epoch": 0.7524595093869485, + "grad_norm": 0.5015771316930479, + "learning_rate": 2.929896253785871e-06, + "loss": 0.7147, + "step": 35030 + }, + { + "epoch": 0.7526743137002191, + "grad_norm": 0.4998092573649113, + "learning_rate": 2.9250772399311457e-06, + "loss": 0.7021, + "step": 35040 + }, + { + "epoch": 0.7528891180134897, + "grad_norm": 0.5249322187174793, + "learning_rate": 2.9202615134246225e-06, + "loss": 0.6977, + "step": 35050 + }, + { + "epoch": 0.7531039223267603, + "grad_norm": 0.5148512865886042, + "learning_rate": 2.91544907650391e-06, + "loss": 0.6941, + "step": 35060 + }, + { + "epoch": 0.753318726640031, + "grad_norm": 0.5031383431200541, + "learning_rate": 2.9106399314050993e-06, + "loss": 0.7068, + "step": 35070 + }, + { + "epoch": 0.7535335309533016, + "grad_norm": 0.5077882216038482, + "learning_rate": 2.905834080362754e-06, + "loss": 0.7062, + "step": 35080 + }, + { + "epoch": 0.7537483352665721, + "grad_norm": 0.5259578113827944, + "learning_rate": 2.901031525609891e-06, + "loss": 0.6936, + "step": 35090 + }, + { + "epoch": 0.7539631395798427, + "grad_norm": 0.5193893619415478, + "learning_rate": 2.896232269378022e-06, + "loss": 0.7213, + "step": 35100 + }, + { + "epoch": 0.7541779438931133, + "grad_norm": 0.5323126089856631, + "learning_rate": 2.8914363138970992e-06, + "loss": 0.703, + "step": 35110 + }, + { + "epoch": 0.754392748206384, + "grad_norm": 0.5173517751625921, + "learning_rate": 2.886643661395564e-06, + "loss": 0.7092, + "step": 35120 + }, + { + "epoch": 0.7546075525196546, + "grad_norm": 0.6741926928108194, + "learning_rate": 2.8818543141003043e-06, + "loss": 0.6951, + "step": 35130 + }, + { + "epoch": 
0.7548223568329252, + "grad_norm": 0.5178832293172879, + "learning_rate": 2.877068274236683e-06, + "loss": 0.6924, + "step": 35140 + }, + { + "epoch": 0.7550371611461958, + "grad_norm": 0.5132574620286683, + "learning_rate": 2.872285544028528e-06, + "loss": 0.6977, + "step": 35150 + }, + { + "epoch": 0.7552519654594664, + "grad_norm": 0.523341159554861, + "learning_rate": 2.867506125698114e-06, + "loss": 0.7007, + "step": 35160 + }, + { + "epoch": 0.7554667697727371, + "grad_norm": 0.559952979776443, + "learning_rate": 2.862730021466201e-06, + "loss": 0.7094, + "step": 35170 + }, + { + "epoch": 0.7556815740860077, + "grad_norm": 0.5149609461834663, + "learning_rate": 2.8579572335519866e-06, + "loss": 0.6943, + "step": 35180 + }, + { + "epoch": 0.7558963783992783, + "grad_norm": 0.507539558550583, + "learning_rate": 2.853187764173141e-06, + "loss": 0.7008, + "step": 35190 + }, + { + "epoch": 0.7561111827125488, + "grad_norm": 0.49479288184079256, + "learning_rate": 2.848421615545789e-06, + "loss": 0.7042, + "step": 35200 + }, + { + "epoch": 0.7563259870258194, + "grad_norm": 0.5189990316887987, + "learning_rate": 2.8436587898845035e-06, + "loss": 0.6971, + "step": 35210 + }, + { + "epoch": 0.7565407913390901, + "grad_norm": 0.5162232202770758, + "learning_rate": 2.838899289402335e-06, + "loss": 0.7109, + "step": 35220 + }, + { + "epoch": 0.7567555956523607, + "grad_norm": 0.5165521047716195, + "learning_rate": 2.8341431163107648e-06, + "loss": 0.6952, + "step": 35230 + }, + { + "epoch": 0.7569703999656313, + "grad_norm": 0.4956540738020749, + "learning_rate": 2.829390272819742e-06, + "loss": 0.6871, + "step": 35240 + }, + { + "epoch": 0.7571852042789019, + "grad_norm": 0.513330560976213, + "learning_rate": 2.824640761137667e-06, + "loss": 0.6974, + "step": 35250 + }, + { + "epoch": 0.7574000085921725, + "grad_norm": 0.5032601979915532, + "learning_rate": 2.8198945834713885e-06, + "loss": 0.6913, + "step": 35260 + }, + { + "epoch": 0.7576148129054432, + "grad_norm": 0.5128246318375348, + "learning_rate": 2.815151742026213e-06, + "loss": 0.7056, + "step": 35270 + }, + { + "epoch": 0.7578296172187138, + "grad_norm": 0.5030995060530129, + "learning_rate": 2.810412239005885e-06, + "loss": 0.713, + "step": 35280 + }, + { + "epoch": 0.7580444215319844, + "grad_norm": 0.5038841598854293, + "learning_rate": 2.805676076612608e-06, + "loss": 0.6996, + "step": 35290 + }, + { + "epoch": 0.758259225845255, + "grad_norm": 0.5122170399356182, + "learning_rate": 2.8009432570470296e-06, + "loss": 0.7059, + "step": 35300 + }, + { + "epoch": 0.7584740301585255, + "grad_norm": 0.5156117155914561, + "learning_rate": 2.7962137825082446e-06, + "loss": 0.684, + "step": 35310 + }, + { + "epoch": 0.7586888344717962, + "grad_norm": 0.5049866933728283, + "learning_rate": 2.7914876551937953e-06, + "loss": 0.6942, + "step": 35320 + }, + { + "epoch": 0.7589036387850668, + "grad_norm": 0.5094420858773214, + "learning_rate": 2.7867648772996634e-06, + "loss": 0.708, + "step": 35330 + }, + { + "epoch": 0.7591184430983374, + "grad_norm": 0.5011895237248974, + "learning_rate": 2.7820454510202843e-06, + "loss": 0.6918, + "step": 35340 + }, + { + "epoch": 0.759333247411608, + "grad_norm": 0.504089241568298, + "learning_rate": 2.777329378548522e-06, + "loss": 0.7058, + "step": 35350 + }, + { + "epoch": 0.7595480517248786, + "grad_norm": 0.5365866928409356, + "learning_rate": 2.7726166620756934e-06, + "loss": 0.692, + "step": 35360 + }, + { + "epoch": 0.7597628560381492, + "grad_norm": 0.4984929620909205, + "learning_rate": 
2.7679073037915516e-06, + "loss": 0.6874, + "step": 35370 + }, + { + "epoch": 0.7599776603514199, + "grad_norm": 0.5410453059904045, + "learning_rate": 2.763201305884291e-06, + "loss": 0.7145, + "step": 35380 + }, + { + "epoch": 0.7601924646646905, + "grad_norm": 0.5522929417691606, + "learning_rate": 2.758498670540546e-06, + "loss": 0.7132, + "step": 35390 + }, + { + "epoch": 0.7604072689779611, + "grad_norm": 0.5191149506874112, + "learning_rate": 2.7537993999453818e-06, + "loss": 0.7094, + "step": 35400 + }, + { + "epoch": 0.7606220732912317, + "grad_norm": 0.5364420903269488, + "learning_rate": 2.749103496282306e-06, + "loss": 0.7105, + "step": 35410 + }, + { + "epoch": 0.7608368776045022, + "grad_norm": 0.5307396572245091, + "learning_rate": 2.7444109617332614e-06, + "loss": 0.7067, + "step": 35420 + }, + { + "epoch": 0.761051681917773, + "grad_norm": 0.5189987491520354, + "learning_rate": 2.739721798478625e-06, + "loss": 0.6967, + "step": 35430 + }, + { + "epoch": 0.7612664862310435, + "grad_norm": 0.5522945068168588, + "learning_rate": 2.735036008697205e-06, + "loss": 0.7211, + "step": 35440 + }, + { + "epoch": 0.7614812905443141, + "grad_norm": 0.49783370978462715, + "learning_rate": 2.7303535945662485e-06, + "loss": 0.6972, + "step": 35450 + }, + { + "epoch": 0.7616960948575847, + "grad_norm": 0.5134039941417213, + "learning_rate": 2.725674558261423e-06, + "loss": 0.6903, + "step": 35460 + }, + { + "epoch": 0.7619108991708553, + "grad_norm": 0.5056018475415773, + "learning_rate": 2.720998901956836e-06, + "loss": 0.6978, + "step": 35470 + }, + { + "epoch": 0.762125703484126, + "grad_norm": 0.5016409320759787, + "learning_rate": 2.7163266278250222e-06, + "loss": 0.7092, + "step": 35480 + }, + { + "epoch": 0.7623405077973966, + "grad_norm": 0.5090009069207025, + "learning_rate": 2.7116577380369434e-06, + "loss": 0.7068, + "step": 35490 + }, + { + "epoch": 0.7625553121106672, + "grad_norm": 0.5138423219447262, + "learning_rate": 2.7069922347619926e-06, + "loss": 0.6999, + "step": 35500 + }, + { + "epoch": 0.7627701164239378, + "grad_norm": 0.5079624500816473, + "learning_rate": 2.7023301201679763e-06, + "loss": 0.7128, + "step": 35510 + }, + { + "epoch": 0.7629849207372084, + "grad_norm": 0.519763641884365, + "learning_rate": 2.69767139642115e-06, + "loss": 0.6984, + "step": 35520 + }, + { + "epoch": 0.7631997250504791, + "grad_norm": 0.526599503026077, + "learning_rate": 2.6930160656861704e-06, + "loss": 0.6996, + "step": 35530 + }, + { + "epoch": 0.7634145293637496, + "grad_norm": 0.527777664386869, + "learning_rate": 2.688364130126131e-06, + "loss": 0.7001, + "step": 35540 + }, + { + "epoch": 0.7636293336770202, + "grad_norm": 0.5167948414932604, + "learning_rate": 2.6837155919025426e-06, + "loss": 0.6886, + "step": 35550 + }, + { + "epoch": 0.7638441379902908, + "grad_norm": 0.5005085326477348, + "learning_rate": 2.6790704531753385e-06, + "loss": 0.6922, + "step": 35560 + }, + { + "epoch": 0.7640589423035614, + "grad_norm": 0.5008777298038203, + "learning_rate": 2.6744287161028782e-06, + "loss": 0.6941, + "step": 35570 + }, + { + "epoch": 0.7642737466168321, + "grad_norm": 0.5168378019628393, + "learning_rate": 2.669790382841928e-06, + "loss": 0.7073, + "step": 35580 + }, + { + "epoch": 0.7644885509301027, + "grad_norm": 0.5064269251779776, + "learning_rate": 2.665155455547682e-06, + "loss": 0.7061, + "step": 35590 + }, + { + "epoch": 0.7647033552433733, + "grad_norm": 0.5045676176502482, + "learning_rate": 2.66052393637375e-06, + "loss": 0.7015, + "step": 35600 + }, + { + 
"epoch": 0.7649181595566439, + "grad_norm": 0.4956210829558816, + "learning_rate": 2.655895827472158e-06, + "loss": 0.6862, + "step": 35610 + }, + { + "epoch": 0.7651329638699145, + "grad_norm": 0.5097661066228676, + "learning_rate": 2.6512711309933503e-06, + "loss": 0.6848, + "step": 35620 + }, + { + "epoch": 0.7653477681831852, + "grad_norm": 0.510005397820754, + "learning_rate": 2.6466498490861734e-06, + "loss": 0.6905, + "step": 35630 + }, + { + "epoch": 0.7655625724964558, + "grad_norm": 0.5253052971997748, + "learning_rate": 2.642031983897909e-06, + "loss": 0.6904, + "step": 35640 + }, + { + "epoch": 0.7657773768097264, + "grad_norm": 0.5027879257266836, + "learning_rate": 2.6374175375742285e-06, + "loss": 0.685, + "step": 35650 + }, + { + "epoch": 0.7659921811229969, + "grad_norm": 0.519001207669886, + "learning_rate": 2.6328065122592284e-06, + "loss": 0.6946, + "step": 35660 + }, + { + "epoch": 0.7662069854362675, + "grad_norm": 0.5156599981554072, + "learning_rate": 2.6281989100954155e-06, + "loss": 0.6955, + "step": 35670 + }, + { + "epoch": 0.7664217897495381, + "grad_norm": 0.5065842964681723, + "learning_rate": 2.623594733223692e-06, + "loss": 0.6901, + "step": 35680 + }, + { + "epoch": 0.7666365940628088, + "grad_norm": 0.5043045850760082, + "learning_rate": 2.6189939837833934e-06, + "loss": 0.6807, + "step": 35690 + }, + { + "epoch": 0.7668513983760794, + "grad_norm": 0.5203139211551309, + "learning_rate": 2.6143966639122343e-06, + "loss": 0.7069, + "step": 35700 + }, + { + "epoch": 0.76706620268935, + "grad_norm": 0.5059570893757329, + "learning_rate": 2.609802775746363e-06, + "loss": 0.6829, + "step": 35710 + }, + { + "epoch": 0.7672810070026206, + "grad_norm": 0.5292183230754148, + "learning_rate": 2.6052123214203106e-06, + "loss": 0.7116, + "step": 35720 + }, + { + "epoch": 0.7674958113158912, + "grad_norm": 0.5050171351436751, + "learning_rate": 2.6006253030670246e-06, + "loss": 0.6933, + "step": 35730 + }, + { + "epoch": 0.7677106156291619, + "grad_norm": 0.5367079725288396, + "learning_rate": 2.596041722817859e-06, + "loss": 0.6986, + "step": 35740 + }, + { + "epoch": 0.7679254199424325, + "grad_norm": 0.5157066385692741, + "learning_rate": 2.5914615828025534e-06, + "loss": 0.7029, + "step": 35750 + }, + { + "epoch": 0.768140224255703, + "grad_norm": 0.5036694678541429, + "learning_rate": 2.5868848851492733e-06, + "loss": 0.688, + "step": 35760 + }, + { + "epoch": 0.7683550285689736, + "grad_norm": 0.5071285342347541, + "learning_rate": 2.5823116319845633e-06, + "loss": 0.6878, + "step": 35770 + }, + { + "epoch": 0.7685698328822442, + "grad_norm": 0.5068127626353475, + "learning_rate": 2.5777418254333775e-06, + "loss": 0.6938, + "step": 35780 + }, + { + "epoch": 0.7687846371955149, + "grad_norm": 0.5198659666241273, + "learning_rate": 2.5731754676190725e-06, + "loss": 0.6919, + "step": 35790 + }, + { + "epoch": 0.7689994415087855, + "grad_norm": 0.5040011486527468, + "learning_rate": 2.568612560663385e-06, + "loss": 0.6883, + "step": 35800 + }, + { + "epoch": 0.7692142458220561, + "grad_norm": 0.5095849632751419, + "learning_rate": 2.5640531066864737e-06, + "loss": 0.6935, + "step": 35810 + }, + { + "epoch": 0.7694290501353267, + "grad_norm": 0.5031387126877207, + "learning_rate": 2.559497107806871e-06, + "loss": 0.6873, + "step": 35820 + }, + { + "epoch": 0.7696438544485973, + "grad_norm": 0.5069738792715828, + "learning_rate": 2.554944566141515e-06, + "loss": 0.6938, + "step": 35830 + }, + { + "epoch": 0.769858658761868, + "grad_norm": 0.5225177154611406, + 
"learning_rate": 2.5503954838057367e-06, + "loss": 0.6986, + "step": 35840 + }, + { + "epoch": 0.7700734630751386, + "grad_norm": 0.5337253376024659, + "learning_rate": 2.54584986291325e-06, + "loss": 0.6938, + "step": 35850 + }, + { + "epoch": 0.7702882673884092, + "grad_norm": 0.4974683440922778, + "learning_rate": 2.54130770557618e-06, + "loss": 0.7002, + "step": 35860 + }, + { + "epoch": 0.7705030717016798, + "grad_norm": 0.5164372270920562, + "learning_rate": 2.536769013905022e-06, + "loss": 0.7042, + "step": 35870 + }, + { + "epoch": 0.7707178760149503, + "grad_norm": 0.5045862598256815, + "learning_rate": 2.532233790008671e-06, + "loss": 0.7122, + "step": 35880 + }, + { + "epoch": 0.770932680328221, + "grad_norm": 0.5039493617623143, + "learning_rate": 2.5277020359944114e-06, + "loss": 0.6996, + "step": 35890 + }, + { + "epoch": 0.7711474846414916, + "grad_norm": 0.4974176270519638, + "learning_rate": 2.5231737539679124e-06, + "loss": 0.7015, + "step": 35900 + }, + { + "epoch": 0.7713622889547622, + "grad_norm": 0.5183097985832663, + "learning_rate": 2.518648946033234e-06, + "loss": 0.6908, + "step": 35910 + }, + { + "epoch": 0.7715770932680328, + "grad_norm": 0.525976552546943, + "learning_rate": 2.5141276142928093e-06, + "loss": 0.6969, + "step": 35920 + }, + { + "epoch": 0.7717918975813034, + "grad_norm": 0.5332780233463744, + "learning_rate": 2.509609760847479e-06, + "loss": 0.6916, + "step": 35930 + }, + { + "epoch": 0.772006701894574, + "grad_norm": 0.5111293385118552, + "learning_rate": 2.5050953877964446e-06, + "loss": 0.7115, + "step": 35940 + }, + { + "epoch": 0.7722215062078447, + "grad_norm": 0.5304473121055377, + "learning_rate": 2.500584497237303e-06, + "loss": 0.6997, + "step": 35950 + }, + { + "epoch": 0.7724363105211153, + "grad_norm": 0.5202204337822746, + "learning_rate": 2.4960770912660324e-06, + "loss": 0.6949, + "step": 35960 + }, + { + "epoch": 0.7726511148343859, + "grad_norm": 0.5082441180489494, + "learning_rate": 2.491573171976982e-06, + "loss": 0.6936, + "step": 35970 + }, + { + "epoch": 0.7728659191476565, + "grad_norm": 0.5085533382278553, + "learning_rate": 2.4870727414629005e-06, + "loss": 0.6995, + "step": 35980 + }, + { + "epoch": 0.773080723460927, + "grad_norm": 0.5110984394108816, + "learning_rate": 2.482575801814894e-06, + "loss": 0.6912, + "step": 35990 + }, + { + "epoch": 0.7732955277741977, + "grad_norm": 0.5333234352124336, + "learning_rate": 2.4780823551224586e-06, + "loss": 0.7008, + "step": 36000 + }, + { + "epoch": 0.7735103320874683, + "grad_norm": 0.5104268992806394, + "learning_rate": 2.473592403473466e-06, + "loss": 0.6972, + "step": 36010 + }, + { + "epoch": 0.7737251364007389, + "grad_norm": 0.5095548549309175, + "learning_rate": 2.4691059489541637e-06, + "loss": 0.6833, + "step": 36020 + }, + { + "epoch": 0.7739399407140095, + "grad_norm": 0.5379203466934536, + "learning_rate": 2.464622993649174e-06, + "loss": 0.6885, + "step": 36030 + }, + { + "epoch": 0.7741547450272801, + "grad_norm": 0.5103185774726944, + "learning_rate": 2.46014353964149e-06, + "loss": 0.6972, + "step": 36040 + }, + { + "epoch": 0.7743695493405508, + "grad_norm": 0.5108530905870736, + "learning_rate": 2.4556675890124803e-06, + "loss": 0.6918, + "step": 36050 + }, + { + "epoch": 0.7745843536538214, + "grad_norm": 0.5114449569980617, + "learning_rate": 2.451195143841889e-06, + "loss": 0.6982, + "step": 36060 + }, + { + "epoch": 0.774799157967092, + "grad_norm": 0.5117859340036564, + "learning_rate": 2.446726206207827e-06, + "loss": 0.7139, + "step": 36070 + }, 
+ { + "epoch": 0.7750139622803626, + "grad_norm": 0.5482258536620003, + "learning_rate": 2.442260778186777e-06, + "loss": 0.7027, + "step": 36080 + }, + { + "epoch": 0.7752287665936332, + "grad_norm": 0.5072515818925265, + "learning_rate": 2.4377988618535943e-06, + "loss": 0.6881, + "step": 36090 + }, + { + "epoch": 0.7754435709069039, + "grad_norm": 0.4985942219561439, + "learning_rate": 2.4333404592814923e-06, + "loss": 0.6863, + "step": 36100 + }, + { + "epoch": 0.7756583752201744, + "grad_norm": 0.5028179650880126, + "learning_rate": 2.4288855725420625e-06, + "loss": 0.6791, + "step": 36110 + }, + { + "epoch": 0.775873179533445, + "grad_norm": 0.5497148140060542, + "learning_rate": 2.424434203705257e-06, + "loss": 0.7117, + "step": 36120 + }, + { + "epoch": 0.7760879838467156, + "grad_norm": 0.5121974367383552, + "learning_rate": 2.4199863548393978e-06, + "loss": 0.6947, + "step": 36130 + }, + { + "epoch": 0.7763027881599862, + "grad_norm": 0.5024469961030319, + "learning_rate": 2.4155420280111673e-06, + "loss": 0.7016, + "step": 36140 + }, + { + "epoch": 0.7765175924732569, + "grad_norm": 0.49804034625215227, + "learning_rate": 2.411101225285616e-06, + "loss": 0.6963, + "step": 36150 + }, + { + "epoch": 0.7767323967865275, + "grad_norm": 0.5197970242944042, + "learning_rate": 2.406663948726147e-06, + "loss": 0.6917, + "step": 36160 + }, + { + "epoch": 0.7769472010997981, + "grad_norm": 0.5243687759024098, + "learning_rate": 2.4022302003945343e-06, + "loss": 0.6886, + "step": 36170 + }, + { + "epoch": 0.7771620054130687, + "grad_norm": 0.5067864445033509, + "learning_rate": 2.3977999823509112e-06, + "loss": 0.7043, + "step": 36180 + }, + { + "epoch": 0.7773768097263393, + "grad_norm": 0.4969747214927612, + "learning_rate": 2.3933732966537683e-06, + "loss": 0.7067, + "step": 36190 + }, + { + "epoch": 0.77759161403961, + "grad_norm": 0.5173099358575356, + "learning_rate": 2.3889501453599575e-06, + "loss": 0.7042, + "step": 36200 + }, + { + "epoch": 0.7778064183528806, + "grad_norm": 0.5247869692545869, + "learning_rate": 2.384530530524688e-06, + "loss": 0.6935, + "step": 36210 + }, + { + "epoch": 0.7780212226661511, + "grad_norm": 0.5564841459852391, + "learning_rate": 2.3801144542015197e-06, + "loss": 0.7144, + "step": 36220 + }, + { + "epoch": 0.7782360269794217, + "grad_norm": 0.5269720198709235, + "learning_rate": 2.3757019184423756e-06, + "loss": 0.7053, + "step": 36230 + }, + { + "epoch": 0.7784508312926923, + "grad_norm": 0.4911231646939383, + "learning_rate": 2.3712929252975327e-06, + "loss": 0.705, + "step": 36240 + }, + { + "epoch": 0.7786656356059629, + "grad_norm": 0.5168096144036898, + "learning_rate": 2.366887476815619e-06, + "loss": 0.7133, + "step": 36250 + }, + { + "epoch": 0.7788804399192336, + "grad_norm": 0.4905748157218672, + "learning_rate": 2.3624855750436206e-06, + "loss": 0.7, + "step": 36260 + }, + { + "epoch": 0.7790952442325042, + "grad_norm": 0.5059325828111079, + "learning_rate": 2.3580872220268623e-06, + "loss": 0.6928, + "step": 36270 + }, + { + "epoch": 0.7793100485457748, + "grad_norm": 0.5163071515499138, + "learning_rate": 2.3536924198090437e-06, + "loss": 0.7068, + "step": 36280 + }, + { + "epoch": 0.7795248528590454, + "grad_norm": 0.5432830473338806, + "learning_rate": 2.34930117043219e-06, + "loss": 0.7001, + "step": 36290 + }, + { + "epoch": 0.779739657172316, + "grad_norm": 0.5014221779118088, + "learning_rate": 2.344913475936689e-06, + "loss": 0.7067, + "step": 36300 + }, + { + "epoch": 0.7799544614855867, + "grad_norm": 0.5190757630635009, + 
"learning_rate": 2.340529338361275e-06, + "loss": 0.7077, + "step": 36310 + }, + { + "epoch": 0.7801692657988573, + "grad_norm": 0.5123110569142603, + "learning_rate": 2.3361487597430265e-06, + "loss": 0.71, + "step": 36320 + }, + { + "epoch": 0.7803840701121278, + "grad_norm": 0.5208933584534376, + "learning_rate": 2.3317717421173747e-06, + "loss": 0.7115, + "step": 36330 + }, + { + "epoch": 0.7805988744253984, + "grad_norm": 0.5101156973048868, + "learning_rate": 2.3273982875180832e-06, + "loss": 0.6866, + "step": 36340 + }, + { + "epoch": 0.780813678738669, + "grad_norm": 0.5277994764678194, + "learning_rate": 2.3230283979772795e-06, + "loss": 0.7037, + "step": 36350 + }, + { + "epoch": 0.7810284830519397, + "grad_norm": 0.5609421047646885, + "learning_rate": 2.318662075525415e-06, + "loss": 0.7071, + "step": 36360 + }, + { + "epoch": 0.7812432873652103, + "grad_norm": 0.5162031245256725, + "learning_rate": 2.3142993221912968e-06, + "loss": 0.6859, + "step": 36370 + }, + { + "epoch": 0.7814580916784809, + "grad_norm": 0.526786457219609, + "learning_rate": 2.3099401400020693e-06, + "loss": 0.7008, + "step": 36380 + }, + { + "epoch": 0.7816728959917515, + "grad_norm": 0.5462499765613852, + "learning_rate": 2.3055845309832113e-06, + "loss": 0.6883, + "step": 36390 + }, + { + "epoch": 0.7818877003050221, + "grad_norm": 0.517477588318434, + "learning_rate": 2.3012324971585575e-06, + "loss": 0.7137, + "step": 36400 + }, + { + "epoch": 0.7821025046182928, + "grad_norm": 0.5159739518570036, + "learning_rate": 2.2968840405502636e-06, + "loss": 0.6996, + "step": 36410 + }, + { + "epoch": 0.7823173089315634, + "grad_norm": 0.511918446890818, + "learning_rate": 2.292539163178834e-06, + "loss": 0.6882, + "step": 36420 + }, + { + "epoch": 0.782532113244834, + "grad_norm": 0.5294058439127566, + "learning_rate": 2.288197867063109e-06, + "loss": 0.6968, + "step": 36430 + }, + { + "epoch": 0.7827469175581045, + "grad_norm": 0.5295192397598697, + "learning_rate": 2.2838601542202543e-06, + "loss": 0.699, + "step": 36440 + }, + { + "epoch": 0.7829617218713751, + "grad_norm": 0.5163303982789976, + "learning_rate": 2.2795260266657905e-06, + "loss": 0.6876, + "step": 36450 + }, + { + "epoch": 0.7831765261846458, + "grad_norm": 0.528330319215088, + "learning_rate": 2.275195486413554e-06, + "loss": 0.6974, + "step": 36460 + }, + { + "epoch": 0.7833913304979164, + "grad_norm": 0.5239254843684411, + "learning_rate": 2.270868535475722e-06, + "loss": 0.6845, + "step": 36470 + }, + { + "epoch": 0.783606134811187, + "grad_norm": 0.5129180784875449, + "learning_rate": 2.266545175862809e-06, + "loss": 0.6919, + "step": 36480 + }, + { + "epoch": 0.7838209391244576, + "grad_norm": 0.553503515608531, + "learning_rate": 2.262225409583643e-06, + "loss": 0.6931, + "step": 36490 + }, + { + "epoch": 0.7840357434377282, + "grad_norm": 0.5193440729217803, + "learning_rate": 2.257909238645408e-06, + "loss": 0.691, + "step": 36500 + }, + { + "epoch": 0.7842505477509989, + "grad_norm": 0.5409849700495803, + "learning_rate": 2.253596665053592e-06, + "loss": 0.6901, + "step": 36510 + }, + { + "epoch": 0.7844653520642695, + "grad_norm": 0.50913652185814, + "learning_rate": 2.2492876908120355e-06, + "loss": 0.6969, + "step": 36520 + }, + { + "epoch": 0.7846801563775401, + "grad_norm": 0.5190130256439345, + "learning_rate": 2.2449823179228846e-06, + "loss": 0.7067, + "step": 36530 + }, + { + "epoch": 0.7848949606908107, + "grad_norm": 0.5142508244546575, + "learning_rate": 2.240680548386626e-06, + "loss": 0.6986, + "step": 36540 + }, + { 
+ "epoch": 0.7851097650040813, + "grad_norm": 0.5389748247074397, + "learning_rate": 2.2363823842020694e-06, + "loss": 0.7, + "step": 36550 + }, + { + "epoch": 0.7853245693173518, + "grad_norm": 0.5077039536630492, + "learning_rate": 2.2320878273663402e-06, + "loss": 0.6961, + "step": 36560 + }, + { + "epoch": 0.7855393736306225, + "grad_norm": 0.5387782887838709, + "learning_rate": 2.2277968798749074e-06, + "loss": 0.7003, + "step": 36570 + }, + { + "epoch": 0.7857541779438931, + "grad_norm": 0.5181751565840694, + "learning_rate": 2.2235095437215416e-06, + "loss": 0.6927, + "step": 36580 + }, + { + "epoch": 0.7859689822571637, + "grad_norm": 0.5150089685898623, + "learning_rate": 2.2192258208983474e-06, + "loss": 0.6922, + "step": 36590 + }, + { + "epoch": 0.7861837865704343, + "grad_norm": 0.5203313759269558, + "learning_rate": 2.2149457133957498e-06, + "loss": 0.6952, + "step": 36600 + }, + { + "epoch": 0.7863985908837049, + "grad_norm": 0.513794624005246, + "learning_rate": 2.210669223202485e-06, + "loss": 0.7039, + "step": 36610 + }, + { + "epoch": 0.7866133951969756, + "grad_norm": 0.49987037240309906, + "learning_rate": 2.2063963523056265e-06, + "loss": 0.6922, + "step": 36620 + }, + { + "epoch": 0.7868281995102462, + "grad_norm": 0.5086035486321187, + "learning_rate": 2.2021271026905444e-06, + "loss": 0.6855, + "step": 36630 + }, + { + "epoch": 0.7870430038235168, + "grad_norm": 0.5105940396334009, + "learning_rate": 2.1978614763409424e-06, + "loss": 0.6969, + "step": 36640 + }, + { + "epoch": 0.7872578081367874, + "grad_norm": 0.5037059973128649, + "learning_rate": 2.1935994752388323e-06, + "loss": 0.696, + "step": 36650 + }, + { + "epoch": 0.787472612450058, + "grad_norm": 0.528348362552611, + "learning_rate": 2.189341101364546e-06, + "loss": 0.6948, + "step": 36660 + }, + { + "epoch": 0.7876874167633287, + "grad_norm": 0.5312271902558634, + "learning_rate": 2.1850863566967296e-06, + "loss": 0.7015, + "step": 36670 + }, + { + "epoch": 0.7879022210765992, + "grad_norm": 0.5205776526916766, + "learning_rate": 2.1808352432123338e-06, + "loss": 0.6935, + "step": 36680 + }, + { + "epoch": 0.7881170253898698, + "grad_norm": 0.517986778286259, + "learning_rate": 2.1765877628866405e-06, + "loss": 0.695, + "step": 36690 + }, + { + "epoch": 0.7883318297031404, + "grad_norm": 0.5059031168383098, + "learning_rate": 2.172343917693225e-06, + "loss": 0.694, + "step": 36700 + }, + { + "epoch": 0.788546634016411, + "grad_norm": 0.4977347989860999, + "learning_rate": 2.1681037096039826e-06, + "loss": 0.6954, + "step": 36710 + }, + { + "epoch": 0.7887614383296817, + "grad_norm": 0.5166359451995465, + "learning_rate": 2.1638671405891177e-06, + "loss": 0.6997, + "step": 36720 + }, + { + "epoch": 0.7889762426429523, + "grad_norm": 0.5074952802695203, + "learning_rate": 2.159634212617143e-06, + "loss": 0.6871, + "step": 36730 + }, + { + "epoch": 0.7891910469562229, + "grad_norm": 0.525379462136144, + "learning_rate": 2.155404927654884e-06, + "loss": 0.6867, + "step": 36740 + }, + { + "epoch": 0.7894058512694935, + "grad_norm": 0.5055105654931678, + "learning_rate": 2.151179287667463e-06, + "loss": 0.685, + "step": 36750 + }, + { + "epoch": 0.7896206555827641, + "grad_norm": 0.5392743199170078, + "learning_rate": 2.146957294618316e-06, + "loss": 0.7023, + "step": 36760 + }, + { + "epoch": 0.7898354598960348, + "grad_norm": 0.5245335783053737, + "learning_rate": 2.1427389504691854e-06, + "loss": 0.687, + "step": 36770 + }, + { + "epoch": 0.7900502642093054, + "grad_norm": 0.5382517802199199, + 
"learning_rate": 2.1385242571801145e-06, + "loss": 0.702, + "step": 36780 + }, + { + "epoch": 0.7902650685225759, + "grad_norm": 0.5268825342443021, + "learning_rate": 2.134313216709456e-06, + "loss": 0.6948, + "step": 36790 + }, + { + "epoch": 0.7904798728358465, + "grad_norm": 0.5169835737072852, + "learning_rate": 2.1301058310138556e-06, + "loss": 0.676, + "step": 36800 + }, + { + "epoch": 0.7906946771491171, + "grad_norm": 0.5044997612963653, + "learning_rate": 2.1259021020482674e-06, + "loss": 0.7033, + "step": 36810 + }, + { + "epoch": 0.7909094814623878, + "grad_norm": 0.514705637218055, + "learning_rate": 2.1217020317659463e-06, + "loss": 0.6949, + "step": 36820 + }, + { + "epoch": 0.7911242857756584, + "grad_norm": 0.5165996618712064, + "learning_rate": 2.1175056221184465e-06, + "loss": 0.7022, + "step": 36830 + }, + { + "epoch": 0.791339090088929, + "grad_norm": 0.5027344033789356, + "learning_rate": 2.113312875055621e-06, + "loss": 0.6877, + "step": 36840 + }, + { + "epoch": 0.7915538944021996, + "grad_norm": 0.5122991945340712, + "learning_rate": 2.1091237925256235e-06, + "loss": 0.6949, + "step": 36850 + }, + { + "epoch": 0.7917686987154702, + "grad_norm": 0.5295857931561139, + "learning_rate": 2.1049383764748977e-06, + "loss": 0.6942, + "step": 36860 + }, + { + "epoch": 0.7919835030287408, + "grad_norm": 0.5120717805001744, + "learning_rate": 2.1007566288481905e-06, + "loss": 0.6997, + "step": 36870 + }, + { + "epoch": 0.7921983073420115, + "grad_norm": 0.519141610866559, + "learning_rate": 2.0965785515885416e-06, + "loss": 0.6988, + "step": 36880 + }, + { + "epoch": 0.792413111655282, + "grad_norm": 0.5333090103926987, + "learning_rate": 2.0924041466372878e-06, + "loss": 0.7008, + "step": 36890 + }, + { + "epoch": 0.7926279159685526, + "grad_norm": 0.5168789441545899, + "learning_rate": 2.0882334159340566e-06, + "loss": 0.6837, + "step": 36900 + }, + { + "epoch": 0.7928427202818232, + "grad_norm": 0.5249764329034577, + "learning_rate": 2.0840663614167698e-06, + "loss": 0.699, + "step": 36910 + }, + { + "epoch": 0.7930575245950938, + "grad_norm": 0.5146899618296968, + "learning_rate": 2.0799029850216424e-06, + "loss": 0.6942, + "step": 36920 + }, + { + "epoch": 0.7932723289083645, + "grad_norm": 0.5166243708864191, + "learning_rate": 2.075743288683174e-06, + "loss": 0.7027, + "step": 36930 + }, + { + "epoch": 0.7934871332216351, + "grad_norm": 0.520103403645907, + "learning_rate": 2.0715872743341613e-06, + "loss": 0.6913, + "step": 36940 + }, + { + "epoch": 0.7937019375349057, + "grad_norm": 0.5661241316056728, + "learning_rate": 2.0674349439056884e-06, + "loss": 0.6916, + "step": 36950 + }, + { + "epoch": 0.7939167418481763, + "grad_norm": 0.5413841506028565, + "learning_rate": 2.0632862993271264e-06, + "loss": 0.6956, + "step": 36960 + }, + { + "epoch": 0.7941315461614469, + "grad_norm": 0.5057462755052492, + "learning_rate": 2.0591413425261364e-06, + "loss": 0.6967, + "step": 36970 + }, + { + "epoch": 0.7943463504747176, + "grad_norm": 0.5036998109773616, + "learning_rate": 2.0550000754286603e-06, + "loss": 0.6904, + "step": 36980 + }, + { + "epoch": 0.7945611547879882, + "grad_norm": 0.5141790017851666, + "learning_rate": 2.05086249995893e-06, + "loss": 0.6862, + "step": 36990 + }, + { + "epoch": 0.7947759591012588, + "grad_norm": 0.504848106422294, + "learning_rate": 2.046728618039464e-06, + "loss": 0.6813, + "step": 37000 + }, + { + "epoch": 0.7949907634145293, + "grad_norm": 0.5069448860207141, + "learning_rate": 2.0425984315910597e-06, + "loss": 0.6896, + "step": 
37010 + }, + { + "epoch": 0.7952055677277999, + "grad_norm": 0.5179956584180097, + "learning_rate": 2.0384719425328025e-06, + "loss": 0.6923, + "step": 37020 + }, + { + "epoch": 0.7954203720410706, + "grad_norm": 0.5137679574180213, + "learning_rate": 2.0343491527820504e-06, + "loss": 0.7031, + "step": 37030 + }, + { + "epoch": 0.7956351763543412, + "grad_norm": 0.5076257848053785, + "learning_rate": 2.0302300642544583e-06, + "loss": 0.6845, + "step": 37040 + }, + { + "epoch": 0.7958499806676118, + "grad_norm": 0.5259087825677118, + "learning_rate": 2.0261146788639453e-06, + "loss": 0.6858, + "step": 37050 + }, + { + "epoch": 0.7960647849808824, + "grad_norm": 0.5244573164373156, + "learning_rate": 2.0220029985227175e-06, + "loss": 0.6967, + "step": 37060 + }, + { + "epoch": 0.796279589294153, + "grad_norm": 0.5289311220896381, + "learning_rate": 2.017895025141264e-06, + "loss": 0.7002, + "step": 37070 + }, + { + "epoch": 0.7964943936074237, + "grad_norm": 0.5141566850943272, + "learning_rate": 2.013790760628336e-06, + "loss": 0.694, + "step": 37080 + }, + { + "epoch": 0.7967091979206943, + "grad_norm": 0.5130864502827068, + "learning_rate": 2.009690206890984e-06, + "loss": 0.7102, + "step": 37090 + }, + { + "epoch": 0.7969240022339649, + "grad_norm": 0.5258889044125008, + "learning_rate": 2.0055933658345094e-06, + "loss": 0.7073, + "step": 37100 + }, + { + "epoch": 0.7971388065472355, + "grad_norm": 0.5220405637403223, + "learning_rate": 2.0015002393625114e-06, + "loss": 0.6862, + "step": 37110 + }, + { + "epoch": 0.797353610860506, + "grad_norm": 0.5143925147907012, + "learning_rate": 1.997410829376847e-06, + "loss": 0.6992, + "step": 37120 + }, + { + "epoch": 0.7975684151737766, + "grad_norm": 0.501877470630008, + "learning_rate": 1.993325137777652e-06, + "loss": 0.6885, + "step": 37130 + }, + { + "epoch": 0.7977832194870473, + "grad_norm": 0.5203927062197888, + "learning_rate": 1.9892431664633393e-06, + "loss": 0.697, + "step": 37140 + }, + { + "epoch": 0.7979980238003179, + "grad_norm": 0.5025995904904371, + "learning_rate": 1.9851649173305798e-06, + "loss": 0.6942, + "step": 37150 + }, + { + "epoch": 0.7982128281135885, + "grad_norm": 0.5191545346856352, + "learning_rate": 1.981090392274333e-06, + "loss": 0.6966, + "step": 37160 + }, + { + "epoch": 0.7984276324268591, + "grad_norm": 0.5131619635945721, + "learning_rate": 1.9770195931878123e-06, + "loss": 0.6826, + "step": 37170 + }, + { + "epoch": 0.7986424367401297, + "grad_norm": 0.5218712729260536, + "learning_rate": 1.9729525219625077e-06, + "loss": 0.6889, + "step": 37180 + }, + { + "epoch": 0.7988572410534004, + "grad_norm": 0.5326896100885731, + "learning_rate": 1.968889180488178e-06, + "loss": 0.6956, + "step": 37190 + }, + { + "epoch": 0.799072045366671, + "grad_norm": 0.5179996887681789, + "learning_rate": 1.9648295706528385e-06, + "loss": 0.691, + "step": 37200 + }, + { + "epoch": 0.7992868496799416, + "grad_norm": 0.49638076183346225, + "learning_rate": 1.96077369434279e-06, + "loss": 0.6881, + "step": 37210 + }, + { + "epoch": 0.7995016539932122, + "grad_norm": 0.5142154100208954, + "learning_rate": 1.9567215534425777e-06, + "loss": 0.6821, + "step": 37220 + }, + { + "epoch": 0.7997164583064827, + "grad_norm": 0.5097975763047736, + "learning_rate": 1.952673149835025e-06, + "loss": 0.6934, + "step": 37230 + }, + { + "epoch": 0.7999312626197534, + "grad_norm": 0.5188923865330697, + "learning_rate": 1.948628485401215e-06, + "loss": 0.7017, + "step": 37240 + }, + { + "epoch": 0.800146066933024, + "grad_norm": 
0.5393099873888024, + "learning_rate": 1.9445875620204846e-06, + "loss": 0.6985, + "step": 37250 + }, + { + "epoch": 0.8003608712462946, + "grad_norm": 0.5270181645580231, + "learning_rate": 1.940550381570453e-06, + "loss": 0.7074, + "step": 37260 + }, + { + "epoch": 0.8005756755595652, + "grad_norm": 0.49448978860517945, + "learning_rate": 1.9365169459269763e-06, + "loss": 0.704, + "step": 37270 + }, + { + "epoch": 0.8007904798728358, + "grad_norm": 0.5090418918653451, + "learning_rate": 1.932487256964191e-06, + "loss": 0.6871, + "step": 37280 + }, + { + "epoch": 0.8010052841861065, + "grad_norm": 0.5372331103704078, + "learning_rate": 1.9284613165544776e-06, + "loss": 0.7009, + "step": 37290 + }, + { + "epoch": 0.8012200884993771, + "grad_norm": 0.5123892333670363, + "learning_rate": 1.9244391265684836e-06, + "loss": 0.6939, + "step": 37300 + }, + { + "epoch": 0.8014348928126477, + "grad_norm": 0.4978925292624661, + "learning_rate": 1.9204206888751133e-06, + "loss": 0.6927, + "step": 37310 + }, + { + "epoch": 0.8016496971259183, + "grad_norm": 0.4954993561307306, + "learning_rate": 1.916406005341517e-06, + "loss": 0.6832, + "step": 37320 + }, + { + "epoch": 0.8018645014391889, + "grad_norm": 0.5248932509325274, + "learning_rate": 1.9123950778331204e-06, + "loss": 0.6889, + "step": 37330 + }, + { + "epoch": 0.8020793057524596, + "grad_norm": 0.503520388001021, + "learning_rate": 1.908387908213585e-06, + "loss": 0.6941, + "step": 37340 + }, + { + "epoch": 0.8022941100657301, + "grad_norm": 0.5217962284837745, + "learning_rate": 1.9043844983448356e-06, + "loss": 0.7014, + "step": 37350 + }, + { + "epoch": 0.8025089143790007, + "grad_norm": 0.5172482221861939, + "learning_rate": 1.9003848500870514e-06, + "loss": 0.6816, + "step": 37360 + }, + { + "epoch": 0.8027237186922713, + "grad_norm": 0.5171999415635867, + "learning_rate": 1.8963889652986533e-06, + "loss": 0.6918, + "step": 37370 + }, + { + "epoch": 0.8029385230055419, + "grad_norm": 0.531206719078998, + "learning_rate": 1.8923968458363307e-06, + "loss": 0.6894, + "step": 37380 + }, + { + "epoch": 0.8031533273188126, + "grad_norm": 0.509749112200991, + "learning_rate": 1.8884084935550063e-06, + "loss": 0.6937, + "step": 37390 + }, + { + "epoch": 0.8033681316320832, + "grad_norm": 0.5234029461890899, + "learning_rate": 1.884423910307861e-06, + "loss": 0.7173, + "step": 37400 + }, + { + "epoch": 0.8035829359453538, + "grad_norm": 0.520100769338295, + "learning_rate": 1.880443097946325e-06, + "loss": 0.6965, + "step": 37410 + }, + { + "epoch": 0.8037977402586244, + "grad_norm": 0.5198521028897318, + "learning_rate": 1.8764660583200733e-06, + "loss": 0.6857, + "step": 37420 + }, + { + "epoch": 0.804012544571895, + "grad_norm": 0.5170110623698808, + "learning_rate": 1.872492793277032e-06, + "loss": 0.6751, + "step": 37430 + }, + { + "epoch": 0.8042273488851656, + "grad_norm": 0.5058623102726381, + "learning_rate": 1.8685233046633655e-06, + "loss": 0.6925, + "step": 37440 + }, + { + "epoch": 0.8044421531984363, + "grad_norm": 0.5111088811413376, + "learning_rate": 1.8645575943234906e-06, + "loss": 0.6838, + "step": 37450 + }, + { + "epoch": 0.8046569575117068, + "grad_norm": 0.5010466802240249, + "learning_rate": 1.8605956641000678e-06, + "loss": 0.6894, + "step": 37460 + }, + { + "epoch": 0.8048717618249774, + "grad_norm": 0.5003570547858869, + "learning_rate": 1.8566375158339977e-06, + "loss": 0.6896, + "step": 37470 + }, + { + "epoch": 0.805086566138248, + "grad_norm": 0.5067814043345279, + "learning_rate": 1.852683151364426e-06, + "loss": 
0.6804, + "step": 37480 + }, + { + "epoch": 0.8053013704515186, + "grad_norm": 0.5060047213750052, + "learning_rate": 1.8487325725287419e-06, + "loss": 0.7015, + "step": 37490 + }, + { + "epoch": 0.8055161747647893, + "grad_norm": 0.5139506510936809, + "learning_rate": 1.8447857811625747e-06, + "loss": 0.6823, + "step": 37500 + }, + { + "epoch": 0.8057309790780599, + "grad_norm": 0.4996768937608163, + "learning_rate": 1.8408427790997873e-06, + "loss": 0.6838, + "step": 37510 + }, + { + "epoch": 0.8059457833913305, + "grad_norm": 0.5125914344469875, + "learning_rate": 1.8369035681724912e-06, + "loss": 0.6789, + "step": 37520 + }, + { + "epoch": 0.8061605877046011, + "grad_norm": 0.5101593507880621, + "learning_rate": 1.8329681502110308e-06, + "loss": 0.7152, + "step": 37530 + }, + { + "epoch": 0.8063753920178717, + "grad_norm": 0.5089159371492329, + "learning_rate": 1.8290365270439926e-06, + "loss": 0.6919, + "step": 37540 + }, + { + "epoch": 0.8065901963311424, + "grad_norm": 0.5046175462761139, + "learning_rate": 1.8251087004981972e-06, + "loss": 0.6962, + "step": 37550 + }, + { + "epoch": 0.806805000644413, + "grad_norm": 0.535765255941607, + "learning_rate": 1.821184672398698e-06, + "loss": 0.6824, + "step": 37560 + }, + { + "epoch": 0.8070198049576836, + "grad_norm": 0.5057658285384766, + "learning_rate": 1.817264444568787e-06, + "loss": 0.7004, + "step": 37570 + }, + { + "epoch": 0.8072346092709541, + "grad_norm": 0.5305219423002547, + "learning_rate": 1.8133480188299913e-06, + "loss": 0.6881, + "step": 37580 + }, + { + "epoch": 0.8074494135842247, + "grad_norm": 0.5093435581220985, + "learning_rate": 1.8094353970020705e-06, + "loss": 0.7052, + "step": 37590 + }, + { + "epoch": 0.8076642178974954, + "grad_norm": 0.5057006548211692, + "learning_rate": 1.8055265809030142e-06, + "loss": 0.6937, + "step": 37600 + }, + { + "epoch": 0.807879022210766, + "grad_norm": 0.5062825370919449, + "learning_rate": 1.8016215723490504e-06, + "loss": 0.711, + "step": 37610 + }, + { + "epoch": 0.8080938265240366, + "grad_norm": 0.5082475799530384, + "learning_rate": 1.7977203731546266e-06, + "loss": 0.7022, + "step": 37620 + }, + { + "epoch": 0.8083086308373072, + "grad_norm": 0.5006515892573048, + "learning_rate": 1.7938229851324308e-06, + "loss": 0.6874, + "step": 37630 + }, + { + "epoch": 0.8085234351505778, + "grad_norm": 0.5148519330223464, + "learning_rate": 1.789929410093375e-06, + "loss": 0.6914, + "step": 37640 + }, + { + "epoch": 0.8087382394638485, + "grad_norm": 0.5026476503083747, + "learning_rate": 1.7860396498466004e-06, + "loss": 0.6916, + "step": 37650 + }, + { + "epoch": 0.8089530437771191, + "grad_norm": 0.5054813405354616, + "learning_rate": 1.782153706199481e-06, + "loss": 0.6939, + "step": 37660 + }, + { + "epoch": 0.8091678480903897, + "grad_norm": 0.5104786331574065, + "learning_rate": 1.7782715809576023e-06, + "loss": 0.708, + "step": 37670 + }, + { + "epoch": 0.8093826524036603, + "grad_norm": 0.5104889216759052, + "learning_rate": 1.7743932759247973e-06, + "loss": 0.6941, + "step": 37680 + }, + { + "epoch": 0.8095974567169308, + "grad_norm": 0.5301500050999222, + "learning_rate": 1.7705187929031042e-06, + "loss": 0.6935, + "step": 37690 + }, + { + "epoch": 0.8098122610302015, + "grad_norm": 0.5008139461716258, + "learning_rate": 1.7666481336927965e-06, + "loss": 0.6966, + "step": 37700 + }, + { + "epoch": 0.8100270653434721, + "grad_norm": 0.5292331018191944, + "learning_rate": 1.7627813000923677e-06, + "loss": 0.6869, + "step": 37710 + }, + { + "epoch": 0.8102418696567427, + 
"grad_norm": 0.5214981520880394, + "learning_rate": 1.7589182938985338e-06, + "loss": 0.6881, + "step": 37720 + }, + { + "epoch": 0.8104566739700133, + "grad_norm": 0.5162236641163164, + "learning_rate": 1.755059116906236e-06, + "loss": 0.6898, + "step": 37730 + }, + { + "epoch": 0.8106714782832839, + "grad_norm": 0.5157714549239334, + "learning_rate": 1.751203770908627e-06, + "loss": 0.6875, + "step": 37740 + }, + { + "epoch": 0.8108862825965545, + "grad_norm": 0.5107688828062216, + "learning_rate": 1.7473522576970881e-06, + "loss": 0.7055, + "step": 37750 + }, + { + "epoch": 0.8111010869098252, + "grad_norm": 0.5467930765197646, + "learning_rate": 1.743504579061216e-06, + "loss": 0.6868, + "step": 37760 + }, + { + "epoch": 0.8113158912230958, + "grad_norm": 0.5160466173487962, + "learning_rate": 1.7396607367888286e-06, + "loss": 0.6901, + "step": 37770 + }, + { + "epoch": 0.8115306955363664, + "grad_norm": 0.5405614071125214, + "learning_rate": 1.7358207326659604e-06, + "loss": 0.689, + "step": 37780 + }, + { + "epoch": 0.811745499849637, + "grad_norm": 0.5336778524063787, + "learning_rate": 1.7319845684768533e-06, + "loss": 0.6959, + "step": 37790 + }, + { + "epoch": 0.8119603041629075, + "grad_norm": 0.5104195973429427, + "learning_rate": 1.7281522460039845e-06, + "loss": 0.6881, + "step": 37800 + }, + { + "epoch": 0.8121751084761782, + "grad_norm": 0.5172156188555316, + "learning_rate": 1.7243237670280267e-06, + "loss": 0.6996, + "step": 37810 + }, + { + "epoch": 0.8123899127894488, + "grad_norm": 0.5140221172801053, + "learning_rate": 1.7204991333278776e-06, + "loss": 0.7085, + "step": 37820 + }, + { + "epoch": 0.8126047171027194, + "grad_norm": 0.5224661451049724, + "learning_rate": 1.7166783466806458e-06, + "loss": 0.7031, + "step": 37830 + }, + { + "epoch": 0.81281952141599, + "grad_norm": 0.5110409285199197, + "learning_rate": 1.712861408861647e-06, + "loss": 0.7039, + "step": 37840 + }, + { + "epoch": 0.8130343257292606, + "grad_norm": 0.5427771272170174, + "learning_rate": 1.709048321644422e-06, + "loss": 0.7081, + "step": 37850 + }, + { + "epoch": 0.8132491300425313, + "grad_norm": 0.5132044659392205, + "learning_rate": 1.705239086800704e-06, + "loss": 0.7014, + "step": 37860 + }, + { + "epoch": 0.8134639343558019, + "grad_norm": 0.5148999630994164, + "learning_rate": 1.701433706100457e-06, + "loss": 0.6973, + "step": 37870 + }, + { + "epoch": 0.8136787386690725, + "grad_norm": 0.5079680548504395, + "learning_rate": 1.6976321813118346e-06, + "loss": 0.7022, + "step": 37880 + }, + { + "epoch": 0.8138935429823431, + "grad_norm": 0.4960125156821642, + "learning_rate": 1.6938345142012102e-06, + "loss": 0.6785, + "step": 37890 + }, + { + "epoch": 0.8141083472956137, + "grad_norm": 0.5272204610524729, + "learning_rate": 1.6900407065331649e-06, + "loss": 0.6939, + "step": 37900 + }, + { + "epoch": 0.8143231516088844, + "grad_norm": 0.4848903289756121, + "learning_rate": 1.6862507600704748e-06, + "loss": 0.6998, + "step": 37910 + }, + { + "epoch": 0.8145379559221549, + "grad_norm": 0.5052375107473448, + "learning_rate": 1.6824646765741426e-06, + "loss": 0.6828, + "step": 37920 + }, + { + "epoch": 0.8147527602354255, + "grad_norm": 0.5116653542411183, + "learning_rate": 1.6786824578033556e-06, + "loss": 0.6925, + "step": 37930 + }, + { + "epoch": 0.8149675645486961, + "grad_norm": 0.5191668583354877, + "learning_rate": 1.6749041055155157e-06, + "loss": 0.7069, + "step": 37940 + }, + { + "epoch": 0.8151823688619667, + "grad_norm": 0.5216137206032925, + "learning_rate": 
1.6711296214662308e-06, + "loss": 0.7014, + "step": 37950 + }, + { + "epoch": 0.8153971731752374, + "grad_norm": 0.5212090406729156, + "learning_rate": 1.6673590074092981e-06, + "loss": 0.6915, + "step": 37960 + }, + { + "epoch": 0.815611977488508, + "grad_norm": 0.5280444927816935, + "learning_rate": 1.6635922650967363e-06, + "loss": 0.7045, + "step": 37970 + }, + { + "epoch": 0.8158267818017786, + "grad_norm": 0.5302776730833281, + "learning_rate": 1.659829396278746e-06, + "loss": 0.698, + "step": 37980 + }, + { + "epoch": 0.8160415861150492, + "grad_norm": 0.5194339368072196, + "learning_rate": 1.65607040270374e-06, + "loss": 0.6844, + "step": 37990 + }, + { + "epoch": 0.8162563904283198, + "grad_norm": 0.5268789032071284, + "learning_rate": 1.6523152861183288e-06, + "loss": 0.6933, + "step": 38000 + }, + { + "epoch": 0.8164711947415904, + "grad_norm": 0.5034790229387067, + "learning_rate": 1.6485640482673126e-06, + "loss": 0.7029, + "step": 38010 + }, + { + "epoch": 0.8166859990548611, + "grad_norm": 0.5238642863385727, + "learning_rate": 1.6448166908937046e-06, + "loss": 0.6984, + "step": 38020 + }, + { + "epoch": 0.8169008033681316, + "grad_norm": 0.5384949344125787, + "learning_rate": 1.641073215738702e-06, + "loss": 0.6989, + "step": 38030 + }, + { + "epoch": 0.8171156076814022, + "grad_norm": 0.515676479439265, + "learning_rate": 1.6373336245417025e-06, + "loss": 0.6967, + "step": 38040 + }, + { + "epoch": 0.8173304119946728, + "grad_norm": 0.5176807526555749, + "learning_rate": 1.6335979190403006e-06, + "loss": 0.6783, + "step": 38050 + }, + { + "epoch": 0.8175452163079434, + "grad_norm": 0.5006394103833677, + "learning_rate": 1.6298661009702855e-06, + "loss": 0.6995, + "step": 38060 + }, + { + "epoch": 0.8177600206212141, + "grad_norm": 0.49950489053942465, + "learning_rate": 1.6261381720656378e-06, + "loss": 0.6855, + "step": 38070 + }, + { + "epoch": 0.8179748249344847, + "grad_norm": 0.5114816231641758, + "learning_rate": 1.6224141340585276e-06, + "loss": 0.7026, + "step": 38080 + }, + { + "epoch": 0.8181896292477553, + "grad_norm": 0.5242198840868686, + "learning_rate": 1.6186939886793307e-06, + "loss": 0.6848, + "step": 38090 + }, + { + "epoch": 0.8184044335610259, + "grad_norm": 0.5396691916322732, + "learning_rate": 1.6149777376565967e-06, + "loss": 0.689, + "step": 38100 + }, + { + "epoch": 0.8186192378742965, + "grad_norm": 0.5157018020298604, + "learning_rate": 1.6112653827170754e-06, + "loss": 0.6761, + "step": 38110 + }, + { + "epoch": 0.8188340421875672, + "grad_norm": 0.5249870157381407, + "learning_rate": 1.6075569255857104e-06, + "loss": 0.6992, + "step": 38120 + }, + { + "epoch": 0.8190488465008378, + "grad_norm": 0.5053189312234057, + "learning_rate": 1.6038523679856189e-06, + "loss": 0.6875, + "step": 38130 + }, + { + "epoch": 0.8192636508141083, + "grad_norm": 0.5293662990085333, + "learning_rate": 1.600151711638126e-06, + "loss": 0.6911, + "step": 38140 + }, + { + "epoch": 0.8194784551273789, + "grad_norm": 0.5048218038817608, + "learning_rate": 1.5964549582627276e-06, + "loss": 0.6813, + "step": 38150 + }, + { + "epoch": 0.8196932594406495, + "grad_norm": 0.5288004984283555, + "learning_rate": 1.592762109577114e-06, + "loss": 0.6959, + "step": 38160 + }, + { + "epoch": 0.8199080637539202, + "grad_norm": 0.5294144163040986, + "learning_rate": 1.5890731672971594e-06, + "loss": 0.7005, + "step": 38170 + }, + { + "epoch": 0.8201228680671908, + "grad_norm": 0.5143583114789181, + "learning_rate": 1.5853881331369247e-06, + "loss": 0.6998, + "step": 38180 + }, + { 
+ "epoch": 0.8203376723804614, + "grad_norm": 0.5009264407402928, + "learning_rate": 1.581707008808655e-06, + "loss": 0.6915, + "step": 38190 + }, + { + "epoch": 0.820552476693732, + "grad_norm": 0.4988425919957875, + "learning_rate": 1.5780297960227708e-06, + "loss": 0.6996, + "step": 38200 + }, + { + "epoch": 0.8207672810070026, + "grad_norm": 0.5001446365022995, + "learning_rate": 1.5743564964878866e-06, + "loss": 0.7043, + "step": 38210 + }, + { + "epoch": 0.8209820853202733, + "grad_norm": 0.5238040005094907, + "learning_rate": 1.5706871119107914e-06, + "loss": 0.6975, + "step": 38220 + }, + { + "epoch": 0.8211968896335439, + "grad_norm": 0.49772899954971367, + "learning_rate": 1.567021643996458e-06, + "loss": 0.69, + "step": 38230 + }, + { + "epoch": 0.8214116939468145, + "grad_norm": 0.5226931397210955, + "learning_rate": 1.5633600944480377e-06, + "loss": 0.7145, + "step": 38240 + }, + { + "epoch": 0.821626498260085, + "grad_norm": 0.5111897161908127, + "learning_rate": 1.5597024649668645e-06, + "loss": 0.6868, + "step": 38250 + }, + { + "epoch": 0.8218413025733556, + "grad_norm": 0.5194930113430709, + "learning_rate": 1.5560487572524452e-06, + "loss": 0.694, + "step": 38260 + }, + { + "epoch": 0.8220561068866263, + "grad_norm": 0.5122991460889587, + "learning_rate": 1.552398973002467e-06, + "loss": 0.6853, + "step": 38270 + }, + { + "epoch": 0.8222709111998969, + "grad_norm": 0.5144915325254265, + "learning_rate": 1.5487531139127976e-06, + "loss": 0.6981, + "step": 38280 + }, + { + "epoch": 0.8224857155131675, + "grad_norm": 0.5099897016133259, + "learning_rate": 1.5451111816774756e-06, + "loss": 0.7032, + "step": 38290 + }, + { + "epoch": 0.8227005198264381, + "grad_norm": 0.5162555967078819, + "learning_rate": 1.5414731779887182e-06, + "loss": 0.6842, + "step": 38300 + }, + { + "epoch": 0.8229153241397087, + "grad_norm": 0.5338368411100364, + "learning_rate": 1.537839104536918e-06, + "loss": 0.688, + "step": 38310 + }, + { + "epoch": 0.8231301284529793, + "grad_norm": 0.5112979366014295, + "learning_rate": 1.534208963010636e-06, + "loss": 0.6911, + "step": 38320 + }, + { + "epoch": 0.82334493276625, + "grad_norm": 0.5087777288883957, + "learning_rate": 1.5305827550966113e-06, + "loss": 0.6872, + "step": 38330 + }, + { + "epoch": 0.8235597370795206, + "grad_norm": 0.49160735113437615, + "learning_rate": 1.5269604824797556e-06, + "loss": 0.6877, + "step": 38340 + }, + { + "epoch": 0.8237745413927912, + "grad_norm": 0.5281039504251113, + "learning_rate": 1.5233421468431475e-06, + "loss": 0.6916, + "step": 38350 + }, + { + "epoch": 0.8239893457060617, + "grad_norm": 0.49685966763451844, + "learning_rate": 1.519727749868042e-06, + "loss": 0.6868, + "step": 38360 + }, + { + "epoch": 0.8242041500193323, + "grad_norm": 0.5292477798818466, + "learning_rate": 1.516117293233862e-06, + "loss": 0.6813, + "step": 38370 + }, + { + "epoch": 0.824418954332603, + "grad_norm": 0.4922303762705952, + "learning_rate": 1.5125107786181948e-06, + "loss": 0.6939, + "step": 38380 + }, + { + "epoch": 0.8246337586458736, + "grad_norm": 0.5098238815457077, + "learning_rate": 1.508908207696801e-06, + "loss": 0.7038, + "step": 38390 + }, + { + "epoch": 0.8248485629591442, + "grad_norm": 0.5204133525606097, + "learning_rate": 1.5053095821436092e-06, + "loss": 0.6912, + "step": 38400 + }, + { + "epoch": 0.8250633672724148, + "grad_norm": 0.5110478312144826, + "learning_rate": 1.501714903630712e-06, + "loss": 0.6935, + "step": 38410 + }, + { + "epoch": 0.8252781715856854, + "grad_norm": 0.4977353499043285, + 
"learning_rate": 1.4981241738283724e-06, + "loss": 0.6926, + "step": 38420 + }, + { + "epoch": 0.8254929758989561, + "grad_norm": 0.49726992306043954, + "learning_rate": 1.494537394405008e-06, + "loss": 0.6898, + "step": 38430 + }, + { + "epoch": 0.8257077802122267, + "grad_norm": 0.5192905390752384, + "learning_rate": 1.4909545670272186e-06, + "loss": 0.6868, + "step": 38440 + }, + { + "epoch": 0.8259225845254973, + "grad_norm": 0.5050774765754228, + "learning_rate": 1.4873756933597505e-06, + "loss": 0.6783, + "step": 38450 + }, + { + "epoch": 0.8261373888387679, + "grad_norm": 0.5182788128937821, + "learning_rate": 1.4838007750655214e-06, + "loss": 0.6937, + "step": 38460 + }, + { + "epoch": 0.8263521931520385, + "grad_norm": 0.5075379442809032, + "learning_rate": 1.4802298138056104e-06, + "loss": 0.6824, + "step": 38470 + }, + { + "epoch": 0.8265669974653092, + "grad_norm": 0.5149603108095413, + "learning_rate": 1.4766628112392566e-06, + "loss": 0.7117, + "step": 38480 + }, + { + "epoch": 0.8267818017785797, + "grad_norm": 0.5170003208350263, + "learning_rate": 1.4730997690238635e-06, + "loss": 0.6859, + "step": 38490 + }, + { + "epoch": 0.8269966060918503, + "grad_norm": 0.5188232774350414, + "learning_rate": 1.4695406888149832e-06, + "loss": 0.6959, + "step": 38500 + }, + { + "epoch": 0.8272114104051209, + "grad_norm": 0.6124746424283791, + "learning_rate": 1.4659855722663453e-06, + "loss": 0.6881, + "step": 38510 + }, + { + "epoch": 0.8274262147183915, + "grad_norm": 0.5820473251466972, + "learning_rate": 1.4624344210298202e-06, + "loss": 0.7056, + "step": 38520 + }, + { + "epoch": 0.8276410190316622, + "grad_norm": 0.5487848245547172, + "learning_rate": 1.4588872367554452e-06, + "loss": 0.6813, + "step": 38530 + }, + { + "epoch": 0.8278558233449328, + "grad_norm": 0.5052583706726699, + "learning_rate": 1.455344021091414e-06, + "loss": 0.702, + "step": 38540 + }, + { + "epoch": 0.8280706276582034, + "grad_norm": 0.5449975100876051, + "learning_rate": 1.451804775684067e-06, + "loss": 0.6823, + "step": 38550 + }, + { + "epoch": 0.828285431971474, + "grad_norm": 0.5086547663242421, + "learning_rate": 1.448269502177918e-06, + "loss": 0.6842, + "step": 38560 + }, + { + "epoch": 0.8285002362847446, + "grad_norm": 0.5137832027200622, + "learning_rate": 1.4447382022156165e-06, + "loss": 0.687, + "step": 38570 + }, + { + "epoch": 0.8287150405980153, + "grad_norm": 0.49746601253508876, + "learning_rate": 1.4412108774379775e-06, + "loss": 0.6818, + "step": 38580 + }, + { + "epoch": 0.8289298449112859, + "grad_norm": 0.5103152319908182, + "learning_rate": 1.4376875294839643e-06, + "loss": 0.6914, + "step": 38590 + }, + { + "epoch": 0.8291446492245564, + "grad_norm": 0.5275562256088486, + "learning_rate": 1.4341681599906897e-06, + "loss": 0.6901, + "step": 38600 + }, + { + "epoch": 0.829359453537827, + "grad_norm": 0.5062231253926271, + "learning_rate": 1.4306527705934293e-06, + "loss": 0.6869, + "step": 38610 + }, + { + "epoch": 0.8295742578510976, + "grad_norm": 0.5338108634050998, + "learning_rate": 1.4271413629255937e-06, + "loss": 0.6922, + "step": 38620 + }, + { + "epoch": 0.8297890621643682, + "grad_norm": 0.5016771702455557, + "learning_rate": 1.4236339386187547e-06, + "loss": 0.6893, + "step": 38630 + }, + { + "epoch": 0.8300038664776389, + "grad_norm": 0.5191984822566196, + "learning_rate": 1.4201304993026321e-06, + "loss": 0.6925, + "step": 38640 + }, + { + "epoch": 0.8302186707909095, + "grad_norm": 0.5207586260457043, + "learning_rate": 1.4166310466050836e-06, + "loss": 0.6847, + 
"step": 38650 + }, + { + "epoch": 0.8304334751041801, + "grad_norm": 0.5139953998878591, + "learning_rate": 1.413135582152133e-06, + "loss": 0.685, + "step": 38660 + }, + { + "epoch": 0.8306482794174507, + "grad_norm": 0.5263182428133765, + "learning_rate": 1.4096441075679325e-06, + "loss": 0.6863, + "step": 38670 + }, + { + "epoch": 0.8308630837307213, + "grad_norm": 0.5282654822309718, + "learning_rate": 1.4061566244747937e-06, + "loss": 0.6957, + "step": 38680 + }, + { + "epoch": 0.831077888043992, + "grad_norm": 0.5209258215224781, + "learning_rate": 1.4026731344931655e-06, + "loss": 0.6956, + "step": 38690 + }, + { + "epoch": 0.8312926923572626, + "grad_norm": 0.5106724966574794, + "learning_rate": 1.399193639241645e-06, + "loss": 0.684, + "step": 38700 + }, + { + "epoch": 0.8315074966705331, + "grad_norm": 0.5033855375018464, + "learning_rate": 1.3957181403369747e-06, + "loss": 0.6867, + "step": 38710 + }, + { + "epoch": 0.8317223009838037, + "grad_norm": 0.5085681808640763, + "learning_rate": 1.3922466393940315e-06, + "loss": 0.6937, + "step": 38720 + }, + { + "epoch": 0.8319371052970743, + "grad_norm": 0.5034486647222803, + "learning_rate": 1.3887791380258509e-06, + "loss": 0.7001, + "step": 38730 + }, + { + "epoch": 0.832151909610345, + "grad_norm": 0.5080003477917522, + "learning_rate": 1.3853156378435916e-06, + "loss": 0.6964, + "step": 38740 + }, + { + "epoch": 0.8323667139236156, + "grad_norm": 0.5111977023138986, + "learning_rate": 1.3818561404565645e-06, + "loss": 0.6881, + "step": 38750 + }, + { + "epoch": 0.8325815182368862, + "grad_norm": 0.5061875410904043, + "learning_rate": 1.3784006474722212e-06, + "loss": 0.6964, + "step": 38760 + }, + { + "epoch": 0.8327963225501568, + "grad_norm": 0.5333922720806704, + "learning_rate": 1.374949160496143e-06, + "loss": 0.6946, + "step": 38770 + }, + { + "epoch": 0.8330111268634274, + "grad_norm": 0.5285837150812875, + "learning_rate": 1.3715016811320626e-06, + "loss": 0.6896, + "step": 38780 + }, + { + "epoch": 0.8332259311766981, + "grad_norm": 0.519609938847976, + "learning_rate": 1.3680582109818396e-06, + "loss": 0.6826, + "step": 38790 + }, + { + "epoch": 0.8334407354899687, + "grad_norm": 0.5263908151996978, + "learning_rate": 1.3646187516454778e-06, + "loss": 0.6948, + "step": 38800 + }, + { + "epoch": 0.8336555398032393, + "grad_norm": 0.50222280387675, + "learning_rate": 1.3611833047211132e-06, + "loss": 0.6938, + "step": 38810 + }, + { + "epoch": 0.8338703441165098, + "grad_norm": 0.5295682152493145, + "learning_rate": 1.3577518718050199e-06, + "loss": 0.6847, + "step": 38820 + }, + { + "epoch": 0.8340851484297804, + "grad_norm": 0.526283044923053, + "learning_rate": 1.3543244544916078e-06, + "loss": 0.6799, + "step": 38830 + }, + { + "epoch": 0.8342999527430511, + "grad_norm": 0.521205682856861, + "learning_rate": 1.3509010543734158e-06, + "loss": 0.7013, + "step": 38840 + }, + { + "epoch": 0.8345147570563217, + "grad_norm": 0.499623044668992, + "learning_rate": 1.347481673041121e-06, + "loss": 0.6916, + "step": 38850 + }, + { + "epoch": 0.8347295613695923, + "grad_norm": 0.49936320967671904, + "learning_rate": 1.3440663120835329e-06, + "loss": 0.6938, + "step": 38860 + }, + { + "epoch": 0.8349443656828629, + "grad_norm": 0.5293633851844111, + "learning_rate": 1.3406549730875916e-06, + "loss": 0.6836, + "step": 38870 + }, + { + "epoch": 0.8351591699961335, + "grad_norm": 0.5168353636514088, + "learning_rate": 1.3372476576383675e-06, + "loss": 0.6904, + "step": 38880 + }, + { + "epoch": 0.8353739743094042, + "grad_norm": 
0.49422042008773903, + "learning_rate": 1.3338443673190637e-06, + "loss": 0.695, + "step": 38890 + }, + { + "epoch": 0.8355887786226748, + "grad_norm": 0.5056409802616189, + "learning_rate": 1.3304451037110132e-06, + "loss": 0.6968, + "step": 38900 + }, + { + "epoch": 0.8358035829359454, + "grad_norm": 0.517589863459402, + "learning_rate": 1.3270498683936738e-06, + "loss": 0.69, + "step": 38910 + }, + { + "epoch": 0.836018387249216, + "grad_norm": 0.5214346220274612, + "learning_rate": 1.3236586629446369e-06, + "loss": 0.6914, + "step": 38920 + }, + { + "epoch": 0.8362331915624865, + "grad_norm": 0.5087168684935114, + "learning_rate": 1.3202714889396172e-06, + "loss": 0.6795, + "step": 38930 + }, + { + "epoch": 0.8364479958757571, + "grad_norm": 0.5205133962298867, + "learning_rate": 1.3168883479524598e-06, + "loss": 0.6909, + "step": 38940 + }, + { + "epoch": 0.8366628001890278, + "grad_norm": 0.5318929561552359, + "learning_rate": 1.3135092415551354e-06, + "loss": 0.7047, + "step": 38950 + }, + { + "epoch": 0.8368776045022984, + "grad_norm": 0.5028926526152647, + "learning_rate": 1.3101341713177351e-06, + "loss": 0.6878, + "step": 38960 + }, + { + "epoch": 0.837092408815569, + "grad_norm": 0.5277421426222694, + "learning_rate": 1.306763138808479e-06, + "loss": 0.6932, + "step": 38970 + }, + { + "epoch": 0.8373072131288396, + "grad_norm": 0.4805738830452809, + "learning_rate": 1.3033961455937127e-06, + "loss": 0.6822, + "step": 38980 + }, + { + "epoch": 0.8375220174421102, + "grad_norm": 0.5108469134488737, + "learning_rate": 1.3000331932379007e-06, + "loss": 0.6893, + "step": 38990 + }, + { + "epoch": 0.8377368217553809, + "grad_norm": 0.5201560237520049, + "learning_rate": 1.2966742833036327e-06, + "loss": 0.6857, + "step": 39000 + }, + { + "epoch": 0.8379516260686515, + "grad_norm": 0.5188163877062854, + "learning_rate": 1.2933194173516218e-06, + "loss": 0.7, + "step": 39010 + }, + { + "epoch": 0.8381664303819221, + "grad_norm": 0.5093643945181999, + "learning_rate": 1.289968596940694e-06, + "loss": 0.7081, + "step": 39020 + }, + { + "epoch": 0.8383812346951927, + "grad_norm": 0.5203767610471154, + "learning_rate": 1.2866218236278038e-06, + "loss": 0.6819, + "step": 39030 + }, + { + "epoch": 0.8385960390084632, + "grad_norm": 0.5184824000269601, + "learning_rate": 1.2832790989680221e-06, + "loss": 0.6855, + "step": 39040 + }, + { + "epoch": 0.838810843321734, + "grad_norm": 0.5019301811954551, + "learning_rate": 1.2799404245145396e-06, + "loss": 0.6947, + "step": 39050 + }, + { + "epoch": 0.8390256476350045, + "grad_norm": 0.5346206336302403, + "learning_rate": 1.2766058018186645e-06, + "loss": 0.703, + "step": 39060 + }, + { + "epoch": 0.8392404519482751, + "grad_norm": 0.5227834077619942, + "learning_rate": 1.2732752324298225e-06, + "loss": 0.6945, + "step": 39070 + }, + { + "epoch": 0.8394552562615457, + "grad_norm": 0.5216423504327496, + "learning_rate": 1.2699487178955573e-06, + "loss": 0.6927, + "step": 39080 + }, + { + "epoch": 0.8396700605748163, + "grad_norm": 0.5446631129970325, + "learning_rate": 1.266626259761523e-06, + "loss": 0.6824, + "step": 39090 + }, + { + "epoch": 0.839884864888087, + "grad_norm": 0.49940235609166955, + "learning_rate": 1.2633078595714943e-06, + "loss": 0.7022, + "step": 39100 + }, + { + "epoch": 0.8400996692013576, + "grad_norm": 0.5079994069706872, + "learning_rate": 1.2599935188673596e-06, + "loss": 0.6923, + "step": 39110 + }, + { + "epoch": 0.8403144735146282, + "grad_norm": 0.5057473773527276, + "learning_rate": 1.2566832391891215e-06, + 
"loss": 0.7097, + "step": 39120 + }, + { + "epoch": 0.8405292778278988, + "grad_norm": 0.5100763490334336, + "learning_rate": 1.2533770220748942e-06, + "loss": 0.679, + "step": 39130 + }, + { + "epoch": 0.8407440821411694, + "grad_norm": 0.5003721745532178, + "learning_rate": 1.2500748690609033e-06, + "loss": 0.699, + "step": 39140 + }, + { + "epoch": 0.8409588864544401, + "grad_norm": 0.5137872175833427, + "learning_rate": 1.246776781681487e-06, + "loss": 0.6928, + "step": 39150 + }, + { + "epoch": 0.8411736907677106, + "grad_norm": 0.497587322672259, + "learning_rate": 1.2434827614690958e-06, + "loss": 0.6988, + "step": 39160 + }, + { + "epoch": 0.8413884950809812, + "grad_norm": 0.5202834990746827, + "learning_rate": 1.2401928099542892e-06, + "loss": 0.6861, + "step": 39170 + }, + { + "epoch": 0.8416032993942518, + "grad_norm": 0.5189296596390629, + "learning_rate": 1.2369069286657388e-06, + "loss": 0.6805, + "step": 39180 + }, + { + "epoch": 0.8418181037075224, + "grad_norm": 0.5141868855723348, + "learning_rate": 1.2336251191302162e-06, + "loss": 0.6897, + "step": 39190 + }, + { + "epoch": 0.842032908020793, + "grad_norm": 0.5124494606085882, + "learning_rate": 1.2303473828726153e-06, + "loss": 0.6957, + "step": 39200 + }, + { + "epoch": 0.8422477123340637, + "grad_norm": 0.5214072352573822, + "learning_rate": 1.2270737214159245e-06, + "loss": 0.6796, + "step": 39210 + }, + { + "epoch": 0.8424625166473343, + "grad_norm": 0.528792736816499, + "learning_rate": 1.2238041362812435e-06, + "loss": 0.6899, + "step": 39220 + }, + { + "epoch": 0.8426773209606049, + "grad_norm": 0.5093238296040209, + "learning_rate": 1.2205386289877829e-06, + "loss": 0.6911, + "step": 39230 + }, + { + "epoch": 0.8428921252738755, + "grad_norm": 0.4978808338740638, + "learning_rate": 1.2172772010528455e-06, + "loss": 0.6933, + "step": 39240 + }, + { + "epoch": 0.8431069295871461, + "grad_norm": 0.5325179259250296, + "learning_rate": 1.2140198539918568e-06, + "loss": 0.6933, + "step": 39250 + }, + { + "epoch": 0.8433217339004168, + "grad_norm": 0.5239028106635986, + "learning_rate": 1.2107665893183274e-06, + "loss": 0.6845, + "step": 39260 + }, + { + "epoch": 0.8435365382136873, + "grad_norm": 0.5206218974407494, + "learning_rate": 1.2075174085438878e-06, + "loss": 0.704, + "step": 39270 + }, + { + "epoch": 0.8437513425269579, + "grad_norm": 0.5234721000920526, + "learning_rate": 1.2042723131782586e-06, + "loss": 0.6886, + "step": 39280 + }, + { + "epoch": 0.8439661468402285, + "grad_norm": 0.5166325698154588, + "learning_rate": 1.2010313047292677e-06, + "loss": 0.6921, + "step": 39290 + }, + { + "epoch": 0.8441809511534991, + "grad_norm": 0.5013497589587982, + "learning_rate": 1.197794384702844e-06, + "loss": 0.6839, + "step": 39300 + }, + { + "epoch": 0.8443957554667698, + "grad_norm": 0.5392745947364839, + "learning_rate": 1.1945615546030109e-06, + "loss": 0.6848, + "step": 39310 + }, + { + "epoch": 0.8446105597800404, + "grad_norm": 0.5083861122426512, + "learning_rate": 1.1913328159319049e-06, + "loss": 0.6914, + "step": 39320 + }, + { + "epoch": 0.844825364093311, + "grad_norm": 0.49717604044905356, + "learning_rate": 1.1881081701897446e-06, + "loss": 0.6882, + "step": 39330 + }, + { + "epoch": 0.8450401684065816, + "grad_norm": 0.5273892484603656, + "learning_rate": 1.1848876188748583e-06, + "loss": 0.7001, + "step": 39340 + }, + { + "epoch": 0.8452549727198522, + "grad_norm": 0.5274494392930585, + "learning_rate": 1.1816711634836708e-06, + "loss": 0.6842, + "step": 39350 + }, + { + "epoch": 
0.8454697770331229, + "grad_norm": 0.5550754818367603, + "learning_rate": 1.1784588055106937e-06, + "loss": 0.6957, + "step": 39360 + }, + { + "epoch": 0.8456845813463935, + "grad_norm": 0.5093805942815454, + "learning_rate": 1.175250546448553e-06, + "loss": 0.69, + "step": 39370 + }, + { + "epoch": 0.845899385659664, + "grad_norm": 0.5291605321484583, + "learning_rate": 1.1720463877879518e-06, + "loss": 0.6954, + "step": 39380 + }, + { + "epoch": 0.8461141899729346, + "grad_norm": 0.5213523166182829, + "learning_rate": 1.1688463310176979e-06, + "loss": 0.7063, + "step": 39390 + }, + { + "epoch": 0.8463289942862052, + "grad_norm": 0.5043482415928106, + "learning_rate": 1.1656503776246942e-06, + "loss": 0.6896, + "step": 39400 + }, + { + "epoch": 0.8465437985994759, + "grad_norm": 0.5340179003407842, + "learning_rate": 1.1624585290939261e-06, + "loss": 0.6895, + "step": 39410 + }, + { + "epoch": 0.8467586029127465, + "grad_norm": 0.5334819112678492, + "learning_rate": 1.1592707869084896e-06, + "loss": 0.6925, + "step": 39420 + }, + { + "epoch": 0.8469734072260171, + "grad_norm": 0.5196334748388658, + "learning_rate": 1.156087152549552e-06, + "loss": 0.6972, + "step": 39430 + }, + { + "epoch": 0.8471882115392877, + "grad_norm": 0.5212173781531991, + "learning_rate": 1.1529076274963925e-06, + "loss": 0.6837, + "step": 39440 + }, + { + "epoch": 0.8474030158525583, + "grad_norm": 0.5271644084715066, + "learning_rate": 1.1497322132263643e-06, + "loss": 0.6781, + "step": 39450 + }, + { + "epoch": 0.847617820165829, + "grad_norm": 0.5251522649812483, + "learning_rate": 1.1465609112149178e-06, + "loss": 0.6966, + "step": 39460 + }, + { + "epoch": 0.8478326244790996, + "grad_norm": 0.5028358197340315, + "learning_rate": 1.1433937229355951e-06, + "loss": 0.682, + "step": 39470 + }, + { + "epoch": 0.8480474287923702, + "grad_norm": 0.5260187617540428, + "learning_rate": 1.140230649860018e-06, + "loss": 0.7039, + "step": 39480 + }, + { + "epoch": 0.8482622331056408, + "grad_norm": 0.49354427489124325, + "learning_rate": 1.1370716934579086e-06, + "loss": 0.6806, + "step": 39490 + }, + { + "epoch": 0.8484770374189113, + "grad_norm": 0.5353136217093745, + "learning_rate": 1.1339168551970647e-06, + "loss": 0.7164, + "step": 39500 + }, + { + "epoch": 0.8486918417321819, + "grad_norm": 0.5172265496588412, + "learning_rate": 1.1307661365433763e-06, + "loss": 0.6857, + "step": 39510 + }, + { + "epoch": 0.8489066460454526, + "grad_norm": 0.5240872817688168, + "learning_rate": 1.1276195389608201e-06, + "loss": 0.6842, + "step": 39520 + }, + { + "epoch": 0.8491214503587232, + "grad_norm": 0.5120247735049234, + "learning_rate": 1.1244770639114522e-06, + "loss": 0.7057, + "step": 39530 + }, + { + "epoch": 0.8493362546719938, + "grad_norm": 0.49496102620753335, + "learning_rate": 1.1213387128554232e-06, + "loss": 0.6822, + "step": 39540 + }, + { + "epoch": 0.8495510589852644, + "grad_norm": 0.5000858443900138, + "learning_rate": 1.1182044872509556e-06, + "loss": 0.6881, + "step": 39550 + }, + { + "epoch": 0.849765863298535, + "grad_norm": 0.5040809516882646, + "learning_rate": 1.1150743885543636e-06, + "loss": 0.6947, + "step": 39560 + }, + { + "epoch": 0.8499806676118057, + "grad_norm": 0.5420538811255085, + "learning_rate": 1.11194841822004e-06, + "loss": 0.6772, + "step": 39570 + }, + { + "epoch": 0.8501954719250763, + "grad_norm": 0.504747244903184, + "learning_rate": 1.1088265777004613e-06, + "loss": 0.6809, + "step": 39580 + }, + { + "epoch": 0.8504102762383469, + "grad_norm": 0.49931071427553786, + 
"learning_rate": 1.105708868446187e-06, + "loss": 0.6829, + "step": 39590 + }, + { + "epoch": 0.8506250805516175, + "grad_norm": 0.5090934536047, + "learning_rate": 1.102595291905848e-06, + "loss": 0.694, + "step": 39600 + }, + { + "epoch": 0.850839884864888, + "grad_norm": 0.4994127034268232, + "learning_rate": 1.099485849526165e-06, + "loss": 0.6967, + "step": 39610 + }, + { + "epoch": 0.8510546891781587, + "grad_norm": 0.5202678470784433, + "learning_rate": 1.0963805427519335e-06, + "loss": 0.6893, + "step": 39620 + }, + { + "epoch": 0.8512694934914293, + "grad_norm": 0.5007238242310413, + "learning_rate": 1.0932793730260272e-06, + "loss": 0.6945, + "step": 39630 + }, + { + "epoch": 0.8514842978046999, + "grad_norm": 0.5208651575984488, + "learning_rate": 1.0901823417893997e-06, + "loss": 0.6903, + "step": 39640 + }, + { + "epoch": 0.8516991021179705, + "grad_norm": 0.511764861180484, + "learning_rate": 1.087089450481078e-06, + "loss": 0.6934, + "step": 39650 + }, + { + "epoch": 0.8519139064312411, + "grad_norm": 0.5194391424779614, + "learning_rate": 1.0840007005381714e-06, + "loss": 0.6772, + "step": 39660 + }, + { + "epoch": 0.8521287107445118, + "grad_norm": 0.5000894132679686, + "learning_rate": 1.0809160933958574e-06, + "loss": 0.6918, + "step": 39670 + }, + { + "epoch": 0.8523435150577824, + "grad_norm": 0.5349623963637503, + "learning_rate": 1.0778356304873927e-06, + "loss": 0.6931, + "step": 39680 + }, + { + "epoch": 0.852558319371053, + "grad_norm": 0.5127707494353972, + "learning_rate": 1.074759313244108e-06, + "loss": 0.6858, + "step": 39690 + }, + { + "epoch": 0.8527731236843236, + "grad_norm": 0.5050351323720006, + "learning_rate": 1.07168714309541e-06, + "loss": 0.6983, + "step": 39700 + }, + { + "epoch": 0.8529879279975942, + "grad_norm": 0.5042118701302163, + "learning_rate": 1.0686191214687758e-06, + "loss": 0.6907, + "step": 39710 + }, + { + "epoch": 0.8532027323108649, + "grad_norm": 0.5319983433434204, + "learning_rate": 1.0655552497897525e-06, + "loss": 0.6797, + "step": 39720 + }, + { + "epoch": 0.8534175366241354, + "grad_norm": 0.5126038269348526, + "learning_rate": 1.0624955294819628e-06, + "loss": 0.6798, + "step": 39730 + }, + { + "epoch": 0.853632340937406, + "grad_norm": 0.5005716309497225, + "learning_rate": 1.0594399619671013e-06, + "loss": 0.6933, + "step": 39740 + }, + { + "epoch": 0.8538471452506766, + "grad_norm": 0.5167440374916815, + "learning_rate": 1.0563885486649284e-06, + "loss": 0.6905, + "step": 39750 + }, + { + "epoch": 0.8540619495639472, + "grad_norm": 0.504354990255589, + "learning_rate": 1.0533412909932793e-06, + "loss": 0.6799, + "step": 39760 + }, + { + "epoch": 0.8542767538772179, + "grad_norm": 0.5143195869144642, + "learning_rate": 1.0502981903680565e-06, + "loss": 0.6833, + "step": 39770 + }, + { + "epoch": 0.8544915581904885, + "grad_norm": 0.49947041857768637, + "learning_rate": 1.0472592482032272e-06, + "loss": 0.6746, + "step": 39780 + }, + { + "epoch": 0.8547063625037591, + "grad_norm": 0.5217962095232429, + "learning_rate": 1.0442244659108325e-06, + "loss": 0.7035, + "step": 39790 + }, + { + "epoch": 0.8549211668170297, + "grad_norm": 0.5096704985368335, + "learning_rate": 1.0411938449009752e-06, + "loss": 0.6825, + "step": 39800 + }, + { + "epoch": 0.8551359711303003, + "grad_norm": 0.5135857160178005, + "learning_rate": 1.0381673865818288e-06, + "loss": 0.6831, + "step": 39810 + }, + { + "epoch": 0.8553507754435709, + "grad_norm": 0.522664943240014, + "learning_rate": 1.0351450923596317e-06, + "loss": 0.6931, + "step": 39820 
+ }, + { + "epoch": 0.8555655797568416, + "grad_norm": 0.49812520742428695, + "learning_rate": 1.0321269636386833e-06, + "loss": 0.6881, + "step": 39830 + }, + { + "epoch": 0.8557803840701121, + "grad_norm": 0.5120365797945067, + "learning_rate": 1.0291130018213558e-06, + "loss": 0.6995, + "step": 39840 + }, + { + "epoch": 0.8559951883833827, + "grad_norm": 0.5392653302148872, + "learning_rate": 1.0261032083080746e-06, + "loss": 0.6926, + "step": 39850 + }, + { + "epoch": 0.8562099926966533, + "grad_norm": 1.451337023254773, + "learning_rate": 1.023097584497339e-06, + "loss": 0.6877, + "step": 39860 + }, + { + "epoch": 0.8564247970099239, + "grad_norm": 0.5476743766148364, + "learning_rate": 1.0200961317857027e-06, + "loss": 0.6825, + "step": 39870 + }, + { + "epoch": 0.8566396013231946, + "grad_norm": 0.5313081010660222, + "learning_rate": 1.017098851567785e-06, + "loss": 0.7, + "step": 39880 + }, + { + "epoch": 0.8568544056364652, + "grad_norm": 0.5362672147803951, + "learning_rate": 1.014105745236269e-06, + "loss": 0.6909, + "step": 39890 + }, + { + "epoch": 0.8570692099497358, + "grad_norm": 0.5217352952005082, + "learning_rate": 1.0111168141818916e-06, + "loss": 0.6921, + "step": 39900 + }, + { + "epoch": 0.8572840142630064, + "grad_norm": 0.5111751841927796, + "learning_rate": 1.0081320597934542e-06, + "loss": 0.6819, + "step": 39910 + }, + { + "epoch": 0.857498818576277, + "grad_norm": 0.4993174113029642, + "learning_rate": 1.0051514834578169e-06, + "loss": 0.6958, + "step": 39920 + }, + { + "epoch": 0.8577136228895477, + "grad_norm": 0.5053368624600403, + "learning_rate": 1.002175086559899e-06, + "loss": 0.6881, + "step": 39930 + }, + { + "epoch": 0.8579284272028183, + "grad_norm": 0.5130032786788024, + "learning_rate": 9.992028704826785e-07, + "loss": 0.7007, + "step": 39940 + }, + { + "epoch": 0.8581432315160888, + "grad_norm": 0.5157365810431223, + "learning_rate": 9.96234836607184e-07, + "loss": 0.6744, + "step": 39950 + }, + { + "epoch": 0.8583580358293594, + "grad_norm": 0.527685117613382, + "learning_rate": 9.932709863125133e-07, + "loss": 0.6949, + "step": 39960 + }, + { + "epoch": 0.85857284014263, + "grad_norm": 0.512839755051422, + "learning_rate": 9.903113209758098e-07, + "loss": 0.6948, + "step": 39970 + }, + { + "epoch": 0.8587876444559007, + "grad_norm": 0.5211080071086338, + "learning_rate": 9.87355841972274e-07, + "loss": 0.71, + "step": 39980 + }, + { + "epoch": 0.8590024487691713, + "grad_norm": 0.5098392488580249, + "learning_rate": 9.844045506751687e-07, + "loss": 0.6889, + "step": 39990 + }, + { + "epoch": 0.8592172530824419, + "grad_norm": 0.5218396209074099, + "learning_rate": 9.81457448455797e-07, + "loss": 0.6872, + "step": 40000 + }, + { + "epoch": 0.8594320573957125, + "grad_norm": 0.5008077047222401, + "learning_rate": 9.785145366835325e-07, + "loss": 0.6867, + "step": 40010 + }, + { + "epoch": 0.8596468617089831, + "grad_norm": 0.5055415929729381, + "learning_rate": 9.755758167257857e-07, + "loss": 0.7009, + "step": 40020 + }, + { + "epoch": 0.8598616660222538, + "grad_norm": 0.5309493067199006, + "learning_rate": 9.726412899480342e-07, + "loss": 0.689, + "step": 40030 + }, + { + "epoch": 0.8600764703355244, + "grad_norm": 0.5187592677496787, + "learning_rate": 9.697109577137943e-07, + "loss": 0.6768, + "step": 40040 + }, + { + "epoch": 0.860291274648795, + "grad_norm": 0.5192073916886506, + "learning_rate": 9.667848213846386e-07, + "loss": 0.6891, + "step": 40050 + }, + { + "epoch": 0.8605060789620655, + "grad_norm": 0.5259783631972406, + 
"learning_rate": 9.638628823201956e-07, + "loss": 0.6895, + "step": 40060 + }, + { + "epoch": 0.8607208832753361, + "grad_norm": 0.5226106882651611, + "learning_rate": 9.609451418781302e-07, + "loss": 0.69, + "step": 40070 + }, + { + "epoch": 0.8609356875886067, + "grad_norm": 0.5071335472658586, + "learning_rate": 9.580316014141722e-07, + "loss": 0.6846, + "step": 40080 + }, + { + "epoch": 0.8611504919018774, + "grad_norm": 0.5119813837953315, + "learning_rate": 9.551222622820866e-07, + "loss": 0.695, + "step": 40090 + }, + { + "epoch": 0.861365296215148, + "grad_norm": 0.4918570321661607, + "learning_rate": 9.522171258336943e-07, + "loss": 0.6898, + "step": 40100 + }, + { + "epoch": 0.8615801005284186, + "grad_norm": 0.5038278185618428, + "learning_rate": 9.493161934188632e-07, + "loss": 0.6902, + "step": 40110 + }, + { + "epoch": 0.8617949048416892, + "grad_norm": 0.5131068608059737, + "learning_rate": 9.464194663854986e-07, + "loss": 0.687, + "step": 40120 + }, + { + "epoch": 0.8620097091549598, + "grad_norm": 0.5313015107754585, + "learning_rate": 9.435269460795671e-07, + "loss": 0.6973, + "step": 40130 + }, + { + "epoch": 0.8622245134682305, + "grad_norm": 0.509195051088408, + "learning_rate": 9.406386338450657e-07, + "loss": 0.6946, + "step": 40140 + }, + { + "epoch": 0.8624393177815011, + "grad_norm": 0.502073057996939, + "learning_rate": 9.377545310240454e-07, + "loss": 0.6909, + "step": 40150 + }, + { + "epoch": 0.8626541220947717, + "grad_norm": 0.5081753672118122, + "learning_rate": 9.34874638956601e-07, + "loss": 0.6946, + "step": 40160 + }, + { + "epoch": 0.8628689264080422, + "grad_norm": 0.5075208175361569, + "learning_rate": 9.31998958980862e-07, + "loss": 0.6846, + "step": 40170 + }, + { + "epoch": 0.8630837307213128, + "grad_norm": 0.5559833588569257, + "learning_rate": 9.291274924330141e-07, + "loss": 0.6993, + "step": 40180 + }, + { + "epoch": 0.8632985350345835, + "grad_norm": 0.5406798025815065, + "learning_rate": 9.262602406472732e-07, + "loss": 0.6948, + "step": 40190 + }, + { + "epoch": 0.8635133393478541, + "grad_norm": 0.5074440350796556, + "learning_rate": 9.233972049559037e-07, + "loss": 0.6845, + "step": 40200 + }, + { + "epoch": 0.8637281436611247, + "grad_norm": 0.5231140038044005, + "learning_rate": 9.205383866892092e-07, + "loss": 0.6887, + "step": 40210 + }, + { + "epoch": 0.8639429479743953, + "grad_norm": 0.5120556983369559, + "learning_rate": 9.176837871755351e-07, + "loss": 0.6885, + "step": 40220 + }, + { + "epoch": 0.8641577522876659, + "grad_norm": 0.5083353819396028, + "learning_rate": 9.148334077412646e-07, + "loss": 0.6913, + "step": 40230 + }, + { + "epoch": 0.8643725566009366, + "grad_norm": 0.5038742161832033, + "learning_rate": 9.119872497108162e-07, + "loss": 0.691, + "step": 40240 + }, + { + "epoch": 0.8645873609142072, + "grad_norm": 0.5099893292803395, + "learning_rate": 9.091453144066587e-07, + "loss": 0.6849, + "step": 40250 + }, + { + "epoch": 0.8648021652274778, + "grad_norm": 0.5172315105235323, + "learning_rate": 9.063076031492857e-07, + "loss": 0.6825, + "step": 40260 + }, + { + "epoch": 0.8650169695407484, + "grad_norm": 0.5187093289593909, + "learning_rate": 9.034741172572359e-07, + "loss": 0.6824, + "step": 40270 + }, + { + "epoch": 0.865231773854019, + "grad_norm": 0.522303012114229, + "learning_rate": 9.006448580470839e-07, + "loss": 0.6887, + "step": 40280 + }, + { + "epoch": 0.8654465781672896, + "grad_norm": 0.49894862369670223, + "learning_rate": 8.978198268334348e-07, + "loss": 0.6978, + "step": 40290 + }, + { + 
"epoch": 0.8656613824805602, + "grad_norm": 0.5002768042699983, + "learning_rate": 8.949990249289409e-07, + "loss": 0.7016, + "step": 40300 + }, + { + "epoch": 0.8658761867938308, + "grad_norm": 0.5333278616857066, + "learning_rate": 8.921824536442747e-07, + "loss": 0.7005, + "step": 40310 + }, + { + "epoch": 0.8660909911071014, + "grad_norm": 0.4947738923548671, + "learning_rate": 8.893701142881539e-07, + "loss": 0.6847, + "step": 40320 + }, + { + "epoch": 0.866305795420372, + "grad_norm": 0.5180236450171597, + "learning_rate": 8.865620081673243e-07, + "loss": 0.6955, + "step": 40330 + }, + { + "epoch": 0.8665205997336427, + "grad_norm": 0.5318690818922256, + "learning_rate": 8.837581365865688e-07, + "loss": 0.6963, + "step": 40340 + }, + { + "epoch": 0.8667354040469133, + "grad_norm": 0.5330479773777189, + "learning_rate": 8.809585008487009e-07, + "loss": 0.6853, + "step": 40350 + }, + { + "epoch": 0.8669502083601839, + "grad_norm": 0.516868926152685, + "learning_rate": 8.781631022545611e-07, + "loss": 0.6887, + "step": 40360 + }, + { + "epoch": 0.8671650126734545, + "grad_norm": 0.4970913477817239, + "learning_rate": 8.753719421030294e-07, + "loss": 0.6974, + "step": 40370 + }, + { + "epoch": 0.8673798169867251, + "grad_norm": 0.5426821783948291, + "learning_rate": 8.725850216910115e-07, + "loss": 0.6916, + "step": 40380 + }, + { + "epoch": 0.8675946212999956, + "grad_norm": 0.5320712513650713, + "learning_rate": 8.69802342313445e-07, + "loss": 0.6862, + "step": 40390 + }, + { + "epoch": 0.8678094256132663, + "grad_norm": 0.514150609745708, + "learning_rate": 8.670239052632945e-07, + "loss": 0.6902, + "step": 40400 + }, + { + "epoch": 0.8680242299265369, + "grad_norm": 0.5027008974103963, + "learning_rate": 8.642497118315584e-07, + "loss": 0.6743, + "step": 40410 + }, + { + "epoch": 0.8682390342398075, + "grad_norm": 0.49920507536090114, + "learning_rate": 8.614797633072547e-07, + "loss": 0.6853, + "step": 40420 + }, + { + "epoch": 0.8684538385530781, + "grad_norm": 0.5078214025245414, + "learning_rate": 8.58714060977438e-07, + "loss": 0.6927, + "step": 40430 + }, + { + "epoch": 0.8686686428663487, + "grad_norm": 0.5068985776833694, + "learning_rate": 8.55952606127185e-07, + "loss": 0.6877, + "step": 40440 + }, + { + "epoch": 0.8688834471796194, + "grad_norm": 0.5042210366606157, + "learning_rate": 8.531954000395992e-07, + "loss": 0.6797, + "step": 40450 + }, + { + "epoch": 0.86909825149289, + "grad_norm": 0.5041201672207695, + "learning_rate": 8.504424439958125e-07, + "loss": 0.6912, + "step": 40460 + }, + { + "epoch": 0.8693130558061606, + "grad_norm": 0.49939475577575226, + "learning_rate": 8.476937392749817e-07, + "loss": 0.689, + "step": 40470 + }, + { + "epoch": 0.8695278601194312, + "grad_norm": 0.508423114252119, + "learning_rate": 8.449492871542819e-07, + "loss": 0.6967, + "step": 40480 + }, + { + "epoch": 0.8697426644327018, + "grad_norm": 0.5049557993031742, + "learning_rate": 8.422090889089196e-07, + "loss": 0.6842, + "step": 40490 + }, + { + "epoch": 0.8699574687459725, + "grad_norm": 0.5210648597690205, + "learning_rate": 8.394731458121219e-07, + "loss": 0.6926, + "step": 40500 + }, + { + "epoch": 0.870172273059243, + "grad_norm": 0.5043799660787082, + "learning_rate": 8.367414591351408e-07, + "loss": 0.69, + "step": 40510 + }, + { + "epoch": 0.8703870773725136, + "grad_norm": 0.5317434333391947, + "learning_rate": 8.340140301472466e-07, + "loss": 0.6889, + "step": 40520 + }, + { + "epoch": 0.8706018816857842, + "grad_norm": 0.5400007003253303, + "learning_rate": 
8.312908601157355e-07, + "loss": 0.6937, + "step": 40530 + }, + { + "epoch": 0.8708166859990548, + "grad_norm": 0.5127402676218981, + "learning_rate": 8.285719503059209e-07, + "loss": 0.6916, + "step": 40540 + }, + { + "epoch": 0.8710314903123255, + "grad_norm": 0.5151193689571636, + "learning_rate": 8.258573019811389e-07, + "loss": 0.6951, + "step": 40550 + }, + { + "epoch": 0.8712462946255961, + "grad_norm": 0.5239133915563157, + "learning_rate": 8.231469164027461e-07, + "loss": 0.691, + "step": 40560 + }, + { + "epoch": 0.8714610989388667, + "grad_norm": 0.5147336548996377, + "learning_rate": 8.204407948301174e-07, + "loss": 0.6881, + "step": 40570 + }, + { + "epoch": 0.8716759032521373, + "grad_norm": 0.5191148599179711, + "learning_rate": 8.177389385206469e-07, + "loss": 0.6861, + "step": 40580 + }, + { + "epoch": 0.8718907075654079, + "grad_norm": 0.5159462930405265, + "learning_rate": 8.150413487297438e-07, + "loss": 0.6907, + "step": 40590 + }, + { + "epoch": 0.8721055118786786, + "grad_norm": 0.5512478977489063, + "learning_rate": 8.12348026710843e-07, + "loss": 0.6971, + "step": 40600 + }, + { + "epoch": 0.8723203161919492, + "grad_norm": 0.512581984887224, + "learning_rate": 8.096589737153859e-07, + "loss": 0.6738, + "step": 40610 + }, + { + "epoch": 0.8725351205052198, + "grad_norm": 0.5228312882242846, + "learning_rate": 8.069741909928374e-07, + "loss": 0.688, + "step": 40620 + }, + { + "epoch": 0.8727499248184903, + "grad_norm": 0.5006383946158728, + "learning_rate": 8.042936797906753e-07, + "loss": 0.687, + "step": 40630 + }, + { + "epoch": 0.8729647291317609, + "grad_norm": 0.5006156486251359, + "learning_rate": 8.016174413543953e-07, + "loss": 0.6831, + "step": 40640 + }, + { + "epoch": 0.8731795334450316, + "grad_norm": 0.5111940788173959, + "learning_rate": 7.989454769275073e-07, + "loss": 0.6885, + "step": 40650 + }, + { + "epoch": 0.8733943377583022, + "grad_norm": 0.5449360427418215, + "learning_rate": 7.962777877515293e-07, + "loss": 0.6927, + "step": 40660 + }, + { + "epoch": 0.8736091420715728, + "grad_norm": 0.5089460028641523, + "learning_rate": 7.936143750660008e-07, + "loss": 0.6742, + "step": 40670 + }, + { + "epoch": 0.8738239463848434, + "grad_norm": 0.5017956450345394, + "learning_rate": 7.909552401084697e-07, + "loss": 0.6715, + "step": 40680 + }, + { + "epoch": 0.874038750698114, + "grad_norm": 0.49880673793119984, + "learning_rate": 7.883003841144976e-07, + "loss": 0.679, + "step": 40690 + }, + { + "epoch": 0.8742535550113846, + "grad_norm": 0.4907335395967764, + "learning_rate": 7.856498083176612e-07, + "loss": 0.6785, + "step": 40700 + }, + { + "epoch": 0.8744683593246553, + "grad_norm": 0.5329308849961445, + "learning_rate": 7.830035139495384e-07, + "loss": 0.6933, + "step": 40710 + }, + { + "epoch": 0.8746831636379259, + "grad_norm": 0.5130223382623801, + "learning_rate": 7.803615022397315e-07, + "loss": 0.7, + "step": 40720 + }, + { + "epoch": 0.8748979679511965, + "grad_norm": 0.5211784970357225, + "learning_rate": 7.777237744158406e-07, + "loss": 0.7002, + "step": 40730 + }, + { + "epoch": 0.875112772264467, + "grad_norm": 0.5102775700430415, + "learning_rate": 7.750903317034831e-07, + "loss": 0.692, + "step": 40740 + }, + { + "epoch": 0.8753275765777376, + "grad_norm": 0.5042499830623806, + "learning_rate": 7.724611753262834e-07, + "loss": 0.6793, + "step": 40750 + }, + { + "epoch": 0.8755423808910083, + "grad_norm": 0.5160190803282572, + "learning_rate": 7.698363065058689e-07, + "loss": 0.6889, + "step": 40760 + }, + { + "epoch": 
0.8757571852042789, + "grad_norm": 0.50370584207469, + "learning_rate": 7.672157264618852e-07, + "loss": 0.6773, + "step": 40770 + }, + { + "epoch": 0.8759719895175495, + "grad_norm": 0.5051104461571446, + "learning_rate": 7.645994364119758e-07, + "loss": 0.6992, + "step": 40780 + }, + { + "epoch": 0.8761867938308201, + "grad_norm": 0.4946566053954112, + "learning_rate": 7.61987437571795e-07, + "loss": 0.6878, + "step": 40790 + }, + { + "epoch": 0.8764015981440907, + "grad_norm": 0.4996685140970515, + "learning_rate": 7.593797311550055e-07, + "loss": 0.6808, + "step": 40800 + }, + { + "epoch": 0.8766164024573614, + "grad_norm": 0.5172882831148481, + "learning_rate": 7.567763183732668e-07, + "loss": 0.6892, + "step": 40810 + }, + { + "epoch": 0.876831206770632, + "grad_norm": 0.5091750814973897, + "learning_rate": 7.541772004362557e-07, + "loss": 0.693, + "step": 40820 + }, + { + "epoch": 0.8770460110839026, + "grad_norm": 0.500703308186769, + "learning_rate": 7.515823785516418e-07, + "loss": 0.6884, + "step": 40830 + }, + { + "epoch": 0.8772608153971732, + "grad_norm": 0.5224680213539507, + "learning_rate": 7.489918539251085e-07, + "loss": 0.7, + "step": 40840 + }, + { + "epoch": 0.8774756197104437, + "grad_norm": 0.5058732772246898, + "learning_rate": 7.464056277603326e-07, + "loss": 0.6924, + "step": 40850 + }, + { + "epoch": 0.8776904240237144, + "grad_norm": 0.5113746801292882, + "learning_rate": 7.438237012590033e-07, + "loss": 0.6887, + "step": 40860 + }, + { + "epoch": 0.877905228336985, + "grad_norm": 0.5305907079168781, + "learning_rate": 7.412460756208051e-07, + "loss": 0.6798, + "step": 40870 + }, + { + "epoch": 0.8781200326502556, + "grad_norm": 0.5126751045545114, + "learning_rate": 7.386727520434245e-07, + "loss": 0.6885, + "step": 40880 + }, + { + "epoch": 0.8783348369635262, + "grad_norm": 0.5022215260758026, + "learning_rate": 7.361037317225561e-07, + "loss": 0.6854, + "step": 40890 + }, + { + "epoch": 0.8785496412767968, + "grad_norm": 0.5192119588923588, + "learning_rate": 7.335390158518852e-07, + "loss": 0.6853, + "step": 40900 + }, + { + "epoch": 0.8787644455900675, + "grad_norm": 0.49672397477009517, + "learning_rate": 7.309786056231039e-07, + "loss": 0.6874, + "step": 40910 + }, + { + "epoch": 0.8789792499033381, + "grad_norm": 0.5145358299532603, + "learning_rate": 7.284225022259028e-07, + "loss": 0.6935, + "step": 40920 + }, + { + "epoch": 0.8791940542166087, + "grad_norm": 0.5226202982810969, + "learning_rate": 7.258707068479642e-07, + "loss": 0.6899, + "step": 40930 + }, + { + "epoch": 0.8794088585298793, + "grad_norm": 0.513542215006283, + "learning_rate": 7.23323220674983e-07, + "loss": 0.6793, + "step": 40940 + }, + { + "epoch": 0.8796236628431499, + "grad_norm": 0.5225094478413966, + "learning_rate": 7.207800448906366e-07, + "loss": 0.7004, + "step": 40950 + }, + { + "epoch": 0.8798384671564206, + "grad_norm": 0.5150496074301979, + "learning_rate": 7.182411806766088e-07, + "loss": 0.6911, + "step": 40960 + }, + { + "epoch": 0.8800532714696911, + "grad_norm": 0.49652687262660034, + "learning_rate": 7.157066292125769e-07, + "loss": 0.6853, + "step": 40970 + }, + { + "epoch": 0.8802680757829617, + "grad_norm": 0.5241260101055815, + "learning_rate": 7.131763916762169e-07, + "loss": 0.6935, + "step": 40980 + }, + { + "epoch": 0.8804828800962323, + "grad_norm": 0.48817871163918836, + "learning_rate": 7.106504692431981e-07, + "loss": 0.693, + "step": 40990 + }, + { + "epoch": 0.8806976844095029, + "grad_norm": 0.529292181472698, + "learning_rate": 
7.081288630871819e-07, + "loss": 0.7034, + "step": 41000 + }, + { + "epoch": 0.8809124887227735, + "grad_norm": 0.5059883454495374, + "learning_rate": 7.056115743798309e-07, + "loss": 0.6962, + "step": 41010 + }, + { + "epoch": 0.8811272930360442, + "grad_norm": 0.5470086285335591, + "learning_rate": 7.030986042907962e-07, + "loss": 0.6866, + "step": 41020 + }, + { + "epoch": 0.8813420973493148, + "grad_norm": 0.5236923421331561, + "learning_rate": 7.005899539877248e-07, + "loss": 0.6817, + "step": 41030 + }, + { + "epoch": 0.8815569016625854, + "grad_norm": 0.5089883929134034, + "learning_rate": 6.980856246362566e-07, + "loss": 0.6948, + "step": 41040 + }, + { + "epoch": 0.881771705975856, + "grad_norm": 0.5218195607179691, + "learning_rate": 6.955856174000208e-07, + "loss": 0.6975, + "step": 41050 + }, + { + "epoch": 0.8819865102891266, + "grad_norm": 0.5082317075268683, + "learning_rate": 6.93089933440645e-07, + "loss": 0.7135, + "step": 41060 + }, + { + "epoch": 0.8822013146023973, + "grad_norm": 0.5088831853959854, + "learning_rate": 6.905985739177379e-07, + "loss": 0.6887, + "step": 41070 + }, + { + "epoch": 0.8824161189156678, + "grad_norm": 0.5185177820509219, + "learning_rate": 6.881115399889083e-07, + "loss": 0.6974, + "step": 41080 + }, + { + "epoch": 0.8826309232289384, + "grad_norm": 0.5063149686563211, + "learning_rate": 6.856288328097505e-07, + "loss": 0.6998, + "step": 41090 + }, + { + "epoch": 0.882845727542209, + "grad_norm": 0.5250508854568222, + "learning_rate": 6.831504535338485e-07, + "loss": 0.6856, + "step": 41100 + }, + { + "epoch": 0.8830605318554796, + "grad_norm": 0.5124153836462256, + "learning_rate": 6.806764033127778e-07, + "loss": 0.6836, + "step": 41110 + }, + { + "epoch": 0.8832753361687503, + "grad_norm": 0.5274948224156129, + "learning_rate": 6.782066832960987e-07, + "loss": 0.6924, + "step": 41120 + }, + { + "epoch": 0.8834901404820209, + "grad_norm": 0.6205306011716412, + "learning_rate": 6.757412946313613e-07, + "loss": 0.6819, + "step": 41130 + }, + { + "epoch": 0.8837049447952915, + "grad_norm": 0.5245818667913486, + "learning_rate": 6.732802384641057e-07, + "loss": 0.6844, + "step": 41140 + }, + { + "epoch": 0.8839197491085621, + "grad_norm": 0.5205364438282932, + "learning_rate": 6.708235159378551e-07, + "loss": 0.6859, + "step": 41150 + }, + { + "epoch": 0.8841345534218327, + "grad_norm": 0.5131908898170064, + "learning_rate": 6.683711281941196e-07, + "loss": 0.6902, + "step": 41160 + }, + { + "epoch": 0.8843493577351034, + "grad_norm": 0.5062297624465112, + "learning_rate": 6.659230763724001e-07, + "loss": 0.6906, + "step": 41170 + }, + { + "epoch": 0.884564162048374, + "grad_norm": 0.5026327253540595, + "learning_rate": 6.634793616101732e-07, + "loss": 0.6722, + "step": 41180 + }, + { + "epoch": 0.8847789663616445, + "grad_norm": 0.5009299214524836, + "learning_rate": 6.610399850429094e-07, + "loss": 0.6872, + "step": 41190 + }, + { + "epoch": 0.8849937706749151, + "grad_norm": 0.5238156380409812, + "learning_rate": 6.58604947804059e-07, + "loss": 0.6917, + "step": 41200 + }, + { + "epoch": 0.8852085749881857, + "grad_norm": 0.5083745372288995, + "learning_rate": 6.561742510250569e-07, + "loss": 0.6809, + "step": 41210 + }, + { + "epoch": 0.8854233793014564, + "grad_norm": 0.5383915042584236, + "learning_rate": 6.537478958353216e-07, + "loss": 0.6979, + "step": 41220 + }, + { + "epoch": 0.885638183614727, + "grad_norm": 0.5043226314095659, + "learning_rate": 6.513258833622537e-07, + "loss": 0.682, + "step": 41230 + }, + { + "epoch": 
0.8858529879279976, + "grad_norm": 0.5127868644616522, + "learning_rate": 6.489082147312387e-07, + "loss": 0.6821, + "step": 41240 + }, + { + "epoch": 0.8860677922412682, + "grad_norm": 0.4979888752333213, + "learning_rate": 6.464948910656377e-07, + "loss": 0.685, + "step": 41250 + }, + { + "epoch": 0.8862825965545388, + "grad_norm": 0.5128304605983001, + "learning_rate": 6.440859134867972e-07, + "loss": 0.6962, + "step": 41260 + }, + { + "epoch": 0.8864974008678094, + "grad_norm": 0.5051244175392152, + "learning_rate": 6.416812831140451e-07, + "loss": 0.6909, + "step": 41270 + }, + { + "epoch": 0.8867122051810801, + "grad_norm": 0.5101008415865377, + "learning_rate": 6.392810010646866e-07, + "loss": 0.6855, + "step": 41280 + }, + { + "epoch": 0.8869270094943507, + "grad_norm": 0.5202190940464628, + "learning_rate": 6.368850684540106e-07, + "loss": 0.7058, + "step": 41290 + }, + { + "epoch": 0.8871418138076212, + "grad_norm": 0.5186081016150472, + "learning_rate": 6.344934863952779e-07, + "loss": 0.6781, + "step": 41300 + }, + { + "epoch": 0.8873566181208918, + "grad_norm": 0.5329527297577857, + "learning_rate": 6.321062559997349e-07, + "loss": 0.7019, + "step": 41310 + }, + { + "epoch": 0.8875714224341624, + "grad_norm": 0.5217265024678235, + "learning_rate": 6.297233783766022e-07, + "loss": 0.6985, + "step": 41320 + }, + { + "epoch": 0.8877862267474331, + "grad_norm": 0.5194330367298682, + "learning_rate": 6.273448546330796e-07, + "loss": 0.6839, + "step": 41330 + }, + { + "epoch": 0.8880010310607037, + "grad_norm": 0.5097902257790524, + "learning_rate": 6.249706858743443e-07, + "loss": 0.6973, + "step": 41340 + }, + { + "epoch": 0.8882158353739743, + "grad_norm": 0.5142203693554276, + "learning_rate": 6.22600873203545e-07, + "loss": 0.6985, + "step": 41350 + }, + { + "epoch": 0.8884306396872449, + "grad_norm": 0.5090401566472215, + "learning_rate": 6.202354177218161e-07, + "loss": 0.6795, + "step": 41360 + }, + { + "epoch": 0.8886454440005155, + "grad_norm": 0.49677462148757334, + "learning_rate": 6.178743205282567e-07, + "loss": 0.6861, + "step": 41370 + }, + { + "epoch": 0.8888602483137862, + "grad_norm": 0.521382571115747, + "learning_rate": 6.155175827199467e-07, + "loss": 0.7016, + "step": 41380 + }, + { + "epoch": 0.8890750526270568, + "grad_norm": 0.5317933595388324, + "learning_rate": 6.131652053919424e-07, + "loss": 0.6907, + "step": 41390 + }, + { + "epoch": 0.8892898569403274, + "grad_norm": 0.5213449864141357, + "learning_rate": 6.10817189637265e-07, + "loss": 0.702, + "step": 41400 + }, + { + "epoch": 0.889504661253598, + "grad_norm": 0.5514505419123323, + "learning_rate": 6.084735365469229e-07, + "loss": 0.7022, + "step": 41410 + }, + { + "epoch": 0.8897194655668685, + "grad_norm": 0.5138208717231828, + "learning_rate": 6.061342472098819e-07, + "loss": 0.6742, + "step": 41420 + }, + { + "epoch": 0.8899342698801392, + "grad_norm": 0.5071996581331448, + "learning_rate": 6.037993227130945e-07, + "loss": 0.6922, + "step": 41430 + }, + { + "epoch": 0.8901490741934098, + "grad_norm": 0.5102097970519316, + "learning_rate": 6.014687641414751e-07, + "loss": 0.6929, + "step": 41440 + }, + { + "epoch": 0.8903638785066804, + "grad_norm": 0.5143324484042711, + "learning_rate": 5.991425725779132e-07, + "loss": 0.6975, + "step": 41450 + }, + { + "epoch": 0.890578682819951, + "grad_norm": 0.518475417925795, + "learning_rate": 5.968207491032707e-07, + "loss": 0.6917, + "step": 41460 + }, + { + "epoch": 0.8907934871332216, + "grad_norm": 0.4994682556931308, + "learning_rate": 
5.945032947963736e-07, + "loss": 0.6811, + "step": 41470 + }, + { + "epoch": 0.8910082914464923, + "grad_norm": 0.5070960760778904, + "learning_rate": 5.921902107340282e-07, + "loss": 0.7016, + "step": 41480 + }, + { + "epoch": 0.8912230957597629, + "grad_norm": 0.5191444966925488, + "learning_rate": 5.898814979909995e-07, + "loss": 0.7052, + "step": 41490 + }, + { + "epoch": 0.8914379000730335, + "grad_norm": 0.5035080497362481, + "learning_rate": 5.87577157640028e-07, + "loss": 0.6973, + "step": 41500 + }, + { + "epoch": 0.8916527043863041, + "grad_norm": 0.5333052134877714, + "learning_rate": 5.852771907518228e-07, + "loss": 0.7011, + "step": 41510 + }, + { + "epoch": 0.8918675086995747, + "grad_norm": 0.5155211109068989, + "learning_rate": 5.829815983950526e-07, + "loss": 0.693, + "step": 41520 + }, + { + "epoch": 0.8920823130128454, + "grad_norm": 0.5023502013771952, + "learning_rate": 5.806903816363663e-07, + "loss": 0.6906, + "step": 41530 + }, + { + "epoch": 0.8922971173261159, + "grad_norm": 0.5214356225380249, + "learning_rate": 5.784035415403688e-07, + "loss": 0.6883, + "step": 41540 + }, + { + "epoch": 0.8925119216393865, + "grad_norm": 0.5159695126204489, + "learning_rate": 5.761210791696381e-07, + "loss": 0.6897, + "step": 41550 + }, + { + "epoch": 0.8927267259526571, + "grad_norm": 0.5209933564528445, + "learning_rate": 5.738429955847147e-07, + "loss": 0.7004, + "step": 41560 + }, + { + "epoch": 0.8929415302659277, + "grad_norm": 0.498039038963185, + "learning_rate": 5.715692918441029e-07, + "loss": 0.683, + "step": 41570 + }, + { + "epoch": 0.8931563345791983, + "grad_norm": 0.5006203623139068, + "learning_rate": 5.692999690042799e-07, + "loss": 0.6945, + "step": 41580 + }, + { + "epoch": 0.893371138892469, + "grad_norm": 0.5233476464091876, + "learning_rate": 5.670350281196768e-07, + "loss": 0.6987, + "step": 41590 + }, + { + "epoch": 0.8935859432057396, + "grad_norm": 0.5136980303605799, + "learning_rate": 5.64774470242696e-07, + "loss": 0.7007, + "step": 41600 + }, + { + "epoch": 0.8938007475190102, + "grad_norm": 0.512143271533227, + "learning_rate": 5.625182964236997e-07, + "loss": 0.6913, + "step": 41610 + }, + { + "epoch": 0.8940155518322808, + "grad_norm": 0.512132113394993, + "learning_rate": 5.602665077110169e-07, + "loss": 0.6854, + "step": 41620 + }, + { + "epoch": 0.8942303561455514, + "grad_norm": 0.51811405156484, + "learning_rate": 5.580191051509354e-07, + "loss": 0.691, + "step": 41630 + }, + { + "epoch": 0.894445160458822, + "grad_norm": 0.5115508553053043, + "learning_rate": 5.55776089787703e-07, + "loss": 0.6748, + "step": 41640 + }, + { + "epoch": 0.8946599647720926, + "grad_norm": 0.5241262214299848, + "learning_rate": 5.535374626635381e-07, + "loss": 0.6934, + "step": 41650 + }, + { + "epoch": 0.8948747690853632, + "grad_norm": 0.5244130502553322, + "learning_rate": 5.51303224818609e-07, + "loss": 0.6871, + "step": 41660 + }, + { + "epoch": 0.8950895733986338, + "grad_norm": 0.4994599674132126, + "learning_rate": 5.490733772910517e-07, + "loss": 0.6813, + "step": 41670 + }, + { + "epoch": 0.8953043777119044, + "grad_norm": 0.4986836428006602, + "learning_rate": 5.46847921116962e-07, + "loss": 0.6902, + "step": 41680 + }, + { + "epoch": 0.8955191820251751, + "grad_norm": 0.5152538290727082, + "learning_rate": 5.446268573303881e-07, + "loss": 0.6833, + "step": 41690 + }, + { + "epoch": 0.8957339863384457, + "grad_norm": 0.502386100555975, + "learning_rate": 5.424101869633491e-07, + "loss": 0.6927, + "step": 41700 + }, + { + "epoch": 0.8959487906517163, 
+ "grad_norm": 0.5074939313677448, + "learning_rate": 5.401979110458133e-07, + "loss": 0.6758, + "step": 41710 + }, + { + "epoch": 0.8961635949649869, + "grad_norm": 0.5058971850323379, + "learning_rate": 5.379900306057084e-07, + "loss": 0.6797, + "step": 41720 + }, + { + "epoch": 0.8963783992782575, + "grad_norm": 0.4926506337654991, + "learning_rate": 5.357865466689249e-07, + "loss": 0.6822, + "step": 41730 + }, + { + "epoch": 0.8965932035915282, + "grad_norm": 0.5202246837788422, + "learning_rate": 5.33587460259305e-07, + "loss": 0.686, + "step": 41740 + }, + { + "epoch": 0.8968080079047988, + "grad_norm": 0.5224471201252612, + "learning_rate": 5.313927723986511e-07, + "loss": 0.6988, + "step": 41750 + }, + { + "epoch": 0.8970228122180693, + "grad_norm": 0.499998013117945, + "learning_rate": 5.292024841067179e-07, + "loss": 0.6913, + "step": 41760 + }, + { + "epoch": 0.8972376165313399, + "grad_norm": 0.5068130297990905, + "learning_rate": 5.270165964012198e-07, + "loss": 0.6918, + "step": 41770 + }, + { + "epoch": 0.8974524208446105, + "grad_norm": 0.5178316720171696, + "learning_rate": 5.248351102978255e-07, + "loss": 0.7096, + "step": 41780 + }, + { + "epoch": 0.8976672251578812, + "grad_norm": 0.5351505572464849, + "learning_rate": 5.226580268101566e-07, + "loss": 0.6831, + "step": 41790 + }, + { + "epoch": 0.8978820294711518, + "grad_norm": 0.5113178684301397, + "learning_rate": 5.204853469497917e-07, + "loss": 0.6945, + "step": 41800 + }, + { + "epoch": 0.8980968337844224, + "grad_norm": 0.5047128008727567, + "learning_rate": 5.18317071726262e-07, + "loss": 0.688, + "step": 41810 + }, + { + "epoch": 0.898311638097693, + "grad_norm": 0.5100213323202718, + "learning_rate": 5.161532021470527e-07, + "loss": 0.6941, + "step": 41820 + }, + { + "epoch": 0.8985264424109636, + "grad_norm": 0.5412542192317169, + "learning_rate": 5.139937392175987e-07, + "loss": 0.6804, + "step": 41830 + }, + { + "epoch": 0.8987412467242343, + "grad_norm": 0.5102282656359679, + "learning_rate": 5.118386839412925e-07, + "loss": 0.6867, + "step": 41840 + }, + { + "epoch": 0.8989560510375049, + "grad_norm": 0.5108191464876962, + "learning_rate": 5.096880373194745e-07, + "loss": 0.694, + "step": 41850 + }, + { + "epoch": 0.8991708553507755, + "grad_norm": 0.5080216890454713, + "learning_rate": 5.075418003514388e-07, + "loss": 0.6807, + "step": 41860 + }, + { + "epoch": 0.899385659664046, + "grad_norm": 0.5088339519727141, + "learning_rate": 5.053999740344306e-07, + "loss": 0.6868, + "step": 41870 + }, + { + "epoch": 0.8996004639773166, + "grad_norm": 0.5152387132782632, + "learning_rate": 5.032625593636442e-07, + "loss": 0.7018, + "step": 41880 + }, + { + "epoch": 0.8998152682905872, + "grad_norm": 0.4901427047062954, + "learning_rate": 5.01129557332225e-07, + "loss": 0.6845, + "step": 41890 + }, + { + "epoch": 0.9000300726038579, + "grad_norm": 0.5129493119966259, + "learning_rate": 4.990009689312669e-07, + "loss": 0.6826, + "step": 41900 + }, + { + "epoch": 0.9002448769171285, + "grad_norm": 0.5223099648979949, + "learning_rate": 4.968767951498166e-07, + "loss": 0.7005, + "step": 41910 + }, + { + "epoch": 0.9004596812303991, + "grad_norm": 0.4891041542553347, + "learning_rate": 4.947570369748656e-07, + "loss": 0.688, + "step": 41920 + }, + { + "epoch": 0.9006744855436697, + "grad_norm": 0.49608949461071944, + "learning_rate": 4.92641695391356e-07, + "loss": 0.6922, + "step": 41930 + }, + { + "epoch": 0.9008892898569403, + "grad_norm": 0.521684682281621, + "learning_rate": 4.905307713821761e-07, + "loss": 
0.6964, + "step": 41940 + }, + { + "epoch": 0.901104094170211, + "grad_norm": 0.5052599021707938, + "learning_rate": 4.884242659281613e-07, + "loss": 0.6806, + "step": 41950 + }, + { + "epoch": 0.9013188984834816, + "grad_norm": 0.5154672787022767, + "learning_rate": 4.863221800080964e-07, + "loss": 0.685, + "step": 41960 + }, + { + "epoch": 0.9015337027967522, + "grad_norm": 0.5384871380139898, + "learning_rate": 4.842245145987112e-07, + "loss": 0.6828, + "step": 41970 + }, + { + "epoch": 0.9017485071100227, + "grad_norm": 0.5399455434197028, + "learning_rate": 4.821312706746817e-07, + "loss": 0.6962, + "step": 41980 + }, + { + "epoch": 0.9019633114232933, + "grad_norm": 0.5155738707830432, + "learning_rate": 4.800424492086275e-07, + "loss": 0.683, + "step": 41990 + }, + { + "epoch": 0.902178115736564, + "grad_norm": 0.5046827454219124, + "learning_rate": 4.779580511711191e-07, + "loss": 0.6852, + "step": 42000 + }, + { + "epoch": 0.9023929200498346, + "grad_norm": 0.5236358307924249, + "learning_rate": 4.758780775306637e-07, + "loss": 0.6973, + "step": 42010 + }, + { + "epoch": 0.9026077243631052, + "grad_norm": 0.5308120199608228, + "learning_rate": 4.738025292537185e-07, + "loss": 0.6911, + "step": 42020 + }, + { + "epoch": 0.9028225286763758, + "grad_norm": 0.5135158557502522, + "learning_rate": 4.71731407304683e-07, + "loss": 0.6889, + "step": 42030 + }, + { + "epoch": 0.9030373329896464, + "grad_norm": 0.5192712739754789, + "learning_rate": 4.6966471264589865e-07, + "loss": 0.6966, + "step": 42040 + }, + { + "epoch": 0.9032521373029171, + "grad_norm": 0.5213999046870769, + "learning_rate": 4.6760244623765384e-07, + "loss": 0.7042, + "step": 42050 + }, + { + "epoch": 0.9034669416161877, + "grad_norm": 0.5105007301231993, + "learning_rate": 4.65544609038171e-07, + "loss": 0.6781, + "step": 42060 + }, + { + "epoch": 0.9036817459294583, + "grad_norm": 0.500621312368029, + "learning_rate": 4.634912020036242e-07, + "loss": 0.6749, + "step": 42070 + }, + { + "epoch": 0.9038965502427289, + "grad_norm": 0.505010805688616, + "learning_rate": 4.614422260881235e-07, + "loss": 0.6732, + "step": 42080 + }, + { + "epoch": 0.9041113545559994, + "grad_norm": 0.5342937103145424, + "learning_rate": 4.593976822437207e-07, + "loss": 0.6844, + "step": 42090 + }, + { + "epoch": 0.9043261588692701, + "grad_norm": 0.5183262843971005, + "learning_rate": 4.573575714204115e-07, + "loss": 0.6984, + "step": 42100 + }, + { + "epoch": 0.9045409631825407, + "grad_norm": 0.517561745524737, + "learning_rate": 4.5532189456612306e-07, + "loss": 0.6889, + "step": 42110 + }, + { + "epoch": 0.9047557674958113, + "grad_norm": 0.5026306781586204, + "learning_rate": 4.5329065262673666e-07, + "loss": 0.6823, + "step": 42120 + }, + { + "epoch": 0.9049705718090819, + "grad_norm": 0.5427704129364224, + "learning_rate": 4.512638465460584e-07, + "loss": 0.6906, + "step": 42130 + }, + { + "epoch": 0.9051853761223525, + "grad_norm": 0.5074939200787636, + "learning_rate": 4.4924147726584153e-07, + "loss": 0.672, + "step": 42140 + }, + { + "epoch": 0.9054001804356231, + "grad_norm": 0.504225946197821, + "learning_rate": 4.4722354572577785e-07, + "loss": 0.6911, + "step": 42150 + }, + { + "epoch": 0.9056149847488938, + "grad_norm": 0.5145785005958214, + "learning_rate": 4.452100528634906e-07, + "loss": 0.694, + "step": 42160 + }, + { + "epoch": 0.9058297890621644, + "grad_norm": 0.49881418371078307, + "learning_rate": 4.432009996145492e-07, + "loss": 0.6794, + "step": 42170 + }, + { + "epoch": 0.906044593375435, + "grad_norm": 
0.5176800894518867, + "learning_rate": 4.411963869124525e-07, + "loss": 0.6945, + "step": 42180 + }, + { + "epoch": 0.9062593976887056, + "grad_norm": 0.5090959735816222, + "learning_rate": 4.3919621568864313e-07, + "loss": 0.6875, + "step": 42190 + }, + { + "epoch": 0.9064742020019761, + "grad_norm": 0.5075300086152117, + "learning_rate": 4.372004868724944e-07, + "loss": 0.6857, + "step": 42200 + }, + { + "epoch": 0.9066890063152468, + "grad_norm": 0.5005344252080504, + "learning_rate": 4.3520920139131784e-07, + "loss": 0.6908, + "step": 42210 + }, + { + "epoch": 0.9069038106285174, + "grad_norm": 0.5092091177314088, + "learning_rate": 4.3322236017036114e-07, + "loss": 0.684, + "step": 42220 + }, + { + "epoch": 0.907118614941788, + "grad_norm": 0.5128571527964733, + "learning_rate": 4.3123996413280356e-07, + "loss": 0.6903, + "step": 42230 + }, + { + "epoch": 0.9073334192550586, + "grad_norm": 0.5236930918406155, + "learning_rate": 4.29262014199765e-07, + "loss": 0.6884, + "step": 42240 + }, + { + "epoch": 0.9075482235683292, + "grad_norm": 0.522031014128186, + "learning_rate": 4.272885112902925e-07, + "loss": 0.6862, + "step": 42250 + }, + { + "epoch": 0.9077630278815999, + "grad_norm": 0.534657132580671, + "learning_rate": 4.253194563213714e-07, + "loss": 0.6976, + "step": 42260 + }, + { + "epoch": 0.9079778321948705, + "grad_norm": 0.5270962563830953, + "learning_rate": 4.233548502079199e-07, + "loss": 0.6871, + "step": 42270 + }, + { + "epoch": 0.9081926365081411, + "grad_norm": 0.5397710000399429, + "learning_rate": 4.2139469386278445e-07, + "loss": 0.6787, + "step": 42280 + }, + { + "epoch": 0.9084074408214117, + "grad_norm": 0.5157952407298018, + "learning_rate": 4.194389881967531e-07, + "loss": 0.6929, + "step": 42290 + }, + { + "epoch": 0.9086222451346823, + "grad_norm": 0.5155262585551267, + "learning_rate": 4.174877341185368e-07, + "loss": 0.699, + "step": 42300 + }, + { + "epoch": 0.908837049447953, + "grad_norm": 0.6004351700870802, + "learning_rate": 4.155409325347826e-07, + "loss": 0.68, + "step": 42310 + }, + { + "epoch": 0.9090518537612235, + "grad_norm": 0.5179978734902895, + "learning_rate": 4.135985843500678e-07, + "loss": 0.6855, + "step": 42320 + }, + { + "epoch": 0.9092666580744941, + "grad_norm": 0.5172988314550917, + "learning_rate": 4.116606904668996e-07, + "loss": 0.6903, + "step": 42330 + }, + { + "epoch": 0.9094814623877647, + "grad_norm": 0.527441343917074, + "learning_rate": 4.097272517857187e-07, + "loss": 0.6835, + "step": 42340 + }, + { + "epoch": 0.9096962667010353, + "grad_norm": 0.5094088299463104, + "learning_rate": 4.07798269204891e-07, + "loss": 0.6745, + "step": 42350 + }, + { + "epoch": 0.909911071014306, + "grad_norm": 0.49824225660299976, + "learning_rate": 4.0587374362071495e-07, + "loss": 0.692, + "step": 42360 + }, + { + "epoch": 0.9101258753275766, + "grad_norm": 0.4958646778169003, + "learning_rate": 4.0395367592741876e-07, + "loss": 0.6927, + "step": 42370 + }, + { + "epoch": 0.9103406796408472, + "grad_norm": 0.5330942753471896, + "learning_rate": 4.020380670171553e-07, + "loss": 0.6929, + "step": 42380 + }, + { + "epoch": 0.9105554839541178, + "grad_norm": 0.5233184041925335, + "learning_rate": 4.001269177800116e-07, + "loss": 0.6965, + "step": 42390 + }, + { + "epoch": 0.9107702882673884, + "grad_norm": 0.5120728262031152, + "learning_rate": 3.9822022910399496e-07, + "loss": 0.6741, + "step": 42400 + }, + { + "epoch": 0.9109850925806591, + "grad_norm": 0.5064121099027354, + "learning_rate": 3.96318001875049e-07, + "loss": 0.6784, + 
"step": 42410 + }, + { + "epoch": 0.9111998968939297, + "grad_norm": 0.5602640114772643, + "learning_rate": 3.944202369770367e-07, + "loss": 0.6961, + "step": 42420 + }, + { + "epoch": 0.9114147012072003, + "grad_norm": 0.5432682414153873, + "learning_rate": 3.925269352917505e-07, + "loss": 0.6895, + "step": 42430 + }, + { + "epoch": 0.9116295055204708, + "grad_norm": 0.5159534104410033, + "learning_rate": 3.90638097698911e-07, + "loss": 0.6967, + "step": 42440 + }, + { + "epoch": 0.9118443098337414, + "grad_norm": 0.5421695386729521, + "learning_rate": 3.887537250761597e-07, + "loss": 0.6906, + "step": 42450 + }, + { + "epoch": 0.912059114147012, + "grad_norm": 0.5269412372870173, + "learning_rate": 3.8687381829906944e-07, + "loss": 0.6921, + "step": 42460 + }, + { + "epoch": 0.9122739184602827, + "grad_norm": 0.509562529599299, + "learning_rate": 3.849983782411337e-07, + "loss": 0.6885, + "step": 42470 + }, + { + "epoch": 0.9124887227735533, + "grad_norm": 0.510134688474772, + "learning_rate": 3.8312740577377214e-07, + "loss": 0.6815, + "step": 42480 + }, + { + "epoch": 0.9127035270868239, + "grad_norm": 0.512454202372446, + "learning_rate": 3.812609017663271e-07, + "loss": 0.7021, + "step": 42490 + }, + { + "epoch": 0.9129183314000945, + "grad_norm": 0.5069629345687713, + "learning_rate": 3.793988670860671e-07, + "loss": 0.6908, + "step": 42500 + }, + { + "epoch": 0.9131331357133651, + "grad_norm": 0.5145243709537642, + "learning_rate": 3.7754130259818334e-07, + "loss": 0.6761, + "step": 42510 + }, + { + "epoch": 0.9133479400266358, + "grad_norm": 0.5057114456413828, + "learning_rate": 3.7568820916578765e-07, + "loss": 0.7133, + "step": 42520 + }, + { + "epoch": 0.9135627443399064, + "grad_norm": 0.5027601053480496, + "learning_rate": 3.738395876499157e-07, + "loss": 0.68, + "step": 42530 + }, + { + "epoch": 0.913777548653177, + "grad_norm": 0.500024246376363, + "learning_rate": 3.7199543890952817e-07, + "loss": 0.6921, + "step": 42540 + }, + { + "epoch": 0.9139923529664475, + "grad_norm": 0.5082257613290262, + "learning_rate": 3.7015576380150187e-07, + "loss": 0.6812, + "step": 42550 + }, + { + "epoch": 0.9142071572797181, + "grad_norm": 0.530878498561435, + "learning_rate": 3.683205631806408e-07, + "loss": 0.6864, + "step": 42560 + }, + { + "epoch": 0.9144219615929888, + "grad_norm": 0.5061215824429818, + "learning_rate": 3.664898378996662e-07, + "loss": 0.6769, + "step": 42570 + }, + { + "epoch": 0.9146367659062594, + "grad_norm": 0.4972627962848076, + "learning_rate": 3.6466358880921984e-07, + "loss": 0.6796, + "step": 42580 + }, + { + "epoch": 0.91485157021953, + "grad_norm": 0.514903636534617, + "learning_rate": 3.628418167578651e-07, + "loss": 0.6843, + "step": 42590 + }, + { + "epoch": 0.9150663745328006, + "grad_norm": 0.5047790440228213, + "learning_rate": 3.610245225920839e-07, + "loss": 0.6826, + "step": 42600 + }, + { + "epoch": 0.9152811788460712, + "grad_norm": 0.5083554832988898, + "learning_rate": 3.5921170715627953e-07, + "loss": 0.6869, + "step": 42610 + }, + { + "epoch": 0.9154959831593419, + "grad_norm": 0.5133159115347935, + "learning_rate": 3.574033712927727e-07, + "loss": 0.6966, + "step": 42620 + }, + { + "epoch": 0.9157107874726125, + "grad_norm": 0.5164607440700887, + "learning_rate": 3.5559951584180355e-07, + "loss": 0.6882, + "step": 42630 + }, + { + "epoch": 0.9159255917858831, + "grad_norm": 0.513461482277684, + "learning_rate": 3.538001416415282e-07, + "loss": 0.6884, + "step": 42640 + }, + { + "epoch": 0.9161403960991537, + "grad_norm": 
0.5008580969233325, + "learning_rate": 3.520052495280213e-07, + "loss": 0.6853, + "step": 42650 + }, + { + "epoch": 0.9163552004124242, + "grad_norm": 0.5047493200035524, + "learning_rate": 3.50214840335279e-07, + "loss": 0.6987, + "step": 42660 + }, + { + "epoch": 0.9165700047256949, + "grad_norm": 0.5126226903956973, + "learning_rate": 3.4842891489520803e-07, + "loss": 0.6782, + "step": 42670 + }, + { + "epoch": 0.9167848090389655, + "grad_norm": 0.5250140948834445, + "learning_rate": 3.466474740376369e-07, + "loss": 0.6831, + "step": 42680 + }, + { + "epoch": 0.9169996133522361, + "grad_norm": 0.4970994749703241, + "learning_rate": 3.448705185903101e-07, + "loss": 0.6813, + "step": 42690 + }, + { + "epoch": 0.9172144176655067, + "grad_norm": 0.4957874759690821, + "learning_rate": 3.4309804937888155e-07, + "loss": 0.6812, + "step": 42700 + }, + { + "epoch": 0.9174292219787773, + "grad_norm": 0.5126567512309492, + "learning_rate": 3.413300672269282e-07, + "loss": 0.6904, + "step": 42710 + }, + { + "epoch": 0.917644026292048, + "grad_norm": 0.5269391777536964, + "learning_rate": 3.3956657295593944e-07, + "loss": 0.6799, + "step": 42720 + }, + { + "epoch": 0.9178588306053186, + "grad_norm": 0.5037049888031642, + "learning_rate": 3.378075673853187e-07, + "loss": 0.6747, + "step": 42730 + }, + { + "epoch": 0.9180736349185892, + "grad_norm": 0.5180923103541171, + "learning_rate": 3.360530513323845e-07, + "loss": 0.696, + "step": 42740 + }, + { + "epoch": 0.9182884392318598, + "grad_norm": 0.5377551337981332, + "learning_rate": 3.3430302561236806e-07, + "loss": 0.7026, + "step": 42750 + }, + { + "epoch": 0.9185032435451304, + "grad_norm": 0.5086131451212155, + "learning_rate": 3.325574910384177e-07, + "loss": 0.6867, + "step": 42760 + }, + { + "epoch": 0.9187180478584009, + "grad_norm": 0.5360041477265602, + "learning_rate": 3.3081644842158924e-07, + "loss": 0.6868, + "step": 42770 + }, + { + "epoch": 0.9189328521716716, + "grad_norm": 0.5259395774668065, + "learning_rate": 3.290798985708576e-07, + "loss": 0.6822, + "step": 42780 + }, + { + "epoch": 0.9191476564849422, + "grad_norm": 0.511030877333274, + "learning_rate": 3.2734784229310425e-07, + "loss": 0.6748, + "step": 42790 + }, + { + "epoch": 0.9193624607982128, + "grad_norm": 0.507095211823108, + "learning_rate": 3.2562028039312746e-07, + "loss": 0.6916, + "step": 42800 + }, + { + "epoch": 0.9195772651114834, + "grad_norm": 0.5108278970810879, + "learning_rate": 3.2389721367363623e-07, + "loss": 0.6905, + "step": 42810 + }, + { + "epoch": 0.919792069424754, + "grad_norm": 0.5063482123733604, + "learning_rate": 3.221786429352458e-07, + "loss": 0.6867, + "step": 42820 + }, + { + "epoch": 0.9200068737380247, + "grad_norm": 0.5321360439147953, + "learning_rate": 3.204645689764907e-07, + "loss": 0.6799, + "step": 42830 + }, + { + "epoch": 0.9202216780512953, + "grad_norm": 0.5010045520484869, + "learning_rate": 3.187549925938094e-07, + "loss": 0.6897, + "step": 42840 + }, + { + "epoch": 0.9204364823645659, + "grad_norm": 0.5146448011202501, + "learning_rate": 3.170499145815542e-07, + "loss": 0.6936, + "step": 42850 + }, + { + "epoch": 0.9206512866778365, + "grad_norm": 0.522131748562637, + "learning_rate": 3.15349335731987e-07, + "loss": 0.6758, + "step": 42860 + }, + { + "epoch": 0.9208660909911071, + "grad_norm": 0.5084382700595336, + "learning_rate": 3.136532568352746e-07, + "loss": 0.6872, + "step": 42870 + }, + { + "epoch": 0.9210808953043778, + "grad_norm": 0.5063314963517712, + "learning_rate": 3.11961678679501e-07, + "loss": 0.6763, + 
"step": 42880 + }, + { + "epoch": 0.9212956996176483, + "grad_norm": 0.529160941637022, + "learning_rate": 3.1027460205065194e-07, + "loss": 0.6898, + "step": 42890 + }, + { + "epoch": 0.9215105039309189, + "grad_norm": 0.5245818745748351, + "learning_rate": 3.085920277326249e-07, + "loss": 0.6639, + "step": 42900 + }, + { + "epoch": 0.9217253082441895, + "grad_norm": 0.5217386624549989, + "learning_rate": 3.0691395650722434e-07, + "loss": 0.6969, + "step": 42910 + }, + { + "epoch": 0.9219401125574601, + "grad_norm": 0.5129921994136167, + "learning_rate": 3.0524038915416e-07, + "loss": 0.6841, + "step": 42920 + }, + { + "epoch": 0.9221549168707308, + "grad_norm": 0.5021469780724663, + "learning_rate": 3.035713264510565e-07, + "loss": 0.6826, + "step": 42930 + }, + { + "epoch": 0.9223697211840014, + "grad_norm": 0.5116730541146018, + "learning_rate": 3.0190676917343673e-07, + "loss": 0.6923, + "step": 42940 + }, + { + "epoch": 0.922584525497272, + "grad_norm": 0.5041634183521139, + "learning_rate": 3.002467180947355e-07, + "loss": 0.6801, + "step": 42950 + }, + { + "epoch": 0.9227993298105426, + "grad_norm": 0.5071126517885879, + "learning_rate": 2.9859117398629236e-07, + "loss": 0.6707, + "step": 42960 + }, + { + "epoch": 0.9230141341238132, + "grad_norm": 0.5191938232678536, + "learning_rate": 2.969401376173486e-07, + "loss": 0.6887, + "step": 42970 + }, + { + "epoch": 0.9232289384370839, + "grad_norm": 0.51739812424603, + "learning_rate": 2.9529360975506074e-07, + "loss": 0.6799, + "step": 42980 + }, + { + "epoch": 0.9234437427503545, + "grad_norm": 0.5153545341424074, + "learning_rate": 2.9365159116447886e-07, + "loss": 0.6835, + "step": 42990 + }, + { + "epoch": 0.923658547063625, + "grad_norm": 0.5306207680932173, + "learning_rate": 2.920140826085682e-07, + "loss": 0.6986, + "step": 43000 + }, + { + "epoch": 0.9238733513768956, + "grad_norm": 0.5055030531585294, + "learning_rate": 2.9038108484819137e-07, + "loss": 0.6895, + "step": 43010 + }, + { + "epoch": 0.9240881556901662, + "grad_norm": 0.5063675711910615, + "learning_rate": 2.887525986421169e-07, + "loss": 0.6995, + "step": 43020 + }, + { + "epoch": 0.9243029600034369, + "grad_norm": 0.4996360884485676, + "learning_rate": 2.8712862474702063e-07, + "loss": 0.6906, + "step": 43030 + }, + { + "epoch": 0.9245177643167075, + "grad_norm": 0.5106698528538595, + "learning_rate": 2.855091639174734e-07, + "loss": 0.6989, + "step": 43040 + }, + { + "epoch": 0.9247325686299781, + "grad_norm": 0.5074236659920939, + "learning_rate": 2.8389421690595995e-07, + "loss": 0.6839, + "step": 43050 + }, + { + "epoch": 0.9249473729432487, + "grad_norm": 0.572337745383503, + "learning_rate": 2.8228378446285784e-07, + "loss": 0.6902, + "step": 43060 + }, + { + "epoch": 0.9251621772565193, + "grad_norm": 0.5068410597647415, + "learning_rate": 2.8067786733645296e-07, + "loss": 0.6867, + "step": 43070 + }, + { + "epoch": 0.9253769815697899, + "grad_norm": 0.5186456864886076, + "learning_rate": 2.7907646627293284e-07, + "loss": 0.6919, + "step": 43080 + }, + { + "epoch": 0.9255917858830606, + "grad_norm": 0.5126033061284958, + "learning_rate": 2.7747958201638113e-07, + "loss": 0.6965, + "step": 43090 + }, + { + "epoch": 0.9258065901963312, + "grad_norm": 0.5161766653294025, + "learning_rate": 2.75887215308791e-07, + "loss": 0.6901, + "step": 43100 + }, + { + "epoch": 0.9260213945096017, + "grad_norm": 0.5049885765836929, + "learning_rate": 2.7429936689004956e-07, + "loss": 0.6788, + "step": 43110 + }, + { + "epoch": 0.9262361988228723, + "grad_norm": 
0.5106995321298553, + "learning_rate": 2.7271603749794763e-07, + "loss": 0.6804, + "step": 43120 + }, + { + "epoch": 0.9264510031361429, + "grad_norm": 0.5080317309925427, + "learning_rate": 2.711372278681779e-07, + "loss": 0.6844, + "step": 43130 + }, + { + "epoch": 0.9266658074494136, + "grad_norm": 0.5164907399314109, + "learning_rate": 2.69562938734329e-07, + "loss": 0.6936, + "step": 43140 + }, + { + "epoch": 0.9268806117626842, + "grad_norm": 0.5205333655443777, + "learning_rate": 2.6799317082789267e-07, + "loss": 0.6772, + "step": 43150 + }, + { + "epoch": 0.9270954160759548, + "grad_norm": 0.5259790796204346, + "learning_rate": 2.664279248782564e-07, + "loss": 0.7015, + "step": 43160 + }, + { + "epoch": 0.9273102203892254, + "grad_norm": 0.48989809551529545, + "learning_rate": 2.648672016127096e-07, + "loss": 0.6818, + "step": 43170 + }, + { + "epoch": 0.927525024702496, + "grad_norm": 0.5186292029129187, + "learning_rate": 2.633110017564389e-07, + "loss": 0.6841, + "step": 43180 + }, + { + "epoch": 0.9277398290157667, + "grad_norm": 0.494677339162897, + "learning_rate": 2.6175932603253017e-07, + "loss": 0.6857, + "step": 43190 + }, + { + "epoch": 0.9279546333290373, + "grad_norm": 0.513254704122593, + "learning_rate": 2.6021217516196464e-07, + "loss": 0.6844, + "step": 43200 + }, + { + "epoch": 0.9281694376423079, + "grad_norm": 0.5097903856248501, + "learning_rate": 2.586695498636238e-07, + "loss": 0.6865, + "step": 43210 + }, + { + "epoch": 0.9283842419555784, + "grad_norm": 0.5158784848429144, + "learning_rate": 2.571314508542866e-07, + "loss": 0.6883, + "step": 43220 + }, + { + "epoch": 0.928599046268849, + "grad_norm": 0.49451018639662936, + "learning_rate": 2.5559787884862465e-07, + "loss": 0.6923, + "step": 43230 + }, + { + "epoch": 0.9288138505821197, + "grad_norm": 0.5206523113168011, + "learning_rate": 2.5406883455921015e-07, + "loss": 0.6845, + "step": 43240 + }, + { + "epoch": 0.9290286548953903, + "grad_norm": 0.5183662832534691, + "learning_rate": 2.5254431869651154e-07, + "loss": 0.6888, + "step": 43250 + }, + { + "epoch": 0.9292434592086609, + "grad_norm": 0.5012033057724504, + "learning_rate": 2.51024331968891e-07, + "loss": 0.6838, + "step": 43260 + }, + { + "epoch": 0.9294582635219315, + "grad_norm": 0.5271739265196879, + "learning_rate": 2.49508875082608e-07, + "loss": 0.6925, + "step": 43270 + }, + { + "epoch": 0.9296730678352021, + "grad_norm": 0.5196998730244764, + "learning_rate": 2.4799794874181496e-07, + "loss": 0.6826, + "step": 43280 + }, + { + "epoch": 0.9298878721484728, + "grad_norm": 0.5154557177789315, + "learning_rate": 2.4649155364856127e-07, + "loss": 0.6947, + "step": 43290 + }, + { + "epoch": 0.9301026764617434, + "grad_norm": 0.5158926177913861, + "learning_rate": 2.449896905027904e-07, + "loss": 0.6774, + "step": 43300 + }, + { + "epoch": 0.930317480775014, + "grad_norm": 0.5452488869938171, + "learning_rate": 2.434923600023409e-07, + "loss": 0.6895, + "step": 43310 + }, + { + "epoch": 0.9305322850882846, + "grad_norm": 0.5240815118076753, + "learning_rate": 2.419995628429428e-07, + "loss": 0.6814, + "step": 43320 + }, + { + "epoch": 0.9307470894015551, + "grad_norm": 0.5208459297385867, + "learning_rate": 2.405112997182224e-07, + "loss": 0.6986, + "step": 43330 + }, + { + "epoch": 0.9309618937148257, + "grad_norm": 0.5146723954727245, + "learning_rate": 2.3902757131969657e-07, + "loss": 0.6936, + "step": 43340 + }, + { + "epoch": 0.9311766980280964, + "grad_norm": 0.4988199872705836, + "learning_rate": 2.3754837833677823e-07, + "loss": 
0.6921, + "step": 43350 + }, + { + "epoch": 0.931391502341367, + "grad_norm": 0.5061737031935741, + "learning_rate": 2.3607372145677098e-07, + "loss": 0.6819, + "step": 43360 + }, + { + "epoch": 0.9316063066546376, + "grad_norm": 0.5118209164152153, + "learning_rate": 2.3460360136487004e-07, + "loss": 0.6938, + "step": 43370 + }, + { + "epoch": 0.9318211109679082, + "grad_norm": 0.5028715193115553, + "learning_rate": 2.3313801874416452e-07, + "loss": 0.6838, + "step": 43380 + }, + { + "epoch": 0.9320359152811788, + "grad_norm": 0.5037421436122391, + "learning_rate": 2.3167697427563418e-07, + "loss": 0.6695, + "step": 43390 + }, + { + "epoch": 0.9322507195944495, + "grad_norm": 0.5056308074418332, + "learning_rate": 2.302204686381515e-07, + "loss": 0.6915, + "step": 43400 + }, + { + "epoch": 0.9324655239077201, + "grad_norm": 0.4973980726367497, + "learning_rate": 2.2876850250847626e-07, + "loss": 0.6902, + "step": 43410 + }, + { + "epoch": 0.9326803282209907, + "grad_norm": 0.49416302287965536, + "learning_rate": 2.2732107656126435e-07, + "loss": 0.6778, + "step": 43420 + }, + { + "epoch": 0.9328951325342613, + "grad_norm": 0.5373794494908628, + "learning_rate": 2.2587819146905665e-07, + "loss": 0.6905, + "step": 43430 + }, + { + "epoch": 0.9331099368475319, + "grad_norm": 0.527952990743072, + "learning_rate": 2.2443984790228802e-07, + "loss": 0.6798, + "step": 43440 + }, + { + "epoch": 0.9333247411608026, + "grad_norm": 0.5510638102933684, + "learning_rate": 2.230060465292827e-07, + "loss": 0.7014, + "step": 43450 + }, + { + "epoch": 0.9335395454740731, + "grad_norm": 0.534593564488736, + "learning_rate": 2.2157678801625115e-07, + "loss": 0.6812, + "step": 43460 + }, + { + "epoch": 0.9337543497873437, + "grad_norm": 0.51290579083354, + "learning_rate": 2.201520730272966e-07, + "loss": 0.6848, + "step": 43470 + }, + { + "epoch": 0.9339691541006143, + "grad_norm": 0.498976254097296, + "learning_rate": 2.1873190222441054e-07, + "loss": 0.7, + "step": 43480 + }, + { + "epoch": 0.9341839584138849, + "grad_norm": 0.5110569428279691, + "learning_rate": 2.1731627626747186e-07, + "loss": 0.691, + "step": 43490 + }, + { + "epoch": 0.9343987627271556, + "grad_norm": 0.5108780076816333, + "learning_rate": 2.1590519581424885e-07, + "loss": 0.6902, + "step": 43500 + }, + { + "epoch": 0.9346135670404262, + "grad_norm": 0.5104285936640807, + "learning_rate": 2.144986615203959e-07, + "loss": 0.6925, + "step": 43510 + }, + { + "epoch": 0.9348283713536968, + "grad_norm": 0.5096080388433524, + "learning_rate": 2.13096674039458e-07, + "loss": 0.6992, + "step": 43520 + }, + { + "epoch": 0.9350431756669674, + "grad_norm": 0.5402406252479437, + "learning_rate": 2.1169923402286298e-07, + "loss": 0.6953, + "step": 43530 + }, + { + "epoch": 0.935257979980238, + "grad_norm": 0.5096415078767643, + "learning_rate": 2.1030634211993028e-07, + "loss": 0.6897, + "step": 43540 + }, + { + "epoch": 0.9354727842935087, + "grad_norm": 0.5172382489410695, + "learning_rate": 2.089179989778656e-07, + "loss": 0.6967, + "step": 43550 + }, + { + "epoch": 0.9356875886067793, + "grad_norm": 0.507182558755288, + "learning_rate": 2.0753420524175616e-07, + "loss": 0.6952, + "step": 43560 + }, + { + "epoch": 0.9359023929200498, + "grad_norm": 0.525056213053443, + "learning_rate": 2.0615496155458214e-07, + "loss": 0.684, + "step": 43570 + }, + { + "epoch": 0.9361171972333204, + "grad_norm": 0.5190206893050133, + "learning_rate": 2.0478026855720313e-07, + "loss": 0.6954, + "step": 43580 + }, + { + "epoch": 0.936332001546591, + "grad_norm": 
0.5217080830284128, + "learning_rate": 2.0341012688837146e-07, + "loss": 0.6891, + "step": 43590 + }, + { + "epoch": 0.9365468058598617, + "grad_norm": 0.5373069241110895, + "learning_rate": 2.0204453718471795e-07, + "loss": 0.6683, + "step": 43600 + }, + { + "epoch": 0.9367616101731323, + "grad_norm": 0.512356091417784, + "learning_rate": 2.0068350008076055e-07, + "loss": 0.6877, + "step": 43610 + }, + { + "epoch": 0.9369764144864029, + "grad_norm": 0.5038542302769475, + "learning_rate": 1.9932701620890448e-07, + "loss": 0.6708, + "step": 43620 + }, + { + "epoch": 0.9371912187996735, + "grad_norm": 0.5079741863034887, + "learning_rate": 1.979750861994334e-07, + "loss": 0.6676, + "step": 43630 + }, + { + "epoch": 0.9374060231129441, + "grad_norm": 0.516342583535674, + "learning_rate": 1.9662771068052367e-07, + "loss": 0.6884, + "step": 43640 + }, + { + "epoch": 0.9376208274262147, + "grad_norm": 0.5147837537823089, + "learning_rate": 1.9528489027822784e-07, + "loss": 0.6785, + "step": 43650 + }, + { + "epoch": 0.9378356317394854, + "grad_norm": 0.5238895174683065, + "learning_rate": 1.9394662561648458e-07, + "loss": 0.6855, + "step": 43660 + }, + { + "epoch": 0.938050436052756, + "grad_norm": 0.5038680839138875, + "learning_rate": 1.926129173171165e-07, + "loss": 0.6833, + "step": 43670 + }, + { + "epoch": 0.9382652403660265, + "grad_norm": 0.510424924822099, + "learning_rate": 1.912837659998268e-07, + "loss": 0.6974, + "step": 43680 + }, + { + "epoch": 0.9384800446792971, + "grad_norm": 0.5039561381967255, + "learning_rate": 1.8995917228220473e-07, + "loss": 0.6938, + "step": 43690 + }, + { + "epoch": 0.9386948489925677, + "grad_norm": 0.5135618071345601, + "learning_rate": 1.8863913677972023e-07, + "loss": 0.6808, + "step": 43700 + }, + { + "epoch": 0.9389096533058384, + "grad_norm": 0.517271399611518, + "learning_rate": 1.8732366010572268e-07, + "loss": 0.673, + "step": 43710 + }, + { + "epoch": 0.939124457619109, + "grad_norm": 0.5125943059284018, + "learning_rate": 1.8601274287144865e-07, + "loss": 0.6988, + "step": 43720 + }, + { + "epoch": 0.9393392619323796, + "grad_norm": 0.5196803747985291, + "learning_rate": 1.8470638568601096e-07, + "loss": 0.6921, + "step": 43730 + }, + { + "epoch": 0.9395540662456502, + "grad_norm": 0.49370739380736406, + "learning_rate": 1.8340458915640623e-07, + "loss": 0.6917, + "step": 43740 + }, + { + "epoch": 0.9397688705589208, + "grad_norm": 0.5091442466929231, + "learning_rate": 1.8210735388751177e-07, + "loss": 0.684, + "step": 43750 + }, + { + "epoch": 0.9399836748721915, + "grad_norm": 0.5047518312454556, + "learning_rate": 1.8081468048208539e-07, + "loss": 0.6877, + "step": 43760 + }, + { + "epoch": 0.9401984791854621, + "grad_norm": 0.5206191020901989, + "learning_rate": 1.7952656954076443e-07, + "loss": 0.7039, + "step": 43770 + }, + { + "epoch": 0.9404132834987327, + "grad_norm": 0.5123513333596542, + "learning_rate": 1.7824302166206786e-07, + "loss": 0.683, + "step": 43780 + }, + { + "epoch": 0.9406280878120032, + "grad_norm": 0.5064144872086758, + "learning_rate": 1.7696403744239422e-07, + "loss": 0.682, + "step": 43790 + }, + { + "epoch": 0.9408428921252738, + "grad_norm": 0.5332117626196589, + "learning_rate": 1.7568961747601808e-07, + "loss": 0.6859, + "step": 43800 + }, + { + "epoch": 0.9410576964385445, + "grad_norm": 0.500584413978133, + "learning_rate": 1.7441976235509917e-07, + "loss": 0.6863, + "step": 43810 + }, + { + "epoch": 0.9412725007518151, + "grad_norm": 0.4941849890778416, + "learning_rate": 1.731544726696699e-07, + "loss": 
0.7026, + "step": 43820 + }, + { + "epoch": 0.9414873050650857, + "grad_norm": 0.5196772492638178, + "learning_rate": 1.7189374900764776e-07, + "loss": 0.6894, + "step": 43830 + }, + { + "epoch": 0.9417021093783563, + "grad_norm": 0.5164400053424427, + "learning_rate": 1.706375919548231e-07, + "loss": 0.6822, + "step": 43840 + }, + { + "epoch": 0.9419169136916269, + "grad_norm": 0.53777605471878, + "learning_rate": 1.6938600209486678e-07, + "loss": 0.6878, + "step": 43850 + }, + { + "epoch": 0.9421317180048976, + "grad_norm": 0.5217587662345642, + "learning_rate": 1.6813898000933025e-07, + "loss": 0.6864, + "step": 43860 + }, + { + "epoch": 0.9423465223181682, + "grad_norm": 0.5253001291239328, + "learning_rate": 1.6689652627763563e-07, + "loss": 0.6871, + "step": 43870 + }, + { + "epoch": 0.9425613266314388, + "grad_norm": 0.5145763066504357, + "learning_rate": 1.6565864147708888e-07, + "loss": 0.6927, + "step": 43880 + }, + { + "epoch": 0.9427761309447094, + "grad_norm": 0.5166068562999455, + "learning_rate": 1.6442532618286987e-07, + "loss": 0.6903, + "step": 43890 + }, + { + "epoch": 0.94299093525798, + "grad_norm": 0.5020503378401979, + "learning_rate": 1.631965809680369e-07, + "loss": 0.6952, + "step": 43900 + }, + { + "epoch": 0.9432057395712506, + "grad_norm": 0.5118081944837007, + "learning_rate": 1.619724064035233e-07, + "loss": 0.6976, + "step": 43910 + }, + { + "epoch": 0.9434205438845212, + "grad_norm": 0.4971652889934386, + "learning_rate": 1.6075280305813845e-07, + "loss": 0.6816, + "step": 43920 + }, + { + "epoch": 0.9436353481977918, + "grad_norm": 0.5322834743237593, + "learning_rate": 1.5953777149857018e-07, + "loss": 0.6843, + "step": 43930 + }, + { + "epoch": 0.9438501525110624, + "grad_norm": 0.5255977281637059, + "learning_rate": 1.5832731228937915e-07, + "loss": 0.6849, + "step": 43940 + }, + { + "epoch": 0.944064956824333, + "grad_norm": 0.5143738059522022, + "learning_rate": 1.5712142599300206e-07, + "loss": 0.6919, + "step": 43950 + }, + { + "epoch": 0.9442797611376036, + "grad_norm": 0.5106099594288762, + "learning_rate": 1.5592011316975297e-07, + "loss": 0.6896, + "step": 43960 + }, + { + "epoch": 0.9444945654508743, + "grad_norm": 0.4995085706778138, + "learning_rate": 1.5472337437781982e-07, + "loss": 0.6837, + "step": 43970 + }, + { + "epoch": 0.9447093697641449, + "grad_norm": 0.5261391145616446, + "learning_rate": 1.5353121017326334e-07, + "loss": 0.674, + "step": 43980 + }, + { + "epoch": 0.9449241740774155, + "grad_norm": 0.5230909604182425, + "learning_rate": 1.5234362111002043e-07, + "loss": 0.6711, + "step": 43990 + }, + { + "epoch": 0.9451389783906861, + "grad_norm": 0.5062065384872058, + "learning_rate": 1.5116060773990304e-07, + "loss": 0.6824, + "step": 44000 + }, + { + "epoch": 0.9453537827039566, + "grad_norm": 0.533824670779333, + "learning_rate": 1.499821706125937e-07, + "loss": 0.6932, + "step": 44010 + }, + { + "epoch": 0.9455685870172273, + "grad_norm": 0.5288369825844738, + "learning_rate": 1.4880831027565213e-07, + "loss": 0.6815, + "step": 44020 + }, + { + "epoch": 0.9457833913304979, + "grad_norm": 0.5279056326291518, + "learning_rate": 1.4763902727451206e-07, + "loss": 0.6859, + "step": 44030 + }, + { + "epoch": 0.9459981956437685, + "grad_norm": 0.5141616425492499, + "learning_rate": 1.4647432215247448e-07, + "loss": 0.688, + "step": 44040 + }, + { + "epoch": 0.9462129999570391, + "grad_norm": 0.506499141279277, + "learning_rate": 1.4531419545071866e-07, + "loss": 0.6842, + "step": 44050 + }, + { + "epoch": 0.9464278042703097, + 
"grad_norm": 0.5137357443604906, + "learning_rate": 1.441586477082968e-07, + "loss": 0.7037, + "step": 44060 + }, + { + "epoch": 0.9466426085835804, + "grad_norm": 0.5179057297643055, + "learning_rate": 1.4300767946213046e-07, + "loss": 0.6836, + "step": 44070 + }, + { + "epoch": 0.946857412896851, + "grad_norm": 0.5062004724754988, + "learning_rate": 1.4186129124701408e-07, + "loss": 0.6934, + "step": 44080 + }, + { + "epoch": 0.9470722172101216, + "grad_norm": 0.516759003331539, + "learning_rate": 1.4071948359561605e-07, + "loss": 0.6876, + "step": 44090 + }, + { + "epoch": 0.9472870215233922, + "grad_norm": 0.5121867129971354, + "learning_rate": 1.3958225703847305e-07, + "loss": 0.6866, + "step": 44100 + }, + { + "epoch": 0.9475018258366628, + "grad_norm": 0.5029706915206402, + "learning_rate": 1.3844961210399576e-07, + "loss": 0.6829, + "step": 44110 + }, + { + "epoch": 0.9477166301499335, + "grad_norm": 0.5223591095442727, + "learning_rate": 1.3732154931846652e-07, + "loss": 0.6899, + "step": 44120 + }, + { + "epoch": 0.947931434463204, + "grad_norm": 0.5029921805410988, + "learning_rate": 1.3619806920603608e-07, + "loss": 0.6945, + "step": 44130 + }, + { + "epoch": 0.9481462387764746, + "grad_norm": 0.4862859723168768, + "learning_rate": 1.350791722887279e-07, + "loss": 0.6706, + "step": 44140 + }, + { + "epoch": 0.9483610430897452, + "grad_norm": 0.4934171630865501, + "learning_rate": 1.3396485908643398e-07, + "loss": 0.6844, + "step": 44150 + }, + { + "epoch": 0.9485758474030158, + "grad_norm": 0.4991882715586846, + "learning_rate": 1.3285513011691898e-07, + "loss": 0.6998, + "step": 44160 + }, + { + "epoch": 0.9487906517162865, + "grad_norm": 0.5065887293501903, + "learning_rate": 1.317499858958149e-07, + "loss": 0.6983, + "step": 44170 + }, + { + "epoch": 0.9490054560295571, + "grad_norm": 0.5217077228947482, + "learning_rate": 1.306494269366254e-07, + "loss": 0.6968, + "step": 44180 + }, + { + "epoch": 0.9492202603428277, + "grad_norm": 0.5284986091713506, + "learning_rate": 1.295534537507237e-07, + "loss": 0.6784, + "step": 44190 + }, + { + "epoch": 0.9494350646560983, + "grad_norm": 0.5007427227768886, + "learning_rate": 1.284620668473502e-07, + "loss": 0.6892, + "step": 44200 + }, + { + "epoch": 0.9496498689693689, + "grad_norm": 0.5364368649236175, + "learning_rate": 1.273752667336159e-07, + "loss": 0.7067, + "step": 44210 + }, + { + "epoch": 0.9498646732826395, + "grad_norm": 0.5149978301773179, + "learning_rate": 1.262930539145013e-07, + "loss": 0.6788, + "step": 44220 + }, + { + "epoch": 0.9500794775959102, + "grad_norm": 0.48810830703974617, + "learning_rate": 1.25215428892852e-07, + "loss": 0.6894, + "step": 44230 + }, + { + "epoch": 0.9502942819091807, + "grad_norm": 0.5094043254510362, + "learning_rate": 1.2414239216938628e-07, + "loss": 0.6734, + "step": 44240 + }, + { + "epoch": 0.9505090862224513, + "grad_norm": 0.5180377313222472, + "learning_rate": 1.2307394424268758e-07, + "loss": 0.6876, + "step": 44250 + }, + { + "epoch": 0.9507238905357219, + "grad_norm": 0.5003767010301355, + "learning_rate": 1.220100856092088e-07, + "loss": 0.6943, + "step": 44260 + }, + { + "epoch": 0.9509386948489925, + "grad_norm": 0.5036430920842442, + "learning_rate": 1.2095081676326669e-07, + "loss": 0.6908, + "step": 44270 + }, + { + "epoch": 0.9511534991622632, + "grad_norm": 0.5045971916714835, + "learning_rate": 1.1989613819705314e-07, + "loss": 0.6761, + "step": 44280 + }, + { + "epoch": 0.9513683034755338, + "grad_norm": 0.538898900267551, + "learning_rate": 
1.1884605040061947e-07, + "loss": 0.6907, + "step": 44290 + }, + { + "epoch": 0.9515831077888044, + "grad_norm": 0.5035666702349213, + "learning_rate": 1.178005538618865e-07, + "loss": 0.6878, + "step": 44300 + }, + { + "epoch": 0.951797912102075, + "grad_norm": 0.5149697763083855, + "learning_rate": 1.167596490666445e-07, + "loss": 0.6937, + "step": 44310 + }, + { + "epoch": 0.9520127164153456, + "grad_norm": 0.5107879059485395, + "learning_rate": 1.1572333649854328e-07, + "loss": 0.6783, + "step": 44320 + }, + { + "epoch": 0.9522275207286163, + "grad_norm": 0.5232337790057829, + "learning_rate": 1.1469161663910877e-07, + "loss": 0.6997, + "step": 44330 + }, + { + "epoch": 0.9524423250418869, + "grad_norm": 0.5034816506492367, + "learning_rate": 1.1366448996772195e-07, + "loss": 0.6856, + "step": 44340 + }, + { + "epoch": 0.9526571293551575, + "grad_norm": 0.524026297949951, + "learning_rate": 1.1264195696163993e-07, + "loss": 0.6832, + "step": 44350 + }, + { + "epoch": 0.952871933668428, + "grad_norm": 0.5103648732522968, + "learning_rate": 1.1162401809597822e-07, + "loss": 0.6933, + "step": 44360 + }, + { + "epoch": 0.9530867379816986, + "grad_norm": 0.5041210092671914, + "learning_rate": 1.106106738437196e-07, + "loss": 0.6835, + "step": 44370 + }, + { + "epoch": 0.9533015422949693, + "grad_norm": 0.505579894206394, + "learning_rate": 1.0960192467571407e-07, + "loss": 0.6848, + "step": 44380 + }, + { + "epoch": 0.9535163466082399, + "grad_norm": 0.48925078556562024, + "learning_rate": 1.0859777106067226e-07, + "loss": 0.6867, + "step": 44390 + }, + { + "epoch": 0.9537311509215105, + "grad_norm": 0.510694819304209, + "learning_rate": 1.0759821346517541e-07, + "loss": 0.6942, + "step": 44400 + }, + { + "epoch": 0.9539459552347811, + "grad_norm": 0.5020801402247028, + "learning_rate": 1.0660325235366309e-07, + "loss": 0.6752, + "step": 44410 + }, + { + "epoch": 0.9541607595480517, + "grad_norm": 0.514838466266744, + "learning_rate": 1.0561288818844217e-07, + "loss": 0.6812, + "step": 44420 + }, + { + "epoch": 0.9543755638613224, + "grad_norm": 0.5136075576538729, + "learning_rate": 1.0462712142968567e-07, + "loss": 0.6984, + "step": 44430 + }, + { + "epoch": 0.954590368174593, + "grad_norm": 0.5373229072892426, + "learning_rate": 1.0364595253542498e-07, + "loss": 0.6844, + "step": 44440 + }, + { + "epoch": 0.9548051724878636, + "grad_norm": 0.5177203696614198, + "learning_rate": 1.0266938196156096e-07, + "loss": 0.6842, + "step": 44450 + }, + { + "epoch": 0.9550199768011342, + "grad_norm": 0.5304130297606695, + "learning_rate": 1.0169741016185286e-07, + "loss": 0.698, + "step": 44460 + }, + { + "epoch": 0.9552347811144047, + "grad_norm": 0.521535821590921, + "learning_rate": 1.0073003758792721e-07, + "loss": 0.6791, + "step": 44470 + }, + { + "epoch": 0.9554495854276754, + "grad_norm": 0.5216014836223208, + "learning_rate": 9.976726468927112e-08, + "loss": 0.6875, + "step": 44480 + }, + { + "epoch": 0.955664389740946, + "grad_norm": 0.4958526215570128, + "learning_rate": 9.88090919132334e-08, + "loss": 0.6845, + "step": 44490 + }, + { + "epoch": 0.9558791940542166, + "grad_norm": 0.491068148568293, + "learning_rate": 9.785551970502904e-08, + "loss": 0.6901, + "step": 44500 + }, + { + "epoch": 0.9560939983674872, + "grad_norm": 0.5334092820305429, + "learning_rate": 9.690654850773251e-08, + "loss": 0.6981, + "step": 44510 + }, + { + "epoch": 0.9563088026807578, + "grad_norm": 0.5455356568529366, + "learning_rate": 9.596217876228109e-08, + "loss": 0.6899, + "step": 44520 + }, + { + "epoch": 
0.9565236069940284, + "grad_norm": 0.5233272097407236, + "learning_rate": 9.502241090747488e-08, + "loss": 0.6965, + "step": 44530 + }, + { + "epoch": 0.9567384113072991, + "grad_norm": 0.5221024613165127, + "learning_rate": 9.40872453799746e-08, + "loss": 0.6846, + "step": 44540 + }, + { + "epoch": 0.9569532156205697, + "grad_norm": 0.5392768910456224, + "learning_rate": 9.315668261430378e-08, + "loss": 0.6956, + "step": 44550 + }, + { + "epoch": 0.9571680199338403, + "grad_norm": 0.5040996409629596, + "learning_rate": 9.223072304284542e-08, + "loss": 0.6804, + "step": 44560 + }, + { + "epoch": 0.9573828242471109, + "grad_norm": 0.5214855537136885, + "learning_rate": 9.130936709584537e-08, + "loss": 0.7017, + "step": 44570 + }, + { + "epoch": 0.9575976285603814, + "grad_norm": 0.5117354205252992, + "learning_rate": 9.039261520141008e-08, + "loss": 0.6884, + "step": 44580 + }, + { + "epoch": 0.9578124328736521, + "grad_norm": 0.5130259202484456, + "learning_rate": 8.948046778550546e-08, + "loss": 0.6864, + "step": 44590 + }, + { + "epoch": 0.9580272371869227, + "grad_norm": 0.5153683100477022, + "learning_rate": 8.857292527195916e-08, + "loss": 0.6873, + "step": 44600 + }, + { + "epoch": 0.9582420415001933, + "grad_norm": 0.52407348167293, + "learning_rate": 8.76699880824583e-08, + "loss": 0.6873, + "step": 44610 + }, + { + "epoch": 0.9584568458134639, + "grad_norm": 0.5284487657831876, + "learning_rate": 8.677165663655396e-08, + "loss": 0.6678, + "step": 44620 + }, + { + "epoch": 0.9586716501267345, + "grad_norm": 0.5136375221731139, + "learning_rate": 8.587793135165001e-08, + "loss": 0.6918, + "step": 44630 + }, + { + "epoch": 0.9588864544400052, + "grad_norm": 0.5376228954573528, + "learning_rate": 8.49888126430154e-08, + "loss": 0.6918, + "step": 44640 + }, + { + "epoch": 0.9591012587532758, + "grad_norm": 0.5102793357805696, + "learning_rate": 8.410430092377853e-08, + "loss": 0.6878, + "step": 44650 + }, + { + "epoch": 0.9593160630665464, + "grad_norm": 0.5191445781226015, + "learning_rate": 8.322439660492398e-08, + "loss": 0.6882, + "step": 44660 + }, + { + "epoch": 0.959530867379817, + "grad_norm": 0.5030540214464223, + "learning_rate": 8.234910009529917e-08, + "loss": 0.6846, + "step": 44670 + }, + { + "epoch": 0.9597456716930876, + "grad_norm": 0.5221867031193459, + "learning_rate": 8.147841180160765e-08, + "loss": 0.6923, + "step": 44680 + }, + { + "epoch": 0.9599604760063583, + "grad_norm": 0.49462846118517095, + "learning_rate": 8.061233212841358e-08, + "loss": 0.6725, + "step": 44690 + }, + { + "epoch": 0.9601752803196288, + "grad_norm": 0.5156449438921737, + "learning_rate": 7.975086147813837e-08, + "loss": 0.6805, + "step": 44700 + }, + { + "epoch": 0.9603900846328994, + "grad_norm": 0.5045341788319067, + "learning_rate": 7.889400025106409e-08, + "loss": 0.6827, + "step": 44710 + }, + { + "epoch": 0.96060488894617, + "grad_norm": 0.5165527445661976, + "learning_rate": 7.804174884532778e-08, + "loss": 0.6875, + "step": 44720 + }, + { + "epoch": 0.9608196932594406, + "grad_norm": 0.5290209182103901, + "learning_rate": 7.719410765692825e-08, + "loss": 0.6818, + "step": 44730 + }, + { + "epoch": 0.9610344975727113, + "grad_norm": 0.5045124524630176, + "learning_rate": 7.635107707971712e-08, + "loss": 0.683, + "step": 44740 + }, + { + "epoch": 0.9612493018859819, + "grad_norm": 0.5110671930296831, + "learning_rate": 7.551265750540993e-08, + "loss": 0.6716, + "step": 44750 + }, + { + "epoch": 0.9614641061992525, + "grad_norm": 0.5184014503506628, + "learning_rate": 
7.467884932357505e-08, + "loss": 0.6953, + "step": 44760 + }, + { + "epoch": 0.9616789105125231, + "grad_norm": 0.5025771257461924, + "learning_rate": 7.384965292164037e-08, + "loss": 0.6875, + "step": 44770 + }, + { + "epoch": 0.9618937148257937, + "grad_norm": 0.5106235992412884, + "learning_rate": 7.30250686848888e-08, + "loss": 0.6924, + "step": 44780 + }, + { + "epoch": 0.9621085191390644, + "grad_norm": 0.5139107082255027, + "learning_rate": 7.220509699646383e-08, + "loss": 0.7049, + "step": 44790 + }, + { + "epoch": 0.962323323452335, + "grad_norm": 0.5025433150135805, + "learning_rate": 7.138973823736295e-08, + "loss": 0.6851, + "step": 44800 + }, + { + "epoch": 0.9625381277656055, + "grad_norm": 0.5025163153971417, + "learning_rate": 7.057899278643975e-08, + "loss": 0.6925, + "step": 44810 + }, + { + "epoch": 0.9627529320788761, + "grad_norm": 0.5101640172429046, + "learning_rate": 6.977286102040625e-08, + "loss": 0.6688, + "step": 44820 + }, + { + "epoch": 0.9629677363921467, + "grad_norm": 0.5162577234774317, + "learning_rate": 6.89713433138306e-08, + "loss": 0.683, + "step": 44830 + }, + { + "epoch": 0.9631825407054173, + "grad_norm": 0.5102954616732902, + "learning_rate": 6.817444003913487e-08, + "loss": 0.6883, + "step": 44840 + }, + { + "epoch": 0.963397345018688, + "grad_norm": 0.5156868683615887, + "learning_rate": 6.738215156659955e-08, + "loss": 0.671, + "step": 44850 + }, + { + "epoch": 0.9636121493319586, + "grad_norm": 0.514482173761437, + "learning_rate": 6.659447826436017e-08, + "loss": 0.6849, + "step": 44860 + }, + { + "epoch": 0.9638269536452292, + "grad_norm": 0.5251149204765465, + "learning_rate": 6.581142049840616e-08, + "loss": 0.6903, + "step": 44870 + }, + { + "epoch": 0.9640417579584998, + "grad_norm": 0.4910026180097444, + "learning_rate": 6.503297863258429e-08, + "loss": 0.6878, + "step": 44880 + }, + { + "epoch": 0.9642565622717704, + "grad_norm": 0.5436572691315237, + "learning_rate": 6.425915302859631e-08, + "loss": 0.6933, + "step": 44890 + }, + { + "epoch": 0.9644713665850411, + "grad_norm": 0.4946026585653437, + "learning_rate": 6.348994404599907e-08, + "loss": 0.6943, + "step": 44900 + }, + { + "epoch": 0.9646861708983117, + "grad_norm": 0.5030459362111663, + "learning_rate": 6.27253520422022e-08, + "loss": 0.6879, + "step": 44910 + }, + { + "epoch": 0.9649009752115822, + "grad_norm": 0.5093527545070409, + "learning_rate": 6.196537737247488e-08, + "loss": 0.6776, + "step": 44920 + }, + { + "epoch": 0.9651157795248528, + "grad_norm": 0.534202285869032, + "learning_rate": 6.12100203899335e-08, + "loss": 0.7175, + "step": 44930 + }, + { + "epoch": 0.9653305838381234, + "grad_norm": 0.5112645845888609, + "learning_rate": 6.045928144555736e-08, + "loss": 0.6764, + "step": 44940 + }, + { + "epoch": 0.9655453881513941, + "grad_norm": 0.5085395742615043, + "learning_rate": 5.97131608881718e-08, + "loss": 0.6778, + "step": 44950 + }, + { + "epoch": 0.9657601924646647, + "grad_norm": 0.5335040181019025, + "learning_rate": 5.8971659064464006e-08, + "loss": 0.6836, + "step": 44960 + }, + { + "epoch": 0.9659749967779353, + "grad_norm": 0.5132355793000835, + "learning_rate": 5.823477631896945e-08, + "loss": 0.6993, + "step": 44970 + }, + { + "epoch": 0.9661898010912059, + "grad_norm": 0.5136374270218453, + "learning_rate": 5.750251299407761e-08, + "loss": 0.6885, + "step": 44980 + }, + { + "epoch": 0.9664046054044765, + "grad_norm": 0.5109545044167216, + "learning_rate": 5.6774869430036296e-08, + "loss": 0.6972, + "step": 44990 + }, + { + "epoch": 
0.9666194097177472, + "grad_norm": 0.5188694431833389, + "learning_rate": 5.605184596494062e-08, + "loss": 0.6786, + "step": 45000 + }, + { + "epoch": 0.9668342140310178, + "grad_norm": 0.4958762948588162, + "learning_rate": 5.5333442934744074e-08, + "loss": 0.6846, + "step": 45010 + }, + { + "epoch": 0.9670490183442884, + "grad_norm": 0.5273914852077115, + "learning_rate": 5.461966067325075e-08, + "loss": 0.6889, + "step": 45020 + }, + { + "epoch": 0.967263822657559, + "grad_norm": 0.5090438225495914, + "learning_rate": 5.3910499512116465e-08, + "loss": 0.6946, + "step": 45030 + }, + { + "epoch": 0.9674786269708295, + "grad_norm": 0.5283538397180535, + "learning_rate": 5.32059597808543e-08, + "loss": 0.6821, + "step": 45040 + }, + { + "epoch": 0.9676934312841002, + "grad_norm": 0.5129695644458037, + "learning_rate": 5.2506041806824614e-08, + "loss": 0.6836, + "step": 45050 + }, + { + "epoch": 0.9679082355973708, + "grad_norm": 0.5357563822720558, + "learning_rate": 5.181074591524393e-08, + "loss": 0.6875, + "step": 45060 + }, + { + "epoch": 0.9681230399106414, + "grad_norm": 0.9871280435330488, + "learning_rate": 5.112007242918049e-08, + "loss": 0.6777, + "step": 45070 + }, + { + "epoch": 0.968337844223912, + "grad_norm": 0.5259865369064295, + "learning_rate": 5.043402166955314e-08, + "loss": 0.6878, + "step": 45080 + }, + { + "epoch": 0.9685526485371826, + "grad_norm": 0.49858883676118315, + "learning_rate": 4.9752593955134655e-08, + "loss": 0.6797, + "step": 45090 + }, + { + "epoch": 0.9687674528504533, + "grad_norm": 0.5177420266411118, + "learning_rate": 4.907578960254955e-08, + "loss": 0.696, + "step": 45100 + }, + { + "epoch": 0.9689822571637239, + "grad_norm": 0.5112656951608426, + "learning_rate": 4.840360892627183e-08, + "loss": 0.6812, + "step": 45110 + }, + { + "epoch": 0.9691970614769945, + "grad_norm": 0.5048791811705977, + "learning_rate": 4.773605223863054e-08, + "loss": 0.679, + "step": 45120 + }, + { + "epoch": 0.9694118657902651, + "grad_norm": 0.5184010359011654, + "learning_rate": 4.707311984980423e-08, + "loss": 0.694, + "step": 45130 + }, + { + "epoch": 0.9696266701035356, + "grad_norm": 0.526339150759906, + "learning_rate": 4.6414812067822056e-08, + "loss": 0.6889, + "step": 45140 + }, + { + "epoch": 0.9698414744168062, + "grad_norm": 0.5192950033137796, + "learning_rate": 4.576112919856601e-08, + "loss": 0.6996, + "step": 45150 + }, + { + "epoch": 0.9700562787300769, + "grad_norm": 0.5052304888836145, + "learning_rate": 4.51120715457698e-08, + "loss": 0.6903, + "step": 45160 + }, + { + "epoch": 0.9702710830433475, + "grad_norm": 0.5084386454674246, + "learning_rate": 4.44676394110144e-08, + "loss": 0.693, + "step": 45170 + }, + { + "epoch": 0.9704858873566181, + "grad_norm": 0.5136496837941966, + "learning_rate": 4.382783309373473e-08, + "loss": 0.6925, + "step": 45180 + }, + { + "epoch": 0.9707006916698887, + "grad_norm": 0.49863491668395693, + "learning_rate": 4.3192652891216326e-08, + "loss": 0.6773, + "step": 45190 + }, + { + "epoch": 0.9709154959831593, + "grad_norm": 0.5276766002981915, + "learning_rate": 4.25620990985931e-08, + "loss": 0.7053, + "step": 45200 + }, + { + "epoch": 0.97113030029643, + "grad_norm": 0.5299405568061919, + "learning_rate": 4.1936172008851806e-08, + "loss": 0.6931, + "step": 45210 + }, + { + "epoch": 0.9713451046097006, + "grad_norm": 0.5064992378132825, + "learning_rate": 4.131487191282757e-08, + "loss": 0.6806, + "step": 45220 + }, + { + "epoch": 0.9715599089229712, + "grad_norm": 0.5097429755248392, + "learning_rate": 
4.069819909920614e-08, + "loss": 0.6799, + "step": 45230 + }, + { + "epoch": 0.9717747132362418, + "grad_norm": 0.4985251877587769, + "learning_rate": 4.008615385452275e-08, + "loss": 0.6891, + "step": 45240 + }, + { + "epoch": 0.9719895175495123, + "grad_norm": 0.5189916494654802, + "learning_rate": 3.947873646316325e-08, + "loss": 0.6916, + "step": 45250 + }, + { + "epoch": 0.972204321862783, + "grad_norm": 0.532252242471192, + "learning_rate": 3.88759472073641e-08, + "loss": 0.6869, + "step": 45260 + }, + { + "epoch": 0.9724191261760536, + "grad_norm": 0.5058615192775043, + "learning_rate": 3.827778636720791e-08, + "loss": 0.6909, + "step": 45270 + }, + { + "epoch": 0.9726339304893242, + "grad_norm": 0.4988140384119279, + "learning_rate": 3.7684254220630156e-08, + "loss": 0.6735, + "step": 45280 + }, + { + "epoch": 0.9728487348025948, + "grad_norm": 0.5031240100440125, + "learning_rate": 3.7095351043414665e-08, + "loss": 0.6965, + "step": 45290 + }, + { + "epoch": 0.9730635391158654, + "grad_norm": 0.5051450638071671, + "learning_rate": 3.651107710919366e-08, + "loss": 0.6921, + "step": 45300 + }, + { + "epoch": 0.9732783434291361, + "grad_norm": 0.50854818588177, + "learning_rate": 3.593143268944888e-08, + "loss": 0.6766, + "step": 45310 + }, + { + "epoch": 0.9734931477424067, + "grad_norm": 0.5209101220326824, + "learning_rate": 3.535641805351042e-08, + "loss": 0.6828, + "step": 45320 + }, + { + "epoch": 0.9737079520556773, + "grad_norm": 0.5185530290222669, + "learning_rate": 3.478603346855791e-08, + "loss": 0.6793, + "step": 45330 + }, + { + "epoch": 0.9739227563689479, + "grad_norm": 0.4981837565896543, + "learning_rate": 3.4220279199619346e-08, + "loss": 0.6813, + "step": 45340 + }, + { + "epoch": 0.9741375606822185, + "grad_norm": 0.6029095886431749, + "learning_rate": 3.365915550957222e-08, + "loss": 0.6776, + "step": 45350 + }, + { + "epoch": 0.9743523649954892, + "grad_norm": 0.529974027801158, + "learning_rate": 3.3102662659140195e-08, + "loss": 0.6956, + "step": 45360 + }, + { + "epoch": 0.9745671693087598, + "grad_norm": 0.5098466744359518, + "learning_rate": 3.255080090689644e-08, + "loss": 0.6872, + "step": 45370 + }, + { + "epoch": 0.9747819736220303, + "grad_norm": 0.5178231192461265, + "learning_rate": 3.200357050926361e-08, + "loss": 0.6765, + "step": 45380 + }, + { + "epoch": 0.9749967779353009, + "grad_norm": 0.507719206604319, + "learning_rate": 3.1460971720510544e-08, + "loss": 0.691, + "step": 45390 + }, + { + "epoch": 0.9752115822485715, + "grad_norm": 0.5012348553384962, + "learning_rate": 3.0923004792754454e-08, + "loss": 0.6947, + "step": 45400 + }, + { + "epoch": 0.9754263865618421, + "grad_norm": 0.5012135896224041, + "learning_rate": 3.0389669975959825e-08, + "loss": 0.6933, + "step": 45410 + }, + { + "epoch": 0.9756411908751128, + "grad_norm": 0.5171752738246117, + "learning_rate": 2.9860967517941766e-08, + "loss": 0.6924, + "step": 45420 + }, + { + "epoch": 0.9758559951883834, + "grad_norm": 0.5209513564127771, + "learning_rate": 2.933689766435932e-08, + "loss": 0.6994, + "step": 45430 + }, + { + "epoch": 0.976070799501654, + "grad_norm": 0.51299608455161, + "learning_rate": 2.881746065871993e-08, + "loss": 0.6846, + "step": 45440 + }, + { + "epoch": 0.9762856038149246, + "grad_norm": 0.5426181859963906, + "learning_rate": 2.830265674237942e-08, + "loss": 0.6972, + "step": 45450 + }, + { + "epoch": 0.9765004081281952, + "grad_norm": 0.5176632310591313, + "learning_rate": 2.7792486154540888e-08, + "loss": 0.6917, + "step": 45460 + }, + { + "epoch": 
0.9767152124414659, + "grad_norm": 0.5069734550437351, + "learning_rate": 2.7286949132253605e-08, + "loss": 0.687, + "step": 45470 + }, + { + "epoch": 0.9769300167547365, + "grad_norm": 0.5134159936470577, + "learning_rate": 2.6786045910414117e-08, + "loss": 0.6829, + "step": 45480 + }, + { + "epoch": 0.977144821068007, + "grad_norm": 0.5314226916450205, + "learning_rate": 2.6289776721767356e-08, + "loss": 0.6875, + "step": 45490 + }, + { + "epoch": 0.9773596253812776, + "grad_norm": 0.5017572726059231, + "learning_rate": 2.57981417969011e-08, + "loss": 0.6891, + "step": 45500 + }, + { + "epoch": 0.9775744296945482, + "grad_norm": 0.5097143098042627, + "learning_rate": 2.5311141364254832e-08, + "loss": 0.6773, + "step": 45510 + }, + { + "epoch": 0.9777892340078189, + "grad_norm": 0.5094938311645063, + "learning_rate": 2.4828775650111993e-08, + "loss": 0.699, + "step": 45520 + }, + { + "epoch": 0.9780040383210895, + "grad_norm": 0.5041111422716359, + "learning_rate": 2.435104487860218e-08, + "loss": 0.689, + "step": 45530 + }, + { + "epoch": 0.9782188426343601, + "grad_norm": 0.49440540333290234, + "learning_rate": 2.3877949271702283e-08, + "loss": 0.6787, + "step": 45540 + }, + { + "epoch": 0.9784336469476307, + "grad_norm": 0.5075734688713911, + "learning_rate": 2.3409489049235347e-08, + "loss": 0.6876, + "step": 45550 + }, + { + "epoch": 0.9786484512609013, + "grad_norm": 0.4937175102422703, + "learning_rate": 2.2945664428870583e-08, + "loss": 0.6766, + "step": 45560 + }, + { + "epoch": 0.978863255574172, + "grad_norm": 0.5195853543250676, + "learning_rate": 2.2486475626122274e-08, + "loss": 0.6893, + "step": 45570 + }, + { + "epoch": 0.9790780598874426, + "grad_norm": 0.5066981294356002, + "learning_rate": 2.203192285435196e-08, + "loss": 0.6866, + "step": 45580 + }, + { + "epoch": 0.9792928642007132, + "grad_norm": 0.5106780154162585, + "learning_rate": 2.158200632476626e-08, + "loss": 0.6806, + "step": 45590 + }, + { + "epoch": 0.9795076685139837, + "grad_norm": 0.5216459413562595, + "learning_rate": 2.1136726246419048e-08, + "loss": 0.6838, + "step": 45600 + }, + { + "epoch": 0.9797224728272543, + "grad_norm": 0.5244173549621101, + "learning_rate": 2.0696082826209273e-08, + "loss": 0.6783, + "step": 45610 + }, + { + "epoch": 0.979937277140525, + "grad_norm": 0.508672858772139, + "learning_rate": 2.0260076268878716e-08, + "loss": 0.6991, + "step": 45620 + }, + { + "epoch": 0.9801520814537956, + "grad_norm": 0.5316971841084902, + "learning_rate": 1.9828706777017535e-08, + "loss": 0.6776, + "step": 45630 + }, + { + "epoch": 0.9803668857670662, + "grad_norm": 0.5278618608963893, + "learning_rate": 1.9401974551062075e-08, + "loss": 0.6895, + "step": 45640 + }, + { + "epoch": 0.9805816900803368, + "grad_norm": 0.5178835527189292, + "learning_rate": 1.89798797892915e-08, + "loss": 0.6877, + "step": 45650 + }, + { + "epoch": 0.9807964943936074, + "grad_norm": 0.5107328830474321, + "learning_rate": 1.856242268783226e-08, + "loss": 0.6806, + "step": 45660 + }, + { + "epoch": 0.9810112987068781, + "grad_norm": 0.5178035321000685, + "learning_rate": 1.814960344065364e-08, + "loss": 0.6884, + "step": 45670 + }, + { + "epoch": 0.9812261030201487, + "grad_norm": 0.5209369501476074, + "learning_rate": 1.7741422239572203e-08, + "loss": 0.7015, + "step": 45680 + }, + { + "epoch": 0.9814409073334193, + "grad_norm": 0.5145085242189749, + "learning_rate": 1.733787927424735e-08, + "loss": 0.692, + "step": 45690 + }, + { + "epoch": 0.9816557116466899, + "grad_norm": 0.5135454075520934, + "learning_rate": 
1.6938974732185754e-08, + "loss": 0.6877, + "step": 45700 + }, + { + "epoch": 0.9818705159599604, + "grad_norm": 0.5111760775067833, + "learning_rate": 1.6544708798736932e-08, + "loss": 0.6897, + "step": 45710 + }, + { + "epoch": 0.982085320273231, + "grad_norm": 0.5073586927391305, + "learning_rate": 1.615508165709545e-08, + "loss": 0.6936, + "step": 45720 + }, + { + "epoch": 0.9823001245865017, + "grad_norm": 0.5311468459249292, + "learning_rate": 1.577009348830205e-08, + "loss": 0.7014, + "step": 45730 + }, + { + "epoch": 0.9825149288997723, + "grad_norm": 0.5133173645780131, + "learning_rate": 1.5389744471238087e-08, + "loss": 0.6858, + "step": 45740 + }, + { + "epoch": 0.9827297332130429, + "grad_norm": 0.5177357908350075, + "learning_rate": 1.5014034782635523e-08, + "loss": 0.7051, + "step": 45750 + }, + { + "epoch": 0.9829445375263135, + "grad_norm": 0.5334859426374919, + "learning_rate": 1.4642964597064713e-08, + "loss": 0.691, + "step": 45760 + }, + { + "epoch": 0.9831593418395841, + "grad_norm": 0.5104405442693613, + "learning_rate": 1.4276534086943295e-08, + "loss": 0.6878, + "step": 45770 + }, + { + "epoch": 0.9833741461528548, + "grad_norm": 0.5062353640908156, + "learning_rate": 1.391474342253174e-08, + "loss": 0.6876, + "step": 45780 + }, + { + "epoch": 0.9835889504661254, + "grad_norm": 0.5032985620842442, + "learning_rate": 1.3557592771935579e-08, + "loss": 0.6852, + "step": 45790 + }, + { + "epoch": 0.983803754779396, + "grad_norm": 0.5095670609317529, + "learning_rate": 1.3205082301105399e-08, + "loss": 0.6862, + "step": 45800 + }, + { + "epoch": 0.9840185590926666, + "grad_norm": 0.5048150094794633, + "learning_rate": 1.2857212173833512e-08, + "loss": 0.6619, + "step": 45810 + }, + { + "epoch": 0.9842333634059371, + "grad_norm": 0.5367881117018972, + "learning_rate": 1.2513982551756176e-08, + "loss": 0.6991, + "step": 45820 + }, + { + "epoch": 0.9844481677192078, + "grad_norm": 0.5139161341851894, + "learning_rate": 1.2175393594355822e-08, + "loss": 0.6819, + "step": 45830 + }, + { + "epoch": 0.9846629720324784, + "grad_norm": 0.5099451512210587, + "learning_rate": 1.1841445458956601e-08, + "loss": 0.6815, + "step": 45840 + }, + { + "epoch": 0.984877776345749, + "grad_norm": 0.5153723004890068, + "learning_rate": 1.1512138300726615e-08, + "loss": 0.6928, + "step": 45850 + }, + { + "epoch": 0.9850925806590196, + "grad_norm": 0.5169126964230165, + "learning_rate": 1.1187472272677913e-08, + "loss": 0.6861, + "step": 45860 + }, + { + "epoch": 0.9853073849722902, + "grad_norm": 0.5111747109332822, + "learning_rate": 1.0867447525665376e-08, + "loss": 0.6846, + "step": 45870 + }, + { + "epoch": 0.9855221892855609, + "grad_norm": 0.5237477989382217, + "learning_rate": 1.0552064208388946e-08, + "loss": 0.6892, + "step": 45880 + }, + { + "epoch": 0.9857369935988315, + "grad_norm": 0.5252395338889467, + "learning_rate": 1.0241322467389181e-08, + "loss": 0.7028, + "step": 45890 + }, + { + "epoch": 0.9859517979121021, + "grad_norm": 0.5050689116075816, + "learning_rate": 9.935222447052805e-09, + "loss": 0.6945, + "step": 45900 + }, + { + "epoch": 0.9861666022253727, + "grad_norm": 0.5177719740515985, + "learning_rate": 9.633764289608272e-09, + "loss": 0.6848, + "step": 45910 + }, + { + "epoch": 0.9863814065386433, + "grad_norm": 0.5082442912081105, + "learning_rate": 9.336948135127976e-09, + "loss": 0.6884, + "step": 45920 + }, + { + "epoch": 0.986596210851914, + "grad_norm": 0.5106716496528746, + "learning_rate": 9.044774121526045e-09, + "loss": 0.6841, + "step": 45930 + }, + { + 
"epoch": 0.9868110151651845, + "grad_norm": 0.5203087673402492, + "learning_rate": 8.75724238456166e-09, + "loss": 0.683, + "step": 45940 + }, + { + "epoch": 0.9870258194784551, + "grad_norm": 0.5157554968748806, + "learning_rate": 8.474353057834616e-09, + "loss": 0.6789, + "step": 45950 + }, + { + "epoch": 0.9872406237917257, + "grad_norm": 0.5085732376060498, + "learning_rate": 8.196106272789772e-09, + "loss": 0.6825, + "step": 45960 + }, + { + "epoch": 0.9874554281049963, + "grad_norm": 0.5199228182558453, + "learning_rate": 7.922502158713708e-09, + "loss": 0.6964, + "step": 45970 + }, + { + "epoch": 0.987670232418267, + "grad_norm": 0.5170481222004207, + "learning_rate": 7.65354084273473e-09, + "loss": 0.6866, + "step": 45980 + }, + { + "epoch": 0.9878850367315376, + "grad_norm": 0.5057263382088416, + "learning_rate": 7.389222449827316e-09, + "loss": 0.6828, + "step": 45990 + }, + { + "epoch": 0.9880998410448082, + "grad_norm": 0.5189108042110895, + "learning_rate": 7.1295471028054455e-09, + "loss": 0.6798, + "step": 46000 + }, + { + "epoch": 0.9883146453580788, + "grad_norm": 0.517177846632246, + "learning_rate": 6.874514922325937e-09, + "loss": 0.6847, + "step": 46010 + }, + { + "epoch": 0.9885294496713494, + "grad_norm": 0.5334596524865981, + "learning_rate": 6.624126026890665e-09, + "loss": 0.7139, + "step": 46020 + }, + { + "epoch": 0.98874425398462, + "grad_norm": 0.5017669931684683, + "learning_rate": 6.378380532839901e-09, + "loss": 0.6784, + "step": 46030 + }, + { + "epoch": 0.9889590582978907, + "grad_norm": 0.5157157762964487, + "learning_rate": 6.137278554361192e-09, + "loss": 0.6891, + "step": 46040 + }, + { + "epoch": 0.9891738626111612, + "grad_norm": 0.5324764096551247, + "learning_rate": 5.900820203481594e-09, + "loss": 0.683, + "step": 46050 + }, + { + "epoch": 0.9893886669244318, + "grad_norm": 0.5040931566397142, + "learning_rate": 5.669005590069887e-09, + "loss": 0.6853, + "step": 46060 + }, + { + "epoch": 0.9896034712377024, + "grad_norm": 0.5144642042437939, + "learning_rate": 5.4418348218387985e-09, + "loss": 0.6855, + "step": 46070 + }, + { + "epoch": 0.989818275550973, + "grad_norm": 0.5086245276544579, + "learning_rate": 5.219308004343893e-09, + "loss": 0.6973, + "step": 46080 + }, + { + "epoch": 0.9900330798642437, + "grad_norm": 0.5106173051210142, + "learning_rate": 5.00142524097913e-09, + "loss": 0.6809, + "step": 46090 + }, + { + "epoch": 0.9902478841775143, + "grad_norm": 0.5044758804491363, + "learning_rate": 4.788186632985747e-09, + "loss": 0.6866, + "step": 46100 + }, + { + "epoch": 0.9904626884907849, + "grad_norm": 0.5011346072824133, + "learning_rate": 4.579592279444489e-09, + "loss": 0.6773, + "step": 46110 + }, + { + "epoch": 0.9906774928040555, + "grad_norm": 0.5104576978811137, + "learning_rate": 4.375642277276715e-09, + "loss": 0.6896, + "step": 46120 + }, + { + "epoch": 0.9908922971173261, + "grad_norm": 0.5061972545009272, + "learning_rate": 4.1763367212477316e-09, + "loss": 0.6916, + "step": 46130 + }, + { + "epoch": 0.9911071014305968, + "grad_norm": 0.5114242229614572, + "learning_rate": 3.981675703965681e-09, + "loss": 0.686, + "step": 46140 + }, + { + "epoch": 0.9913219057438674, + "grad_norm": 0.5034691443686983, + "learning_rate": 3.791659315878216e-09, + "loss": 0.6991, + "step": 46150 + }, + { + "epoch": 0.991536710057138, + "grad_norm": 0.5237369943432371, + "learning_rate": 3.606287645276929e-09, + "loss": 0.7013, + "step": 46160 + }, + { + "epoch": 0.9917515143704085, + "grad_norm": 0.5050943270330421, + "learning_rate": 
3.4255607782940348e-09, + "loss": 0.6686, + "step": 46170 + }, + { + "epoch": 0.9919663186836791, + "grad_norm": 0.5207022192351678, + "learning_rate": 3.2494787989034715e-09, + "loss": 0.6932, + "step": 46180 + }, + { + "epoch": 0.9921811229969498, + "grad_norm": 0.5210263524410033, + "learning_rate": 3.078041788922015e-09, + "loss": 0.6781, + "step": 46190 + }, + { + "epoch": 0.9923959273102204, + "grad_norm": 0.5144052422535814, + "learning_rate": 2.9112498280070565e-09, + "loss": 0.6888, + "step": 46200 + }, + { + "epoch": 0.992610731623491, + "grad_norm": 0.5123530950333133, + "learning_rate": 2.7491029936588252e-09, + "loss": 0.6871, + "step": 46210 + }, + { + "epoch": 0.9928255359367616, + "grad_norm": 0.5017887855233025, + "learning_rate": 2.5916013612170555e-09, + "loss": 0.6838, + "step": 46220 + }, + { + "epoch": 0.9930403402500322, + "grad_norm": 0.5059792773955643, + "learning_rate": 2.4387450038665385e-09, + "loss": 0.6769, + "step": 46230 + }, + { + "epoch": 0.9932551445633029, + "grad_norm": 0.5090551052637086, + "learning_rate": 2.2905339926293513e-09, + "loss": 0.6725, + "step": 46240 + }, + { + "epoch": 0.9934699488765735, + "grad_norm": 0.5044393490597461, + "learning_rate": 2.1469683963737386e-09, + "loss": 0.6938, + "step": 46250 + }, + { + "epoch": 0.9936847531898441, + "grad_norm": 0.5023690129044163, + "learning_rate": 2.008048281806341e-09, + "loss": 0.6813, + "step": 46260 + }, + { + "epoch": 0.9938995575031147, + "grad_norm": 0.5241055668792824, + "learning_rate": 1.8737737134744138e-09, + "loss": 0.6848, + "step": 46270 + }, + { + "epoch": 0.9941143618163852, + "grad_norm": 0.5136221543314958, + "learning_rate": 1.7441447537713817e-09, + "loss": 0.6737, + "step": 46280 + }, + { + "epoch": 0.9943291661296558, + "grad_norm": 0.510186886967427, + "learning_rate": 1.6191614629268437e-09, + "loss": 0.6827, + "step": 46290 + }, + { + "epoch": 0.9945439704429265, + "grad_norm": 0.5042541482836803, + "learning_rate": 1.4988238990143456e-09, + "loss": 0.6836, + "step": 46300 + }, + { + "epoch": 0.9947587747561971, + "grad_norm": 0.5162834914933954, + "learning_rate": 1.3831321179491596e-09, + "loss": 0.6976, + "step": 46310 + }, + { + "epoch": 0.9949735790694677, + "grad_norm": 0.5229889038934948, + "learning_rate": 1.2720861734871749e-09, + "loss": 0.698, + "step": 46320 + }, + { + "epoch": 0.9951883833827383, + "grad_norm": 0.5140297849222798, + "learning_rate": 1.1656861172248957e-09, + "loss": 0.6909, + "step": 46330 + }, + { + "epoch": 0.9954031876960089, + "grad_norm": 0.5211761716193927, + "learning_rate": 1.0639319986016639e-09, + "loss": 0.7059, + "step": 46340 + }, + { + "epoch": 0.9956179920092796, + "grad_norm": 0.5166206612511635, + "learning_rate": 9.66823864897437e-10, + "loss": 0.6799, + "step": 46350 + }, + { + "epoch": 0.9958327963225502, + "grad_norm": 0.5253510685541853, + "learning_rate": 8.74361761231679e-10, + "loss": 0.6795, + "step": 46360 + }, + { + "epoch": 0.9960476006358208, + "grad_norm": 0.5175002276710958, + "learning_rate": 7.865457305689106e-10, + "loss": 0.7023, + "step": 46370 + }, + { + "epoch": 0.9962624049490914, + "grad_norm": 0.5230288452217193, + "learning_rate": 7.033758137120483e-10, + "loss": 0.6874, + "step": 46380 + }, + { + "epoch": 0.9964772092623619, + "grad_norm": 0.5224400052978022, + "learning_rate": 6.24852049304625e-10, + "loss": 0.681, + "step": 46390 + }, + { + "epoch": 0.9966920135756326, + "grad_norm": 0.48862639002586716, + "learning_rate": 5.509744738341205e-10, + "loss": 0.6711, + "step": 46400 + }, + { + 
"epoch": 0.9969068178889032, + "grad_norm": 0.5192250692618456, + "learning_rate": 4.817431216264101e-10, + "loss": 0.6856, + "step": 46410 + }, + { + "epoch": 0.9971216222021738, + "grad_norm": 0.5154856759457314, + "learning_rate": 4.1715802485020604e-10, + "loss": 0.6921, + "step": 46420 + }, + { + "epoch": 0.9973364265154444, + "grad_norm": 0.5105045772126395, + "learning_rate": 3.5721921351483666e-10, + "loss": 0.6938, + "step": 46430 + }, + { + "epoch": 0.997551230828715, + "grad_norm": 0.508481663732828, + "learning_rate": 3.0192671547024655e-10, + "loss": 0.702, + "step": 46440 + }, + { + "epoch": 0.9977660351419857, + "grad_norm": 0.5050542413161292, + "learning_rate": 2.5128055640921687e-10, + "loss": 0.6901, + "step": 46450 + }, + { + "epoch": 0.9979808394552563, + "grad_norm": 0.5337828816495631, + "learning_rate": 2.0528075986292473e-10, + "loss": 0.69, + "step": 46460 + }, + { + "epoch": 0.9981956437685269, + "grad_norm": 0.49911609656939643, + "learning_rate": 1.6392734720649395e-10, + "loss": 0.6754, + "step": 46470 + }, + { + "epoch": 0.9984104480817975, + "grad_norm": 0.5450726921486923, + "learning_rate": 1.272203376534442e-10, + "loss": 0.6854, + "step": 46480 + }, + { + "epoch": 0.998625252395068, + "grad_norm": 0.5271282020213263, + "learning_rate": 9.515974826013186e-11, + "loss": 0.6966, + "step": 46490 + }, + { + "epoch": 0.9988400567083388, + "grad_norm": 0.5071464320894464, + "learning_rate": 6.774559392352942e-11, + "loss": 0.6808, + "step": 46500 + }, + { + "epoch": 0.9990548610216093, + "grad_norm": 0.5184516291604048, + "learning_rate": 4.497788738122566e-11, + "loss": 0.6894, + "step": 46510 + }, + { + "epoch": 0.9992696653348799, + "grad_norm": 0.5043819815091191, + "learning_rate": 2.6856639212535783e-11, + "loss": 0.6941, + "step": 46520 + }, + { + "epoch": 0.9994844696481505, + "grad_norm": 0.5554025858919428, + "learning_rate": 1.3381857838501433e-11, + "loss": 0.6999, + "step": 46530 + }, + { + "epoch": 0.9996992739614211, + "grad_norm": 0.5088148890810692, + "learning_rate": 4.553549517449795e-12, + "loss": 0.6761, + "step": 46540 + }, + { + "epoch": 0.9999140782746918, + "grad_norm": 0.5035234752449383, + "learning_rate": 3.7171835387539434e-13, + "loss": 0.69, + "step": 46550 + }, + { + "epoch": 1.0, + "eval_loss": 0.6645376086235046, + "eval_runtime": 16.1599, + "eval_samples_per_second": 44.245, + "eval_steps_per_second": 0.743, + "step": 46554 + }, + { + "epoch": 1.0, + "step": 46554, + "total_flos": 6.49330829599703e+16, + "train_loss": 0.6052397525345854, + "train_runtime": 238477.2919, + "train_samples_per_second": 12.494, + "train_steps_per_second": 0.195 + } + ], + "logging_steps": 10, + "max_steps": 46554, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 4000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.49330829599703e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}