diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14065 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.08267797093601252, + "eval_steps": 10000, + "global_step": 20001, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4.133691862207516e-06, + "grad_norm": 71.12261039322922, + "learning_rate": 8.264462809917355e-09, + "loss": 7.6413, + "step": 1 + }, + { + "epoch": 4.1336918622075155e-05, + "grad_norm": 67.75271488374239, + "learning_rate": 8.264462809917357e-08, + "loss": 7.6837, + "step": 10 + }, + { + "epoch": 8.267383724415031e-05, + "grad_norm": 68.70872130093693, + "learning_rate": 1.6528925619834713e-07, + "loss": 7.6852, + "step": 20 + }, + { + "epoch": 0.00012401075586622546, + "grad_norm": 62.98947446015298, + "learning_rate": 2.4793388429752067e-07, + "loss": 7.6376, + "step": 30 + }, + { + "epoch": 0.00016534767448830062, + "grad_norm": 54.98248131444704, + "learning_rate": 3.3057851239669426e-07, + "loss": 7.4517, + "step": 40 + }, + { + "epoch": 0.00020668459311037578, + "grad_norm": 42.7313362524158, + "learning_rate": 4.132231404958678e-07, + "loss": 7.1676, + "step": 50 + }, + { + "epoch": 0.0002480215117324509, + "grad_norm": 38.42250005234289, + "learning_rate": 4.958677685950413e-07, + "loss": 6.7673, + "step": 60 + }, + { + "epoch": 0.0002893584303545261, + "grad_norm": 26.17988841617355, + "learning_rate": 5.78512396694215e-07, + "loss": 6.1421, + "step": 70 + }, + { + "epoch": 0.00033069534897660124, + "grad_norm": 20.709699960482357, + "learning_rate": 6.611570247933885e-07, + "loss": 5.7812, + "step": 80 + }, + { + "epoch": 0.0003720322675986764, + "grad_norm": 18.83787744791924, + "learning_rate": 7.438016528925621e-07, + "loss": 5.4004, + "step": 90 + }, + { + "epoch": 0.00041336918622075157, + "grad_norm": 19.12503087164104, + "learning_rate": 8.264462809917356e-07, + "loss": 5.0367, + "step": 100 + }, + { + "epoch": 0.0004547061048428267, + "grad_norm": 14.34978763505406, + "learning_rate": 9.090909090909091e-07, + "loss": 4.7081, + "step": 110 + }, + { + "epoch": 0.0004960430234649018, + "grad_norm": 13.492660464865253, + "learning_rate": 9.917355371900827e-07, + "loss": 4.5718, + "step": 120 + }, + { + "epoch": 0.000537379942086977, + "grad_norm": 14.420928714044905, + "learning_rate": 1.0743801652892562e-06, + "loss": 4.4218, + "step": 130 + }, + { + "epoch": 0.0005787168607090522, + "grad_norm": 12.495448321891903, + "learning_rate": 1.15702479338843e-06, + "loss": 4.166, + "step": 140 + }, + { + "epoch": 0.0006200537793311273, + "grad_norm": 9.801331301009261, + "learning_rate": 1.2396694214876035e-06, + "loss": 4.0983, + "step": 150 + }, + { + "epoch": 0.0006613906979532025, + "grad_norm": 11.207849241885746, + "learning_rate": 1.322314049586777e-06, + "loss": 3.8569, + "step": 160 + }, + { + "epoch": 0.0007027276165752776, + "grad_norm": 9.679145248918651, + "learning_rate": 1.4049586776859506e-06, + "loss": 3.8265, + "step": 170 + }, + { + "epoch": 0.0007440645351973528, + "grad_norm": 11.74286989793758, + "learning_rate": 1.4876033057851241e-06, + "loss": 3.7614, + "step": 180 + }, + { + "epoch": 0.0007854014538194279, + "grad_norm": 12.757634031418831, + "learning_rate": 1.5702479338842977e-06, + "loss": 3.7239, + "step": 190 + }, + { + "epoch": 0.0008267383724415031, + "grad_norm": 10.441179098409538, + "learning_rate": 1.6528925619834712e-06, + "loss": 3.5734, + "step": 200 + }, + { + "epoch": 0.0008680752910635782, + "grad_norm": 9.812592859538713, + "learning_rate": 1.7355371900826448e-06, + "loss": 3.567, + "step": 210 + }, + { + "epoch": 0.0009094122096856533, + "grad_norm": 13.93844427924458, + "learning_rate": 1.8181818181818183e-06, + "loss": 3.4882, + "step": 220 + }, + { + "epoch": 0.0009507491283077286, + "grad_norm": 10.098507514325146, + "learning_rate": 1.900826446280992e-06, + "loss": 3.4818, + "step": 230 + }, + { + "epoch": 0.0009920860469298037, + "grad_norm": 9.58855528934084, + "learning_rate": 1.9834710743801654e-06, + "loss": 3.4219, + "step": 240 + }, + { + "epoch": 0.0010334229655518789, + "grad_norm": 10.682635147410519, + "learning_rate": 2.066115702479339e-06, + "loss": 3.4406, + "step": 250 + }, + { + "epoch": 0.001074759884173954, + "grad_norm": 8.223333896011393, + "learning_rate": 2.1487603305785124e-06, + "loss": 3.3402, + "step": 260 + }, + { + "epoch": 0.001116096802796029, + "grad_norm": 8.984512948693363, + "learning_rate": 2.231404958677686e-06, + "loss": 3.3059, + "step": 270 + }, + { + "epoch": 0.0011574337214181043, + "grad_norm": 9.341193028741104, + "learning_rate": 2.31404958677686e-06, + "loss": 3.1823, + "step": 280 + }, + { + "epoch": 0.0011987706400401795, + "grad_norm": 10.3056509420508, + "learning_rate": 2.3966942148760335e-06, + "loss": 3.2817, + "step": 290 + }, + { + "epoch": 0.0012401075586622545, + "grad_norm": 6.768688690699956, + "learning_rate": 2.479338842975207e-06, + "loss": 3.1747, + "step": 300 + }, + { + "epoch": 0.0012814444772843298, + "grad_norm": 6.809665393368199, + "learning_rate": 2.56198347107438e-06, + "loss": 3.1865, + "step": 310 + }, + { + "epoch": 0.001322781395906405, + "grad_norm": 6.996784607657393, + "learning_rate": 2.644628099173554e-06, + "loss": 3.1331, + "step": 320 + }, + { + "epoch": 0.0013641183145284802, + "grad_norm": 9.752811655395744, + "learning_rate": 2.7272727272727272e-06, + "loss": 3.1656, + "step": 330 + }, + { + "epoch": 0.0014054552331505552, + "grad_norm": 8.381106520516422, + "learning_rate": 2.809917355371901e-06, + "loss": 3.1045, + "step": 340 + }, + { + "epoch": 0.0014467921517726304, + "grad_norm": 9.529557732528744, + "learning_rate": 2.8925619834710743e-06, + "loss": 3.0388, + "step": 350 + }, + { + "epoch": 0.0014881290703947056, + "grad_norm": 7.141802881599084, + "learning_rate": 2.9752066115702483e-06, + "loss": 3.1119, + "step": 360 + }, + { + "epoch": 0.0015294659890167806, + "grad_norm": 8.784456968597924, + "learning_rate": 3.0578512396694214e-06, + "loss": 2.9281, + "step": 370 + }, + { + "epoch": 0.0015708029076388558, + "grad_norm": 7.19486645716449, + "learning_rate": 3.1404958677685953e-06, + "loss": 2.9817, + "step": 380 + }, + { + "epoch": 0.001612139826260931, + "grad_norm": 8.694417122337162, + "learning_rate": 3.2231404958677685e-06, + "loss": 3.0107, + "step": 390 + }, + { + "epoch": 0.0016534767448830063, + "grad_norm": 8.323745687899638, + "learning_rate": 3.3057851239669424e-06, + "loss": 3.0367, + "step": 400 + }, + { + "epoch": 0.0016948136635050813, + "grad_norm": 7.289283064256704, + "learning_rate": 3.388429752066116e-06, + "loss": 2.9071, + "step": 410 + }, + { + "epoch": 0.0017361505821271565, + "grad_norm": 8.436424184415285, + "learning_rate": 3.4710743801652895e-06, + "loss": 2.8761, + "step": 420 + }, + { + "epoch": 0.0017774875007492317, + "grad_norm": 8.284395953073583, + "learning_rate": 3.553719008264463e-06, + "loss": 2.8742, + "step": 430 + }, + { + "epoch": 0.0018188244193713067, + "grad_norm": 7.765622556565918, + "learning_rate": 3.6363636363636366e-06, + "loss": 2.9028, + "step": 440 + }, + { + "epoch": 0.001860161337993382, + "grad_norm": 8.06677494412333, + "learning_rate": 3.71900826446281e-06, + "loss": 2.885, + "step": 450 + }, + { + "epoch": 0.0019014982566154571, + "grad_norm": 6.847325183046608, + "learning_rate": 3.801652892561984e-06, + "loss": 2.8491, + "step": 460 + }, + { + "epoch": 0.0019428351752375323, + "grad_norm": 6.363354378607742, + "learning_rate": 3.884297520661157e-06, + "loss": 2.8626, + "step": 470 + }, + { + "epoch": 0.0019841720938596073, + "grad_norm": 6.752545926601506, + "learning_rate": 3.966942148760331e-06, + "loss": 2.8759, + "step": 480 + }, + { + "epoch": 0.0020255090124816828, + "grad_norm": 6.408269470963144, + "learning_rate": 4.049586776859504e-06, + "loss": 2.8132, + "step": 490 + }, + { + "epoch": 0.0020668459311037578, + "grad_norm": 10.174350278228932, + "learning_rate": 4.132231404958678e-06, + "loss": 2.7726, + "step": 500 + }, + { + "epoch": 0.0021081828497258328, + "grad_norm": 7.3668788002558045, + "learning_rate": 4.214876033057851e-06, + "loss": 2.781, + "step": 510 + }, + { + "epoch": 0.002149519768347908, + "grad_norm": 6.065779568736328, + "learning_rate": 4.297520661157025e-06, + "loss": 2.7566, + "step": 520 + }, + { + "epoch": 0.002190856686969983, + "grad_norm": 7.249820467027506, + "learning_rate": 4.3801652892561984e-06, + "loss": 2.7745, + "step": 530 + }, + { + "epoch": 0.002232193605592058, + "grad_norm": 6.6147066822580305, + "learning_rate": 4.462809917355372e-06, + "loss": 2.7287, + "step": 540 + }, + { + "epoch": 0.0022735305242141336, + "grad_norm": 6.730411628057589, + "learning_rate": 4.5454545454545455e-06, + "loss": 2.7263, + "step": 550 + }, + { + "epoch": 0.0023148674428362086, + "grad_norm": 5.90842778355055, + "learning_rate": 4.62809917355372e-06, + "loss": 2.7113, + "step": 560 + }, + { + "epoch": 0.0023562043614582836, + "grad_norm": 5.290690112865889, + "learning_rate": 4.710743801652893e-06, + "loss": 2.7407, + "step": 570 + }, + { + "epoch": 0.002397541280080359, + "grad_norm": 6.913494491726395, + "learning_rate": 4.793388429752067e-06, + "loss": 2.7089, + "step": 580 + }, + { + "epoch": 0.002438878198702434, + "grad_norm": 6.999126848562726, + "learning_rate": 4.87603305785124e-06, + "loss": 2.7074, + "step": 590 + }, + { + "epoch": 0.002480215117324509, + "grad_norm": 7.325247968940623, + "learning_rate": 4.958677685950414e-06, + "loss": 2.6561, + "step": 600 + }, + { + "epoch": 0.0025215520359465845, + "grad_norm": 5.841708656820878, + "learning_rate": 5.041322314049587e-06, + "loss": 2.5882, + "step": 610 + }, + { + "epoch": 0.0025628889545686595, + "grad_norm": 6.0353216317971725, + "learning_rate": 5.12396694214876e-06, + "loss": 2.6469, + "step": 620 + }, + { + "epoch": 0.002604225873190735, + "grad_norm": 7.544181254798358, + "learning_rate": 5.206611570247935e-06, + "loss": 2.6267, + "step": 630 + }, + { + "epoch": 0.00264556279181281, + "grad_norm": 6.608459353317291, + "learning_rate": 5.289256198347108e-06, + "loss": 2.5662, + "step": 640 + }, + { + "epoch": 0.002686899710434885, + "grad_norm": 6.839416904552874, + "learning_rate": 5.371900826446281e-06, + "loss": 2.6888, + "step": 650 + }, + { + "epoch": 0.0027282366290569604, + "grad_norm": 6.668329679339745, + "learning_rate": 5.4545454545454545e-06, + "loss": 2.5642, + "step": 660 + }, + { + "epoch": 0.0027695735476790354, + "grad_norm": 7.132958283503685, + "learning_rate": 5.537190082644629e-06, + "loss": 2.5651, + "step": 670 + }, + { + "epoch": 0.0028109104663011104, + "grad_norm": 6.277307177686086, + "learning_rate": 5.619834710743802e-06, + "loss": 2.5659, + "step": 680 + }, + { + "epoch": 0.002852247384923186, + "grad_norm": 6.128689798291957, + "learning_rate": 5.702479338842976e-06, + "loss": 2.5826, + "step": 690 + }, + { + "epoch": 0.002893584303545261, + "grad_norm": 6.5950769294424125, + "learning_rate": 5.785123966942149e-06, + "loss": 2.5845, + "step": 700 + }, + { + "epoch": 0.002934921222167336, + "grad_norm": 6.419190212095196, + "learning_rate": 5.867768595041323e-06, + "loss": 2.5336, + "step": 710 + }, + { + "epoch": 0.0029762581407894112, + "grad_norm": 8.4242870632546, + "learning_rate": 5.9504132231404965e-06, + "loss": 2.5085, + "step": 720 + }, + { + "epoch": 0.0030175950594114862, + "grad_norm": 7.690590337257814, + "learning_rate": 6.03305785123967e-06, + "loss": 2.6229, + "step": 730 + }, + { + "epoch": 0.0030589319780335612, + "grad_norm": 6.501607766316929, + "learning_rate": 6.115702479338843e-06, + "loss": 2.5214, + "step": 740 + }, + { + "epoch": 0.0031002688966556367, + "grad_norm": 6.318891494759645, + "learning_rate": 6.198347107438017e-06, + "loss": 2.5001, + "step": 750 + }, + { + "epoch": 0.0031416058152777117, + "grad_norm": 6.549087764929742, + "learning_rate": 6.280991735537191e-06, + "loss": 2.4692, + "step": 760 + }, + { + "epoch": 0.003182942733899787, + "grad_norm": 6.139353182718512, + "learning_rate": 6.363636363636364e-06, + "loss": 2.4862, + "step": 770 + }, + { + "epoch": 0.003224279652521862, + "grad_norm": 6.927572304151442, + "learning_rate": 6.446280991735537e-06, + "loss": 2.5114, + "step": 780 + }, + { + "epoch": 0.003265616571143937, + "grad_norm": 6.193127375797419, + "learning_rate": 6.528925619834712e-06, + "loss": 2.5163, + "step": 790 + }, + { + "epoch": 0.0033069534897660125, + "grad_norm": 6.624590490134376, + "learning_rate": 6.611570247933885e-06, + "loss": 2.4314, + "step": 800 + }, + { + "epoch": 0.0033482904083880875, + "grad_norm": 6.821779483779539, + "learning_rate": 6.694214876033058e-06, + "loss": 2.5099, + "step": 810 + }, + { + "epoch": 0.0033896273270101625, + "grad_norm": 7.545681159342933, + "learning_rate": 6.776859504132232e-06, + "loss": 2.431, + "step": 820 + }, + { + "epoch": 0.003430964245632238, + "grad_norm": 7.4575512413535785, + "learning_rate": 6.859504132231406e-06, + "loss": 2.5166, + "step": 830 + }, + { + "epoch": 0.003472301164254313, + "grad_norm": 5.330361729769768, + "learning_rate": 6.942148760330579e-06, + "loss": 2.4147, + "step": 840 + }, + { + "epoch": 0.003513638082876388, + "grad_norm": 9.642946599111673, + "learning_rate": 7.0247933884297525e-06, + "loss": 2.416, + "step": 850 + }, + { + "epoch": 0.0035549750014984634, + "grad_norm": 5.81243704909108, + "learning_rate": 7.107438016528926e-06, + "loss": 2.4712, + "step": 860 + }, + { + "epoch": 0.0035963119201205384, + "grad_norm": 8.181150357128717, + "learning_rate": 7.1900826446281005e-06, + "loss": 2.4275, + "step": 870 + }, + { + "epoch": 0.0036376488387426134, + "grad_norm": 6.783789830926592, + "learning_rate": 7.272727272727273e-06, + "loss": 2.4283, + "step": 880 + }, + { + "epoch": 0.003678985757364689, + "grad_norm": 6.537482232422147, + "learning_rate": 7.355371900826447e-06, + "loss": 2.4064, + "step": 890 + }, + { + "epoch": 0.003720322675986764, + "grad_norm": 5.51502652262759, + "learning_rate": 7.43801652892562e-06, + "loss": 2.3804, + "step": 900 + }, + { + "epoch": 0.0037616595946088393, + "grad_norm": 5.643663413025215, + "learning_rate": 7.520661157024795e-06, + "loss": 2.4225, + "step": 910 + }, + { + "epoch": 0.0038029965132309143, + "grad_norm": 5.698077553184173, + "learning_rate": 7.603305785123968e-06, + "loss": 2.3767, + "step": 920 + }, + { + "epoch": 0.0038443334318529893, + "grad_norm": 7.7844289388382695, + "learning_rate": 7.685950413223142e-06, + "loss": 2.3515, + "step": 930 + }, + { + "epoch": 0.0038856703504750647, + "grad_norm": 7.037093549799256, + "learning_rate": 7.768595041322314e-06, + "loss": 2.34, + "step": 940 + }, + { + "epoch": 0.00392700726909714, + "grad_norm": 6.080619495201754, + "learning_rate": 7.851239669421489e-06, + "loss": 2.3174, + "step": 950 + }, + { + "epoch": 0.003968344187719215, + "grad_norm": 5.880580728396556, + "learning_rate": 7.933884297520661e-06, + "loss": 2.3706, + "step": 960 + }, + { + "epoch": 0.00400968110634129, + "grad_norm": 5.661072675024262, + "learning_rate": 8.016528925619836e-06, + "loss": 2.3481, + "step": 970 + }, + { + "epoch": 0.0040510180249633656, + "grad_norm": 7.201611884034939, + "learning_rate": 8.099173553719009e-06, + "loss": 2.3667, + "step": 980 + }, + { + "epoch": 0.00409235494358544, + "grad_norm": 6.275874199218544, + "learning_rate": 8.181818181818183e-06, + "loss": 2.3169, + "step": 990 + }, + { + "epoch": 0.0041336918622075156, + "grad_norm": 5.267583094894874, + "learning_rate": 8.264462809917356e-06, + "loss": 2.3757, + "step": 1000 + }, + { + "epoch": 0.004175028780829591, + "grad_norm": 5.377757889968936, + "learning_rate": 8.34710743801653e-06, + "loss": 2.3631, + "step": 1010 + }, + { + "epoch": 0.0042163656994516656, + "grad_norm": 6.0201100161095225, + "learning_rate": 8.429752066115703e-06, + "loss": 2.2818, + "step": 1020 + }, + { + "epoch": 0.004257702618073741, + "grad_norm": 6.579057670565248, + "learning_rate": 8.512396694214877e-06, + "loss": 2.3313, + "step": 1030 + }, + { + "epoch": 0.004299039536695816, + "grad_norm": 7.27660719754988, + "learning_rate": 8.59504132231405e-06, + "loss": 2.3288, + "step": 1040 + }, + { + "epoch": 0.004340376455317891, + "grad_norm": 6.144262724026651, + "learning_rate": 8.677685950413224e-06, + "loss": 2.2981, + "step": 1050 + }, + { + "epoch": 0.004381713373939966, + "grad_norm": 5.672033241927713, + "learning_rate": 8.760330578512397e-06, + "loss": 2.3059, + "step": 1060 + }, + { + "epoch": 0.004423050292562042, + "grad_norm": 6.00241597585073, + "learning_rate": 8.842975206611571e-06, + "loss": 2.389, + "step": 1070 + }, + { + "epoch": 0.004464387211184116, + "grad_norm": 5.649027277167286, + "learning_rate": 8.925619834710744e-06, + "loss": 2.3371, + "step": 1080 + }, + { + "epoch": 0.004505724129806192, + "grad_norm": 5.927371810838215, + "learning_rate": 9.008264462809918e-06, + "loss": 2.3133, + "step": 1090 + }, + { + "epoch": 0.004547061048428267, + "grad_norm": 5.662885075294936, + "learning_rate": 9.090909090909091e-06, + "loss": 2.2779, + "step": 1100 + }, + { + "epoch": 0.004588397967050342, + "grad_norm": 5.654705250045512, + "learning_rate": 9.173553719008265e-06, + "loss": 2.2234, + "step": 1110 + }, + { + "epoch": 0.004629734885672417, + "grad_norm": 6.241923587474726, + "learning_rate": 9.25619834710744e-06, + "loss": 2.2626, + "step": 1120 + }, + { + "epoch": 0.004671071804294493, + "grad_norm": 5.741893435201511, + "learning_rate": 9.338842975206613e-06, + "loss": 2.3012, + "step": 1130 + }, + { + "epoch": 0.004712408722916567, + "grad_norm": 6.034507786920065, + "learning_rate": 9.421487603305785e-06, + "loss": 2.2682, + "step": 1140 + }, + { + "epoch": 0.004753745641538643, + "grad_norm": 7.410349347874194, + "learning_rate": 9.50413223140496e-06, + "loss": 2.2796, + "step": 1150 + }, + { + "epoch": 0.004795082560160718, + "grad_norm": 5.992102424922784, + "learning_rate": 9.586776859504134e-06, + "loss": 2.2112, + "step": 1160 + }, + { + "epoch": 0.004836419478782793, + "grad_norm": 5.453311998034154, + "learning_rate": 9.669421487603307e-06, + "loss": 2.1744, + "step": 1170 + }, + { + "epoch": 0.004877756397404868, + "grad_norm": 5.521988823358632, + "learning_rate": 9.75206611570248e-06, + "loss": 2.2984, + "step": 1180 + }, + { + "epoch": 0.004919093316026944, + "grad_norm": 6.153893937530345, + "learning_rate": 9.834710743801654e-06, + "loss": 2.2443, + "step": 1190 + }, + { + "epoch": 0.004960430234649018, + "grad_norm": 6.5507135206490315, + "learning_rate": 9.917355371900828e-06, + "loss": 2.245, + "step": 1200 + }, + { + "epoch": 0.005001767153271094, + "grad_norm": 8.209761096607327, + "learning_rate": 1e-05, + "loss": 2.1959, + "step": 1210 + }, + { + "epoch": 0.005043104071893169, + "grad_norm": 4.986264712575914, + "learning_rate": 1.0082644628099174e-05, + "loss": 2.1612, + "step": 1220 + }, + { + "epoch": 0.0050844409905152444, + "grad_norm": 6.3381969120868895, + "learning_rate": 1.0165289256198348e-05, + "loss": 2.187, + "step": 1230 + }, + { + "epoch": 0.005125777909137319, + "grad_norm": 5.750067641203542, + "learning_rate": 1.024793388429752e-05, + "loss": 2.2004, + "step": 1240 + }, + { + "epoch": 0.0051671148277593944, + "grad_norm": 5.826539821613237, + "learning_rate": 1.0330578512396693e-05, + "loss": 2.1668, + "step": 1250 + }, + { + "epoch": 0.00520845174638147, + "grad_norm": 6.296936925085496, + "learning_rate": 1.041322314049587e-05, + "loss": 2.1807, + "step": 1260 + }, + { + "epoch": 0.0052497886650035444, + "grad_norm": 5.812866932063289, + "learning_rate": 1.0495867768595042e-05, + "loss": 2.209, + "step": 1270 + }, + { + "epoch": 0.00529112558362562, + "grad_norm": 5.808144224407848, + "learning_rate": 1.0578512396694216e-05, + "loss": 2.1807, + "step": 1280 + }, + { + "epoch": 0.005332462502247695, + "grad_norm": 7.460856083590218, + "learning_rate": 1.0661157024793389e-05, + "loss": 2.2229, + "step": 1290 + }, + { + "epoch": 0.00537379942086977, + "grad_norm": 6.980089322389665, + "learning_rate": 1.0743801652892562e-05, + "loss": 2.175, + "step": 1300 + }, + { + "epoch": 0.005415136339491845, + "grad_norm": 5.57740557557049, + "learning_rate": 1.0826446280991736e-05, + "loss": 2.159, + "step": 1310 + }, + { + "epoch": 0.005456473258113921, + "grad_norm": 6.647266714783434, + "learning_rate": 1.0909090909090909e-05, + "loss": 2.1214, + "step": 1320 + }, + { + "epoch": 0.005497810176735995, + "grad_norm": 6.128334205497799, + "learning_rate": 1.0991735537190083e-05, + "loss": 2.1792, + "step": 1330 + }, + { + "epoch": 0.005539147095358071, + "grad_norm": 6.483094766646449, + "learning_rate": 1.1074380165289258e-05, + "loss": 2.2472, + "step": 1340 + }, + { + "epoch": 0.005580484013980146, + "grad_norm": 5.359049945838656, + "learning_rate": 1.1157024793388432e-05, + "loss": 2.182, + "step": 1350 + }, + { + "epoch": 0.005621820932602221, + "grad_norm": 6.6553587609192695, + "learning_rate": 1.1239669421487605e-05, + "loss": 2.1918, + "step": 1360 + }, + { + "epoch": 0.005663157851224296, + "grad_norm": 6.105297642757683, + "learning_rate": 1.1322314049586777e-05, + "loss": 2.1221, + "step": 1370 + }, + { + "epoch": 0.005704494769846372, + "grad_norm": 5.22407946250878, + "learning_rate": 1.1404958677685952e-05, + "loss": 2.0898, + "step": 1380 + }, + { + "epoch": 0.005745831688468446, + "grad_norm": 5.695260375861287, + "learning_rate": 1.1487603305785125e-05, + "loss": 2.2, + "step": 1390 + }, + { + "epoch": 0.005787168607090522, + "grad_norm": 5.834677547053157, + "learning_rate": 1.1570247933884297e-05, + "loss": 2.1191, + "step": 1400 + }, + { + "epoch": 0.005828505525712597, + "grad_norm": 7.863484441598645, + "learning_rate": 1.1652892561983472e-05, + "loss": 2.1255, + "step": 1410 + }, + { + "epoch": 0.005869842444334672, + "grad_norm": 5.295752440326079, + "learning_rate": 1.1735537190082646e-05, + "loss": 2.1535, + "step": 1420 + }, + { + "epoch": 0.005911179362956747, + "grad_norm": 6.925687761354192, + "learning_rate": 1.181818181818182e-05, + "loss": 2.0618, + "step": 1430 + }, + { + "epoch": 0.0059525162815788225, + "grad_norm": 4.89230568395151, + "learning_rate": 1.1900826446280993e-05, + "loss": 2.1745, + "step": 1440 + }, + { + "epoch": 0.005993853200200897, + "grad_norm": 5.795044632849597, + "learning_rate": 1.1983471074380166e-05, + "loss": 2.1352, + "step": 1450 + }, + { + "epoch": 0.0060351901188229725, + "grad_norm": 6.43513153980254, + "learning_rate": 1.206611570247934e-05, + "loss": 2.102, + "step": 1460 + }, + { + "epoch": 0.006076527037445048, + "grad_norm": 6.46737354415826, + "learning_rate": 1.2148760330578513e-05, + "loss": 2.0934, + "step": 1470 + }, + { + "epoch": 0.0061178639560671225, + "grad_norm": 6.202005592277405, + "learning_rate": 1.2231404958677686e-05, + "loss": 2.1482, + "step": 1480 + }, + { + "epoch": 0.006159200874689198, + "grad_norm": 5.8071883971926725, + "learning_rate": 1.231404958677686e-05, + "loss": 2.0553, + "step": 1490 + }, + { + "epoch": 0.006200537793311273, + "grad_norm": 6.2092050955251334, + "learning_rate": 1.2396694214876034e-05, + "loss": 2.0836, + "step": 1500 + }, + { + "epoch": 0.006241874711933349, + "grad_norm": 4.486772138898485, + "learning_rate": 1.2479338842975209e-05, + "loss": 2.1622, + "step": 1510 + }, + { + "epoch": 0.006283211630555423, + "grad_norm": 5.229981562060858, + "learning_rate": 1.2561983471074381e-05, + "loss": 2.0854, + "step": 1520 + }, + { + "epoch": 0.006324548549177499, + "grad_norm": 5.604269574061805, + "learning_rate": 1.2644628099173554e-05, + "loss": 2.1604, + "step": 1530 + }, + { + "epoch": 0.006365885467799574, + "grad_norm": 5.361170550183367, + "learning_rate": 1.2727272727272728e-05, + "loss": 2.063, + "step": 1540 + }, + { + "epoch": 0.006407222386421649, + "grad_norm": 7.20018526125173, + "learning_rate": 1.2809917355371901e-05, + "loss": 2.0935, + "step": 1550 + }, + { + "epoch": 0.006448559305043724, + "grad_norm": 5.379224740428811, + "learning_rate": 1.2892561983471074e-05, + "loss": 2.1291, + "step": 1560 + }, + { + "epoch": 0.0064898962236658, + "grad_norm": 4.967695885199312, + "learning_rate": 1.2975206611570248e-05, + "loss": 2.102, + "step": 1570 + }, + { + "epoch": 0.006531233142287874, + "grad_norm": 6.039676530986522, + "learning_rate": 1.3057851239669424e-05, + "loss": 2.1134, + "step": 1580 + }, + { + "epoch": 0.00657257006090995, + "grad_norm": 5.459189111144024, + "learning_rate": 1.3140495867768597e-05, + "loss": 2.0302, + "step": 1590 + }, + { + "epoch": 0.006613906979532025, + "grad_norm": 6.143839950859222, + "learning_rate": 1.322314049586777e-05, + "loss": 2.0978, + "step": 1600 + }, + { + "epoch": 0.0066552438981541, + "grad_norm": 5.6756704061902825, + "learning_rate": 1.3305785123966944e-05, + "loss": 2.1592, + "step": 1610 + }, + { + "epoch": 0.006696580816776175, + "grad_norm": 4.994981957965064, + "learning_rate": 1.3388429752066117e-05, + "loss": 2.1035, + "step": 1620 + }, + { + "epoch": 0.0067379177353982505, + "grad_norm": 5.853251459350967, + "learning_rate": 1.347107438016529e-05, + "loss": 2.0701, + "step": 1630 + }, + { + "epoch": 0.006779254654020325, + "grad_norm": 5.4681573607696965, + "learning_rate": 1.3553719008264464e-05, + "loss": 2.089, + "step": 1640 + }, + { + "epoch": 0.0068205915726424005, + "grad_norm": 5.848581304068256, + "learning_rate": 1.3636363636363637e-05, + "loss": 2.0819, + "step": 1650 + }, + { + "epoch": 0.006861928491264476, + "grad_norm": 5.481243900559041, + "learning_rate": 1.3719008264462813e-05, + "loss": 2.0284, + "step": 1660 + }, + { + "epoch": 0.0069032654098865505, + "grad_norm": 5.699959566604993, + "learning_rate": 1.3801652892561985e-05, + "loss": 2.0622, + "step": 1670 + }, + { + "epoch": 0.006944602328508626, + "grad_norm": 4.996648526388665, + "learning_rate": 1.3884297520661158e-05, + "loss": 2.0071, + "step": 1680 + }, + { + "epoch": 0.006985939247130701, + "grad_norm": 5.126923483054136, + "learning_rate": 1.3966942148760332e-05, + "loss": 2.0631, + "step": 1690 + }, + { + "epoch": 0.007027276165752776, + "grad_norm": 5.883552104251492, + "learning_rate": 1.4049586776859505e-05, + "loss": 2.0791, + "step": 1700 + }, + { + "epoch": 0.007068613084374851, + "grad_norm": 4.929514966855435, + "learning_rate": 1.4132231404958678e-05, + "loss": 2.0364, + "step": 1710 + }, + { + "epoch": 0.007109950002996927, + "grad_norm": 5.301129760644346, + "learning_rate": 1.4214876033057852e-05, + "loss": 2.0373, + "step": 1720 + }, + { + "epoch": 0.007151286921619001, + "grad_norm": 5.523739748516145, + "learning_rate": 1.4297520661157025e-05, + "loss": 2.1079, + "step": 1730 + }, + { + "epoch": 0.007192623840241077, + "grad_norm": 5.7887756838227755, + "learning_rate": 1.4380165289256201e-05, + "loss": 2.049, + "step": 1740 + }, + { + "epoch": 0.007233960758863152, + "grad_norm": 5.2452853088604865, + "learning_rate": 1.4462809917355374e-05, + "loss": 2.0686, + "step": 1750 + }, + { + "epoch": 0.007275297677485227, + "grad_norm": 4.454384214370969, + "learning_rate": 1.4545454545454546e-05, + "loss": 2.0124, + "step": 1760 + }, + { + "epoch": 0.007316634596107302, + "grad_norm": 6.397338503442304, + "learning_rate": 1.462809917355372e-05, + "loss": 2.0152, + "step": 1770 + }, + { + "epoch": 0.007357971514729378, + "grad_norm": 6.554144037150873, + "learning_rate": 1.4710743801652893e-05, + "loss": 1.9976, + "step": 1780 + }, + { + "epoch": 0.007399308433351453, + "grad_norm": 4.973940426595748, + "learning_rate": 1.4793388429752066e-05, + "loss": 2.0085, + "step": 1790 + }, + { + "epoch": 0.007440645351973528, + "grad_norm": 5.776375204519037, + "learning_rate": 1.487603305785124e-05, + "loss": 2.0339, + "step": 1800 + }, + { + "epoch": 0.007481982270595603, + "grad_norm": 5.472367097758556, + "learning_rate": 1.4958677685950413e-05, + "loss": 2.0016, + "step": 1810 + }, + { + "epoch": 0.0075233191892176785, + "grad_norm": 4.850880114898939, + "learning_rate": 1.504132231404959e-05, + "loss": 1.9952, + "step": 1820 + }, + { + "epoch": 0.007564656107839753, + "grad_norm": 4.825492061262016, + "learning_rate": 1.5123966942148762e-05, + "loss": 2.0149, + "step": 1830 + }, + { + "epoch": 0.0076059930264618285, + "grad_norm": 6.317700924322252, + "learning_rate": 1.5206611570247936e-05, + "loss": 1.9765, + "step": 1840 + }, + { + "epoch": 0.007647329945083904, + "grad_norm": 5.831048263887902, + "learning_rate": 1.528925619834711e-05, + "loss": 1.9669, + "step": 1850 + }, + { + "epoch": 0.0076886668637059785, + "grad_norm": 5.190457786334756, + "learning_rate": 1.5371900826446283e-05, + "loss": 2.0342, + "step": 1860 + }, + { + "epoch": 0.007730003782328054, + "grad_norm": 5.752029895196757, + "learning_rate": 1.5454545454545454e-05, + "loss": 2.0606, + "step": 1870 + }, + { + "epoch": 0.007771340700950129, + "grad_norm": 5.005855197604682, + "learning_rate": 1.553719008264463e-05, + "loss": 2.0764, + "step": 1880 + }, + { + "epoch": 0.007812677619572205, + "grad_norm": 5.362161895494138, + "learning_rate": 1.5619834710743803e-05, + "loss": 2.0373, + "step": 1890 + }, + { + "epoch": 0.00785401453819428, + "grad_norm": 5.589239650428267, + "learning_rate": 1.5702479338842978e-05, + "loss": 2.0536, + "step": 1900 + }, + { + "epoch": 0.007895351456816354, + "grad_norm": 5.38085836484136, + "learning_rate": 1.5785123966942152e-05, + "loss": 2.0357, + "step": 1910 + }, + { + "epoch": 0.00793668837543843, + "grad_norm": 6.4123494555744065, + "learning_rate": 1.5867768595041323e-05, + "loss": 1.9992, + "step": 1920 + }, + { + "epoch": 0.007978025294060505, + "grad_norm": 5.369699763052158, + "learning_rate": 1.5950413223140497e-05, + "loss": 1.9645, + "step": 1930 + }, + { + "epoch": 0.00801936221268258, + "grad_norm": 6.655726980448801, + "learning_rate": 1.6033057851239672e-05, + "loss": 2.0023, + "step": 1940 + }, + { + "epoch": 0.008060699131304656, + "grad_norm": 5.150395460216213, + "learning_rate": 1.6115702479338843e-05, + "loss": 1.9953, + "step": 1950 + }, + { + "epoch": 0.008102036049926731, + "grad_norm": 5.534796132616727, + "learning_rate": 1.6198347107438017e-05, + "loss": 1.964, + "step": 1960 + }, + { + "epoch": 0.008143372968548805, + "grad_norm": 5.0714233165065075, + "learning_rate": 1.628099173553719e-05, + "loss": 1.9942, + "step": 1970 + }, + { + "epoch": 0.00818470988717088, + "grad_norm": 5.370096628339807, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.9938, + "step": 1980 + }, + { + "epoch": 0.008226046805792956, + "grad_norm": 4.816680798680657, + "learning_rate": 1.644628099173554e-05, + "loss": 1.9023, + "step": 1990 + }, + { + "epoch": 0.008267383724415031, + "grad_norm": 5.910326143029371, + "learning_rate": 1.652892561983471e-05, + "loss": 1.9366, + "step": 2000 + }, + { + "epoch": 0.008308720643037107, + "grad_norm": 5.364682793090204, + "learning_rate": 1.6611570247933886e-05, + "loss": 1.9835, + "step": 2010 + }, + { + "epoch": 0.008350057561659182, + "grad_norm": 6.171717096992393, + "learning_rate": 1.669421487603306e-05, + "loss": 1.9641, + "step": 2020 + }, + { + "epoch": 0.008391394480281256, + "grad_norm": 4.794750763380389, + "learning_rate": 1.677685950413223e-05, + "loss": 1.9405, + "step": 2030 + }, + { + "epoch": 0.008432731398903331, + "grad_norm": 6.3363242673070745, + "learning_rate": 1.6859504132231405e-05, + "loss": 1.9717, + "step": 2040 + }, + { + "epoch": 0.008474068317525407, + "grad_norm": 5.10756576497978, + "learning_rate": 1.694214876033058e-05, + "loss": 1.9312, + "step": 2050 + }, + { + "epoch": 0.008515405236147482, + "grad_norm": 5.5429121513722945, + "learning_rate": 1.7024793388429754e-05, + "loss": 1.9692, + "step": 2060 + }, + { + "epoch": 0.008556742154769557, + "grad_norm": 5.053921879606705, + "learning_rate": 1.710743801652893e-05, + "loss": 1.9187, + "step": 2070 + }, + { + "epoch": 0.008598079073391633, + "grad_norm": 5.246682645264326, + "learning_rate": 1.71900826446281e-05, + "loss": 2.0086, + "step": 2080 + }, + { + "epoch": 0.008639415992013707, + "grad_norm": 4.651358563124329, + "learning_rate": 1.7272727272727274e-05, + "loss": 1.9676, + "step": 2090 + }, + { + "epoch": 0.008680752910635782, + "grad_norm": 5.254574557184252, + "learning_rate": 1.735537190082645e-05, + "loss": 1.9193, + "step": 2100 + }, + { + "epoch": 0.008722089829257857, + "grad_norm": 5.5559516380514316, + "learning_rate": 1.743801652892562e-05, + "loss": 1.9123, + "step": 2110 + }, + { + "epoch": 0.008763426747879933, + "grad_norm": 5.714609535718523, + "learning_rate": 1.7520661157024794e-05, + "loss": 1.9831, + "step": 2120 + }, + { + "epoch": 0.008804763666502008, + "grad_norm": 4.664121459414757, + "learning_rate": 1.7603305785123968e-05, + "loss": 1.9606, + "step": 2130 + }, + { + "epoch": 0.008846100585124084, + "grad_norm": 4.9060858638182685, + "learning_rate": 1.7685950413223143e-05, + "loss": 1.9535, + "step": 2140 + }, + { + "epoch": 0.008887437503746157, + "grad_norm": 4.997171967559315, + "learning_rate": 1.7768595041322317e-05, + "loss": 1.9243, + "step": 2150 + }, + { + "epoch": 0.008928774422368233, + "grad_norm": 4.60645777188567, + "learning_rate": 1.7851239669421488e-05, + "loss": 1.9291, + "step": 2160 + }, + { + "epoch": 0.008970111340990308, + "grad_norm": 4.2131519608354004, + "learning_rate": 1.7933884297520662e-05, + "loss": 1.9105, + "step": 2170 + }, + { + "epoch": 0.009011448259612384, + "grad_norm": 5.51444703850531, + "learning_rate": 1.8016528925619837e-05, + "loss": 1.9502, + "step": 2180 + }, + { + "epoch": 0.00905278517823446, + "grad_norm": 5.2003808089855825, + "learning_rate": 1.809917355371901e-05, + "loss": 1.9436, + "step": 2190 + }, + { + "epoch": 0.009094122096856535, + "grad_norm": 4.240179682087964, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.9194, + "step": 2200 + }, + { + "epoch": 0.00913545901547861, + "grad_norm": 4.582501074244312, + "learning_rate": 1.8264462809917356e-05, + "loss": 1.9145, + "step": 2210 + }, + { + "epoch": 0.009176795934100684, + "grad_norm": 5.362083861786352, + "learning_rate": 1.834710743801653e-05, + "loss": 1.9165, + "step": 2220 + }, + { + "epoch": 0.009218132852722759, + "grad_norm": 5.06281875114174, + "learning_rate": 1.8429752066115705e-05, + "loss": 1.977, + "step": 2230 + }, + { + "epoch": 0.009259469771344835, + "grad_norm": 4.661496047656461, + "learning_rate": 1.851239669421488e-05, + "loss": 1.8892, + "step": 2240 + }, + { + "epoch": 0.00930080668996691, + "grad_norm": 4.735532406310298, + "learning_rate": 1.859504132231405e-05, + "loss": 1.8689, + "step": 2250 + }, + { + "epoch": 0.009342143608588985, + "grad_norm": 4.445771479719063, + "learning_rate": 1.8677685950413225e-05, + "loss": 1.9301, + "step": 2260 + }, + { + "epoch": 0.00938348052721106, + "grad_norm": 5.100109904664726, + "learning_rate": 1.87603305785124e-05, + "loss": 1.9309, + "step": 2270 + }, + { + "epoch": 0.009424817445833135, + "grad_norm": 6.115469535323335, + "learning_rate": 1.884297520661157e-05, + "loss": 1.918, + "step": 2280 + }, + { + "epoch": 0.00946615436445521, + "grad_norm": 4.890019742506766, + "learning_rate": 1.8925619834710745e-05, + "loss": 1.945, + "step": 2290 + }, + { + "epoch": 0.009507491283077285, + "grad_norm": 5.023798525711054, + "learning_rate": 1.900826446280992e-05, + "loss": 1.9004, + "step": 2300 + }, + { + "epoch": 0.00954882820169936, + "grad_norm": 5.652810624249754, + "learning_rate": 1.9090909090909094e-05, + "loss": 1.9339, + "step": 2310 + }, + { + "epoch": 0.009590165120321436, + "grad_norm": 6.04847384963266, + "learning_rate": 1.9173553719008268e-05, + "loss": 1.8687, + "step": 2320 + }, + { + "epoch": 0.009631502038943512, + "grad_norm": 5.332733823425359, + "learning_rate": 1.925619834710744e-05, + "loss": 1.8938, + "step": 2330 + }, + { + "epoch": 0.009672838957565585, + "grad_norm": 4.814331448709232, + "learning_rate": 1.9338842975206613e-05, + "loss": 1.8752, + "step": 2340 + }, + { + "epoch": 0.00971417587618766, + "grad_norm": 5.558977499560294, + "learning_rate": 1.9421487603305788e-05, + "loss": 1.8835, + "step": 2350 + }, + { + "epoch": 0.009755512794809736, + "grad_norm": 5.554940177569548, + "learning_rate": 1.950413223140496e-05, + "loss": 1.857, + "step": 2360 + }, + { + "epoch": 0.009796849713431812, + "grad_norm": 4.970502489086558, + "learning_rate": 1.9586776859504133e-05, + "loss": 1.8553, + "step": 2370 + }, + { + "epoch": 0.009838186632053887, + "grad_norm": 4.044099915606779, + "learning_rate": 1.9669421487603307e-05, + "loss": 1.924, + "step": 2380 + }, + { + "epoch": 0.009879523550675963, + "grad_norm": 4.880726953238654, + "learning_rate": 1.9752066115702482e-05, + "loss": 1.9785, + "step": 2390 + }, + { + "epoch": 0.009920860469298036, + "grad_norm": 5.457077789861094, + "learning_rate": 1.9834710743801656e-05, + "loss": 1.8585, + "step": 2400 + }, + { + "epoch": 0.009962197387920112, + "grad_norm": 4.608586390817221, + "learning_rate": 1.9917355371900827e-05, + "loss": 1.8861, + "step": 2410 + }, + { + "epoch": 0.010003534306542187, + "grad_norm": 4.5178969512670335, + "learning_rate": 2e-05, + "loss": 1.8936, + "step": 2420 + }, + { + "epoch": 0.010044871225164263, + "grad_norm": 5.722004352525454, + "learning_rate": 1.999999991396395e-05, + "loss": 1.8616, + "step": 2430 + }, + { + "epoch": 0.010086208143786338, + "grad_norm": 4.99862696301366, + "learning_rate": 1.9999999655855794e-05, + "loss": 1.8865, + "step": 2440 + }, + { + "epoch": 0.010127545062408413, + "grad_norm": 5.204994732642035, + "learning_rate": 1.9999999225675543e-05, + "loss": 1.8602, + "step": 2450 + }, + { + "epoch": 0.010168881981030489, + "grad_norm": 4.1143956012846505, + "learning_rate": 1.9999998623423198e-05, + "loss": 1.9101, + "step": 2460 + }, + { + "epoch": 0.010210218899652563, + "grad_norm": 5.535771463118041, + "learning_rate": 1.9999997849098773e-05, + "loss": 1.8596, + "step": 2470 + }, + { + "epoch": 0.010251555818274638, + "grad_norm": 5.020430211409416, + "learning_rate": 1.999999690270228e-05, + "loss": 1.8393, + "step": 2480 + }, + { + "epoch": 0.010292892736896713, + "grad_norm": 5.571737674116448, + "learning_rate": 1.999999578423374e-05, + "loss": 1.8987, + "step": 2490 + }, + { + "epoch": 0.010334229655518789, + "grad_norm": 4.336614412280944, + "learning_rate": 1.9999994493693165e-05, + "loss": 1.9194, + "step": 2500 + }, + { + "epoch": 0.010375566574140864, + "grad_norm": 4.815853586635344, + "learning_rate": 1.999999303108058e-05, + "loss": 1.8332, + "step": 2510 + }, + { + "epoch": 0.01041690349276294, + "grad_norm": 4.559920874208704, + "learning_rate": 1.9999991396396014e-05, + "loss": 1.8818, + "step": 2520 + }, + { + "epoch": 0.010458240411385013, + "grad_norm": 5.23388810362715, + "learning_rate": 1.9999989589639487e-05, + "loss": 1.8302, + "step": 2530 + }, + { + "epoch": 0.010499577330007089, + "grad_norm": 5.000301577463503, + "learning_rate": 1.999998761081104e-05, + "loss": 1.8657, + "step": 2540 + }, + { + "epoch": 0.010540914248629164, + "grad_norm": 4.448533864801871, + "learning_rate": 1.9999985459910698e-05, + "loss": 1.8762, + "step": 2550 + }, + { + "epoch": 0.01058225116725124, + "grad_norm": 4.715035035883112, + "learning_rate": 1.9999983136938504e-05, + "loss": 1.8511, + "step": 2560 + }, + { + "epoch": 0.010623588085873315, + "grad_norm": 4.025529484549816, + "learning_rate": 1.9999980641894497e-05, + "loss": 1.8458, + "step": 2570 + }, + { + "epoch": 0.01066492500449539, + "grad_norm": 4.754000032727581, + "learning_rate": 1.9999977974778715e-05, + "loss": 1.8714, + "step": 2580 + }, + { + "epoch": 0.010706261923117464, + "grad_norm": 4.930978688660729, + "learning_rate": 1.999997513559121e-05, + "loss": 1.8656, + "step": 2590 + }, + { + "epoch": 0.01074759884173954, + "grad_norm": 4.4132729377261475, + "learning_rate": 1.9999972124332028e-05, + "loss": 1.9383, + "step": 2600 + }, + { + "epoch": 0.010788935760361615, + "grad_norm": 4.4540551253199325, + "learning_rate": 1.9999968941001225e-05, + "loss": 1.8426, + "step": 2610 + }, + { + "epoch": 0.01083027267898369, + "grad_norm": 4.797250042059473, + "learning_rate": 1.999996558559885e-05, + "loss": 1.8634, + "step": 2620 + }, + { + "epoch": 0.010871609597605766, + "grad_norm": 5.456931710111963, + "learning_rate": 1.999996205812496e-05, + "loss": 1.825, + "step": 2630 + }, + { + "epoch": 0.010912946516227841, + "grad_norm": 4.377649523138056, + "learning_rate": 1.999995835857962e-05, + "loss": 1.8545, + "step": 2640 + }, + { + "epoch": 0.010954283434849915, + "grad_norm": 4.5317328844732145, + "learning_rate": 1.9999954486962893e-05, + "loss": 1.8774, + "step": 2650 + }, + { + "epoch": 0.01099562035347199, + "grad_norm": 4.709498283906347, + "learning_rate": 1.9999950443274847e-05, + "loss": 1.8083, + "step": 2660 + }, + { + "epoch": 0.011036957272094066, + "grad_norm": 4.592327748002219, + "learning_rate": 1.9999946227515547e-05, + "loss": 1.792, + "step": 2670 + }, + { + "epoch": 0.011078294190716141, + "grad_norm": 5.036724935294618, + "learning_rate": 1.999994183968507e-05, + "loss": 1.8779, + "step": 2680 + }, + { + "epoch": 0.011119631109338217, + "grad_norm": 6.694409030503598, + "learning_rate": 1.999993727978349e-05, + "loss": 1.8573, + "step": 2690 + }, + { + "epoch": 0.011160968027960292, + "grad_norm": 5.018981836140064, + "learning_rate": 1.9999932547810883e-05, + "loss": 1.8726, + "step": 2700 + }, + { + "epoch": 0.011202304946582366, + "grad_norm": 4.451864521641474, + "learning_rate": 1.9999927643767332e-05, + "loss": 1.8193, + "step": 2710 + }, + { + "epoch": 0.011243641865204441, + "grad_norm": 5.036890897058994, + "learning_rate": 1.999992256765292e-05, + "loss": 1.8594, + "step": 2720 + }, + { + "epoch": 0.011284978783826517, + "grad_norm": 4.7545987502372755, + "learning_rate": 1.999991731946774e-05, + "loss": 1.9158, + "step": 2730 + }, + { + "epoch": 0.011326315702448592, + "grad_norm": 3.9156550800432783, + "learning_rate": 1.999991189921188e-05, + "loss": 1.8166, + "step": 2740 + }, + { + "epoch": 0.011367652621070668, + "grad_norm": 4.622686377530181, + "learning_rate": 1.999990630688543e-05, + "loss": 1.8426, + "step": 2750 + }, + { + "epoch": 0.011408989539692743, + "grad_norm": 4.176720366120709, + "learning_rate": 1.9999900542488487e-05, + "loss": 1.8701, + "step": 2760 + }, + { + "epoch": 0.011450326458314819, + "grad_norm": 4.588055146989058, + "learning_rate": 1.999989460602115e-05, + "loss": 1.8474, + "step": 2770 + }, + { + "epoch": 0.011491663376936892, + "grad_norm": 4.7632605353618604, + "learning_rate": 1.9999888497483523e-05, + "loss": 1.7611, + "step": 2780 + }, + { + "epoch": 0.011533000295558968, + "grad_norm": 5.168047411415939, + "learning_rate": 1.9999882216875714e-05, + "loss": 1.8297, + "step": 2790 + }, + { + "epoch": 0.011574337214181043, + "grad_norm": 5.6032261833368215, + "learning_rate": 1.9999875764197824e-05, + "loss": 1.8273, + "step": 2800 + }, + { + "epoch": 0.011615674132803119, + "grad_norm": 4.836606306201456, + "learning_rate": 1.9999869139449965e-05, + "loss": 1.8067, + "step": 2810 + }, + { + "epoch": 0.011657011051425194, + "grad_norm": 5.371156408385522, + "learning_rate": 1.9999862342632258e-05, + "loss": 1.7726, + "step": 2820 + }, + { + "epoch": 0.01169834797004727, + "grad_norm": 4.715562111195242, + "learning_rate": 1.9999855373744813e-05, + "loss": 1.8257, + "step": 2830 + }, + { + "epoch": 0.011739684888669343, + "grad_norm": 4.672226047314074, + "learning_rate": 1.9999848232787753e-05, + "loss": 1.807, + "step": 2840 + }, + { + "epoch": 0.011781021807291419, + "grad_norm": 5.305211095175868, + "learning_rate": 1.9999840919761202e-05, + "loss": 1.8398, + "step": 2850 + }, + { + "epoch": 0.011822358725913494, + "grad_norm": 4.973800529744849, + "learning_rate": 1.9999833434665282e-05, + "loss": 1.8028, + "step": 2860 + }, + { + "epoch": 0.01186369564453557, + "grad_norm": 4.580336749750151, + "learning_rate": 1.9999825777500127e-05, + "loss": 1.7559, + "step": 2870 + }, + { + "epoch": 0.011905032563157645, + "grad_norm": 5.203176556084249, + "learning_rate": 1.999981794826586e-05, + "loss": 1.8375, + "step": 2880 + }, + { + "epoch": 0.01194636948177972, + "grad_norm": 5.810430258629928, + "learning_rate": 1.9999809946962627e-05, + "loss": 1.8126, + "step": 2890 + }, + { + "epoch": 0.011987706400401794, + "grad_norm": 5.7480488342439955, + "learning_rate": 1.9999801773590556e-05, + "loss": 1.8228, + "step": 2900 + }, + { + "epoch": 0.01202904331902387, + "grad_norm": 4.946108636349945, + "learning_rate": 1.9999793428149793e-05, + "loss": 1.7801, + "step": 2910 + }, + { + "epoch": 0.012070380237645945, + "grad_norm": 4.686375909907021, + "learning_rate": 1.9999784910640484e-05, + "loss": 1.7595, + "step": 2920 + }, + { + "epoch": 0.01211171715626802, + "grad_norm": 5.2352360374303135, + "learning_rate": 1.9999776221062767e-05, + "loss": 1.8413, + "step": 2930 + }, + { + "epoch": 0.012153054074890096, + "grad_norm": 4.509401680547479, + "learning_rate": 1.99997673594168e-05, + "loss": 1.7973, + "step": 2940 + }, + { + "epoch": 0.012194390993512171, + "grad_norm": 4.614511294927466, + "learning_rate": 1.9999758325702728e-05, + "loss": 1.8206, + "step": 2950 + }, + { + "epoch": 0.012235727912134245, + "grad_norm": 6.185921445660834, + "learning_rate": 1.9999749119920714e-05, + "loss": 1.8462, + "step": 2960 + }, + { + "epoch": 0.01227706483075632, + "grad_norm": 4.174924494562577, + "learning_rate": 1.999973974207091e-05, + "loss": 1.8036, + "step": 2970 + }, + { + "epoch": 0.012318401749378396, + "grad_norm": 4.836665186633954, + "learning_rate": 1.9999730192153483e-05, + "loss": 1.8517, + "step": 2980 + }, + { + "epoch": 0.012359738668000471, + "grad_norm": 5.384960643252126, + "learning_rate": 1.999972047016859e-05, + "loss": 1.8159, + "step": 2990 + }, + { + "epoch": 0.012401075586622547, + "grad_norm": 5.021462883098841, + "learning_rate": 1.9999710576116403e-05, + "loss": 1.7985, + "step": 3000 + }, + { + "epoch": 0.012442412505244622, + "grad_norm": 5.427243361920921, + "learning_rate": 1.99997005099971e-05, + "loss": 1.7765, + "step": 3010 + }, + { + "epoch": 0.012483749423866698, + "grad_norm": 4.491526906719712, + "learning_rate": 1.999969027181084e-05, + "loss": 1.8183, + "step": 3020 + }, + { + "epoch": 0.012525086342488771, + "grad_norm": 4.559161527860771, + "learning_rate": 1.9999679861557804e-05, + "loss": 1.724, + "step": 3030 + }, + { + "epoch": 0.012566423261110847, + "grad_norm": 4.161439649907769, + "learning_rate": 1.9999669279238173e-05, + "loss": 1.7683, + "step": 3040 + }, + { + "epoch": 0.012607760179732922, + "grad_norm": 4.56366674854018, + "learning_rate": 1.999965852485213e-05, + "loss": 1.786, + "step": 3050 + }, + { + "epoch": 0.012649097098354998, + "grad_norm": 5.245429404102266, + "learning_rate": 1.999964759839986e-05, + "loss": 1.7822, + "step": 3060 + }, + { + "epoch": 0.012690434016977073, + "grad_norm": 4.6630532976211825, + "learning_rate": 1.9999636499881548e-05, + "loss": 1.7466, + "step": 3070 + }, + { + "epoch": 0.012731770935599148, + "grad_norm": 4.060133282127772, + "learning_rate": 1.9999625229297385e-05, + "loss": 1.795, + "step": 3080 + }, + { + "epoch": 0.012773107854221222, + "grad_norm": 3.9035653210065644, + "learning_rate": 1.9999613786647568e-05, + "loss": 1.7644, + "step": 3090 + }, + { + "epoch": 0.012814444772843298, + "grad_norm": 4.309261522039182, + "learning_rate": 1.9999602171932292e-05, + "loss": 1.7843, + "step": 3100 + }, + { + "epoch": 0.012855781691465373, + "grad_norm": 4.849350440475919, + "learning_rate": 1.999959038515176e-05, + "loss": 1.7703, + "step": 3110 + }, + { + "epoch": 0.012897118610087448, + "grad_norm": 4.275025549538348, + "learning_rate": 1.999957842630617e-05, + "loss": 1.7714, + "step": 3120 + }, + { + "epoch": 0.012938455528709524, + "grad_norm": 5.173686081149218, + "learning_rate": 1.9999566295395728e-05, + "loss": 1.7638, + "step": 3130 + }, + { + "epoch": 0.0129797924473316, + "grad_norm": 4.908518222034618, + "learning_rate": 1.999955399242065e-05, + "loss": 1.8008, + "step": 3140 + }, + { + "epoch": 0.013021129365953673, + "grad_norm": 4.14921313541257, + "learning_rate": 1.9999541517381137e-05, + "loss": 1.7741, + "step": 3150 + }, + { + "epoch": 0.013062466284575748, + "grad_norm": 4.543722393877187, + "learning_rate": 1.9999528870277412e-05, + "loss": 1.7949, + "step": 3160 + }, + { + "epoch": 0.013103803203197824, + "grad_norm": 4.410976439510873, + "learning_rate": 1.9999516051109688e-05, + "loss": 1.7547, + "step": 3170 + }, + { + "epoch": 0.0131451401218199, + "grad_norm": 5.705194883861194, + "learning_rate": 1.9999503059878188e-05, + "loss": 1.7513, + "step": 3180 + }, + { + "epoch": 0.013186477040441975, + "grad_norm": 4.65186813408292, + "learning_rate": 1.999948989658313e-05, + "loss": 1.7664, + "step": 3190 + }, + { + "epoch": 0.01322781395906405, + "grad_norm": 5.20074413082596, + "learning_rate": 1.9999476561224754e-05, + "loss": 1.7545, + "step": 3200 + }, + { + "epoch": 0.013269150877686124, + "grad_norm": 3.9975643391331745, + "learning_rate": 1.9999463053803275e-05, + "loss": 1.7175, + "step": 3210 + }, + { + "epoch": 0.0133104877963082, + "grad_norm": 4.798261065065511, + "learning_rate": 1.9999449374318934e-05, + "loss": 1.7464, + "step": 3220 + }, + { + "epoch": 0.013351824714930275, + "grad_norm": 4.858469902498838, + "learning_rate": 1.9999435522771963e-05, + "loss": 1.7568, + "step": 3230 + }, + { + "epoch": 0.01339316163355235, + "grad_norm": 4.236662694907985, + "learning_rate": 1.99994214991626e-05, + "loss": 1.7335, + "step": 3240 + }, + { + "epoch": 0.013434498552174426, + "grad_norm": 4.6188062645939585, + "learning_rate": 1.9999407303491085e-05, + "loss": 1.7529, + "step": 3250 + }, + { + "epoch": 0.013475835470796501, + "grad_norm": 4.77521754475989, + "learning_rate": 1.9999392935757668e-05, + "loss": 1.7734, + "step": 3260 + }, + { + "epoch": 0.013517172389418576, + "grad_norm": 6.027108769658543, + "learning_rate": 1.999937839596259e-05, + "loss": 1.8136, + "step": 3270 + }, + { + "epoch": 0.01355850930804065, + "grad_norm": 4.163761649197771, + "learning_rate": 1.9999363684106105e-05, + "loss": 1.7085, + "step": 3280 + }, + { + "epoch": 0.013599846226662726, + "grad_norm": 3.916493440655603, + "learning_rate": 1.9999348800188466e-05, + "loss": 1.7815, + "step": 3290 + }, + { + "epoch": 0.013641183145284801, + "grad_norm": 4.397530066361572, + "learning_rate": 1.9999333744209924e-05, + "loss": 1.7759, + "step": 3300 + }, + { + "epoch": 0.013682520063906876, + "grad_norm": 4.839462185241853, + "learning_rate": 1.9999318516170747e-05, + "loss": 1.7548, + "step": 3310 + }, + { + "epoch": 0.013723856982528952, + "grad_norm": 4.759012044819632, + "learning_rate": 1.999930311607119e-05, + "loss": 1.7815, + "step": 3320 + }, + { + "epoch": 0.013765193901151027, + "grad_norm": 4.1799473470272295, + "learning_rate": 1.9999287543911522e-05, + "loss": 1.7907, + "step": 3330 + }, + { + "epoch": 0.013806530819773101, + "grad_norm": 4.454633377063746, + "learning_rate": 1.9999271799692006e-05, + "loss": 1.7579, + "step": 3340 + }, + { + "epoch": 0.013847867738395176, + "grad_norm": 4.997867503776301, + "learning_rate": 1.999925588341292e-05, + "loss": 1.7335, + "step": 3350 + }, + { + "epoch": 0.013889204657017252, + "grad_norm": 4.345433706332678, + "learning_rate": 1.999923979507453e-05, + "loss": 1.7124, + "step": 3360 + }, + { + "epoch": 0.013930541575639327, + "grad_norm": 4.531985231521044, + "learning_rate": 1.999922353467712e-05, + "loss": 1.758, + "step": 3370 + }, + { + "epoch": 0.013971878494261403, + "grad_norm": 4.399801080955952, + "learning_rate": 1.9999207102220962e-05, + "loss": 1.7065, + "step": 3380 + }, + { + "epoch": 0.014013215412883478, + "grad_norm": 5.059800651377326, + "learning_rate": 1.999919049770635e-05, + "loss": 1.693, + "step": 3390 + }, + { + "epoch": 0.014054552331505552, + "grad_norm": 4.260000291303237, + "learning_rate": 1.9999173721133557e-05, + "loss": 1.7488, + "step": 3400 + }, + { + "epoch": 0.014095889250127627, + "grad_norm": 4.5331171056345605, + "learning_rate": 1.999915677250288e-05, + "loss": 1.7046, + "step": 3410 + }, + { + "epoch": 0.014137226168749703, + "grad_norm": 4.187185061547482, + "learning_rate": 1.999913965181461e-05, + "loss": 1.7013, + "step": 3420 + }, + { + "epoch": 0.014178563087371778, + "grad_norm": 4.429191188349303, + "learning_rate": 1.999912235906904e-05, + "loss": 1.7127, + "step": 3430 + }, + { + "epoch": 0.014219900005993854, + "grad_norm": 4.083346883074332, + "learning_rate": 1.9999104894266466e-05, + "loss": 1.7571, + "step": 3440 + }, + { + "epoch": 0.014261236924615929, + "grad_norm": 4.424685185794806, + "learning_rate": 1.999908725740719e-05, + "loss": 1.7563, + "step": 3450 + }, + { + "epoch": 0.014302573843238003, + "grad_norm": 4.285454928033791, + "learning_rate": 1.9999069448491516e-05, + "loss": 1.7547, + "step": 3460 + }, + { + "epoch": 0.014343910761860078, + "grad_norm": 4.424718664433471, + "learning_rate": 1.999905146751975e-05, + "loss": 1.7374, + "step": 3470 + }, + { + "epoch": 0.014385247680482154, + "grad_norm": 4.075077717271962, + "learning_rate": 1.99990333144922e-05, + "loss": 1.7661, + "step": 3480 + }, + { + "epoch": 0.014426584599104229, + "grad_norm": 4.538474712661468, + "learning_rate": 1.999901498940918e-05, + "loss": 1.7118, + "step": 3490 + }, + { + "epoch": 0.014467921517726304, + "grad_norm": 4.435775318213515, + "learning_rate": 1.9998996492271007e-05, + "loss": 1.7368, + "step": 3500 + }, + { + "epoch": 0.01450925843634838, + "grad_norm": 4.545629750120307, + "learning_rate": 1.9998977823077998e-05, + "loss": 1.7335, + "step": 3510 + }, + { + "epoch": 0.014550595354970454, + "grad_norm": 4.359608762821868, + "learning_rate": 1.9998958981830473e-05, + "loss": 1.7318, + "step": 3520 + }, + { + "epoch": 0.014591932273592529, + "grad_norm": 4.453842525389737, + "learning_rate": 1.9998939968528754e-05, + "loss": 1.7499, + "step": 3530 + }, + { + "epoch": 0.014633269192214604, + "grad_norm": 5.088846901725583, + "learning_rate": 1.9998920783173172e-05, + "loss": 1.7555, + "step": 3540 + }, + { + "epoch": 0.01467460611083668, + "grad_norm": 4.1590343693668395, + "learning_rate": 1.9998901425764057e-05, + "loss": 1.7386, + "step": 3550 + }, + { + "epoch": 0.014715943029458755, + "grad_norm": 4.181140524329235, + "learning_rate": 1.9998881896301744e-05, + "loss": 1.6455, + "step": 3560 + }, + { + "epoch": 0.01475727994808083, + "grad_norm": 4.228896471972448, + "learning_rate": 1.999886219478656e-05, + "loss": 1.7282, + "step": 3570 + }, + { + "epoch": 0.014798616866702906, + "grad_norm": 3.9899831004408526, + "learning_rate": 1.9998842321218855e-05, + "loss": 1.7201, + "step": 3580 + }, + { + "epoch": 0.01483995378532498, + "grad_norm": 3.9178031007246408, + "learning_rate": 1.9998822275598964e-05, + "loss": 1.6812, + "step": 3590 + }, + { + "epoch": 0.014881290703947055, + "grad_norm": 4.3808976089497484, + "learning_rate": 1.9998802057927236e-05, + "loss": 1.7175, + "step": 3600 + }, + { + "epoch": 0.01492262762256913, + "grad_norm": 3.9780395209303197, + "learning_rate": 1.9998781668204015e-05, + "loss": 1.7351, + "step": 3610 + }, + { + "epoch": 0.014963964541191206, + "grad_norm": 5.378812354806347, + "learning_rate": 1.9998761106429655e-05, + "loss": 1.7092, + "step": 3620 + }, + { + "epoch": 0.015005301459813282, + "grad_norm": 3.9422939515447246, + "learning_rate": 1.999874037260451e-05, + "loss": 1.7261, + "step": 3630 + }, + { + "epoch": 0.015046638378435357, + "grad_norm": 4.442422033504748, + "learning_rate": 1.9998719466728934e-05, + "loss": 1.7027, + "step": 3640 + }, + { + "epoch": 0.01508797529705743, + "grad_norm": 4.102984271689072, + "learning_rate": 1.9998698388803288e-05, + "loss": 1.6741, + "step": 3650 + }, + { + "epoch": 0.015129312215679506, + "grad_norm": 3.9608491048290615, + "learning_rate": 1.9998677138827934e-05, + "loss": 1.7542, + "step": 3660 + }, + { + "epoch": 0.015170649134301582, + "grad_norm": 4.561629575046756, + "learning_rate": 1.999865571680324e-05, + "loss": 1.6785, + "step": 3670 + }, + { + "epoch": 0.015211986052923657, + "grad_norm": 4.4640127715057885, + "learning_rate": 1.9998634122729573e-05, + "loss": 1.7, + "step": 3680 + }, + { + "epoch": 0.015253322971545732, + "grad_norm": 3.8935864417828188, + "learning_rate": 1.9998612356607303e-05, + "loss": 1.6939, + "step": 3690 + }, + { + "epoch": 0.015294659890167808, + "grad_norm": 5.011277234521909, + "learning_rate": 1.9998590418436808e-05, + "loss": 1.7019, + "step": 3700 + }, + { + "epoch": 0.015335996808789882, + "grad_norm": 4.107354033282244, + "learning_rate": 1.9998568308218465e-05, + "loss": 1.6637, + "step": 3710 + }, + { + "epoch": 0.015377333727411957, + "grad_norm": 5.3480918453617905, + "learning_rate": 1.999854602595265e-05, + "loss": 1.7322, + "step": 3720 + }, + { + "epoch": 0.015418670646034032, + "grad_norm": 4.443648241332512, + "learning_rate": 1.9998523571639752e-05, + "loss": 1.6794, + "step": 3730 + }, + { + "epoch": 0.015460007564656108, + "grad_norm": 3.4677507775480025, + "learning_rate": 1.999850094528015e-05, + "loss": 1.6943, + "step": 3740 + }, + { + "epoch": 0.015501344483278183, + "grad_norm": 4.306434811794374, + "learning_rate": 1.9998478146874244e-05, + "loss": 1.6996, + "step": 3750 + }, + { + "epoch": 0.015542681401900259, + "grad_norm": 5.783322294479809, + "learning_rate": 1.9998455176422423e-05, + "loss": 1.7071, + "step": 3760 + }, + { + "epoch": 0.015584018320522332, + "grad_norm": 5.907422561947855, + "learning_rate": 1.999843203392507e-05, + "loss": 1.7736, + "step": 3770 + }, + { + "epoch": 0.01562535523914441, + "grad_norm": 4.114299347318918, + "learning_rate": 1.9998408719382602e-05, + "loss": 1.7068, + "step": 3780 + }, + { + "epoch": 0.015666692157766483, + "grad_norm": 4.897826252389082, + "learning_rate": 1.999838523279541e-05, + "loss": 1.6542, + "step": 3790 + }, + { + "epoch": 0.01570802907638856, + "grad_norm": 4.387711122090114, + "learning_rate": 1.9998361574163897e-05, + "loss": 1.7202, + "step": 3800 + }, + { + "epoch": 0.015749365995010634, + "grad_norm": 4.249935651772863, + "learning_rate": 1.999833774348847e-05, + "loss": 1.6871, + "step": 3810 + }, + { + "epoch": 0.015790702913632708, + "grad_norm": 4.961734747800958, + "learning_rate": 1.9998313740769547e-05, + "loss": 1.7012, + "step": 3820 + }, + { + "epoch": 0.015832039832254785, + "grad_norm": 4.247988360660198, + "learning_rate": 1.9998289566007535e-05, + "loss": 1.684, + "step": 3830 + }, + { + "epoch": 0.01587337675087686, + "grad_norm": 5.434305269506113, + "learning_rate": 1.999826521920285e-05, + "loss": 1.7673, + "step": 3840 + }, + { + "epoch": 0.015914713669498936, + "grad_norm": 4.617133742171007, + "learning_rate": 1.999824070035591e-05, + "loss": 1.6622, + "step": 3850 + }, + { + "epoch": 0.01595605058812101, + "grad_norm": 3.70746479523289, + "learning_rate": 1.9998216009467136e-05, + "loss": 1.6647, + "step": 3860 + }, + { + "epoch": 0.015997387506743083, + "grad_norm": 4.604026510578186, + "learning_rate": 1.999819114653696e-05, + "loss": 1.6772, + "step": 3870 + }, + { + "epoch": 0.01603872442536516, + "grad_norm": 3.8606213125382642, + "learning_rate": 1.9998166111565804e-05, + "loss": 1.694, + "step": 3880 + }, + { + "epoch": 0.016080061343987234, + "grad_norm": 5.0244652420608045, + "learning_rate": 1.99981409045541e-05, + "loss": 1.7797, + "step": 3890 + }, + { + "epoch": 0.01612139826260931, + "grad_norm": 4.707739461519922, + "learning_rate": 1.999811552550228e-05, + "loss": 1.7159, + "step": 3900 + }, + { + "epoch": 0.016162735181231385, + "grad_norm": 3.9677147576335043, + "learning_rate": 1.9998089974410782e-05, + "loss": 1.6708, + "step": 3910 + }, + { + "epoch": 0.016204072099853462, + "grad_norm": 4.311084704937728, + "learning_rate": 1.9998064251280048e-05, + "loss": 1.7109, + "step": 3920 + }, + { + "epoch": 0.016245409018475536, + "grad_norm": 3.9457174661249534, + "learning_rate": 1.999803835611052e-05, + "loss": 1.6713, + "step": 3930 + }, + { + "epoch": 0.01628674593709761, + "grad_norm": 3.947531059176682, + "learning_rate": 1.999801228890264e-05, + "loss": 1.6796, + "step": 3940 + }, + { + "epoch": 0.016328082855719687, + "grad_norm": 4.14663907999712, + "learning_rate": 1.9997986049656858e-05, + "loss": 1.6452, + "step": 3950 + }, + { + "epoch": 0.01636941977434176, + "grad_norm": 3.897276226226099, + "learning_rate": 1.9997959638373626e-05, + "loss": 1.6507, + "step": 3960 + }, + { + "epoch": 0.016410756692963838, + "grad_norm": 3.778326978683171, + "learning_rate": 1.9997933055053402e-05, + "loss": 1.7378, + "step": 3970 + }, + { + "epoch": 0.01645209361158591, + "grad_norm": 4.014730222130603, + "learning_rate": 1.9997906299696635e-05, + "loss": 1.6651, + "step": 3980 + }, + { + "epoch": 0.016493430530207985, + "grad_norm": 3.8164751076978223, + "learning_rate": 1.9997879372303797e-05, + "loss": 1.7007, + "step": 3990 + }, + { + "epoch": 0.016534767448830062, + "grad_norm": 3.922371704332535, + "learning_rate": 1.999785227287534e-05, + "loss": 1.7161, + "step": 4000 + }, + { + "epoch": 0.016576104367452136, + "grad_norm": 3.934785675300376, + "learning_rate": 1.9997825001411738e-05, + "loss": 1.6704, + "step": 4010 + }, + { + "epoch": 0.016617441286074213, + "grad_norm": 4.564033996587743, + "learning_rate": 1.9997797557913455e-05, + "loss": 1.6918, + "step": 4020 + }, + { + "epoch": 0.016658778204696287, + "grad_norm": 4.4245567390274365, + "learning_rate": 1.9997769942380968e-05, + "loss": 1.7143, + "step": 4030 + }, + { + "epoch": 0.016700115123318364, + "grad_norm": 3.8624198473379874, + "learning_rate": 1.9997742154814744e-05, + "loss": 1.7298, + "step": 4040 + }, + { + "epoch": 0.016741452041940438, + "grad_norm": 4.010446146589402, + "learning_rate": 1.9997714195215275e-05, + "loss": 1.6851, + "step": 4050 + }, + { + "epoch": 0.01678278896056251, + "grad_norm": 4.139527737935189, + "learning_rate": 1.9997686063583028e-05, + "loss": 1.6597, + "step": 4060 + }, + { + "epoch": 0.01682412587918459, + "grad_norm": 3.617422879629344, + "learning_rate": 1.9997657759918498e-05, + "loss": 1.7078, + "step": 4070 + }, + { + "epoch": 0.016865462797806662, + "grad_norm": 4.492323213426353, + "learning_rate": 1.9997629284222165e-05, + "loss": 1.6521, + "step": 4080 + }, + { + "epoch": 0.01690679971642874, + "grad_norm": 5.007903819964739, + "learning_rate": 1.999760063649452e-05, + "loss": 1.6694, + "step": 4090 + }, + { + "epoch": 0.016948136635050813, + "grad_norm": 4.960862868620129, + "learning_rate": 1.999757181673606e-05, + "loss": 1.68, + "step": 4100 + }, + { + "epoch": 0.01698947355367289, + "grad_norm": 5.878432740559922, + "learning_rate": 1.9997542824947276e-05, + "loss": 1.6736, + "step": 4110 + }, + { + "epoch": 0.017030810472294964, + "grad_norm": 4.440326929426054, + "learning_rate": 1.999751366112867e-05, + "loss": 1.6335, + "step": 4120 + }, + { + "epoch": 0.017072147390917038, + "grad_norm": 4.263618522816504, + "learning_rate": 1.999748432528074e-05, + "loss": 1.7186, + "step": 4130 + }, + { + "epoch": 0.017113484309539115, + "grad_norm": 4.292363992231819, + "learning_rate": 1.9997454817403996e-05, + "loss": 1.6416, + "step": 4140 + }, + { + "epoch": 0.01715482122816119, + "grad_norm": 4.013314862106662, + "learning_rate": 1.9997425137498944e-05, + "loss": 1.723, + "step": 4150 + }, + { + "epoch": 0.017196158146783266, + "grad_norm": 4.07382683143937, + "learning_rate": 1.999739528556609e-05, + "loss": 1.6604, + "step": 4160 + }, + { + "epoch": 0.01723749506540534, + "grad_norm": 4.533516304139438, + "learning_rate": 1.9997365261605957e-05, + "loss": 1.6683, + "step": 4170 + }, + { + "epoch": 0.017278831984027413, + "grad_norm": 5.114666733039835, + "learning_rate": 1.999733506561905e-05, + "loss": 1.6925, + "step": 4180 + }, + { + "epoch": 0.01732016890264949, + "grad_norm": 3.895641699630939, + "learning_rate": 1.99973046976059e-05, + "loss": 1.6743, + "step": 4190 + }, + { + "epoch": 0.017361505821271564, + "grad_norm": 3.9125805892169465, + "learning_rate": 1.9997274157567025e-05, + "loss": 1.6823, + "step": 4200 + }, + { + "epoch": 0.01740284273989364, + "grad_norm": 4.530982763817902, + "learning_rate": 1.999724344550295e-05, + "loss": 1.666, + "step": 4210 + }, + { + "epoch": 0.017444179658515715, + "grad_norm": 4.806928145874966, + "learning_rate": 1.9997212561414198e-05, + "loss": 1.7254, + "step": 4220 + }, + { + "epoch": 0.017485516577137792, + "grad_norm": 3.9697720655534483, + "learning_rate": 1.999718150530131e-05, + "loss": 1.6241, + "step": 4230 + }, + { + "epoch": 0.017526853495759866, + "grad_norm": 4.257480914158059, + "learning_rate": 1.9997150277164815e-05, + "loss": 1.6346, + "step": 4240 + }, + { + "epoch": 0.01756819041438194, + "grad_norm": 3.799531767116148, + "learning_rate": 1.999711887700525e-05, + "loss": 1.6296, + "step": 4250 + }, + { + "epoch": 0.017609527333004017, + "grad_norm": 3.802902072405634, + "learning_rate": 1.999708730482316e-05, + "loss": 1.6296, + "step": 4260 + }, + { + "epoch": 0.01765086425162609, + "grad_norm": 5.118064089629252, + "learning_rate": 1.9997055560619082e-05, + "loss": 1.643, + "step": 4270 + }, + { + "epoch": 0.017692201170248167, + "grad_norm": 4.227158901611068, + "learning_rate": 1.9997023644393567e-05, + "loss": 1.6698, + "step": 4280 + }, + { + "epoch": 0.01773353808887024, + "grad_norm": 4.238927562799819, + "learning_rate": 1.9996991556147166e-05, + "loss": 1.653, + "step": 4290 + }, + { + "epoch": 0.017774875007492315, + "grad_norm": 4.204830304370112, + "learning_rate": 1.9996959295880423e-05, + "loss": 1.6844, + "step": 4300 + }, + { + "epoch": 0.017816211926114392, + "grad_norm": 4.097133417277415, + "learning_rate": 1.99969268635939e-05, + "loss": 1.6212, + "step": 4310 + }, + { + "epoch": 0.017857548844736466, + "grad_norm": 4.65335395814053, + "learning_rate": 1.999689425928815e-05, + "loss": 1.6882, + "step": 4320 + }, + { + "epoch": 0.017898885763358543, + "grad_norm": 4.112571966210029, + "learning_rate": 1.999686148296374e-05, + "loss": 1.6929, + "step": 4330 + }, + { + "epoch": 0.017940222681980617, + "grad_norm": 5.088602258322444, + "learning_rate": 1.999682853462123e-05, + "loss": 1.6648, + "step": 4340 + }, + { + "epoch": 0.017981559600602694, + "grad_norm": 3.9480572889086147, + "learning_rate": 1.9996795414261186e-05, + "loss": 1.5896, + "step": 4350 + }, + { + "epoch": 0.018022896519224767, + "grad_norm": 4.8104711694243, + "learning_rate": 1.9996762121884186e-05, + "loss": 1.6709, + "step": 4360 + }, + { + "epoch": 0.01806423343784684, + "grad_norm": 5.388396623467715, + "learning_rate": 1.999672865749079e-05, + "loss": 1.6716, + "step": 4370 + }, + { + "epoch": 0.01810557035646892, + "grad_norm": 4.279793170082693, + "learning_rate": 1.9996695021081584e-05, + "loss": 1.632, + "step": 4380 + }, + { + "epoch": 0.018146907275090992, + "grad_norm": 4.624743597271427, + "learning_rate": 1.999666121265714e-05, + "loss": 1.6054, + "step": 4390 + }, + { + "epoch": 0.01818824419371307, + "grad_norm": 4.133320200289432, + "learning_rate": 1.9996627232218048e-05, + "loss": 1.6418, + "step": 4400 + }, + { + "epoch": 0.018229581112335143, + "grad_norm": 4.0963463496824986, + "learning_rate": 1.9996593079764884e-05, + "loss": 1.6683, + "step": 4410 + }, + { + "epoch": 0.01827091803095722, + "grad_norm": 4.03547359932741, + "learning_rate": 1.9996558755298238e-05, + "loss": 1.5996, + "step": 4420 + }, + { + "epoch": 0.018312254949579294, + "grad_norm": 4.156363997210419, + "learning_rate": 1.9996524258818706e-05, + "loss": 1.6471, + "step": 4430 + }, + { + "epoch": 0.018353591868201367, + "grad_norm": 4.075479637959615, + "learning_rate": 1.9996489590326874e-05, + "loss": 1.5989, + "step": 4440 + }, + { + "epoch": 0.018394928786823445, + "grad_norm": 4.63601174765512, + "learning_rate": 1.9996454749823345e-05, + "loss": 1.6642, + "step": 4450 + }, + { + "epoch": 0.018436265705445518, + "grad_norm": 3.760851338042477, + "learning_rate": 1.9996419737308715e-05, + "loss": 1.6579, + "step": 4460 + }, + { + "epoch": 0.018477602624067595, + "grad_norm": 3.979536768168784, + "learning_rate": 1.9996384552783588e-05, + "loss": 1.6006, + "step": 4470 + }, + { + "epoch": 0.01851893954268967, + "grad_norm": 4.246767902971398, + "learning_rate": 1.9996349196248563e-05, + "loss": 1.6715, + "step": 4480 + }, + { + "epoch": 0.018560276461311743, + "grad_norm": 4.26779353731614, + "learning_rate": 1.999631366770426e-05, + "loss": 1.6859, + "step": 4490 + }, + { + "epoch": 0.01860161337993382, + "grad_norm": 4.049582440808523, + "learning_rate": 1.9996277967151283e-05, + "loss": 1.6882, + "step": 4500 + }, + { + "epoch": 0.018642950298555894, + "grad_norm": 4.066185344313316, + "learning_rate": 1.9996242094590248e-05, + "loss": 1.6601, + "step": 4510 + }, + { + "epoch": 0.01868428721717797, + "grad_norm": 3.7309702600230494, + "learning_rate": 1.9996206050021768e-05, + "loss": 1.6453, + "step": 4520 + }, + { + "epoch": 0.018725624135800045, + "grad_norm": 4.307728435051617, + "learning_rate": 1.9996169833446473e-05, + "loss": 1.6728, + "step": 4530 + }, + { + "epoch": 0.01876696105442212, + "grad_norm": 3.892468749865279, + "learning_rate": 1.9996133444864974e-05, + "loss": 1.6996, + "step": 4540 + }, + { + "epoch": 0.018808297973044195, + "grad_norm": 4.172694653615993, + "learning_rate": 1.999609688427791e-05, + "loss": 1.6519, + "step": 4550 + }, + { + "epoch": 0.01884963489166627, + "grad_norm": 4.211392128772361, + "learning_rate": 1.9996060151685895e-05, + "loss": 1.6096, + "step": 4560 + }, + { + "epoch": 0.018890971810288346, + "grad_norm": 4.728429773380645, + "learning_rate": 1.9996023247089576e-05, + "loss": 1.6217, + "step": 4570 + }, + { + "epoch": 0.01893230872891042, + "grad_norm": 3.7603074265755745, + "learning_rate": 1.999598617048958e-05, + "loss": 1.617, + "step": 4580 + }, + { + "epoch": 0.018973645647532497, + "grad_norm": 4.5264911357846165, + "learning_rate": 1.9995948921886547e-05, + "loss": 1.6009, + "step": 4590 + }, + { + "epoch": 0.01901498256615457, + "grad_norm": 4.285402551531064, + "learning_rate": 1.999591150128112e-05, + "loss": 1.6666, + "step": 4600 + }, + { + "epoch": 0.019056319484776648, + "grad_norm": 4.528562163332608, + "learning_rate": 1.9995873908673936e-05, + "loss": 1.6967, + "step": 4610 + }, + { + "epoch": 0.01909765640339872, + "grad_norm": 4.331142545150304, + "learning_rate": 1.999583614406565e-05, + "loss": 1.6387, + "step": 4620 + }, + { + "epoch": 0.019138993322020795, + "grad_norm": 4.277497333006759, + "learning_rate": 1.9995798207456906e-05, + "loss": 1.6407, + "step": 4630 + }, + { + "epoch": 0.019180330240642873, + "grad_norm": 4.236531733677237, + "learning_rate": 1.999576009884836e-05, + "loss": 1.6528, + "step": 4640 + }, + { + "epoch": 0.019221667159264946, + "grad_norm": 4.1527404087837825, + "learning_rate": 1.9995721818240664e-05, + "loss": 1.6386, + "step": 4650 + }, + { + "epoch": 0.019263004077887023, + "grad_norm": 4.21734134516066, + "learning_rate": 1.999568336563448e-05, + "loss": 1.6531, + "step": 4660 + }, + { + "epoch": 0.019304340996509097, + "grad_norm": 4.010277949791672, + "learning_rate": 1.999564474103047e-05, + "loss": 1.6125, + "step": 4670 + }, + { + "epoch": 0.01934567791513117, + "grad_norm": 4.974363400314765, + "learning_rate": 1.99956059444293e-05, + "loss": 1.6562, + "step": 4680 + }, + { + "epoch": 0.019387014833753248, + "grad_norm": 3.461845715262989, + "learning_rate": 1.999556697583163e-05, + "loss": 1.6715, + "step": 4690 + }, + { + "epoch": 0.01942835175237532, + "grad_norm": 4.501289760535044, + "learning_rate": 1.999552783523814e-05, + "loss": 1.6276, + "step": 4700 + }, + { + "epoch": 0.0194696886709974, + "grad_norm": 3.980526992455661, + "learning_rate": 1.99954885226495e-05, + "loss": 1.6512, + "step": 4710 + }, + { + "epoch": 0.019511025589619473, + "grad_norm": 4.754361998561602, + "learning_rate": 1.9995449038066385e-05, + "loss": 1.6563, + "step": 4720 + }, + { + "epoch": 0.01955236250824155, + "grad_norm": 3.962924389993788, + "learning_rate": 1.9995409381489473e-05, + "loss": 1.5921, + "step": 4730 + }, + { + "epoch": 0.019593699426863623, + "grad_norm": 4.230038259640959, + "learning_rate": 1.999536955291945e-05, + "loss": 1.6266, + "step": 4740 + }, + { + "epoch": 0.019635036345485697, + "grad_norm": 3.4637303252434863, + "learning_rate": 1.9995329552356996e-05, + "loss": 1.5613, + "step": 4750 + }, + { + "epoch": 0.019676373264107774, + "grad_norm": 4.180047059414082, + "learning_rate": 1.999528937980281e-05, + "loss": 1.6358, + "step": 4760 + }, + { + "epoch": 0.019717710182729848, + "grad_norm": 4.407688478601427, + "learning_rate": 1.9995249035257572e-05, + "loss": 1.6276, + "step": 4770 + }, + { + "epoch": 0.019759047101351925, + "grad_norm": 5.682179019619738, + "learning_rate": 1.999520851872198e-05, + "loss": 1.6339, + "step": 4780 + }, + { + "epoch": 0.019800384019974, + "grad_norm": 5.80296950656401, + "learning_rate": 1.9995167830196732e-05, + "loss": 1.6735, + "step": 4790 + }, + { + "epoch": 0.019841720938596073, + "grad_norm": 4.788010741660107, + "learning_rate": 1.999512696968253e-05, + "loss": 1.6183, + "step": 4800 + }, + { + "epoch": 0.01988305785721815, + "grad_norm": 3.2823877198029683, + "learning_rate": 1.9995085937180075e-05, + "loss": 1.6314, + "step": 4810 + }, + { + "epoch": 0.019924394775840223, + "grad_norm": 4.513204723991569, + "learning_rate": 1.9995044732690074e-05, + "loss": 1.6558, + "step": 4820 + }, + { + "epoch": 0.0199657316944623, + "grad_norm": 3.710887033971277, + "learning_rate": 1.999500335621323e-05, + "loss": 1.6339, + "step": 4830 + }, + { + "epoch": 0.020007068613084374, + "grad_norm": 3.914180149814728, + "learning_rate": 1.9994961807750264e-05, + "loss": 1.6263, + "step": 4840 + }, + { + "epoch": 0.02004840553170645, + "grad_norm": 4.149254446951243, + "learning_rate": 1.999492008730189e-05, + "loss": 1.6276, + "step": 4850 + }, + { + "epoch": 0.020089742450328525, + "grad_norm": 3.8520876610172756, + "learning_rate": 1.9994878194868817e-05, + "loss": 1.6168, + "step": 4860 + }, + { + "epoch": 0.0201310793689506, + "grad_norm": 4.315135033151227, + "learning_rate": 1.9994836130451777e-05, + "loss": 1.6799, + "step": 4870 + }, + { + "epoch": 0.020172416287572676, + "grad_norm": 4.299172694880712, + "learning_rate": 1.9994793894051483e-05, + "loss": 1.6094, + "step": 4880 + }, + { + "epoch": 0.02021375320619475, + "grad_norm": 3.9099719074716974, + "learning_rate": 1.999475148566867e-05, + "loss": 1.6002, + "step": 4890 + }, + { + "epoch": 0.020255090124816827, + "grad_norm": 3.621204913700773, + "learning_rate": 1.9994708905304066e-05, + "loss": 1.627, + "step": 4900 + }, + { + "epoch": 0.0202964270434389, + "grad_norm": 4.002608239997497, + "learning_rate": 1.9994666152958403e-05, + "loss": 1.6377, + "step": 4910 + }, + { + "epoch": 0.020337763962060978, + "grad_norm": 3.509839578650558, + "learning_rate": 1.9994623228632413e-05, + "loss": 1.6498, + "step": 4920 + }, + { + "epoch": 0.02037910088068305, + "grad_norm": 3.948041169756955, + "learning_rate": 1.9994580132326843e-05, + "loss": 1.6605, + "step": 4930 + }, + { + "epoch": 0.020420437799305125, + "grad_norm": 3.7588684802290713, + "learning_rate": 1.9994536864042428e-05, + "loss": 1.6845, + "step": 4940 + }, + { + "epoch": 0.020461774717927202, + "grad_norm": 4.867688920782023, + "learning_rate": 1.999449342377991e-05, + "loss": 1.5575, + "step": 4950 + }, + { + "epoch": 0.020503111636549276, + "grad_norm": 4.235921275935457, + "learning_rate": 1.9994449811540044e-05, + "loss": 1.6329, + "step": 4960 + }, + { + "epoch": 0.020544448555171353, + "grad_norm": 5.353004787701509, + "learning_rate": 1.9994406027323578e-05, + "loss": 1.5961, + "step": 4970 + }, + { + "epoch": 0.020585785473793427, + "grad_norm": 4.49092979482084, + "learning_rate": 1.999436207113126e-05, + "loss": 1.6152, + "step": 4980 + }, + { + "epoch": 0.0206271223924155, + "grad_norm": 4.786632872232947, + "learning_rate": 1.9994317942963856e-05, + "loss": 1.5889, + "step": 4990 + }, + { + "epoch": 0.020668459311037578, + "grad_norm": 3.7616100197105324, + "learning_rate": 1.999427364282212e-05, + "loss": 1.6428, + "step": 5000 + }, + { + "epoch": 0.02070979622965965, + "grad_norm": 4.922026489251745, + "learning_rate": 1.999422917070681e-05, + "loss": 1.6404, + "step": 5010 + }, + { + "epoch": 0.02075113314828173, + "grad_norm": 4.51143708824428, + "learning_rate": 1.9994184526618698e-05, + "loss": 1.6532, + "step": 5020 + }, + { + "epoch": 0.020792470066903802, + "grad_norm": 4.104589032058005, + "learning_rate": 1.999413971055855e-05, + "loss": 1.6071, + "step": 5030 + }, + { + "epoch": 0.02083380698552588, + "grad_norm": 4.89262784656072, + "learning_rate": 1.999409472252714e-05, + "loss": 1.6516, + "step": 5040 + }, + { + "epoch": 0.020875143904147953, + "grad_norm": 3.6347037714340122, + "learning_rate": 1.9994049562525235e-05, + "loss": 1.5681, + "step": 5050 + }, + { + "epoch": 0.020916480822770027, + "grad_norm": 3.986687295644655, + "learning_rate": 1.9994004230553616e-05, + "loss": 1.6061, + "step": 5060 + }, + { + "epoch": 0.020957817741392104, + "grad_norm": 5.1196884550128825, + "learning_rate": 1.999395872661307e-05, + "loss": 1.646, + "step": 5070 + }, + { + "epoch": 0.020999154660014178, + "grad_norm": 4.073313251564883, + "learning_rate": 1.9993913050704362e-05, + "loss": 1.5632, + "step": 5080 + }, + { + "epoch": 0.021040491578636255, + "grad_norm": 3.773829349198683, + "learning_rate": 1.99938672028283e-05, + "loss": 1.596, + "step": 5090 + }, + { + "epoch": 0.02108182849725833, + "grad_norm": 5.707286361857388, + "learning_rate": 1.9993821182985655e-05, + "loss": 1.587, + "step": 5100 + }, + { + "epoch": 0.021123165415880402, + "grad_norm": 4.135913165404502, + "learning_rate": 1.9993774991177227e-05, + "loss": 1.6229, + "step": 5110 + }, + { + "epoch": 0.02116450233450248, + "grad_norm": 4.538213401615244, + "learning_rate": 1.9993728627403814e-05, + "loss": 1.5913, + "step": 5120 + }, + { + "epoch": 0.021205839253124553, + "grad_norm": 4.103580788767663, + "learning_rate": 1.9993682091666206e-05, + "loss": 1.6532, + "step": 5130 + }, + { + "epoch": 0.02124717617174663, + "grad_norm": 3.6711472807654064, + "learning_rate": 1.9993635383965205e-05, + "loss": 1.5746, + "step": 5140 + }, + { + "epoch": 0.021288513090368704, + "grad_norm": 5.277279072305559, + "learning_rate": 1.9993588504301623e-05, + "loss": 1.597, + "step": 5150 + }, + { + "epoch": 0.02132985000899078, + "grad_norm": 3.646653216373581, + "learning_rate": 1.9993541452676257e-05, + "loss": 1.6045, + "step": 5160 + }, + { + "epoch": 0.021371186927612855, + "grad_norm": 4.454553625669168, + "learning_rate": 1.999349422908992e-05, + "loss": 1.6168, + "step": 5170 + }, + { + "epoch": 0.02141252384623493, + "grad_norm": 4.408940295701244, + "learning_rate": 1.999344683354343e-05, + "loss": 1.5688, + "step": 5180 + }, + { + "epoch": 0.021453860764857006, + "grad_norm": 4.30626191840598, + "learning_rate": 1.9993399266037593e-05, + "loss": 1.5743, + "step": 5190 + }, + { + "epoch": 0.02149519768347908, + "grad_norm": 3.674456985901954, + "learning_rate": 1.999335152657323e-05, + "loss": 1.5872, + "step": 5200 + }, + { + "epoch": 0.021536534602101157, + "grad_norm": 3.641790233464658, + "learning_rate": 1.9993303615151168e-05, + "loss": 1.5612, + "step": 5210 + }, + { + "epoch": 0.02157787152072323, + "grad_norm": 4.165728119210956, + "learning_rate": 1.9993255531772225e-05, + "loss": 1.59, + "step": 5220 + }, + { + "epoch": 0.021619208439345308, + "grad_norm": 3.8319777859342246, + "learning_rate": 1.9993207276437235e-05, + "loss": 1.5912, + "step": 5230 + }, + { + "epoch": 0.02166054535796738, + "grad_norm": 3.9855756729463168, + "learning_rate": 1.999315884914702e-05, + "loss": 1.58, + "step": 5240 + }, + { + "epoch": 0.021701882276589455, + "grad_norm": 3.8011477722676807, + "learning_rate": 1.999311024990242e-05, + "loss": 1.6003, + "step": 5250 + }, + { + "epoch": 0.021743219195211532, + "grad_norm": 3.985198206647649, + "learning_rate": 1.9993061478704275e-05, + "loss": 1.5986, + "step": 5260 + }, + { + "epoch": 0.021784556113833606, + "grad_norm": 3.9838081605823636, + "learning_rate": 1.9993012535553412e-05, + "loss": 1.6166, + "step": 5270 + }, + { + "epoch": 0.021825893032455683, + "grad_norm": 3.9996617755784043, + "learning_rate": 1.999296342045068e-05, + "loss": 1.5792, + "step": 5280 + }, + { + "epoch": 0.021867229951077757, + "grad_norm": 5.892962480457768, + "learning_rate": 1.9992914133396926e-05, + "loss": 1.6053, + "step": 5290 + }, + { + "epoch": 0.02190856686969983, + "grad_norm": 4.427789486632826, + "learning_rate": 1.9992864674392994e-05, + "loss": 1.6374, + "step": 5300 + }, + { + "epoch": 0.021949903788321908, + "grad_norm": 4.488482688049845, + "learning_rate": 1.9992815043439736e-05, + "loss": 1.6198, + "step": 5310 + }, + { + "epoch": 0.02199124070694398, + "grad_norm": 3.9697984164903035, + "learning_rate": 1.999276524053801e-05, + "loss": 1.6112, + "step": 5320 + }, + { + "epoch": 0.02203257762556606, + "grad_norm": 4.708237856178089, + "learning_rate": 1.9992715265688666e-05, + "loss": 1.569, + "step": 5330 + }, + { + "epoch": 0.022073914544188132, + "grad_norm": 4.180089792931872, + "learning_rate": 1.999266511889257e-05, + "loss": 1.564, + "step": 5340 + }, + { + "epoch": 0.02211525146281021, + "grad_norm": 4.540705844431402, + "learning_rate": 1.9992614800150582e-05, + "loss": 1.6062, + "step": 5350 + }, + { + "epoch": 0.022156588381432283, + "grad_norm": 3.6164199548569256, + "learning_rate": 1.999256430946357e-05, + "loss": 1.614, + "step": 5360 + }, + { + "epoch": 0.022197925300054357, + "grad_norm": 3.815681996528154, + "learning_rate": 1.9992513646832398e-05, + "loss": 1.5836, + "step": 5370 + }, + { + "epoch": 0.022239262218676434, + "grad_norm": 4.806439757203068, + "learning_rate": 1.9992462812257943e-05, + "loss": 1.6162, + "step": 5380 + }, + { + "epoch": 0.022280599137298508, + "grad_norm": 4.354139965343947, + "learning_rate": 1.999241180574108e-05, + "loss": 1.5888, + "step": 5390 + }, + { + "epoch": 0.022321936055920585, + "grad_norm": 4.126817858976234, + "learning_rate": 1.999236062728268e-05, + "loss": 1.5879, + "step": 5400 + }, + { + "epoch": 0.02236327297454266, + "grad_norm": 4.47607737943672, + "learning_rate": 1.9992309276883632e-05, + "loss": 1.6099, + "step": 5410 + }, + { + "epoch": 0.022404609893164732, + "grad_norm": 5.610066619695038, + "learning_rate": 1.9992257754544814e-05, + "loss": 1.593, + "step": 5420 + }, + { + "epoch": 0.02244594681178681, + "grad_norm": 4.2928973652861675, + "learning_rate": 1.9992206060267114e-05, + "loss": 1.5793, + "step": 5430 + }, + { + "epoch": 0.022487283730408883, + "grad_norm": 3.8921859700664325, + "learning_rate": 1.9992154194051422e-05, + "loss": 1.608, + "step": 5440 + }, + { + "epoch": 0.02252862064903096, + "grad_norm": 3.677731550454947, + "learning_rate": 1.999210215589863e-05, + "loss": 1.6151, + "step": 5450 + }, + { + "epoch": 0.022569957567653034, + "grad_norm": 4.200629201423265, + "learning_rate": 1.9992049945809632e-05, + "loss": 1.6246, + "step": 5460 + }, + { + "epoch": 0.02261129448627511, + "grad_norm": 4.064480908765512, + "learning_rate": 1.9991997563785332e-05, + "loss": 1.5607, + "step": 5470 + }, + { + "epoch": 0.022652631404897185, + "grad_norm": 3.5486537855524176, + "learning_rate": 1.9991945009826623e-05, + "loss": 1.5906, + "step": 5480 + }, + { + "epoch": 0.02269396832351926, + "grad_norm": 4.0698465101707, + "learning_rate": 1.9991892283934415e-05, + "loss": 1.5864, + "step": 5490 + }, + { + "epoch": 0.022735305242141336, + "grad_norm": 3.698399389749536, + "learning_rate": 1.9991839386109615e-05, + "loss": 1.593, + "step": 5500 + }, + { + "epoch": 0.02277664216076341, + "grad_norm": 4.854255782396672, + "learning_rate": 1.9991786316353134e-05, + "loss": 1.5961, + "step": 5510 + }, + { + "epoch": 0.022817979079385486, + "grad_norm": 3.5841353274799244, + "learning_rate": 1.9991733074665884e-05, + "loss": 1.5638, + "step": 5520 + }, + { + "epoch": 0.02285931599800756, + "grad_norm": 4.188646894537988, + "learning_rate": 1.9991679661048774e-05, + "loss": 1.5605, + "step": 5530 + }, + { + "epoch": 0.022900652916629637, + "grad_norm": 3.646293980881599, + "learning_rate": 1.9991626075502736e-05, + "loss": 1.5672, + "step": 5540 + }, + { + "epoch": 0.02294198983525171, + "grad_norm": 3.513345408175718, + "learning_rate": 1.999157231802868e-05, + "loss": 1.5228, + "step": 5550 + }, + { + "epoch": 0.022983326753873785, + "grad_norm": 4.22409759900443, + "learning_rate": 1.999151838862754e-05, + "loss": 1.5742, + "step": 5560 + }, + { + "epoch": 0.023024663672495862, + "grad_norm": 3.9606510772786674, + "learning_rate": 1.999146428730024e-05, + "loss": 1.5898, + "step": 5570 + }, + { + "epoch": 0.023066000591117936, + "grad_norm": 4.723833314885466, + "learning_rate": 1.9991410014047713e-05, + "loss": 1.6293, + "step": 5580 + }, + { + "epoch": 0.023107337509740013, + "grad_norm": 3.79738622812003, + "learning_rate": 1.999135556887089e-05, + "loss": 1.5347, + "step": 5590 + }, + { + "epoch": 0.023148674428362086, + "grad_norm": 3.5876021705924277, + "learning_rate": 1.9991300951770712e-05, + "loss": 1.5639, + "step": 5600 + }, + { + "epoch": 0.02319001134698416, + "grad_norm": 4.466727344043237, + "learning_rate": 1.9991246162748116e-05, + "loss": 1.5821, + "step": 5610 + }, + { + "epoch": 0.023231348265606237, + "grad_norm": 4.027485882579859, + "learning_rate": 1.999119120180404e-05, + "loss": 1.5641, + "step": 5620 + }, + { + "epoch": 0.02327268518422831, + "grad_norm": 4.698907728867797, + "learning_rate": 1.9991136068939436e-05, + "loss": 1.5717, + "step": 5630 + }, + { + "epoch": 0.023314022102850388, + "grad_norm": 3.9675562129009534, + "learning_rate": 1.9991080764155254e-05, + "loss": 1.5984, + "step": 5640 + }, + { + "epoch": 0.023355359021472462, + "grad_norm": 4.469330328433558, + "learning_rate": 1.9991025287452442e-05, + "loss": 1.5836, + "step": 5650 + }, + { + "epoch": 0.02339669594009454, + "grad_norm": 4.315359559691392, + "learning_rate": 1.9990969638831955e-05, + "loss": 1.5456, + "step": 5660 + }, + { + "epoch": 0.023438032858716613, + "grad_norm": 3.67958327218992, + "learning_rate": 1.9990913818294753e-05, + "loss": 1.6191, + "step": 5670 + }, + { + "epoch": 0.023479369777338686, + "grad_norm": 4.491956126894857, + "learning_rate": 1.9990857825841793e-05, + "loss": 1.5808, + "step": 5680 + }, + { + "epoch": 0.023520706695960764, + "grad_norm": 3.79674314266457, + "learning_rate": 1.999080166147404e-05, + "loss": 1.5183, + "step": 5690 + }, + { + "epoch": 0.023562043614582837, + "grad_norm": 4.5968252548890245, + "learning_rate": 1.999074532519246e-05, + "loss": 1.5757, + "step": 5700 + }, + { + "epoch": 0.023603380533204914, + "grad_norm": 3.5990834672231284, + "learning_rate": 1.9990688816998025e-05, + "loss": 1.6086, + "step": 5710 + }, + { + "epoch": 0.023644717451826988, + "grad_norm": 4.344410017466151, + "learning_rate": 1.99906321368917e-05, + "loss": 1.6113, + "step": 5720 + }, + { + "epoch": 0.023686054370449065, + "grad_norm": 3.7938891603257603, + "learning_rate": 1.9990575284874473e-05, + "loss": 1.6365, + "step": 5730 + }, + { + "epoch": 0.02372739128907114, + "grad_norm": 3.562057149121525, + "learning_rate": 1.999051826094731e-05, + "loss": 1.5485, + "step": 5740 + }, + { + "epoch": 0.023768728207693213, + "grad_norm": 4.081479989742111, + "learning_rate": 1.99904610651112e-05, + "loss": 1.5689, + "step": 5750 + }, + { + "epoch": 0.02381006512631529, + "grad_norm": 3.759485760858795, + "learning_rate": 1.999040369736712e-05, + "loss": 1.564, + "step": 5760 + }, + { + "epoch": 0.023851402044937364, + "grad_norm": 4.032363621919849, + "learning_rate": 1.9990346157716064e-05, + "loss": 1.6025, + "step": 5770 + }, + { + "epoch": 0.02389273896355944, + "grad_norm": 3.6432323322843403, + "learning_rate": 1.999028844615902e-05, + "loss": 1.5271, + "step": 5780 + }, + { + "epoch": 0.023934075882181514, + "grad_norm": 3.802770545609017, + "learning_rate": 1.9990230562696983e-05, + "loss": 1.5967, + "step": 5790 + }, + { + "epoch": 0.023975412800803588, + "grad_norm": 3.795072573463222, + "learning_rate": 1.9990172507330943e-05, + "loss": 1.5247, + "step": 5800 + }, + { + "epoch": 0.024016749719425665, + "grad_norm": 4.366382080210575, + "learning_rate": 1.99901142800619e-05, + "loss": 1.5781, + "step": 5810 + }, + { + "epoch": 0.02405808663804774, + "grad_norm": 3.9097914526353605, + "learning_rate": 1.9990055880890864e-05, + "loss": 1.6034, + "step": 5820 + }, + { + "epoch": 0.024099423556669816, + "grad_norm": 4.123926255872013, + "learning_rate": 1.9989997309818833e-05, + "loss": 1.5464, + "step": 5830 + }, + { + "epoch": 0.02414076047529189, + "grad_norm": 4.446493191993532, + "learning_rate": 1.9989938566846812e-05, + "loss": 1.5586, + "step": 5840 + }, + { + "epoch": 0.024182097393913967, + "grad_norm": 3.7337639849714233, + "learning_rate": 1.998987965197582e-05, + "loss": 1.5479, + "step": 5850 + }, + { + "epoch": 0.02422343431253604, + "grad_norm": 4.7444952313768525, + "learning_rate": 1.9989820565206865e-05, + "loss": 1.5808, + "step": 5860 + }, + { + "epoch": 0.024264771231158114, + "grad_norm": 4.247725775065283, + "learning_rate": 1.9989761306540966e-05, + "loss": 1.523, + "step": 5870 + }, + { + "epoch": 0.02430610814978019, + "grad_norm": 3.995186643530754, + "learning_rate": 1.998970187597914e-05, + "loss": 1.5785, + "step": 5880 + }, + { + "epoch": 0.024347445068402265, + "grad_norm": 4.816092056889684, + "learning_rate": 1.9989642273522416e-05, + "loss": 1.5746, + "step": 5890 + }, + { + "epoch": 0.024388781987024342, + "grad_norm": 4.290367502884436, + "learning_rate": 1.9989582499171813e-05, + "loss": 1.6119, + "step": 5900 + }, + { + "epoch": 0.024430118905646416, + "grad_norm": 3.5513668937922236, + "learning_rate": 1.9989522552928365e-05, + "loss": 1.5162, + "step": 5910 + }, + { + "epoch": 0.02447145582426849, + "grad_norm": 3.6198772954827665, + "learning_rate": 1.9989462434793096e-05, + "loss": 1.5323, + "step": 5920 + }, + { + "epoch": 0.024512792742890567, + "grad_norm": 3.852832040089439, + "learning_rate": 1.9989402144767046e-05, + "loss": 1.5311, + "step": 5930 + }, + { + "epoch": 0.02455412966151264, + "grad_norm": 3.9519174433535325, + "learning_rate": 1.9989341682851254e-05, + "loss": 1.5429, + "step": 5940 + }, + { + "epoch": 0.024595466580134718, + "grad_norm": 4.87353052847372, + "learning_rate": 1.9989281049046755e-05, + "loss": 1.6002, + "step": 5950 + }, + { + "epoch": 0.02463680349875679, + "grad_norm": 3.3803857370087225, + "learning_rate": 1.9989220243354595e-05, + "loss": 1.5793, + "step": 5960 + }, + { + "epoch": 0.02467814041737887, + "grad_norm": 3.8766963938819075, + "learning_rate": 1.998915926577582e-05, + "loss": 1.533, + "step": 5970 + }, + { + "epoch": 0.024719477336000942, + "grad_norm": 4.09044180490663, + "learning_rate": 1.998909811631148e-05, + "loss": 1.565, + "step": 5980 + }, + { + "epoch": 0.024760814254623016, + "grad_norm": 4.537575506546124, + "learning_rate": 1.998903679496263e-05, + "loss": 1.5523, + "step": 5990 + }, + { + "epoch": 0.024802151173245093, + "grad_norm": 3.4086201638465803, + "learning_rate": 1.9988975301730317e-05, + "loss": 1.5467, + "step": 6000 + }, + { + "epoch": 0.024843488091867167, + "grad_norm": 3.7512592579174244, + "learning_rate": 1.9988913636615608e-05, + "loss": 1.6148, + "step": 6010 + }, + { + "epoch": 0.024884825010489244, + "grad_norm": 3.9606857815229035, + "learning_rate": 1.9988851799619557e-05, + "loss": 1.5529, + "step": 6020 + }, + { + "epoch": 0.024926161929111318, + "grad_norm": 4.87271131926588, + "learning_rate": 1.9988789790743235e-05, + "loss": 1.624, + "step": 6030 + }, + { + "epoch": 0.024967498847733395, + "grad_norm": 4.562111872082575, + "learning_rate": 1.9988727609987705e-05, + "loss": 1.5954, + "step": 6040 + }, + { + "epoch": 0.02500883576635547, + "grad_norm": 4.160920766227917, + "learning_rate": 1.9988665257354035e-05, + "loss": 1.5745, + "step": 6050 + }, + { + "epoch": 0.025050172684977542, + "grad_norm": 3.7976329240225284, + "learning_rate": 1.9988602732843296e-05, + "loss": 1.539, + "step": 6060 + }, + { + "epoch": 0.02509150960359962, + "grad_norm": 3.9348324977710347, + "learning_rate": 1.9988540036456575e-05, + "loss": 1.5802, + "step": 6070 + }, + { + "epoch": 0.025132846522221693, + "grad_norm": 3.3649859713246313, + "learning_rate": 1.998847716819494e-05, + "loss": 1.5349, + "step": 6080 + }, + { + "epoch": 0.02517418344084377, + "grad_norm": 5.035730829505278, + "learning_rate": 1.998841412805948e-05, + "loss": 1.5522, + "step": 6090 + }, + { + "epoch": 0.025215520359465844, + "grad_norm": 4.38089533529463, + "learning_rate": 1.9988350916051272e-05, + "loss": 1.5696, + "step": 6100 + }, + { + "epoch": 0.025256857278087918, + "grad_norm": 4.0458062619048185, + "learning_rate": 1.9988287532171408e-05, + "loss": 1.582, + "step": 6110 + }, + { + "epoch": 0.025298194196709995, + "grad_norm": 5.197316936196237, + "learning_rate": 1.9988223976420983e-05, + "loss": 1.5685, + "step": 6120 + }, + { + "epoch": 0.02533953111533207, + "grad_norm": 3.701848060366763, + "learning_rate": 1.998816024880108e-05, + "loss": 1.568, + "step": 6130 + }, + { + "epoch": 0.025380868033954146, + "grad_norm": 4.576812131388496, + "learning_rate": 1.9988096349312808e-05, + "loss": 1.5925, + "step": 6140 + }, + { + "epoch": 0.02542220495257622, + "grad_norm": 3.626416937979281, + "learning_rate": 1.998803227795726e-05, + "loss": 1.6026, + "step": 6150 + }, + { + "epoch": 0.025463541871198297, + "grad_norm": 3.7415000301009016, + "learning_rate": 1.9987968034735535e-05, + "loss": 1.5632, + "step": 6160 + }, + { + "epoch": 0.02550487878982037, + "grad_norm": 4.093809033114078, + "learning_rate": 1.9987903619648745e-05, + "loss": 1.5442, + "step": 6170 + }, + { + "epoch": 0.025546215708442444, + "grad_norm": 3.782350165490308, + "learning_rate": 1.9987839032697995e-05, + "loss": 1.5423, + "step": 6180 + }, + { + "epoch": 0.02558755262706452, + "grad_norm": 3.1897173529667695, + "learning_rate": 1.9987774273884398e-05, + "loss": 1.5332, + "step": 6190 + }, + { + "epoch": 0.025628889545686595, + "grad_norm": 3.9224918301369276, + "learning_rate": 1.9987709343209066e-05, + "loss": 1.5133, + "step": 6200 + }, + { + "epoch": 0.025670226464308672, + "grad_norm": 3.830850927059349, + "learning_rate": 1.9987644240673118e-05, + "loss": 1.555, + "step": 6210 + }, + { + "epoch": 0.025711563382930746, + "grad_norm": 4.0209145103807575, + "learning_rate": 1.9987578966277678e-05, + "loss": 1.5114, + "step": 6220 + }, + { + "epoch": 0.02575290030155282, + "grad_norm": 3.936778500441379, + "learning_rate": 1.998751352002386e-05, + "loss": 1.5257, + "step": 6230 + }, + { + "epoch": 0.025794237220174897, + "grad_norm": 3.7293623909553313, + "learning_rate": 1.9987447901912794e-05, + "loss": 1.5694, + "step": 6240 + }, + { + "epoch": 0.02583557413879697, + "grad_norm": 3.5818490617695575, + "learning_rate": 1.9987382111945614e-05, + "loss": 1.5531, + "step": 6250 + }, + { + "epoch": 0.025876911057419048, + "grad_norm": 3.6430367540575928, + "learning_rate": 1.998731615012345e-05, + "loss": 1.5434, + "step": 6260 + }, + { + "epoch": 0.02591824797604112, + "grad_norm": 4.589782090699277, + "learning_rate": 1.998725001644743e-05, + "loss": 1.557, + "step": 6270 + }, + { + "epoch": 0.0259595848946632, + "grad_norm": 4.283265289643292, + "learning_rate": 1.99871837109187e-05, + "loss": 1.5624, + "step": 6280 + }, + { + "epoch": 0.026000921813285272, + "grad_norm": 4.946779268912546, + "learning_rate": 1.99871172335384e-05, + "loss": 1.5124, + "step": 6290 + }, + { + "epoch": 0.026042258731907346, + "grad_norm": 3.5038967204373694, + "learning_rate": 1.998705058430767e-05, + "loss": 1.5683, + "step": 6300 + }, + { + "epoch": 0.026083595650529423, + "grad_norm": 3.8770191196485886, + "learning_rate": 1.998698376322766e-05, + "loss": 1.5435, + "step": 6310 + }, + { + "epoch": 0.026124932569151497, + "grad_norm": 4.439160488934939, + "learning_rate": 1.998691677029952e-05, + "loss": 1.5295, + "step": 6320 + }, + { + "epoch": 0.026166269487773574, + "grad_norm": 4.092904098781107, + "learning_rate": 1.99868496055244e-05, + "loss": 1.55, + "step": 6330 + }, + { + "epoch": 0.026207606406395648, + "grad_norm": 4.303539009198583, + "learning_rate": 1.9986782268903457e-05, + "loss": 1.5484, + "step": 6340 + }, + { + "epoch": 0.026248943325017725, + "grad_norm": 3.9078194955949916, + "learning_rate": 1.9986714760437853e-05, + "loss": 1.5827, + "step": 6350 + }, + { + "epoch": 0.0262902802436398, + "grad_norm": 4.342380780259694, + "learning_rate": 1.9986647080128746e-05, + "loss": 1.557, + "step": 6360 + }, + { + "epoch": 0.026331617162261872, + "grad_norm": 3.985596918279314, + "learning_rate": 1.99865792279773e-05, + "loss": 1.578, + "step": 6370 + }, + { + "epoch": 0.02637295408088395, + "grad_norm": 3.653720676962278, + "learning_rate": 1.9986511203984683e-05, + "loss": 1.5668, + "step": 6380 + }, + { + "epoch": 0.026414290999506023, + "grad_norm": 3.751487721843964, + "learning_rate": 1.998644300815207e-05, + "loss": 1.5305, + "step": 6390 + }, + { + "epoch": 0.0264556279181281, + "grad_norm": 4.009883655861525, + "learning_rate": 1.9986374640480627e-05, + "loss": 1.5495, + "step": 6400 + }, + { + "epoch": 0.026496964836750174, + "grad_norm": 3.8266120037819396, + "learning_rate": 1.9986306100971533e-05, + "loss": 1.5255, + "step": 6410 + }, + { + "epoch": 0.026538301755372248, + "grad_norm": 3.6427176903376384, + "learning_rate": 1.9986237389625974e-05, + "loss": 1.5525, + "step": 6420 + }, + { + "epoch": 0.026579638673994325, + "grad_norm": 4.099974786255079, + "learning_rate": 1.998616850644512e-05, + "loss": 1.5424, + "step": 6430 + }, + { + "epoch": 0.0266209755926164, + "grad_norm": 3.8198383445190793, + "learning_rate": 1.998609945143017e-05, + "loss": 1.567, + "step": 6440 + }, + { + "epoch": 0.026662312511238476, + "grad_norm": 4.473728987789235, + "learning_rate": 1.9986030224582302e-05, + "loss": 1.4823, + "step": 6450 + }, + { + "epoch": 0.02670364942986055, + "grad_norm": 4.534257895704366, + "learning_rate": 1.998596082590271e-05, + "loss": 1.5712, + "step": 6460 + }, + { + "epoch": 0.026744986348482627, + "grad_norm": 3.644766820996961, + "learning_rate": 1.998589125539259e-05, + "loss": 1.4863, + "step": 6470 + }, + { + "epoch": 0.0267863232671047, + "grad_norm": 3.5999094373664526, + "learning_rate": 1.9985821513053137e-05, + "loss": 1.5326, + "step": 6480 + }, + { + "epoch": 0.026827660185726774, + "grad_norm": 4.1425167904001565, + "learning_rate": 1.9985751598885552e-05, + "loss": 1.5378, + "step": 6490 + }, + { + "epoch": 0.02686899710434885, + "grad_norm": 4.493755521920297, + "learning_rate": 1.998568151289104e-05, + "loss": 1.559, + "step": 6500 + }, + { + "epoch": 0.026910334022970925, + "grad_norm": 3.8756107746153896, + "learning_rate": 1.9985611255070806e-05, + "loss": 1.556, + "step": 6510 + }, + { + "epoch": 0.026951670941593002, + "grad_norm": 4.038084156630909, + "learning_rate": 1.9985540825426055e-05, + "loss": 1.5645, + "step": 6520 + }, + { + "epoch": 0.026993007860215076, + "grad_norm": 3.5809330138239392, + "learning_rate": 1.9985470223958e-05, + "loss": 1.5548, + "step": 6530 + }, + { + "epoch": 0.027034344778837153, + "grad_norm": 3.8676960332502963, + "learning_rate": 1.998539945066786e-05, + "loss": 1.5276, + "step": 6540 + }, + { + "epoch": 0.027075681697459227, + "grad_norm": 3.4449297103173593, + "learning_rate": 1.9985328505556852e-05, + "loss": 1.5651, + "step": 6550 + }, + { + "epoch": 0.0271170186160813, + "grad_norm": 4.288962654330007, + "learning_rate": 1.9985257388626196e-05, + "loss": 1.4996, + "step": 6560 + }, + { + "epoch": 0.027158355534703377, + "grad_norm": 4.128501380292175, + "learning_rate": 1.9985186099877112e-05, + "loss": 1.5419, + "step": 6570 + }, + { + "epoch": 0.02719969245332545, + "grad_norm": 4.575002926290647, + "learning_rate": 1.998511463931083e-05, + "loss": 1.5515, + "step": 6580 + }, + { + "epoch": 0.02724102937194753, + "grad_norm": 3.821068699345836, + "learning_rate": 1.998504300692858e-05, + "loss": 1.5233, + "step": 6590 + }, + { + "epoch": 0.027282366290569602, + "grad_norm": 3.2424897263365016, + "learning_rate": 1.9984971202731596e-05, + "loss": 1.5479, + "step": 6600 + }, + { + "epoch": 0.027323703209191676, + "grad_norm": 4.834170577555974, + "learning_rate": 1.9984899226721107e-05, + "loss": 1.5502, + "step": 6610 + }, + { + "epoch": 0.027365040127813753, + "grad_norm": 3.764169873278093, + "learning_rate": 1.998482707889836e-05, + "loss": 1.5891, + "step": 6620 + }, + { + "epoch": 0.027406377046435827, + "grad_norm": 3.414782018354158, + "learning_rate": 1.998475475926459e-05, + "loss": 1.5159, + "step": 6630 + }, + { + "epoch": 0.027447713965057904, + "grad_norm": 4.666184759190313, + "learning_rate": 1.9984682267821046e-05, + "loss": 1.5628, + "step": 6640 + }, + { + "epoch": 0.027489050883679977, + "grad_norm": 3.5574578914802943, + "learning_rate": 1.998460960456897e-05, + "loss": 1.5315, + "step": 6650 + }, + { + "epoch": 0.027530387802302055, + "grad_norm": 4.817382615755327, + "learning_rate": 1.9984536769509615e-05, + "loss": 1.5081, + "step": 6660 + }, + { + "epoch": 0.02757172472092413, + "grad_norm": 4.458867619168575, + "learning_rate": 1.998446376264424e-05, + "loss": 1.5099, + "step": 6670 + }, + { + "epoch": 0.027613061639546202, + "grad_norm": 4.929644851290023, + "learning_rate": 1.9984390583974093e-05, + "loss": 1.5122, + "step": 6680 + }, + { + "epoch": 0.02765439855816828, + "grad_norm": 4.625479741961043, + "learning_rate": 1.9984317233500435e-05, + "loss": 1.5516, + "step": 6690 + }, + { + "epoch": 0.027695735476790353, + "grad_norm": 4.116997057727521, + "learning_rate": 1.9984243711224535e-05, + "loss": 1.5376, + "step": 6700 + }, + { + "epoch": 0.02773707239541243, + "grad_norm": 3.5829728345047314, + "learning_rate": 1.998417001714765e-05, + "loss": 1.5175, + "step": 6710 + }, + { + "epoch": 0.027778409314034504, + "grad_norm": 3.9734979789101996, + "learning_rate": 1.9984096151271048e-05, + "loss": 1.4871, + "step": 6720 + }, + { + "epoch": 0.027819746232656577, + "grad_norm": 3.430905345503222, + "learning_rate": 1.9984022113596003e-05, + "loss": 1.5413, + "step": 6730 + }, + { + "epoch": 0.027861083151278655, + "grad_norm": 4.2373775185116465, + "learning_rate": 1.998394790412379e-05, + "loss": 1.508, + "step": 6740 + }, + { + "epoch": 0.027902420069900728, + "grad_norm": 3.973156898260741, + "learning_rate": 1.9983873522855684e-05, + "loss": 1.5283, + "step": 6750 + }, + { + "epoch": 0.027943756988522805, + "grad_norm": 3.714285928818181, + "learning_rate": 1.9983798969792966e-05, + "loss": 1.5362, + "step": 6760 + }, + { + "epoch": 0.02798509390714488, + "grad_norm": 4.251009623472971, + "learning_rate": 1.9983724244936916e-05, + "loss": 1.5282, + "step": 6770 + }, + { + "epoch": 0.028026430825766956, + "grad_norm": 3.886447728722872, + "learning_rate": 1.9983649348288825e-05, + "loss": 1.5719, + "step": 6780 + }, + { + "epoch": 0.02806776774438903, + "grad_norm": 3.5572049346515757, + "learning_rate": 1.9983574279849977e-05, + "loss": 1.5302, + "step": 6790 + }, + { + "epoch": 0.028109104663011104, + "grad_norm": 3.688224659646708, + "learning_rate": 1.9983499039621667e-05, + "loss": 1.5132, + "step": 6800 + }, + { + "epoch": 0.02815044158163318, + "grad_norm": 3.986436014630721, + "learning_rate": 1.998342362760519e-05, + "loss": 1.5045, + "step": 6810 + }, + { + "epoch": 0.028191778500255255, + "grad_norm": 4.042321521286428, + "learning_rate": 1.998334804380184e-05, + "loss": 1.52, + "step": 6820 + }, + { + "epoch": 0.02823311541887733, + "grad_norm": 3.936708692378881, + "learning_rate": 1.9983272288212917e-05, + "loss": 1.5208, + "step": 6830 + }, + { + "epoch": 0.028274452337499405, + "grad_norm": 4.692602969518199, + "learning_rate": 1.998319636083973e-05, + "loss": 1.5667, + "step": 6840 + }, + { + "epoch": 0.028315789256121483, + "grad_norm": 3.9313878236010598, + "learning_rate": 1.9983120261683582e-05, + "loss": 1.5831, + "step": 6850 + }, + { + "epoch": 0.028357126174743556, + "grad_norm": 3.551615629888668, + "learning_rate": 1.9983043990745784e-05, + "loss": 1.5308, + "step": 6860 + }, + { + "epoch": 0.02839846309336563, + "grad_norm": 4.6846872186437905, + "learning_rate": 1.9982967548027645e-05, + "loss": 1.4921, + "step": 6870 + }, + { + "epoch": 0.028439800011987707, + "grad_norm": 4.130277420309701, + "learning_rate": 1.9982890933530482e-05, + "loss": 1.4943, + "step": 6880 + }, + { + "epoch": 0.02848113693060978, + "grad_norm": 4.84212625045545, + "learning_rate": 1.9982814147255617e-05, + "loss": 1.5353, + "step": 6890 + }, + { + "epoch": 0.028522473849231858, + "grad_norm": 4.484005829649726, + "learning_rate": 1.9982737189204367e-05, + "loss": 1.5051, + "step": 6900 + }, + { + "epoch": 0.02856381076785393, + "grad_norm": 4.60797848842926, + "learning_rate": 1.998266005937806e-05, + "loss": 1.5816, + "step": 6910 + }, + { + "epoch": 0.028605147686476005, + "grad_norm": 4.6416638061065685, + "learning_rate": 1.998258275777802e-05, + "loss": 1.4904, + "step": 6920 + }, + { + "epoch": 0.028646484605098083, + "grad_norm": 3.477343383643086, + "learning_rate": 1.9982505284405574e-05, + "loss": 1.4904, + "step": 6930 + }, + { + "epoch": 0.028687821523720156, + "grad_norm": 3.6088868644511427, + "learning_rate": 1.9982427639262065e-05, + "loss": 1.5314, + "step": 6940 + }, + { + "epoch": 0.028729158442342233, + "grad_norm": 3.5925096362200333, + "learning_rate": 1.9982349822348816e-05, + "loss": 1.5714, + "step": 6950 + }, + { + "epoch": 0.028770495360964307, + "grad_norm": 3.2379164637124855, + "learning_rate": 1.9982271833667178e-05, + "loss": 1.538, + "step": 6960 + }, + { + "epoch": 0.028811832279586384, + "grad_norm": 4.70836437147594, + "learning_rate": 1.9982193673218487e-05, + "loss": 1.5221, + "step": 6970 + }, + { + "epoch": 0.028853169198208458, + "grad_norm": 4.481706228912118, + "learning_rate": 1.9982115341004088e-05, + "loss": 1.5313, + "step": 6980 + }, + { + "epoch": 0.02889450611683053, + "grad_norm": 4.687318727733607, + "learning_rate": 1.9982036837025332e-05, + "loss": 1.5051, + "step": 6990 + }, + { + "epoch": 0.02893584303545261, + "grad_norm": 4.598444922768698, + "learning_rate": 1.998195816128357e-05, + "loss": 1.4934, + "step": 7000 + }, + { + "epoch": 0.028977179954074683, + "grad_norm": 3.9891444926266555, + "learning_rate": 1.9981879313780145e-05, + "loss": 1.5511, + "step": 7010 + }, + { + "epoch": 0.02901851687269676, + "grad_norm": 4.416153022094333, + "learning_rate": 1.998180029451643e-05, + "loss": 1.5097, + "step": 7020 + }, + { + "epoch": 0.029059853791318833, + "grad_norm": 3.798723984502823, + "learning_rate": 1.9981721103493775e-05, + "loss": 1.4997, + "step": 7030 + }, + { + "epoch": 0.029101190709940907, + "grad_norm": 3.8640966990883223, + "learning_rate": 1.9981641740713545e-05, + "loss": 1.54, + "step": 7040 + }, + { + "epoch": 0.029142527628562984, + "grad_norm": 3.9923946800397436, + "learning_rate": 1.9981562206177104e-05, + "loss": 1.5511, + "step": 7050 + }, + { + "epoch": 0.029183864547185058, + "grad_norm": 3.451062899533455, + "learning_rate": 1.998148249988582e-05, + "loss": 1.5094, + "step": 7060 + }, + { + "epoch": 0.029225201465807135, + "grad_norm": 3.608872589804411, + "learning_rate": 1.998140262184107e-05, + "loss": 1.5162, + "step": 7070 + }, + { + "epoch": 0.02926653838442921, + "grad_norm": 4.033124276978576, + "learning_rate": 1.998132257204422e-05, + "loss": 1.4959, + "step": 7080 + }, + { + "epoch": 0.029307875303051286, + "grad_norm": 3.6841192145575397, + "learning_rate": 1.9981242350496656e-05, + "loss": 1.5223, + "step": 7090 + }, + { + "epoch": 0.02934921222167336, + "grad_norm": 3.5915932028439226, + "learning_rate": 1.9981161957199754e-05, + "loss": 1.5257, + "step": 7100 + }, + { + "epoch": 0.029390549140295433, + "grad_norm": 3.870756947521686, + "learning_rate": 1.9981081392154898e-05, + "loss": 1.4904, + "step": 7110 + }, + { + "epoch": 0.02943188605891751, + "grad_norm": 4.22816037060308, + "learning_rate": 1.9981000655363473e-05, + "loss": 1.4982, + "step": 7120 + }, + { + "epoch": 0.029473222977539584, + "grad_norm": 3.808092113157194, + "learning_rate": 1.9980919746826872e-05, + "loss": 1.519, + "step": 7130 + }, + { + "epoch": 0.02951455989616166, + "grad_norm": 3.709300764256846, + "learning_rate": 1.9980838666546483e-05, + "loss": 1.5533, + "step": 7140 + }, + { + "epoch": 0.029555896814783735, + "grad_norm": 3.4732849314384127, + "learning_rate": 1.9980757414523704e-05, + "loss": 1.5633, + "step": 7150 + }, + { + "epoch": 0.029597233733405812, + "grad_norm": 3.713987413842907, + "learning_rate": 1.998067599075993e-05, + "loss": 1.5201, + "step": 7160 + }, + { + "epoch": 0.029638570652027886, + "grad_norm": 4.113428203135297, + "learning_rate": 1.9980594395256564e-05, + "loss": 1.4594, + "step": 7170 + }, + { + "epoch": 0.02967990757064996, + "grad_norm": 2.9935199155014285, + "learning_rate": 1.9980512628015014e-05, + "loss": 1.4986, + "step": 7180 + }, + { + "epoch": 0.029721244489272037, + "grad_norm": 4.691195443226011, + "learning_rate": 1.998043068903668e-05, + "loss": 1.5357, + "step": 7190 + }, + { + "epoch": 0.02976258140789411, + "grad_norm": 3.4891344421625647, + "learning_rate": 1.9980348578322973e-05, + "loss": 1.5306, + "step": 7200 + }, + { + "epoch": 0.029803918326516188, + "grad_norm": 5.006688866087417, + "learning_rate": 1.9980266295875313e-05, + "loss": 1.512, + "step": 7210 + }, + { + "epoch": 0.02984525524513826, + "grad_norm": 4.053771198338998, + "learning_rate": 1.9980183841695107e-05, + "loss": 1.4794, + "step": 7220 + }, + { + "epoch": 0.029886592163760335, + "grad_norm": 3.352224471660883, + "learning_rate": 1.998010121578378e-05, + "loss": 1.4914, + "step": 7230 + }, + { + "epoch": 0.029927929082382412, + "grad_norm": 3.864235630776693, + "learning_rate": 1.998001841814275e-05, + "loss": 1.5253, + "step": 7240 + }, + { + "epoch": 0.029969266001004486, + "grad_norm": 3.706315048662037, + "learning_rate": 1.997993544877344e-05, + "loss": 1.5509, + "step": 7250 + }, + { + "epoch": 0.030010602919626563, + "grad_norm": 3.5465275464245676, + "learning_rate": 1.9979852307677285e-05, + "loss": 1.5605, + "step": 7260 + }, + { + "epoch": 0.030051939838248637, + "grad_norm": 4.129168530718907, + "learning_rate": 1.997976899485571e-05, + "loss": 1.5204, + "step": 7270 + }, + { + "epoch": 0.030093276756870714, + "grad_norm": 3.500344444775805, + "learning_rate": 1.997968551031015e-05, + "loss": 1.4853, + "step": 7280 + }, + { + "epoch": 0.030134613675492788, + "grad_norm": 3.930695708227359, + "learning_rate": 1.9979601854042044e-05, + "loss": 1.5186, + "step": 7290 + }, + { + "epoch": 0.03017595059411486, + "grad_norm": 3.8784171251524846, + "learning_rate": 1.9979518026052826e-05, + "loss": 1.5031, + "step": 7300 + }, + { + "epoch": 0.03021728751273694, + "grad_norm": 4.264401345696811, + "learning_rate": 1.997943402634394e-05, + "loss": 1.5175, + "step": 7310 + }, + { + "epoch": 0.030258624431359012, + "grad_norm": 3.924430485404916, + "learning_rate": 1.9979349854916836e-05, + "loss": 1.5415, + "step": 7320 + }, + { + "epoch": 0.03029996134998109, + "grad_norm": 5.108018679538745, + "learning_rate": 1.9979265511772958e-05, + "loss": 1.4635, + "step": 7330 + }, + { + "epoch": 0.030341298268603163, + "grad_norm": 3.5308579618903253, + "learning_rate": 1.997918099691376e-05, + "loss": 1.5076, + "step": 7340 + }, + { + "epoch": 0.03038263518722524, + "grad_norm": 5.35750207475576, + "learning_rate": 1.997909631034069e-05, + "loss": 1.5426, + "step": 7350 + }, + { + "epoch": 0.030423972105847314, + "grad_norm": 3.4906620060732645, + "learning_rate": 1.9979011452055216e-05, + "loss": 1.5012, + "step": 7360 + }, + { + "epoch": 0.030465309024469388, + "grad_norm": 3.059087960421475, + "learning_rate": 1.9978926422058788e-05, + "loss": 1.5542, + "step": 7370 + }, + { + "epoch": 0.030506645943091465, + "grad_norm": 3.9845949063681227, + "learning_rate": 1.9978841220352875e-05, + "loss": 1.546, + "step": 7380 + }, + { + "epoch": 0.03054798286171354, + "grad_norm": 3.8610381846741815, + "learning_rate": 1.9978755846938943e-05, + "loss": 1.5437, + "step": 7390 + }, + { + "epoch": 0.030589319780335616, + "grad_norm": 3.2351748039531154, + "learning_rate": 1.9978670301818456e-05, + "loss": 1.4819, + "step": 7400 + }, + { + "epoch": 0.03063065669895769, + "grad_norm": 3.3661408607769823, + "learning_rate": 1.997858458499289e-05, + "loss": 1.4698, + "step": 7410 + }, + { + "epoch": 0.030671993617579763, + "grad_norm": 3.882697263150638, + "learning_rate": 1.997849869646372e-05, + "loss": 1.5216, + "step": 7420 + }, + { + "epoch": 0.03071333053620184, + "grad_norm": 3.574753801928485, + "learning_rate": 1.9978412636232425e-05, + "loss": 1.4803, + "step": 7430 + }, + { + "epoch": 0.030754667454823914, + "grad_norm": 4.443698443398163, + "learning_rate": 1.997832640430048e-05, + "loss": 1.5006, + "step": 7440 + }, + { + "epoch": 0.03079600437344599, + "grad_norm": 4.180214183413532, + "learning_rate": 1.9978240000669377e-05, + "loss": 1.4823, + "step": 7450 + }, + { + "epoch": 0.030837341292068065, + "grad_norm": 4.271849788381411, + "learning_rate": 1.9978153425340596e-05, + "loss": 1.4888, + "step": 7460 + }, + { + "epoch": 0.030878678210690142, + "grad_norm": 3.638691493855987, + "learning_rate": 1.9978066678315634e-05, + "loss": 1.5171, + "step": 7470 + }, + { + "epoch": 0.030920015129312216, + "grad_norm": 4.31218852799806, + "learning_rate": 1.9977979759595972e-05, + "loss": 1.5313, + "step": 7480 + }, + { + "epoch": 0.03096135204793429, + "grad_norm": 4.188985873406339, + "learning_rate": 1.9977892669183115e-05, + "loss": 1.5333, + "step": 7490 + }, + { + "epoch": 0.031002688966556367, + "grad_norm": 4.091422365669455, + "learning_rate": 1.9977805407078563e-05, + "loss": 1.5104, + "step": 7500 + }, + { + "epoch": 0.03104402588517844, + "grad_norm": 3.7904969438772995, + "learning_rate": 1.997771797328381e-05, + "loss": 1.4961, + "step": 7510 + }, + { + "epoch": 0.031085362803800518, + "grad_norm": 4.393169555680857, + "learning_rate": 1.9977630367800366e-05, + "loss": 1.4876, + "step": 7520 + }, + { + "epoch": 0.03112669972242259, + "grad_norm": 3.3377888267921163, + "learning_rate": 1.9977542590629736e-05, + "loss": 1.5107, + "step": 7530 + }, + { + "epoch": 0.031168036641044665, + "grad_norm": 3.6979315229218512, + "learning_rate": 1.9977454641773432e-05, + "loss": 1.4984, + "step": 7540 + }, + { + "epoch": 0.031209373559666742, + "grad_norm": 4.422235985099495, + "learning_rate": 1.9977366521232966e-05, + "loss": 1.5166, + "step": 7550 + }, + { + "epoch": 0.03125071047828882, + "grad_norm": 4.357975123788466, + "learning_rate": 1.9977278229009854e-05, + "loss": 1.5133, + "step": 7560 + }, + { + "epoch": 0.03129204739691089, + "grad_norm": 3.618227619392518, + "learning_rate": 1.997718976510562e-05, + "loss": 1.4872, + "step": 7570 + }, + { + "epoch": 0.03133338431553297, + "grad_norm": 3.6903691829248175, + "learning_rate": 1.9977101129521778e-05, + "loss": 1.4968, + "step": 7580 + }, + { + "epoch": 0.031374721234155044, + "grad_norm": 3.325924769769221, + "learning_rate": 1.997701232225986e-05, + "loss": 1.484, + "step": 7590 + }, + { + "epoch": 0.03141605815277712, + "grad_norm": 3.244599139580768, + "learning_rate": 1.9976923343321388e-05, + "loss": 1.501, + "step": 7600 + }, + { + "epoch": 0.03145739507139919, + "grad_norm": 3.9512458600612224, + "learning_rate": 1.9976834192707898e-05, + "loss": 1.5146, + "step": 7610 + }, + { + "epoch": 0.03149873199002127, + "grad_norm": 6.319637912227645, + "learning_rate": 1.9976744870420925e-05, + "loss": 1.5232, + "step": 7620 + }, + { + "epoch": 0.031540068908643346, + "grad_norm": 3.385728813673924, + "learning_rate": 1.9976655376462003e-05, + "loss": 1.4964, + "step": 7630 + }, + { + "epoch": 0.031581405827265416, + "grad_norm": 3.5621843155206365, + "learning_rate": 1.997656571083267e-05, + "loss": 1.4948, + "step": 7640 + }, + { + "epoch": 0.03162274274588749, + "grad_norm": 3.5127469264785334, + "learning_rate": 1.9976475873534476e-05, + "loss": 1.4788, + "step": 7650 + }, + { + "epoch": 0.03166407966450957, + "grad_norm": 5.123261236833159, + "learning_rate": 1.9976385864568958e-05, + "loss": 1.4906, + "step": 7660 + }, + { + "epoch": 0.03170541658313164, + "grad_norm": 3.0872059709849378, + "learning_rate": 1.997629568393767e-05, + "loss": 1.5013, + "step": 7670 + }, + { + "epoch": 0.03174675350175372, + "grad_norm": 3.516937874543627, + "learning_rate": 1.9976205331642165e-05, + "loss": 1.4802, + "step": 7680 + }, + { + "epoch": 0.031788090420375795, + "grad_norm": 3.7194230277933684, + "learning_rate": 1.9976114807683996e-05, + "loss": 1.4776, + "step": 7690 + }, + { + "epoch": 0.03182942733899787, + "grad_norm": 3.6171028748264016, + "learning_rate": 1.9976024112064718e-05, + "loss": 1.4867, + "step": 7700 + }, + { + "epoch": 0.03187076425761994, + "grad_norm": 4.74734081777917, + "learning_rate": 1.9975933244785894e-05, + "loss": 1.5321, + "step": 7710 + }, + { + "epoch": 0.03191210117624202, + "grad_norm": 3.4998159594245912, + "learning_rate": 1.997584220584909e-05, + "loss": 1.4555, + "step": 7720 + }, + { + "epoch": 0.031953438094864096, + "grad_norm": 4.0664157482197245, + "learning_rate": 1.9975750995255865e-05, + "loss": 1.4982, + "step": 7730 + }, + { + "epoch": 0.03199477501348617, + "grad_norm": 4.448956391697098, + "learning_rate": 1.9975659613007797e-05, + "loss": 1.4877, + "step": 7740 + }, + { + "epoch": 0.032036111932108244, + "grad_norm": 3.9116050665163056, + "learning_rate": 1.9975568059106455e-05, + "loss": 1.51, + "step": 7750 + }, + { + "epoch": 0.03207744885073032, + "grad_norm": 3.8835007718258874, + "learning_rate": 1.9975476333553416e-05, + "loss": 1.5245, + "step": 7760 + }, + { + "epoch": 0.0321187857693524, + "grad_norm": 3.8364373709477215, + "learning_rate": 1.9975384436350254e-05, + "loss": 1.467, + "step": 7770 + }, + { + "epoch": 0.03216012268797447, + "grad_norm": 3.848935245597496, + "learning_rate": 1.9975292367498556e-05, + "loss": 1.4999, + "step": 7780 + }, + { + "epoch": 0.032201459606596546, + "grad_norm": 3.309302500010883, + "learning_rate": 1.99752001269999e-05, + "loss": 1.513, + "step": 7790 + }, + { + "epoch": 0.03224279652521862, + "grad_norm": 3.417545265677783, + "learning_rate": 1.9975107714855875e-05, + "loss": 1.5138, + "step": 7800 + }, + { + "epoch": 0.03228413344384069, + "grad_norm": 4.218881911775881, + "learning_rate": 1.9975015131068078e-05, + "loss": 1.4763, + "step": 7810 + }, + { + "epoch": 0.03232547036246277, + "grad_norm": 3.581778505125333, + "learning_rate": 1.997492237563809e-05, + "loss": 1.5195, + "step": 7820 + }, + { + "epoch": 0.03236680728108485, + "grad_norm": 3.7604938020140226, + "learning_rate": 1.997482944856752e-05, + "loss": 1.4933, + "step": 7830 + }, + { + "epoch": 0.032408144199706924, + "grad_norm": 4.37155325952897, + "learning_rate": 1.997473634985796e-05, + "loss": 1.4557, + "step": 7840 + }, + { + "epoch": 0.032449481118328995, + "grad_norm": 3.3738873014096153, + "learning_rate": 1.9974643079511008e-05, + "loss": 1.5323, + "step": 7850 + }, + { + "epoch": 0.03249081803695107, + "grad_norm": 4.587428592575524, + "learning_rate": 1.9974549637528276e-05, + "loss": 1.5129, + "step": 7860 + }, + { + "epoch": 0.03253215495557315, + "grad_norm": 3.7141451702858252, + "learning_rate": 1.997445602391137e-05, + "loss": 1.5176, + "step": 7870 + }, + { + "epoch": 0.03257349187419522, + "grad_norm": 3.585481088159076, + "learning_rate": 1.9974362238661903e-05, + "loss": 1.5109, + "step": 7880 + }, + { + "epoch": 0.032614828792817296, + "grad_norm": 3.4290626560881936, + "learning_rate": 1.9974268281781484e-05, + "loss": 1.4477, + "step": 7890 + }, + { + "epoch": 0.032656165711439374, + "grad_norm": 3.612891113663602, + "learning_rate": 1.9974174153271728e-05, + "loss": 1.4229, + "step": 7900 + }, + { + "epoch": 0.03269750263006145, + "grad_norm": 3.487845093010647, + "learning_rate": 1.9974079853134266e-05, + "loss": 1.5321, + "step": 7910 + }, + { + "epoch": 0.03273883954868352, + "grad_norm": 5.587764895917656, + "learning_rate": 1.9973985381370707e-05, + "loss": 1.4645, + "step": 7920 + }, + { + "epoch": 0.0327801764673056, + "grad_norm": 3.5490448514590494, + "learning_rate": 1.9973890737982684e-05, + "loss": 1.5374, + "step": 7930 + }, + { + "epoch": 0.032821513385927675, + "grad_norm": 3.94930169954439, + "learning_rate": 1.9973795922971827e-05, + "loss": 1.4661, + "step": 7940 + }, + { + "epoch": 0.032862850304549746, + "grad_norm": 3.494031781745132, + "learning_rate": 1.9973700936339763e-05, + "loss": 1.4291, + "step": 7950 + }, + { + "epoch": 0.03290418722317182, + "grad_norm": 3.0746272722997414, + "learning_rate": 1.9973605778088126e-05, + "loss": 1.5493, + "step": 7960 + }, + { + "epoch": 0.0329455241417939, + "grad_norm": 3.6416998354027736, + "learning_rate": 1.9973510448218558e-05, + "loss": 1.4471, + "step": 7970 + }, + { + "epoch": 0.03298686106041597, + "grad_norm": 3.8385152763033883, + "learning_rate": 1.99734149467327e-05, + "loss": 1.5182, + "step": 7980 + }, + { + "epoch": 0.03302819797903805, + "grad_norm": 3.5463908529630985, + "learning_rate": 1.9973319273632187e-05, + "loss": 1.4848, + "step": 7990 + }, + { + "epoch": 0.033069534897660124, + "grad_norm": 4.494540529736681, + "learning_rate": 1.9973223428918677e-05, + "loss": 1.4656, + "step": 8000 + }, + { + "epoch": 0.0331108718162822, + "grad_norm": 4.138487839617815, + "learning_rate": 1.997312741259381e-05, + "loss": 1.4533, + "step": 8010 + }, + { + "epoch": 0.03315220873490427, + "grad_norm": 3.2836571982439957, + "learning_rate": 1.9973031224659238e-05, + "loss": 1.4637, + "step": 8020 + }, + { + "epoch": 0.03319354565352635, + "grad_norm": 4.101602166042781, + "learning_rate": 1.9972934865116622e-05, + "loss": 1.4656, + "step": 8030 + }, + { + "epoch": 0.033234882572148426, + "grad_norm": 4.372499340047587, + "learning_rate": 1.9972838333967615e-05, + "loss": 1.5377, + "step": 8040 + }, + { + "epoch": 0.033276219490770496, + "grad_norm": 3.984477694550147, + "learning_rate": 1.997274163121388e-05, + "loss": 1.5216, + "step": 8050 + }, + { + "epoch": 0.033317556409392574, + "grad_norm": 3.484892612102319, + "learning_rate": 1.9972644756857087e-05, + "loss": 1.459, + "step": 8060 + }, + { + "epoch": 0.03335889332801465, + "grad_norm": 3.473953984247227, + "learning_rate": 1.9972547710898894e-05, + "loss": 1.4889, + "step": 8070 + }, + { + "epoch": 0.03340023024663673, + "grad_norm": 3.6982560350258384, + "learning_rate": 1.9972450493340973e-05, + "loss": 1.4529, + "step": 8080 + }, + { + "epoch": 0.0334415671652588, + "grad_norm": 4.338255434805353, + "learning_rate": 1.9972353104185e-05, + "loss": 1.4906, + "step": 8090 + }, + { + "epoch": 0.033482904083880875, + "grad_norm": 3.2105273217876897, + "learning_rate": 1.9972255543432644e-05, + "loss": 1.4846, + "step": 8100 + }, + { + "epoch": 0.03352424100250295, + "grad_norm": 3.7063825752894037, + "learning_rate": 1.997215781108559e-05, + "loss": 1.4354, + "step": 8110 + }, + { + "epoch": 0.03356557792112502, + "grad_norm": 3.606082322653538, + "learning_rate": 1.997205990714552e-05, + "loss": 1.5067, + "step": 8120 + }, + { + "epoch": 0.0336069148397471, + "grad_norm": 3.8291971667898643, + "learning_rate": 1.9971961831614116e-05, + "loss": 1.4619, + "step": 8130 + }, + { + "epoch": 0.03364825175836918, + "grad_norm": 3.331721665794406, + "learning_rate": 1.997186358449307e-05, + "loss": 1.4484, + "step": 8140 + }, + { + "epoch": 0.033689588676991254, + "grad_norm": 4.417828154129124, + "learning_rate": 1.9971765165784065e-05, + "loss": 1.508, + "step": 8150 + }, + { + "epoch": 0.033730925595613324, + "grad_norm": 3.782991542711988, + "learning_rate": 1.9971666575488798e-05, + "loss": 1.3925, + "step": 8160 + }, + { + "epoch": 0.0337722625142354, + "grad_norm": 4.079542042860949, + "learning_rate": 1.997156781360897e-05, + "loss": 1.4996, + "step": 8170 + }, + { + "epoch": 0.03381359943285748, + "grad_norm": 4.1626484536651995, + "learning_rate": 1.9971468880146273e-05, + "loss": 1.5178, + "step": 8180 + }, + { + "epoch": 0.03385493635147955, + "grad_norm": 4.122058968257828, + "learning_rate": 1.9971369775102417e-05, + "loss": 1.4267, + "step": 8190 + }, + { + "epoch": 0.033896273270101626, + "grad_norm": 4.017697912034419, + "learning_rate": 1.9971270498479097e-05, + "loss": 1.5129, + "step": 8200 + }, + { + "epoch": 0.0339376101887237, + "grad_norm": 3.3575071407879977, + "learning_rate": 1.997117105027803e-05, + "loss": 1.4796, + "step": 8210 + }, + { + "epoch": 0.03397894710734578, + "grad_norm": 3.9246891678069598, + "learning_rate": 1.9971071430500924e-05, + "loss": 1.5052, + "step": 8220 + }, + { + "epoch": 0.03402028402596785, + "grad_norm": 4.075111279351767, + "learning_rate": 1.9970971639149493e-05, + "loss": 1.4606, + "step": 8230 + }, + { + "epoch": 0.03406162094458993, + "grad_norm": 4.564662754708322, + "learning_rate": 1.997087167622546e-05, + "loss": 1.5111, + "step": 8240 + }, + { + "epoch": 0.034102957863212005, + "grad_norm": 3.753297984489193, + "learning_rate": 1.9970771541730536e-05, + "loss": 1.4899, + "step": 8250 + }, + { + "epoch": 0.034144294781834075, + "grad_norm": 3.913489388979073, + "learning_rate": 1.997067123566645e-05, + "loss": 1.4796, + "step": 8260 + }, + { + "epoch": 0.03418563170045615, + "grad_norm": 3.440711906522703, + "learning_rate": 1.9970570758034924e-05, + "loss": 1.5184, + "step": 8270 + }, + { + "epoch": 0.03422696861907823, + "grad_norm": 3.7595940155205363, + "learning_rate": 1.997047010883769e-05, + "loss": 1.4901, + "step": 8280 + }, + { + "epoch": 0.0342683055377003, + "grad_norm": 4.387403962431217, + "learning_rate": 1.9970369288076478e-05, + "loss": 1.4553, + "step": 8290 + }, + { + "epoch": 0.03430964245632238, + "grad_norm": 3.7034201439850594, + "learning_rate": 1.9970268295753022e-05, + "loss": 1.4534, + "step": 8300 + }, + { + "epoch": 0.034350979374944454, + "grad_norm": 4.0052763549673225, + "learning_rate": 1.9970167131869064e-05, + "loss": 1.4539, + "step": 8310 + }, + { + "epoch": 0.03439231629356653, + "grad_norm": 4.079349801665228, + "learning_rate": 1.9970065796426342e-05, + "loss": 1.4698, + "step": 8320 + }, + { + "epoch": 0.0344336532121886, + "grad_norm": 3.9322937150085875, + "learning_rate": 1.99699642894266e-05, + "loss": 1.4318, + "step": 8330 + }, + { + "epoch": 0.03447499013081068, + "grad_norm": 3.834658918473933, + "learning_rate": 1.9969862610871586e-05, + "loss": 1.4687, + "step": 8340 + }, + { + "epoch": 0.034516327049432756, + "grad_norm": 3.3637972502778912, + "learning_rate": 1.9969760760763045e-05, + "loss": 1.4661, + "step": 8350 + }, + { + "epoch": 0.034557663968054826, + "grad_norm": 3.5654841117273026, + "learning_rate": 1.9969658739102733e-05, + "loss": 1.4302, + "step": 8360 + }, + { + "epoch": 0.0345990008866769, + "grad_norm": 3.7270409908212194, + "learning_rate": 1.9969556545892405e-05, + "loss": 1.4447, + "step": 8370 + }, + { + "epoch": 0.03464033780529898, + "grad_norm": 3.914859995823126, + "learning_rate": 1.996945418113382e-05, + "loss": 1.4519, + "step": 8380 + }, + { + "epoch": 0.03468167472392106, + "grad_norm": 4.5791406660012095, + "learning_rate": 1.9969351644828742e-05, + "loss": 1.5204, + "step": 8390 + }, + { + "epoch": 0.03472301164254313, + "grad_norm": 3.6418617205879817, + "learning_rate": 1.9969248936978932e-05, + "loss": 1.4943, + "step": 8400 + }, + { + "epoch": 0.034764348561165205, + "grad_norm": 3.4609359361113534, + "learning_rate": 1.9969146057586156e-05, + "loss": 1.4799, + "step": 8410 + }, + { + "epoch": 0.03480568547978728, + "grad_norm": 3.9925889106780987, + "learning_rate": 1.9969043006652186e-05, + "loss": 1.4687, + "step": 8420 + }, + { + "epoch": 0.03484702239840935, + "grad_norm": 3.2054804086428548, + "learning_rate": 1.9968939784178794e-05, + "loss": 1.4816, + "step": 8430 + }, + { + "epoch": 0.03488835931703143, + "grad_norm": 3.349889124617882, + "learning_rate": 1.996883639016776e-05, + "loss": 1.4577, + "step": 8440 + }, + { + "epoch": 0.03492969623565351, + "grad_norm": 3.5789857070884086, + "learning_rate": 1.996873282462086e-05, + "loss": 1.5172, + "step": 8450 + }, + { + "epoch": 0.034971033154275584, + "grad_norm": 3.4989039985130272, + "learning_rate": 1.9968629087539876e-05, + "loss": 1.4852, + "step": 8460 + }, + { + "epoch": 0.035012370072897654, + "grad_norm": 3.3811833684889154, + "learning_rate": 1.9968525178926595e-05, + "loss": 1.4594, + "step": 8470 + }, + { + "epoch": 0.03505370699151973, + "grad_norm": 3.4385931194022223, + "learning_rate": 1.9968421098782803e-05, + "loss": 1.4595, + "step": 8480 + }, + { + "epoch": 0.03509504391014181, + "grad_norm": 4.16413923561225, + "learning_rate": 1.9968316847110292e-05, + "loss": 1.4963, + "step": 8490 + }, + { + "epoch": 0.03513638082876388, + "grad_norm": 4.289305042774894, + "learning_rate": 1.9968212423910855e-05, + "loss": 1.4551, + "step": 8500 + }, + { + "epoch": 0.035177717747385956, + "grad_norm": 5.453654756696216, + "learning_rate": 1.9968107829186287e-05, + "loss": 1.4885, + "step": 8510 + }, + { + "epoch": 0.03521905466600803, + "grad_norm": 3.6797118668666795, + "learning_rate": 1.996800306293839e-05, + "loss": 1.4984, + "step": 8520 + }, + { + "epoch": 0.03526039158463011, + "grad_norm": 3.2371854818646995, + "learning_rate": 1.9967898125168973e-05, + "loss": 1.4481, + "step": 8530 + }, + { + "epoch": 0.03530172850325218, + "grad_norm": 3.238508861502653, + "learning_rate": 1.9967793015879828e-05, + "loss": 1.4562, + "step": 8540 + }, + { + "epoch": 0.03534306542187426, + "grad_norm": 3.5415115005606177, + "learning_rate": 1.9967687735072776e-05, + "loss": 1.476, + "step": 8550 + }, + { + "epoch": 0.035384402340496335, + "grad_norm": 3.843042698193225, + "learning_rate": 1.9967582282749622e-05, + "loss": 1.4751, + "step": 8560 + }, + { + "epoch": 0.035425739259118405, + "grad_norm": 3.5779391668735006, + "learning_rate": 1.9967476658912184e-05, + "loss": 1.4804, + "step": 8570 + }, + { + "epoch": 0.03546707617774048, + "grad_norm": 4.949952686769368, + "learning_rate": 1.9967370863562276e-05, + "loss": 1.4245, + "step": 8580 + }, + { + "epoch": 0.03550841309636256, + "grad_norm": 3.8134978579481924, + "learning_rate": 1.996726489670172e-05, + "loss": 1.494, + "step": 8590 + }, + { + "epoch": 0.03554975001498463, + "grad_norm": 4.098567290916666, + "learning_rate": 1.996715875833234e-05, + "loss": 1.4339, + "step": 8600 + }, + { + "epoch": 0.03559108693360671, + "grad_norm": 3.4443466301897074, + "learning_rate": 1.9967052448455962e-05, + "loss": 1.4808, + "step": 8610 + }, + { + "epoch": 0.035632423852228784, + "grad_norm": 3.939242075931307, + "learning_rate": 1.9966945967074416e-05, + "loss": 1.4884, + "step": 8620 + }, + { + "epoch": 0.03567376077085086, + "grad_norm": 3.3941498280577975, + "learning_rate": 1.996683931418953e-05, + "loss": 1.4635, + "step": 8630 + }, + { + "epoch": 0.03571509768947293, + "grad_norm": 3.911248054251368, + "learning_rate": 1.996673248980315e-05, + "loss": 1.4785, + "step": 8640 + }, + { + "epoch": 0.03575643460809501, + "grad_norm": 4.0383619484944155, + "learning_rate": 1.99666254939171e-05, + "loss": 1.4334, + "step": 8650 + }, + { + "epoch": 0.035797771526717086, + "grad_norm": 3.21818266356431, + "learning_rate": 1.996651832653323e-05, + "loss": 1.5279, + "step": 8660 + }, + { + "epoch": 0.035839108445339156, + "grad_norm": 4.068360221073268, + "learning_rate": 1.9966410987653383e-05, + "loss": 1.5073, + "step": 8670 + }, + { + "epoch": 0.03588044536396123, + "grad_norm": 5.64416307388456, + "learning_rate": 1.9966303477279404e-05, + "loss": 1.4595, + "step": 8680 + }, + { + "epoch": 0.03592178228258331, + "grad_norm": 4.456991706091006, + "learning_rate": 1.9966195795413145e-05, + "loss": 1.5152, + "step": 8690 + }, + { + "epoch": 0.03596311920120539, + "grad_norm": 3.541237309488241, + "learning_rate": 1.9966087942056457e-05, + "loss": 1.4773, + "step": 8700 + }, + { + "epoch": 0.03600445611982746, + "grad_norm": 3.5306668816992186, + "learning_rate": 1.9965979917211196e-05, + "loss": 1.4838, + "step": 8710 + }, + { + "epoch": 0.036045793038449535, + "grad_norm": 4.051613339547623, + "learning_rate": 1.9965871720879223e-05, + "loss": 1.463, + "step": 8720 + }, + { + "epoch": 0.03608712995707161, + "grad_norm": 3.6129954404785, + "learning_rate": 1.9965763353062394e-05, + "loss": 1.4479, + "step": 8730 + }, + { + "epoch": 0.03612846687569368, + "grad_norm": 4.2384349637413825, + "learning_rate": 1.9965654813762582e-05, + "loss": 1.4928, + "step": 8740 + }, + { + "epoch": 0.03616980379431576, + "grad_norm": 4.343148391315392, + "learning_rate": 1.9965546102981652e-05, + "loss": 1.4418, + "step": 8750 + }, + { + "epoch": 0.03621114071293784, + "grad_norm": 3.9477945474327276, + "learning_rate": 1.996543722072147e-05, + "loss": 1.4417, + "step": 8760 + }, + { + "epoch": 0.036252477631559914, + "grad_norm": 3.912481869555381, + "learning_rate": 1.9965328166983916e-05, + "loss": 1.4877, + "step": 8770 + }, + { + "epoch": 0.036293814550181984, + "grad_norm": 4.391935734682612, + "learning_rate": 1.9965218941770857e-05, + "loss": 1.4335, + "step": 8780 + }, + { + "epoch": 0.03633515146880406, + "grad_norm": 4.493537291846412, + "learning_rate": 1.9965109545084185e-05, + "loss": 1.4919, + "step": 8790 + }, + { + "epoch": 0.03637648838742614, + "grad_norm": 2.93026955700472, + "learning_rate": 1.9964999976925775e-05, + "loss": 1.4304, + "step": 8800 + }, + { + "epoch": 0.03641782530604821, + "grad_norm": 3.6053506467813032, + "learning_rate": 1.9964890237297512e-05, + "loss": 1.4635, + "step": 8810 + }, + { + "epoch": 0.036459162224670286, + "grad_norm": 3.5234834011018648, + "learning_rate": 1.9964780326201286e-05, + "loss": 1.4981, + "step": 8820 + }, + { + "epoch": 0.03650049914329236, + "grad_norm": 3.750450253620856, + "learning_rate": 1.996467024363899e-05, + "loss": 1.4627, + "step": 8830 + }, + { + "epoch": 0.03654183606191444, + "grad_norm": 3.666723051780572, + "learning_rate": 1.9964559989612516e-05, + "loss": 1.4514, + "step": 8840 + }, + { + "epoch": 0.03658317298053651, + "grad_norm": 3.3239044375214633, + "learning_rate": 1.996444956412376e-05, + "loss": 1.4972, + "step": 8850 + }, + { + "epoch": 0.03662450989915859, + "grad_norm": 3.8599698199624064, + "learning_rate": 1.9964338967174625e-05, + "loss": 1.5057, + "step": 8860 + }, + { + "epoch": 0.036665846817780665, + "grad_norm": 4.132699231086706, + "learning_rate": 1.9964228198767012e-05, + "loss": 1.4519, + "step": 8870 + }, + { + "epoch": 0.036707183736402735, + "grad_norm": 3.0714085451165745, + "learning_rate": 1.9964117258902828e-05, + "loss": 1.434, + "step": 8880 + }, + { + "epoch": 0.03674852065502481, + "grad_norm": 3.8796486291954904, + "learning_rate": 1.9964006147583982e-05, + "loss": 1.4505, + "step": 8890 + }, + { + "epoch": 0.03678985757364689, + "grad_norm": 3.832002897416075, + "learning_rate": 1.9963894864812383e-05, + "loss": 1.4526, + "step": 8900 + }, + { + "epoch": 0.03683119449226896, + "grad_norm": 4.887224283091199, + "learning_rate": 1.9963783410589948e-05, + "loss": 1.4644, + "step": 8910 + }, + { + "epoch": 0.036872531410891037, + "grad_norm": 4.158724114940273, + "learning_rate": 1.99636717849186e-05, + "loss": 1.4417, + "step": 8920 + }, + { + "epoch": 0.036913868329513114, + "grad_norm": 3.81771878130769, + "learning_rate": 1.9963559987800253e-05, + "loss": 1.508, + "step": 8930 + }, + { + "epoch": 0.03695520524813519, + "grad_norm": 3.5553407292065207, + "learning_rate": 1.9963448019236834e-05, + "loss": 1.383, + "step": 8940 + }, + { + "epoch": 0.03699654216675726, + "grad_norm": 5.03061141095772, + "learning_rate": 1.9963335879230264e-05, + "loss": 1.4293, + "step": 8950 + }, + { + "epoch": 0.03703787908537934, + "grad_norm": 3.183233332739406, + "learning_rate": 1.996322356778248e-05, + "loss": 1.4355, + "step": 8960 + }, + { + "epoch": 0.037079216004001415, + "grad_norm": 3.555732688914675, + "learning_rate": 1.996311108489541e-05, + "loss": 1.4338, + "step": 8970 + }, + { + "epoch": 0.037120552922623486, + "grad_norm": 3.696220192021282, + "learning_rate": 1.9962998430570994e-05, + "loss": 1.4883, + "step": 8980 + }, + { + "epoch": 0.03716188984124556, + "grad_norm": 4.796096029475931, + "learning_rate": 1.9962885604811168e-05, + "loss": 1.4901, + "step": 8990 + }, + { + "epoch": 0.03720322675986764, + "grad_norm": 5.814203236754815, + "learning_rate": 1.996277260761787e-05, + "loss": 1.4053, + "step": 9000 + }, + { + "epoch": 0.03724456367848972, + "grad_norm": 3.3287110492970795, + "learning_rate": 1.996265943899305e-05, + "loss": 1.4202, + "step": 9010 + }, + { + "epoch": 0.03728590059711179, + "grad_norm": 3.877230681858091, + "learning_rate": 1.996254609893865e-05, + "loss": 1.4297, + "step": 9020 + }, + { + "epoch": 0.037327237515733865, + "grad_norm": 3.48844533397734, + "learning_rate": 1.9962432587456622e-05, + "loss": 1.4652, + "step": 9030 + }, + { + "epoch": 0.03736857443435594, + "grad_norm": 3.5520610987897943, + "learning_rate": 1.9962318904548923e-05, + "loss": 1.4807, + "step": 9040 + }, + { + "epoch": 0.03740991135297801, + "grad_norm": 3.181838391240591, + "learning_rate": 1.9962205050217504e-05, + "loss": 1.4757, + "step": 9050 + }, + { + "epoch": 0.03745124827160009, + "grad_norm": 3.7425531387998907, + "learning_rate": 1.996209102446433e-05, + "loss": 1.4331, + "step": 9060 + }, + { + "epoch": 0.037492585190222166, + "grad_norm": 3.663633392520708, + "learning_rate": 1.9961976827291358e-05, + "loss": 1.4718, + "step": 9070 + }, + { + "epoch": 0.03753392210884424, + "grad_norm": 4.833995454604731, + "learning_rate": 1.9961862458700554e-05, + "loss": 1.4217, + "step": 9080 + }, + { + "epoch": 0.037575259027466314, + "grad_norm": 3.6290459016542216, + "learning_rate": 1.9961747918693887e-05, + "loss": 1.4848, + "step": 9090 + }, + { + "epoch": 0.03761659594608839, + "grad_norm": 3.585806885070931, + "learning_rate": 1.9961633207273325e-05, + "loss": 1.4358, + "step": 9100 + }, + { + "epoch": 0.03765793286471047, + "grad_norm": 3.4952003665857134, + "learning_rate": 1.9961518324440847e-05, + "loss": 1.3939, + "step": 9110 + }, + { + "epoch": 0.03769926978333254, + "grad_norm": 3.279719203181294, + "learning_rate": 1.9961403270198424e-05, + "loss": 1.4808, + "step": 9120 + }, + { + "epoch": 0.037740606701954615, + "grad_norm": 3.2692766545796528, + "learning_rate": 1.9961288044548043e-05, + "loss": 1.3822, + "step": 9130 + }, + { + "epoch": 0.03778194362057669, + "grad_norm": 3.6490123739235623, + "learning_rate": 1.996117264749168e-05, + "loss": 1.4485, + "step": 9140 + }, + { + "epoch": 0.03782328053919877, + "grad_norm": 4.464763724322134, + "learning_rate": 1.996105707903132e-05, + "loss": 1.4795, + "step": 9150 + }, + { + "epoch": 0.03786461745782084, + "grad_norm": 3.529618572994803, + "learning_rate": 1.9960941339168963e-05, + "loss": 1.4452, + "step": 9160 + }, + { + "epoch": 0.03790595437644292, + "grad_norm": 3.949852891089842, + "learning_rate": 1.9960825427906587e-05, + "loss": 1.4866, + "step": 9170 + }, + { + "epoch": 0.037947291295064994, + "grad_norm": 6.3198129841396735, + "learning_rate": 1.9960709345246192e-05, + "loss": 1.4661, + "step": 9180 + }, + { + "epoch": 0.037988628213687065, + "grad_norm": 4.3016466998403775, + "learning_rate": 1.9960593091189776e-05, + "loss": 1.4575, + "step": 9190 + }, + { + "epoch": 0.03802996513230914, + "grad_norm": 3.218809542898574, + "learning_rate": 1.996047666573934e-05, + "loss": 1.4385, + "step": 9200 + }, + { + "epoch": 0.03807130205093122, + "grad_norm": 3.507546904929844, + "learning_rate": 1.9960360068896884e-05, + "loss": 1.456, + "step": 9210 + }, + { + "epoch": 0.038112638969553296, + "grad_norm": 3.2658287561416866, + "learning_rate": 1.9960243300664418e-05, + "loss": 1.4937, + "step": 9220 + }, + { + "epoch": 0.038153975888175366, + "grad_norm": 3.9657257849748078, + "learning_rate": 1.996012636104395e-05, + "loss": 1.4743, + "step": 9230 + }, + { + "epoch": 0.03819531280679744, + "grad_norm": 3.7419945345055865, + "learning_rate": 1.996000925003749e-05, + "loss": 1.4645, + "step": 9240 + }, + { + "epoch": 0.03823664972541952, + "grad_norm": 3.717998186266208, + "learning_rate": 1.9959891967647055e-05, + "loss": 1.4304, + "step": 9250 + }, + { + "epoch": 0.03827798664404159, + "grad_norm": 4.122611974230224, + "learning_rate": 1.9959774513874666e-05, + "loss": 1.4396, + "step": 9260 + }, + { + "epoch": 0.03831932356266367, + "grad_norm": 4.081766829152903, + "learning_rate": 1.9959656888722338e-05, + "loss": 1.4296, + "step": 9270 + }, + { + "epoch": 0.038360660481285745, + "grad_norm": 3.4327932618086807, + "learning_rate": 1.99595390921921e-05, + "loss": 1.479, + "step": 9280 + }, + { + "epoch": 0.038401997399907815, + "grad_norm": 4.251866528393302, + "learning_rate": 1.9959421124285976e-05, + "loss": 1.4399, + "step": 9290 + }, + { + "epoch": 0.03844333431852989, + "grad_norm": 4.132921022210262, + "learning_rate": 1.9959302985006e-05, + "loss": 1.4366, + "step": 9300 + }, + { + "epoch": 0.03848467123715197, + "grad_norm": 4.791211851168452, + "learning_rate": 1.9959184674354198e-05, + "loss": 1.4838, + "step": 9310 + }, + { + "epoch": 0.03852600815577405, + "grad_norm": 3.000007579210258, + "learning_rate": 1.995906619233261e-05, + "loss": 1.4541, + "step": 9320 + }, + { + "epoch": 0.03856734507439612, + "grad_norm": 5.059959210643911, + "learning_rate": 1.9958947538943278e-05, + "loss": 1.5233, + "step": 9330 + }, + { + "epoch": 0.038608681993018194, + "grad_norm": 3.20842711732194, + "learning_rate": 1.9958828714188236e-05, + "loss": 1.4718, + "step": 9340 + }, + { + "epoch": 0.03865001891164027, + "grad_norm": 3.796018357701994, + "learning_rate": 1.9958709718069532e-05, + "loss": 1.4522, + "step": 9350 + }, + { + "epoch": 0.03869135583026234, + "grad_norm": 3.9321479256125347, + "learning_rate": 1.995859055058922e-05, + "loss": 1.5065, + "step": 9360 + }, + { + "epoch": 0.03873269274888442, + "grad_norm": 3.254019632954085, + "learning_rate": 1.9958471211749342e-05, + "loss": 1.4114, + "step": 9370 + }, + { + "epoch": 0.038774029667506496, + "grad_norm": 3.3308294037697896, + "learning_rate": 1.9958351701551953e-05, + "loss": 1.4285, + "step": 9380 + }, + { + "epoch": 0.03881536658612857, + "grad_norm": 4.08834871777043, + "learning_rate": 1.9958232019999114e-05, + "loss": 1.4295, + "step": 9390 + }, + { + "epoch": 0.03885670350475064, + "grad_norm": 3.441069579264666, + "learning_rate": 1.995811216709288e-05, + "loss": 1.4472, + "step": 9400 + }, + { + "epoch": 0.03889804042337272, + "grad_norm": 3.426532775633606, + "learning_rate": 1.995799214283531e-05, + "loss": 1.4566, + "step": 9410 + }, + { + "epoch": 0.0389393773419948, + "grad_norm": 3.399689817601649, + "learning_rate": 1.9957871947228476e-05, + "loss": 1.4642, + "step": 9420 + }, + { + "epoch": 0.03898071426061687, + "grad_norm": 3.388140389856613, + "learning_rate": 1.995775158027445e-05, + "loss": 1.4593, + "step": 9430 + }, + { + "epoch": 0.039022051179238945, + "grad_norm": 3.325888034679041, + "learning_rate": 1.9957631041975292e-05, + "loss": 1.473, + "step": 9440 + }, + { + "epoch": 0.03906338809786102, + "grad_norm": 5.25058506424265, + "learning_rate": 1.995751033233308e-05, + "loss": 1.4085, + "step": 9450 + }, + { + "epoch": 0.0391047250164831, + "grad_norm": 3.8257776442726135, + "learning_rate": 1.9957389451349898e-05, + "loss": 1.4926, + "step": 9460 + }, + { + "epoch": 0.03914606193510517, + "grad_norm": 3.9914355755037514, + "learning_rate": 1.9957268399027815e-05, + "loss": 1.4433, + "step": 9470 + }, + { + "epoch": 0.03918739885372725, + "grad_norm": 4.259453650516103, + "learning_rate": 1.9957147175368923e-05, + "loss": 1.4435, + "step": 9480 + }, + { + "epoch": 0.039228735772349324, + "grad_norm": 3.4057039561381974, + "learning_rate": 1.99570257803753e-05, + "loss": 1.4021, + "step": 9490 + }, + { + "epoch": 0.039270072690971394, + "grad_norm": 3.9702568689341735, + "learning_rate": 1.9956904214049044e-05, + "loss": 1.3975, + "step": 9500 + }, + { + "epoch": 0.03931140960959347, + "grad_norm": 4.162984306124767, + "learning_rate": 1.995678247639224e-05, + "loss": 1.4269, + "step": 9510 + }, + { + "epoch": 0.03935274652821555, + "grad_norm": 3.4623660466543216, + "learning_rate": 1.9956660567406984e-05, + "loss": 1.4812, + "step": 9520 + }, + { + "epoch": 0.039394083446837626, + "grad_norm": 3.9487634862208663, + "learning_rate": 1.9956538487095375e-05, + "loss": 1.3904, + "step": 9530 + }, + { + "epoch": 0.039435420365459696, + "grad_norm": 3.940768474272943, + "learning_rate": 1.9956416235459514e-05, + "loss": 1.4627, + "step": 9540 + }, + { + "epoch": 0.03947675728408177, + "grad_norm": 3.7240510688214488, + "learning_rate": 1.9956293812501503e-05, + "loss": 1.4714, + "step": 9550 + }, + { + "epoch": 0.03951809420270385, + "grad_norm": 3.544248199002313, + "learning_rate": 1.995617121822345e-05, + "loss": 1.4418, + "step": 9560 + }, + { + "epoch": 0.03955943112132592, + "grad_norm": 3.7941720521427453, + "learning_rate": 1.9956048452627463e-05, + "loss": 1.398, + "step": 9570 + }, + { + "epoch": 0.039600768039948, + "grad_norm": 3.231769382614049, + "learning_rate": 1.9955925515715656e-05, + "loss": 1.4323, + "step": 9580 + }, + { + "epoch": 0.039642104958570075, + "grad_norm": 3.4504677343753585, + "learning_rate": 1.9955802407490144e-05, + "loss": 1.4508, + "step": 9590 + }, + { + "epoch": 0.039683441877192145, + "grad_norm": 4.608743387499926, + "learning_rate": 1.9955679127953046e-05, + "loss": 1.4849, + "step": 9600 + }, + { + "epoch": 0.03972477879581422, + "grad_norm": 3.2583619571782223, + "learning_rate": 1.995555567710648e-05, + "loss": 1.4528, + "step": 9610 + }, + { + "epoch": 0.0397661157144363, + "grad_norm": 3.592600847545303, + "learning_rate": 1.9955432054952573e-05, + "loss": 1.4222, + "step": 9620 + }, + { + "epoch": 0.03980745263305838, + "grad_norm": 3.935340478064598, + "learning_rate": 1.9955308261493457e-05, + "loss": 1.4243, + "step": 9630 + }, + { + "epoch": 0.03984878955168045, + "grad_norm": 3.7051161334020075, + "learning_rate": 1.995518429673125e-05, + "loss": 1.4487, + "step": 9640 + }, + { + "epoch": 0.039890126470302524, + "grad_norm": 3.6647142900939977, + "learning_rate": 1.9955060160668095e-05, + "loss": 1.4458, + "step": 9650 + }, + { + "epoch": 0.0399314633889246, + "grad_norm": 4.428497991354939, + "learning_rate": 1.9954935853306124e-05, + "loss": 1.4721, + "step": 9660 + }, + { + "epoch": 0.03997280030754667, + "grad_norm": 3.2958564393103056, + "learning_rate": 1.9954811374647474e-05, + "loss": 1.4394, + "step": 9670 + }, + { + "epoch": 0.04001413722616875, + "grad_norm": 3.10104196973718, + "learning_rate": 1.9954686724694297e-05, + "loss": 1.4361, + "step": 9680 + }, + { + "epoch": 0.040055474144790826, + "grad_norm": 3.957938872776804, + "learning_rate": 1.9954561903448727e-05, + "loss": 1.4602, + "step": 9690 + }, + { + "epoch": 0.0400968110634129, + "grad_norm": 3.760794840185392, + "learning_rate": 1.9954436910912914e-05, + "loss": 1.4285, + "step": 9700 + }, + { + "epoch": 0.04013814798203497, + "grad_norm": 3.421396807117046, + "learning_rate": 1.9954311747089012e-05, + "loss": 1.4774, + "step": 9710 + }, + { + "epoch": 0.04017948490065705, + "grad_norm": 3.91789094802535, + "learning_rate": 1.9954186411979175e-05, + "loss": 1.4021, + "step": 9720 + }, + { + "epoch": 0.04022082181927913, + "grad_norm": 3.081464490088515, + "learning_rate": 1.9954060905585556e-05, + "loss": 1.4219, + "step": 9730 + }, + { + "epoch": 0.0402621587379012, + "grad_norm": 3.3381107703512507, + "learning_rate": 1.9953935227910316e-05, + "loss": 1.4632, + "step": 9740 + }, + { + "epoch": 0.040303495656523275, + "grad_norm": 3.8300980744875828, + "learning_rate": 1.995380937895562e-05, + "loss": 1.4322, + "step": 9750 + }, + { + "epoch": 0.04034483257514535, + "grad_norm": 3.4534661824404633, + "learning_rate": 1.995368335872363e-05, + "loss": 1.4436, + "step": 9760 + }, + { + "epoch": 0.04038616949376743, + "grad_norm": 3.983712880561037, + "learning_rate": 1.995355716721652e-05, + "loss": 1.4598, + "step": 9770 + }, + { + "epoch": 0.0404275064123895, + "grad_norm": 3.840919795852268, + "learning_rate": 1.995343080443645e-05, + "loss": 1.4456, + "step": 9780 + }, + { + "epoch": 0.04046884333101158, + "grad_norm": 3.901157368681076, + "learning_rate": 1.9953304270385607e-05, + "loss": 1.4525, + "step": 9790 + }, + { + "epoch": 0.040510180249633654, + "grad_norm": 3.189808091891606, + "learning_rate": 1.9953177565066163e-05, + "loss": 1.4462, + "step": 9800 + }, + { + "epoch": 0.040551517168255724, + "grad_norm": 4.025890961267514, + "learning_rate": 1.9953050688480293e-05, + "loss": 1.443, + "step": 9810 + }, + { + "epoch": 0.0405928540868778, + "grad_norm": 3.3710964799433007, + "learning_rate": 1.995292364063019e-05, + "loss": 1.4262, + "step": 9820 + }, + { + "epoch": 0.04063419100549988, + "grad_norm": 3.883950857165337, + "learning_rate": 1.9952796421518034e-05, + "loss": 1.4174, + "step": 9830 + }, + { + "epoch": 0.040675527924121956, + "grad_norm": 3.474777308443348, + "learning_rate": 1.995266903114602e-05, + "loss": 1.4654, + "step": 9840 + }, + { + "epoch": 0.040716864842744026, + "grad_norm": 3.116715119287666, + "learning_rate": 1.995254146951633e-05, + "loss": 1.3727, + "step": 9850 + }, + { + "epoch": 0.0407582017613661, + "grad_norm": 3.9762493552410203, + "learning_rate": 1.9952413736631165e-05, + "loss": 1.4567, + "step": 9860 + }, + { + "epoch": 0.04079953867998818, + "grad_norm": 2.9468825909120033, + "learning_rate": 1.9952285832492726e-05, + "loss": 1.4422, + "step": 9870 + }, + { + "epoch": 0.04084087559861025, + "grad_norm": 3.5348361015353444, + "learning_rate": 1.995215775710321e-05, + "loss": 1.3795, + "step": 9880 + }, + { + "epoch": 0.04088221251723233, + "grad_norm": 3.276927230678387, + "learning_rate": 1.995202951046482e-05, + "loss": 1.4218, + "step": 9890 + }, + { + "epoch": 0.040923549435854405, + "grad_norm": 3.606741214717579, + "learning_rate": 1.9951901092579763e-05, + "loss": 1.4364, + "step": 9900 + }, + { + "epoch": 0.040964886354476475, + "grad_norm": 4.3260733895333425, + "learning_rate": 1.9951772503450252e-05, + "loss": 1.4398, + "step": 9910 + }, + { + "epoch": 0.04100622327309855, + "grad_norm": 3.1621905456493544, + "learning_rate": 1.9951643743078496e-05, + "loss": 1.4397, + "step": 9920 + }, + { + "epoch": 0.04104756019172063, + "grad_norm": 3.4412582079657623, + "learning_rate": 1.9951514811466713e-05, + "loss": 1.4036, + "step": 9930 + }, + { + "epoch": 0.041088897110342706, + "grad_norm": 4.873896899838307, + "learning_rate": 1.995138570861712e-05, + "loss": 1.4263, + "step": 9940 + }, + { + "epoch": 0.04113023402896478, + "grad_norm": 4.331814069124075, + "learning_rate": 1.9951256434531943e-05, + "loss": 1.4817, + "step": 9950 + }, + { + "epoch": 0.041171570947586854, + "grad_norm": 3.9349360259135926, + "learning_rate": 1.9951126989213398e-05, + "loss": 1.4483, + "step": 9960 + }, + { + "epoch": 0.04121290786620893, + "grad_norm": 3.3613448968668864, + "learning_rate": 1.995099737266372e-05, + "loss": 1.4229, + "step": 9970 + }, + { + "epoch": 0.041254244784831, + "grad_norm": 3.549433934959654, + "learning_rate": 1.9950867584885132e-05, + "loss": 1.4283, + "step": 9980 + }, + { + "epoch": 0.04129558170345308, + "grad_norm": 3.5652364655273208, + "learning_rate": 1.995073762587987e-05, + "loss": 1.4642, + "step": 9990 + }, + { + "epoch": 0.041336918622075156, + "grad_norm": 4.029695967481624, + "learning_rate": 1.995060749565018e-05, + "loss": 1.3657, + "step": 10000 + }, + { + "epoch": 0.041336918622075156, + "eval_loss": 1.736175537109375, + "eval_runtime": 393.8494, + "eval_samples_per_second": 10.4, + "eval_steps_per_second": 2.6, + "step": 10000 + }, + { + "epoch": 0.04137825554069723, + "grad_norm": 3.414046152937389, + "learning_rate": 1.9950477194198287e-05, + "loss": 1.3957, + "step": 10010 + }, + { + "epoch": 0.0414195924593193, + "grad_norm": 5.320606616740586, + "learning_rate": 1.9950346721526443e-05, + "loss": 1.4508, + "step": 10020 + }, + { + "epoch": 0.04146092937794138, + "grad_norm": 3.9807925522216423, + "learning_rate": 1.9950216077636886e-05, + "loss": 1.3943, + "step": 10030 + }, + { + "epoch": 0.04150226629656346, + "grad_norm": 3.501083066632413, + "learning_rate": 1.9950085262531868e-05, + "loss": 1.4352, + "step": 10040 + }, + { + "epoch": 0.04154360321518553, + "grad_norm": 3.771268569637735, + "learning_rate": 1.994995427621364e-05, + "loss": 1.452, + "step": 10050 + }, + { + "epoch": 0.041584940133807605, + "grad_norm": 3.8515101224909216, + "learning_rate": 1.9949823118684454e-05, + "loss": 1.4306, + "step": 10060 + }, + { + "epoch": 0.04162627705242968, + "grad_norm": 3.7934745333554782, + "learning_rate": 1.9949691789946567e-05, + "loss": 1.4805, + "step": 10070 + }, + { + "epoch": 0.04166761397105176, + "grad_norm": 3.396103842576295, + "learning_rate": 1.9949560290002245e-05, + "loss": 1.4516, + "step": 10080 + }, + { + "epoch": 0.04170895088967383, + "grad_norm": 3.4268415637061085, + "learning_rate": 1.994942861885374e-05, + "loss": 1.4143, + "step": 10090 + }, + { + "epoch": 0.041750287808295906, + "grad_norm": 3.4582505595292203, + "learning_rate": 1.9949296776503324e-05, + "loss": 1.3815, + "step": 10100 + }, + { + "epoch": 0.041791624726917984, + "grad_norm": 3.5395990026077304, + "learning_rate": 1.994916476295327e-05, + "loss": 1.4449, + "step": 10110 + }, + { + "epoch": 0.041832961645540054, + "grad_norm": 3.4229281128115403, + "learning_rate": 1.9949032578205834e-05, + "loss": 1.4526, + "step": 10120 + }, + { + "epoch": 0.04187429856416213, + "grad_norm": 3.983206436567361, + "learning_rate": 1.994890022226331e-05, + "loss": 1.4463, + "step": 10130 + }, + { + "epoch": 0.04191563548278421, + "grad_norm": 3.668734425155437, + "learning_rate": 1.9948767695127964e-05, + "loss": 1.419, + "step": 10140 + }, + { + "epoch": 0.041956972401406285, + "grad_norm": 3.3634372280714517, + "learning_rate": 1.9948634996802078e-05, + "loss": 1.4329, + "step": 10150 + }, + { + "epoch": 0.041998309320028356, + "grad_norm": 4.062775728402737, + "learning_rate": 1.9948502127287936e-05, + "loss": 1.4361, + "step": 10160 + }, + { + "epoch": 0.04203964623865043, + "grad_norm": 3.4149660693597084, + "learning_rate": 1.9948369086587823e-05, + "loss": 1.4725, + "step": 10170 + }, + { + "epoch": 0.04208098315727251, + "grad_norm": 3.6916128915527313, + "learning_rate": 1.9948235874704035e-05, + "loss": 1.4732, + "step": 10180 + }, + { + "epoch": 0.04212232007589458, + "grad_norm": 3.9231999868924206, + "learning_rate": 1.9948102491638853e-05, + "loss": 1.4558, + "step": 10190 + }, + { + "epoch": 0.04216365699451666, + "grad_norm": 4.846870150341976, + "learning_rate": 1.9947968937394583e-05, + "loss": 1.4455, + "step": 10200 + }, + { + "epoch": 0.042204993913138734, + "grad_norm": 3.426175390236964, + "learning_rate": 1.9947835211973517e-05, + "loss": 1.3997, + "step": 10210 + }, + { + "epoch": 0.042246330831760805, + "grad_norm": 3.7909997652306258, + "learning_rate": 1.9947701315377954e-05, + "loss": 1.4361, + "step": 10220 + }, + { + "epoch": 0.04228766775038288, + "grad_norm": 3.535939765317278, + "learning_rate": 1.9947567247610206e-05, + "loss": 1.4449, + "step": 10230 + }, + { + "epoch": 0.04232900466900496, + "grad_norm": 3.3731810089302523, + "learning_rate": 1.9947433008672572e-05, + "loss": 1.4193, + "step": 10240 + }, + { + "epoch": 0.042370341587627036, + "grad_norm": 3.9292291070435077, + "learning_rate": 1.9947298598567364e-05, + "loss": 1.4657, + "step": 10250 + }, + { + "epoch": 0.042411678506249106, + "grad_norm": 3.369066359531392, + "learning_rate": 1.99471640172969e-05, + "loss": 1.4509, + "step": 10260 + }, + { + "epoch": 0.042453015424871184, + "grad_norm": 3.6668982318612495, + "learning_rate": 1.994702926486349e-05, + "loss": 1.3931, + "step": 10270 + }, + { + "epoch": 0.04249435234349326, + "grad_norm": 3.2034209344506097, + "learning_rate": 1.9946894341269453e-05, + "loss": 1.4217, + "step": 10280 + }, + { + "epoch": 0.04253568926211533, + "grad_norm": 4.400853617662863, + "learning_rate": 1.9946759246517113e-05, + "loss": 1.4544, + "step": 10290 + }, + { + "epoch": 0.04257702618073741, + "grad_norm": 3.1712083272819473, + "learning_rate": 1.9946623980608792e-05, + "loss": 1.4813, + "step": 10300 + }, + { + "epoch": 0.042618363099359485, + "grad_norm": 3.5677581184867395, + "learning_rate": 1.994648854354682e-05, + "loss": 1.4321, + "step": 10310 + }, + { + "epoch": 0.04265970001798156, + "grad_norm": 3.38462741867337, + "learning_rate": 1.9946352935333528e-05, + "loss": 1.3907, + "step": 10320 + }, + { + "epoch": 0.04270103693660363, + "grad_norm": 3.6690520985143054, + "learning_rate": 1.994621715597125e-05, + "loss": 1.453, + "step": 10330 + }, + { + "epoch": 0.04274237385522571, + "grad_norm": 3.628541207308318, + "learning_rate": 1.9946081205462315e-05, + "loss": 1.4224, + "step": 10340 + }, + { + "epoch": 0.04278371077384779, + "grad_norm": 3.573675637942579, + "learning_rate": 1.994594508380907e-05, + "loss": 1.4409, + "step": 10350 + }, + { + "epoch": 0.04282504769246986, + "grad_norm": 3.9512735810988584, + "learning_rate": 1.9945808791013857e-05, + "loss": 1.4116, + "step": 10360 + }, + { + "epoch": 0.042866384611091934, + "grad_norm": 3.2189804332946936, + "learning_rate": 1.994567232707902e-05, + "loss": 1.4239, + "step": 10370 + }, + { + "epoch": 0.04290772152971401, + "grad_norm": 3.277452726822312, + "learning_rate": 1.9945535692006903e-05, + "loss": 1.419, + "step": 10380 + }, + { + "epoch": 0.04294905844833609, + "grad_norm": 3.2409307004738594, + "learning_rate": 1.994539888579986e-05, + "loss": 1.412, + "step": 10390 + }, + { + "epoch": 0.04299039536695816, + "grad_norm": 5.955091940094864, + "learning_rate": 1.9945261908460248e-05, + "loss": 1.4001, + "step": 10400 + }, + { + "epoch": 0.043031732285580236, + "grad_norm": 3.626590483447886, + "learning_rate": 1.9945124759990424e-05, + "loss": 1.4598, + "step": 10410 + }, + { + "epoch": 0.04307306920420231, + "grad_norm": 3.4065599382832197, + "learning_rate": 1.9944987440392742e-05, + "loss": 1.3991, + "step": 10420 + }, + { + "epoch": 0.043114406122824384, + "grad_norm": 3.3228235524159824, + "learning_rate": 1.994484994966957e-05, + "loss": 1.4069, + "step": 10430 + }, + { + "epoch": 0.04315574304144646, + "grad_norm": 4.069300982498486, + "learning_rate": 1.9944712287823275e-05, + "loss": 1.4376, + "step": 10440 + }, + { + "epoch": 0.04319707996006854, + "grad_norm": 3.668990751455877, + "learning_rate": 1.9944574454856216e-05, + "loss": 1.4185, + "step": 10450 + }, + { + "epoch": 0.043238416878690615, + "grad_norm": 3.189803317003545, + "learning_rate": 1.9944436450770775e-05, + "loss": 1.3998, + "step": 10460 + }, + { + "epoch": 0.043279753797312685, + "grad_norm": 3.7817594340150924, + "learning_rate": 1.9944298275569328e-05, + "loss": 1.4494, + "step": 10470 + }, + { + "epoch": 0.04332109071593476, + "grad_norm": 4.780904235096889, + "learning_rate": 1.9944159929254245e-05, + "loss": 1.4616, + "step": 10480 + }, + { + "epoch": 0.04336242763455684, + "grad_norm": 4.010329780152807, + "learning_rate": 1.9944021411827905e-05, + "loss": 1.4532, + "step": 10490 + }, + { + "epoch": 0.04340376455317891, + "grad_norm": 4.209509632753131, + "learning_rate": 1.9943882723292704e-05, + "loss": 1.4622, + "step": 10500 + }, + { + "epoch": 0.04344510147180099, + "grad_norm": 3.228687583673167, + "learning_rate": 1.9943743863651017e-05, + "loss": 1.4053, + "step": 10510 + }, + { + "epoch": 0.043486438390423064, + "grad_norm": 3.288729771085999, + "learning_rate": 1.994360483290523e-05, + "loss": 1.4192, + "step": 10520 + }, + { + "epoch": 0.043527775309045134, + "grad_norm": 4.41078023777337, + "learning_rate": 1.994346563105775e-05, + "loss": 1.4098, + "step": 10530 + }, + { + "epoch": 0.04356911222766721, + "grad_norm": 3.1815816594140487, + "learning_rate": 1.9943326258110963e-05, + "loss": 1.4676, + "step": 10540 + }, + { + "epoch": 0.04361044914628929, + "grad_norm": 3.554730176042178, + "learning_rate": 1.994318671406727e-05, + "loss": 1.4262, + "step": 10550 + }, + { + "epoch": 0.043651786064911366, + "grad_norm": 4.564103408690964, + "learning_rate": 1.9943046998929073e-05, + "loss": 1.4104, + "step": 10560 + }, + { + "epoch": 0.043693122983533436, + "grad_norm": 3.5454961573863994, + "learning_rate": 1.994290711269877e-05, + "loss": 1.4235, + "step": 10570 + }, + { + "epoch": 0.04373445990215551, + "grad_norm": 3.6248317766975857, + "learning_rate": 1.9942767055378775e-05, + "loss": 1.3733, + "step": 10580 + }, + { + "epoch": 0.04377579682077759, + "grad_norm": 3.2489128741687123, + "learning_rate": 1.9942626826971493e-05, + "loss": 1.4456, + "step": 10590 + }, + { + "epoch": 0.04381713373939966, + "grad_norm": 3.5799361135868057, + "learning_rate": 1.994248642747934e-05, + "loss": 1.4071, + "step": 10600 + }, + { + "epoch": 0.04385847065802174, + "grad_norm": 3.4391607635624033, + "learning_rate": 1.9942345856904727e-05, + "loss": 1.388, + "step": 10610 + }, + { + "epoch": 0.043899807576643815, + "grad_norm": 4.900926633402934, + "learning_rate": 1.994220511525008e-05, + "loss": 1.4214, + "step": 10620 + }, + { + "epoch": 0.04394114449526589, + "grad_norm": 3.1998682814537807, + "learning_rate": 1.994206420251782e-05, + "loss": 1.4392, + "step": 10630 + }, + { + "epoch": 0.04398248141388796, + "grad_norm": 3.512730762072939, + "learning_rate": 1.9941923118710366e-05, + "loss": 1.3833, + "step": 10640 + }, + { + "epoch": 0.04402381833251004, + "grad_norm": 3.5959575986075354, + "learning_rate": 1.9941781863830153e-05, + "loss": 1.4666, + "step": 10650 + }, + { + "epoch": 0.04406515525113212, + "grad_norm": 4.116993239444605, + "learning_rate": 1.9941640437879603e-05, + "loss": 1.417, + "step": 10660 + }, + { + "epoch": 0.04410649216975419, + "grad_norm": 4.587080576933717, + "learning_rate": 1.9941498840861153e-05, + "loss": 1.3558, + "step": 10670 + }, + { + "epoch": 0.044147829088376264, + "grad_norm": 3.923348655449712, + "learning_rate": 1.9941357072777245e-05, + "loss": 1.403, + "step": 10680 + }, + { + "epoch": 0.04418916600699834, + "grad_norm": 3.3131343328753884, + "learning_rate": 1.9941215133630312e-05, + "loss": 1.414, + "step": 10690 + }, + { + "epoch": 0.04423050292562042, + "grad_norm": 3.815180569497117, + "learning_rate": 1.9941073023422796e-05, + "loss": 1.4567, + "step": 10700 + }, + { + "epoch": 0.04427183984424249, + "grad_norm": 3.0191885771803264, + "learning_rate": 1.994093074215715e-05, + "loss": 1.4257, + "step": 10710 + }, + { + "epoch": 0.044313176762864566, + "grad_norm": 3.4376292652965494, + "learning_rate": 1.994078828983581e-05, + "loss": 1.4118, + "step": 10720 + }, + { + "epoch": 0.04435451368148664, + "grad_norm": 3.5106899932837643, + "learning_rate": 1.994064566646124e-05, + "loss": 1.4159, + "step": 10730 + }, + { + "epoch": 0.04439585060010871, + "grad_norm": 3.6846637686102413, + "learning_rate": 1.9940502872035888e-05, + "loss": 1.3948, + "step": 10740 + }, + { + "epoch": 0.04443718751873079, + "grad_norm": 3.657265133747329, + "learning_rate": 1.9940359906562207e-05, + "loss": 1.4087, + "step": 10750 + }, + { + "epoch": 0.04447852443735287, + "grad_norm": 4.430332521129557, + "learning_rate": 1.9940216770042666e-05, + "loss": 1.3989, + "step": 10760 + }, + { + "epoch": 0.044519861355974945, + "grad_norm": 4.43254812995105, + "learning_rate": 1.994007346247972e-05, + "loss": 1.3781, + "step": 10770 + }, + { + "epoch": 0.044561198274597015, + "grad_norm": 3.547905857131194, + "learning_rate": 1.9939929983875837e-05, + "loss": 1.443, + "step": 10780 + }, + { + "epoch": 0.04460253519321909, + "grad_norm": 4.225199610421922, + "learning_rate": 1.9939786334233492e-05, + "loss": 1.3992, + "step": 10790 + }, + { + "epoch": 0.04464387211184117, + "grad_norm": 3.2850031799014494, + "learning_rate": 1.993964251355515e-05, + "loss": 1.39, + "step": 10800 + }, + { + "epoch": 0.04468520903046324, + "grad_norm": 3.576860893151518, + "learning_rate": 1.993949852184329e-05, + "loss": 1.4019, + "step": 10810 + }, + { + "epoch": 0.04472654594908532, + "grad_norm": 4.14729049725031, + "learning_rate": 1.9939354359100385e-05, + "loss": 1.407, + "step": 10820 + }, + { + "epoch": 0.044767882867707394, + "grad_norm": 3.6785935387585447, + "learning_rate": 1.9939210025328915e-05, + "loss": 1.4188, + "step": 10830 + }, + { + "epoch": 0.044809219786329464, + "grad_norm": 3.475380301816819, + "learning_rate": 1.993906552053137e-05, + "loss": 1.4146, + "step": 10840 + }, + { + "epoch": 0.04485055670495154, + "grad_norm": 3.3679721828323217, + "learning_rate": 1.9938920844710235e-05, + "loss": 1.4208, + "step": 10850 + }, + { + "epoch": 0.04489189362357362, + "grad_norm": 3.679471702118622, + "learning_rate": 1.9938775997867995e-05, + "loss": 1.4209, + "step": 10860 + }, + { + "epoch": 0.044933230542195696, + "grad_norm": 3.8980049289176377, + "learning_rate": 1.9938630980007147e-05, + "loss": 1.4121, + "step": 10870 + }, + { + "epoch": 0.044974567460817766, + "grad_norm": 3.7079901840906713, + "learning_rate": 1.9938485791130183e-05, + "loss": 1.3969, + "step": 10880 + }, + { + "epoch": 0.04501590437943984, + "grad_norm": 3.7675855531387996, + "learning_rate": 1.9938340431239603e-05, + "loss": 1.4012, + "step": 10890 + }, + { + "epoch": 0.04505724129806192, + "grad_norm": 3.3894112723434127, + "learning_rate": 1.9938194900337908e-05, + "loss": 1.4184, + "step": 10900 + }, + { + "epoch": 0.04509857821668399, + "grad_norm": 4.1568950335530825, + "learning_rate": 1.9938049198427604e-05, + "loss": 1.452, + "step": 10910 + }, + { + "epoch": 0.04513991513530607, + "grad_norm": 3.630087506411177, + "learning_rate": 1.9937903325511193e-05, + "loss": 1.4657, + "step": 10920 + }, + { + "epoch": 0.045181252053928145, + "grad_norm": 3.510575809020148, + "learning_rate": 1.9937757281591187e-05, + "loss": 1.4341, + "step": 10930 + }, + { + "epoch": 0.04522258897255022, + "grad_norm": 3.309825385197255, + "learning_rate": 1.9937611066670106e-05, + "loss": 1.3789, + "step": 10940 + }, + { + "epoch": 0.04526392589117229, + "grad_norm": 3.239522136091904, + "learning_rate": 1.9937464680750454e-05, + "loss": 1.4103, + "step": 10950 + }, + { + "epoch": 0.04530526280979437, + "grad_norm": 4.673675936224972, + "learning_rate": 1.9937318123834762e-05, + "loss": 1.3989, + "step": 10960 + }, + { + "epoch": 0.04534659972841645, + "grad_norm": 4.627358104306948, + "learning_rate": 1.9937171395925544e-05, + "loss": 1.4203, + "step": 10970 + }, + { + "epoch": 0.04538793664703852, + "grad_norm": 3.311365466265083, + "learning_rate": 1.9937024497025325e-05, + "loss": 1.389, + "step": 10980 + }, + { + "epoch": 0.045429273565660594, + "grad_norm": 4.134318195617502, + "learning_rate": 1.9936877427136637e-05, + "loss": 1.4224, + "step": 10990 + }, + { + "epoch": 0.04547061048428267, + "grad_norm": 3.006959002241816, + "learning_rate": 1.9936730186262007e-05, + "loss": 1.3988, + "step": 11000 + }, + { + "epoch": 0.04551194740290475, + "grad_norm": 4.01529741437254, + "learning_rate": 1.993658277440397e-05, + "loss": 1.4396, + "step": 11010 + }, + { + "epoch": 0.04555328432152682, + "grad_norm": 3.2748525540941507, + "learning_rate": 1.993643519156506e-05, + "loss": 1.3921, + "step": 11020 + }, + { + "epoch": 0.045594621240148896, + "grad_norm": 4.018973443549097, + "learning_rate": 1.9936287437747822e-05, + "loss": 1.3617, + "step": 11030 + }, + { + "epoch": 0.04563595815877097, + "grad_norm": 3.462150874636636, + "learning_rate": 1.993613951295479e-05, + "loss": 1.4075, + "step": 11040 + }, + { + "epoch": 0.04567729507739304, + "grad_norm": 3.6284928503493528, + "learning_rate": 1.9935991417188523e-05, + "loss": 1.3774, + "step": 11050 + }, + { + "epoch": 0.04571863199601512, + "grad_norm": 4.331109900085688, + "learning_rate": 1.9935843150451558e-05, + "loss": 1.4156, + "step": 11060 + }, + { + "epoch": 0.0457599689146372, + "grad_norm": 3.749722584209809, + "learning_rate": 1.9935694712746448e-05, + "loss": 1.4314, + "step": 11070 + }, + { + "epoch": 0.045801305833259275, + "grad_norm": 3.1035349095523266, + "learning_rate": 1.9935546104075746e-05, + "loss": 1.4167, + "step": 11080 + }, + { + "epoch": 0.045842642751881345, + "grad_norm": 4.134657657963317, + "learning_rate": 1.9935397324442015e-05, + "loss": 1.4377, + "step": 11090 + }, + { + "epoch": 0.04588397967050342, + "grad_norm": 3.378268647534537, + "learning_rate": 1.993524837384781e-05, + "loss": 1.4201, + "step": 11100 + }, + { + "epoch": 0.0459253165891255, + "grad_norm": 3.3061200097201615, + "learning_rate": 1.9935099252295694e-05, + "loss": 1.391, + "step": 11110 + }, + { + "epoch": 0.04596665350774757, + "grad_norm": 3.6514603716731133, + "learning_rate": 1.9934949959788237e-05, + "loss": 1.4423, + "step": 11120 + }, + { + "epoch": 0.046007990426369647, + "grad_norm": 3.2198262717397896, + "learning_rate": 1.9934800496328006e-05, + "loss": 1.4049, + "step": 11130 + }, + { + "epoch": 0.046049327344991724, + "grad_norm": 3.1555815125987197, + "learning_rate": 1.993465086191757e-05, + "loss": 1.4418, + "step": 11140 + }, + { + "epoch": 0.0460906642636138, + "grad_norm": 3.6391685610732476, + "learning_rate": 1.993450105655951e-05, + "loss": 1.3824, + "step": 11150 + }, + { + "epoch": 0.04613200118223587, + "grad_norm": 3.426206028662225, + "learning_rate": 1.9934351080256395e-05, + "loss": 1.3837, + "step": 11160 + }, + { + "epoch": 0.04617333810085795, + "grad_norm": 4.616699518929945, + "learning_rate": 1.9934200933010816e-05, + "loss": 1.3886, + "step": 11170 + }, + { + "epoch": 0.046214675019480025, + "grad_norm": 3.919463255914606, + "learning_rate": 1.993405061482535e-05, + "loss": 1.418, + "step": 11180 + }, + { + "epoch": 0.046256011938102096, + "grad_norm": 4.372063175345489, + "learning_rate": 1.9933900125702582e-05, + "loss": 1.3976, + "step": 11190 + }, + { + "epoch": 0.04629734885672417, + "grad_norm": 4.040676043168612, + "learning_rate": 1.9933749465645103e-05, + "loss": 1.4122, + "step": 11200 + }, + { + "epoch": 0.04633868577534625, + "grad_norm": 3.3730911260973815, + "learning_rate": 1.9933598634655512e-05, + "loss": 1.3707, + "step": 11210 + }, + { + "epoch": 0.04638002269396832, + "grad_norm": 3.6851619767350066, + "learning_rate": 1.9933447632736393e-05, + "loss": 1.398, + "step": 11220 + }, + { + "epoch": 0.0464213596125904, + "grad_norm": 4.001189833407079, + "learning_rate": 1.9933296459890355e-05, + "loss": 1.4071, + "step": 11230 + }, + { + "epoch": 0.046462696531212475, + "grad_norm": 3.3898995076664757, + "learning_rate": 1.993314511611999e-05, + "loss": 1.408, + "step": 11240 + }, + { + "epoch": 0.04650403344983455, + "grad_norm": 3.6996997277102146, + "learning_rate": 1.9932993601427912e-05, + "loss": 1.3975, + "step": 11250 + }, + { + "epoch": 0.04654537036845662, + "grad_norm": 3.313365690401916, + "learning_rate": 1.993284191581672e-05, + "loss": 1.408, + "step": 11260 + }, + { + "epoch": 0.0465867072870787, + "grad_norm": 4.633876867393197, + "learning_rate": 1.993269005928903e-05, + "loss": 1.396, + "step": 11270 + }, + { + "epoch": 0.046628044205700776, + "grad_norm": 3.5037620526852304, + "learning_rate": 1.993253803184745e-05, + "loss": 1.4024, + "step": 11280 + }, + { + "epoch": 0.046669381124322847, + "grad_norm": 3.081503642322238, + "learning_rate": 1.9932385833494597e-05, + "loss": 1.4109, + "step": 11290 + }, + { + "epoch": 0.046710718042944924, + "grad_norm": 3.547058360569091, + "learning_rate": 1.9932233464233092e-05, + "loss": 1.3796, + "step": 11300 + }, + { + "epoch": 0.046752054961567, + "grad_norm": 3.814887745242394, + "learning_rate": 1.9932080924065556e-05, + "loss": 1.4401, + "step": 11310 + }, + { + "epoch": 0.04679339188018908, + "grad_norm": 3.5004316252867085, + "learning_rate": 1.993192821299461e-05, + "loss": 1.452, + "step": 11320 + }, + { + "epoch": 0.04683472879881115, + "grad_norm": 3.449228750426351, + "learning_rate": 1.993177533102289e-05, + "loss": 1.3734, + "step": 11330 + }, + { + "epoch": 0.046876065717433225, + "grad_norm": 3.2964308484381784, + "learning_rate": 1.9931622278153024e-05, + "loss": 1.4018, + "step": 11340 + }, + { + "epoch": 0.0469174026360553, + "grad_norm": 2.8180078039607697, + "learning_rate": 1.993146905438764e-05, + "loss": 1.4081, + "step": 11350 + }, + { + "epoch": 0.04695873955467737, + "grad_norm": 3.2236402061866545, + "learning_rate": 1.9931315659729376e-05, + "loss": 1.4534, + "step": 11360 + }, + { + "epoch": 0.04700007647329945, + "grad_norm": 3.5082002531342473, + "learning_rate": 1.9931162094180874e-05, + "loss": 1.4173, + "step": 11370 + }, + { + "epoch": 0.04704141339192153, + "grad_norm": 3.284436648082263, + "learning_rate": 1.993100835774478e-05, + "loss": 1.4092, + "step": 11380 + }, + { + "epoch": 0.047082750310543604, + "grad_norm": 3.4816460403688314, + "learning_rate": 1.9930854450423736e-05, + "loss": 1.3913, + "step": 11390 + }, + { + "epoch": 0.047124087229165675, + "grad_norm": 4.14476906280773, + "learning_rate": 1.9930700372220387e-05, + "loss": 1.3703, + "step": 11400 + }, + { + "epoch": 0.04716542414778775, + "grad_norm": 4.375945137034217, + "learning_rate": 1.993054612313739e-05, + "loss": 1.4362, + "step": 11410 + }, + { + "epoch": 0.04720676106640983, + "grad_norm": 4.277168672236525, + "learning_rate": 1.993039170317739e-05, + "loss": 1.479, + "step": 11420 + }, + { + "epoch": 0.0472480979850319, + "grad_norm": 3.927053905964369, + "learning_rate": 1.9930237112343056e-05, + "loss": 1.3872, + "step": 11430 + }, + { + "epoch": 0.047289434903653976, + "grad_norm": 3.607150421833661, + "learning_rate": 1.9930082350637042e-05, + "loss": 1.3891, + "step": 11440 + }, + { + "epoch": 0.04733077182227605, + "grad_norm": 3.338415521714108, + "learning_rate": 1.992992741806201e-05, + "loss": 1.4211, + "step": 11450 + }, + { + "epoch": 0.04737210874089813, + "grad_norm": 3.629104841971202, + "learning_rate": 1.9929772314620627e-05, + "loss": 1.3425, + "step": 11460 + }, + { + "epoch": 0.0474134456595202, + "grad_norm": 3.099599716044088, + "learning_rate": 1.9929617040315563e-05, + "loss": 1.382, + "step": 11470 + }, + { + "epoch": 0.04745478257814228, + "grad_norm": 3.7158273713517893, + "learning_rate": 1.992946159514949e-05, + "loss": 1.4361, + "step": 11480 + }, + { + "epoch": 0.047496119496764355, + "grad_norm": 4.389528378421289, + "learning_rate": 1.992930597912508e-05, + "loss": 1.4435, + "step": 11490 + }, + { + "epoch": 0.047537456415386425, + "grad_norm": 3.669631509338215, + "learning_rate": 1.9929150192245016e-05, + "loss": 1.4321, + "step": 11500 + }, + { + "epoch": 0.0475787933340085, + "grad_norm": 3.4995914344254624, + "learning_rate": 1.992899423451197e-05, + "loss": 1.446, + "step": 11510 + }, + { + "epoch": 0.04762013025263058, + "grad_norm": 3.3809354325443017, + "learning_rate": 1.9928838105928635e-05, + "loss": 1.3941, + "step": 11520 + }, + { + "epoch": 0.04766146717125265, + "grad_norm": 3.1788524195701684, + "learning_rate": 1.9928681806497693e-05, + "loss": 1.4027, + "step": 11530 + }, + { + "epoch": 0.04770280408987473, + "grad_norm": 3.368111520244943, + "learning_rate": 1.9928525336221837e-05, + "loss": 1.4038, + "step": 11540 + }, + { + "epoch": 0.047744141008496804, + "grad_norm": 3.6045389016529636, + "learning_rate": 1.992836869510375e-05, + "loss": 1.4523, + "step": 11550 + }, + { + "epoch": 0.04778547792711888, + "grad_norm": 3.859885628963866, + "learning_rate": 1.9928211883146136e-05, + "loss": 1.4307, + "step": 11560 + }, + { + "epoch": 0.04782681484574095, + "grad_norm": 3.2925025525674756, + "learning_rate": 1.9928054900351693e-05, + "loss": 1.4473, + "step": 11570 + }, + { + "epoch": 0.04786815176436303, + "grad_norm": 4.075772971569745, + "learning_rate": 1.992789774672312e-05, + "loss": 1.4384, + "step": 11580 + }, + { + "epoch": 0.047909488682985106, + "grad_norm": 3.391288577095074, + "learning_rate": 1.9927740422263117e-05, + "loss": 1.4038, + "step": 11590 + }, + { + "epoch": 0.047950825601607176, + "grad_norm": 3.4783586083297626, + "learning_rate": 1.9927582926974402e-05, + "loss": 1.3911, + "step": 11600 + }, + { + "epoch": 0.04799216252022925, + "grad_norm": 3.468149363696469, + "learning_rate": 1.9927425260859673e-05, + "loss": 1.4123, + "step": 11610 + }, + { + "epoch": 0.04803349943885133, + "grad_norm": 3.464512963986536, + "learning_rate": 1.992726742392165e-05, + "loss": 1.3973, + "step": 11620 + }, + { + "epoch": 0.04807483635747341, + "grad_norm": 4.061746689943389, + "learning_rate": 1.992710941616305e-05, + "loss": 1.3841, + "step": 11630 + }, + { + "epoch": 0.04811617327609548, + "grad_norm": 4.139608234706919, + "learning_rate": 1.992695123758659e-05, + "loss": 1.3787, + "step": 11640 + }, + { + "epoch": 0.048157510194717555, + "grad_norm": 3.8447070353873025, + "learning_rate": 1.992679288819499e-05, + "loss": 1.3815, + "step": 11650 + }, + { + "epoch": 0.04819884711333963, + "grad_norm": 3.6946628562082693, + "learning_rate": 1.9926634367990973e-05, + "loss": 1.3788, + "step": 11660 + }, + { + "epoch": 0.0482401840319617, + "grad_norm": 3.3618354267056465, + "learning_rate": 1.992647567697727e-05, + "loss": 1.4063, + "step": 11670 + }, + { + "epoch": 0.04828152095058378, + "grad_norm": 3.495643041214259, + "learning_rate": 1.9926316815156617e-05, + "loss": 1.4348, + "step": 11680 + }, + { + "epoch": 0.04832285786920586, + "grad_norm": 3.552951920155812, + "learning_rate": 1.9926157782531735e-05, + "loss": 1.3604, + "step": 11690 + }, + { + "epoch": 0.048364194787827934, + "grad_norm": 3.2600229354667025, + "learning_rate": 1.9925998579105374e-05, + "loss": 1.3395, + "step": 11700 + }, + { + "epoch": 0.048405531706450004, + "grad_norm": 3.4189972893062635, + "learning_rate": 1.9925839204880263e-05, + "loss": 1.4291, + "step": 11710 + }, + { + "epoch": 0.04844686862507208, + "grad_norm": 3.2696931259943494, + "learning_rate": 1.9925679659859148e-05, + "loss": 1.3748, + "step": 11720 + }, + { + "epoch": 0.04848820554369416, + "grad_norm": 3.400522141333647, + "learning_rate": 1.9925519944044772e-05, + "loss": 1.4141, + "step": 11730 + }, + { + "epoch": 0.04852954246231623, + "grad_norm": 3.347216625916271, + "learning_rate": 1.9925360057439887e-05, + "loss": 1.4062, + "step": 11740 + }, + { + "epoch": 0.048570879380938306, + "grad_norm": 3.55815106093295, + "learning_rate": 1.9925200000047248e-05, + "loss": 1.4056, + "step": 11750 + }, + { + "epoch": 0.04861221629956038, + "grad_norm": 4.75921473200291, + "learning_rate": 1.99250397718696e-05, + "loss": 1.418, + "step": 11760 + }, + { + "epoch": 0.04865355321818246, + "grad_norm": 3.2697324056422588, + "learning_rate": 1.9924879372909703e-05, + "loss": 1.4015, + "step": 11770 + }, + { + "epoch": 0.04869489013680453, + "grad_norm": 3.590517133814017, + "learning_rate": 1.9924718803170324e-05, + "loss": 1.3738, + "step": 11780 + }, + { + "epoch": 0.04873622705542661, + "grad_norm": 3.579850493829701, + "learning_rate": 1.9924558062654215e-05, + "loss": 1.334, + "step": 11790 + }, + { + "epoch": 0.048777563974048685, + "grad_norm": 3.552170187760315, + "learning_rate": 1.9924397151364148e-05, + "loss": 1.4169, + "step": 11800 + }, + { + "epoch": 0.048818900892670755, + "grad_norm": 3.0842304871945037, + "learning_rate": 1.992423606930289e-05, + "loss": 1.3906, + "step": 11810 + }, + { + "epoch": 0.04886023781129283, + "grad_norm": 3.379537547677814, + "learning_rate": 1.9924074816473215e-05, + "loss": 1.4351, + "step": 11820 + }, + { + "epoch": 0.04890157472991491, + "grad_norm": 3.53218922925584, + "learning_rate": 1.9923913392877896e-05, + "loss": 1.4032, + "step": 11830 + }, + { + "epoch": 0.04894291164853698, + "grad_norm": 5.016060492627983, + "learning_rate": 1.992375179851971e-05, + "loss": 1.3918, + "step": 11840 + }, + { + "epoch": 0.04898424856715906, + "grad_norm": 3.540388411237351, + "learning_rate": 1.9923590033401443e-05, + "loss": 1.4196, + "step": 11850 + }, + { + "epoch": 0.049025585485781134, + "grad_norm": 3.3104883369223765, + "learning_rate": 1.9923428097525872e-05, + "loss": 1.4141, + "step": 11860 + }, + { + "epoch": 0.04906692240440321, + "grad_norm": 3.580413781862322, + "learning_rate": 1.9923265990895785e-05, + "loss": 1.4291, + "step": 11870 + }, + { + "epoch": 0.04910825932302528, + "grad_norm": 4.057142008160253, + "learning_rate": 1.9923103713513972e-05, + "loss": 1.4193, + "step": 11880 + }, + { + "epoch": 0.04914959624164736, + "grad_norm": 3.157870137031843, + "learning_rate": 1.9922941265383226e-05, + "loss": 1.3949, + "step": 11890 + }, + { + "epoch": 0.049190933160269436, + "grad_norm": 3.7453792892755255, + "learning_rate": 1.992277864650634e-05, + "loss": 1.373, + "step": 11900 + }, + { + "epoch": 0.049232270078891506, + "grad_norm": 4.17858858176331, + "learning_rate": 1.992261585688611e-05, + "loss": 1.3732, + "step": 11910 + }, + { + "epoch": 0.04927360699751358, + "grad_norm": 3.3040489276601157, + "learning_rate": 1.992245289652535e-05, + "loss": 1.373, + "step": 11920 + }, + { + "epoch": 0.04931494391613566, + "grad_norm": 3.0410499391716117, + "learning_rate": 1.992228976542685e-05, + "loss": 1.3839, + "step": 11930 + }, + { + "epoch": 0.04935628083475774, + "grad_norm": 3.541114095436553, + "learning_rate": 1.9922126463593422e-05, + "loss": 1.4006, + "step": 11940 + }, + { + "epoch": 0.04939761775337981, + "grad_norm": 3.9811902872742673, + "learning_rate": 1.992196299102788e-05, + "loss": 1.3546, + "step": 11950 + }, + { + "epoch": 0.049438954672001885, + "grad_norm": 3.1780875587126705, + "learning_rate": 1.9921799347733026e-05, + "loss": 1.3693, + "step": 11960 + }, + { + "epoch": 0.04948029159062396, + "grad_norm": 3.5586898768297415, + "learning_rate": 1.9921635533711687e-05, + "loss": 1.4215, + "step": 11970 + }, + { + "epoch": 0.04952162850924603, + "grad_norm": 2.8477978920610565, + "learning_rate": 1.9921471548966678e-05, + "loss": 1.4256, + "step": 11980 + }, + { + "epoch": 0.04956296542786811, + "grad_norm": 3.5714848436239275, + "learning_rate": 1.9921307393500822e-05, + "loss": 1.4358, + "step": 11990 + }, + { + "epoch": 0.04960430234649019, + "grad_norm": 3.746696171382991, + "learning_rate": 1.992114306731694e-05, + "loss": 1.4556, + "step": 12000 + }, + { + "epoch": 0.049645639265112264, + "grad_norm": 3.6562155551861353, + "learning_rate": 1.992097857041786e-05, + "loss": 1.404, + "step": 12010 + }, + { + "epoch": 0.049686976183734334, + "grad_norm": 3.2699664906016803, + "learning_rate": 1.9920813902806414e-05, + "loss": 1.3946, + "step": 12020 + }, + { + "epoch": 0.04972831310235641, + "grad_norm": 4.034724844068233, + "learning_rate": 1.992064906448544e-05, + "loss": 1.4321, + "step": 12030 + }, + { + "epoch": 0.04976965002097849, + "grad_norm": 3.4822925081412497, + "learning_rate": 1.9920484055457767e-05, + "loss": 1.4105, + "step": 12040 + }, + { + "epoch": 0.04981098693960056, + "grad_norm": 3.587853195823534, + "learning_rate": 1.9920318875726238e-05, + "loss": 1.3697, + "step": 12050 + }, + { + "epoch": 0.049852323858222636, + "grad_norm": 3.205239993732016, + "learning_rate": 1.9920153525293694e-05, + "loss": 1.3979, + "step": 12060 + }, + { + "epoch": 0.04989366077684471, + "grad_norm": 3.388133162074283, + "learning_rate": 1.991998800416298e-05, + "loss": 1.4023, + "step": 12070 + }, + { + "epoch": 0.04993499769546679, + "grad_norm": 4.2719394165918985, + "learning_rate": 1.9919822312336947e-05, + "loss": 1.3956, + "step": 12080 + }, + { + "epoch": 0.04997633461408886, + "grad_norm": 3.234674011512212, + "learning_rate": 1.9919656449818444e-05, + "loss": 1.4101, + "step": 12090 + }, + { + "epoch": 0.05001767153271094, + "grad_norm": 3.4860946302341107, + "learning_rate": 1.9919490416610327e-05, + "loss": 1.3802, + "step": 12100 + }, + { + "epoch": 0.050059008451333015, + "grad_norm": 3.3542639404969483, + "learning_rate": 1.9919324212715448e-05, + "loss": 1.3865, + "step": 12110 + }, + { + "epoch": 0.050100345369955085, + "grad_norm": 3.7877647269232293, + "learning_rate": 1.9919157838136668e-05, + "loss": 1.4198, + "step": 12120 + }, + { + "epoch": 0.05014168228857716, + "grad_norm": 3.6768245303830214, + "learning_rate": 1.9918991292876857e-05, + "loss": 1.392, + "step": 12130 + }, + { + "epoch": 0.05018301920719924, + "grad_norm": 3.0504339083555054, + "learning_rate": 1.9918824576938872e-05, + "loss": 1.3943, + "step": 12140 + }, + { + "epoch": 0.05022435612582131, + "grad_norm": 4.003605772511745, + "learning_rate": 1.9918657690325586e-05, + "loss": 1.3627, + "step": 12150 + }, + { + "epoch": 0.05026569304444339, + "grad_norm": 4.04766902184383, + "learning_rate": 1.9918490633039873e-05, + "loss": 1.3867, + "step": 12160 + }, + { + "epoch": 0.050307029963065464, + "grad_norm": 4.054619230075598, + "learning_rate": 1.99183234050846e-05, + "loss": 1.3558, + "step": 12170 + }, + { + "epoch": 0.05034836688168754, + "grad_norm": 3.3575853249481873, + "learning_rate": 1.9918156006462653e-05, + "loss": 1.3863, + "step": 12180 + }, + { + "epoch": 0.05038970380030961, + "grad_norm": 3.5267112119515582, + "learning_rate": 1.9917988437176908e-05, + "loss": 1.3705, + "step": 12190 + }, + { + "epoch": 0.05043104071893169, + "grad_norm": 3.528333383483439, + "learning_rate": 1.9917820697230247e-05, + "loss": 1.4441, + "step": 12200 + }, + { + "epoch": 0.050472377637553766, + "grad_norm": 3.233459986557015, + "learning_rate": 1.991765278662556e-05, + "loss": 1.3532, + "step": 12210 + }, + { + "epoch": 0.050513714556175836, + "grad_norm": 3.4949233408933034, + "learning_rate": 1.991748470536573e-05, + "loss": 1.3658, + "step": 12220 + }, + { + "epoch": 0.05055505147479791, + "grad_norm": 3.194405798297394, + "learning_rate": 1.9917316453453657e-05, + "loss": 1.397, + "step": 12230 + }, + { + "epoch": 0.05059638839341999, + "grad_norm": 3.6606193297707046, + "learning_rate": 1.9917148030892238e-05, + "loss": 1.4072, + "step": 12240 + }, + { + "epoch": 0.05063772531204207, + "grad_norm": 4.069430004373759, + "learning_rate": 1.9916979437684362e-05, + "loss": 1.4136, + "step": 12250 + }, + { + "epoch": 0.05067906223066414, + "grad_norm": 3.1603885636318956, + "learning_rate": 1.991681067383293e-05, + "loss": 1.4097, + "step": 12260 + }, + { + "epoch": 0.050720399149286215, + "grad_norm": 3.177326748991991, + "learning_rate": 1.9916641739340857e-05, + "loss": 1.4195, + "step": 12270 + }, + { + "epoch": 0.05076173606790829, + "grad_norm": 3.918139727949541, + "learning_rate": 1.991647263421104e-05, + "loss": 1.3876, + "step": 12280 + }, + { + "epoch": 0.05080307298653036, + "grad_norm": 3.9573038369404427, + "learning_rate": 1.9916303358446392e-05, + "loss": 1.3683, + "step": 12290 + }, + { + "epoch": 0.05084440990515244, + "grad_norm": 4.275806482199394, + "learning_rate": 1.9916133912049825e-05, + "loss": 1.4204, + "step": 12300 + }, + { + "epoch": 0.050885746823774516, + "grad_norm": 4.847335915383492, + "learning_rate": 1.9915964295024254e-05, + "loss": 1.4034, + "step": 12310 + }, + { + "epoch": 0.050927083742396594, + "grad_norm": 3.273054866007932, + "learning_rate": 1.99157945073726e-05, + "loss": 1.3968, + "step": 12320 + }, + { + "epoch": 0.050968420661018664, + "grad_norm": 3.7167200760785173, + "learning_rate": 1.9915624549097784e-05, + "loss": 1.3999, + "step": 12330 + }, + { + "epoch": 0.05100975757964074, + "grad_norm": 3.4590376733691337, + "learning_rate": 1.991545442020273e-05, + "loss": 1.4122, + "step": 12340 + }, + { + "epoch": 0.05105109449826282, + "grad_norm": 3.1847142656240255, + "learning_rate": 1.9915284120690362e-05, + "loss": 1.4239, + "step": 12350 + }, + { + "epoch": 0.05109243141688489, + "grad_norm": 3.3467985786610353, + "learning_rate": 1.991511365056362e-05, + "loss": 1.3844, + "step": 12360 + }, + { + "epoch": 0.051133768335506966, + "grad_norm": 4.094929119314472, + "learning_rate": 1.9914943009825425e-05, + "loss": 1.3577, + "step": 12370 + }, + { + "epoch": 0.05117510525412904, + "grad_norm": 3.521925167773648, + "learning_rate": 1.9914772198478723e-05, + "loss": 1.3954, + "step": 12380 + }, + { + "epoch": 0.05121644217275112, + "grad_norm": 3.3998170027309067, + "learning_rate": 1.9914601216526446e-05, + "loss": 1.4102, + "step": 12390 + }, + { + "epoch": 0.05125777909137319, + "grad_norm": 3.6457154728502035, + "learning_rate": 1.9914430063971542e-05, + "loss": 1.3666, + "step": 12400 + }, + { + "epoch": 0.05129911600999527, + "grad_norm": 3.133616222950282, + "learning_rate": 1.9914258740816956e-05, + "loss": 1.4071, + "step": 12410 + }, + { + "epoch": 0.051340452928617344, + "grad_norm": 3.5165895133634133, + "learning_rate": 1.9914087247065634e-05, + "loss": 1.4127, + "step": 12420 + }, + { + "epoch": 0.051381789847239415, + "grad_norm": 3.1317185826456795, + "learning_rate": 1.991391558272052e-05, + "loss": 1.3609, + "step": 12430 + }, + { + "epoch": 0.05142312676586149, + "grad_norm": 3.7493626264431605, + "learning_rate": 1.991374374778458e-05, + "loss": 1.3832, + "step": 12440 + }, + { + "epoch": 0.05146446368448357, + "grad_norm": 3.5662863496670196, + "learning_rate": 1.991357174226076e-05, + "loss": 1.3665, + "step": 12450 + }, + { + "epoch": 0.05150580060310564, + "grad_norm": 3.1448919796113035, + "learning_rate": 1.9913399566152033e-05, + "loss": 1.3965, + "step": 12460 + }, + { + "epoch": 0.051547137521727716, + "grad_norm": 3.3154750106704625, + "learning_rate": 1.991322721946135e-05, + "loss": 1.3873, + "step": 12470 + }, + { + "epoch": 0.051588474440349794, + "grad_norm": 3.688390777094577, + "learning_rate": 1.991305470219168e-05, + "loss": 1.3224, + "step": 12480 + }, + { + "epoch": 0.05162981135897187, + "grad_norm": 3.0270982188671907, + "learning_rate": 1.9912882014345988e-05, + "loss": 1.3551, + "step": 12490 + }, + { + "epoch": 0.05167114827759394, + "grad_norm": 3.7441081256974136, + "learning_rate": 1.9912709155927254e-05, + "loss": 1.3945, + "step": 12500 + }, + { + "epoch": 0.05171248519621602, + "grad_norm": 3.697861466581755, + "learning_rate": 1.9912536126938446e-05, + "loss": 1.3612, + "step": 12510 + }, + { + "epoch": 0.051753822114838095, + "grad_norm": 3.523721321551833, + "learning_rate": 1.9912362927382546e-05, + "loss": 1.3747, + "step": 12520 + }, + { + "epoch": 0.051795159033460166, + "grad_norm": 3.0061067921910727, + "learning_rate": 1.9912189557262528e-05, + "loss": 1.4086, + "step": 12530 + }, + { + "epoch": 0.05183649595208224, + "grad_norm": 4.0409382024057425, + "learning_rate": 1.991201601658138e-05, + "loss": 1.3849, + "step": 12540 + }, + { + "epoch": 0.05187783287070432, + "grad_norm": 3.7066972530256983, + "learning_rate": 1.9911842305342085e-05, + "loss": 1.3775, + "step": 12550 + }, + { + "epoch": 0.0519191697893264, + "grad_norm": 3.8641850919990737, + "learning_rate": 1.9911668423547635e-05, + "loss": 1.4056, + "step": 12560 + }, + { + "epoch": 0.05196050670794847, + "grad_norm": 2.8845765281029325, + "learning_rate": 1.9911494371201023e-05, + "loss": 1.3433, + "step": 12570 + }, + { + "epoch": 0.052001843626570544, + "grad_norm": 2.9182840148796996, + "learning_rate": 1.9911320148305235e-05, + "loss": 1.4146, + "step": 12580 + }, + { + "epoch": 0.05204318054519262, + "grad_norm": 3.471309021112678, + "learning_rate": 1.991114575486328e-05, + "loss": 1.3769, + "step": 12590 + }, + { + "epoch": 0.05208451746381469, + "grad_norm": 3.132099407416561, + "learning_rate": 1.9910971190878157e-05, + "loss": 1.4006, + "step": 12600 + }, + { + "epoch": 0.05212585438243677, + "grad_norm": 3.5078473659046554, + "learning_rate": 1.9910796456352863e-05, + "loss": 1.3608, + "step": 12610 + }, + { + "epoch": 0.052167191301058846, + "grad_norm": 3.420611210950219, + "learning_rate": 1.991062155129041e-05, + "loss": 1.3477, + "step": 12620 + }, + { + "epoch": 0.05220852821968092, + "grad_norm": 3.3602682043236425, + "learning_rate": 1.991044647569381e-05, + "loss": 1.3821, + "step": 12630 + }, + { + "epoch": 0.052249865138302994, + "grad_norm": 3.3934199487204326, + "learning_rate": 1.9910271229566067e-05, + "loss": 1.3672, + "step": 12640 + }, + { + "epoch": 0.05229120205692507, + "grad_norm": 3.3930743766477636, + "learning_rate": 1.9910095812910205e-05, + "loss": 1.3805, + "step": 12650 + }, + { + "epoch": 0.05233253897554715, + "grad_norm": 3.803471056919821, + "learning_rate": 1.9909920225729237e-05, + "loss": 1.357, + "step": 12660 + }, + { + "epoch": 0.05237387589416922, + "grad_norm": 2.9908606514422766, + "learning_rate": 1.990974446802619e-05, + "loss": 1.4148, + "step": 12670 + }, + { + "epoch": 0.052415212812791295, + "grad_norm": 3.472506553773665, + "learning_rate": 1.990956853980408e-05, + "loss": 1.4119, + "step": 12680 + }, + { + "epoch": 0.05245654973141337, + "grad_norm": 3.652498665098648, + "learning_rate": 1.9909392441065944e-05, + "loss": 1.3896, + "step": 12690 + }, + { + "epoch": 0.05249788665003545, + "grad_norm": 3.556198285804122, + "learning_rate": 1.9909216171814802e-05, + "loss": 1.3556, + "step": 12700 + }, + { + "epoch": 0.05253922356865752, + "grad_norm": 3.3341578506950187, + "learning_rate": 1.9909039732053695e-05, + "loss": 1.3875, + "step": 12710 + }, + { + "epoch": 0.0525805604872796, + "grad_norm": 3.623737574209396, + "learning_rate": 1.9908863121785656e-05, + "loss": 1.3699, + "step": 12720 + }, + { + "epoch": 0.052621897405901674, + "grad_norm": 3.068120426816953, + "learning_rate": 1.9908686341013723e-05, + "loss": 1.3504, + "step": 12730 + }, + { + "epoch": 0.052663234324523744, + "grad_norm": 3.5988757581859643, + "learning_rate": 1.990850938974094e-05, + "loss": 1.3506, + "step": 12740 + }, + { + "epoch": 0.05270457124314582, + "grad_norm": 3.4850198824984724, + "learning_rate": 1.990833226797035e-05, + "loss": 1.3949, + "step": 12750 + }, + { + "epoch": 0.0527459081617679, + "grad_norm": 3.3573178296822834, + "learning_rate": 1.9908154975705e-05, + "loss": 1.3766, + "step": 12760 + }, + { + "epoch": 0.052787245080389976, + "grad_norm": 3.5288003708700186, + "learning_rate": 1.990797751294795e-05, + "loss": 1.3915, + "step": 12770 + }, + { + "epoch": 0.052828581999012046, + "grad_norm": 3.0065181585529794, + "learning_rate": 1.990779987970224e-05, + "loss": 1.3943, + "step": 12780 + }, + { + "epoch": 0.05286991891763412, + "grad_norm": 3.8902210517557787, + "learning_rate": 1.9907622075970933e-05, + "loss": 1.4339, + "step": 12790 + }, + { + "epoch": 0.0529112558362562, + "grad_norm": 3.0754498963080317, + "learning_rate": 1.990744410175709e-05, + "loss": 1.3633, + "step": 12800 + }, + { + "epoch": 0.05295259275487827, + "grad_norm": 3.373819633563616, + "learning_rate": 1.990726595706377e-05, + "loss": 1.3729, + "step": 12810 + }, + { + "epoch": 0.05299392967350035, + "grad_norm": 3.2413750593238277, + "learning_rate": 1.990708764189404e-05, + "loss": 1.3611, + "step": 12820 + }, + { + "epoch": 0.053035266592122425, + "grad_norm": 3.3175842583387287, + "learning_rate": 1.990690915625097e-05, + "loss": 1.4386, + "step": 12830 + }, + { + "epoch": 0.053076603510744495, + "grad_norm": 4.421464949416987, + "learning_rate": 1.9906730500137626e-05, + "loss": 1.3825, + "step": 12840 + }, + { + "epoch": 0.05311794042936657, + "grad_norm": 3.7375473757828312, + "learning_rate": 1.9906551673557092e-05, + "loss": 1.3584, + "step": 12850 + }, + { + "epoch": 0.05315927734798865, + "grad_norm": 4.23504822699641, + "learning_rate": 1.9906372676512435e-05, + "loss": 1.3655, + "step": 12860 + }, + { + "epoch": 0.05320061426661073, + "grad_norm": 3.674354440233681, + "learning_rate": 1.9906193509006737e-05, + "loss": 1.3652, + "step": 12870 + }, + { + "epoch": 0.0532419511852328, + "grad_norm": 3.270440974926962, + "learning_rate": 1.9906014171043085e-05, + "loss": 1.408, + "step": 12880 + }, + { + "epoch": 0.053283288103854874, + "grad_norm": 3.4328461661592007, + "learning_rate": 1.9905834662624562e-05, + "loss": 1.3881, + "step": 12890 + }, + { + "epoch": 0.05332462502247695, + "grad_norm": 3.296815547244285, + "learning_rate": 1.9905654983754255e-05, + "loss": 1.3099, + "step": 12900 + }, + { + "epoch": 0.05336596194109902, + "grad_norm": 3.182778307558256, + "learning_rate": 1.9905475134435265e-05, + "loss": 1.3887, + "step": 12910 + }, + { + "epoch": 0.0534072988597211, + "grad_norm": 4.066051141098089, + "learning_rate": 1.9905295114670674e-05, + "loss": 1.3615, + "step": 12920 + }, + { + "epoch": 0.053448635778343176, + "grad_norm": 3.62599590443558, + "learning_rate": 1.9905114924463592e-05, + "loss": 1.3461, + "step": 12930 + }, + { + "epoch": 0.05348997269696525, + "grad_norm": 4.5391383467222735, + "learning_rate": 1.9904934563817106e-05, + "loss": 1.3543, + "step": 12940 + }, + { + "epoch": 0.05353130961558732, + "grad_norm": 4.04368698393749, + "learning_rate": 1.990475403273433e-05, + "loss": 1.3712, + "step": 12950 + }, + { + "epoch": 0.0535726465342094, + "grad_norm": 3.9811173894333836, + "learning_rate": 1.9904573331218365e-05, + "loss": 1.4334, + "step": 12960 + }, + { + "epoch": 0.05361398345283148, + "grad_norm": 3.499953327542098, + "learning_rate": 1.9904392459272326e-05, + "loss": 1.3871, + "step": 12970 + }, + { + "epoch": 0.05365532037145355, + "grad_norm": 3.226290735311431, + "learning_rate": 1.9904211416899322e-05, + "loss": 1.4122, + "step": 12980 + }, + { + "epoch": 0.053696657290075625, + "grad_norm": 3.566091958099414, + "learning_rate": 1.990403020410247e-05, + "loss": 1.4075, + "step": 12990 + }, + { + "epoch": 0.0537379942086977, + "grad_norm": 3.4558175186897513, + "learning_rate": 1.990384882088488e-05, + "loss": 1.4553, + "step": 13000 + }, + { + "epoch": 0.05377933112731978, + "grad_norm": 3.238909449520725, + "learning_rate": 1.9903667267249683e-05, + "loss": 1.3791, + "step": 13010 + }, + { + "epoch": 0.05382066804594185, + "grad_norm": 3.517722296765338, + "learning_rate": 1.9903485543199995e-05, + "loss": 1.3283, + "step": 13020 + }, + { + "epoch": 0.05386200496456393, + "grad_norm": 3.42397575432932, + "learning_rate": 1.9903303648738954e-05, + "loss": 1.3335, + "step": 13030 + }, + { + "epoch": 0.053903341883186004, + "grad_norm": 3.350334059229468, + "learning_rate": 1.990312158386968e-05, + "loss": 1.3806, + "step": 13040 + }, + { + "epoch": 0.053944678801808074, + "grad_norm": 3.051548296138219, + "learning_rate": 1.9902939348595307e-05, + "loss": 1.3885, + "step": 13050 + }, + { + "epoch": 0.05398601572043015, + "grad_norm": 3.276091159978694, + "learning_rate": 1.9902756942918976e-05, + "loss": 1.359, + "step": 13060 + }, + { + "epoch": 0.05402735263905223, + "grad_norm": 3.5296387125760185, + "learning_rate": 1.9902574366843824e-05, + "loss": 1.3625, + "step": 13070 + }, + { + "epoch": 0.054068689557674306, + "grad_norm": 3.567948297220875, + "learning_rate": 1.990239162037299e-05, + "loss": 1.351, + "step": 13080 + }, + { + "epoch": 0.054110026476296376, + "grad_norm": 3.1640186718240266, + "learning_rate": 1.9902208703509617e-05, + "loss": 1.3458, + "step": 13090 + }, + { + "epoch": 0.05415136339491845, + "grad_norm": 3.9025546167384495, + "learning_rate": 1.9902025616256854e-05, + "loss": 1.3588, + "step": 13100 + }, + { + "epoch": 0.05419270031354053, + "grad_norm": 3.501658240766089, + "learning_rate": 1.9901842358617854e-05, + "loss": 1.3624, + "step": 13110 + }, + { + "epoch": 0.0542340372321626, + "grad_norm": 3.923570308465845, + "learning_rate": 1.9901658930595774e-05, + "loss": 1.3294, + "step": 13120 + }, + { + "epoch": 0.05427537415078468, + "grad_norm": 3.0232913372852406, + "learning_rate": 1.990147533219376e-05, + "loss": 1.3855, + "step": 13130 + }, + { + "epoch": 0.054316711069406755, + "grad_norm": 3.670482250458697, + "learning_rate": 1.9901291563414977e-05, + "loss": 1.3337, + "step": 13140 + }, + { + "epoch": 0.054358047988028825, + "grad_norm": 3.790854501668152, + "learning_rate": 1.990110762426259e-05, + "loss": 1.366, + "step": 13150 + }, + { + "epoch": 0.0543993849066509, + "grad_norm": 3.1103384131591256, + "learning_rate": 1.9900923514739758e-05, + "loss": 1.3574, + "step": 13160 + }, + { + "epoch": 0.05444072182527298, + "grad_norm": 3.2207958459794845, + "learning_rate": 1.990073923484965e-05, + "loss": 1.3675, + "step": 13170 + }, + { + "epoch": 0.05448205874389506, + "grad_norm": 3.2588291421463023, + "learning_rate": 1.990055478459544e-05, + "loss": 1.3313, + "step": 13180 + }, + { + "epoch": 0.05452339566251713, + "grad_norm": 2.9426904447180506, + "learning_rate": 1.99003701639803e-05, + "loss": 1.3995, + "step": 13190 + }, + { + "epoch": 0.054564732581139204, + "grad_norm": 3.892827987664763, + "learning_rate": 1.990018537300741e-05, + "loss": 1.4035, + "step": 13200 + }, + { + "epoch": 0.05460606949976128, + "grad_norm": 3.765962575470102, + "learning_rate": 1.9900000411679946e-05, + "loss": 1.3823, + "step": 13210 + }, + { + "epoch": 0.05464740641838335, + "grad_norm": 3.031044142550962, + "learning_rate": 1.9899815280001093e-05, + "loss": 1.3907, + "step": 13220 + }, + { + "epoch": 0.05468874333700543, + "grad_norm": 3.401074997651561, + "learning_rate": 1.9899629977974033e-05, + "loss": 1.3724, + "step": 13230 + }, + { + "epoch": 0.054730080255627506, + "grad_norm": 3.4363592487014367, + "learning_rate": 1.9899444505601957e-05, + "loss": 1.4044, + "step": 13240 + }, + { + "epoch": 0.05477141717424958, + "grad_norm": 3.4008170147404924, + "learning_rate": 1.9899258862888055e-05, + "loss": 1.4329, + "step": 13250 + }, + { + "epoch": 0.05481275409287165, + "grad_norm": 3.3535448510349086, + "learning_rate": 1.9899073049835526e-05, + "loss": 1.3803, + "step": 13260 + }, + { + "epoch": 0.05485409101149373, + "grad_norm": 3.6491506303085677, + "learning_rate": 1.9898887066447564e-05, + "loss": 1.4061, + "step": 13270 + }, + { + "epoch": 0.05489542793011581, + "grad_norm": 3.2649595342568754, + "learning_rate": 1.9898700912727365e-05, + "loss": 1.3548, + "step": 13280 + }, + { + "epoch": 0.05493676484873788, + "grad_norm": 3.260685808424658, + "learning_rate": 1.9898514588678138e-05, + "loss": 1.3798, + "step": 13290 + }, + { + "epoch": 0.054978101767359955, + "grad_norm": 3.3068054059856964, + "learning_rate": 1.989832809430309e-05, + "loss": 1.3873, + "step": 13300 + }, + { + "epoch": 0.05501943868598203, + "grad_norm": 3.3289477651913844, + "learning_rate": 1.9898141429605428e-05, + "loss": 1.42, + "step": 13310 + }, + { + "epoch": 0.05506077560460411, + "grad_norm": 3.899358403289862, + "learning_rate": 1.9897954594588366e-05, + "loss": 1.3612, + "step": 13320 + }, + { + "epoch": 0.05510211252322618, + "grad_norm": 3.4534257185508768, + "learning_rate": 1.989776758925511e-05, + "loss": 1.4139, + "step": 13330 + }, + { + "epoch": 0.05514344944184826, + "grad_norm": 3.0896933369894555, + "learning_rate": 1.9897580413608888e-05, + "loss": 1.3455, + "step": 13340 + }, + { + "epoch": 0.055184786360470334, + "grad_norm": 3.6880673723268895, + "learning_rate": 1.9897393067652916e-05, + "loss": 1.3553, + "step": 13350 + }, + { + "epoch": 0.055226123279092404, + "grad_norm": 4.959844713915171, + "learning_rate": 1.989720555139042e-05, + "loss": 1.3744, + "step": 13360 + }, + { + "epoch": 0.05526746019771448, + "grad_norm": 3.7512130358105535, + "learning_rate": 1.9897017864824623e-05, + "loss": 1.3967, + "step": 13370 + }, + { + "epoch": 0.05530879711633656, + "grad_norm": 3.438280531829743, + "learning_rate": 1.989683000795876e-05, + "loss": 1.3199, + "step": 13380 + }, + { + "epoch": 0.055350134034958635, + "grad_norm": 3.191658504407269, + "learning_rate": 1.989664198079606e-05, + "loss": 1.3843, + "step": 13390 + }, + { + "epoch": 0.055391470953580706, + "grad_norm": 3.246301259794481, + "learning_rate": 1.989645378333976e-05, + "loss": 1.3823, + "step": 13400 + }, + { + "epoch": 0.05543280787220278, + "grad_norm": 3.209808944542668, + "learning_rate": 1.9896265415593096e-05, + "loss": 1.4023, + "step": 13410 + }, + { + "epoch": 0.05547414479082486, + "grad_norm": 3.8876392074458055, + "learning_rate": 1.989607687755931e-05, + "loss": 1.4021, + "step": 13420 + }, + { + "epoch": 0.05551548170944693, + "grad_norm": 4.6622959130132635, + "learning_rate": 1.9895888169241643e-05, + "loss": 1.3941, + "step": 13430 + }, + { + "epoch": 0.05555681862806901, + "grad_norm": 3.2829449802312243, + "learning_rate": 1.989569929064335e-05, + "loss": 1.3914, + "step": 13440 + }, + { + "epoch": 0.055598155546691085, + "grad_norm": 2.975464909103463, + "learning_rate": 1.989551024176768e-05, + "loss": 1.3393, + "step": 13450 + }, + { + "epoch": 0.055639492465313155, + "grad_norm": 3.3616145226127374, + "learning_rate": 1.9895321022617877e-05, + "loss": 1.3691, + "step": 13460 + }, + { + "epoch": 0.05568082938393523, + "grad_norm": 3.551441103147202, + "learning_rate": 1.9895131633197206e-05, + "loss": 1.3748, + "step": 13470 + }, + { + "epoch": 0.05572216630255731, + "grad_norm": 3.1368088044777838, + "learning_rate": 1.9894942073508924e-05, + "loss": 1.3341, + "step": 13480 + }, + { + "epoch": 0.055763503221179386, + "grad_norm": 2.8747175172948722, + "learning_rate": 1.989475234355629e-05, + "loss": 1.3951, + "step": 13490 + }, + { + "epoch": 0.055804840139801457, + "grad_norm": 3.0419841938845975, + "learning_rate": 1.989456244334257e-05, + "loss": 1.3325, + "step": 13500 + }, + { + "epoch": 0.055846177058423534, + "grad_norm": 3.6672894427510947, + "learning_rate": 1.9894372372871036e-05, + "loss": 1.3847, + "step": 13510 + }, + { + "epoch": 0.05588751397704561, + "grad_norm": 3.3319466434724325, + "learning_rate": 1.989418213214495e-05, + "loss": 1.4007, + "step": 13520 + }, + { + "epoch": 0.05592885089566768, + "grad_norm": 2.756361075189523, + "learning_rate": 1.9893991721167593e-05, + "loss": 1.3962, + "step": 13530 + }, + { + "epoch": 0.05597018781428976, + "grad_norm": 3.2588019424168384, + "learning_rate": 1.989380113994224e-05, + "loss": 1.409, + "step": 13540 + }, + { + "epoch": 0.056011524732911835, + "grad_norm": 3.992120970935661, + "learning_rate": 1.9893610388472162e-05, + "loss": 1.3642, + "step": 13550 + }, + { + "epoch": 0.05605286165153391, + "grad_norm": 3.0454737775385885, + "learning_rate": 1.9893419466760653e-05, + "loss": 1.3696, + "step": 13560 + }, + { + "epoch": 0.05609419857015598, + "grad_norm": 3.1960156109377507, + "learning_rate": 1.9893228374810993e-05, + "loss": 1.3611, + "step": 13570 + }, + { + "epoch": 0.05613553548877806, + "grad_norm": 3.3282542329613496, + "learning_rate": 1.989303711262647e-05, + "loss": 1.3541, + "step": 13580 + }, + { + "epoch": 0.05617687240740014, + "grad_norm": 4.271766406501802, + "learning_rate": 1.9892845680210374e-05, + "loss": 1.3033, + "step": 13590 + }, + { + "epoch": 0.05621820932602221, + "grad_norm": 3.4542215892482964, + "learning_rate": 1.9892654077566003e-05, + "loss": 1.3853, + "step": 13600 + }, + { + "epoch": 0.056259546244644285, + "grad_norm": 5.10198450683926, + "learning_rate": 1.9892462304696653e-05, + "loss": 1.3758, + "step": 13610 + }, + { + "epoch": 0.05630088316326636, + "grad_norm": 4.67424032198832, + "learning_rate": 1.989227036160562e-05, + "loss": 1.3395, + "step": 13620 + }, + { + "epoch": 0.05634222008188844, + "grad_norm": 3.574141696384299, + "learning_rate": 1.989207824829621e-05, + "loss": 1.3953, + "step": 13630 + }, + { + "epoch": 0.05638355700051051, + "grad_norm": 3.4219942324444244, + "learning_rate": 1.989188596477173e-05, + "loss": 1.3493, + "step": 13640 + }, + { + "epoch": 0.056424893919132586, + "grad_norm": 3.135032092895971, + "learning_rate": 1.9891693511035484e-05, + "loss": 1.4203, + "step": 13650 + }, + { + "epoch": 0.05646623083775466, + "grad_norm": 3.0856331845063387, + "learning_rate": 1.989150088709079e-05, + "loss": 1.2854, + "step": 13660 + }, + { + "epoch": 0.056507567756376734, + "grad_norm": 3.9267068067232, + "learning_rate": 1.9891308092940953e-05, + "loss": 1.3701, + "step": 13670 + }, + { + "epoch": 0.05654890467499881, + "grad_norm": 3.8306681224418395, + "learning_rate": 1.98911151285893e-05, + "loss": 1.4076, + "step": 13680 + }, + { + "epoch": 0.05659024159362089, + "grad_norm": 3.306837169963007, + "learning_rate": 1.9890921994039148e-05, + "loss": 1.3873, + "step": 13690 + }, + { + "epoch": 0.056631578512242965, + "grad_norm": 3.2434462335337297, + "learning_rate": 1.989072868929382e-05, + "loss": 1.3531, + "step": 13700 + }, + { + "epoch": 0.056672915430865035, + "grad_norm": 3.3740272090711856, + "learning_rate": 1.989053521435664e-05, + "loss": 1.3772, + "step": 13710 + }, + { + "epoch": 0.05671425234948711, + "grad_norm": 3.141984063033404, + "learning_rate": 1.989034156923094e-05, + "loss": 1.3983, + "step": 13720 + }, + { + "epoch": 0.05675558926810919, + "grad_norm": 2.9680517660305847, + "learning_rate": 1.989014775392005e-05, + "loss": 1.3651, + "step": 13730 + }, + { + "epoch": 0.05679692618673126, + "grad_norm": 3.418890084499366, + "learning_rate": 1.9889953768427313e-05, + "loss": 1.4157, + "step": 13740 + }, + { + "epoch": 0.05683826310535334, + "grad_norm": 3.7254805946590706, + "learning_rate": 1.9889759612756053e-05, + "loss": 1.3979, + "step": 13750 + }, + { + "epoch": 0.056879600023975414, + "grad_norm": 3.617034367391942, + "learning_rate": 1.9889565286909623e-05, + "loss": 1.3549, + "step": 13760 + }, + { + "epoch": 0.056920936942597485, + "grad_norm": 3.8592922160592646, + "learning_rate": 1.9889370790891364e-05, + "loss": 1.4008, + "step": 13770 + }, + { + "epoch": 0.05696227386121956, + "grad_norm": 3.510616141867297, + "learning_rate": 1.9889176124704616e-05, + "loss": 1.4071, + "step": 13780 + }, + { + "epoch": 0.05700361077984164, + "grad_norm": 3.5434621547794105, + "learning_rate": 1.9888981288352736e-05, + "loss": 1.3782, + "step": 13790 + }, + { + "epoch": 0.057044947698463716, + "grad_norm": 3.0056956686627117, + "learning_rate": 1.988878628183907e-05, + "loss": 1.352, + "step": 13800 + }, + { + "epoch": 0.057086284617085786, + "grad_norm": 2.876862066794774, + "learning_rate": 1.9888591105166984e-05, + "loss": 1.3451, + "step": 13810 + }, + { + "epoch": 0.05712762153570786, + "grad_norm": 3.6994718345443776, + "learning_rate": 1.9888395758339823e-05, + "loss": 1.3711, + "step": 13820 + }, + { + "epoch": 0.05716895845432994, + "grad_norm": 3.7019952550478052, + "learning_rate": 1.988820024136096e-05, + "loss": 1.3542, + "step": 13830 + }, + { + "epoch": 0.05721029537295201, + "grad_norm": 3.149511937310367, + "learning_rate": 1.9888004554233757e-05, + "loss": 1.3498, + "step": 13840 + }, + { + "epoch": 0.05725163229157409, + "grad_norm": 3.3687656057902298, + "learning_rate": 1.9887808696961574e-05, + "loss": 1.3759, + "step": 13850 + }, + { + "epoch": 0.057292969210196165, + "grad_norm": 3.286956265957372, + "learning_rate": 1.988761266954779e-05, + "loss": 1.3416, + "step": 13860 + }, + { + "epoch": 0.05733430612881824, + "grad_norm": 3.8546233434442025, + "learning_rate": 1.988741647199577e-05, + "loss": 1.3601, + "step": 13870 + }, + { + "epoch": 0.05737564304744031, + "grad_norm": 4.043137550642432, + "learning_rate": 1.98872201043089e-05, + "loss": 1.4019, + "step": 13880 + }, + { + "epoch": 0.05741697996606239, + "grad_norm": 4.6033780589634805, + "learning_rate": 1.988702356649055e-05, + "loss": 1.406, + "step": 13890 + }, + { + "epoch": 0.05745831688468447, + "grad_norm": 3.4533366026357135, + "learning_rate": 1.9886826858544103e-05, + "loss": 1.3579, + "step": 13900 + }, + { + "epoch": 0.05749965380330654, + "grad_norm": 4.2918223423561725, + "learning_rate": 1.9886629980472945e-05, + "loss": 1.3238, + "step": 13910 + }, + { + "epoch": 0.057540990721928614, + "grad_norm": 3.1786603013459667, + "learning_rate": 1.988643293228047e-05, + "loss": 1.3815, + "step": 13920 + }, + { + "epoch": 0.05758232764055069, + "grad_norm": 4.187787674938507, + "learning_rate": 1.988623571397006e-05, + "loss": 1.3129, + "step": 13930 + }, + { + "epoch": 0.05762366455917277, + "grad_norm": 3.2066247160025956, + "learning_rate": 1.9886038325545112e-05, + "loss": 1.3604, + "step": 13940 + }, + { + "epoch": 0.05766500147779484, + "grad_norm": 4.137189558470061, + "learning_rate": 1.9885840767009023e-05, + "loss": 1.3683, + "step": 13950 + }, + { + "epoch": 0.057706338396416916, + "grad_norm": 3.21825868230931, + "learning_rate": 1.988564303836519e-05, + "loss": 1.3521, + "step": 13960 + }, + { + "epoch": 0.05774767531503899, + "grad_norm": 3.5331562264396097, + "learning_rate": 1.9885445139617018e-05, + "loss": 1.4079, + "step": 13970 + }, + { + "epoch": 0.05778901223366106, + "grad_norm": 3.1970430005062607, + "learning_rate": 1.9885247070767915e-05, + "loss": 1.3688, + "step": 13980 + }, + { + "epoch": 0.05783034915228314, + "grad_norm": 4.27476021372676, + "learning_rate": 1.988504883182128e-05, + "loss": 1.364, + "step": 13990 + }, + { + "epoch": 0.05787168607090522, + "grad_norm": 3.2156024105612278, + "learning_rate": 1.9884850422780534e-05, + "loss": 1.3814, + "step": 14000 + }, + { + "epoch": 0.057913022989527295, + "grad_norm": 3.6998253526421565, + "learning_rate": 1.9884651843649083e-05, + "loss": 1.3698, + "step": 14010 + }, + { + "epoch": 0.057954359908149365, + "grad_norm": 4.503662250473274, + "learning_rate": 1.988445309443035e-05, + "loss": 1.3564, + "step": 14020 + }, + { + "epoch": 0.05799569682677144, + "grad_norm": 3.762384040491392, + "learning_rate": 1.9884254175127754e-05, + "loss": 1.4119, + "step": 14030 + }, + { + "epoch": 0.05803703374539352, + "grad_norm": 3.285301388684364, + "learning_rate": 1.9884055085744713e-05, + "loss": 1.3501, + "step": 14040 + }, + { + "epoch": 0.05807837066401559, + "grad_norm": 4.336832986530797, + "learning_rate": 1.9883855826284656e-05, + "loss": 1.3662, + "step": 14050 + }, + { + "epoch": 0.05811970758263767, + "grad_norm": 3.4574445488885734, + "learning_rate": 1.9883656396751016e-05, + "loss": 1.3127, + "step": 14060 + }, + { + "epoch": 0.058161044501259744, + "grad_norm": 3.4098914920099883, + "learning_rate": 1.988345679714722e-05, + "loss": 1.391, + "step": 14070 + }, + { + "epoch": 0.058202381419881814, + "grad_norm": 3.6614081585424603, + "learning_rate": 1.98832570274767e-05, + "loss": 1.3659, + "step": 14080 + }, + { + "epoch": 0.05824371833850389, + "grad_norm": 3.276861233677139, + "learning_rate": 1.98830570877429e-05, + "loss": 1.3559, + "step": 14090 + }, + { + "epoch": 0.05828505525712597, + "grad_norm": 3.4536947240708997, + "learning_rate": 1.9882856977949257e-05, + "loss": 1.3779, + "step": 14100 + }, + { + "epoch": 0.058326392175748046, + "grad_norm": 3.242988394736396, + "learning_rate": 1.9882656698099213e-05, + "loss": 1.3353, + "step": 14110 + }, + { + "epoch": 0.058367729094370116, + "grad_norm": 3.033285353432935, + "learning_rate": 1.9882456248196216e-05, + "loss": 1.3831, + "step": 14120 + }, + { + "epoch": 0.05840906601299219, + "grad_norm": 2.9840093106321173, + "learning_rate": 1.9882255628243715e-05, + "loss": 1.399, + "step": 14130 + }, + { + "epoch": 0.05845040293161427, + "grad_norm": 3.424871961043564, + "learning_rate": 1.9882054838245158e-05, + "loss": 1.3774, + "step": 14140 + }, + { + "epoch": 0.05849173985023634, + "grad_norm": 4.206811562034524, + "learning_rate": 1.988185387820401e-05, + "loss": 1.3768, + "step": 14150 + }, + { + "epoch": 0.05853307676885842, + "grad_norm": 3.167196829826696, + "learning_rate": 1.9881652748123723e-05, + "loss": 1.3118, + "step": 14160 + }, + { + "epoch": 0.058574413687480495, + "grad_norm": 3.2647863004270583, + "learning_rate": 1.9881451448007752e-05, + "loss": 1.359, + "step": 14170 + }, + { + "epoch": 0.05861575060610257, + "grad_norm": 3.2249069204445506, + "learning_rate": 1.988124997785957e-05, + "loss": 1.3594, + "step": 14180 + }, + { + "epoch": 0.05865708752472464, + "grad_norm": 3.125833926204158, + "learning_rate": 1.9881048337682644e-05, + "loss": 1.3729, + "step": 14190 + }, + { + "epoch": 0.05869842444334672, + "grad_norm": 3.124857727130482, + "learning_rate": 1.9880846527480434e-05, + "loss": 1.3968, + "step": 14200 + }, + { + "epoch": 0.0587397613619688, + "grad_norm": 3.5324983045866416, + "learning_rate": 1.988064454725642e-05, + "loss": 1.3744, + "step": 14210 + }, + { + "epoch": 0.05878109828059087, + "grad_norm": 3.534173243273415, + "learning_rate": 1.9880442397014082e-05, + "loss": 1.3858, + "step": 14220 + }, + { + "epoch": 0.058822435199212944, + "grad_norm": 4.522930564227188, + "learning_rate": 1.9880240076756885e-05, + "loss": 1.3365, + "step": 14230 + }, + { + "epoch": 0.05886377211783502, + "grad_norm": 2.911847199383502, + "learning_rate": 1.9880037586488324e-05, + "loss": 1.3629, + "step": 14240 + }, + { + "epoch": 0.0589051090364571, + "grad_norm": 3.467653251925633, + "learning_rate": 1.9879834926211875e-05, + "loss": 1.3839, + "step": 14250 + }, + { + "epoch": 0.05894644595507917, + "grad_norm": 3.7166470032659618, + "learning_rate": 1.9879632095931024e-05, + "loss": 1.3358, + "step": 14260 + }, + { + "epoch": 0.058987782873701246, + "grad_norm": 3.7077022813203193, + "learning_rate": 1.987942909564927e-05, + "loss": 1.3474, + "step": 14270 + }, + { + "epoch": 0.05902911979232332, + "grad_norm": 3.1733151465540916, + "learning_rate": 1.9879225925370094e-05, + "loss": 1.3881, + "step": 14280 + }, + { + "epoch": 0.05907045671094539, + "grad_norm": 3.0349885556128378, + "learning_rate": 1.9879022585097005e-05, + "loss": 1.3686, + "step": 14290 + }, + { + "epoch": 0.05911179362956747, + "grad_norm": 3.2620109103700363, + "learning_rate": 1.9878819074833493e-05, + "loss": 1.3588, + "step": 14300 + }, + { + "epoch": 0.05915313054818955, + "grad_norm": 3.7133352574339873, + "learning_rate": 1.9878615394583062e-05, + "loss": 1.34, + "step": 14310 + }, + { + "epoch": 0.059194467466811625, + "grad_norm": 3.1624289931149114, + "learning_rate": 1.987841154434922e-05, + "loss": 1.3334, + "step": 14320 + }, + { + "epoch": 0.059235804385433695, + "grad_norm": 3.7047396553657643, + "learning_rate": 1.9878207524135468e-05, + "loss": 1.3375, + "step": 14330 + }, + { + "epoch": 0.05927714130405577, + "grad_norm": 3.3119519535402, + "learning_rate": 1.9878003333945325e-05, + "loss": 1.3537, + "step": 14340 + }, + { + "epoch": 0.05931847822267785, + "grad_norm": 3.3812187643072105, + "learning_rate": 1.98777989737823e-05, + "loss": 1.3374, + "step": 14350 + }, + { + "epoch": 0.05935981514129992, + "grad_norm": 3.299892572730851, + "learning_rate": 1.9877594443649902e-05, + "loss": 1.3704, + "step": 14360 + }, + { + "epoch": 0.059401152059922, + "grad_norm": 4.081245562156265, + "learning_rate": 1.9877389743551668e-05, + "loss": 1.3498, + "step": 14370 + }, + { + "epoch": 0.059442488978544074, + "grad_norm": 3.350158600172311, + "learning_rate": 1.9877184873491102e-05, + "loss": 1.3449, + "step": 14380 + }, + { + "epoch": 0.05948382589716615, + "grad_norm": 3.714541324538004, + "learning_rate": 1.9876979833471742e-05, + "loss": 1.3874, + "step": 14390 + }, + { + "epoch": 0.05952516281578822, + "grad_norm": 3.274100022702722, + "learning_rate": 1.9876774623497112e-05, + "loss": 1.3582, + "step": 14400 + }, + { + "epoch": 0.0595664997344103, + "grad_norm": 2.9329187414523425, + "learning_rate": 1.9876569243570742e-05, + "loss": 1.3901, + "step": 14410 + }, + { + "epoch": 0.059607836653032376, + "grad_norm": 3.73192299633757, + "learning_rate": 1.9876363693696166e-05, + "loss": 1.3898, + "step": 14420 + }, + { + "epoch": 0.059649173571654446, + "grad_norm": 3.2578219560804196, + "learning_rate": 1.987615797387692e-05, + "loss": 1.371, + "step": 14430 + }, + { + "epoch": 0.05969051049027652, + "grad_norm": 3.2376808673712807, + "learning_rate": 1.9875952084116548e-05, + "loss": 1.336, + "step": 14440 + }, + { + "epoch": 0.0597318474088986, + "grad_norm": 3.5969021652399253, + "learning_rate": 1.987574602441859e-05, + "loss": 1.3862, + "step": 14450 + }, + { + "epoch": 0.05977318432752067, + "grad_norm": 3.1882930317103844, + "learning_rate": 1.9875539794786593e-05, + "loss": 1.3734, + "step": 14460 + }, + { + "epoch": 0.05981452124614275, + "grad_norm": 3.147605709223492, + "learning_rate": 1.9875333395224102e-05, + "loss": 1.3739, + "step": 14470 + }, + { + "epoch": 0.059855858164764825, + "grad_norm": 3.4276761969558645, + "learning_rate": 1.9875126825734673e-05, + "loss": 1.3301, + "step": 14480 + }, + { + "epoch": 0.0598971950833869, + "grad_norm": 3.4226682382169997, + "learning_rate": 1.987492008632186e-05, + "loss": 1.3747, + "step": 14490 + }, + { + "epoch": 0.05993853200200897, + "grad_norm": 3.471596558825063, + "learning_rate": 1.987471317698922e-05, + "loss": 1.3349, + "step": 14500 + }, + { + "epoch": 0.05997986892063105, + "grad_norm": 3.676413482025792, + "learning_rate": 1.9874506097740308e-05, + "loss": 1.3963, + "step": 14510 + }, + { + "epoch": 0.060021205839253126, + "grad_norm": 3.2543359739910267, + "learning_rate": 1.9874298848578696e-05, + "loss": 1.3334, + "step": 14520 + }, + { + "epoch": 0.0600625427578752, + "grad_norm": 3.2372261281319377, + "learning_rate": 1.9874091429507943e-05, + "loss": 1.3367, + "step": 14530 + }, + { + "epoch": 0.060103879676497274, + "grad_norm": 3.3746909228055397, + "learning_rate": 1.987388384053162e-05, + "loss": 1.376, + "step": 14540 + }, + { + "epoch": 0.06014521659511935, + "grad_norm": 2.921092227637486, + "learning_rate": 1.9873676081653302e-05, + "loss": 1.3715, + "step": 14550 + }, + { + "epoch": 0.06018655351374143, + "grad_norm": 2.9872735994422417, + "learning_rate": 1.9873468152876563e-05, + "loss": 1.3457, + "step": 14560 + }, + { + "epoch": 0.0602278904323635, + "grad_norm": 3.713230272104003, + "learning_rate": 1.9873260054204978e-05, + "loss": 1.328, + "step": 14570 + }, + { + "epoch": 0.060269227350985576, + "grad_norm": 3.3242217719152496, + "learning_rate": 1.9873051785642134e-05, + "loss": 1.3433, + "step": 14580 + }, + { + "epoch": 0.06031056426960765, + "grad_norm": 3.3594706223759143, + "learning_rate": 1.9872843347191607e-05, + "loss": 1.4027, + "step": 14590 + }, + { + "epoch": 0.06035190118822972, + "grad_norm": 3.1573056170670846, + "learning_rate": 1.9872634738856987e-05, + "loss": 1.3798, + "step": 14600 + }, + { + "epoch": 0.0603932381068518, + "grad_norm": 3.076626392806199, + "learning_rate": 1.9872425960641863e-05, + "loss": 1.3581, + "step": 14610 + }, + { + "epoch": 0.06043457502547388, + "grad_norm": 3.468364476739333, + "learning_rate": 1.987221701254983e-05, + "loss": 1.3675, + "step": 14620 + }, + { + "epoch": 0.060475911944095954, + "grad_norm": 4.079855852909671, + "learning_rate": 1.987200789458448e-05, + "loss": 1.3197, + "step": 14630 + }, + { + "epoch": 0.060517248862718025, + "grad_norm": 4.960368430326063, + "learning_rate": 1.9871798606749415e-05, + "loss": 1.4018, + "step": 14640 + }, + { + "epoch": 0.0605585857813401, + "grad_norm": 3.1263141804205272, + "learning_rate": 1.9871589149048232e-05, + "loss": 1.4034, + "step": 14650 + }, + { + "epoch": 0.06059992269996218, + "grad_norm": 3.0030045708337876, + "learning_rate": 1.9871379521484538e-05, + "loss": 1.314, + "step": 14660 + }, + { + "epoch": 0.06064125961858425, + "grad_norm": 4.171309905380893, + "learning_rate": 1.987116972406194e-05, + "loss": 1.3454, + "step": 14670 + }, + { + "epoch": 0.060682596537206326, + "grad_norm": 3.678722232531344, + "learning_rate": 1.9870959756784044e-05, + "loss": 1.3644, + "step": 14680 + }, + { + "epoch": 0.060723933455828404, + "grad_norm": 3.5114052948471164, + "learning_rate": 1.987074961965447e-05, + "loss": 1.3446, + "step": 14690 + }, + { + "epoch": 0.06076527037445048, + "grad_norm": 3.3802619453865637, + "learning_rate": 1.987053931267683e-05, + "loss": 1.3603, + "step": 14700 + }, + { + "epoch": 0.06080660729307255, + "grad_norm": 3.239888006176567, + "learning_rate": 1.9870328835854743e-05, + "loss": 1.3263, + "step": 14710 + }, + { + "epoch": 0.06084794421169463, + "grad_norm": 2.7923482855439716, + "learning_rate": 1.9870118189191833e-05, + "loss": 1.3532, + "step": 14720 + }, + { + "epoch": 0.060889281130316705, + "grad_norm": 3.672374432583175, + "learning_rate": 1.9869907372691715e-05, + "loss": 1.3749, + "step": 14730 + }, + { + "epoch": 0.060930618048938776, + "grad_norm": 3.1371007241226017, + "learning_rate": 1.9869696386358032e-05, + "loss": 1.3529, + "step": 14740 + }, + { + "epoch": 0.06097195496756085, + "grad_norm": 3.5064852286924837, + "learning_rate": 1.9869485230194403e-05, + "loss": 1.3664, + "step": 14750 + }, + { + "epoch": 0.06101329188618293, + "grad_norm": 3.9587666778347073, + "learning_rate": 1.9869273904204465e-05, + "loss": 1.3847, + "step": 14760 + }, + { + "epoch": 0.061054628804805, + "grad_norm": 3.2129303491061973, + "learning_rate": 1.9869062408391855e-05, + "loss": 1.3625, + "step": 14770 + }, + { + "epoch": 0.06109596572342708, + "grad_norm": 3.1860777109810465, + "learning_rate": 1.9868850742760212e-05, + "loss": 1.3062, + "step": 14780 + }, + { + "epoch": 0.061137302642049154, + "grad_norm": 3.47772048599133, + "learning_rate": 1.9868638907313174e-05, + "loss": 1.3487, + "step": 14790 + }, + { + "epoch": 0.06117863956067123, + "grad_norm": 3.323868913803053, + "learning_rate": 1.9868426902054394e-05, + "loss": 1.3304, + "step": 14800 + }, + { + "epoch": 0.0612199764792933, + "grad_norm": 3.4796197385612735, + "learning_rate": 1.9868214726987513e-05, + "loss": 1.3143, + "step": 14810 + }, + { + "epoch": 0.06126131339791538, + "grad_norm": 3.8413485871243402, + "learning_rate": 1.9868002382116186e-05, + "loss": 1.3451, + "step": 14820 + }, + { + "epoch": 0.061302650316537456, + "grad_norm": 3.3376163822500278, + "learning_rate": 1.9867789867444066e-05, + "loss": 1.3486, + "step": 14830 + }, + { + "epoch": 0.061343987235159526, + "grad_norm": 3.3554967862457543, + "learning_rate": 1.9867577182974807e-05, + "loss": 1.3447, + "step": 14840 + }, + { + "epoch": 0.061385324153781604, + "grad_norm": 3.0553156261560757, + "learning_rate": 1.9867364328712074e-05, + "loss": 1.3436, + "step": 14850 + }, + { + "epoch": 0.06142666107240368, + "grad_norm": 3.9386923791793027, + "learning_rate": 1.9867151304659527e-05, + "loss": 1.3719, + "step": 14860 + }, + { + "epoch": 0.06146799799102576, + "grad_norm": 3.915463974315291, + "learning_rate": 1.986693811082083e-05, + "loss": 1.328, + "step": 14870 + }, + { + "epoch": 0.06150933490964783, + "grad_norm": 2.800179512440139, + "learning_rate": 1.986672474719965e-05, + "loss": 1.322, + "step": 14880 + }, + { + "epoch": 0.061550671828269905, + "grad_norm": 3.3411218270323664, + "learning_rate": 1.9866511213799665e-05, + "loss": 1.3899, + "step": 14890 + }, + { + "epoch": 0.06159200874689198, + "grad_norm": 3.323705843953213, + "learning_rate": 1.9866297510624544e-05, + "loss": 1.3615, + "step": 14900 + }, + { + "epoch": 0.06163334566551405, + "grad_norm": 4.754040048447529, + "learning_rate": 1.9866083637677963e-05, + "loss": 1.3726, + "step": 14910 + }, + { + "epoch": 0.06167468258413613, + "grad_norm": 3.45837680678418, + "learning_rate": 1.9865869594963607e-05, + "loss": 1.3519, + "step": 14920 + }, + { + "epoch": 0.06171601950275821, + "grad_norm": 3.183067903396876, + "learning_rate": 1.986565538248516e-05, + "loss": 1.3584, + "step": 14930 + }, + { + "epoch": 0.061757356421380284, + "grad_norm": 3.8419649238240656, + "learning_rate": 1.98654410002463e-05, + "loss": 1.3698, + "step": 14940 + }, + { + "epoch": 0.061798693340002354, + "grad_norm": 3.3977765733638168, + "learning_rate": 1.9865226448250725e-05, + "loss": 1.3702, + "step": 14950 + }, + { + "epoch": 0.06184003025862443, + "grad_norm": 3.5010199408218305, + "learning_rate": 1.9865011726502118e-05, + "loss": 1.3515, + "step": 14960 + }, + { + "epoch": 0.06188136717724651, + "grad_norm": 3.521969897290798, + "learning_rate": 1.9864796835004184e-05, + "loss": 1.3562, + "step": 14970 + }, + { + "epoch": 0.06192270409586858, + "grad_norm": 3.198260891262558, + "learning_rate": 1.986458177376061e-05, + "loss": 1.3265, + "step": 14980 + }, + { + "epoch": 0.061964041014490656, + "grad_norm": 4.203288617287408, + "learning_rate": 1.9864366542775104e-05, + "loss": 1.3445, + "step": 14990 + }, + { + "epoch": 0.06200537793311273, + "grad_norm": 3.0231024395080204, + "learning_rate": 1.9864151142051367e-05, + "loss": 1.3437, + "step": 15000 + }, + { + "epoch": 0.06204671485173481, + "grad_norm": 3.375301699368925, + "learning_rate": 1.9863935571593104e-05, + "loss": 1.3587, + "step": 15010 + }, + { + "epoch": 0.06208805177035688, + "grad_norm": 3.3942212627441433, + "learning_rate": 1.986371983140403e-05, + "loss": 1.3574, + "step": 15020 + }, + { + "epoch": 0.06212938868897896, + "grad_norm": 2.8229502868870906, + "learning_rate": 1.986350392148785e-05, + "loss": 1.3252, + "step": 15030 + }, + { + "epoch": 0.062170725607601035, + "grad_norm": 3.531378883228978, + "learning_rate": 1.9863287841848283e-05, + "loss": 1.3284, + "step": 15040 + }, + { + "epoch": 0.062212062526223105, + "grad_norm": 3.4111136482320057, + "learning_rate": 1.986307159248905e-05, + "loss": 1.3503, + "step": 15050 + }, + { + "epoch": 0.06225339944484518, + "grad_norm": 3.1924697026460525, + "learning_rate": 1.9862855173413864e-05, + "loss": 1.3316, + "step": 15060 + }, + { + "epoch": 0.06229473636346726, + "grad_norm": 3.673294835187914, + "learning_rate": 1.9862638584626456e-05, + "loss": 1.378, + "step": 15070 + }, + { + "epoch": 0.06233607328208933, + "grad_norm": 2.944502490931273, + "learning_rate": 1.9862421826130548e-05, + "loss": 1.3505, + "step": 15080 + }, + { + "epoch": 0.06237741020071141, + "grad_norm": 3.166457773127537, + "learning_rate": 1.9862204897929875e-05, + "loss": 1.3274, + "step": 15090 + }, + { + "epoch": 0.062418747119333484, + "grad_norm": 3.539140440652841, + "learning_rate": 1.9861987800028167e-05, + "loss": 1.3373, + "step": 15100 + }, + { + "epoch": 0.06246008403795556, + "grad_norm": 4.111976580182342, + "learning_rate": 1.986177053242916e-05, + "loss": 1.3795, + "step": 15110 + }, + { + "epoch": 0.06250142095657764, + "grad_norm": 3.518730561598167, + "learning_rate": 1.986155309513659e-05, + "loss": 1.3284, + "step": 15120 + }, + { + "epoch": 0.06254275787519971, + "grad_norm": 3.391425624844218, + "learning_rate": 1.9861335488154206e-05, + "loss": 1.3587, + "step": 15130 + }, + { + "epoch": 0.06258409479382178, + "grad_norm": 3.4961442083301097, + "learning_rate": 1.9861117711485743e-05, + "loss": 1.399, + "step": 15140 + }, + { + "epoch": 0.06262543171244386, + "grad_norm": 2.7302506739470784, + "learning_rate": 1.9860899765134953e-05, + "loss": 1.3654, + "step": 15150 + }, + { + "epoch": 0.06266676863106593, + "grad_norm": 4.411982690984872, + "learning_rate": 1.9860681649105585e-05, + "loss": 1.3409, + "step": 15160 + }, + { + "epoch": 0.062708105549688, + "grad_norm": 3.671422586435062, + "learning_rate": 1.9860463363401393e-05, + "loss": 1.3629, + "step": 15170 + }, + { + "epoch": 0.06274944246831009, + "grad_norm": 3.599702261798689, + "learning_rate": 1.9860244908026133e-05, + "loss": 1.3464, + "step": 15180 + }, + { + "epoch": 0.06279077938693216, + "grad_norm": 3.6962151021925598, + "learning_rate": 1.9860026282983568e-05, + "loss": 1.362, + "step": 15190 + }, + { + "epoch": 0.06283211630555424, + "grad_norm": 3.5740776610173284, + "learning_rate": 1.9859807488277453e-05, + "loss": 1.3657, + "step": 15200 + }, + { + "epoch": 0.06287345322417631, + "grad_norm": 3.437476924912017, + "learning_rate": 1.9859588523911554e-05, + "loss": 1.3384, + "step": 15210 + }, + { + "epoch": 0.06291479014279838, + "grad_norm": 3.2699157716406373, + "learning_rate": 1.9859369389889642e-05, + "loss": 1.3658, + "step": 15220 + }, + { + "epoch": 0.06295612706142047, + "grad_norm": 3.123305029392182, + "learning_rate": 1.9859150086215487e-05, + "loss": 1.352, + "step": 15230 + }, + { + "epoch": 0.06299746398004254, + "grad_norm": 3.2830928690839354, + "learning_rate": 1.985893061289286e-05, + "loss": 1.3799, + "step": 15240 + }, + { + "epoch": 0.06303880089866461, + "grad_norm": 3.664715842390073, + "learning_rate": 1.9858710969925547e-05, + "loss": 1.3669, + "step": 15250 + }, + { + "epoch": 0.06308013781728669, + "grad_norm": 3.2013150011378952, + "learning_rate": 1.985849115731731e-05, + "loss": 1.3403, + "step": 15260 + }, + { + "epoch": 0.06312147473590876, + "grad_norm": 3.1107330520735643, + "learning_rate": 1.9858271175071946e-05, + "loss": 1.348, + "step": 15270 + }, + { + "epoch": 0.06316281165453083, + "grad_norm": 3.188091096811774, + "learning_rate": 1.9858051023193234e-05, + "loss": 1.3219, + "step": 15280 + }, + { + "epoch": 0.06320414857315292, + "grad_norm": 3.1313626953852705, + "learning_rate": 1.9857830701684967e-05, + "loss": 1.3622, + "step": 15290 + }, + { + "epoch": 0.06324548549177499, + "grad_norm": 3.1702519850910127, + "learning_rate": 1.985761021055093e-05, + "loss": 1.3546, + "step": 15300 + }, + { + "epoch": 0.06328682241039706, + "grad_norm": 3.252596544143132, + "learning_rate": 1.9857389549794917e-05, + "loss": 1.2552, + "step": 15310 + }, + { + "epoch": 0.06332815932901914, + "grad_norm": 3.0313555049861374, + "learning_rate": 1.985716871942073e-05, + "loss": 1.4033, + "step": 15320 + }, + { + "epoch": 0.06336949624764121, + "grad_norm": 3.4443767622891115, + "learning_rate": 1.985694771943217e-05, + "loss": 1.3893, + "step": 15330 + }, + { + "epoch": 0.06341083316626328, + "grad_norm": 3.616984395155513, + "learning_rate": 1.9856726549833034e-05, + "loss": 1.3499, + "step": 15340 + }, + { + "epoch": 0.06345217008488536, + "grad_norm": 3.50828559499885, + "learning_rate": 1.985650521062713e-05, + "loss": 1.3298, + "step": 15350 + }, + { + "epoch": 0.06349350700350744, + "grad_norm": 2.8467914617151244, + "learning_rate": 1.9856283701818268e-05, + "loss": 1.3307, + "step": 15360 + }, + { + "epoch": 0.06353484392212952, + "grad_norm": 4.069010272907543, + "learning_rate": 1.9856062023410257e-05, + "loss": 1.3341, + "step": 15370 + }, + { + "epoch": 0.06357618084075159, + "grad_norm": 3.554716298604244, + "learning_rate": 1.985584017540691e-05, + "loss": 1.3366, + "step": 15380 + }, + { + "epoch": 0.06361751775937366, + "grad_norm": 3.2437195929255123, + "learning_rate": 1.985561815781205e-05, + "loss": 1.3241, + "step": 15390 + }, + { + "epoch": 0.06365885467799574, + "grad_norm": 3.0814014462588495, + "learning_rate": 1.9855395970629497e-05, + "loss": 1.3086, + "step": 15400 + }, + { + "epoch": 0.06370019159661781, + "grad_norm": 2.845186766341529, + "learning_rate": 1.985517361386307e-05, + "loss": 1.3302, + "step": 15410 + }, + { + "epoch": 0.06374152851523988, + "grad_norm": 3.594783956612939, + "learning_rate": 1.9854951087516598e-05, + "loss": 1.3374, + "step": 15420 + }, + { + "epoch": 0.06378286543386197, + "grad_norm": 2.8854977362042815, + "learning_rate": 1.9854728391593904e-05, + "loss": 1.3326, + "step": 15430 + }, + { + "epoch": 0.06382420235248404, + "grad_norm": 3.5259935395974784, + "learning_rate": 1.985450552609883e-05, + "loss": 1.3081, + "step": 15440 + }, + { + "epoch": 0.06386553927110611, + "grad_norm": 3.245306593778846, + "learning_rate": 1.9854282491035203e-05, + "loss": 1.3746, + "step": 15450 + }, + { + "epoch": 0.06390687618972819, + "grad_norm": 4.071308549266746, + "learning_rate": 1.9854059286406866e-05, + "loss": 1.3783, + "step": 15460 + }, + { + "epoch": 0.06394821310835026, + "grad_norm": 3.247822958885961, + "learning_rate": 1.9853835912217657e-05, + "loss": 1.3411, + "step": 15470 + }, + { + "epoch": 0.06398955002697233, + "grad_norm": 3.35088921885468, + "learning_rate": 1.9853612368471416e-05, + "loss": 1.3769, + "step": 15480 + }, + { + "epoch": 0.06403088694559442, + "grad_norm": 3.311416405062516, + "learning_rate": 1.9853388655171998e-05, + "loss": 1.3546, + "step": 15490 + }, + { + "epoch": 0.06407222386421649, + "grad_norm": 3.423751956404864, + "learning_rate": 1.985316477232325e-05, + "loss": 1.3082, + "step": 15500 + }, + { + "epoch": 0.06411356078283857, + "grad_norm": 3.5994672319763072, + "learning_rate": 1.9852940719929017e-05, + "loss": 1.308, + "step": 15510 + }, + { + "epoch": 0.06415489770146064, + "grad_norm": 6.092109341719527, + "learning_rate": 1.9852716497993164e-05, + "loss": 1.3333, + "step": 15520 + }, + { + "epoch": 0.06419623462008271, + "grad_norm": 3.4870247799249574, + "learning_rate": 1.985249210651954e-05, + "loss": 1.3755, + "step": 15530 + }, + { + "epoch": 0.0642375715387048, + "grad_norm": 3.1557901036816687, + "learning_rate": 1.9852267545512016e-05, + "loss": 1.3237, + "step": 15540 + }, + { + "epoch": 0.06427890845732687, + "grad_norm": 2.9168599988216655, + "learning_rate": 1.9852042814974448e-05, + "loss": 1.3333, + "step": 15550 + }, + { + "epoch": 0.06432024537594894, + "grad_norm": 3.658350430782456, + "learning_rate": 1.9851817914910707e-05, + "loss": 1.3157, + "step": 15560 + }, + { + "epoch": 0.06436158229457102, + "grad_norm": 2.98123289160538, + "learning_rate": 1.9851592845324664e-05, + "loss": 1.3461, + "step": 15570 + }, + { + "epoch": 0.06440291921319309, + "grad_norm": 3.2851248740018133, + "learning_rate": 1.9851367606220187e-05, + "loss": 1.3592, + "step": 15580 + }, + { + "epoch": 0.06444425613181516, + "grad_norm": 2.874060228764119, + "learning_rate": 1.9851142197601157e-05, + "loss": 1.3179, + "step": 15590 + }, + { + "epoch": 0.06448559305043725, + "grad_norm": 3.3595179685654286, + "learning_rate": 1.985091661947145e-05, + "loss": 1.358, + "step": 15600 + }, + { + "epoch": 0.06452692996905932, + "grad_norm": 4.343402000280312, + "learning_rate": 1.9850690871834945e-05, + "loss": 1.3387, + "step": 15610 + }, + { + "epoch": 0.06456826688768139, + "grad_norm": 4.4028919030557345, + "learning_rate": 1.985046495469553e-05, + "loss": 1.3405, + "step": 15620 + }, + { + "epoch": 0.06460960380630347, + "grad_norm": 3.3683021821213077, + "learning_rate": 1.9850238868057097e-05, + "loss": 1.3164, + "step": 15630 + }, + { + "epoch": 0.06465094072492554, + "grad_norm": 3.223018835339703, + "learning_rate": 1.9850012611923527e-05, + "loss": 1.2937, + "step": 15640 + }, + { + "epoch": 0.06469227764354761, + "grad_norm": 3.6479375571543584, + "learning_rate": 1.984978618629872e-05, + "loss": 1.3703, + "step": 15650 + }, + { + "epoch": 0.0647336145621697, + "grad_norm": 3.0694724692776107, + "learning_rate": 1.9849559591186566e-05, + "loss": 1.3239, + "step": 15660 + }, + { + "epoch": 0.06477495148079176, + "grad_norm": 3.595788633474915, + "learning_rate": 1.984933282659097e-05, + "loss": 1.2858, + "step": 15670 + }, + { + "epoch": 0.06481628839941385, + "grad_norm": 2.996464746206387, + "learning_rate": 1.984910589251583e-05, + "loss": 1.3677, + "step": 15680 + }, + { + "epoch": 0.06485762531803592, + "grad_norm": 2.978010213666374, + "learning_rate": 1.9848878788965053e-05, + "loss": 1.3612, + "step": 15690 + }, + { + "epoch": 0.06489896223665799, + "grad_norm": 4.107355827588679, + "learning_rate": 1.9848651515942545e-05, + "loss": 1.3473, + "step": 15700 + }, + { + "epoch": 0.06494029915528007, + "grad_norm": 3.405862899284087, + "learning_rate": 1.984842407345222e-05, + "loss": 1.329, + "step": 15710 + }, + { + "epoch": 0.06498163607390214, + "grad_norm": 3.56806926046013, + "learning_rate": 1.984819646149799e-05, + "loss": 1.3547, + "step": 15720 + }, + { + "epoch": 0.06502297299252421, + "grad_norm": 2.9244416633025234, + "learning_rate": 1.984796868008377e-05, + "loss": 1.3451, + "step": 15730 + }, + { + "epoch": 0.0650643099111463, + "grad_norm": 3.6605360363216133, + "learning_rate": 1.984774072921348e-05, + "loss": 1.3121, + "step": 15740 + }, + { + "epoch": 0.06510564682976837, + "grad_norm": 3.857082230167186, + "learning_rate": 1.9847512608891046e-05, + "loss": 1.3546, + "step": 15750 + }, + { + "epoch": 0.06514698374839044, + "grad_norm": 3.20861184076794, + "learning_rate": 1.9847284319120386e-05, + "loss": 1.3384, + "step": 15760 + }, + { + "epoch": 0.06518832066701252, + "grad_norm": 4.374471400346774, + "learning_rate": 1.9847055859905434e-05, + "loss": 1.3603, + "step": 15770 + }, + { + "epoch": 0.06522965758563459, + "grad_norm": 3.558128348565767, + "learning_rate": 1.984682723125012e-05, + "loss": 1.3307, + "step": 15780 + }, + { + "epoch": 0.06527099450425666, + "grad_norm": 3.740975960278226, + "learning_rate": 1.984659843315838e-05, + "loss": 1.3565, + "step": 15790 + }, + { + "epoch": 0.06531233142287875, + "grad_norm": 3.0884764960813254, + "learning_rate": 1.9846369465634146e-05, + "loss": 1.3371, + "step": 15800 + }, + { + "epoch": 0.06535366834150082, + "grad_norm": 3.0640927344766236, + "learning_rate": 1.9846140328681363e-05, + "loss": 1.3075, + "step": 15810 + }, + { + "epoch": 0.0653950052601229, + "grad_norm": 3.6774626803339285, + "learning_rate": 1.9845911022303973e-05, + "loss": 1.3647, + "step": 15820 + }, + { + "epoch": 0.06543634217874497, + "grad_norm": 2.9365431211187065, + "learning_rate": 1.9845681546505915e-05, + "loss": 1.3086, + "step": 15830 + }, + { + "epoch": 0.06547767909736704, + "grad_norm": 4.305431264385432, + "learning_rate": 1.9845451901291145e-05, + "loss": 1.3348, + "step": 15840 + }, + { + "epoch": 0.06551901601598913, + "grad_norm": 3.032533703820296, + "learning_rate": 1.9845222086663615e-05, + "loss": 1.3527, + "step": 15850 + }, + { + "epoch": 0.0655603529346112, + "grad_norm": 3.3387798006802782, + "learning_rate": 1.9844992102627273e-05, + "loss": 1.3249, + "step": 15860 + }, + { + "epoch": 0.06560168985323327, + "grad_norm": 3.3127539852292363, + "learning_rate": 1.9844761949186083e-05, + "loss": 1.3323, + "step": 15870 + }, + { + "epoch": 0.06564302677185535, + "grad_norm": 3.4862694527591307, + "learning_rate": 1.9844531626344003e-05, + "loss": 1.3224, + "step": 15880 + }, + { + "epoch": 0.06568436369047742, + "grad_norm": 3.215035735991411, + "learning_rate": 1.9844301134104996e-05, + "loss": 1.349, + "step": 15890 + }, + { + "epoch": 0.06572570060909949, + "grad_norm": 3.331575129362213, + "learning_rate": 1.9844070472473026e-05, + "loss": 1.3297, + "step": 15900 + }, + { + "epoch": 0.06576703752772158, + "grad_norm": 3.008247289759144, + "learning_rate": 1.9843839641452062e-05, + "loss": 1.368, + "step": 15910 + }, + { + "epoch": 0.06580837444634365, + "grad_norm": 3.314210218559566, + "learning_rate": 1.984360864104608e-05, + "loss": 1.3318, + "step": 15920 + }, + { + "epoch": 0.06584971136496572, + "grad_norm": 4.460695903393647, + "learning_rate": 1.9843377471259056e-05, + "loss": 1.363, + "step": 15930 + }, + { + "epoch": 0.0658910482835878, + "grad_norm": 3.591421453277731, + "learning_rate": 1.984314613209496e-05, + "loss": 1.3428, + "step": 15940 + }, + { + "epoch": 0.06593238520220987, + "grad_norm": 3.708262991759124, + "learning_rate": 1.984291462355778e-05, + "loss": 1.3564, + "step": 15950 + }, + { + "epoch": 0.06597372212083194, + "grad_norm": 3.1432924399561903, + "learning_rate": 1.9842682945651495e-05, + "loss": 1.3455, + "step": 15960 + }, + { + "epoch": 0.06601505903945402, + "grad_norm": 3.1161735074970216, + "learning_rate": 1.9842451098380096e-05, + "loss": 1.3514, + "step": 15970 + }, + { + "epoch": 0.0660563959580761, + "grad_norm": 2.8632026794139875, + "learning_rate": 1.984221908174757e-05, + "loss": 1.3446, + "step": 15980 + }, + { + "epoch": 0.06609773287669818, + "grad_norm": 3.934735081002441, + "learning_rate": 1.9841986895757907e-05, + "loss": 1.3298, + "step": 15990 + }, + { + "epoch": 0.06613906979532025, + "grad_norm": 4.324584533198296, + "learning_rate": 1.9841754540415102e-05, + "loss": 1.3537, + "step": 16000 + }, + { + "epoch": 0.06618040671394232, + "grad_norm": 3.6119231745645166, + "learning_rate": 1.9841522015723164e-05, + "loss": 1.3343, + "step": 16010 + }, + { + "epoch": 0.0662217436325644, + "grad_norm": 3.1193540137010887, + "learning_rate": 1.984128932168608e-05, + "loss": 1.3752, + "step": 16020 + }, + { + "epoch": 0.06626308055118647, + "grad_norm": 3.1351192552066762, + "learning_rate": 1.984105645830786e-05, + "loss": 1.3289, + "step": 16030 + }, + { + "epoch": 0.06630441746980854, + "grad_norm": 3.5610679476787737, + "learning_rate": 1.9840823425592512e-05, + "loss": 1.3543, + "step": 16040 + }, + { + "epoch": 0.06634575438843063, + "grad_norm": 3.5677181048737943, + "learning_rate": 1.984059022354404e-05, + "loss": 1.3483, + "step": 16050 + }, + { + "epoch": 0.0663870913070527, + "grad_norm": 3.6239317747912385, + "learning_rate": 1.9840356852166465e-05, + "loss": 1.3511, + "step": 16060 + }, + { + "epoch": 0.06642842822567477, + "grad_norm": 3.7952366513012636, + "learning_rate": 1.9840123311463803e-05, + "loss": 1.33, + "step": 16070 + }, + { + "epoch": 0.06646976514429685, + "grad_norm": 3.345456868567261, + "learning_rate": 1.9839889601440064e-05, + "loss": 1.3226, + "step": 16080 + }, + { + "epoch": 0.06651110206291892, + "grad_norm": 3.0646963347861194, + "learning_rate": 1.9839655722099277e-05, + "loss": 1.3142, + "step": 16090 + }, + { + "epoch": 0.06655243898154099, + "grad_norm": 3.1796454557601956, + "learning_rate": 1.9839421673445457e-05, + "loss": 1.3363, + "step": 16100 + }, + { + "epoch": 0.06659377590016308, + "grad_norm": 2.7604488860103453, + "learning_rate": 1.9839187455482646e-05, + "loss": 1.3453, + "step": 16110 + }, + { + "epoch": 0.06663511281878515, + "grad_norm": 3.1473856534090885, + "learning_rate": 1.9838953068214862e-05, + "loss": 1.3146, + "step": 16120 + }, + { + "epoch": 0.06667644973740723, + "grad_norm": 3.323185903362043, + "learning_rate": 1.983871851164614e-05, + "loss": 1.3013, + "step": 16130 + }, + { + "epoch": 0.0667177866560293, + "grad_norm": 3.0139283592638315, + "learning_rate": 1.9838483785780522e-05, + "loss": 1.3761, + "step": 16140 + }, + { + "epoch": 0.06675912357465137, + "grad_norm": 3.2332451066710535, + "learning_rate": 1.9838248890622043e-05, + "loss": 1.341, + "step": 16150 + }, + { + "epoch": 0.06680046049327346, + "grad_norm": 3.0517814224449826, + "learning_rate": 1.9838013826174745e-05, + "loss": 1.3003, + "step": 16160 + }, + { + "epoch": 0.06684179741189553, + "grad_norm": 4.164899927708623, + "learning_rate": 1.983777859244267e-05, + "loss": 1.3247, + "step": 16170 + }, + { + "epoch": 0.0668831343305176, + "grad_norm": 2.992166731140292, + "learning_rate": 1.983754318942987e-05, + "loss": 1.2762, + "step": 16180 + }, + { + "epoch": 0.06692447124913968, + "grad_norm": 3.774796705592006, + "learning_rate": 1.98373076171404e-05, + "loss": 1.3652, + "step": 16190 + }, + { + "epoch": 0.06696580816776175, + "grad_norm": 3.6727461962589296, + "learning_rate": 1.98370718755783e-05, + "loss": 1.3433, + "step": 16200 + }, + { + "epoch": 0.06700714508638382, + "grad_norm": 2.759830849179559, + "learning_rate": 1.983683596474764e-05, + "loss": 1.2917, + "step": 16210 + }, + { + "epoch": 0.0670484820050059, + "grad_norm": 4.142227423223113, + "learning_rate": 1.983659988465247e-05, + "loss": 1.3287, + "step": 16220 + }, + { + "epoch": 0.06708981892362798, + "grad_norm": 3.1361138389830305, + "learning_rate": 1.9836363635296856e-05, + "loss": 1.3526, + "step": 16230 + }, + { + "epoch": 0.06713115584225005, + "grad_norm": 4.684718067129147, + "learning_rate": 1.9836127216684864e-05, + "loss": 1.3398, + "step": 16240 + }, + { + "epoch": 0.06717249276087213, + "grad_norm": 3.9305271840952325, + "learning_rate": 1.9835890628820564e-05, + "loss": 1.3061, + "step": 16250 + }, + { + "epoch": 0.0672138296794942, + "grad_norm": 3.6945496877183754, + "learning_rate": 1.983565387170802e-05, + "loss": 1.3046, + "step": 16260 + }, + { + "epoch": 0.06725516659811627, + "grad_norm": 2.913410222912495, + "learning_rate": 1.983541694535131e-05, + "loss": 1.3439, + "step": 16270 + }, + { + "epoch": 0.06729650351673835, + "grad_norm": 3.4482083464660143, + "learning_rate": 1.9835179849754517e-05, + "loss": 1.3282, + "step": 16280 + }, + { + "epoch": 0.06733784043536042, + "grad_norm": 4.311927057182653, + "learning_rate": 1.983494258492171e-05, + "loss": 1.3156, + "step": 16290 + }, + { + "epoch": 0.06737917735398251, + "grad_norm": 3.5718375129378015, + "learning_rate": 1.9834705150856973e-05, + "loss": 1.3088, + "step": 16300 + }, + { + "epoch": 0.06742051427260458, + "grad_norm": 3.6069557479034704, + "learning_rate": 1.98344675475644e-05, + "loss": 1.3259, + "step": 16310 + }, + { + "epoch": 0.06746185119122665, + "grad_norm": 3.2551197753818393, + "learning_rate": 1.9834229775048076e-05, + "loss": 1.3389, + "step": 16320 + }, + { + "epoch": 0.06750318810984873, + "grad_norm": 3.4034457092581536, + "learning_rate": 1.9833991833312086e-05, + "loss": 1.3396, + "step": 16330 + }, + { + "epoch": 0.0675445250284708, + "grad_norm": 2.9661916406960858, + "learning_rate": 1.9833753722360534e-05, + "loss": 1.2989, + "step": 16340 + }, + { + "epoch": 0.06758586194709287, + "grad_norm": 5.279255386120152, + "learning_rate": 1.983351544219751e-05, + "loss": 1.3283, + "step": 16350 + }, + { + "epoch": 0.06762719886571496, + "grad_norm": 3.8221920564477028, + "learning_rate": 1.9833276992827117e-05, + "loss": 1.2918, + "step": 16360 + }, + { + "epoch": 0.06766853578433703, + "grad_norm": 3.2255728423614163, + "learning_rate": 1.9833038374253456e-05, + "loss": 1.327, + "step": 16370 + }, + { + "epoch": 0.0677098727029591, + "grad_norm": 3.2355832791038464, + "learning_rate": 1.9832799586480637e-05, + "loss": 1.3204, + "step": 16380 + }, + { + "epoch": 0.06775120962158118, + "grad_norm": 3.2467585154210155, + "learning_rate": 1.9832560629512767e-05, + "loss": 1.3338, + "step": 16390 + }, + { + "epoch": 0.06779254654020325, + "grad_norm": 4.988429472619141, + "learning_rate": 1.9832321503353954e-05, + "loss": 1.3876, + "step": 16400 + }, + { + "epoch": 0.06783388345882532, + "grad_norm": 3.446832489576954, + "learning_rate": 1.9832082208008317e-05, + "loss": 1.3233, + "step": 16410 + }, + { + "epoch": 0.0678752203774474, + "grad_norm": 3.605129460226094, + "learning_rate": 1.9831842743479975e-05, + "loss": 1.3386, + "step": 16420 + }, + { + "epoch": 0.06791655729606948, + "grad_norm": 4.275587631874799, + "learning_rate": 1.9831603109773044e-05, + "loss": 1.3613, + "step": 16430 + }, + { + "epoch": 0.06795789421469156, + "grad_norm": 4.1009269376864665, + "learning_rate": 1.983136330689165e-05, + "loss": 1.3662, + "step": 16440 + }, + { + "epoch": 0.06799923113331363, + "grad_norm": 2.9883770968491565, + "learning_rate": 1.983112333483992e-05, + "loss": 1.3304, + "step": 16450 + }, + { + "epoch": 0.0680405680519357, + "grad_norm": 2.9575768154010085, + "learning_rate": 1.983088319362198e-05, + "loss": 1.3075, + "step": 16460 + }, + { + "epoch": 0.06808190497055779, + "grad_norm": 3.1295830557654147, + "learning_rate": 1.9830642883241967e-05, + "loss": 1.3311, + "step": 16470 + }, + { + "epoch": 0.06812324188917986, + "grad_norm": 3.6574005023751774, + "learning_rate": 1.9830402403704008e-05, + "loss": 1.2925, + "step": 16480 + }, + { + "epoch": 0.06816457880780193, + "grad_norm": 3.080968609055009, + "learning_rate": 1.9830161755012255e-05, + "loss": 1.3156, + "step": 16490 + }, + { + "epoch": 0.06820591572642401, + "grad_norm": 3.0330618488937553, + "learning_rate": 1.9829920937170835e-05, + "loss": 1.3314, + "step": 16500 + }, + { + "epoch": 0.06824725264504608, + "grad_norm": 3.3116009543610665, + "learning_rate": 1.9829679950183895e-05, + "loss": 1.3034, + "step": 16510 + }, + { + "epoch": 0.06828858956366815, + "grad_norm": 3.783202270094679, + "learning_rate": 1.9829438794055584e-05, + "loss": 1.3313, + "step": 16520 + }, + { + "epoch": 0.06832992648229023, + "grad_norm": 3.969686559597743, + "learning_rate": 1.9829197468790054e-05, + "loss": 1.2911, + "step": 16530 + }, + { + "epoch": 0.0683712634009123, + "grad_norm": 3.580580495241376, + "learning_rate": 1.9828955974391455e-05, + "loss": 1.2912, + "step": 16540 + }, + { + "epoch": 0.06841260031953438, + "grad_norm": 2.9992747255679033, + "learning_rate": 1.982871431086394e-05, + "loss": 1.302, + "step": 16550 + }, + { + "epoch": 0.06845393723815646, + "grad_norm": 3.5773285600866256, + "learning_rate": 1.9828472478211673e-05, + "loss": 1.334, + "step": 16560 + }, + { + "epoch": 0.06849527415677853, + "grad_norm": 3.647217700504523, + "learning_rate": 1.982823047643881e-05, + "loss": 1.3137, + "step": 16570 + }, + { + "epoch": 0.0685366110754006, + "grad_norm": 3.7929582820880428, + "learning_rate": 1.982798830554952e-05, + "loss": 1.3598, + "step": 16580 + }, + { + "epoch": 0.06857794799402268, + "grad_norm": 3.3921261114157817, + "learning_rate": 1.982774596554796e-05, + "loss": 1.3159, + "step": 16590 + }, + { + "epoch": 0.06861928491264475, + "grad_norm": 3.948788035775027, + "learning_rate": 1.9827503456438314e-05, + "loss": 1.3487, + "step": 16600 + }, + { + "epoch": 0.06866062183126684, + "grad_norm": 3.4434803120058595, + "learning_rate": 1.9827260778224744e-05, + "loss": 1.3611, + "step": 16610 + }, + { + "epoch": 0.06870195874988891, + "grad_norm": 3.345318165402726, + "learning_rate": 1.9827017930911433e-05, + "loss": 1.3214, + "step": 16620 + }, + { + "epoch": 0.06874329566851098, + "grad_norm": 3.3238354847739675, + "learning_rate": 1.9826774914502554e-05, + "loss": 1.3415, + "step": 16630 + }, + { + "epoch": 0.06878463258713306, + "grad_norm": 3.3341319261133773, + "learning_rate": 1.9826531729002293e-05, + "loss": 1.2814, + "step": 16640 + }, + { + "epoch": 0.06882596950575513, + "grad_norm": 3.2557833923669937, + "learning_rate": 1.982628837441483e-05, + "loss": 1.3692, + "step": 16650 + }, + { + "epoch": 0.0688673064243772, + "grad_norm": 2.7035167555145545, + "learning_rate": 1.9826044850744358e-05, + "loss": 1.3045, + "step": 16660 + }, + { + "epoch": 0.06890864334299929, + "grad_norm": 3.2563919103242167, + "learning_rate": 1.9825801157995065e-05, + "loss": 1.2807, + "step": 16670 + }, + { + "epoch": 0.06894998026162136, + "grad_norm": 3.369904458766467, + "learning_rate": 1.9825557296171143e-05, + "loss": 1.2897, + "step": 16680 + }, + { + "epoch": 0.06899131718024343, + "grad_norm": 3.064586151511127, + "learning_rate": 1.982531326527679e-05, + "loss": 1.3332, + "step": 16690 + }, + { + "epoch": 0.06903265409886551, + "grad_norm": 3.0263447019462753, + "learning_rate": 1.9825069065316204e-05, + "loss": 1.2825, + "step": 16700 + }, + { + "epoch": 0.06907399101748758, + "grad_norm": 4.065735171694178, + "learning_rate": 1.9824824696293584e-05, + "loss": 1.2698, + "step": 16710 + }, + { + "epoch": 0.06911532793610965, + "grad_norm": 3.4683529768298262, + "learning_rate": 1.9824580158213142e-05, + "loss": 1.3135, + "step": 16720 + }, + { + "epoch": 0.06915666485473174, + "grad_norm": 2.955425396507506, + "learning_rate": 1.9824335451079083e-05, + "loss": 1.3571, + "step": 16730 + }, + { + "epoch": 0.0691980017733538, + "grad_norm": 3.099433278967918, + "learning_rate": 1.982409057489561e-05, + "loss": 1.3273, + "step": 16740 + }, + { + "epoch": 0.06923933869197589, + "grad_norm": 2.8197838698402093, + "learning_rate": 1.982384552966695e-05, + "loss": 1.3158, + "step": 16750 + }, + { + "epoch": 0.06928067561059796, + "grad_norm": 3.422610473722703, + "learning_rate": 1.982360031539731e-05, + "loss": 1.3072, + "step": 16760 + }, + { + "epoch": 0.06932201252922003, + "grad_norm": 3.654024548878873, + "learning_rate": 1.9823354932090913e-05, + "loss": 1.3174, + "step": 16770 + }, + { + "epoch": 0.06936334944784212, + "grad_norm": 2.69963783380391, + "learning_rate": 1.982310937975198e-05, + "loss": 1.301, + "step": 16780 + }, + { + "epoch": 0.06940468636646419, + "grad_norm": 3.119208859314543, + "learning_rate": 1.9822863658384736e-05, + "loss": 1.2915, + "step": 16790 + }, + { + "epoch": 0.06944602328508626, + "grad_norm": 3.983735003830943, + "learning_rate": 1.982261776799341e-05, + "loss": 1.3029, + "step": 16800 + }, + { + "epoch": 0.06948736020370834, + "grad_norm": 2.8833424065520292, + "learning_rate": 1.9822371708582236e-05, + "loss": 1.309, + "step": 16810 + }, + { + "epoch": 0.06952869712233041, + "grad_norm": 3.5651047449269138, + "learning_rate": 1.9822125480155442e-05, + "loss": 1.3408, + "step": 16820 + }, + { + "epoch": 0.06957003404095248, + "grad_norm": 2.9084817767691344, + "learning_rate": 1.982187908271727e-05, + "loss": 1.3217, + "step": 16830 + }, + { + "epoch": 0.06961137095957456, + "grad_norm": 3.38644936958733, + "learning_rate": 1.982163251627196e-05, + "loss": 1.3094, + "step": 16840 + }, + { + "epoch": 0.06965270787819663, + "grad_norm": 3.7040548837026384, + "learning_rate": 1.9821385780823748e-05, + "loss": 1.2973, + "step": 16850 + }, + { + "epoch": 0.0696940447968187, + "grad_norm": 3.1936456120357435, + "learning_rate": 1.982113887637689e-05, + "loss": 1.3362, + "step": 16860 + }, + { + "epoch": 0.06973538171544079, + "grad_norm": 3.676850292318552, + "learning_rate": 1.9820891802935623e-05, + "loss": 1.2947, + "step": 16870 + }, + { + "epoch": 0.06977671863406286, + "grad_norm": 3.484703646839946, + "learning_rate": 1.9820644560504207e-05, + "loss": 1.2488, + "step": 16880 + }, + { + "epoch": 0.06981805555268493, + "grad_norm": 3.4336203726483654, + "learning_rate": 1.9820397149086892e-05, + "loss": 1.3372, + "step": 16890 + }, + { + "epoch": 0.06985939247130701, + "grad_norm": 3.4895422854739437, + "learning_rate": 1.9820149568687937e-05, + "loss": 1.3434, + "step": 16900 + }, + { + "epoch": 0.06990072938992908, + "grad_norm": 3.5294671703855545, + "learning_rate": 1.98199018193116e-05, + "loss": 1.3299, + "step": 16910 + }, + { + "epoch": 0.06994206630855117, + "grad_norm": 4.136918538506862, + "learning_rate": 1.9819653900962153e-05, + "loss": 1.3082, + "step": 16920 + }, + { + "epoch": 0.06998340322717324, + "grad_norm": 3.1337022113010975, + "learning_rate": 1.981940581364385e-05, + "loss": 1.3131, + "step": 16930 + }, + { + "epoch": 0.07002474014579531, + "grad_norm": 3.3461906954820115, + "learning_rate": 1.9819157557360965e-05, + "loss": 1.3533, + "step": 16940 + }, + { + "epoch": 0.07006607706441739, + "grad_norm": 2.945652884357033, + "learning_rate": 1.981890913211777e-05, + "loss": 1.333, + "step": 16950 + }, + { + "epoch": 0.07010741398303946, + "grad_norm": 3.8269320260288375, + "learning_rate": 1.981866053791854e-05, + "loss": 1.3021, + "step": 16960 + }, + { + "epoch": 0.07014875090166153, + "grad_norm": 3.373539442185096, + "learning_rate": 1.9818411774767555e-05, + "loss": 1.3621, + "step": 16970 + }, + { + "epoch": 0.07019008782028362, + "grad_norm": 3.1789158671874724, + "learning_rate": 1.9818162842669087e-05, + "loss": 1.3357, + "step": 16980 + }, + { + "epoch": 0.07023142473890569, + "grad_norm": 3.379334004765994, + "learning_rate": 1.981791374162743e-05, + "loss": 1.3328, + "step": 16990 + }, + { + "epoch": 0.07027276165752776, + "grad_norm": 3.3263838295875794, + "learning_rate": 1.981766447164686e-05, + "loss": 1.3177, + "step": 17000 + }, + { + "epoch": 0.07031409857614984, + "grad_norm": 3.4846748393089744, + "learning_rate": 1.9817415032731676e-05, + "loss": 1.3088, + "step": 17010 + }, + { + "epoch": 0.07035543549477191, + "grad_norm": 3.1166768241545526, + "learning_rate": 1.9817165424886165e-05, + "loss": 1.3168, + "step": 17020 + }, + { + "epoch": 0.07039677241339398, + "grad_norm": 3.880062668262267, + "learning_rate": 1.9816915648114623e-05, + "loss": 1.3071, + "step": 17030 + }, + { + "epoch": 0.07043810933201607, + "grad_norm": 3.511807529042466, + "learning_rate": 1.9816665702421344e-05, + "loss": 1.3409, + "step": 17040 + }, + { + "epoch": 0.07047944625063814, + "grad_norm": 3.1993990782018256, + "learning_rate": 1.9816415587810636e-05, + "loss": 1.2918, + "step": 17050 + }, + { + "epoch": 0.07052078316926022, + "grad_norm": 2.9930648253290943, + "learning_rate": 1.98161653042868e-05, + "loss": 1.2926, + "step": 17060 + }, + { + "epoch": 0.07056212008788229, + "grad_norm": 3.0399828063698116, + "learning_rate": 1.981591485185414e-05, + "loss": 1.3183, + "step": 17070 + }, + { + "epoch": 0.07060345700650436, + "grad_norm": 3.233401985951665, + "learning_rate": 1.981566423051697e-05, + "loss": 1.3262, + "step": 17080 + }, + { + "epoch": 0.07064479392512645, + "grad_norm": 2.891852783093535, + "learning_rate": 1.9815413440279597e-05, + "loss": 1.2882, + "step": 17090 + }, + { + "epoch": 0.07068613084374852, + "grad_norm": 2.861582978848217, + "learning_rate": 1.9815162481146345e-05, + "loss": 1.3417, + "step": 17100 + }, + { + "epoch": 0.07072746776237059, + "grad_norm": 3.657647746254238, + "learning_rate": 1.981491135312152e-05, + "loss": 1.3239, + "step": 17110 + }, + { + "epoch": 0.07076880468099267, + "grad_norm": 3.422931095645904, + "learning_rate": 1.9814660056209454e-05, + "loss": 1.3471, + "step": 17120 + }, + { + "epoch": 0.07081014159961474, + "grad_norm": 3.233056855771981, + "learning_rate": 1.9814408590414466e-05, + "loss": 1.342, + "step": 17130 + }, + { + "epoch": 0.07085147851823681, + "grad_norm": 2.8978002673834706, + "learning_rate": 1.9814156955740885e-05, + "loss": 1.3526, + "step": 17140 + }, + { + "epoch": 0.0708928154368589, + "grad_norm": 3.1044166179845227, + "learning_rate": 1.981390515219304e-05, + "loss": 1.3338, + "step": 17150 + }, + { + "epoch": 0.07093415235548096, + "grad_norm": 3.4443852171387963, + "learning_rate": 1.9813653179775263e-05, + "loss": 1.2798, + "step": 17160 + }, + { + "epoch": 0.07097548927410303, + "grad_norm": 3.4197392236572925, + "learning_rate": 1.9813401038491893e-05, + "loss": 1.3278, + "step": 17170 + }, + { + "epoch": 0.07101682619272512, + "grad_norm": 2.8656903185698868, + "learning_rate": 1.9813148728347263e-05, + "loss": 1.3044, + "step": 17180 + }, + { + "epoch": 0.07105816311134719, + "grad_norm": 3.4951853787951457, + "learning_rate": 1.981289624934572e-05, + "loss": 1.3731, + "step": 17190 + }, + { + "epoch": 0.07109950002996926, + "grad_norm": 3.0021246003594646, + "learning_rate": 1.981264360149161e-05, + "loss": 1.2953, + "step": 17200 + }, + { + "epoch": 0.07114083694859134, + "grad_norm": 3.078524102988392, + "learning_rate": 1.981239078478927e-05, + "loss": 1.3536, + "step": 17210 + }, + { + "epoch": 0.07118217386721341, + "grad_norm": 3.6935463858453805, + "learning_rate": 1.981213779924306e-05, + "loss": 1.3386, + "step": 17220 + }, + { + "epoch": 0.0712235107858355, + "grad_norm": 3.2508773899525036, + "learning_rate": 1.9811884644857332e-05, + "loss": 1.2929, + "step": 17230 + }, + { + "epoch": 0.07126484770445757, + "grad_norm": 4.11272659457257, + "learning_rate": 1.9811631321636438e-05, + "loss": 1.3376, + "step": 17240 + }, + { + "epoch": 0.07130618462307964, + "grad_norm": 3.3031386708100734, + "learning_rate": 1.9811377829584738e-05, + "loss": 1.3078, + "step": 17250 + }, + { + "epoch": 0.07134752154170172, + "grad_norm": 3.721567213304264, + "learning_rate": 1.9811124168706598e-05, + "loss": 1.3135, + "step": 17260 + }, + { + "epoch": 0.07138885846032379, + "grad_norm": 3.3077256345455712, + "learning_rate": 1.981087033900638e-05, + "loss": 1.2727, + "step": 17270 + }, + { + "epoch": 0.07143019537894586, + "grad_norm": 3.1365945025331716, + "learning_rate": 1.9810616340488448e-05, + "loss": 1.3082, + "step": 17280 + }, + { + "epoch": 0.07147153229756795, + "grad_norm": 3.0095750827782095, + "learning_rate": 1.981036217315718e-05, + "loss": 1.3422, + "step": 17290 + }, + { + "epoch": 0.07151286921619002, + "grad_norm": 3.879458456856346, + "learning_rate": 1.9810107837016943e-05, + "loss": 1.3444, + "step": 17300 + }, + { + "epoch": 0.07155420613481209, + "grad_norm": 3.5419514306232145, + "learning_rate": 1.9809853332072118e-05, + "loss": 1.3495, + "step": 17310 + }, + { + "epoch": 0.07159554305343417, + "grad_norm": 3.309317975252579, + "learning_rate": 1.9809598658327084e-05, + "loss": 1.3249, + "step": 17320 + }, + { + "epoch": 0.07163687997205624, + "grad_norm": 3.0972043541501484, + "learning_rate": 1.9809343815786218e-05, + "loss": 1.3289, + "step": 17330 + }, + { + "epoch": 0.07167821689067831, + "grad_norm": 3.278786808325931, + "learning_rate": 1.9809088804453913e-05, + "loss": 1.2998, + "step": 17340 + }, + { + "epoch": 0.0717195538093004, + "grad_norm": 4.238835899003122, + "learning_rate": 1.9808833624334547e-05, + "loss": 1.3311, + "step": 17350 + }, + { + "epoch": 0.07176089072792247, + "grad_norm": 2.92277331992324, + "learning_rate": 1.980857827543252e-05, + "loss": 1.2915, + "step": 17360 + }, + { + "epoch": 0.07180222764654455, + "grad_norm": 3.376501582542177, + "learning_rate": 1.9808322757752227e-05, + "loss": 1.3324, + "step": 17370 + }, + { + "epoch": 0.07184356456516662, + "grad_norm": 2.9710666955071754, + "learning_rate": 1.9808067071298057e-05, + "loss": 1.28, + "step": 17380 + }, + { + "epoch": 0.07188490148378869, + "grad_norm": 3.651318543567687, + "learning_rate": 1.9807811216074412e-05, + "loss": 1.2827, + "step": 17390 + }, + { + "epoch": 0.07192623840241077, + "grad_norm": 4.002270792783564, + "learning_rate": 1.9807555192085697e-05, + "loss": 1.2869, + "step": 17400 + }, + { + "epoch": 0.07196757532103285, + "grad_norm": 3.7494249341770027, + "learning_rate": 1.9807298999336316e-05, + "loss": 1.368, + "step": 17410 + }, + { + "epoch": 0.07200891223965492, + "grad_norm": 3.533387599209301, + "learning_rate": 1.9807042637830677e-05, + "loss": 1.3093, + "step": 17420 + }, + { + "epoch": 0.072050249158277, + "grad_norm": 3.3204434550557096, + "learning_rate": 1.980678610757319e-05, + "loss": 1.3557, + "step": 17430 + }, + { + "epoch": 0.07209158607689907, + "grad_norm": 3.1416247569905935, + "learning_rate": 1.9806529408568274e-05, + "loss": 1.2784, + "step": 17440 + }, + { + "epoch": 0.07213292299552114, + "grad_norm": 3.117796981445201, + "learning_rate": 1.980627254082034e-05, + "loss": 1.2882, + "step": 17450 + }, + { + "epoch": 0.07217425991414322, + "grad_norm": 3.13339952835746, + "learning_rate": 1.9806015504333812e-05, + "loss": 1.3144, + "step": 17460 + }, + { + "epoch": 0.0722155968327653, + "grad_norm": 3.7609938207052074, + "learning_rate": 1.9805758299113115e-05, + "loss": 1.329, + "step": 17470 + }, + { + "epoch": 0.07225693375138736, + "grad_norm": 3.1345386262954533, + "learning_rate": 1.980550092516267e-05, + "loss": 1.3049, + "step": 17480 + }, + { + "epoch": 0.07229827067000945, + "grad_norm": 3.7469777473015573, + "learning_rate": 1.98052433824869e-05, + "loss": 1.3096, + "step": 17490 + }, + { + "epoch": 0.07233960758863152, + "grad_norm": 2.9933582431757073, + "learning_rate": 1.9804985671090252e-05, + "loss": 1.3274, + "step": 17500 + }, + { + "epoch": 0.07238094450725359, + "grad_norm": 3.182564552885168, + "learning_rate": 1.980472779097715e-05, + "loss": 1.2891, + "step": 17510 + }, + { + "epoch": 0.07242228142587567, + "grad_norm": 3.8813847623233744, + "learning_rate": 1.9804469742152035e-05, + "loss": 1.2941, + "step": 17520 + }, + { + "epoch": 0.07246361834449774, + "grad_norm": 3.388309470760254, + "learning_rate": 1.9804211524619345e-05, + "loss": 1.3129, + "step": 17530 + }, + { + "epoch": 0.07250495526311983, + "grad_norm": 3.0370861904609305, + "learning_rate": 1.9803953138383523e-05, + "loss": 1.2816, + "step": 17540 + }, + { + "epoch": 0.0725462921817419, + "grad_norm": 3.255739551897295, + "learning_rate": 1.980369458344902e-05, + "loss": 1.3048, + "step": 17550 + }, + { + "epoch": 0.07258762910036397, + "grad_norm": 3.2197652995956325, + "learning_rate": 1.9803435859820278e-05, + "loss": 1.3401, + "step": 17560 + }, + { + "epoch": 0.07262896601898605, + "grad_norm": 3.144102345108867, + "learning_rate": 1.9803176967501752e-05, + "loss": 1.3093, + "step": 17570 + }, + { + "epoch": 0.07267030293760812, + "grad_norm": 3.255959441623639, + "learning_rate": 1.98029179064979e-05, + "loss": 1.3107, + "step": 17580 + }, + { + "epoch": 0.07271163985623019, + "grad_norm": 3.300836279803978, + "learning_rate": 1.9802658676813177e-05, + "loss": 1.2793, + "step": 17590 + }, + { + "epoch": 0.07275297677485228, + "grad_norm": 3.1982961633057307, + "learning_rate": 1.980239927845204e-05, + "loss": 1.293, + "step": 17600 + }, + { + "epoch": 0.07279431369347435, + "grad_norm": 2.7735162621143066, + "learning_rate": 1.980213971141896e-05, + "loss": 1.2829, + "step": 17610 + }, + { + "epoch": 0.07283565061209642, + "grad_norm": 3.1127853241409524, + "learning_rate": 1.9801879975718397e-05, + "loss": 1.3038, + "step": 17620 + }, + { + "epoch": 0.0728769875307185, + "grad_norm": 4.4882400102517, + "learning_rate": 1.9801620071354823e-05, + "loss": 1.3252, + "step": 17630 + }, + { + "epoch": 0.07291832444934057, + "grad_norm": 3.0527302986554474, + "learning_rate": 1.980135999833271e-05, + "loss": 1.3324, + "step": 17640 + }, + { + "epoch": 0.07295966136796264, + "grad_norm": 2.6769380820391637, + "learning_rate": 1.9801099756656534e-05, + "loss": 1.3472, + "step": 17650 + }, + { + "epoch": 0.07300099828658473, + "grad_norm": 2.8240010897597796, + "learning_rate": 1.980083934633077e-05, + "loss": 1.3072, + "step": 17660 + }, + { + "epoch": 0.0730423352052068, + "grad_norm": 3.0678813341238387, + "learning_rate": 1.9800578767359905e-05, + "loss": 1.3385, + "step": 17670 + }, + { + "epoch": 0.07308367212382888, + "grad_norm": 3.9303518801684056, + "learning_rate": 1.9800318019748414e-05, + "loss": 1.3024, + "step": 17680 + }, + { + "epoch": 0.07312500904245095, + "grad_norm": 3.4138877073872367, + "learning_rate": 1.980005710350079e-05, + "loss": 1.3219, + "step": 17690 + }, + { + "epoch": 0.07316634596107302, + "grad_norm": 2.9612831417650574, + "learning_rate": 1.9799796018621523e-05, + "loss": 1.2972, + "step": 17700 + }, + { + "epoch": 0.0732076828796951, + "grad_norm": 3.5999340523181167, + "learning_rate": 1.9799534765115106e-05, + "loss": 1.2879, + "step": 17710 + }, + { + "epoch": 0.07324901979831717, + "grad_norm": 3.243493407552293, + "learning_rate": 1.9799273342986027e-05, + "loss": 1.3312, + "step": 17720 + }, + { + "epoch": 0.07329035671693925, + "grad_norm": 3.1251098781902678, + "learning_rate": 1.979901175223879e-05, + "loss": 1.283, + "step": 17730 + }, + { + "epoch": 0.07333169363556133, + "grad_norm": 3.016302925303861, + "learning_rate": 1.97987499928779e-05, + "loss": 1.2653, + "step": 17740 + }, + { + "epoch": 0.0733730305541834, + "grad_norm": 3.0676270506907373, + "learning_rate": 1.9798488064907854e-05, + "loss": 1.3463, + "step": 17750 + }, + { + "epoch": 0.07341436747280547, + "grad_norm": 3.33093266064901, + "learning_rate": 1.9798225968333162e-05, + "loss": 1.2925, + "step": 17760 + }, + { + "epoch": 0.07345570439142755, + "grad_norm": 2.9945442082263707, + "learning_rate": 1.9797963703158338e-05, + "loss": 1.3073, + "step": 17770 + }, + { + "epoch": 0.07349704131004962, + "grad_norm": 2.7596863662002145, + "learning_rate": 1.9797701269387886e-05, + "loss": 1.3014, + "step": 17780 + }, + { + "epoch": 0.0735383782286717, + "grad_norm": 3.533092084537244, + "learning_rate": 1.979743866702633e-05, + "loss": 1.2791, + "step": 17790 + }, + { + "epoch": 0.07357971514729378, + "grad_norm": 4.416928923354257, + "learning_rate": 1.9797175896078183e-05, + "loss": 1.3187, + "step": 17800 + }, + { + "epoch": 0.07362105206591585, + "grad_norm": 3.4889626149350983, + "learning_rate": 1.9796912956547968e-05, + "loss": 1.3279, + "step": 17810 + }, + { + "epoch": 0.07366238898453792, + "grad_norm": 2.8612849638284636, + "learning_rate": 1.979664984844021e-05, + "loss": 1.286, + "step": 17820 + }, + { + "epoch": 0.07370372590316, + "grad_norm": 3.48443371509397, + "learning_rate": 1.9796386571759437e-05, + "loss": 1.3143, + "step": 17830 + }, + { + "epoch": 0.07374506282178207, + "grad_norm": 3.744181699697567, + "learning_rate": 1.979612312651018e-05, + "loss": 1.3133, + "step": 17840 + }, + { + "epoch": 0.07378639974040416, + "grad_norm": 3.0686519126186056, + "learning_rate": 1.9795859512696974e-05, + "loss": 1.3136, + "step": 17850 + }, + { + "epoch": 0.07382773665902623, + "grad_norm": 3.067495520118401, + "learning_rate": 1.9795595730324347e-05, + "loss": 1.3026, + "step": 17860 + }, + { + "epoch": 0.0738690735776483, + "grad_norm": 3.39693600568867, + "learning_rate": 1.9795331779396846e-05, + "loss": 1.3045, + "step": 17870 + }, + { + "epoch": 0.07391041049627038, + "grad_norm": 3.593857826641827, + "learning_rate": 1.9795067659919008e-05, + "loss": 1.3278, + "step": 17880 + }, + { + "epoch": 0.07395174741489245, + "grad_norm": 3.5924096708123985, + "learning_rate": 1.9794803371895383e-05, + "loss": 1.2578, + "step": 17890 + }, + { + "epoch": 0.07399308433351452, + "grad_norm": 3.1548306467737546, + "learning_rate": 1.9794538915330514e-05, + "loss": 1.3145, + "step": 17900 + }, + { + "epoch": 0.0740344212521366, + "grad_norm": 3.3874308529909736, + "learning_rate": 1.979427429022895e-05, + "loss": 1.2635, + "step": 17910 + }, + { + "epoch": 0.07407575817075868, + "grad_norm": 3.574121943476516, + "learning_rate": 1.979400949659525e-05, + "loss": 1.3333, + "step": 17920 + }, + { + "epoch": 0.07411709508938075, + "grad_norm": 3.277207731814223, + "learning_rate": 1.9793744534433968e-05, + "loss": 1.3438, + "step": 17930 + }, + { + "epoch": 0.07415843200800283, + "grad_norm": 3.8968392073501152, + "learning_rate": 1.979347940374966e-05, + "loss": 1.313, + "step": 17940 + }, + { + "epoch": 0.0741997689266249, + "grad_norm": 3.083444751622717, + "learning_rate": 1.9793214104546895e-05, + "loss": 1.3146, + "step": 17950 + }, + { + "epoch": 0.07424110584524697, + "grad_norm": 3.2007909625237208, + "learning_rate": 1.9792948636830235e-05, + "loss": 1.2767, + "step": 17960 + }, + { + "epoch": 0.07428244276386906, + "grad_norm": 3.475285388439305, + "learning_rate": 1.979268300060424e-05, + "loss": 1.3113, + "step": 17970 + }, + { + "epoch": 0.07432377968249113, + "grad_norm": 3.0728058276568064, + "learning_rate": 1.9792417195873496e-05, + "loss": 1.2625, + "step": 17980 + }, + { + "epoch": 0.07436511660111321, + "grad_norm": 3.1462512846060595, + "learning_rate": 1.9792151222642565e-05, + "loss": 1.2669, + "step": 17990 + }, + { + "epoch": 0.07440645351973528, + "grad_norm": 3.451097468244895, + "learning_rate": 1.9791885080916026e-05, + "loss": 1.3536, + "step": 18000 + }, + { + "epoch": 0.07444779043835735, + "grad_norm": 2.9123675277858805, + "learning_rate": 1.979161877069846e-05, + "loss": 1.2859, + "step": 18010 + }, + { + "epoch": 0.07448912735697943, + "grad_norm": 3.6048473916322905, + "learning_rate": 1.9791352291994453e-05, + "loss": 1.2868, + "step": 18020 + }, + { + "epoch": 0.0745304642756015, + "grad_norm": 2.877236521609129, + "learning_rate": 1.9791085644808588e-05, + "loss": 1.3201, + "step": 18030 + }, + { + "epoch": 0.07457180119422357, + "grad_norm": 3.346518920497629, + "learning_rate": 1.9790818829145447e-05, + "loss": 1.2914, + "step": 18040 + }, + { + "epoch": 0.07461313811284566, + "grad_norm": 3.5346578512252416, + "learning_rate": 1.979055184500963e-05, + "loss": 1.2668, + "step": 18050 + }, + { + "epoch": 0.07465447503146773, + "grad_norm": 3.044418176627761, + "learning_rate": 1.9790284692405723e-05, + "loss": 1.2722, + "step": 18060 + }, + { + "epoch": 0.0746958119500898, + "grad_norm": 3.4743539555195206, + "learning_rate": 1.979001737133833e-05, + "loss": 1.3093, + "step": 18070 + }, + { + "epoch": 0.07473714886871188, + "grad_norm": 3.152139034733342, + "learning_rate": 1.978974988181205e-05, + "loss": 1.3282, + "step": 18080 + }, + { + "epoch": 0.07477848578733395, + "grad_norm": 3.844127465592703, + "learning_rate": 1.978948222383148e-05, + "loss": 1.3, + "step": 18090 + }, + { + "epoch": 0.07481982270595602, + "grad_norm": 3.312821234732469, + "learning_rate": 1.9789214397401233e-05, + "loss": 1.3007, + "step": 18100 + }, + { + "epoch": 0.07486115962457811, + "grad_norm": 2.587430778686902, + "learning_rate": 1.978894640252591e-05, + "loss": 1.2712, + "step": 18110 + }, + { + "epoch": 0.07490249654320018, + "grad_norm": 3.6471232765625867, + "learning_rate": 1.978867823921013e-05, + "loss": 1.3255, + "step": 18120 + }, + { + "epoch": 0.07494383346182225, + "grad_norm": 3.4191203190660966, + "learning_rate": 1.9788409907458502e-05, + "loss": 1.2588, + "step": 18130 + }, + { + "epoch": 0.07498517038044433, + "grad_norm": 3.3115429776293466, + "learning_rate": 1.9788141407275643e-05, + "loss": 1.2923, + "step": 18140 + }, + { + "epoch": 0.0750265072990664, + "grad_norm": 3.360103375582472, + "learning_rate": 1.9787872738666182e-05, + "loss": 1.3274, + "step": 18150 + }, + { + "epoch": 0.07506784421768849, + "grad_norm": 4.4310427169860205, + "learning_rate": 1.978760390163473e-05, + "loss": 1.3237, + "step": 18160 + }, + { + "epoch": 0.07510918113631056, + "grad_norm": 2.794577806503562, + "learning_rate": 1.9787334896185916e-05, + "loss": 1.3095, + "step": 18170 + }, + { + "epoch": 0.07515051805493263, + "grad_norm": 3.3585888834525384, + "learning_rate": 1.9787065722324374e-05, + "loss": 1.3199, + "step": 18180 + }, + { + "epoch": 0.07519185497355471, + "grad_norm": 3.060067450654041, + "learning_rate": 1.9786796380054733e-05, + "loss": 1.2532, + "step": 18190 + }, + { + "epoch": 0.07523319189217678, + "grad_norm": 3.124345955834219, + "learning_rate": 1.978652686938163e-05, + "loss": 1.2875, + "step": 18200 + }, + { + "epoch": 0.07527452881079885, + "grad_norm": 3.6658880497970254, + "learning_rate": 1.9786257190309695e-05, + "loss": 1.301, + "step": 18210 + }, + { + "epoch": 0.07531586572942094, + "grad_norm": 3.7099075204316088, + "learning_rate": 1.9785987342843573e-05, + "loss": 1.343, + "step": 18220 + }, + { + "epoch": 0.075357202648043, + "grad_norm": 3.26000011118403, + "learning_rate": 1.9785717326987914e-05, + "loss": 1.2782, + "step": 18230 + }, + { + "epoch": 0.07539853956666508, + "grad_norm": 3.133092069419479, + "learning_rate": 1.978544714274735e-05, + "loss": 1.3026, + "step": 18240 + }, + { + "epoch": 0.07543987648528716, + "grad_norm": 3.1564032343316453, + "learning_rate": 1.9785176790126542e-05, + "loss": 1.3207, + "step": 18250 + }, + { + "epoch": 0.07548121340390923, + "grad_norm": 3.284115672721678, + "learning_rate": 1.9784906269130137e-05, + "loss": 1.3117, + "step": 18260 + }, + { + "epoch": 0.0755225503225313, + "grad_norm": 3.1339318140995167, + "learning_rate": 1.9784635579762793e-05, + "loss": 1.305, + "step": 18270 + }, + { + "epoch": 0.07556388724115339, + "grad_norm": 4.379220492841367, + "learning_rate": 1.9784364722029165e-05, + "loss": 1.3408, + "step": 18280 + }, + { + "epoch": 0.07560522415977546, + "grad_norm": 3.9318661880011945, + "learning_rate": 1.978409369593391e-05, + "loss": 1.3053, + "step": 18290 + }, + { + "epoch": 0.07564656107839754, + "grad_norm": 3.3417710306838253, + "learning_rate": 1.97838225014817e-05, + "loss": 1.2781, + "step": 18300 + }, + { + "epoch": 0.07568789799701961, + "grad_norm": 3.54702678079395, + "learning_rate": 1.9783551138677197e-05, + "loss": 1.2998, + "step": 18310 + }, + { + "epoch": 0.07572923491564168, + "grad_norm": 3.4741727106428946, + "learning_rate": 1.978327960752507e-05, + "loss": 1.3003, + "step": 18320 + }, + { + "epoch": 0.07577057183426376, + "grad_norm": 3.2924386529485563, + "learning_rate": 1.9783007908029995e-05, + "loss": 1.3095, + "step": 18330 + }, + { + "epoch": 0.07581190875288583, + "grad_norm": 3.516437725795232, + "learning_rate": 1.978273604019664e-05, + "loss": 1.3088, + "step": 18340 + }, + { + "epoch": 0.0758532456715079, + "grad_norm": 3.5342954273687193, + "learning_rate": 1.9782464004029692e-05, + "loss": 1.3035, + "step": 18350 + }, + { + "epoch": 0.07589458259012999, + "grad_norm": 3.3786994777959523, + "learning_rate": 1.9782191799533824e-05, + "loss": 1.315, + "step": 18360 + }, + { + "epoch": 0.07593591950875206, + "grad_norm": 3.16392973516022, + "learning_rate": 1.9781919426713725e-05, + "loss": 1.3363, + "step": 18370 + }, + { + "epoch": 0.07597725642737413, + "grad_norm": 3.5807733324507693, + "learning_rate": 1.9781646885574078e-05, + "loss": 1.313, + "step": 18380 + }, + { + "epoch": 0.07601859334599621, + "grad_norm": 2.911047627936505, + "learning_rate": 1.978137417611958e-05, + "loss": 1.2943, + "step": 18390 + }, + { + "epoch": 0.07605993026461828, + "grad_norm": 3.519244637721149, + "learning_rate": 1.9781101298354913e-05, + "loss": 1.3013, + "step": 18400 + }, + { + "epoch": 0.07610126718324035, + "grad_norm": 2.80355546975897, + "learning_rate": 1.9780828252284778e-05, + "loss": 1.319, + "step": 18410 + }, + { + "epoch": 0.07614260410186244, + "grad_norm": 3.272390505391555, + "learning_rate": 1.9780555037913874e-05, + "loss": 1.3139, + "step": 18420 + }, + { + "epoch": 0.07618394102048451, + "grad_norm": 4.373312173753661, + "learning_rate": 1.9780281655246903e-05, + "loss": 1.332, + "step": 18430 + }, + { + "epoch": 0.07622527793910659, + "grad_norm": 3.4593509090352828, + "learning_rate": 1.9780008104288566e-05, + "loss": 1.3275, + "step": 18440 + }, + { + "epoch": 0.07626661485772866, + "grad_norm": 3.988419562725868, + "learning_rate": 1.9779734385043572e-05, + "loss": 1.2876, + "step": 18450 + }, + { + "epoch": 0.07630795177635073, + "grad_norm": 3.1901407057893127, + "learning_rate": 1.9779460497516633e-05, + "loss": 1.3077, + "step": 18460 + }, + { + "epoch": 0.07634928869497282, + "grad_norm": 3.392493661451085, + "learning_rate": 1.9779186441712456e-05, + "loss": 1.2853, + "step": 18470 + }, + { + "epoch": 0.07639062561359489, + "grad_norm": 3.2188126520970126, + "learning_rate": 1.9778912217635762e-05, + "loss": 1.3166, + "step": 18480 + }, + { + "epoch": 0.07643196253221696, + "grad_norm": 3.207853073897323, + "learning_rate": 1.9778637825291267e-05, + "loss": 1.2522, + "step": 18490 + }, + { + "epoch": 0.07647329945083904, + "grad_norm": 3.614534499504545, + "learning_rate": 1.9778363264683694e-05, + "loss": 1.2857, + "step": 18500 + }, + { + "epoch": 0.07651463636946111, + "grad_norm": 3.1137575271670634, + "learning_rate": 1.9778088535817765e-05, + "loss": 1.322, + "step": 18510 + }, + { + "epoch": 0.07655597328808318, + "grad_norm": 3.155733653317413, + "learning_rate": 1.977781363869821e-05, + "loss": 1.2775, + "step": 18520 + }, + { + "epoch": 0.07659731020670527, + "grad_norm": 4.553902503352338, + "learning_rate": 1.9777538573329757e-05, + "loss": 1.3102, + "step": 18530 + }, + { + "epoch": 0.07663864712532734, + "grad_norm": 3.421736022180327, + "learning_rate": 1.9777263339717143e-05, + "loss": 1.3101, + "step": 18540 + }, + { + "epoch": 0.0766799840439494, + "grad_norm": 3.0688727373371245, + "learning_rate": 1.97769879378651e-05, + "loss": 1.3116, + "step": 18550 + }, + { + "epoch": 0.07672132096257149, + "grad_norm": 3.7016289513150173, + "learning_rate": 1.977671236777837e-05, + "loss": 1.3266, + "step": 18560 + }, + { + "epoch": 0.07676265788119356, + "grad_norm": 2.918955840628811, + "learning_rate": 1.977643662946169e-05, + "loss": 1.335, + "step": 18570 + }, + { + "epoch": 0.07680399479981563, + "grad_norm": 2.710819215083447, + "learning_rate": 1.9776160722919808e-05, + "loss": 1.3241, + "step": 18580 + }, + { + "epoch": 0.07684533171843771, + "grad_norm": 4.690835156163024, + "learning_rate": 1.9775884648157473e-05, + "loss": 1.3112, + "step": 18590 + }, + { + "epoch": 0.07688666863705979, + "grad_norm": 2.8651231551210685, + "learning_rate": 1.9775608405179433e-05, + "loss": 1.2753, + "step": 18600 + }, + { + "epoch": 0.07692800555568187, + "grad_norm": 3.440907539318232, + "learning_rate": 1.9775331993990445e-05, + "loss": 1.3065, + "step": 18610 + }, + { + "epoch": 0.07696934247430394, + "grad_norm": 3.284791911253238, + "learning_rate": 1.977505541459526e-05, + "loss": 1.2491, + "step": 18620 + }, + { + "epoch": 0.07701067939292601, + "grad_norm": 3.9127411714280815, + "learning_rate": 1.977477866699864e-05, + "loss": 1.3486, + "step": 18630 + }, + { + "epoch": 0.0770520163115481, + "grad_norm": 3.6832173528899292, + "learning_rate": 1.9774501751205343e-05, + "loss": 1.26, + "step": 18640 + }, + { + "epoch": 0.07709335323017016, + "grad_norm": 3.037263148140778, + "learning_rate": 1.9774224667220145e-05, + "loss": 1.3066, + "step": 18650 + }, + { + "epoch": 0.07713469014879223, + "grad_norm": 3.9474254309978978, + "learning_rate": 1.97739474150478e-05, + "loss": 1.3526, + "step": 18660 + }, + { + "epoch": 0.07717602706741432, + "grad_norm": 3.2632360352472087, + "learning_rate": 1.977366999469309e-05, + "loss": 1.242, + "step": 18670 + }, + { + "epoch": 0.07721736398603639, + "grad_norm": 3.398609819663782, + "learning_rate": 1.977339240616078e-05, + "loss": 1.3083, + "step": 18680 + }, + { + "epoch": 0.07725870090465846, + "grad_norm": 2.8558120813505274, + "learning_rate": 1.977311464945565e-05, + "loss": 1.3002, + "step": 18690 + }, + { + "epoch": 0.07730003782328054, + "grad_norm": 3.3599778107706553, + "learning_rate": 1.9772836724582483e-05, + "loss": 1.3299, + "step": 18700 + }, + { + "epoch": 0.07734137474190261, + "grad_norm": 3.9628018743002658, + "learning_rate": 1.9772558631546054e-05, + "loss": 1.3115, + "step": 18710 + }, + { + "epoch": 0.07738271166052468, + "grad_norm": 3.239048590526076, + "learning_rate": 1.9772280370351155e-05, + "loss": 1.2683, + "step": 18720 + }, + { + "epoch": 0.07742404857914677, + "grad_norm": 3.0722778687848558, + "learning_rate": 1.977200194100257e-05, + "loss": 1.3484, + "step": 18730 + }, + { + "epoch": 0.07746538549776884, + "grad_norm": 3.1378953738889637, + "learning_rate": 1.9771723343505093e-05, + "loss": 1.353, + "step": 18740 + }, + { + "epoch": 0.07750672241639092, + "grad_norm": 2.9849102736288113, + "learning_rate": 1.9771444577863517e-05, + "loss": 1.3318, + "step": 18750 + }, + { + "epoch": 0.07754805933501299, + "grad_norm": 3.006210181091503, + "learning_rate": 1.9771165644082636e-05, + "loss": 1.3095, + "step": 18760 + }, + { + "epoch": 0.07758939625363506, + "grad_norm": 2.8083464465567074, + "learning_rate": 1.9770886542167252e-05, + "loss": 1.2896, + "step": 18770 + }, + { + "epoch": 0.07763073317225715, + "grad_norm": 3.262789038521565, + "learning_rate": 1.9770607272122168e-05, + "loss": 1.3333, + "step": 18780 + }, + { + "epoch": 0.07767207009087922, + "grad_norm": 3.421189746770822, + "learning_rate": 1.9770327833952187e-05, + "loss": 1.28, + "step": 18790 + }, + { + "epoch": 0.07771340700950129, + "grad_norm": 3.006884991049078, + "learning_rate": 1.977004822766212e-05, + "loss": 1.2982, + "step": 18800 + }, + { + "epoch": 0.07775474392812337, + "grad_norm": 2.8631010619173427, + "learning_rate": 1.976976845325678e-05, + "loss": 1.2924, + "step": 18810 + }, + { + "epoch": 0.07779608084674544, + "grad_norm": 3.4772737519626533, + "learning_rate": 1.9769488510740974e-05, + "loss": 1.2927, + "step": 18820 + }, + { + "epoch": 0.07783741776536751, + "grad_norm": 2.7187723773853967, + "learning_rate": 1.976920840011953e-05, + "loss": 1.2868, + "step": 18830 + }, + { + "epoch": 0.0778787546839896, + "grad_norm": 3.145525828551427, + "learning_rate": 1.9768928121397253e-05, + "loss": 1.2662, + "step": 18840 + }, + { + "epoch": 0.07792009160261167, + "grad_norm": 3.6834084868042205, + "learning_rate": 1.9768647674578978e-05, + "loss": 1.2916, + "step": 18850 + }, + { + "epoch": 0.07796142852123374, + "grad_norm": 3.2384861614856697, + "learning_rate": 1.976836705966953e-05, + "loss": 1.2719, + "step": 18860 + }, + { + "epoch": 0.07800276543985582, + "grad_norm": 3.06441372291841, + "learning_rate": 1.976808627667373e-05, + "loss": 1.3137, + "step": 18870 + }, + { + "epoch": 0.07804410235847789, + "grad_norm": 3.1960269763685565, + "learning_rate": 1.9767805325596417e-05, + "loss": 1.2943, + "step": 18880 + }, + { + "epoch": 0.07808543927709996, + "grad_norm": 3.6639857210827778, + "learning_rate": 1.976752420644242e-05, + "loss": 1.2634, + "step": 18890 + }, + { + "epoch": 0.07812677619572204, + "grad_norm": 2.8992259499910564, + "learning_rate": 1.976724291921658e-05, + "loss": 1.3205, + "step": 18900 + }, + { + "epoch": 0.07816811311434411, + "grad_norm": 3.8410116582710963, + "learning_rate": 1.9766961463923735e-05, + "loss": 1.2778, + "step": 18910 + }, + { + "epoch": 0.0782094500329662, + "grad_norm": 4.723075181703024, + "learning_rate": 1.976667984056873e-05, + "loss": 1.2998, + "step": 18920 + }, + { + "epoch": 0.07825078695158827, + "grad_norm": 3.371508438100944, + "learning_rate": 1.976639804915641e-05, + "loss": 1.2512, + "step": 18930 + }, + { + "epoch": 0.07829212387021034, + "grad_norm": 3.6639818461414677, + "learning_rate": 1.976611608969162e-05, + "loss": 1.2961, + "step": 18940 + }, + { + "epoch": 0.07833346078883242, + "grad_norm": 2.8877225304005525, + "learning_rate": 1.976583396217922e-05, + "loss": 1.3345, + "step": 18950 + }, + { + "epoch": 0.0783747977074545, + "grad_norm": 3.282148781915577, + "learning_rate": 1.9765551666624062e-05, + "loss": 1.3293, + "step": 18960 + }, + { + "epoch": 0.07841613462607656, + "grad_norm": 3.1290209387742007, + "learning_rate": 1.9765269203030996e-05, + "loss": 1.3202, + "step": 18970 + }, + { + "epoch": 0.07845747154469865, + "grad_norm": 2.9401948590630798, + "learning_rate": 1.9764986571404892e-05, + "loss": 1.2739, + "step": 18980 + }, + { + "epoch": 0.07849880846332072, + "grad_norm": 2.726413528264204, + "learning_rate": 1.9764703771750606e-05, + "loss": 1.3417, + "step": 18990 + }, + { + "epoch": 0.07854014538194279, + "grad_norm": 3.8708836373349693, + "learning_rate": 1.976442080407301e-05, + "loss": 1.2817, + "step": 19000 + }, + { + "epoch": 0.07858148230056487, + "grad_norm": 2.9568442793661225, + "learning_rate": 1.976413766837697e-05, + "loss": 1.3157, + "step": 19010 + }, + { + "epoch": 0.07862281921918694, + "grad_norm": 3.2014326193422895, + "learning_rate": 1.9763854364667355e-05, + "loss": 1.2734, + "step": 19020 + }, + { + "epoch": 0.07866415613780901, + "grad_norm": 3.8142983049855204, + "learning_rate": 1.9763570892949048e-05, + "loss": 1.3268, + "step": 19030 + }, + { + "epoch": 0.0787054930564311, + "grad_norm": 3.2687191971325555, + "learning_rate": 1.976328725322692e-05, + "loss": 1.2922, + "step": 19040 + }, + { + "epoch": 0.07874682997505317, + "grad_norm": 3.8385310533023276, + "learning_rate": 1.9763003445505854e-05, + "loss": 1.274, + "step": 19050 + }, + { + "epoch": 0.07878816689367525, + "grad_norm": 3.11002301324885, + "learning_rate": 1.9762719469790736e-05, + "loss": 1.3201, + "step": 19060 + }, + { + "epoch": 0.07882950381229732, + "grad_norm": 3.1064094109546385, + "learning_rate": 1.9762435326086446e-05, + "loss": 1.2524, + "step": 19070 + }, + { + "epoch": 0.07887084073091939, + "grad_norm": 2.8255870735206687, + "learning_rate": 1.976215101439788e-05, + "loss": 1.2982, + "step": 19080 + }, + { + "epoch": 0.07891217764954148, + "grad_norm": 3.041236956663884, + "learning_rate": 1.9761866534729926e-05, + "loss": 1.2784, + "step": 19090 + }, + { + "epoch": 0.07895351456816355, + "grad_norm": 3.0762050459561032, + "learning_rate": 1.976158188708748e-05, + "loss": 1.2615, + "step": 19100 + }, + { + "epoch": 0.07899485148678562, + "grad_norm": 4.057630290231405, + "learning_rate": 1.976129707147544e-05, + "loss": 1.2956, + "step": 19110 + }, + { + "epoch": 0.0790361884054077, + "grad_norm": 3.530641336009131, + "learning_rate": 1.976101208789871e-05, + "loss": 1.3015, + "step": 19120 + }, + { + "epoch": 0.07907752532402977, + "grad_norm": 3.457139153653439, + "learning_rate": 1.976072693636219e-05, + "loss": 1.3007, + "step": 19130 + }, + { + "epoch": 0.07911886224265184, + "grad_norm": 4.247767055366308, + "learning_rate": 1.9760441616870785e-05, + "loss": 1.3284, + "step": 19140 + }, + { + "epoch": 0.07916019916127393, + "grad_norm": 3.4313340038672995, + "learning_rate": 1.976015612942941e-05, + "loss": 1.3064, + "step": 19150 + }, + { + "epoch": 0.079201536079896, + "grad_norm": 3.908560338711229, + "learning_rate": 1.9759870474042973e-05, + "loss": 1.3116, + "step": 19160 + }, + { + "epoch": 0.07924287299851807, + "grad_norm": 3.745765592110947, + "learning_rate": 1.9759584650716395e-05, + "loss": 1.3737, + "step": 19170 + }, + { + "epoch": 0.07928420991714015, + "grad_norm": 3.078986622432472, + "learning_rate": 1.9759298659454588e-05, + "loss": 1.2788, + "step": 19180 + }, + { + "epoch": 0.07932554683576222, + "grad_norm": 2.987786079650764, + "learning_rate": 1.9759012500262474e-05, + "loss": 1.2834, + "step": 19190 + }, + { + "epoch": 0.07936688375438429, + "grad_norm": 3.1036456287250673, + "learning_rate": 1.975872617314498e-05, + "loss": 1.2831, + "step": 19200 + }, + { + "epoch": 0.07940822067300637, + "grad_norm": 3.865690690229234, + "learning_rate": 1.9758439678107033e-05, + "loss": 1.2922, + "step": 19210 + }, + { + "epoch": 0.07944955759162844, + "grad_norm": 3.3185016462394588, + "learning_rate": 1.9758153015153553e-05, + "loss": 1.3349, + "step": 19220 + }, + { + "epoch": 0.07949089451025053, + "grad_norm": 2.971387000152931, + "learning_rate": 1.975786618428949e-05, + "loss": 1.2411, + "step": 19230 + }, + { + "epoch": 0.0795322314288726, + "grad_norm": 3.1868412483457065, + "learning_rate": 1.9757579185519766e-05, + "loss": 1.3152, + "step": 19240 + }, + { + "epoch": 0.07957356834749467, + "grad_norm": 3.573887753524877, + "learning_rate": 1.9757292018849322e-05, + "loss": 1.32, + "step": 19250 + }, + { + "epoch": 0.07961490526611675, + "grad_norm": 2.6269840263774347, + "learning_rate": 1.9757004684283107e-05, + "loss": 1.3123, + "step": 19260 + }, + { + "epoch": 0.07965624218473882, + "grad_norm": 3.1839995568616546, + "learning_rate": 1.9756717181826054e-05, + "loss": 1.3305, + "step": 19270 + }, + { + "epoch": 0.0796975791033609, + "grad_norm": 3.5732944308856474, + "learning_rate": 1.9756429511483117e-05, + "loss": 1.298, + "step": 19280 + }, + { + "epoch": 0.07973891602198298, + "grad_norm": 3.612052010659264, + "learning_rate": 1.9756141673259247e-05, + "loss": 1.2797, + "step": 19290 + }, + { + "epoch": 0.07978025294060505, + "grad_norm": 3.129559274239735, + "learning_rate": 1.9755853667159392e-05, + "loss": 1.3242, + "step": 19300 + }, + { + "epoch": 0.07982158985922712, + "grad_norm": 3.2804938014945915, + "learning_rate": 1.9755565493188507e-05, + "loss": 1.2882, + "step": 19310 + }, + { + "epoch": 0.0798629267778492, + "grad_norm": 3.5207612414520444, + "learning_rate": 1.9755277151351558e-05, + "loss": 1.3292, + "step": 19320 + }, + { + "epoch": 0.07990426369647127, + "grad_norm": 3.2901806629006356, + "learning_rate": 1.9754988641653502e-05, + "loss": 1.2829, + "step": 19330 + }, + { + "epoch": 0.07994560061509334, + "grad_norm": 3.1031625175692876, + "learning_rate": 1.97546999640993e-05, + "loss": 1.2952, + "step": 19340 + }, + { + "epoch": 0.07998693753371543, + "grad_norm": 3.1014549071869606, + "learning_rate": 1.975441111869393e-05, + "loss": 1.2609, + "step": 19350 + }, + { + "epoch": 0.0800282744523375, + "grad_norm": 2.9394216851114217, + "learning_rate": 1.975412210544235e-05, + "loss": 1.2948, + "step": 19360 + }, + { + "epoch": 0.08006961137095958, + "grad_norm": 3.0263675235321545, + "learning_rate": 1.975383292434954e-05, + "loss": 1.3069, + "step": 19370 + }, + { + "epoch": 0.08011094828958165, + "grad_norm": 3.0848357434937124, + "learning_rate": 1.9753543575420477e-05, + "loss": 1.2747, + "step": 19380 + }, + { + "epoch": 0.08015228520820372, + "grad_norm": 2.9964068973521774, + "learning_rate": 1.9753254058660132e-05, + "loss": 1.3225, + "step": 19390 + }, + { + "epoch": 0.0801936221268258, + "grad_norm": 3.2536839847485224, + "learning_rate": 1.9752964374073494e-05, + "loss": 1.3448, + "step": 19400 + }, + { + "epoch": 0.08023495904544788, + "grad_norm": 3.37952149059762, + "learning_rate": 1.9752674521665546e-05, + "loss": 1.2845, + "step": 19410 + }, + { + "epoch": 0.08027629596406995, + "grad_norm": 3.0635418759023687, + "learning_rate": 1.9752384501441276e-05, + "loss": 1.3123, + "step": 19420 + }, + { + "epoch": 0.08031763288269203, + "grad_norm": 3.298468467101726, + "learning_rate": 1.9752094313405674e-05, + "loss": 1.2986, + "step": 19430 + }, + { + "epoch": 0.0803589698013141, + "grad_norm": 3.2473420562854507, + "learning_rate": 1.9751803957563735e-05, + "loss": 1.327, + "step": 19440 + }, + { + "epoch": 0.08040030671993617, + "grad_norm": 3.0843171834880483, + "learning_rate": 1.975151343392045e-05, + "loss": 1.304, + "step": 19450 + }, + { + "epoch": 0.08044164363855826, + "grad_norm": 3.0717834263814856, + "learning_rate": 1.9751222742480823e-05, + "loss": 1.3133, + "step": 19460 + }, + { + "epoch": 0.08048298055718033, + "grad_norm": 2.9934271457981603, + "learning_rate": 1.9750931883249852e-05, + "loss": 1.2674, + "step": 19470 + }, + { + "epoch": 0.0805243174758024, + "grad_norm": 3.1064293377071843, + "learning_rate": 1.9750640856232548e-05, + "loss": 1.2917, + "step": 19480 + }, + { + "epoch": 0.08056565439442448, + "grad_norm": 2.980738543973593, + "learning_rate": 1.975034966143391e-05, + "loss": 1.2978, + "step": 19490 + }, + { + "epoch": 0.08060699131304655, + "grad_norm": 3.383580434705728, + "learning_rate": 1.975005829885896e-05, + "loss": 1.2603, + "step": 19500 + }, + { + "epoch": 0.08064832823166862, + "grad_norm": 3.2830909352425564, + "learning_rate": 1.97497667685127e-05, + "loss": 1.3043, + "step": 19510 + }, + { + "epoch": 0.0806896651502907, + "grad_norm": 3.3969068546862036, + "learning_rate": 1.9749475070400157e-05, + "loss": 1.3338, + "step": 19520 + }, + { + "epoch": 0.08073100206891277, + "grad_norm": 3.0880094364724773, + "learning_rate": 1.974918320452634e-05, + "loss": 1.2351, + "step": 19530 + }, + { + "epoch": 0.08077233898753486, + "grad_norm": 3.4417279889999564, + "learning_rate": 1.974889117089628e-05, + "loss": 1.264, + "step": 19540 + }, + { + "epoch": 0.08081367590615693, + "grad_norm": 2.904756528303193, + "learning_rate": 1.9748598969514993e-05, + "loss": 1.2647, + "step": 19550 + }, + { + "epoch": 0.080855012824779, + "grad_norm": 2.7148091448555722, + "learning_rate": 1.9748306600387516e-05, + "loss": 1.2989, + "step": 19560 + }, + { + "epoch": 0.08089634974340108, + "grad_norm": 3.651161572746769, + "learning_rate": 1.9748014063518875e-05, + "loss": 1.3161, + "step": 19570 + }, + { + "epoch": 0.08093768666202315, + "grad_norm": 3.1157328327095866, + "learning_rate": 1.9747721358914106e-05, + "loss": 1.2713, + "step": 19580 + }, + { + "epoch": 0.08097902358064522, + "grad_norm": 3.131352709584711, + "learning_rate": 1.9747428486578243e-05, + "loss": 1.2904, + "step": 19590 + }, + { + "epoch": 0.08102036049926731, + "grad_norm": 2.942079614415163, + "learning_rate": 1.9747135446516327e-05, + "loss": 1.2857, + "step": 19600 + }, + { + "epoch": 0.08106169741788938, + "grad_norm": 2.8838500418296955, + "learning_rate": 1.9746842238733404e-05, + "loss": 1.3162, + "step": 19610 + }, + { + "epoch": 0.08110303433651145, + "grad_norm": 3.62285581087724, + "learning_rate": 1.9746548863234512e-05, + "loss": 1.3105, + "step": 19620 + }, + { + "epoch": 0.08114437125513353, + "grad_norm": 2.8759608727278927, + "learning_rate": 1.9746255320024702e-05, + "loss": 1.2757, + "step": 19630 + }, + { + "epoch": 0.0811857081737556, + "grad_norm": 3.2868321525430377, + "learning_rate": 1.974596160910903e-05, + "loss": 1.2808, + "step": 19640 + }, + { + "epoch": 0.08122704509237767, + "grad_norm": 2.9255021662463, + "learning_rate": 1.9745667730492543e-05, + "loss": 1.2982, + "step": 19650 + }, + { + "epoch": 0.08126838201099976, + "grad_norm": 3.6734905665007336, + "learning_rate": 1.97453736841803e-05, + "loss": 1.3469, + "step": 19660 + }, + { + "epoch": 0.08130971892962183, + "grad_norm": 2.9210368580920294, + "learning_rate": 1.974507947017736e-05, + "loss": 1.2624, + "step": 19670 + }, + { + "epoch": 0.08135105584824391, + "grad_norm": 3.5155014430588514, + "learning_rate": 1.974478508848879e-05, + "loss": 1.3032, + "step": 19680 + }, + { + "epoch": 0.08139239276686598, + "grad_norm": 3.1972640975511966, + "learning_rate": 1.9744490539119652e-05, + "loss": 1.2663, + "step": 19690 + }, + { + "epoch": 0.08143372968548805, + "grad_norm": 3.137196038716376, + "learning_rate": 1.9744195822075016e-05, + "loss": 1.2599, + "step": 19700 + }, + { + "epoch": 0.08147506660411014, + "grad_norm": 3.429683582295254, + "learning_rate": 1.974390093735995e-05, + "loss": 1.3125, + "step": 19710 + }, + { + "epoch": 0.0815164035227322, + "grad_norm": 3.4935153839614213, + "learning_rate": 1.974360588497953e-05, + "loss": 1.3021, + "step": 19720 + }, + { + "epoch": 0.08155774044135428, + "grad_norm": 3.693677622986634, + "learning_rate": 1.9743310664938836e-05, + "loss": 1.3154, + "step": 19730 + }, + { + "epoch": 0.08159907735997636, + "grad_norm": 3.1960119762782995, + "learning_rate": 1.9743015277242942e-05, + "loss": 1.2931, + "step": 19740 + }, + { + "epoch": 0.08164041427859843, + "grad_norm": 3.362501999299266, + "learning_rate": 1.9742719721896936e-05, + "loss": 1.2977, + "step": 19750 + }, + { + "epoch": 0.0816817511972205, + "grad_norm": 3.152254479818853, + "learning_rate": 1.97424239989059e-05, + "loss": 1.2571, + "step": 19760 + }, + { + "epoch": 0.08172308811584258, + "grad_norm": 3.78958825416846, + "learning_rate": 1.9742128108274926e-05, + "loss": 1.2903, + "step": 19770 + }, + { + "epoch": 0.08176442503446466, + "grad_norm": 3.2123145951877077, + "learning_rate": 1.9741832050009102e-05, + "loss": 1.282, + "step": 19780 + }, + { + "epoch": 0.08180576195308673, + "grad_norm": 3.1167331240538076, + "learning_rate": 1.9741535824113526e-05, + "loss": 1.2552, + "step": 19790 + }, + { + "epoch": 0.08184709887170881, + "grad_norm": 4.206288270940374, + "learning_rate": 1.974123943059329e-05, + "loss": 1.2597, + "step": 19800 + }, + { + "epoch": 0.08188843579033088, + "grad_norm": 3.553992295271698, + "learning_rate": 1.9740942869453504e-05, + "loss": 1.2908, + "step": 19810 + }, + { + "epoch": 0.08192977270895295, + "grad_norm": 3.521907001370512, + "learning_rate": 1.974064614069926e-05, + "loss": 1.297, + "step": 19820 + }, + { + "epoch": 0.08197110962757503, + "grad_norm": 3.360470421890721, + "learning_rate": 1.9740349244335665e-05, + "loss": 1.2882, + "step": 19830 + }, + { + "epoch": 0.0820124465461971, + "grad_norm": 2.9432002594333464, + "learning_rate": 1.9740052180367836e-05, + "loss": 1.252, + "step": 19840 + }, + { + "epoch": 0.08205378346481919, + "grad_norm": 3.252199526734451, + "learning_rate": 1.9739754948800874e-05, + "loss": 1.2805, + "step": 19850 + }, + { + "epoch": 0.08209512038344126, + "grad_norm": 3.1871490405890586, + "learning_rate": 1.9739457549639905e-05, + "loss": 1.2697, + "step": 19860 + }, + { + "epoch": 0.08213645730206333, + "grad_norm": 3.1800803475250166, + "learning_rate": 1.973915998289004e-05, + "loss": 1.3342, + "step": 19870 + }, + { + "epoch": 0.08217779422068541, + "grad_norm": 2.8783537685939478, + "learning_rate": 1.9738862248556395e-05, + "loss": 1.2471, + "step": 19880 + }, + { + "epoch": 0.08221913113930748, + "grad_norm": 2.7100348444687246, + "learning_rate": 1.9738564346644103e-05, + "loss": 1.2827, + "step": 19890 + }, + { + "epoch": 0.08226046805792955, + "grad_norm": 3.509524916127054, + "learning_rate": 1.973826627715828e-05, + "loss": 1.3025, + "step": 19900 + }, + { + "epoch": 0.08230180497655164, + "grad_norm": 3.3725272601095795, + "learning_rate": 1.9737968040104065e-05, + "loss": 1.3234, + "step": 19910 + }, + { + "epoch": 0.08234314189517371, + "grad_norm": 3.341475977428283, + "learning_rate": 1.9737669635486585e-05, + "loss": 1.3203, + "step": 19920 + }, + { + "epoch": 0.08238447881379578, + "grad_norm": 3.185315976662971, + "learning_rate": 1.9737371063310972e-05, + "loss": 1.2828, + "step": 19930 + }, + { + "epoch": 0.08242581573241786, + "grad_norm": 3.6059578502827945, + "learning_rate": 1.9737072323582366e-05, + "loss": 1.3272, + "step": 19940 + }, + { + "epoch": 0.08246715265103993, + "grad_norm": 3.486164979637442, + "learning_rate": 1.973677341630591e-05, + "loss": 1.2619, + "step": 19950 + }, + { + "epoch": 0.082508489569662, + "grad_norm": 3.3788628785271935, + "learning_rate": 1.9736474341486742e-05, + "loss": 1.2866, + "step": 19960 + }, + { + "epoch": 0.08254982648828409, + "grad_norm": 3.116863275035675, + "learning_rate": 1.973617509913001e-05, + "loss": 1.291, + "step": 19970 + }, + { + "epoch": 0.08259116340690616, + "grad_norm": 2.7964040776322796, + "learning_rate": 1.973587568924087e-05, + "loss": 1.2725, + "step": 19980 + }, + { + "epoch": 0.08263250032552824, + "grad_norm": 2.797740269032973, + "learning_rate": 1.9735576111824465e-05, + "loss": 1.2742, + "step": 19990 + }, + { + "epoch": 0.08267383724415031, + "grad_norm": 3.2432141472722162, + "learning_rate": 1.9735276366885956e-05, + "loss": 1.2947, + "step": 20000 + }, + { + "epoch": 0.08267383724415031, + "eval_loss": 1.5713279247283936, + "eval_runtime": 392.3898, + "eval_samples_per_second": 10.439, + "eval_steps_per_second": 2.61, + "step": 20000 + }, + { + "epoch": 0.08267797093601252, + "step": 20001, + "total_flos": 0.0, + "train_loss": 6.551032936291113e-05, + "train_runtime": 86.8457, + "train_samples_per_second": 14738.777, + "train_steps_per_second": 230.293 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}