{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997956025345286, "eval_steps": 2000, "global_step": 4280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002335971033959179, "grad_norm": 73.6875, "learning_rate": 9.999954375607375e-07, "loss": 107.9787, "step": 10 }, { "epoch": 0.004671942067918358, "grad_norm": 73.25, "learning_rate": 9.99990875121475e-07, "loss": 103.9372, "step": 20 }, { "epoch": 0.007007913101877537, "grad_norm": 59.53125, "learning_rate": 9.999863126822123e-07, "loss": 100.6365, "step": 30 }, { "epoch": 0.009343884135836715, "grad_norm": 60.3125, "learning_rate": 9.999817502429498e-07, "loss": 100.6602, "step": 40 }, { "epoch": 0.011679855169795894, "grad_norm": 59.1875, "learning_rate": 9.999771878036874e-07, "loss": 100.0208, "step": 50 }, { "epoch": 0.014015826203755074, "grad_norm": 55.53125, "learning_rate": 9.999726253644248e-07, "loss": 98.8471, "step": 60 }, { "epoch": 0.016351797237714252, "grad_norm": 55.3125, "learning_rate": 9.999680629251621e-07, "loss": 99.5175, "step": 70 }, { "epoch": 0.01868776827167343, "grad_norm": 58.53125, "learning_rate": 9.999635004858997e-07, "loss": 98.5395, "step": 80 }, { "epoch": 0.02102373930563261, "grad_norm": 59.8125, "learning_rate": 9.999589380466373e-07, "loss": 98.9192, "step": 90 }, { "epoch": 0.023359710339591788, "grad_norm": 63.78125, "learning_rate": 9.999543756073747e-07, "loss": 97.6611, "step": 100 }, { "epoch": 0.02569568137355097, "grad_norm": 58.9375, "learning_rate": 9.99949813168112e-07, "loss": 97.8031, "step": 110 }, { "epoch": 0.028031652407510148, "grad_norm": 59.1875, "learning_rate": 9.999452507288496e-07, "loss": 98.0507, "step": 120 }, { "epoch": 0.030367623441469326, "grad_norm": 51.15625, "learning_rate": 9.999406882895872e-07, "loss": 96.8367, "step": 130 }, { "epoch": 0.032703594475428505, "grad_norm": 56.09375, "learning_rate": 9.999361258503245e-07, "loss": 96.7883, "step": 140 }, { "epoch": 0.035039565509387686, "grad_norm": 54.9375, "learning_rate": 9.99931563411062e-07, "loss": 97.7532, "step": 150 }, { "epoch": 0.03737553654334686, "grad_norm": 61.21875, "learning_rate": 9.999270009717995e-07, "loss": 97.0359, "step": 160 }, { "epoch": 0.03971150757730604, "grad_norm": 60.5625, "learning_rate": 9.99922438532537e-07, "loss": 96.5724, "step": 170 }, { "epoch": 0.04204747861126522, "grad_norm": 52.625, "learning_rate": 9.999178760932744e-07, "loss": 97.048, "step": 180 }, { "epoch": 0.0443834496452244, "grad_norm": 57.53125, "learning_rate": 9.999133136540118e-07, "loss": 96.4664, "step": 190 }, { "epoch": 0.046719420679183575, "grad_norm": 52.875, "learning_rate": 9.999087512147494e-07, "loss": 96.2538, "step": 200 }, { "epoch": 0.04905539171314276, "grad_norm": 63.71875, "learning_rate": 9.99904188775487e-07, "loss": 95.9222, "step": 210 }, { "epoch": 0.05139136274710194, "grad_norm": 54.8125, "learning_rate": 9.998996263362243e-07, "loss": 96.0401, "step": 220 }, { "epoch": 0.053727333781061114, "grad_norm": 55.0625, "learning_rate": 9.998950638969619e-07, "loss": 96.3144, "step": 230 }, { "epoch": 0.056063304815020296, "grad_norm": 53.5625, "learning_rate": 9.998905014576992e-07, "loss": 95.5121, "step": 240 }, { "epoch": 0.05839927584897947, "grad_norm": 52.59375, "learning_rate": 9.998859390184368e-07, "loss": 95.5379, "step": 250 }, { "epoch": 0.06073524688293865, "grad_norm": 51.875, "learning_rate": 9.998813765791742e-07, "loss": 94.811, "step": 260 }, { "epoch": 0.06307121791689783, "grad_norm": 54.46875, "learning_rate": 9.998768141399118e-07, "loss": 95.986, "step": 270 }, { "epoch": 0.06540718895085701, "grad_norm": 55.1875, "learning_rate": 9.998722517006491e-07, "loss": 95.3928, "step": 280 }, { "epoch": 0.06774315998481618, "grad_norm": 56.625, "learning_rate": 9.998676892613867e-07, "loss": 94.6728, "step": 290 }, { "epoch": 0.07007913101877537, "grad_norm": 55.09375, "learning_rate": 9.99863126822124e-07, "loss": 95.4789, "step": 300 }, { "epoch": 0.07241510205273455, "grad_norm": 61.96875, "learning_rate": 9.998585643828616e-07, "loss": 95.9337, "step": 310 }, { "epoch": 0.07475107308669372, "grad_norm": 55.34375, "learning_rate": 9.99854001943599e-07, "loss": 94.6639, "step": 320 }, { "epoch": 0.0770870441206529, "grad_norm": 55.3125, "learning_rate": 9.998494395043366e-07, "loss": 95.4775, "step": 330 }, { "epoch": 0.07942301515461209, "grad_norm": 51.65625, "learning_rate": 9.99844877065074e-07, "loss": 94.9841, "step": 340 }, { "epoch": 0.08175898618857126, "grad_norm": 55.5, "learning_rate": 9.998403146258115e-07, "loss": 94.7989, "step": 350 }, { "epoch": 0.08409495722253044, "grad_norm": 51.21875, "learning_rate": 9.998357521865489e-07, "loss": 93.6134, "step": 360 }, { "epoch": 0.08643092825648963, "grad_norm": 62.40625, "learning_rate": 9.998311897472865e-07, "loss": 95.1199, "step": 370 }, { "epoch": 0.0887668992904488, "grad_norm": 48.125, "learning_rate": 9.998266273080238e-07, "loss": 93.7213, "step": 380 }, { "epoch": 0.09110287032440798, "grad_norm": 53.125, "learning_rate": 9.998220648687614e-07, "loss": 94.9632, "step": 390 }, { "epoch": 0.09343884135836715, "grad_norm": 49.96875, "learning_rate": 9.99817502429499e-07, "loss": 93.9266, "step": 400 }, { "epoch": 0.09577481239232634, "grad_norm": 49.46875, "learning_rate": 9.998129399902363e-07, "loss": 95.0468, "step": 410 }, { "epoch": 0.09811078342628551, "grad_norm": 50.59375, "learning_rate": 9.99808377550974e-07, "loss": 93.5313, "step": 420 }, { "epoch": 0.10044675446024469, "grad_norm": 54.90625, "learning_rate": 9.998038151117113e-07, "loss": 94.5912, "step": 430 }, { "epoch": 0.10278272549420388, "grad_norm": 50.96875, "learning_rate": 9.997992526724488e-07, "loss": 93.7903, "step": 440 }, { "epoch": 0.10511869652816305, "grad_norm": 52.5, "learning_rate": 9.997946902331862e-07, "loss": 93.5428, "step": 450 }, { "epoch": 0.10745466756212223, "grad_norm": 50.90625, "learning_rate": 9.997901277939238e-07, "loss": 94.4597, "step": 460 }, { "epoch": 0.1097906385960814, "grad_norm": 49.46875, "learning_rate": 9.997855653546612e-07, "loss": 94.1416, "step": 470 }, { "epoch": 0.11212660963004059, "grad_norm": 52.53125, "learning_rate": 9.997810029153987e-07, "loss": 93.9781, "step": 480 }, { "epoch": 0.11446258066399977, "grad_norm": 51.0, "learning_rate": 9.99776440476136e-07, "loss": 94.0498, "step": 490 }, { "epoch": 0.11679855169795894, "grad_norm": 50.21875, "learning_rate": 9.997718780368737e-07, "loss": 92.6403, "step": 500 }, { "epoch": 0.11913452273191813, "grad_norm": 49.03125, "learning_rate": 9.99767315597611e-07, "loss": 92.5467, "step": 510 }, { "epoch": 0.1214704937658773, "grad_norm": 54.71875, "learning_rate": 9.997627531583486e-07, "loss": 92.8119, "step": 520 }, { "epoch": 0.12380646479983648, "grad_norm": 48.40625, "learning_rate": 9.99758190719086e-07, "loss": 93.4688, "step": 530 }, { "epoch": 0.12614243583379567, "grad_norm": 53.40625, "learning_rate": 9.997536282798235e-07, "loss": 93.5298, "step": 540 }, { "epoch": 0.12847840686775483, "grad_norm": 47.96875, "learning_rate": 9.99749065840561e-07, "loss": 93.2207, "step": 550 }, { "epoch": 0.13081437790171402, "grad_norm": 56.0, "learning_rate": 9.997445034012985e-07, "loss": 93.1208, "step": 560 }, { "epoch": 0.1331503489356732, "grad_norm": 54.53125, "learning_rate": 9.997399409620359e-07, "loss": 92.8771, "step": 570 }, { "epoch": 0.13548631996963237, "grad_norm": 64.6875, "learning_rate": 9.997353785227734e-07, "loss": 93.5443, "step": 580 }, { "epoch": 0.13782229100359156, "grad_norm": 53.6875, "learning_rate": 9.997308160835108e-07, "loss": 92.5809, "step": 590 }, { "epoch": 0.14015826203755075, "grad_norm": 49.53125, "learning_rate": 9.997262536442484e-07, "loss": 93.1532, "step": 600 }, { "epoch": 0.1424942330715099, "grad_norm": 59.9375, "learning_rate": 9.99721691204986e-07, "loss": 92.8726, "step": 610 }, { "epoch": 0.1448302041054691, "grad_norm": 54.25, "learning_rate": 9.997171287657233e-07, "loss": 92.0574, "step": 620 }, { "epoch": 0.14716617513942828, "grad_norm": 52.96875, "learning_rate": 9.997125663264607e-07, "loss": 93.3626, "step": 630 }, { "epoch": 0.14950214617338745, "grad_norm": 52.875, "learning_rate": 9.997080038871982e-07, "loss": 92.2334, "step": 640 }, { "epoch": 0.15183811720734663, "grad_norm": 49.46875, "learning_rate": 9.997034414479358e-07, "loss": 94.0112, "step": 650 }, { "epoch": 0.1541740882413058, "grad_norm": 49.5, "learning_rate": 9.996988790086732e-07, "loss": 92.2169, "step": 660 }, { "epoch": 0.15651005927526498, "grad_norm": 48.84375, "learning_rate": 9.996943165694106e-07, "loss": 93.1208, "step": 670 }, { "epoch": 0.15884603030922417, "grad_norm": 47.8125, "learning_rate": 9.996897541301481e-07, "loss": 92.4204, "step": 680 }, { "epoch": 0.16118200134318333, "grad_norm": 52.1875, "learning_rate": 9.996851916908857e-07, "loss": 92.2801, "step": 690 }, { "epoch": 0.16351797237714252, "grad_norm": 51.84375, "learning_rate": 9.99680629251623e-07, "loss": 92.815, "step": 700 }, { "epoch": 0.1658539434111017, "grad_norm": 50.40625, "learning_rate": 9.996760668123604e-07, "loss": 93.1973, "step": 710 }, { "epoch": 0.16818991444506087, "grad_norm": 49.84375, "learning_rate": 9.99671504373098e-07, "loss": 93.101, "step": 720 }, { "epoch": 0.17052588547902006, "grad_norm": 50.5, "learning_rate": 9.996669419338356e-07, "loss": 92.027, "step": 730 }, { "epoch": 0.17286185651297925, "grad_norm": 47.625, "learning_rate": 9.99662379494573e-07, "loss": 92.048, "step": 740 }, { "epoch": 0.1751978275469384, "grad_norm": 58.28125, "learning_rate": 9.996578170553103e-07, "loss": 93.1853, "step": 750 }, { "epoch": 0.1775337985808976, "grad_norm": 73.875, "learning_rate": 9.996532546160479e-07, "loss": 91.2014, "step": 760 }, { "epoch": 0.1798697696148568, "grad_norm": 50.8125, "learning_rate": 9.996486921767855e-07, "loss": 92.475, "step": 770 }, { "epoch": 0.18220574064881595, "grad_norm": 50.15625, "learning_rate": 9.996441297375228e-07, "loss": 92.3456, "step": 780 }, { "epoch": 0.18454171168277514, "grad_norm": 51.3125, "learning_rate": 9.996395672982602e-07, "loss": 92.1092, "step": 790 }, { "epoch": 0.1868776827167343, "grad_norm": 53.25, "learning_rate": 9.996350048589978e-07, "loss": 92.2168, "step": 800 }, { "epoch": 0.1892136537506935, "grad_norm": 49.96875, "learning_rate": 9.996304424197353e-07, "loss": 91.5845, "step": 810 }, { "epoch": 0.19154962478465268, "grad_norm": 51.96875, "learning_rate": 9.996258799804727e-07, "loss": 92.4014, "step": 820 }, { "epoch": 0.19388559581861184, "grad_norm": 52.65625, "learning_rate": 9.9962131754121e-07, "loss": 91.9784, "step": 830 }, { "epoch": 0.19622156685257103, "grad_norm": 55.59375, "learning_rate": 9.996167551019476e-07, "loss": 92.1067, "step": 840 }, { "epoch": 0.19855753788653022, "grad_norm": 54.8125, "learning_rate": 9.996121926626852e-07, "loss": 92.2285, "step": 850 }, { "epoch": 0.20089350892048938, "grad_norm": 48.21875, "learning_rate": 9.996076302234226e-07, "loss": 92.6478, "step": 860 }, { "epoch": 0.20322947995444857, "grad_norm": 49.65625, "learning_rate": 9.9960306778416e-07, "loss": 91.2663, "step": 870 }, { "epoch": 0.20556545098840776, "grad_norm": 51.78125, "learning_rate": 9.995985053448975e-07, "loss": 91.9975, "step": 880 }, { "epoch": 0.20790142202236692, "grad_norm": 56.21875, "learning_rate": 9.99593942905635e-07, "loss": 91.8558, "step": 890 }, { "epoch": 0.2102373930563261, "grad_norm": 48.21875, "learning_rate": 9.995893804663725e-07, "loss": 92.5588, "step": 900 }, { "epoch": 0.2125733640902853, "grad_norm": 51.6875, "learning_rate": 9.995848180271098e-07, "loss": 91.8372, "step": 910 }, { "epoch": 0.21490933512424445, "grad_norm": 49.6875, "learning_rate": 9.995802555878474e-07, "loss": 91.0599, "step": 920 }, { "epoch": 0.21724530615820364, "grad_norm": 51.09375, "learning_rate": 9.99575693148585e-07, "loss": 92.0935, "step": 930 }, { "epoch": 0.2195812771921628, "grad_norm": 51.0, "learning_rate": 9.995711307093223e-07, "loss": 92.2526, "step": 940 }, { "epoch": 0.221917248226122, "grad_norm": 55.09375, "learning_rate": 9.995665682700597e-07, "loss": 91.4987, "step": 950 }, { "epoch": 0.22425321926008118, "grad_norm": 48.875, "learning_rate": 9.995620058307973e-07, "loss": 91.7583, "step": 960 }, { "epoch": 0.22658919029404034, "grad_norm": 54.28125, "learning_rate": 9.995574433915349e-07, "loss": 92.9723, "step": 970 }, { "epoch": 0.22892516132799953, "grad_norm": 47.375, "learning_rate": 9.995528809522722e-07, "loss": 91.1949, "step": 980 }, { "epoch": 0.23126113236195872, "grad_norm": 48.1875, "learning_rate": 9.995483185130098e-07, "loss": 91.6117, "step": 990 }, { "epoch": 0.23359710339591788, "grad_norm": 49.1875, "learning_rate": 9.995437560737472e-07, "loss": 91.0056, "step": 1000 }, { "epoch": 0.23593307442987707, "grad_norm": 51.46875, "learning_rate": 9.995391936344847e-07, "loss": 91.9323, "step": 1010 }, { "epoch": 0.23826904546383626, "grad_norm": 47.875, "learning_rate": 9.995346311952221e-07, "loss": 91.1979, "step": 1020 }, { "epoch": 0.24060501649779542, "grad_norm": 48.90625, "learning_rate": 9.995300687559597e-07, "loss": 91.2106, "step": 1030 }, { "epoch": 0.2429409875317546, "grad_norm": 48.5, "learning_rate": 9.99525506316697e-07, "loss": 90.451, "step": 1040 }, { "epoch": 0.2452769585657138, "grad_norm": 47.8125, "learning_rate": 9.995209438774346e-07, "loss": 90.8564, "step": 1050 }, { "epoch": 0.24761292959967296, "grad_norm": 48.4375, "learning_rate": 9.99516381438172e-07, "loss": 91.9894, "step": 1060 }, { "epoch": 0.24994890063363215, "grad_norm": 51.5, "learning_rate": 9.995118189989096e-07, "loss": 90.8876, "step": 1070 }, { "epoch": 0.25228487166759134, "grad_norm": 48.34375, "learning_rate": 9.99507256559647e-07, "loss": 89.8074, "step": 1080 }, { "epoch": 0.2546208427015505, "grad_norm": 49.71875, "learning_rate": 9.995026941203845e-07, "loss": 90.9951, "step": 1090 }, { "epoch": 0.25695681373550966, "grad_norm": 48.25, "learning_rate": 9.994981316811219e-07, "loss": 91.1307, "step": 1100 }, { "epoch": 0.2592927847694689, "grad_norm": 51.96875, "learning_rate": 9.994935692418594e-07, "loss": 90.8755, "step": 1110 }, { "epoch": 0.26162875580342804, "grad_norm": 52.0625, "learning_rate": 9.994890068025968e-07, "loss": 90.3661, "step": 1120 }, { "epoch": 0.2639647268373872, "grad_norm": 50.0625, "learning_rate": 9.994844443633344e-07, "loss": 91.0299, "step": 1130 }, { "epoch": 0.2663006978713464, "grad_norm": 48.40625, "learning_rate": 9.994798819240718e-07, "loss": 90.4072, "step": 1140 }, { "epoch": 0.2686366689053056, "grad_norm": 48.21875, "learning_rate": 9.994753194848093e-07, "loss": 90.3286, "step": 1150 }, { "epoch": 0.27097263993926474, "grad_norm": 47.375, "learning_rate": 9.994707570455467e-07, "loss": 89.8693, "step": 1160 }, { "epoch": 0.27330861097322395, "grad_norm": 47.71875, "learning_rate": 9.994661946062843e-07, "loss": 90.2988, "step": 1170 }, { "epoch": 0.2756445820071831, "grad_norm": 48.375, "learning_rate": 9.994616321670216e-07, "loss": 90.7299, "step": 1180 }, { "epoch": 0.2779805530411423, "grad_norm": 48.03125, "learning_rate": 9.994570697277592e-07, "loss": 90.5661, "step": 1190 }, { "epoch": 0.2803165240751015, "grad_norm": 50.875, "learning_rate": 9.994525072884968e-07, "loss": 91.3686, "step": 1200 }, { "epoch": 0.28265249510906065, "grad_norm": 56.40625, "learning_rate": 9.994479448492341e-07, "loss": 90.5123, "step": 1210 }, { "epoch": 0.2849884661430198, "grad_norm": 47.75, "learning_rate": 9.994433824099715e-07, "loss": 90.0628, "step": 1220 }, { "epoch": 0.28732443717697903, "grad_norm": 48.59375, "learning_rate": 9.99438819970709e-07, "loss": 91.5217, "step": 1230 }, { "epoch": 0.2896604082109382, "grad_norm": 50.375, "learning_rate": 9.994342575314467e-07, "loss": 91.552, "step": 1240 }, { "epoch": 0.29199637924489735, "grad_norm": 46.34375, "learning_rate": 9.99429695092184e-07, "loss": 90.1196, "step": 1250 }, { "epoch": 0.29433235027885657, "grad_norm": 51.28125, "learning_rate": 9.994251326529216e-07, "loss": 90.6674, "step": 1260 }, { "epoch": 0.29666832131281573, "grad_norm": 47.46875, "learning_rate": 9.99420570213659e-07, "loss": 90.2552, "step": 1270 }, { "epoch": 0.2990042923467749, "grad_norm": 47.1875, "learning_rate": 9.994160077743965e-07, "loss": 89.4563, "step": 1280 }, { "epoch": 0.30134026338073405, "grad_norm": 48.375, "learning_rate": 9.99411445335134e-07, "loss": 89.6932, "step": 1290 }, { "epoch": 0.30367623441469327, "grad_norm": 49.65625, "learning_rate": 9.994068828958715e-07, "loss": 90.6541, "step": 1300 }, { "epoch": 0.30601220544865243, "grad_norm": 47.4375, "learning_rate": 9.994023204566088e-07, "loss": 90.2363, "step": 1310 }, { "epoch": 0.3083481764826116, "grad_norm": 47.59375, "learning_rate": 9.993977580173464e-07, "loss": 89.9082, "step": 1320 }, { "epoch": 0.3106841475165708, "grad_norm": 49.9375, "learning_rate": 9.993931955780838e-07, "loss": 90.5597, "step": 1330 }, { "epoch": 0.31302011855052997, "grad_norm": 47.09375, "learning_rate": 9.993886331388214e-07, "loss": 89.9793, "step": 1340 }, { "epoch": 0.31535608958448913, "grad_norm": 46.1875, "learning_rate": 9.993840706995587e-07, "loss": 89.6057, "step": 1350 }, { "epoch": 0.31769206061844835, "grad_norm": 48.375, "learning_rate": 9.993795082602963e-07, "loss": 90.4494, "step": 1360 }, { "epoch": 0.3200280316524075, "grad_norm": 47.84375, "learning_rate": 9.993749458210337e-07, "loss": 89.3954, "step": 1370 }, { "epoch": 0.32236400268636667, "grad_norm": 51.5, "learning_rate": 9.993703833817712e-07, "loss": 88.9873, "step": 1380 }, { "epoch": 0.3246999737203259, "grad_norm": 49.375, "learning_rate": 9.993658209425086e-07, "loss": 89.7105, "step": 1390 }, { "epoch": 0.32703594475428505, "grad_norm": 48.21875, "learning_rate": 9.993612585032462e-07, "loss": 90.0021, "step": 1400 }, { "epoch": 0.3293719157882442, "grad_norm": 48.75, "learning_rate": 9.993566960639835e-07, "loss": 90.7298, "step": 1410 }, { "epoch": 0.3317078868222034, "grad_norm": 48.0625, "learning_rate": 9.993521336247211e-07, "loss": 89.614, "step": 1420 }, { "epoch": 0.3340438578561626, "grad_norm": 47.6875, "learning_rate": 9.993475711854585e-07, "loss": 90.2349, "step": 1430 }, { "epoch": 0.33637982889012175, "grad_norm": 47.6875, "learning_rate": 9.99343008746196e-07, "loss": 89.0322, "step": 1440 }, { "epoch": 0.33871579992408096, "grad_norm": 52.875, "learning_rate": 9.993384463069336e-07, "loss": 91.0856, "step": 1450 }, { "epoch": 0.3410517709580401, "grad_norm": 45.84375, "learning_rate": 9.99333883867671e-07, "loss": 89.7295, "step": 1460 }, { "epoch": 0.3433877419919993, "grad_norm": 47.875, "learning_rate": 9.993293214284084e-07, "loss": 89.7628, "step": 1470 }, { "epoch": 0.3457237130259585, "grad_norm": 50.71875, "learning_rate": 9.99324758989146e-07, "loss": 89.6516, "step": 1480 }, { "epoch": 0.34805968405991766, "grad_norm": 49.5625, "learning_rate": 9.993201965498835e-07, "loss": 89.7199, "step": 1490 }, { "epoch": 0.3503956550938768, "grad_norm": 44.96875, "learning_rate": 9.993156341106209e-07, "loss": 89.6713, "step": 1500 }, { "epoch": 0.35273162612783604, "grad_norm": 49.03125, "learning_rate": 9.993110716713582e-07, "loss": 89.7211, "step": 1510 }, { "epoch": 0.3550675971617952, "grad_norm": 46.65625, "learning_rate": 9.993065092320958e-07, "loss": 89.7591, "step": 1520 }, { "epoch": 0.35740356819575436, "grad_norm": 46.40625, "learning_rate": 9.993019467928334e-07, "loss": 89.5946, "step": 1530 }, { "epoch": 0.3597395392297136, "grad_norm": 47.46875, "learning_rate": 9.992973843535708e-07, "loss": 89.3533, "step": 1540 }, { "epoch": 0.36207551026367274, "grad_norm": 66.75, "learning_rate": 9.992928219143081e-07, "loss": 88.915, "step": 1550 }, { "epoch": 0.3644114812976319, "grad_norm": 49.625, "learning_rate": 9.992882594750457e-07, "loss": 89.5318, "step": 1560 }, { "epoch": 0.36674745233159106, "grad_norm": 52.5, "learning_rate": 9.992836970357833e-07, "loss": 89.6842, "step": 1570 }, { "epoch": 0.3690834233655503, "grad_norm": 47.65625, "learning_rate": 9.992791345965206e-07, "loss": 89.9355, "step": 1580 }, { "epoch": 0.37141939439950944, "grad_norm": 47.34375, "learning_rate": 9.99274572157258e-07, "loss": 89.0862, "step": 1590 }, { "epoch": 0.3737553654334686, "grad_norm": 47.0, "learning_rate": 9.992700097179956e-07, "loss": 89.5908, "step": 1600 }, { "epoch": 0.3760913364674278, "grad_norm": 48.40625, "learning_rate": 9.992654472787332e-07, "loss": 90.0093, "step": 1610 }, { "epoch": 0.378427307501387, "grad_norm": 46.90625, "learning_rate": 9.992608848394705e-07, "loss": 89.8005, "step": 1620 }, { "epoch": 0.38076327853534614, "grad_norm": 46.53125, "learning_rate": 9.992563224002079e-07, "loss": 89.5087, "step": 1630 }, { "epoch": 0.38309924956930536, "grad_norm": 46.6875, "learning_rate": 9.992517599609455e-07, "loss": 89.3029, "step": 1640 }, { "epoch": 0.3854352206032645, "grad_norm": 48.0, "learning_rate": 9.99247197521683e-07, "loss": 89.3145, "step": 1650 }, { "epoch": 0.3877711916372237, "grad_norm": 47.5625, "learning_rate": 9.992426350824204e-07, "loss": 88.9554, "step": 1660 }, { "epoch": 0.3901071626711829, "grad_norm": 50.25, "learning_rate": 9.992380726431578e-07, "loss": 89.8971, "step": 1670 }, { "epoch": 0.39244313370514206, "grad_norm": 49.1875, "learning_rate": 9.992335102038953e-07, "loss": 88.8999, "step": 1680 }, { "epoch": 0.3947791047391012, "grad_norm": 47.6875, "learning_rate": 9.99228947764633e-07, "loss": 90.1073, "step": 1690 }, { "epoch": 0.39711507577306043, "grad_norm": 48.375, "learning_rate": 9.992243853253703e-07, "loss": 89.0198, "step": 1700 }, { "epoch": 0.3994510468070196, "grad_norm": 49.9375, "learning_rate": 9.992198228861076e-07, "loss": 89.9081, "step": 1710 }, { "epoch": 0.40178701784097876, "grad_norm": 48.6875, "learning_rate": 9.992152604468452e-07, "loss": 89.2711, "step": 1720 }, { "epoch": 0.40412298887493797, "grad_norm": 46.34375, "learning_rate": 9.992106980075828e-07, "loss": 89.0298, "step": 1730 }, { "epoch": 0.40645895990889713, "grad_norm": 46.65625, "learning_rate": 9.992061355683202e-07, "loss": 89.1033, "step": 1740 }, { "epoch": 0.4087949309428563, "grad_norm": 47.9375, "learning_rate": 9.992015731290577e-07, "loss": 89.7967, "step": 1750 }, { "epoch": 0.4111309019768155, "grad_norm": 47.53125, "learning_rate": 9.99197010689795e-07, "loss": 87.6053, "step": 1760 }, { "epoch": 0.41346687301077467, "grad_norm": 46.6875, "learning_rate": 9.991924482505327e-07, "loss": 89.5975, "step": 1770 }, { "epoch": 0.41580284404473383, "grad_norm": 50.90625, "learning_rate": 9.9918788581127e-07, "loss": 88.9577, "step": 1780 }, { "epoch": 0.41813881507869305, "grad_norm": 49.125, "learning_rate": 9.991833233720076e-07, "loss": 88.7783, "step": 1790 }, { "epoch": 0.4204747861126522, "grad_norm": 47.9375, "learning_rate": 9.99178760932745e-07, "loss": 89.6563, "step": 1800 }, { "epoch": 0.42281075714661137, "grad_norm": 46.90625, "learning_rate": 9.991741984934826e-07, "loss": 88.626, "step": 1810 }, { "epoch": 0.4251467281805706, "grad_norm": 46.71875, "learning_rate": 9.9916963605422e-07, "loss": 87.7213, "step": 1820 }, { "epoch": 0.42748269921452975, "grad_norm": 49.40625, "learning_rate": 9.991650736149575e-07, "loss": 88.2201, "step": 1830 }, { "epoch": 0.4298186702484889, "grad_norm": 46.40625, "learning_rate": 9.991605111756949e-07, "loss": 89.3786, "step": 1840 }, { "epoch": 0.43215464128244807, "grad_norm": 48.125, "learning_rate": 9.991559487364324e-07, "loss": 87.8735, "step": 1850 }, { "epoch": 0.4344906123164073, "grad_norm": 52.09375, "learning_rate": 9.991513862971698e-07, "loss": 89.6088, "step": 1860 }, { "epoch": 0.43682658335036645, "grad_norm": 48.875, "learning_rate": 9.991468238579074e-07, "loss": 88.5974, "step": 1870 }, { "epoch": 0.4391625543843256, "grad_norm": 46.3125, "learning_rate": 9.991422614186447e-07, "loss": 89.1903, "step": 1880 }, { "epoch": 0.4414985254182848, "grad_norm": 45.1875, "learning_rate": 9.991376989793823e-07, "loss": 88.6345, "step": 1890 }, { "epoch": 0.443834496452244, "grad_norm": 45.375, "learning_rate": 9.991331365401197e-07, "loss": 88.6808, "step": 1900 }, { "epoch": 0.44617046748620315, "grad_norm": 45.6875, "learning_rate": 9.991285741008573e-07, "loss": 88.9256, "step": 1910 }, { "epoch": 0.44850643852016236, "grad_norm": 45.53125, "learning_rate": 9.991240116615946e-07, "loss": 88.0677, "step": 1920 }, { "epoch": 0.4508424095541215, "grad_norm": 47.15625, "learning_rate": 9.991194492223322e-07, "loss": 89.2818, "step": 1930 }, { "epoch": 0.4531783805880807, "grad_norm": 46.28125, "learning_rate": 9.991148867830696e-07, "loss": 88.0857, "step": 1940 }, { "epoch": 0.4555143516220399, "grad_norm": 48.78125, "learning_rate": 9.991103243438071e-07, "loss": 89.0477, "step": 1950 }, { "epoch": 0.45785032265599906, "grad_norm": 48.90625, "learning_rate": 9.991057619045447e-07, "loss": 89.073, "step": 1960 }, { "epoch": 0.4601862936899582, "grad_norm": 48.25, "learning_rate": 9.99101199465282e-07, "loss": 89.1609, "step": 1970 }, { "epoch": 0.46252226472391744, "grad_norm": 52.0625, "learning_rate": 9.990966370260194e-07, "loss": 89.7074, "step": 1980 }, { "epoch": 0.4648582357578766, "grad_norm": 47.84375, "learning_rate": 9.99092074586757e-07, "loss": 88.3551, "step": 1990 }, { "epoch": 0.46719420679183576, "grad_norm": 45.875, "learning_rate": 9.990875121474946e-07, "loss": 89.2271, "step": 2000 }, { "epoch": 0.46719420679183576, "eval_loss": 1.3847792148590088, "eval_runtime": 136.4587, "eval_samples_per_second": 1647.4, "eval_steps_per_second": 51.488, "step": 2000 }, { "epoch": 0.469530177825795, "grad_norm": 46.53125, "learning_rate": 9.99082949708232e-07, "loss": 88.5579, "step": 2010 }, { "epoch": 0.47186614885975414, "grad_norm": 46.96875, "learning_rate": 9.990783872689693e-07, "loss": 88.9332, "step": 2020 }, { "epoch": 0.4742021198937133, "grad_norm": 45.03125, "learning_rate": 9.99073824829707e-07, "loss": 88.1122, "step": 2030 }, { "epoch": 0.4765380909276725, "grad_norm": 47.8125, "learning_rate": 9.990692623904445e-07, "loss": 88.4026, "step": 2040 }, { "epoch": 0.4788740619616317, "grad_norm": 47.46875, "learning_rate": 9.990646999511818e-07, "loss": 88.9833, "step": 2050 }, { "epoch": 0.48121003299559084, "grad_norm": 49.03125, "learning_rate": 9.990601375119192e-07, "loss": 88.6076, "step": 2060 }, { "epoch": 0.48354600402955006, "grad_norm": 57.125, "learning_rate": 9.990555750726568e-07, "loss": 88.9196, "step": 2070 }, { "epoch": 0.4858819750635092, "grad_norm": 47.6875, "learning_rate": 9.990510126333944e-07, "loss": 88.4763, "step": 2080 }, { "epoch": 0.4882179460974684, "grad_norm": 49.65625, "learning_rate": 9.990464501941317e-07, "loss": 87.9524, "step": 2090 }, { "epoch": 0.4905539171314276, "grad_norm": 45.5625, "learning_rate": 9.990418877548693e-07, "loss": 88.7893, "step": 2100 }, { "epoch": 0.49288988816538676, "grad_norm": 46.5, "learning_rate": 9.990373253156067e-07, "loss": 89.0926, "step": 2110 }, { "epoch": 0.4952258591993459, "grad_norm": 52.0, "learning_rate": 9.990327628763442e-07, "loss": 88.1107, "step": 2120 }, { "epoch": 0.4975618302333051, "grad_norm": 45.84375, "learning_rate": 9.990282004370816e-07, "loss": 88.8404, "step": 2130 }, { "epoch": 0.4998978012672643, "grad_norm": 44.6875, "learning_rate": 9.990236379978192e-07, "loss": 88.8822, "step": 2140 }, { "epoch": 0.5022337723012235, "grad_norm": 47.21875, "learning_rate": 9.990190755585565e-07, "loss": 88.8674, "step": 2150 }, { "epoch": 0.5045697433351827, "grad_norm": 46.0625, "learning_rate": 9.990145131192941e-07, "loss": 88.4117, "step": 2160 }, { "epoch": 0.5069057143691418, "grad_norm": 47.0625, "learning_rate": 9.990099506800315e-07, "loss": 87.901, "step": 2170 }, { "epoch": 0.509241685403101, "grad_norm": 46.46875, "learning_rate": 9.99005388240769e-07, "loss": 88.5639, "step": 2180 }, { "epoch": 0.5115776564370602, "grad_norm": 47.9375, "learning_rate": 9.990008258015064e-07, "loss": 88.3239, "step": 2190 }, { "epoch": 0.5139136274710193, "grad_norm": 47.34375, "learning_rate": 9.98996263362244e-07, "loss": 88.649, "step": 2200 }, { "epoch": 0.5162495985049785, "grad_norm": 44.34375, "learning_rate": 9.989917009229814e-07, "loss": 87.9817, "step": 2210 }, { "epoch": 0.5185855695389378, "grad_norm": 46.375, "learning_rate": 9.98987138483719e-07, "loss": 87.0908, "step": 2220 }, { "epoch": 0.5209215405728969, "grad_norm": 45.03125, "learning_rate": 9.989825760444563e-07, "loss": 88.3031, "step": 2230 }, { "epoch": 0.5232575116068561, "grad_norm": 45.875, "learning_rate": 9.989780136051939e-07, "loss": 88.9973, "step": 2240 }, { "epoch": 0.5255934826408153, "grad_norm": 48.84375, "learning_rate": 9.989734511659312e-07, "loss": 88.8684, "step": 2250 }, { "epoch": 0.5279294536747744, "grad_norm": 44.1875, "learning_rate": 9.989688887266688e-07, "loss": 89.4066, "step": 2260 }, { "epoch": 0.5302654247087336, "grad_norm": 48.3125, "learning_rate": 9.989643262874062e-07, "loss": 87.0936, "step": 2270 }, { "epoch": 0.5326013957426928, "grad_norm": 49.25, "learning_rate": 9.989597638481438e-07, "loss": 88.0914, "step": 2280 }, { "epoch": 0.5349373667766519, "grad_norm": 44.53125, "learning_rate": 9.989552014088813e-07, "loss": 88.5739, "step": 2290 }, { "epoch": 0.5372733378106112, "grad_norm": 45.375, "learning_rate": 9.989506389696187e-07, "loss": 88.5934, "step": 2300 }, { "epoch": 0.5396093088445704, "grad_norm": 48.8125, "learning_rate": 9.98946076530356e-07, "loss": 88.2016, "step": 2310 }, { "epoch": 0.5419452798785295, "grad_norm": 46.15625, "learning_rate": 9.989415140910936e-07, "loss": 87.577, "step": 2320 }, { "epoch": 0.5442812509124887, "grad_norm": 45.28125, "learning_rate": 9.989369516518312e-07, "loss": 88.1394, "step": 2330 }, { "epoch": 0.5466172219464479, "grad_norm": 50.65625, "learning_rate": 9.989323892125686e-07, "loss": 86.5228, "step": 2340 }, { "epoch": 0.548953192980407, "grad_norm": 48.9375, "learning_rate": 9.98927826773306e-07, "loss": 88.3342, "step": 2350 }, { "epoch": 0.5512891640143662, "grad_norm": 48.71875, "learning_rate": 9.989232643340435e-07, "loss": 87.4456, "step": 2360 }, { "epoch": 0.5536251350483254, "grad_norm": 48.90625, "learning_rate": 9.98918701894781e-07, "loss": 88.0534, "step": 2370 }, { "epoch": 0.5559611060822846, "grad_norm": 45.8125, "learning_rate": 9.989141394555185e-07, "loss": 87.6145, "step": 2380 }, { "epoch": 0.5582970771162438, "grad_norm": 51.09375, "learning_rate": 9.989095770162558e-07, "loss": 86.6963, "step": 2390 }, { "epoch": 0.560633048150203, "grad_norm": 48.53125, "learning_rate": 9.989050145769934e-07, "loss": 88.0634, "step": 2400 }, { "epoch": 0.5629690191841621, "grad_norm": 82.125, "learning_rate": 9.98900452137731e-07, "loss": 87.3878, "step": 2410 }, { "epoch": 0.5653049902181213, "grad_norm": 44.6875, "learning_rate": 9.988958896984683e-07, "loss": 87.9072, "step": 2420 }, { "epoch": 0.5676409612520805, "grad_norm": 45.4375, "learning_rate": 9.988913272592057e-07, "loss": 88.5203, "step": 2430 }, { "epoch": 0.5699769322860396, "grad_norm": 77.0625, "learning_rate": 9.988867648199433e-07, "loss": 88.19, "step": 2440 }, { "epoch": 0.5723129033199988, "grad_norm": 50.03125, "learning_rate": 9.988822023806809e-07, "loss": 87.7846, "step": 2450 }, { "epoch": 0.5746488743539581, "grad_norm": 47.6875, "learning_rate": 9.988776399414182e-07, "loss": 87.4866, "step": 2460 }, { "epoch": 0.5769848453879172, "grad_norm": 47.375, "learning_rate": 9.988730775021556e-07, "loss": 87.9125, "step": 2470 }, { "epoch": 0.5793208164218764, "grad_norm": 46.65625, "learning_rate": 9.988685150628932e-07, "loss": 87.7185, "step": 2480 }, { "epoch": 0.5816567874558356, "grad_norm": 46.46875, "learning_rate": 9.988639526236307e-07, "loss": 88.5204, "step": 2490 }, { "epoch": 0.5839927584897947, "grad_norm": 46.53125, "learning_rate": 9.98859390184368e-07, "loss": 87.8029, "step": 2500 }, { "epoch": 0.5863287295237539, "grad_norm": 47.25, "learning_rate": 9.988548277451055e-07, "loss": 87.5321, "step": 2510 }, { "epoch": 0.5886647005577131, "grad_norm": 45.0625, "learning_rate": 9.98850265305843e-07, "loss": 88.0817, "step": 2520 }, { "epoch": 0.5910006715916722, "grad_norm": 48.3125, "learning_rate": 9.988457028665806e-07, "loss": 88.5133, "step": 2530 }, { "epoch": 0.5933366426256315, "grad_norm": 50.90625, "learning_rate": 9.98841140427318e-07, "loss": 87.1346, "step": 2540 }, { "epoch": 0.5956726136595906, "grad_norm": 49.5625, "learning_rate": 9.988365779880556e-07, "loss": 87.3656, "step": 2550 }, { "epoch": 0.5980085846935498, "grad_norm": 46.25, "learning_rate": 9.98832015548793e-07, "loss": 87.9664, "step": 2560 }, { "epoch": 0.600344555727509, "grad_norm": 48.6875, "learning_rate": 9.988274531095305e-07, "loss": 87.8553, "step": 2570 }, { "epoch": 0.6026805267614681, "grad_norm": 46.375, "learning_rate": 9.988228906702679e-07, "loss": 87.606, "step": 2580 }, { "epoch": 0.6050164977954273, "grad_norm": 54.21875, "learning_rate": 9.988183282310054e-07, "loss": 88.3672, "step": 2590 }, { "epoch": 0.6073524688293865, "grad_norm": 49.4375, "learning_rate": 9.988137657917428e-07, "loss": 87.1978, "step": 2600 }, { "epoch": 0.6096884398633456, "grad_norm": 46.3125, "learning_rate": 9.988092033524804e-07, "loss": 87.6631, "step": 2610 }, { "epoch": 0.6120244108973049, "grad_norm": 47.9375, "learning_rate": 9.988046409132177e-07, "loss": 87.6851, "step": 2620 }, { "epoch": 0.6143603819312641, "grad_norm": 48.59375, "learning_rate": 9.988000784739553e-07, "loss": 88.0132, "step": 2630 }, { "epoch": 0.6166963529652232, "grad_norm": 51.84375, "learning_rate": 9.987955160346927e-07, "loss": 87.3412, "step": 2640 }, { "epoch": 0.6190323239991824, "grad_norm": 48.875, "learning_rate": 9.987909535954303e-07, "loss": 87.6682, "step": 2650 }, { "epoch": 0.6213682950331416, "grad_norm": 43.75, "learning_rate": 9.987863911561676e-07, "loss": 87.5445, "step": 2660 }, { "epoch": 0.6237042660671007, "grad_norm": 48.40625, "learning_rate": 9.987818287169052e-07, "loss": 87.7517, "step": 2670 }, { "epoch": 0.6260402371010599, "grad_norm": 49.875, "learning_rate": 9.987772662776426e-07, "loss": 86.7831, "step": 2680 }, { "epoch": 0.6283762081350192, "grad_norm": 44.625, "learning_rate": 9.987727038383801e-07, "loss": 87.5577, "step": 2690 }, { "epoch": 0.6307121791689783, "grad_norm": 45.1875, "learning_rate": 9.987681413991175e-07, "loss": 87.1173, "step": 2700 }, { "epoch": 0.6330481502029375, "grad_norm": 50.5, "learning_rate": 9.98763578959855e-07, "loss": 87.7701, "step": 2710 }, { "epoch": 0.6353841212368967, "grad_norm": 49.4375, "learning_rate": 9.987590165205927e-07, "loss": 87.7584, "step": 2720 }, { "epoch": 0.6377200922708558, "grad_norm": 47.34375, "learning_rate": 9.9875445408133e-07, "loss": 86.956, "step": 2730 }, { "epoch": 0.640056063304815, "grad_norm": 49.6875, "learning_rate": 9.987498916420674e-07, "loss": 87.0904, "step": 2740 }, { "epoch": 0.6423920343387742, "grad_norm": 46.75, "learning_rate": 9.98745329202805e-07, "loss": 87.2963, "step": 2750 }, { "epoch": 0.6447280053727333, "grad_norm": 46.28125, "learning_rate": 9.987407667635425e-07, "loss": 87.8248, "step": 2760 }, { "epoch": 0.6470639764066926, "grad_norm": 46.875, "learning_rate": 9.9873620432428e-07, "loss": 88.0839, "step": 2770 }, { "epoch": 0.6493999474406518, "grad_norm": 46.125, "learning_rate": 9.987316418850173e-07, "loss": 88.2584, "step": 2780 }, { "epoch": 0.6517359184746109, "grad_norm": 46.96875, "learning_rate": 9.987270794457548e-07, "loss": 87.885, "step": 2790 }, { "epoch": 0.6540718895085701, "grad_norm": 48.875, "learning_rate": 9.987225170064924e-07, "loss": 87.3159, "step": 2800 }, { "epoch": 0.6564078605425293, "grad_norm": 47.0625, "learning_rate": 9.987179545672298e-07, "loss": 87.1514, "step": 2810 }, { "epoch": 0.6587438315764884, "grad_norm": 46.65625, "learning_rate": 9.987133921279671e-07, "loss": 87.9247, "step": 2820 }, { "epoch": 0.6610798026104476, "grad_norm": 49.28125, "learning_rate": 9.987088296887047e-07, "loss": 86.3055, "step": 2830 }, { "epoch": 0.6634157736444068, "grad_norm": 51.78125, "learning_rate": 9.987042672494423e-07, "loss": 87.9834, "step": 2840 }, { "epoch": 0.665751744678366, "grad_norm": 47.8125, "learning_rate": 9.986997048101797e-07, "loss": 87.2726, "step": 2850 }, { "epoch": 0.6680877157123252, "grad_norm": 50.28125, "learning_rate": 9.98695142370917e-07, "loss": 87.7858, "step": 2860 }, { "epoch": 0.6704236867462844, "grad_norm": 47.375, "learning_rate": 9.986905799316546e-07, "loss": 88.7471, "step": 2870 }, { "epoch": 0.6727596577802435, "grad_norm": 46.03125, "learning_rate": 9.986860174923922e-07, "loss": 87.0236, "step": 2880 }, { "epoch": 0.6750956288142027, "grad_norm": 45.53125, "learning_rate": 9.986814550531295e-07, "loss": 86.4502, "step": 2890 }, { "epoch": 0.6774315998481619, "grad_norm": 45.03125, "learning_rate": 9.98676892613867e-07, "loss": 87.5116, "step": 2900 }, { "epoch": 0.679767570882121, "grad_norm": 49.3125, "learning_rate": 9.986723301746045e-07, "loss": 87.5916, "step": 2910 }, { "epoch": 0.6821035419160802, "grad_norm": 45.90625, "learning_rate": 9.98667767735342e-07, "loss": 87.8794, "step": 2920 }, { "epoch": 0.6844395129500395, "grad_norm": 45.84375, "learning_rate": 9.986632052960794e-07, "loss": 86.8035, "step": 2930 }, { "epoch": 0.6867754839839986, "grad_norm": 48.84375, "learning_rate": 9.98658642856817e-07, "loss": 87.8464, "step": 2940 }, { "epoch": 0.6891114550179578, "grad_norm": 46.15625, "learning_rate": 9.986540804175544e-07, "loss": 87.964, "step": 2950 }, { "epoch": 0.691447426051917, "grad_norm": 46.59375, "learning_rate": 9.98649517978292e-07, "loss": 87.3683, "step": 2960 }, { "epoch": 0.6937833970858761, "grad_norm": 51.125, "learning_rate": 9.986449555390293e-07, "loss": 87.7749, "step": 2970 }, { "epoch": 0.6961193681198353, "grad_norm": 46.75, "learning_rate": 9.986403930997669e-07, "loss": 88.3439, "step": 2980 }, { "epoch": 0.6984553391537945, "grad_norm": 45.28125, "learning_rate": 9.986358306605042e-07, "loss": 87.627, "step": 2990 }, { "epoch": 0.7007913101877536, "grad_norm": 45.90625, "learning_rate": 9.986312682212418e-07, "loss": 86.3417, "step": 3000 }, { "epoch": 0.7031272812217129, "grad_norm": 48.53125, "learning_rate": 9.986267057819792e-07, "loss": 87.2988, "step": 3010 }, { "epoch": 0.7054632522556721, "grad_norm": 48.125, "learning_rate": 9.986221433427168e-07, "loss": 87.3414, "step": 3020 }, { "epoch": 0.7077992232896312, "grad_norm": 47.96875, "learning_rate": 9.986175809034541e-07, "loss": 86.9798, "step": 3030 }, { "epoch": 0.7101351943235904, "grad_norm": 47.5, "learning_rate": 9.986130184641917e-07, "loss": 87.7504, "step": 3040 }, { "epoch": 0.7124711653575496, "grad_norm": 50.46875, "learning_rate": 9.98608456024929e-07, "loss": 87.2987, "step": 3050 }, { "epoch": 0.7148071363915087, "grad_norm": 47.625, "learning_rate": 9.986038935856666e-07, "loss": 86.8407, "step": 3060 }, { "epoch": 0.7171431074254679, "grad_norm": 51.78125, "learning_rate": 9.98599331146404e-07, "loss": 86.4857, "step": 3070 }, { "epoch": 0.7194790784594272, "grad_norm": 43.46875, "learning_rate": 9.985947687071416e-07, "loss": 86.6848, "step": 3080 }, { "epoch": 0.7218150494933863, "grad_norm": 47.46875, "learning_rate": 9.98590206267879e-07, "loss": 86.5055, "step": 3090 }, { "epoch": 0.7241510205273455, "grad_norm": 46.59375, "learning_rate": 9.985856438286165e-07, "loss": 87.3536, "step": 3100 }, { "epoch": 0.7264869915613046, "grad_norm": 47.90625, "learning_rate": 9.985810813893539e-07, "loss": 86.9832, "step": 3110 }, { "epoch": 0.7288229625952638, "grad_norm": 45.8125, "learning_rate": 9.985765189500915e-07, "loss": 86.8103, "step": 3120 }, { "epoch": 0.731158933629223, "grad_norm": 46.125, "learning_rate": 9.98571956510829e-07, "loss": 87.5608, "step": 3130 }, { "epoch": 0.7334949046631821, "grad_norm": 44.25, "learning_rate": 9.985673940715664e-07, "loss": 87.2201, "step": 3140 }, { "epoch": 0.7358308756971413, "grad_norm": 46.125, "learning_rate": 9.985628316323038e-07, "loss": 87.6277, "step": 3150 }, { "epoch": 0.7381668467311006, "grad_norm": 46.09375, "learning_rate": 9.985582691930413e-07, "loss": 87.0271, "step": 3160 }, { "epoch": 0.7405028177650597, "grad_norm": 48.96875, "learning_rate": 9.98553706753779e-07, "loss": 87.2596, "step": 3170 }, { "epoch": 0.7428387887990189, "grad_norm": 50.84375, "learning_rate": 9.985491443145163e-07, "loss": 87.0863, "step": 3180 }, { "epoch": 0.7451747598329781, "grad_norm": 44.8125, "learning_rate": 9.985445818752536e-07, "loss": 87.3691, "step": 3190 }, { "epoch": 0.7475107308669372, "grad_norm": 56.15625, "learning_rate": 9.985400194359912e-07, "loss": 87.2236, "step": 3200 }, { "epoch": 0.7498467019008964, "grad_norm": 46.71875, "learning_rate": 9.985354569967288e-07, "loss": 87.7344, "step": 3210 }, { "epoch": 0.7521826729348556, "grad_norm": 45.71875, "learning_rate": 9.985308945574662e-07, "loss": 87.3144, "step": 3220 }, { "epoch": 0.7545186439688147, "grad_norm": 44.53125, "learning_rate": 9.985263321182035e-07, "loss": 86.0887, "step": 3230 }, { "epoch": 0.756854615002774, "grad_norm": 48.78125, "learning_rate": 9.98521769678941e-07, "loss": 87.4508, "step": 3240 }, { "epoch": 0.7591905860367332, "grad_norm": 45.78125, "learning_rate": 9.985172072396787e-07, "loss": 87.3724, "step": 3250 }, { "epoch": 0.7615265570706923, "grad_norm": 45.375, "learning_rate": 9.98512644800416e-07, "loss": 87.1348, "step": 3260 }, { "epoch": 0.7638625281046515, "grad_norm": 45.4375, "learning_rate": 9.985080823611534e-07, "loss": 87.7193, "step": 3270 }, { "epoch": 0.7661984991386107, "grad_norm": 47.53125, "learning_rate": 9.98503519921891e-07, "loss": 86.5363, "step": 3280 }, { "epoch": 0.7685344701725698, "grad_norm": 47.78125, "learning_rate": 9.984989574826285e-07, "loss": 87.0599, "step": 3290 }, { "epoch": 0.770870441206529, "grad_norm": 48.6875, "learning_rate": 9.98494395043366e-07, "loss": 87.0781, "step": 3300 }, { "epoch": 0.7732064122404883, "grad_norm": 46.09375, "learning_rate": 9.984898326041035e-07, "loss": 87.3568, "step": 3310 }, { "epoch": 0.7755423832744474, "grad_norm": 45.15625, "learning_rate": 9.984852701648409e-07, "loss": 86.8565, "step": 3320 }, { "epoch": 0.7778783543084066, "grad_norm": 44.65625, "learning_rate": 9.984807077255784e-07, "loss": 87.5893, "step": 3330 }, { "epoch": 0.7802143253423658, "grad_norm": 47.6875, "learning_rate": 9.984761452863158e-07, "loss": 86.4368, "step": 3340 }, { "epoch": 0.7825502963763249, "grad_norm": 46.15625, "learning_rate": 9.984715828470534e-07, "loss": 87.0425, "step": 3350 }, { "epoch": 0.7848862674102841, "grad_norm": 47.6875, "learning_rate": 9.984670204077907e-07, "loss": 87.8817, "step": 3360 }, { "epoch": 0.7872222384442433, "grad_norm": 53.6875, "learning_rate": 9.984624579685283e-07, "loss": 87.1403, "step": 3370 }, { "epoch": 0.7895582094782024, "grad_norm": 43.96875, "learning_rate": 9.984578955292657e-07, "loss": 86.3467, "step": 3380 }, { "epoch": 0.7918941805121616, "grad_norm": 48.46875, "learning_rate": 9.984533330900033e-07, "loss": 87.1886, "step": 3390 }, { "epoch": 0.7942301515461209, "grad_norm": 46.59375, "learning_rate": 9.984487706507406e-07, "loss": 85.6375, "step": 3400 }, { "epoch": 0.79656612258008, "grad_norm": 49.65625, "learning_rate": 9.984442082114782e-07, "loss": 86.6058, "step": 3410 }, { "epoch": 0.7989020936140392, "grad_norm": 46.59375, "learning_rate": 9.984396457722156e-07, "loss": 86.563, "step": 3420 }, { "epoch": 0.8012380646479984, "grad_norm": 46.4375, "learning_rate": 9.984350833329531e-07, "loss": 87.169, "step": 3430 }, { "epoch": 0.8035740356819575, "grad_norm": 45.75, "learning_rate": 9.984305208936905e-07, "loss": 87.7978, "step": 3440 }, { "epoch": 0.8059100067159167, "grad_norm": 47.8125, "learning_rate": 9.98425958454428e-07, "loss": 87.7728, "step": 3450 }, { "epoch": 0.8082459777498759, "grad_norm": 46.46875, "learning_rate": 9.984213960151654e-07, "loss": 87.2359, "step": 3460 }, { "epoch": 0.810581948783835, "grad_norm": 46.4375, "learning_rate": 9.98416833575903e-07, "loss": 86.3683, "step": 3470 }, { "epoch": 0.8129179198177943, "grad_norm": 48.71875, "learning_rate": 9.984122711366404e-07, "loss": 88.0013, "step": 3480 }, { "epoch": 0.8152538908517535, "grad_norm": 47.5625, "learning_rate": 9.98407708697378e-07, "loss": 86.5688, "step": 3490 }, { "epoch": 0.8175898618857126, "grad_norm": 48.0, "learning_rate": 9.984031462581153e-07, "loss": 87.148, "step": 3500 }, { "epoch": 0.8199258329196718, "grad_norm": 49.59375, "learning_rate": 9.983985838188529e-07, "loss": 87.0857, "step": 3510 }, { "epoch": 0.822261803953631, "grad_norm": 50.25, "learning_rate": 9.983940213795905e-07, "loss": 86.8389, "step": 3520 }, { "epoch": 0.8245977749875901, "grad_norm": 47.4375, "learning_rate": 9.983894589403278e-07, "loss": 87.4967, "step": 3530 }, { "epoch": 0.8269337460215493, "grad_norm": 46.59375, "learning_rate": 9.983848965010652e-07, "loss": 86.6476, "step": 3540 }, { "epoch": 0.8292697170555086, "grad_norm": 46.3125, "learning_rate": 9.983803340618028e-07, "loss": 87.4724, "step": 3550 }, { "epoch": 0.8316056880894677, "grad_norm": 45.1875, "learning_rate": 9.983757716225403e-07, "loss": 87.1353, "step": 3560 }, { "epoch": 0.8339416591234269, "grad_norm": 48.125, "learning_rate": 9.983712091832777e-07, "loss": 85.959, "step": 3570 }, { "epoch": 0.8362776301573861, "grad_norm": 45.75, "learning_rate": 9.98366646744015e-07, "loss": 86.5401, "step": 3580 }, { "epoch": 0.8386136011913452, "grad_norm": 47.5625, "learning_rate": 9.983620843047527e-07, "loss": 85.5629, "step": 3590 }, { "epoch": 0.8409495722253044, "grad_norm": 46.65625, "learning_rate": 9.983575218654902e-07, "loss": 87.5704, "step": 3600 }, { "epoch": 0.8432855432592636, "grad_norm": 46.46875, "learning_rate": 9.983529594262276e-07, "loss": 87.0222, "step": 3610 }, { "epoch": 0.8456215142932227, "grad_norm": 45.9375, "learning_rate": 9.98348396986965e-07, "loss": 86.3191, "step": 3620 }, { "epoch": 0.847957485327182, "grad_norm": 46.625, "learning_rate": 9.983438345477025e-07, "loss": 86.5284, "step": 3630 }, { "epoch": 0.8502934563611412, "grad_norm": 49.375, "learning_rate": 9.983392721084401e-07, "loss": 86.9173, "step": 3640 }, { "epoch": 0.8526294273951003, "grad_norm": 45.34375, "learning_rate": 9.983347096691775e-07, "loss": 86.998, "step": 3650 }, { "epoch": 0.8549653984290595, "grad_norm": 47.1875, "learning_rate": 9.983301472299148e-07, "loss": 86.2805, "step": 3660 }, { "epoch": 0.8573013694630186, "grad_norm": 47.78125, "learning_rate": 9.983255847906524e-07, "loss": 87.0101, "step": 3670 }, { "epoch": 0.8596373404969778, "grad_norm": 50.5625, "learning_rate": 9.9832102235139e-07, "loss": 87.0326, "step": 3680 }, { "epoch": 0.861973311530937, "grad_norm": 45.5, "learning_rate": 9.983164599121274e-07, "loss": 85.8297, "step": 3690 }, { "epoch": 0.8643092825648961, "grad_norm": 46.15625, "learning_rate": 9.983118974728647e-07, "loss": 86.0181, "step": 3700 }, { "epoch": 0.8666452535988554, "grad_norm": 51.125, "learning_rate": 9.983073350336023e-07, "loss": 85.7637, "step": 3710 }, { "epoch": 0.8689812246328146, "grad_norm": 48.25, "learning_rate": 9.983027725943399e-07, "loss": 86.8376, "step": 3720 }, { "epoch": 0.8713171956667737, "grad_norm": 47.96875, "learning_rate": 9.982982101550772e-07, "loss": 86.3769, "step": 3730 }, { "epoch": 0.8736531667007329, "grad_norm": 45.8125, "learning_rate": 9.982936477158148e-07, "loss": 86.1424, "step": 3740 }, { "epoch": 0.8759891377346921, "grad_norm": 46.75, "learning_rate": 9.982890852765522e-07, "loss": 86.66, "step": 3750 }, { "epoch": 0.8783251087686512, "grad_norm": 45.5, "learning_rate": 9.982845228372897e-07, "loss": 85.9828, "step": 3760 }, { "epoch": 0.8806610798026104, "grad_norm": 46.28125, "learning_rate": 9.982799603980271e-07, "loss": 86.6127, "step": 3770 }, { "epoch": 0.8829970508365697, "grad_norm": 44.4375, "learning_rate": 9.982753979587647e-07, "loss": 86.5096, "step": 3780 }, { "epoch": 0.8853330218705288, "grad_norm": 44.875, "learning_rate": 9.98270835519502e-07, "loss": 86.9895, "step": 3790 }, { "epoch": 0.887668992904488, "grad_norm": 47.15625, "learning_rate": 9.982662730802396e-07, "loss": 88.0681, "step": 3800 }, { "epoch": 0.8900049639384472, "grad_norm": 48.84375, "learning_rate": 9.98261710640977e-07, "loss": 86.534, "step": 3810 }, { "epoch": 0.8923409349724063, "grad_norm": 46.3125, "learning_rate": 9.982571482017146e-07, "loss": 86.3877, "step": 3820 }, { "epoch": 0.8946769060063655, "grad_norm": 45.6875, "learning_rate": 9.98252585762452e-07, "loss": 87.146, "step": 3830 }, { "epoch": 0.8970128770403247, "grad_norm": 48.1875, "learning_rate": 9.982480233231895e-07, "loss": 86.8891, "step": 3840 }, { "epoch": 0.8993488480742838, "grad_norm": 45.90625, "learning_rate": 9.982434608839269e-07, "loss": 86.9012, "step": 3850 }, { "epoch": 0.901684819108243, "grad_norm": 48.28125, "learning_rate": 9.982388984446644e-07, "loss": 86.8401, "step": 3860 }, { "epoch": 0.9040207901422023, "grad_norm": 49.78125, "learning_rate": 9.982343360054018e-07, "loss": 87.0155, "step": 3870 }, { "epoch": 0.9063567611761614, "grad_norm": 46.875, "learning_rate": 9.982297735661394e-07, "loss": 86.033, "step": 3880 }, { "epoch": 0.9086927322101206, "grad_norm": 47.25, "learning_rate": 9.982252111268768e-07, "loss": 87.0386, "step": 3890 }, { "epoch": 0.9110287032440798, "grad_norm": 43.375, "learning_rate": 9.982206486876143e-07, "loss": 85.8166, "step": 3900 }, { "epoch": 0.9133646742780389, "grad_norm": 46.6875, "learning_rate": 9.982160862483517e-07, "loss": 87.5271, "step": 3910 }, { "epoch": 0.9157006453119981, "grad_norm": 47.90625, "learning_rate": 9.982115238090893e-07, "loss": 85.2684, "step": 3920 }, { "epoch": 0.9180366163459573, "grad_norm": 47.53125, "learning_rate": 9.982069613698268e-07, "loss": 86.8994, "step": 3930 }, { "epoch": 0.9203725873799165, "grad_norm": 45.96875, "learning_rate": 9.982023989305642e-07, "loss": 86.4264, "step": 3940 }, { "epoch": 0.9227085584138757, "grad_norm": 45.59375, "learning_rate": 9.981978364913016e-07, "loss": 85.8178, "step": 3950 }, { "epoch": 0.9250445294478349, "grad_norm": 45.34375, "learning_rate": 9.981932740520391e-07, "loss": 87.0427, "step": 3960 }, { "epoch": 0.927380500481794, "grad_norm": 47.59375, "learning_rate": 9.981887116127767e-07, "loss": 86.6578, "step": 3970 }, { "epoch": 0.9297164715157532, "grad_norm": 49.84375, "learning_rate": 9.98184149173514e-07, "loss": 86.9295, "step": 3980 }, { "epoch": 0.9320524425497124, "grad_norm": 45.5, "learning_rate": 9.981795867342515e-07, "loss": 86.1575, "step": 3990 }, { "epoch": 0.9343884135836715, "grad_norm": 49.28125, "learning_rate": 9.98175024294989e-07, "loss": 86.415, "step": 4000 }, { "epoch": 0.9343884135836715, "eval_loss": 1.3514955043792725, "eval_runtime": 133.9042, "eval_samples_per_second": 1678.827, "eval_steps_per_second": 52.47, "step": 4000 }, { "epoch": 0.9367243846176307, "grad_norm": 46.65625, "learning_rate": 9.981704618557266e-07, "loss": 86.0902, "step": 4010 }, { "epoch": 0.93906035565159, "grad_norm": 44.65625, "learning_rate": 9.98165899416464e-07, "loss": 85.4592, "step": 4020 }, { "epoch": 0.9413963266855491, "grad_norm": 44.34375, "learning_rate": 9.981613369772013e-07, "loss": 86.5728, "step": 4030 }, { "epoch": 0.9437322977195083, "grad_norm": 46.03125, "learning_rate": 9.98156774537939e-07, "loss": 87.2485, "step": 4040 }, { "epoch": 0.9460682687534675, "grad_norm": 48.28125, "learning_rate": 9.981522120986765e-07, "loss": 87.1623, "step": 4050 }, { "epoch": 0.9484042397874266, "grad_norm": 47.96875, "learning_rate": 9.981476496594138e-07, "loss": 86.2034, "step": 4060 }, { "epoch": 0.9507402108213858, "grad_norm": 48.25, "learning_rate": 9.981430872201514e-07, "loss": 86.5078, "step": 4070 }, { "epoch": 0.953076181855345, "grad_norm": 44.53125, "learning_rate": 9.981385247808888e-07, "loss": 86.3279, "step": 4080 }, { "epoch": 0.9554121528893041, "grad_norm": 45.6875, "learning_rate": 9.981339623416264e-07, "loss": 86.4747, "step": 4090 }, { "epoch": 0.9577481239232634, "grad_norm": 47.53125, "learning_rate": 9.981293999023637e-07, "loss": 85.3221, "step": 4100 }, { "epoch": 0.9600840949572226, "grad_norm": 47.15625, "learning_rate": 9.981248374631013e-07, "loss": 85.7835, "step": 4110 }, { "epoch": 0.9624200659911817, "grad_norm": 45.96875, "learning_rate": 9.981202750238387e-07, "loss": 85.919, "step": 4120 }, { "epoch": 0.9647560370251409, "grad_norm": 46.40625, "learning_rate": 9.981157125845762e-07, "loss": 86.6488, "step": 4130 }, { "epoch": 0.9670920080591001, "grad_norm": 47.8125, "learning_rate": 9.981111501453136e-07, "loss": 86.7465, "step": 4140 }, { "epoch": 0.9694279790930592, "grad_norm": 50.96875, "learning_rate": 9.981065877060512e-07, "loss": 85.8423, "step": 4150 }, { "epoch": 0.9717639501270184, "grad_norm": 44.84375, "learning_rate": 9.981020252667885e-07, "loss": 86.4872, "step": 4160 }, { "epoch": 0.9740999211609777, "grad_norm": 51.46875, "learning_rate": 9.980974628275261e-07, "loss": 86.9111, "step": 4170 }, { "epoch": 0.9764358921949368, "grad_norm": 46.25, "learning_rate": 9.980929003882635e-07, "loss": 86.4476, "step": 4180 }, { "epoch": 0.978771863228896, "grad_norm": 47.0625, "learning_rate": 9.98088337949001e-07, "loss": 86.3345, "step": 4190 }, { "epoch": 0.9811078342628552, "grad_norm": 47.96875, "learning_rate": 9.980837755097384e-07, "loss": 87.4492, "step": 4200 }, { "epoch": 0.9834438052968143, "grad_norm": 47.53125, "learning_rate": 9.98079213070476e-07, "loss": 87.3175, "step": 4210 }, { "epoch": 0.9857797763307735, "grad_norm": 47.84375, "learning_rate": 9.980746506312134e-07, "loss": 85.7159, "step": 4220 }, { "epoch": 0.9881157473647327, "grad_norm": 50.5, "learning_rate": 9.98070088191951e-07, "loss": 85.7232, "step": 4230 }, { "epoch": 0.9904517183986918, "grad_norm": 47.1875, "learning_rate": 9.980655257526883e-07, "loss": 86.1964, "step": 4240 }, { "epoch": 0.992787689432651, "grad_norm": 46.15625, "learning_rate": 9.980609633134259e-07, "loss": 86.2977, "step": 4250 }, { "epoch": 0.9951236604666102, "grad_norm": 44.8125, "learning_rate": 9.980564008741632e-07, "loss": 85.6801, "step": 4260 }, { "epoch": 0.9974596315005694, "grad_norm": 46.15625, "learning_rate": 9.980518384349008e-07, "loss": 85.8044, "step": 4270 }, { "epoch": 0.9997956025345286, "grad_norm": 46.75, "learning_rate": 9.980472759956384e-07, "loss": 86.1971, "step": 4280 } ], "logging_steps": 10, "max_steps": 4280, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1817578952753414e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }