|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9997956025345286, |
|
"eval_steps": 2000, |
|
"global_step": 4280, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002335971033959179, |
|
"grad_norm": 73.6875, |
|
"learning_rate": 9.999954375607375e-07, |
|
"loss": 107.9787, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.004671942067918358, |
|
"grad_norm": 73.25, |
|
"learning_rate": 9.99990875121475e-07, |
|
"loss": 103.9372, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.007007913101877537, |
|
"grad_norm": 59.53125, |
|
"learning_rate": 9.999863126822123e-07, |
|
"loss": 100.6365, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.009343884135836715, |
|
"grad_norm": 60.3125, |
|
"learning_rate": 9.999817502429498e-07, |
|
"loss": 100.6602, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.011679855169795894, |
|
"grad_norm": 59.1875, |
|
"learning_rate": 9.999771878036874e-07, |
|
"loss": 100.0208, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.014015826203755074, |
|
"grad_norm": 55.53125, |
|
"learning_rate": 9.999726253644248e-07, |
|
"loss": 98.8471, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.016351797237714252, |
|
"grad_norm": 55.3125, |
|
"learning_rate": 9.999680629251621e-07, |
|
"loss": 99.5175, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.01868776827167343, |
|
"grad_norm": 58.53125, |
|
"learning_rate": 9.999635004858997e-07, |
|
"loss": 98.5395, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.02102373930563261, |
|
"grad_norm": 59.8125, |
|
"learning_rate": 9.999589380466373e-07, |
|
"loss": 98.9192, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.023359710339591788, |
|
"grad_norm": 63.78125, |
|
"learning_rate": 9.999543756073747e-07, |
|
"loss": 97.6611, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02569568137355097, |
|
"grad_norm": 58.9375, |
|
"learning_rate": 9.99949813168112e-07, |
|
"loss": 97.8031, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.028031652407510148, |
|
"grad_norm": 59.1875, |
|
"learning_rate": 9.999452507288496e-07, |
|
"loss": 98.0507, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.030367623441469326, |
|
"grad_norm": 51.15625, |
|
"learning_rate": 9.999406882895872e-07, |
|
"loss": 96.8367, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.032703594475428505, |
|
"grad_norm": 56.09375, |
|
"learning_rate": 9.999361258503245e-07, |
|
"loss": 96.7883, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.035039565509387686, |
|
"grad_norm": 54.9375, |
|
"learning_rate": 9.99931563411062e-07, |
|
"loss": 97.7532, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.03737553654334686, |
|
"grad_norm": 61.21875, |
|
"learning_rate": 9.999270009717995e-07, |
|
"loss": 97.0359, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.03971150757730604, |
|
"grad_norm": 60.5625, |
|
"learning_rate": 9.99922438532537e-07, |
|
"loss": 96.5724, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.04204747861126522, |
|
"grad_norm": 52.625, |
|
"learning_rate": 9.999178760932744e-07, |
|
"loss": 97.048, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0443834496452244, |
|
"grad_norm": 57.53125, |
|
"learning_rate": 9.999133136540118e-07, |
|
"loss": 96.4664, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.046719420679183575, |
|
"grad_norm": 52.875, |
|
"learning_rate": 9.999087512147494e-07, |
|
"loss": 96.2538, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.04905539171314276, |
|
"grad_norm": 63.71875, |
|
"learning_rate": 9.99904188775487e-07, |
|
"loss": 95.9222, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.05139136274710194, |
|
"grad_norm": 54.8125, |
|
"learning_rate": 9.998996263362243e-07, |
|
"loss": 96.0401, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.053727333781061114, |
|
"grad_norm": 55.0625, |
|
"learning_rate": 9.998950638969619e-07, |
|
"loss": 96.3144, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.056063304815020296, |
|
"grad_norm": 53.5625, |
|
"learning_rate": 9.998905014576992e-07, |
|
"loss": 95.5121, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.05839927584897947, |
|
"grad_norm": 52.59375, |
|
"learning_rate": 9.998859390184368e-07, |
|
"loss": 95.5379, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06073524688293865, |
|
"grad_norm": 51.875, |
|
"learning_rate": 9.998813765791742e-07, |
|
"loss": 94.811, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.06307121791689783, |
|
"grad_norm": 54.46875, |
|
"learning_rate": 9.998768141399118e-07, |
|
"loss": 95.986, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.06540718895085701, |
|
"grad_norm": 55.1875, |
|
"learning_rate": 9.998722517006491e-07, |
|
"loss": 95.3928, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.06774315998481618, |
|
"grad_norm": 56.625, |
|
"learning_rate": 9.998676892613867e-07, |
|
"loss": 94.6728, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.07007913101877537, |
|
"grad_norm": 55.09375, |
|
"learning_rate": 9.99863126822124e-07, |
|
"loss": 95.4789, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07241510205273455, |
|
"grad_norm": 61.96875, |
|
"learning_rate": 9.998585643828616e-07, |
|
"loss": 95.9337, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.07475107308669372, |
|
"grad_norm": 55.34375, |
|
"learning_rate": 9.99854001943599e-07, |
|
"loss": 94.6639, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.0770870441206529, |
|
"grad_norm": 55.3125, |
|
"learning_rate": 9.998494395043366e-07, |
|
"loss": 95.4775, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.07942301515461209, |
|
"grad_norm": 51.65625, |
|
"learning_rate": 9.99844877065074e-07, |
|
"loss": 94.9841, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.08175898618857126, |
|
"grad_norm": 55.5, |
|
"learning_rate": 9.998403146258115e-07, |
|
"loss": 94.7989, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.08409495722253044, |
|
"grad_norm": 51.21875, |
|
"learning_rate": 9.998357521865489e-07, |
|
"loss": 93.6134, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.08643092825648963, |
|
"grad_norm": 62.40625, |
|
"learning_rate": 9.998311897472865e-07, |
|
"loss": 95.1199, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.0887668992904488, |
|
"grad_norm": 48.125, |
|
"learning_rate": 9.998266273080238e-07, |
|
"loss": 93.7213, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.09110287032440798, |
|
"grad_norm": 53.125, |
|
"learning_rate": 9.998220648687614e-07, |
|
"loss": 94.9632, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.09343884135836715, |
|
"grad_norm": 49.96875, |
|
"learning_rate": 9.99817502429499e-07, |
|
"loss": 93.9266, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.09577481239232634, |
|
"grad_norm": 49.46875, |
|
"learning_rate": 9.998129399902363e-07, |
|
"loss": 95.0468, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.09811078342628551, |
|
"grad_norm": 50.59375, |
|
"learning_rate": 9.99808377550974e-07, |
|
"loss": 93.5313, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.10044675446024469, |
|
"grad_norm": 54.90625, |
|
"learning_rate": 9.998038151117113e-07, |
|
"loss": 94.5912, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.10278272549420388, |
|
"grad_norm": 50.96875, |
|
"learning_rate": 9.997992526724488e-07, |
|
"loss": 93.7903, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.10511869652816305, |
|
"grad_norm": 52.5, |
|
"learning_rate": 9.997946902331862e-07, |
|
"loss": 93.5428, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.10745466756212223, |
|
"grad_norm": 50.90625, |
|
"learning_rate": 9.997901277939238e-07, |
|
"loss": 94.4597, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.1097906385960814, |
|
"grad_norm": 49.46875, |
|
"learning_rate": 9.997855653546612e-07, |
|
"loss": 94.1416, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.11212660963004059, |
|
"grad_norm": 52.53125, |
|
"learning_rate": 9.997810029153987e-07, |
|
"loss": 93.9781, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.11446258066399977, |
|
"grad_norm": 51.0, |
|
"learning_rate": 9.99776440476136e-07, |
|
"loss": 94.0498, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.11679855169795894, |
|
"grad_norm": 50.21875, |
|
"learning_rate": 9.997718780368737e-07, |
|
"loss": 92.6403, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.11913452273191813, |
|
"grad_norm": 49.03125, |
|
"learning_rate": 9.99767315597611e-07, |
|
"loss": 92.5467, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.1214704937658773, |
|
"grad_norm": 54.71875, |
|
"learning_rate": 9.997627531583486e-07, |
|
"loss": 92.8119, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.12380646479983648, |
|
"grad_norm": 48.40625, |
|
"learning_rate": 9.99758190719086e-07, |
|
"loss": 93.4688, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.12614243583379567, |
|
"grad_norm": 53.40625, |
|
"learning_rate": 9.997536282798235e-07, |
|
"loss": 93.5298, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.12847840686775483, |
|
"grad_norm": 47.96875, |
|
"learning_rate": 9.99749065840561e-07, |
|
"loss": 93.2207, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.13081437790171402, |
|
"grad_norm": 56.0, |
|
"learning_rate": 9.997445034012985e-07, |
|
"loss": 93.1208, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.1331503489356732, |
|
"grad_norm": 54.53125, |
|
"learning_rate": 9.997399409620359e-07, |
|
"loss": 92.8771, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.13548631996963237, |
|
"grad_norm": 64.6875, |
|
"learning_rate": 9.997353785227734e-07, |
|
"loss": 93.5443, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.13782229100359156, |
|
"grad_norm": 53.6875, |
|
"learning_rate": 9.997308160835108e-07, |
|
"loss": 92.5809, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.14015826203755075, |
|
"grad_norm": 49.53125, |
|
"learning_rate": 9.997262536442484e-07, |
|
"loss": 93.1532, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1424942330715099, |
|
"grad_norm": 59.9375, |
|
"learning_rate": 9.99721691204986e-07, |
|
"loss": 92.8726, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.1448302041054691, |
|
"grad_norm": 54.25, |
|
"learning_rate": 9.997171287657233e-07, |
|
"loss": 92.0574, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.14716617513942828, |
|
"grad_norm": 52.96875, |
|
"learning_rate": 9.997125663264607e-07, |
|
"loss": 93.3626, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.14950214617338745, |
|
"grad_norm": 52.875, |
|
"learning_rate": 9.997080038871982e-07, |
|
"loss": 92.2334, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.15183811720734663, |
|
"grad_norm": 49.46875, |
|
"learning_rate": 9.997034414479358e-07, |
|
"loss": 94.0112, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.1541740882413058, |
|
"grad_norm": 49.5, |
|
"learning_rate": 9.996988790086732e-07, |
|
"loss": 92.2169, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.15651005927526498, |
|
"grad_norm": 48.84375, |
|
"learning_rate": 9.996943165694106e-07, |
|
"loss": 93.1208, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.15884603030922417, |
|
"grad_norm": 47.8125, |
|
"learning_rate": 9.996897541301481e-07, |
|
"loss": 92.4204, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.16118200134318333, |
|
"grad_norm": 52.1875, |
|
"learning_rate": 9.996851916908857e-07, |
|
"loss": 92.2801, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.16351797237714252, |
|
"grad_norm": 51.84375, |
|
"learning_rate": 9.99680629251623e-07, |
|
"loss": 92.815, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.1658539434111017, |
|
"grad_norm": 50.40625, |
|
"learning_rate": 9.996760668123604e-07, |
|
"loss": 93.1973, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.16818991444506087, |
|
"grad_norm": 49.84375, |
|
"learning_rate": 9.99671504373098e-07, |
|
"loss": 93.101, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.17052588547902006, |
|
"grad_norm": 50.5, |
|
"learning_rate": 9.996669419338356e-07, |
|
"loss": 92.027, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.17286185651297925, |
|
"grad_norm": 47.625, |
|
"learning_rate": 9.99662379494573e-07, |
|
"loss": 92.048, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.1751978275469384, |
|
"grad_norm": 58.28125, |
|
"learning_rate": 9.996578170553103e-07, |
|
"loss": 93.1853, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.1775337985808976, |
|
"grad_norm": 73.875, |
|
"learning_rate": 9.996532546160479e-07, |
|
"loss": 91.2014, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.1798697696148568, |
|
"grad_norm": 50.8125, |
|
"learning_rate": 9.996486921767855e-07, |
|
"loss": 92.475, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.18220574064881595, |
|
"grad_norm": 50.15625, |
|
"learning_rate": 9.996441297375228e-07, |
|
"loss": 92.3456, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.18454171168277514, |
|
"grad_norm": 51.3125, |
|
"learning_rate": 9.996395672982602e-07, |
|
"loss": 92.1092, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.1868776827167343, |
|
"grad_norm": 53.25, |
|
"learning_rate": 9.996350048589978e-07, |
|
"loss": 92.2168, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.1892136537506935, |
|
"grad_norm": 49.96875, |
|
"learning_rate": 9.996304424197353e-07, |
|
"loss": 91.5845, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.19154962478465268, |
|
"grad_norm": 51.96875, |
|
"learning_rate": 9.996258799804727e-07, |
|
"loss": 92.4014, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.19388559581861184, |
|
"grad_norm": 52.65625, |
|
"learning_rate": 9.9962131754121e-07, |
|
"loss": 91.9784, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.19622156685257103, |
|
"grad_norm": 55.59375, |
|
"learning_rate": 9.996167551019476e-07, |
|
"loss": 92.1067, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.19855753788653022, |
|
"grad_norm": 54.8125, |
|
"learning_rate": 9.996121926626852e-07, |
|
"loss": 92.2285, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.20089350892048938, |
|
"grad_norm": 48.21875, |
|
"learning_rate": 9.996076302234226e-07, |
|
"loss": 92.6478, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.20322947995444857, |
|
"grad_norm": 49.65625, |
|
"learning_rate": 9.9960306778416e-07, |
|
"loss": 91.2663, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.20556545098840776, |
|
"grad_norm": 51.78125, |
|
"learning_rate": 9.995985053448975e-07, |
|
"loss": 91.9975, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.20790142202236692, |
|
"grad_norm": 56.21875, |
|
"learning_rate": 9.99593942905635e-07, |
|
"loss": 91.8558, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.2102373930563261, |
|
"grad_norm": 48.21875, |
|
"learning_rate": 9.995893804663725e-07, |
|
"loss": 92.5588, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2125733640902853, |
|
"grad_norm": 51.6875, |
|
"learning_rate": 9.995848180271098e-07, |
|
"loss": 91.8372, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.21490933512424445, |
|
"grad_norm": 49.6875, |
|
"learning_rate": 9.995802555878474e-07, |
|
"loss": 91.0599, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.21724530615820364, |
|
"grad_norm": 51.09375, |
|
"learning_rate": 9.99575693148585e-07, |
|
"loss": 92.0935, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.2195812771921628, |
|
"grad_norm": 51.0, |
|
"learning_rate": 9.995711307093223e-07, |
|
"loss": 92.2526, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.221917248226122, |
|
"grad_norm": 55.09375, |
|
"learning_rate": 9.995665682700597e-07, |
|
"loss": 91.4987, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.22425321926008118, |
|
"grad_norm": 48.875, |
|
"learning_rate": 9.995620058307973e-07, |
|
"loss": 91.7583, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.22658919029404034, |
|
"grad_norm": 54.28125, |
|
"learning_rate": 9.995574433915349e-07, |
|
"loss": 92.9723, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.22892516132799953, |
|
"grad_norm": 47.375, |
|
"learning_rate": 9.995528809522722e-07, |
|
"loss": 91.1949, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.23126113236195872, |
|
"grad_norm": 48.1875, |
|
"learning_rate": 9.995483185130098e-07, |
|
"loss": 91.6117, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.23359710339591788, |
|
"grad_norm": 49.1875, |
|
"learning_rate": 9.995437560737472e-07, |
|
"loss": 91.0056, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.23593307442987707, |
|
"grad_norm": 51.46875, |
|
"learning_rate": 9.995391936344847e-07, |
|
"loss": 91.9323, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.23826904546383626, |
|
"grad_norm": 47.875, |
|
"learning_rate": 9.995346311952221e-07, |
|
"loss": 91.1979, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.24060501649779542, |
|
"grad_norm": 48.90625, |
|
"learning_rate": 9.995300687559597e-07, |
|
"loss": 91.2106, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.2429409875317546, |
|
"grad_norm": 48.5, |
|
"learning_rate": 9.99525506316697e-07, |
|
"loss": 90.451, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.2452769585657138, |
|
"grad_norm": 47.8125, |
|
"learning_rate": 9.995209438774346e-07, |
|
"loss": 90.8564, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.24761292959967296, |
|
"grad_norm": 48.4375, |
|
"learning_rate": 9.99516381438172e-07, |
|
"loss": 91.9894, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.24994890063363215, |
|
"grad_norm": 51.5, |
|
"learning_rate": 9.995118189989096e-07, |
|
"loss": 90.8876, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.25228487166759134, |
|
"grad_norm": 48.34375, |
|
"learning_rate": 9.99507256559647e-07, |
|
"loss": 89.8074, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.2546208427015505, |
|
"grad_norm": 49.71875, |
|
"learning_rate": 9.995026941203845e-07, |
|
"loss": 90.9951, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.25695681373550966, |
|
"grad_norm": 48.25, |
|
"learning_rate": 9.994981316811219e-07, |
|
"loss": 91.1307, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.2592927847694689, |
|
"grad_norm": 51.96875, |
|
"learning_rate": 9.994935692418594e-07, |
|
"loss": 90.8755, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.26162875580342804, |
|
"grad_norm": 52.0625, |
|
"learning_rate": 9.994890068025968e-07, |
|
"loss": 90.3661, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.2639647268373872, |
|
"grad_norm": 50.0625, |
|
"learning_rate": 9.994844443633344e-07, |
|
"loss": 91.0299, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.2663006978713464, |
|
"grad_norm": 48.40625, |
|
"learning_rate": 9.994798819240718e-07, |
|
"loss": 90.4072, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.2686366689053056, |
|
"grad_norm": 48.21875, |
|
"learning_rate": 9.994753194848093e-07, |
|
"loss": 90.3286, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.27097263993926474, |
|
"grad_norm": 47.375, |
|
"learning_rate": 9.994707570455467e-07, |
|
"loss": 89.8693, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.27330861097322395, |
|
"grad_norm": 47.71875, |
|
"learning_rate": 9.994661946062843e-07, |
|
"loss": 90.2988, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.2756445820071831, |
|
"grad_norm": 48.375, |
|
"learning_rate": 9.994616321670216e-07, |
|
"loss": 90.7299, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.2779805530411423, |
|
"grad_norm": 48.03125, |
|
"learning_rate": 9.994570697277592e-07, |
|
"loss": 90.5661, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.2803165240751015, |
|
"grad_norm": 50.875, |
|
"learning_rate": 9.994525072884968e-07, |
|
"loss": 91.3686, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.28265249510906065, |
|
"grad_norm": 56.40625, |
|
"learning_rate": 9.994479448492341e-07, |
|
"loss": 90.5123, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.2849884661430198, |
|
"grad_norm": 47.75, |
|
"learning_rate": 9.994433824099715e-07, |
|
"loss": 90.0628, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.28732443717697903, |
|
"grad_norm": 48.59375, |
|
"learning_rate": 9.99438819970709e-07, |
|
"loss": 91.5217, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.2896604082109382, |
|
"grad_norm": 50.375, |
|
"learning_rate": 9.994342575314467e-07, |
|
"loss": 91.552, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.29199637924489735, |
|
"grad_norm": 46.34375, |
|
"learning_rate": 9.99429695092184e-07, |
|
"loss": 90.1196, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.29433235027885657, |
|
"grad_norm": 51.28125, |
|
"learning_rate": 9.994251326529216e-07, |
|
"loss": 90.6674, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.29666832131281573, |
|
"grad_norm": 47.46875, |
|
"learning_rate": 9.99420570213659e-07, |
|
"loss": 90.2552, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.2990042923467749, |
|
"grad_norm": 47.1875, |
|
"learning_rate": 9.994160077743965e-07, |
|
"loss": 89.4563, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.30134026338073405, |
|
"grad_norm": 48.375, |
|
"learning_rate": 9.99411445335134e-07, |
|
"loss": 89.6932, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.30367623441469327, |
|
"grad_norm": 49.65625, |
|
"learning_rate": 9.994068828958715e-07, |
|
"loss": 90.6541, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.30601220544865243, |
|
"grad_norm": 47.4375, |
|
"learning_rate": 9.994023204566088e-07, |
|
"loss": 90.2363, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.3083481764826116, |
|
"grad_norm": 47.59375, |
|
"learning_rate": 9.993977580173464e-07, |
|
"loss": 89.9082, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.3106841475165708, |
|
"grad_norm": 49.9375, |
|
"learning_rate": 9.993931955780838e-07, |
|
"loss": 90.5597, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.31302011855052997, |
|
"grad_norm": 47.09375, |
|
"learning_rate": 9.993886331388214e-07, |
|
"loss": 89.9793, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.31535608958448913, |
|
"grad_norm": 46.1875, |
|
"learning_rate": 9.993840706995587e-07, |
|
"loss": 89.6057, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.31769206061844835, |
|
"grad_norm": 48.375, |
|
"learning_rate": 9.993795082602963e-07, |
|
"loss": 90.4494, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.3200280316524075, |
|
"grad_norm": 47.84375, |
|
"learning_rate": 9.993749458210337e-07, |
|
"loss": 89.3954, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.32236400268636667, |
|
"grad_norm": 51.5, |
|
"learning_rate": 9.993703833817712e-07, |
|
"loss": 88.9873, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.3246999737203259, |
|
"grad_norm": 49.375, |
|
"learning_rate": 9.993658209425086e-07, |
|
"loss": 89.7105, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.32703594475428505, |
|
"grad_norm": 48.21875, |
|
"learning_rate": 9.993612585032462e-07, |
|
"loss": 90.0021, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.3293719157882442, |
|
"grad_norm": 48.75, |
|
"learning_rate": 9.993566960639835e-07, |
|
"loss": 90.7298, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.3317078868222034, |
|
"grad_norm": 48.0625, |
|
"learning_rate": 9.993521336247211e-07, |
|
"loss": 89.614, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.3340438578561626, |
|
"grad_norm": 47.6875, |
|
"learning_rate": 9.993475711854585e-07, |
|
"loss": 90.2349, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.33637982889012175, |
|
"grad_norm": 47.6875, |
|
"learning_rate": 9.99343008746196e-07, |
|
"loss": 89.0322, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.33871579992408096, |
|
"grad_norm": 52.875, |
|
"learning_rate": 9.993384463069336e-07, |
|
"loss": 91.0856, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.3410517709580401, |
|
"grad_norm": 45.84375, |
|
"learning_rate": 9.99333883867671e-07, |
|
"loss": 89.7295, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.3433877419919993, |
|
"grad_norm": 47.875, |
|
"learning_rate": 9.993293214284084e-07, |
|
"loss": 89.7628, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.3457237130259585, |
|
"grad_norm": 50.71875, |
|
"learning_rate": 9.99324758989146e-07, |
|
"loss": 89.6516, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.34805968405991766, |
|
"grad_norm": 49.5625, |
|
"learning_rate": 9.993201965498835e-07, |
|
"loss": 89.7199, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.3503956550938768, |
|
"grad_norm": 44.96875, |
|
"learning_rate": 9.993156341106209e-07, |
|
"loss": 89.6713, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.35273162612783604, |
|
"grad_norm": 49.03125, |
|
"learning_rate": 9.993110716713582e-07, |
|
"loss": 89.7211, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.3550675971617952, |
|
"grad_norm": 46.65625, |
|
"learning_rate": 9.993065092320958e-07, |
|
"loss": 89.7591, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.35740356819575436, |
|
"grad_norm": 46.40625, |
|
"learning_rate": 9.993019467928334e-07, |
|
"loss": 89.5946, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.3597395392297136, |
|
"grad_norm": 47.46875, |
|
"learning_rate": 9.992973843535708e-07, |
|
"loss": 89.3533, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.36207551026367274, |
|
"grad_norm": 66.75, |
|
"learning_rate": 9.992928219143081e-07, |
|
"loss": 88.915, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.3644114812976319, |
|
"grad_norm": 49.625, |
|
"learning_rate": 9.992882594750457e-07, |
|
"loss": 89.5318, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.36674745233159106, |
|
"grad_norm": 52.5, |
|
"learning_rate": 9.992836970357833e-07, |
|
"loss": 89.6842, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.3690834233655503, |
|
"grad_norm": 47.65625, |
|
"learning_rate": 9.992791345965206e-07, |
|
"loss": 89.9355, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.37141939439950944, |
|
"grad_norm": 47.34375, |
|
"learning_rate": 9.99274572157258e-07, |
|
"loss": 89.0862, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.3737553654334686, |
|
"grad_norm": 47.0, |
|
"learning_rate": 9.992700097179956e-07, |
|
"loss": 89.5908, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.3760913364674278, |
|
"grad_norm": 48.40625, |
|
"learning_rate": 9.992654472787332e-07, |
|
"loss": 90.0093, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.378427307501387, |
|
"grad_norm": 46.90625, |
|
"learning_rate": 9.992608848394705e-07, |
|
"loss": 89.8005, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.38076327853534614, |
|
"grad_norm": 46.53125, |
|
"learning_rate": 9.992563224002079e-07, |
|
"loss": 89.5087, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.38309924956930536, |
|
"grad_norm": 46.6875, |
|
"learning_rate": 9.992517599609455e-07, |
|
"loss": 89.3029, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.3854352206032645, |
|
"grad_norm": 48.0, |
|
"learning_rate": 9.99247197521683e-07, |
|
"loss": 89.3145, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.3877711916372237, |
|
"grad_norm": 47.5625, |
|
"learning_rate": 9.992426350824204e-07, |
|
"loss": 88.9554, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.3901071626711829, |
|
"grad_norm": 50.25, |
|
"learning_rate": 9.992380726431578e-07, |
|
"loss": 89.8971, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.39244313370514206, |
|
"grad_norm": 49.1875, |
|
"learning_rate": 9.992335102038953e-07, |
|
"loss": 88.8999, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.3947791047391012, |
|
"grad_norm": 47.6875, |
|
"learning_rate": 9.99228947764633e-07, |
|
"loss": 90.1073, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.39711507577306043, |
|
"grad_norm": 48.375, |
|
"learning_rate": 9.992243853253703e-07, |
|
"loss": 89.0198, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.3994510468070196, |
|
"grad_norm": 49.9375, |
|
"learning_rate": 9.992198228861076e-07, |
|
"loss": 89.9081, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.40178701784097876, |
|
"grad_norm": 48.6875, |
|
"learning_rate": 9.992152604468452e-07, |
|
"loss": 89.2711, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.40412298887493797, |
|
"grad_norm": 46.34375, |
|
"learning_rate": 9.992106980075828e-07, |
|
"loss": 89.0298, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.40645895990889713, |
|
"grad_norm": 46.65625, |
|
"learning_rate": 9.992061355683202e-07, |
|
"loss": 89.1033, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.4087949309428563, |
|
"grad_norm": 47.9375, |
|
"learning_rate": 9.992015731290577e-07, |
|
"loss": 89.7967, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.4111309019768155, |
|
"grad_norm": 47.53125, |
|
"learning_rate": 9.99197010689795e-07, |
|
"loss": 87.6053, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.41346687301077467, |
|
"grad_norm": 46.6875, |
|
"learning_rate": 9.991924482505327e-07, |
|
"loss": 89.5975, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.41580284404473383, |
|
"grad_norm": 50.90625, |
|
"learning_rate": 9.9918788581127e-07, |
|
"loss": 88.9577, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.41813881507869305, |
|
"grad_norm": 49.125, |
|
"learning_rate": 9.991833233720076e-07, |
|
"loss": 88.7783, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.4204747861126522, |
|
"grad_norm": 47.9375, |
|
"learning_rate": 9.99178760932745e-07, |
|
"loss": 89.6563, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.42281075714661137, |
|
"grad_norm": 46.90625, |
|
"learning_rate": 9.991741984934826e-07, |
|
"loss": 88.626, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.4251467281805706, |
|
"grad_norm": 46.71875, |
|
"learning_rate": 9.9916963605422e-07, |
|
"loss": 87.7213, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.42748269921452975, |
|
"grad_norm": 49.40625, |
|
"learning_rate": 9.991650736149575e-07, |
|
"loss": 88.2201, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.4298186702484889, |
|
"grad_norm": 46.40625, |
|
"learning_rate": 9.991605111756949e-07, |
|
"loss": 89.3786, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.43215464128244807, |
|
"grad_norm": 48.125, |
|
"learning_rate": 9.991559487364324e-07, |
|
"loss": 87.8735, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.4344906123164073, |
|
"grad_norm": 52.09375, |
|
"learning_rate": 9.991513862971698e-07, |
|
"loss": 89.6088, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.43682658335036645, |
|
"grad_norm": 48.875, |
|
"learning_rate": 9.991468238579074e-07, |
|
"loss": 88.5974, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.4391625543843256, |
|
"grad_norm": 46.3125, |
|
"learning_rate": 9.991422614186447e-07, |
|
"loss": 89.1903, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.4414985254182848, |
|
"grad_norm": 45.1875, |
|
"learning_rate": 9.991376989793823e-07, |
|
"loss": 88.6345, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.443834496452244, |
|
"grad_norm": 45.375, |
|
"learning_rate": 9.991331365401197e-07, |
|
"loss": 88.6808, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.44617046748620315, |
|
"grad_norm": 45.6875, |
|
"learning_rate": 9.991285741008573e-07, |
|
"loss": 88.9256, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.44850643852016236, |
|
"grad_norm": 45.53125, |
|
"learning_rate": 9.991240116615946e-07, |
|
"loss": 88.0677, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.4508424095541215, |
|
"grad_norm": 47.15625, |
|
"learning_rate": 9.991194492223322e-07, |
|
"loss": 89.2818, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.4531783805880807, |
|
"grad_norm": 46.28125, |
|
"learning_rate": 9.991148867830696e-07, |
|
"loss": 88.0857, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.4555143516220399, |
|
"grad_norm": 48.78125, |
|
"learning_rate": 9.991103243438071e-07, |
|
"loss": 89.0477, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.45785032265599906, |
|
"grad_norm": 48.90625, |
|
"learning_rate": 9.991057619045447e-07, |
|
"loss": 89.073, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.4601862936899582, |
|
"grad_norm": 48.25, |
|
"learning_rate": 9.99101199465282e-07, |
|
"loss": 89.1609, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.46252226472391744, |
|
"grad_norm": 52.0625, |
|
"learning_rate": 9.990966370260194e-07, |
|
"loss": 89.7074, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.4648582357578766, |
|
"grad_norm": 47.84375, |
|
"learning_rate": 9.99092074586757e-07, |
|
"loss": 88.3551, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.46719420679183576, |
|
"grad_norm": 45.875, |
|
"learning_rate": 9.990875121474946e-07, |
|
"loss": 89.2271, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.46719420679183576, |
|
"eval_loss": 1.3847792148590088, |
|
"eval_runtime": 136.4587, |
|
"eval_samples_per_second": 1647.4, |
|
"eval_steps_per_second": 51.488, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.469530177825795, |
|
"grad_norm": 46.53125, |
|
"learning_rate": 9.99082949708232e-07, |
|
"loss": 88.5579, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.47186614885975414, |
|
"grad_norm": 46.96875, |
|
"learning_rate": 9.990783872689693e-07, |
|
"loss": 88.9332, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.4742021198937133, |
|
"grad_norm": 45.03125, |
|
"learning_rate": 9.99073824829707e-07, |
|
"loss": 88.1122, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.4765380909276725, |
|
"grad_norm": 47.8125, |
|
"learning_rate": 9.990692623904445e-07, |
|
"loss": 88.4026, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.4788740619616317, |
|
"grad_norm": 47.46875, |
|
"learning_rate": 9.990646999511818e-07, |
|
"loss": 88.9833, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.48121003299559084, |
|
"grad_norm": 49.03125, |
|
"learning_rate": 9.990601375119192e-07, |
|
"loss": 88.6076, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.48354600402955006, |
|
"grad_norm": 57.125, |
|
"learning_rate": 9.990555750726568e-07, |
|
"loss": 88.9196, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.4858819750635092, |
|
"grad_norm": 47.6875, |
|
"learning_rate": 9.990510126333944e-07, |
|
"loss": 88.4763, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.4882179460974684, |
|
"grad_norm": 49.65625, |
|
"learning_rate": 9.990464501941317e-07, |
|
"loss": 87.9524, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.4905539171314276, |
|
"grad_norm": 45.5625, |
|
"learning_rate": 9.990418877548693e-07, |
|
"loss": 88.7893, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.49288988816538676, |
|
"grad_norm": 46.5, |
|
"learning_rate": 9.990373253156067e-07, |
|
"loss": 89.0926, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.4952258591993459, |
|
"grad_norm": 52.0, |
|
"learning_rate": 9.990327628763442e-07, |
|
"loss": 88.1107, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.4975618302333051, |
|
"grad_norm": 45.84375, |
|
"learning_rate": 9.990282004370816e-07, |
|
"loss": 88.8404, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.4998978012672643, |
|
"grad_norm": 44.6875, |
|
"learning_rate": 9.990236379978192e-07, |
|
"loss": 88.8822, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.5022337723012235, |
|
"grad_norm": 47.21875, |
|
"learning_rate": 9.990190755585565e-07, |
|
"loss": 88.8674, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.5045697433351827, |
|
"grad_norm": 46.0625, |
|
"learning_rate": 9.990145131192941e-07, |
|
"loss": 88.4117, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.5069057143691418, |
|
"grad_norm": 47.0625, |
|
"learning_rate": 9.990099506800315e-07, |
|
"loss": 87.901, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.509241685403101, |
|
"grad_norm": 46.46875, |
|
"learning_rate": 9.99005388240769e-07, |
|
"loss": 88.5639, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.5115776564370602, |
|
"grad_norm": 47.9375, |
|
"learning_rate": 9.990008258015064e-07, |
|
"loss": 88.3239, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.5139136274710193, |
|
"grad_norm": 47.34375, |
|
"learning_rate": 9.98996263362244e-07, |
|
"loss": 88.649, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.5162495985049785, |
|
"grad_norm": 44.34375, |
|
"learning_rate": 9.989917009229814e-07, |
|
"loss": 87.9817, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.5185855695389378, |
|
"grad_norm": 46.375, |
|
"learning_rate": 9.98987138483719e-07, |
|
"loss": 87.0908, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.5209215405728969, |
|
"grad_norm": 45.03125, |
|
"learning_rate": 9.989825760444563e-07, |
|
"loss": 88.3031, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.5232575116068561, |
|
"grad_norm": 45.875, |
|
"learning_rate": 9.989780136051939e-07, |
|
"loss": 88.9973, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.5255934826408153, |
|
"grad_norm": 48.84375, |
|
"learning_rate": 9.989734511659312e-07, |
|
"loss": 88.8684, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.5279294536747744, |
|
"grad_norm": 44.1875, |
|
"learning_rate": 9.989688887266688e-07, |
|
"loss": 89.4066, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.5302654247087336, |
|
"grad_norm": 48.3125, |
|
"learning_rate": 9.989643262874062e-07, |
|
"loss": 87.0936, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.5326013957426928, |
|
"grad_norm": 49.25, |
|
"learning_rate": 9.989597638481438e-07, |
|
"loss": 88.0914, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.5349373667766519, |
|
"grad_norm": 44.53125, |
|
"learning_rate": 9.989552014088813e-07, |
|
"loss": 88.5739, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.5372733378106112, |
|
"grad_norm": 45.375, |
|
"learning_rate": 9.989506389696187e-07, |
|
"loss": 88.5934, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.5396093088445704, |
|
"grad_norm": 48.8125, |
|
"learning_rate": 9.98946076530356e-07, |
|
"loss": 88.2016, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.5419452798785295, |
|
"grad_norm": 46.15625, |
|
"learning_rate": 9.989415140910936e-07, |
|
"loss": 87.577, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.5442812509124887, |
|
"grad_norm": 45.28125, |
|
"learning_rate": 9.989369516518312e-07, |
|
"loss": 88.1394, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.5466172219464479, |
|
"grad_norm": 50.65625, |
|
"learning_rate": 9.989323892125686e-07, |
|
"loss": 86.5228, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.548953192980407, |
|
"grad_norm": 48.9375, |
|
"learning_rate": 9.98927826773306e-07, |
|
"loss": 88.3342, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.5512891640143662, |
|
"grad_norm": 48.71875, |
|
"learning_rate": 9.989232643340435e-07, |
|
"loss": 87.4456, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.5536251350483254, |
|
"grad_norm": 48.90625, |
|
"learning_rate": 9.98918701894781e-07, |
|
"loss": 88.0534, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.5559611060822846, |
|
"grad_norm": 45.8125, |
|
"learning_rate": 9.989141394555185e-07, |
|
"loss": 87.6145, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.5582970771162438, |
|
"grad_norm": 51.09375, |
|
"learning_rate": 9.989095770162558e-07, |
|
"loss": 86.6963, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.560633048150203, |
|
"grad_norm": 48.53125, |
|
"learning_rate": 9.989050145769934e-07, |
|
"loss": 88.0634, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.5629690191841621, |
|
"grad_norm": 82.125, |
|
"learning_rate": 9.98900452137731e-07, |
|
"loss": 87.3878, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.5653049902181213, |
|
"grad_norm": 44.6875, |
|
"learning_rate": 9.988958896984683e-07, |
|
"loss": 87.9072, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.5676409612520805, |
|
"grad_norm": 45.4375, |
|
"learning_rate": 9.988913272592057e-07, |
|
"loss": 88.5203, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.5699769322860396, |
|
"grad_norm": 77.0625, |
|
"learning_rate": 9.988867648199433e-07, |
|
"loss": 88.19, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.5723129033199988, |
|
"grad_norm": 50.03125, |
|
"learning_rate": 9.988822023806809e-07, |
|
"loss": 87.7846, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.5746488743539581, |
|
"grad_norm": 47.6875, |
|
"learning_rate": 9.988776399414182e-07, |
|
"loss": 87.4866, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.5769848453879172, |
|
"grad_norm": 47.375, |
|
"learning_rate": 9.988730775021556e-07, |
|
"loss": 87.9125, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.5793208164218764, |
|
"grad_norm": 46.65625, |
|
"learning_rate": 9.988685150628932e-07, |
|
"loss": 87.7185, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.5816567874558356, |
|
"grad_norm": 46.46875, |
|
"learning_rate": 9.988639526236307e-07, |
|
"loss": 88.5204, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.5839927584897947, |
|
"grad_norm": 46.53125, |
|
"learning_rate": 9.98859390184368e-07, |
|
"loss": 87.8029, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.5863287295237539, |
|
"grad_norm": 47.25, |
|
"learning_rate": 9.988548277451055e-07, |
|
"loss": 87.5321, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.5886647005577131, |
|
"grad_norm": 45.0625, |
|
"learning_rate": 9.98850265305843e-07, |
|
"loss": 88.0817, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.5910006715916722, |
|
"grad_norm": 48.3125, |
|
"learning_rate": 9.988457028665806e-07, |
|
"loss": 88.5133, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.5933366426256315, |
|
"grad_norm": 50.90625, |
|
"learning_rate": 9.98841140427318e-07, |
|
"loss": 87.1346, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.5956726136595906, |
|
"grad_norm": 49.5625, |
|
"learning_rate": 9.988365779880556e-07, |
|
"loss": 87.3656, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.5980085846935498, |
|
"grad_norm": 46.25, |
|
"learning_rate": 9.98832015548793e-07, |
|
"loss": 87.9664, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.600344555727509, |
|
"grad_norm": 48.6875, |
|
"learning_rate": 9.988274531095305e-07, |
|
"loss": 87.8553, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.6026805267614681, |
|
"grad_norm": 46.375, |
|
"learning_rate": 9.988228906702679e-07, |
|
"loss": 87.606, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.6050164977954273, |
|
"grad_norm": 54.21875, |
|
"learning_rate": 9.988183282310054e-07, |
|
"loss": 88.3672, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.6073524688293865, |
|
"grad_norm": 49.4375, |
|
"learning_rate": 9.988137657917428e-07, |
|
"loss": 87.1978, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.6096884398633456, |
|
"grad_norm": 46.3125, |
|
"learning_rate": 9.988092033524804e-07, |
|
"loss": 87.6631, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.6120244108973049, |
|
"grad_norm": 47.9375, |
|
"learning_rate": 9.988046409132177e-07, |
|
"loss": 87.6851, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.6143603819312641, |
|
"grad_norm": 48.59375, |
|
"learning_rate": 9.988000784739553e-07, |
|
"loss": 88.0132, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.6166963529652232, |
|
"grad_norm": 51.84375, |
|
"learning_rate": 9.987955160346927e-07, |
|
"loss": 87.3412, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.6190323239991824, |
|
"grad_norm": 48.875, |
|
"learning_rate": 9.987909535954303e-07, |
|
"loss": 87.6682, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.6213682950331416, |
|
"grad_norm": 43.75, |
|
"learning_rate": 9.987863911561676e-07, |
|
"loss": 87.5445, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.6237042660671007, |
|
"grad_norm": 48.40625, |
|
"learning_rate": 9.987818287169052e-07, |
|
"loss": 87.7517, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.6260402371010599, |
|
"grad_norm": 49.875, |
|
"learning_rate": 9.987772662776426e-07, |
|
"loss": 86.7831, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.6283762081350192, |
|
"grad_norm": 44.625, |
|
"learning_rate": 9.987727038383801e-07, |
|
"loss": 87.5577, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.6307121791689783, |
|
"grad_norm": 45.1875, |
|
"learning_rate": 9.987681413991175e-07, |
|
"loss": 87.1173, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.6330481502029375, |
|
"grad_norm": 50.5, |
|
"learning_rate": 9.98763578959855e-07, |
|
"loss": 87.7701, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.6353841212368967, |
|
"grad_norm": 49.4375, |
|
"learning_rate": 9.987590165205927e-07, |
|
"loss": 87.7584, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.6377200922708558, |
|
"grad_norm": 47.34375, |
|
"learning_rate": 9.9875445408133e-07, |
|
"loss": 86.956, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.640056063304815, |
|
"grad_norm": 49.6875, |
|
"learning_rate": 9.987498916420674e-07, |
|
"loss": 87.0904, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.6423920343387742, |
|
"grad_norm": 46.75, |
|
"learning_rate": 9.98745329202805e-07, |
|
"loss": 87.2963, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.6447280053727333, |
|
"grad_norm": 46.28125, |
|
"learning_rate": 9.987407667635425e-07, |
|
"loss": 87.8248, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.6470639764066926, |
|
"grad_norm": 46.875, |
|
"learning_rate": 9.9873620432428e-07, |
|
"loss": 88.0839, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.6493999474406518, |
|
"grad_norm": 46.125, |
|
"learning_rate": 9.987316418850173e-07, |
|
"loss": 88.2584, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.6517359184746109, |
|
"grad_norm": 46.96875, |
|
"learning_rate": 9.987270794457548e-07, |
|
"loss": 87.885, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.6540718895085701, |
|
"grad_norm": 48.875, |
|
"learning_rate": 9.987225170064924e-07, |
|
"loss": 87.3159, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.6564078605425293, |
|
"grad_norm": 47.0625, |
|
"learning_rate": 9.987179545672298e-07, |
|
"loss": 87.1514, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.6587438315764884, |
|
"grad_norm": 46.65625, |
|
"learning_rate": 9.987133921279671e-07, |
|
"loss": 87.9247, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.6610798026104476, |
|
"grad_norm": 49.28125, |
|
"learning_rate": 9.987088296887047e-07, |
|
"loss": 86.3055, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.6634157736444068, |
|
"grad_norm": 51.78125, |
|
"learning_rate": 9.987042672494423e-07, |
|
"loss": 87.9834, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.665751744678366, |
|
"grad_norm": 47.8125, |
|
"learning_rate": 9.986997048101797e-07, |
|
"loss": 87.2726, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.6680877157123252, |
|
"grad_norm": 50.28125, |
|
"learning_rate": 9.98695142370917e-07, |
|
"loss": 87.7858, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.6704236867462844, |
|
"grad_norm": 47.375, |
|
"learning_rate": 9.986905799316546e-07, |
|
"loss": 88.7471, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.6727596577802435, |
|
"grad_norm": 46.03125, |
|
"learning_rate": 9.986860174923922e-07, |
|
"loss": 87.0236, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.6750956288142027, |
|
"grad_norm": 45.53125, |
|
"learning_rate": 9.986814550531295e-07, |
|
"loss": 86.4502, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.6774315998481619, |
|
"grad_norm": 45.03125, |
|
"learning_rate": 9.98676892613867e-07, |
|
"loss": 87.5116, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.679767570882121, |
|
"grad_norm": 49.3125, |
|
"learning_rate": 9.986723301746045e-07, |
|
"loss": 87.5916, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.6821035419160802, |
|
"grad_norm": 45.90625, |
|
"learning_rate": 9.98667767735342e-07, |
|
"loss": 87.8794, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.6844395129500395, |
|
"grad_norm": 45.84375, |
|
"learning_rate": 9.986632052960794e-07, |
|
"loss": 86.8035, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.6867754839839986, |
|
"grad_norm": 48.84375, |
|
"learning_rate": 9.98658642856817e-07, |
|
"loss": 87.8464, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.6891114550179578, |
|
"grad_norm": 46.15625, |
|
"learning_rate": 9.986540804175544e-07, |
|
"loss": 87.964, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.691447426051917, |
|
"grad_norm": 46.59375, |
|
"learning_rate": 9.98649517978292e-07, |
|
"loss": 87.3683, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.6937833970858761, |
|
"grad_norm": 51.125, |
|
"learning_rate": 9.986449555390293e-07, |
|
"loss": 87.7749, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.6961193681198353, |
|
"grad_norm": 46.75, |
|
"learning_rate": 9.986403930997669e-07, |
|
"loss": 88.3439, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.6984553391537945, |
|
"grad_norm": 45.28125, |
|
"learning_rate": 9.986358306605042e-07, |
|
"loss": 87.627, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.7007913101877536, |
|
"grad_norm": 45.90625, |
|
"learning_rate": 9.986312682212418e-07, |
|
"loss": 86.3417, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.7031272812217129, |
|
"grad_norm": 48.53125, |
|
"learning_rate": 9.986267057819792e-07, |
|
"loss": 87.2988, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.7054632522556721, |
|
"grad_norm": 48.125, |
|
"learning_rate": 9.986221433427168e-07, |
|
"loss": 87.3414, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.7077992232896312, |
|
"grad_norm": 47.96875, |
|
"learning_rate": 9.986175809034541e-07, |
|
"loss": 86.9798, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.7101351943235904, |
|
"grad_norm": 47.5, |
|
"learning_rate": 9.986130184641917e-07, |
|
"loss": 87.7504, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.7124711653575496, |
|
"grad_norm": 50.46875, |
|
"learning_rate": 9.98608456024929e-07, |
|
"loss": 87.2987, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.7148071363915087, |
|
"grad_norm": 47.625, |
|
"learning_rate": 9.986038935856666e-07, |
|
"loss": 86.8407, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.7171431074254679, |
|
"grad_norm": 51.78125, |
|
"learning_rate": 9.98599331146404e-07, |
|
"loss": 86.4857, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.7194790784594272, |
|
"grad_norm": 43.46875, |
|
"learning_rate": 9.985947687071416e-07, |
|
"loss": 86.6848, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.7218150494933863, |
|
"grad_norm": 47.46875, |
|
"learning_rate": 9.98590206267879e-07, |
|
"loss": 86.5055, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.7241510205273455, |
|
"grad_norm": 46.59375, |
|
"learning_rate": 9.985856438286165e-07, |
|
"loss": 87.3536, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.7264869915613046, |
|
"grad_norm": 47.90625, |
|
"learning_rate": 9.985810813893539e-07, |
|
"loss": 86.9832, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.7288229625952638, |
|
"grad_norm": 45.8125, |
|
"learning_rate": 9.985765189500915e-07, |
|
"loss": 86.8103, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.731158933629223, |
|
"grad_norm": 46.125, |
|
"learning_rate": 9.98571956510829e-07, |
|
"loss": 87.5608, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.7334949046631821, |
|
"grad_norm": 44.25, |
|
"learning_rate": 9.985673940715664e-07, |
|
"loss": 87.2201, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.7358308756971413, |
|
"grad_norm": 46.125, |
|
"learning_rate": 9.985628316323038e-07, |
|
"loss": 87.6277, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.7381668467311006, |
|
"grad_norm": 46.09375, |
|
"learning_rate": 9.985582691930413e-07, |
|
"loss": 87.0271, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.7405028177650597, |
|
"grad_norm": 48.96875, |
|
"learning_rate": 9.98553706753779e-07, |
|
"loss": 87.2596, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.7428387887990189, |
|
"grad_norm": 50.84375, |
|
"learning_rate": 9.985491443145163e-07, |
|
"loss": 87.0863, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.7451747598329781, |
|
"grad_norm": 44.8125, |
|
"learning_rate": 9.985445818752536e-07, |
|
"loss": 87.3691, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.7475107308669372, |
|
"grad_norm": 56.15625, |
|
"learning_rate": 9.985400194359912e-07, |
|
"loss": 87.2236, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.7498467019008964, |
|
"grad_norm": 46.71875, |
|
"learning_rate": 9.985354569967288e-07, |
|
"loss": 87.7344, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.7521826729348556, |
|
"grad_norm": 45.71875, |
|
"learning_rate": 9.985308945574662e-07, |
|
"loss": 87.3144, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.7545186439688147, |
|
"grad_norm": 44.53125, |
|
"learning_rate": 9.985263321182035e-07, |
|
"loss": 86.0887, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.756854615002774, |
|
"grad_norm": 48.78125, |
|
"learning_rate": 9.98521769678941e-07, |
|
"loss": 87.4508, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.7591905860367332, |
|
"grad_norm": 45.78125, |
|
"learning_rate": 9.985172072396787e-07, |
|
"loss": 87.3724, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.7615265570706923, |
|
"grad_norm": 45.375, |
|
"learning_rate": 9.98512644800416e-07, |
|
"loss": 87.1348, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.7638625281046515, |
|
"grad_norm": 45.4375, |
|
"learning_rate": 9.985080823611534e-07, |
|
"loss": 87.7193, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.7661984991386107, |
|
"grad_norm": 47.53125, |
|
"learning_rate": 9.98503519921891e-07, |
|
"loss": 86.5363, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.7685344701725698, |
|
"grad_norm": 47.78125, |
|
"learning_rate": 9.984989574826285e-07, |
|
"loss": 87.0599, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.770870441206529, |
|
"grad_norm": 48.6875, |
|
"learning_rate": 9.98494395043366e-07, |
|
"loss": 87.0781, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.7732064122404883, |
|
"grad_norm": 46.09375, |
|
"learning_rate": 9.984898326041035e-07, |
|
"loss": 87.3568, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.7755423832744474, |
|
"grad_norm": 45.15625, |
|
"learning_rate": 9.984852701648409e-07, |
|
"loss": 86.8565, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.7778783543084066, |
|
"grad_norm": 44.65625, |
|
"learning_rate": 9.984807077255784e-07, |
|
"loss": 87.5893, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.7802143253423658, |
|
"grad_norm": 47.6875, |
|
"learning_rate": 9.984761452863158e-07, |
|
"loss": 86.4368, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.7825502963763249, |
|
"grad_norm": 46.15625, |
|
"learning_rate": 9.984715828470534e-07, |
|
"loss": 87.0425, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.7848862674102841, |
|
"grad_norm": 47.6875, |
|
"learning_rate": 9.984670204077907e-07, |
|
"loss": 87.8817, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.7872222384442433, |
|
"grad_norm": 53.6875, |
|
"learning_rate": 9.984624579685283e-07, |
|
"loss": 87.1403, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.7895582094782024, |
|
"grad_norm": 43.96875, |
|
"learning_rate": 9.984578955292657e-07, |
|
"loss": 86.3467, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.7918941805121616, |
|
"grad_norm": 48.46875, |
|
"learning_rate": 9.984533330900033e-07, |
|
"loss": 87.1886, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.7942301515461209, |
|
"grad_norm": 46.59375, |
|
"learning_rate": 9.984487706507406e-07, |
|
"loss": 85.6375, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.79656612258008, |
|
"grad_norm": 49.65625, |
|
"learning_rate": 9.984442082114782e-07, |
|
"loss": 86.6058, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.7989020936140392, |
|
"grad_norm": 46.59375, |
|
"learning_rate": 9.984396457722156e-07, |
|
"loss": 86.563, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.8012380646479984, |
|
"grad_norm": 46.4375, |
|
"learning_rate": 9.984350833329531e-07, |
|
"loss": 87.169, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.8035740356819575, |
|
"grad_norm": 45.75, |
|
"learning_rate": 9.984305208936905e-07, |
|
"loss": 87.7978, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.8059100067159167, |
|
"grad_norm": 47.8125, |
|
"learning_rate": 9.98425958454428e-07, |
|
"loss": 87.7728, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.8082459777498759, |
|
"grad_norm": 46.46875, |
|
"learning_rate": 9.984213960151654e-07, |
|
"loss": 87.2359, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.810581948783835, |
|
"grad_norm": 46.4375, |
|
"learning_rate": 9.98416833575903e-07, |
|
"loss": 86.3683, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.8129179198177943, |
|
"grad_norm": 48.71875, |
|
"learning_rate": 9.984122711366404e-07, |
|
"loss": 88.0013, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.8152538908517535, |
|
"grad_norm": 47.5625, |
|
"learning_rate": 9.98407708697378e-07, |
|
"loss": 86.5688, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.8175898618857126, |
|
"grad_norm": 48.0, |
|
"learning_rate": 9.984031462581153e-07, |
|
"loss": 87.148, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.8199258329196718, |
|
"grad_norm": 49.59375, |
|
"learning_rate": 9.983985838188529e-07, |
|
"loss": 87.0857, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.822261803953631, |
|
"grad_norm": 50.25, |
|
"learning_rate": 9.983940213795905e-07, |
|
"loss": 86.8389, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.8245977749875901, |
|
"grad_norm": 47.4375, |
|
"learning_rate": 9.983894589403278e-07, |
|
"loss": 87.4967, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.8269337460215493, |
|
"grad_norm": 46.59375, |
|
"learning_rate": 9.983848965010652e-07, |
|
"loss": 86.6476, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.8292697170555086, |
|
"grad_norm": 46.3125, |
|
"learning_rate": 9.983803340618028e-07, |
|
"loss": 87.4724, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.8316056880894677, |
|
"grad_norm": 45.1875, |
|
"learning_rate": 9.983757716225403e-07, |
|
"loss": 87.1353, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.8339416591234269, |
|
"grad_norm": 48.125, |
|
"learning_rate": 9.983712091832777e-07, |
|
"loss": 85.959, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.8362776301573861, |
|
"grad_norm": 45.75, |
|
"learning_rate": 9.98366646744015e-07, |
|
"loss": 86.5401, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.8386136011913452, |
|
"grad_norm": 47.5625, |
|
"learning_rate": 9.983620843047527e-07, |
|
"loss": 85.5629, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.8409495722253044, |
|
"grad_norm": 46.65625, |
|
"learning_rate": 9.983575218654902e-07, |
|
"loss": 87.5704, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.8432855432592636, |
|
"grad_norm": 46.46875, |
|
"learning_rate": 9.983529594262276e-07, |
|
"loss": 87.0222, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.8456215142932227, |
|
"grad_norm": 45.9375, |
|
"learning_rate": 9.98348396986965e-07, |
|
"loss": 86.3191, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.847957485327182, |
|
"grad_norm": 46.625, |
|
"learning_rate": 9.983438345477025e-07, |
|
"loss": 86.5284, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.8502934563611412, |
|
"grad_norm": 49.375, |
|
"learning_rate": 9.983392721084401e-07, |
|
"loss": 86.9173, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.8526294273951003, |
|
"grad_norm": 45.34375, |
|
"learning_rate": 9.983347096691775e-07, |
|
"loss": 86.998, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.8549653984290595, |
|
"grad_norm": 47.1875, |
|
"learning_rate": 9.983301472299148e-07, |
|
"loss": 86.2805, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.8573013694630186, |
|
"grad_norm": 47.78125, |
|
"learning_rate": 9.983255847906524e-07, |
|
"loss": 87.0101, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.8596373404969778, |
|
"grad_norm": 50.5625, |
|
"learning_rate": 9.9832102235139e-07, |
|
"loss": 87.0326, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.861973311530937, |
|
"grad_norm": 45.5, |
|
"learning_rate": 9.983164599121274e-07, |
|
"loss": 85.8297, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.8643092825648961, |
|
"grad_norm": 46.15625, |
|
"learning_rate": 9.983118974728647e-07, |
|
"loss": 86.0181, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.8666452535988554, |
|
"grad_norm": 51.125, |
|
"learning_rate": 9.983073350336023e-07, |
|
"loss": 85.7637, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.8689812246328146, |
|
"grad_norm": 48.25, |
|
"learning_rate": 9.983027725943399e-07, |
|
"loss": 86.8376, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.8713171956667737, |
|
"grad_norm": 47.96875, |
|
"learning_rate": 9.982982101550772e-07, |
|
"loss": 86.3769, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.8736531667007329, |
|
"grad_norm": 45.8125, |
|
"learning_rate": 9.982936477158148e-07, |
|
"loss": 86.1424, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.8759891377346921, |
|
"grad_norm": 46.75, |
|
"learning_rate": 9.982890852765522e-07, |
|
"loss": 86.66, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.8783251087686512, |
|
"grad_norm": 45.5, |
|
"learning_rate": 9.982845228372897e-07, |
|
"loss": 85.9828, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.8806610798026104, |
|
"grad_norm": 46.28125, |
|
"learning_rate": 9.982799603980271e-07, |
|
"loss": 86.6127, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.8829970508365697, |
|
"grad_norm": 44.4375, |
|
"learning_rate": 9.982753979587647e-07, |
|
"loss": 86.5096, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.8853330218705288, |
|
"grad_norm": 44.875, |
|
"learning_rate": 9.98270835519502e-07, |
|
"loss": 86.9895, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.887668992904488, |
|
"grad_norm": 47.15625, |
|
"learning_rate": 9.982662730802396e-07, |
|
"loss": 88.0681, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.8900049639384472, |
|
"grad_norm": 48.84375, |
|
"learning_rate": 9.98261710640977e-07, |
|
"loss": 86.534, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.8923409349724063, |
|
"grad_norm": 46.3125, |
|
"learning_rate": 9.982571482017146e-07, |
|
"loss": 86.3877, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.8946769060063655, |
|
"grad_norm": 45.6875, |
|
"learning_rate": 9.98252585762452e-07, |
|
"loss": 87.146, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.8970128770403247, |
|
"grad_norm": 48.1875, |
|
"learning_rate": 9.982480233231895e-07, |
|
"loss": 86.8891, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.8993488480742838, |
|
"grad_norm": 45.90625, |
|
"learning_rate": 9.982434608839269e-07, |
|
"loss": 86.9012, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.901684819108243, |
|
"grad_norm": 48.28125, |
|
"learning_rate": 9.982388984446644e-07, |
|
"loss": 86.8401, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.9040207901422023, |
|
"grad_norm": 49.78125, |
|
"learning_rate": 9.982343360054018e-07, |
|
"loss": 87.0155, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.9063567611761614, |
|
"grad_norm": 46.875, |
|
"learning_rate": 9.982297735661394e-07, |
|
"loss": 86.033, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.9086927322101206, |
|
"grad_norm": 47.25, |
|
"learning_rate": 9.982252111268768e-07, |
|
"loss": 87.0386, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.9110287032440798, |
|
"grad_norm": 43.375, |
|
"learning_rate": 9.982206486876143e-07, |
|
"loss": 85.8166, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.9133646742780389, |
|
"grad_norm": 46.6875, |
|
"learning_rate": 9.982160862483517e-07, |
|
"loss": 87.5271, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.9157006453119981, |
|
"grad_norm": 47.90625, |
|
"learning_rate": 9.982115238090893e-07, |
|
"loss": 85.2684, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.9180366163459573, |
|
"grad_norm": 47.53125, |
|
"learning_rate": 9.982069613698268e-07, |
|
"loss": 86.8994, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.9203725873799165, |
|
"grad_norm": 45.96875, |
|
"learning_rate": 9.982023989305642e-07, |
|
"loss": 86.4264, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.9227085584138757, |
|
"grad_norm": 45.59375, |
|
"learning_rate": 9.981978364913016e-07, |
|
"loss": 85.8178, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.9250445294478349, |
|
"grad_norm": 45.34375, |
|
"learning_rate": 9.981932740520391e-07, |
|
"loss": 87.0427, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.927380500481794, |
|
"grad_norm": 47.59375, |
|
"learning_rate": 9.981887116127767e-07, |
|
"loss": 86.6578, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.9297164715157532, |
|
"grad_norm": 49.84375, |
|
"learning_rate": 9.98184149173514e-07, |
|
"loss": 86.9295, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.9320524425497124, |
|
"grad_norm": 45.5, |
|
"learning_rate": 9.981795867342515e-07, |
|
"loss": 86.1575, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.9343884135836715, |
|
"grad_norm": 49.28125, |
|
"learning_rate": 9.98175024294989e-07, |
|
"loss": 86.415, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.9343884135836715, |
|
"eval_loss": 1.3514955043792725, |
|
"eval_runtime": 133.9042, |
|
"eval_samples_per_second": 1678.827, |
|
"eval_steps_per_second": 52.47, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.9367243846176307, |
|
"grad_norm": 46.65625, |
|
"learning_rate": 9.981704618557266e-07, |
|
"loss": 86.0902, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.93906035565159, |
|
"grad_norm": 44.65625, |
|
"learning_rate": 9.98165899416464e-07, |
|
"loss": 85.4592, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.9413963266855491, |
|
"grad_norm": 44.34375, |
|
"learning_rate": 9.981613369772013e-07, |
|
"loss": 86.5728, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.9437322977195083, |
|
"grad_norm": 46.03125, |
|
"learning_rate": 9.98156774537939e-07, |
|
"loss": 87.2485, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.9460682687534675, |
|
"grad_norm": 48.28125, |
|
"learning_rate": 9.981522120986765e-07, |
|
"loss": 87.1623, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.9484042397874266, |
|
"grad_norm": 47.96875, |
|
"learning_rate": 9.981476496594138e-07, |
|
"loss": 86.2034, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.9507402108213858, |
|
"grad_norm": 48.25, |
|
"learning_rate": 9.981430872201514e-07, |
|
"loss": 86.5078, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.953076181855345, |
|
"grad_norm": 44.53125, |
|
"learning_rate": 9.981385247808888e-07, |
|
"loss": 86.3279, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.9554121528893041, |
|
"grad_norm": 45.6875, |
|
"learning_rate": 9.981339623416264e-07, |
|
"loss": 86.4747, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.9577481239232634, |
|
"grad_norm": 47.53125, |
|
"learning_rate": 9.981293999023637e-07, |
|
"loss": 85.3221, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.9600840949572226, |
|
"grad_norm": 47.15625, |
|
"learning_rate": 9.981248374631013e-07, |
|
"loss": 85.7835, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.9624200659911817, |
|
"grad_norm": 45.96875, |
|
"learning_rate": 9.981202750238387e-07, |
|
"loss": 85.919, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.9647560370251409, |
|
"grad_norm": 46.40625, |
|
"learning_rate": 9.981157125845762e-07, |
|
"loss": 86.6488, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.9670920080591001, |
|
"grad_norm": 47.8125, |
|
"learning_rate": 9.981111501453136e-07, |
|
"loss": 86.7465, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.9694279790930592, |
|
"grad_norm": 50.96875, |
|
"learning_rate": 9.981065877060512e-07, |
|
"loss": 85.8423, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.9717639501270184, |
|
"grad_norm": 44.84375, |
|
"learning_rate": 9.981020252667885e-07, |
|
"loss": 86.4872, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.9740999211609777, |
|
"grad_norm": 51.46875, |
|
"learning_rate": 9.980974628275261e-07, |
|
"loss": 86.9111, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.9764358921949368, |
|
"grad_norm": 46.25, |
|
"learning_rate": 9.980929003882635e-07, |
|
"loss": 86.4476, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.978771863228896, |
|
"grad_norm": 47.0625, |
|
"learning_rate": 9.98088337949001e-07, |
|
"loss": 86.3345, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.9811078342628552, |
|
"grad_norm": 47.96875, |
|
"learning_rate": 9.980837755097384e-07, |
|
"loss": 87.4492, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.9834438052968143, |
|
"grad_norm": 47.53125, |
|
"learning_rate": 9.98079213070476e-07, |
|
"loss": 87.3175, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.9857797763307735, |
|
"grad_norm": 47.84375, |
|
"learning_rate": 9.980746506312134e-07, |
|
"loss": 85.7159, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.9881157473647327, |
|
"grad_norm": 50.5, |
|
"learning_rate": 9.98070088191951e-07, |
|
"loss": 85.7232, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.9904517183986918, |
|
"grad_norm": 47.1875, |
|
"learning_rate": 9.980655257526883e-07, |
|
"loss": 86.1964, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.992787689432651, |
|
"grad_norm": 46.15625, |
|
"learning_rate": 9.980609633134259e-07, |
|
"loss": 86.2977, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.9951236604666102, |
|
"grad_norm": 44.8125, |
|
"learning_rate": 9.980564008741632e-07, |
|
"loss": 85.6801, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.9974596315005694, |
|
"grad_norm": 46.15625, |
|
"learning_rate": 9.980518384349008e-07, |
|
"loss": 85.8044, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.9997956025345286, |
|
"grad_norm": 46.75, |
|
"learning_rate": 9.980472759956384e-07, |
|
"loss": 86.1971, |
|
"step": 4280 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4280, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1817578952753414e+19, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|