|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 25.0, |
|
"eval_steps": 500, |
|
"global_step": 1250, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.459547519683838, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 1.2185, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5558891296386719, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 0.892, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.5772704482078552, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 0.7887, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.40292349457740784, |
|
"learning_rate": 6.4000000000000006e-06, |
|
"loss": 0.7111, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.35770532488822937, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.6489, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.43718403577804565, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 0.5936, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.4591154158115387, |
|
"learning_rate": 1.1200000000000001e-05, |
|
"loss": 0.6121, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.6591519713401794, |
|
"learning_rate": 1.2800000000000001e-05, |
|
"loss": 0.6059, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.6931334733963013, |
|
"learning_rate": 1.4400000000000001e-05, |
|
"loss": 0.6062, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.6464695334434509, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.5929, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.7549647688865662, |
|
"learning_rate": 1.76e-05, |
|
"loss": 0.5138, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.6728184819221497, |
|
"learning_rate": 1.9200000000000003e-05, |
|
"loss": 0.4737, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.7162397503852844, |
|
"learning_rate": 1.9999025240093045e-05, |
|
"loss": 0.4891, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.9795845150947571, |
|
"learning_rate": 1.9991228300988586e-05, |
|
"loss": 0.4663, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.6311202049255371, |
|
"learning_rate": 1.9975640502598243e-05, |
|
"loss": 0.5164, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.9567041993141174, |
|
"learning_rate": 1.9952273999818312e-05, |
|
"loss": 0.3981, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.8767746090888977, |
|
"learning_rate": 1.9921147013144782e-05, |
|
"loss": 0.3761, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 1.8661500215530396, |
|
"learning_rate": 1.988228381446553e-05, |
|
"loss": 0.374, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.7793363332748413, |
|
"learning_rate": 1.983571470813386e-05, |
|
"loss": 0.3908, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.8413230180740356, |
|
"learning_rate": 1.9781476007338058e-05, |
|
"loss": 0.3527, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 1.5790411233901978, |
|
"learning_rate": 1.9719610005785466e-05, |
|
"loss": 0.2756, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 1.1819688081741333, |
|
"learning_rate": 1.9650164944723116e-05, |
|
"loss": 0.2622, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 1.2115598917007446, |
|
"learning_rate": 1.9573194975320672e-05, |
|
"loss": 0.2475, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 1.0625479221343994, |
|
"learning_rate": 1.9488760116444966e-05, |
|
"loss": 0.2682, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 1.2376039028167725, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 0.2514, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"grad_norm": 0.6343020796775818, |
|
"learning_rate": 1.9297764858882516e-05, |
|
"loss": 0.1448, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"grad_norm": 1.1657440662384033, |
|
"learning_rate": 1.9191353392552346e-05, |
|
"loss": 0.1725, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"grad_norm": 1.4713112115859985, |
|
"learning_rate": 1.907777478532909e-05, |
|
"loss": 0.1753, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"grad_norm": 1.9010937213897705, |
|
"learning_rate": 1.895711760239413e-05, |
|
"loss": 0.1697, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 2.2462244033813477, |
|
"learning_rate": 1.8829475928589272e-05, |
|
"loss": 0.157, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"grad_norm": 1.5734026432037354, |
|
"learning_rate": 1.869494929505219e-05, |
|
"loss": 0.0987, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 1.6871922016143799, |
|
"learning_rate": 1.855364260160507e-05, |
|
"loss": 0.1122, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"grad_norm": 0.8505676984786987, |
|
"learning_rate": 1.8405666034956842e-05, |
|
"loss": 0.0999, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"grad_norm": 1.9714100360870361, |
|
"learning_rate": 1.8251134982782952e-05, |
|
"loss": 0.0925, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 1.3289463520050049, |
|
"learning_rate": 1.8090169943749477e-05, |
|
"loss": 0.1034, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"grad_norm": 1.7757859230041504, |
|
"learning_rate": 1.792289643355191e-05, |
|
"loss": 0.0538, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"grad_norm": 1.5128904581069946, |
|
"learning_rate": 1.7749444887041797e-05, |
|
"loss": 0.0562, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"grad_norm": 1.7095279693603516, |
|
"learning_rate": 1.7569950556517566e-05, |
|
"loss": 0.0656, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"grad_norm": 1.7054250240325928, |
|
"learning_rate": 1.7384553406258842e-05, |
|
"loss": 0.0783, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 1.2087674140930176, |
|
"learning_rate": 1.7193398003386514e-05, |
|
"loss": 0.0832, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 8.2, |
|
"grad_norm": 1.2171767950057983, |
|
"learning_rate": 1.6996633405133656e-05, |
|
"loss": 0.0389, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"grad_norm": 1.2036916017532349, |
|
"learning_rate": 1.6794413042615168e-05, |
|
"loss": 0.0456, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 1.6378772258758545, |
|
"learning_rate": 1.6586894601186804e-05, |
|
"loss": 0.0539, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 8.8, |
|
"grad_norm": 1.062146544456482, |
|
"learning_rate": 1.63742398974869e-05, |
|
"loss": 0.0351, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.7032294273376465, |
|
"learning_rate": 1.6156614753256583e-05, |
|
"loss": 0.0331, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"grad_norm": 1.2708150148391724, |
|
"learning_rate": 1.5934188866037017e-05, |
|
"loss": 0.0298, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 9.4, |
|
"grad_norm": 0.5238479375839233, |
|
"learning_rate": 1.570713567684432e-05, |
|
"loss": 0.0271, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"grad_norm": 0.6896675229072571, |
|
"learning_rate": 1.5475632234925505e-05, |
|
"loss": 0.0148, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 9.8, |
|
"grad_norm": 0.8893882632255554, |
|
"learning_rate": 1.5239859059700794e-05, |
|
"loss": 0.0384, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 1.3057194948196411, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.0216, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 10.2, |
|
"grad_norm": 0.4277481436729431, |
|
"learning_rate": 1.4756242090702756e-05, |
|
"loss": 0.0186, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 10.4, |
|
"grad_norm": 2.500697374343872, |
|
"learning_rate": 1.4508775406894308e-05, |
|
"loss": 0.0289, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 10.6, |
|
"grad_norm": 0.4189341366291046, |
|
"learning_rate": 1.4257792915650728e-05, |
|
"loss": 0.0161, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 10.8, |
|
"grad_norm": 0.6834658980369568, |
|
"learning_rate": 1.4003490325568953e-05, |
|
"loss": 0.016, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 0.32209300994873047, |
|
"learning_rate": 1.3746065934159123e-05, |
|
"loss": 0.0214, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 11.2, |
|
"grad_norm": 0.6142628788948059, |
|
"learning_rate": 1.3485720473218153e-05, |
|
"loss": 0.0169, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 11.4, |
|
"grad_norm": 0.020493434742093086, |
|
"learning_rate": 1.3222656952305113e-05, |
|
"loss": 0.0072, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 11.6, |
|
"grad_norm": 0.6618867516517639, |
|
"learning_rate": 1.2957080500440469e-05, |
|
"loss": 0.0134, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 11.8, |
|
"grad_norm": 2.014190196990967, |
|
"learning_rate": 1.2689198206152657e-05, |
|
"loss": 0.0141, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.3526052236557007, |
|
"learning_rate": 1.2419218955996677e-05, |
|
"loss": 0.0152, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 12.2, |
|
"grad_norm": 0.39659520983695984, |
|
"learning_rate": 1.2147353271670634e-05, |
|
"loss": 0.0082, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 12.4, |
|
"grad_norm": 1.6676621437072754, |
|
"learning_rate": 1.187381314585725e-05, |
|
"loss": 0.0086, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 12.6, |
|
"grad_norm": 0.48944175243377686, |
|
"learning_rate": 1.159881187691835e-05, |
|
"loss": 0.0073, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"grad_norm": 0.11040916293859482, |
|
"learning_rate": 1.1322563902571227e-05, |
|
"loss": 0.0111, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 0.19332852959632874, |
|
"learning_rate": 1.1045284632676535e-05, |
|
"loss": 0.0084, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 13.2, |
|
"grad_norm": 0.5781747698783875, |
|
"learning_rate": 1.0767190281268187e-05, |
|
"loss": 0.0088, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 13.4, |
|
"grad_norm": 0.5372014045715332, |
|
"learning_rate": 1.0488497697956134e-05, |
|
"loss": 0.0129, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 13.6, |
|
"grad_norm": 0.0674474835395813, |
|
"learning_rate": 1.0209424198833571e-05, |
|
"loss": 0.0047, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 13.8, |
|
"grad_norm": 0.0646585077047348, |
|
"learning_rate": 9.930187397020385e-06, |
|
"loss": 0.0079, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 0.16117724776268005, |
|
"learning_rate": 9.651005032974994e-06, |
|
"loss": 0.0137, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 14.2, |
|
"grad_norm": 0.031529657542705536, |
|
"learning_rate": 9.372094804706867e-06, |
|
"loss": 0.0195, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 14.4, |
|
"grad_norm": 0.7396082878112793, |
|
"learning_rate": 9.093674198022201e-06, |
|
"loss": 0.0038, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 14.6, |
|
"grad_norm": 0.131903737783432, |
|
"learning_rate": 8.815960316934991e-06, |
|
"loss": 0.009, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 14.8, |
|
"grad_norm": 0.031985681504011154, |
|
"learning_rate": 8.539169714375885e-06, |
|
"loss": 0.0013, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.12234848737716675, |
|
"learning_rate": 8.263518223330698e-06, |
|
"loss": 0.0043, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 15.2, |
|
"grad_norm": 0.10590848326683044, |
|
"learning_rate": 7.989220788540356e-06, |
|
"loss": 0.0016, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 15.4, |
|
"grad_norm": 0.20384103059768677, |
|
"learning_rate": 7.716491298893443e-06, |
|
"loss": 0.0011, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 15.6, |
|
"grad_norm": 0.16497106850147247, |
|
"learning_rate": 7.445542420642097e-06, |
|
"loss": 0.002, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 15.8, |
|
"grad_norm": 0.0461721308529377, |
|
"learning_rate": 7.176585431571235e-06, |
|
"loss": 0.0017, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 0.023166760802268982, |
|
"learning_rate": 6.909830056250527e-06, |
|
"loss": 0.0008, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 16.2, |
|
"grad_norm": 0.032082173973321915, |
|
"learning_rate": 6.645484302497452e-06, |
|
"loss": 0.0003, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 16.4, |
|
"grad_norm": 0.08265218138694763, |
|
"learning_rate": 6.383754299179079e-06, |
|
"loss": 0.0007, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 16.6, |
|
"grad_norm": 0.024636510759592056, |
|
"learning_rate": 6.124844135478971e-06, |
|
"loss": 0.005, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 16.8, |
|
"grad_norm": 0.019458631053566933, |
|
"learning_rate": 5.868955701754584e-06, |
|
"loss": 0.0004, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"grad_norm": 0.01352341752499342, |
|
"learning_rate": 5.616288532109225e-06, |
|
"loss": 0.0007, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 17.2, |
|
"grad_norm": 0.009102717973291874, |
|
"learning_rate": 5.367039648801386e-06, |
|
"loss": 0.0004, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 17.4, |
|
"grad_norm": 0.01286560669541359, |
|
"learning_rate": 5.121403408612672e-06, |
|
"loss": 0.0002, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 17.6, |
|
"grad_norm": 0.36686578392982483, |
|
"learning_rate": 4.879571351294287e-06, |
|
"loss": 0.0007, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 17.8, |
|
"grad_norm": 0.005845635663717985, |
|
"learning_rate": 4.641732050210032e-06, |
|
"loss": 0.0002, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 0.06807386130094528, |
|
"learning_rate": 4.408070965292534e-06, |
|
"loss": 0.0002, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 18.2, |
|
"grad_norm": 0.02204430289566517, |
|
"learning_rate": 4.178770298427107e-06, |
|
"loss": 0.0001, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 18.4, |
|
"grad_norm": 0.008754129521548748, |
|
"learning_rate": 3.954008851376252e-06, |
|
"loss": 0.0001, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 18.6, |
|
"grad_norm": 0.00636293226853013, |
|
"learning_rate": 3.7339618863553983e-06, |
|
"loss": 0.0001, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 18.8, |
|
"grad_norm": 0.004866347182542086, |
|
"learning_rate": 3.5188009893686916e-06, |
|
"loss": 0.0006, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"grad_norm": 0.004850468598306179, |
|
"learning_rate": 3.308693936411421e-06, |
|
"loss": 0.0001, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 19.2, |
|
"grad_norm": 0.0016120151849463582, |
|
"learning_rate": 3.103804562643302e-06, |
|
"loss": 0.0001, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 19.4, |
|
"grad_norm": 0.08115795254707336, |
|
"learning_rate": 2.9042926346347932e-06, |
|
"loss": 0.0002, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 19.6, |
|
"grad_norm": 0.0031273579224944115, |
|
"learning_rate": 2.7103137257858867e-06, |
|
"loss": 0.0001, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 19.8, |
|
"grad_norm": 0.005434623919427395, |
|
"learning_rate": 2.522019095014683e-06, |
|
"loss": 0.0001, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.003434584243223071, |
|
"learning_rate": 2.339555568810221e-06, |
|
"loss": 0.0003, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 20.2, |
|
"grad_norm": 0.0036388696171343327, |
|
"learning_rate": 2.163065426741603e-06, |
|
"loss": 0.0003, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 20.4, |
|
"grad_norm": 0.005258283577859402, |
|
"learning_rate": 1.9926862905126663e-06, |
|
"loss": 0.0001, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 20.6, |
|
"grad_norm": 0.0052086408250033855, |
|
"learning_rate": 1.8285510166487154e-06, |
|
"loss": 0.0001, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 20.8, |
|
"grad_norm": 0.007011784706264734, |
|
"learning_rate": 1.6707875928990059e-06, |
|
"loss": 0.0001, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"grad_norm": 0.008353537879884243, |
|
"learning_rate": 1.5195190384357405e-06, |
|
"loss": 0.0001, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 21.2, |
|
"grad_norm": 0.005364676937460899, |
|
"learning_rate": 1.3748633079274254e-06, |
|
"loss": 0.0001, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 21.4, |
|
"grad_norm": 0.07464922219514847, |
|
"learning_rate": 1.2369331995613664e-06, |
|
"loss": 0.0003, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 21.6, |
|
"grad_norm": 0.005178565625101328, |
|
"learning_rate": 1.1058362670870248e-06, |
|
"loss": 0.0001, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 21.8, |
|
"grad_norm": 0.003679296700283885, |
|
"learning_rate": 9.816747359488632e-07, |
|
"loss": 0.0001, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"grad_norm": 0.008506166748702526, |
|
"learning_rate": 8.645454235739903e-07, |
|
"loss": 0.0001, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 22.2, |
|
"grad_norm": 0.0045352838933467865, |
|
"learning_rate": 7.545396638768698e-07, |
|
"loss": 0.0001, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 22.4, |
|
"grad_norm": 0.0041460818611085415, |
|
"learning_rate": 6.517432360398556e-07, |
|
"loss": 0.0002, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 22.6, |
|
"grad_norm": 0.00789156835526228, |
|
"learning_rate": 5.562362976251901e-07, |
|
"loss": 0.0001, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 22.8, |
|
"grad_norm": 0.004850171506404877, |
|
"learning_rate": 4.6809332207053083e-07, |
|
"loss": 0.0002, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"grad_norm": 0.0024761913809925318, |
|
"learning_rate": 3.8738304061681107e-07, |
|
"loss": 0.0001, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 23.2, |
|
"grad_norm": 0.004848626907914877, |
|
"learning_rate": 3.1416838871368925e-07, |
|
"loss": 0.0002, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 23.4, |
|
"grad_norm": 0.008169041946530342, |
|
"learning_rate": 2.4850645694436736e-07, |
|
"loss": 0.0001, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 23.6, |
|
"grad_norm": 0.008230429142713547, |
|
"learning_rate": 1.9044844650808468e-07, |
|
"loss": 0.0001, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 23.8, |
|
"grad_norm": 0.008880352601408958, |
|
"learning_rate": 1.400396292949513e-07, |
|
"loss": 0.0001, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 0.003499081125482917, |
|
"learning_rate": 9.731931258429638e-08, |
|
"loss": 0.0002, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 24.2, |
|
"grad_norm": 0.002381326397880912, |
|
"learning_rate": 6.232080839403631e-08, |
|
"loss": 0.0001, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 24.4, |
|
"grad_norm": 0.003195591736584902, |
|
"learning_rate": 3.50714075049563e-08, |
|
"loss": 0.0001, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 24.6, |
|
"grad_norm": 0.003270337823778391, |
|
"learning_rate": 1.5592358180189782e-08, |
|
"loss": 0.0001, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 24.8, |
|
"grad_norm": 0.00868391152471304, |
|
"learning_rate": 3.898849596456477e-09, |
|
"loss": 0.0001, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"grad_norm": 0.0035359435714781284, |
|
"learning_rate": 0.0, |
|
"loss": 0.0002, |
|
"step": 1250 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1250, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 25, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.502555032784732e+17, |
|
"train_batch_size": 5, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|