|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 200, |
|
"global_step": 1505, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.013289036544850499, |
|
"grad_norm": 0.10525072365999222, |
|
"learning_rate": 4.9978216198586135e-05, |
|
"loss": 0.6155, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.026578073089700997, |
|
"grad_norm": 0.08554615080356598, |
|
"learning_rate": 4.991290275706486e-05, |
|
"loss": 0.5694, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03986710963455149, |
|
"grad_norm": 0.08361516892910004, |
|
"learning_rate": 4.980417349743936e-05, |
|
"loss": 0.557, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.053156146179401995, |
|
"grad_norm": 0.08680060505867004, |
|
"learning_rate": 4.9652217902637596e-05, |
|
"loss": 0.548, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0664451827242525, |
|
"grad_norm": 0.08960291743278503, |
|
"learning_rate": 4.945730078629964e-05, |
|
"loss": 0.5427, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07973421926910298, |
|
"grad_norm": 0.09262242168188095, |
|
"learning_rate": 4.921976183128585e-05, |
|
"loss": 0.5384, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.09302325581395349, |
|
"grad_norm": 0.08780515193939209, |
|
"learning_rate": 4.894001499771015e-05, |
|
"loss": 0.5362, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.10631229235880399, |
|
"grad_norm": 0.09249912202358246, |
|
"learning_rate": 4.861854780153004e-05, |
|
"loss": 0.5324, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.11960132890365449, |
|
"grad_norm": 0.09562400728464127, |
|
"learning_rate": 4.825592046495054e-05, |
|
"loss": 0.5311, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.132890365448505, |
|
"grad_norm": 0.09372778236865997, |
|
"learning_rate": 4.785276494012263e-05, |
|
"loss": 0.5278, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.132890365448505, |
|
"eval_accuracy": 0.19452303794312395, |
|
"eval_loss": 0.5592088103294373, |
|
"eval_runtime": 19.5284, |
|
"eval_samples_per_second": 93.914, |
|
"eval_steps_per_second": 0.41, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1461794019933555, |
|
"grad_norm": 0.08762918412685394, |
|
"learning_rate": 4.740978380783765e-05, |
|
"loss": 0.5253, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.15946843853820597, |
|
"grad_norm": 0.08518578112125397, |
|
"learning_rate": 4.6927749053136866e-05, |
|
"loss": 0.5192, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.17275747508305647, |
|
"grad_norm": 0.09664598107337952, |
|
"learning_rate": 4.640750071996995e-05, |
|
"loss": 0.5217, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.18604651162790697, |
|
"grad_norm": 0.08245342969894409, |
|
"learning_rate": 4.584994544724695e-05, |
|
"loss": 0.5172, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.19933554817275748, |
|
"grad_norm": 0.08551981300115585, |
|
"learning_rate": 4.5256054888834934e-05, |
|
"loss": 0.5152, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.21262458471760798, |
|
"grad_norm": 0.09647104889154434, |
|
"learning_rate": 4.4626864020252774e-05, |
|
"loss": 0.5139, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.22591362126245848, |
|
"grad_norm": 0.09810427576303482, |
|
"learning_rate": 4.3963469335015085e-05, |
|
"loss": 0.5129, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.23920265780730898, |
|
"grad_norm": 0.08342389762401581, |
|
"learning_rate": 4.326702693376844e-05, |
|
"loss": 0.5119, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.25249169435215946, |
|
"grad_norm": 0.08738644421100616, |
|
"learning_rate": 4.2538750509550054e-05, |
|
"loss": 0.511, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.26578073089701, |
|
"grad_norm": 0.08475251495838165, |
|
"learning_rate": 4.177990923267986e-05, |
|
"loss": 0.5117, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.26578073089701, |
|
"eval_accuracy": 0.1953402564276045, |
|
"eval_loss": 0.5438870787620544, |
|
"eval_runtime": 15.5302, |
|
"eval_samples_per_second": 118.093, |
|
"eval_steps_per_second": 0.515, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.27906976744186046, |
|
"grad_norm": 0.07873477786779404, |
|
"learning_rate": 4.099182553897229e-05, |
|
"loss": 0.5084, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.292358803986711, |
|
"grad_norm": 0.09158772230148315, |
|
"learning_rate": 4.017587282512181e-05, |
|
"loss": 0.5065, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.30564784053156147, |
|
"grad_norm": 0.07729614526033401, |
|
"learning_rate": 3.933347305527898e-05, |
|
"loss": 0.5047, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.31893687707641194, |
|
"grad_norm": 0.08530613034963608, |
|
"learning_rate": 3.846609428298757e-05, |
|
"loss": 0.5049, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.33222591362126247, |
|
"grad_norm": 0.07760792225599289, |
|
"learning_rate": 3.7575248092801686e-05, |
|
"loss": 0.5035, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.34551495016611294, |
|
"grad_norm": 0.08521712571382523, |
|
"learning_rate": 3.66624869660411e-05, |
|
"loss": 0.5042, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.3588039867109635, |
|
"grad_norm": 0.08439727872610092, |
|
"learning_rate": 3.572940157527572e-05, |
|
"loss": 0.5021, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.37209302325581395, |
|
"grad_norm": 0.09042590111494064, |
|
"learning_rate": 3.47776180122539e-05, |
|
"loss": 0.5019, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.3853820598006645, |
|
"grad_norm": 0.08219762146472931, |
|
"learning_rate": 3.3808794954105716e-05, |
|
"loss": 0.501, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.39867109634551495, |
|
"grad_norm": 0.08426713198423386, |
|
"learning_rate": 3.282462077275947e-05, |
|
"loss": 0.5013, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.39867109634551495, |
|
"eval_accuracy": 0.19588631180347973, |
|
"eval_loss": 0.5341373682022095, |
|
"eval_runtime": 16.1072, |
|
"eval_samples_per_second": 113.862, |
|
"eval_steps_per_second": 0.497, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.4119601328903654, |
|
"grad_norm": 0.08020314574241638, |
|
"learning_rate": 3.1826810592609036e-05, |
|
"loss": 0.4968, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.42524916943521596, |
|
"grad_norm": 0.07975760847330093, |
|
"learning_rate": 3.081710330155942e-05, |
|
"loss": 0.4997, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.43853820598006643, |
|
"grad_norm": 0.08056964725255966, |
|
"learning_rate": 2.979725852065981e-05, |
|
"loss": 0.4968, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.45182724252491696, |
|
"grad_norm": 0.08022565394639969, |
|
"learning_rate": 2.876905353760459e-05, |
|
"loss": 0.4976, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.46511627906976744, |
|
"grad_norm": 0.08131925761699677, |
|
"learning_rate": 2.7734280209446865e-05, |
|
"loss": 0.4973, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.47840531561461797, |
|
"grad_norm": 0.07562076300382614, |
|
"learning_rate": 2.6694741839921732e-05, |
|
"loss": 0.4956, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.49169435215946844, |
|
"grad_norm": 0.07877329736948013, |
|
"learning_rate": 2.5652250036821523e-05, |
|
"loss": 0.4966, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.5049833887043189, |
|
"grad_norm": 0.08014395087957382, |
|
"learning_rate": 2.4608621554899362e-05, |
|
"loss": 0.4934, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.5182724252491694, |
|
"grad_norm": 0.07770328223705292, |
|
"learning_rate": 2.356567512980326e-05, |
|
"loss": 0.4934, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.53156146179402, |
|
"grad_norm": 0.07732851803302765, |
|
"learning_rate": 2.252522830855798e-05, |
|
"loss": 0.4951, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.53156146179402, |
|
"eval_accuracy": 0.19623978277118043, |
|
"eval_loss": 0.5274041295051575, |
|
"eval_runtime": 16.4552, |
|
"eval_samples_per_second": 111.454, |
|
"eval_steps_per_second": 0.486, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5448504983388704, |
|
"grad_norm": 0.07608461380004883, |
|
"learning_rate": 2.1489094282118395e-05, |
|
"loss": 0.4896, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.5581395348837209, |
|
"grad_norm": 0.07657533138990402, |
|
"learning_rate": 2.0459078725514092e-05, |
|
"loss": 0.4918, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 0.07983728498220444, |
|
"learning_rate": 1.9436976651092144e-05, |
|
"loss": 0.4927, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.584717607973422, |
|
"grad_norm": 0.07355430722236633, |
|
"learning_rate": 1.8424569280341653e-05, |
|
"loss": 0.493, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.5980066445182725, |
|
"grad_norm": 0.08014149218797684, |
|
"learning_rate": 1.7423620939751788e-05, |
|
"loss": 0.4922, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6112956810631229, |
|
"grad_norm": 0.07500924915075302, |
|
"learning_rate": 1.6435875986112685e-05, |
|
"loss": 0.491, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.6245847176079734, |
|
"grad_norm": 0.07356715947389603, |
|
"learning_rate": 1.546305576661776e-05, |
|
"loss": 0.4909, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.6378737541528239, |
|
"grad_norm": 0.07140863686800003, |
|
"learning_rate": 1.4506855619064846e-05, |
|
"loss": 0.489, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.6511627906976745, |
|
"grad_norm": 0.07692987471818924, |
|
"learning_rate": 1.3568941917384036e-05, |
|
"loss": 0.4902, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.6644518272425249, |
|
"grad_norm": 0.07356040179729462, |
|
"learning_rate": 1.2650949167640997e-05, |
|
"loss": 0.4894, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6644518272425249, |
|
"eval_accuracy": 0.19652373156663552, |
|
"eval_loss": 0.5229406952857971, |
|
"eval_runtime": 15.6791, |
|
"eval_samples_per_second": 116.971, |
|
"eval_steps_per_second": 0.51, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6777408637873754, |
|
"grad_norm": 0.0691773071885109, |
|
"learning_rate": 1.1754477159576499e-05, |
|
"loss": 0.4869, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.6910299003322259, |
|
"grad_norm": 0.07505939155817032, |
|
"learning_rate": 1.088108817864629e-05, |
|
"loss": 0.4865, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.7043189368770764, |
|
"grad_norm": 0.06973451375961304, |
|
"learning_rate": 1.003230428341979e-05, |
|
"loss": 0.4888, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.717607973421927, |
|
"grad_norm": 0.07225219160318375, |
|
"learning_rate": 9.209604653082326e-06, |
|
"loss": 0.4858, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.7308970099667774, |
|
"grad_norm": 0.07558443397283554, |
|
"learning_rate": 8.414423009663563e-06, |
|
"loss": 0.4891, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.7441860465116279, |
|
"grad_norm": 0.0698658898472786, |
|
"learning_rate": 7.648145119484152e-06, |
|
"loss": 0.4871, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.7574750830564784, |
|
"grad_norm": 0.06963298469781876, |
|
"learning_rate": 6.912106378175098e-06, |
|
"loss": 0.4884, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.770764119601329, |
|
"grad_norm": 0.0692787617444992, |
|
"learning_rate": 6.207589483478266e-06, |
|
"loss": 0.4877, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.7840531561461794, |
|
"grad_norm": 0.07016126066446304, |
|
"learning_rate": 5.53582219988382e-06, |
|
"loss": 0.4856, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.7973421926910299, |
|
"grad_norm": 0.06945677101612091, |
|
"learning_rate": 4.897975218999926e-06, |
|
"loss": 0.4868, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.7973421926910299, |
|
"eval_accuracy": 0.19665158843513314, |
|
"eval_loss": 0.5205041170120239, |
|
"eval_runtime": 14.8321, |
|
"eval_samples_per_second": 123.651, |
|
"eval_steps_per_second": 0.539, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.8106312292358804, |
|
"grad_norm": 0.07045505195856094, |
|
"learning_rate": 4.295160119383712e-06, |
|
"loss": 0.4859, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.8239202657807309, |
|
"grad_norm": 0.06839559227228165, |
|
"learning_rate": 3.728427429388709e-06, |
|
"loss": 0.4863, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.8372093023255814, |
|
"grad_norm": 0.06684821844100952, |
|
"learning_rate": 3.198764796404807e-06, |
|
"loss": 0.4856, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.8504983388704319, |
|
"grad_norm": 0.06731660664081573, |
|
"learning_rate": 2.707095265681081e-06, |
|
"loss": 0.4854, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.8637873754152824, |
|
"grad_norm": 0.06780705600976944, |
|
"learning_rate": 2.254275671731007e-06, |
|
"loss": 0.4868, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.8770764119601329, |
|
"grad_norm": 0.06815515458583832, |
|
"learning_rate": 1.8410951451234533e-06, |
|
"loss": 0.4854, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.8903654485049833, |
|
"grad_norm": 0.0670180469751358, |
|
"learning_rate": 1.4682737372615967e-06, |
|
"loss": 0.485, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.9036544850498339, |
|
"grad_norm": 0.06649608910083771, |
|
"learning_rate": 1.1364611655463736e-06, |
|
"loss": 0.4867, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.9169435215946844, |
|
"grad_norm": 0.0674930214881897, |
|
"learning_rate": 8.462356811112987e-07, |
|
"loss": 0.4865, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.9302325581395349, |
|
"grad_norm": 0.06808231770992279, |
|
"learning_rate": 5.981030611018234e-07, |
|
"loss": 0.4864, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.9302325581395349, |
|
"eval_accuracy": 0.19667556159797644, |
|
"eval_loss": 0.519675612449646, |
|
"eval_runtime": 14.9507, |
|
"eval_samples_per_second": 122.67, |
|
"eval_steps_per_second": 0.535, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.9435215946843853, |
|
"grad_norm": 0.06696037203073502, |
|
"learning_rate": 3.9249572725543196e-07, |
|
"loss": 0.4852, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.9568106312292359, |
|
"grad_norm": 0.06675516068935394, |
|
"learning_rate": 2.297719923185032e-07, |
|
"loss": 0.4875, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.9700996677740864, |
|
"grad_norm": 0.06678403913974762, |
|
"learning_rate": 1.1021543561322012e-07, |
|
"loss": 0.4852, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.9833887043189369, |
|
"grad_norm": 0.0660882443189621, |
|
"learning_rate": 3.403440884269526e-08, |
|
"loss": 0.4848, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.9966777408637874, |
|
"grad_norm": 0.06698651611804962, |
|
"learning_rate": 1.3616729956228425e-09, |
|
"loss": 0.4847, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 1505, |
|
"total_flos": 2.786803439690685e+19, |
|
"train_loss": 0.0, |
|
"train_runtime": 4.5361, |
|
"train_samples_per_second": 339673.082, |
|
"train_steps_per_second": 331.781 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 1505, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.786803439690685e+19, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|