shizhediao2's picture
Upload trainer_state.json with huggingface_hub
27e5821 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 200,
"global_step": 1505,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013289036544850499,
"grad_norm": 0.10525072365999222,
"learning_rate": 4.9978216198586135e-05,
"loss": 0.6155,
"step": 20
},
{
"epoch": 0.026578073089700997,
"grad_norm": 0.08554615080356598,
"learning_rate": 4.991290275706486e-05,
"loss": 0.5694,
"step": 40
},
{
"epoch": 0.03986710963455149,
"grad_norm": 0.08361516892910004,
"learning_rate": 4.980417349743936e-05,
"loss": 0.557,
"step": 60
},
{
"epoch": 0.053156146179401995,
"grad_norm": 0.08680060505867004,
"learning_rate": 4.9652217902637596e-05,
"loss": 0.548,
"step": 80
},
{
"epoch": 0.0664451827242525,
"grad_norm": 0.08960291743278503,
"learning_rate": 4.945730078629964e-05,
"loss": 0.5427,
"step": 100
},
{
"epoch": 0.07973421926910298,
"grad_norm": 0.09262242168188095,
"learning_rate": 4.921976183128585e-05,
"loss": 0.5384,
"step": 120
},
{
"epoch": 0.09302325581395349,
"grad_norm": 0.08780515193939209,
"learning_rate": 4.894001499771015e-05,
"loss": 0.5362,
"step": 140
},
{
"epoch": 0.10631229235880399,
"grad_norm": 0.09249912202358246,
"learning_rate": 4.861854780153004e-05,
"loss": 0.5324,
"step": 160
},
{
"epoch": 0.11960132890365449,
"grad_norm": 0.09562400728464127,
"learning_rate": 4.825592046495054e-05,
"loss": 0.5311,
"step": 180
},
{
"epoch": 0.132890365448505,
"grad_norm": 0.09372778236865997,
"learning_rate": 4.785276494012263e-05,
"loss": 0.5278,
"step": 200
},
{
"epoch": 0.132890365448505,
"eval_accuracy": 0.19452303794312395,
"eval_loss": 0.5592088103294373,
"eval_runtime": 19.5284,
"eval_samples_per_second": 93.914,
"eval_steps_per_second": 0.41,
"step": 200
},
{
"epoch": 0.1461794019933555,
"grad_norm": 0.08762918412685394,
"learning_rate": 4.740978380783765e-05,
"loss": 0.5253,
"step": 220
},
{
"epoch": 0.15946843853820597,
"grad_norm": 0.08518578112125397,
"learning_rate": 4.6927749053136866e-05,
"loss": 0.5192,
"step": 240
},
{
"epoch": 0.17275747508305647,
"grad_norm": 0.09664598107337952,
"learning_rate": 4.640750071996995e-05,
"loss": 0.5217,
"step": 260
},
{
"epoch": 0.18604651162790697,
"grad_norm": 0.08245342969894409,
"learning_rate": 4.584994544724695e-05,
"loss": 0.5172,
"step": 280
},
{
"epoch": 0.19933554817275748,
"grad_norm": 0.08551981300115585,
"learning_rate": 4.5256054888834934e-05,
"loss": 0.5152,
"step": 300
},
{
"epoch": 0.21262458471760798,
"grad_norm": 0.09647104889154434,
"learning_rate": 4.4626864020252774e-05,
"loss": 0.5139,
"step": 320
},
{
"epoch": 0.22591362126245848,
"grad_norm": 0.09810427576303482,
"learning_rate": 4.3963469335015085e-05,
"loss": 0.5129,
"step": 340
},
{
"epoch": 0.23920265780730898,
"grad_norm": 0.08342389762401581,
"learning_rate": 4.326702693376844e-05,
"loss": 0.5119,
"step": 360
},
{
"epoch": 0.25249169435215946,
"grad_norm": 0.08738644421100616,
"learning_rate": 4.2538750509550054e-05,
"loss": 0.511,
"step": 380
},
{
"epoch": 0.26578073089701,
"grad_norm": 0.08475251495838165,
"learning_rate": 4.177990923267986e-05,
"loss": 0.5117,
"step": 400
},
{
"epoch": 0.26578073089701,
"eval_accuracy": 0.1953402564276045,
"eval_loss": 0.5438870787620544,
"eval_runtime": 15.5302,
"eval_samples_per_second": 118.093,
"eval_steps_per_second": 0.515,
"step": 400
},
{
"epoch": 0.27906976744186046,
"grad_norm": 0.07873477786779404,
"learning_rate": 4.099182553897229e-05,
"loss": 0.5084,
"step": 420
},
{
"epoch": 0.292358803986711,
"grad_norm": 0.09158772230148315,
"learning_rate": 4.017587282512181e-05,
"loss": 0.5065,
"step": 440
},
{
"epoch": 0.30564784053156147,
"grad_norm": 0.07729614526033401,
"learning_rate": 3.933347305527898e-05,
"loss": 0.5047,
"step": 460
},
{
"epoch": 0.31893687707641194,
"grad_norm": 0.08530613034963608,
"learning_rate": 3.846609428298757e-05,
"loss": 0.5049,
"step": 480
},
{
"epoch": 0.33222591362126247,
"grad_norm": 0.07760792225599289,
"learning_rate": 3.7575248092801686e-05,
"loss": 0.5035,
"step": 500
},
{
"epoch": 0.34551495016611294,
"grad_norm": 0.08521712571382523,
"learning_rate": 3.66624869660411e-05,
"loss": 0.5042,
"step": 520
},
{
"epoch": 0.3588039867109635,
"grad_norm": 0.08439727872610092,
"learning_rate": 3.572940157527572e-05,
"loss": 0.5021,
"step": 540
},
{
"epoch": 0.37209302325581395,
"grad_norm": 0.09042590111494064,
"learning_rate": 3.47776180122539e-05,
"loss": 0.5019,
"step": 560
},
{
"epoch": 0.3853820598006645,
"grad_norm": 0.08219762146472931,
"learning_rate": 3.3808794954105716e-05,
"loss": 0.501,
"step": 580
},
{
"epoch": 0.39867109634551495,
"grad_norm": 0.08426713198423386,
"learning_rate": 3.282462077275947e-05,
"loss": 0.5013,
"step": 600
},
{
"epoch": 0.39867109634551495,
"eval_accuracy": 0.19588631180347973,
"eval_loss": 0.5341373682022095,
"eval_runtime": 16.1072,
"eval_samples_per_second": 113.862,
"eval_steps_per_second": 0.497,
"step": 600
},
{
"epoch": 0.4119601328903654,
"grad_norm": 0.08020314574241638,
"learning_rate": 3.1826810592609036e-05,
"loss": 0.4968,
"step": 620
},
{
"epoch": 0.42524916943521596,
"grad_norm": 0.07975760847330093,
"learning_rate": 3.081710330155942e-05,
"loss": 0.4997,
"step": 640
},
{
"epoch": 0.43853820598006643,
"grad_norm": 0.08056964725255966,
"learning_rate": 2.979725852065981e-05,
"loss": 0.4968,
"step": 660
},
{
"epoch": 0.45182724252491696,
"grad_norm": 0.08022565394639969,
"learning_rate": 2.876905353760459e-05,
"loss": 0.4976,
"step": 680
},
{
"epoch": 0.46511627906976744,
"grad_norm": 0.08131925761699677,
"learning_rate": 2.7734280209446865e-05,
"loss": 0.4973,
"step": 700
},
{
"epoch": 0.47840531561461797,
"grad_norm": 0.07562076300382614,
"learning_rate": 2.6694741839921732e-05,
"loss": 0.4956,
"step": 720
},
{
"epoch": 0.49169435215946844,
"grad_norm": 0.07877329736948013,
"learning_rate": 2.5652250036821523e-05,
"loss": 0.4966,
"step": 740
},
{
"epoch": 0.5049833887043189,
"grad_norm": 0.08014395087957382,
"learning_rate": 2.4608621554899362e-05,
"loss": 0.4934,
"step": 760
},
{
"epoch": 0.5182724252491694,
"grad_norm": 0.07770328223705292,
"learning_rate": 2.356567512980326e-05,
"loss": 0.4934,
"step": 780
},
{
"epoch": 0.53156146179402,
"grad_norm": 0.07732851803302765,
"learning_rate": 2.252522830855798e-05,
"loss": 0.4951,
"step": 800
},
{
"epoch": 0.53156146179402,
"eval_accuracy": 0.19623978277118043,
"eval_loss": 0.5274041295051575,
"eval_runtime": 16.4552,
"eval_samples_per_second": 111.454,
"eval_steps_per_second": 0.486,
"step": 800
},
{
"epoch": 0.5448504983388704,
"grad_norm": 0.07608461380004883,
"learning_rate": 2.1489094282118395e-05,
"loss": 0.4896,
"step": 820
},
{
"epoch": 0.5581395348837209,
"grad_norm": 0.07657533138990402,
"learning_rate": 2.0459078725514092e-05,
"loss": 0.4918,
"step": 840
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.07983728498220444,
"learning_rate": 1.9436976651092144e-05,
"loss": 0.4927,
"step": 860
},
{
"epoch": 0.584717607973422,
"grad_norm": 0.07355430722236633,
"learning_rate": 1.8424569280341653e-05,
"loss": 0.493,
"step": 880
},
{
"epoch": 0.5980066445182725,
"grad_norm": 0.08014149218797684,
"learning_rate": 1.7423620939751788e-05,
"loss": 0.4922,
"step": 900
},
{
"epoch": 0.6112956810631229,
"grad_norm": 0.07500924915075302,
"learning_rate": 1.6435875986112685e-05,
"loss": 0.491,
"step": 920
},
{
"epoch": 0.6245847176079734,
"grad_norm": 0.07356715947389603,
"learning_rate": 1.546305576661776e-05,
"loss": 0.4909,
"step": 940
},
{
"epoch": 0.6378737541528239,
"grad_norm": 0.07140863686800003,
"learning_rate": 1.4506855619064846e-05,
"loss": 0.489,
"step": 960
},
{
"epoch": 0.6511627906976745,
"grad_norm": 0.07692987471818924,
"learning_rate": 1.3568941917384036e-05,
"loss": 0.4902,
"step": 980
},
{
"epoch": 0.6644518272425249,
"grad_norm": 0.07356040179729462,
"learning_rate": 1.2650949167640997e-05,
"loss": 0.4894,
"step": 1000
},
{
"epoch": 0.6644518272425249,
"eval_accuracy": 0.19652373156663552,
"eval_loss": 0.5229406952857971,
"eval_runtime": 15.6791,
"eval_samples_per_second": 116.971,
"eval_steps_per_second": 0.51,
"step": 1000
},
{
"epoch": 0.6777408637873754,
"grad_norm": 0.0691773071885109,
"learning_rate": 1.1754477159576499e-05,
"loss": 0.4869,
"step": 1020
},
{
"epoch": 0.6910299003322259,
"grad_norm": 0.07505939155817032,
"learning_rate": 1.088108817864629e-05,
"loss": 0.4865,
"step": 1040
},
{
"epoch": 0.7043189368770764,
"grad_norm": 0.06973451375961304,
"learning_rate": 1.003230428341979e-05,
"loss": 0.4888,
"step": 1060
},
{
"epoch": 0.717607973421927,
"grad_norm": 0.07225219160318375,
"learning_rate": 9.209604653082326e-06,
"loss": 0.4858,
"step": 1080
},
{
"epoch": 0.7308970099667774,
"grad_norm": 0.07558443397283554,
"learning_rate": 8.414423009663563e-06,
"loss": 0.4891,
"step": 1100
},
{
"epoch": 0.7441860465116279,
"grad_norm": 0.0698658898472786,
"learning_rate": 7.648145119484152e-06,
"loss": 0.4871,
"step": 1120
},
{
"epoch": 0.7574750830564784,
"grad_norm": 0.06963298469781876,
"learning_rate": 6.912106378175098e-06,
"loss": 0.4884,
"step": 1140
},
{
"epoch": 0.770764119601329,
"grad_norm": 0.0692787617444992,
"learning_rate": 6.207589483478266e-06,
"loss": 0.4877,
"step": 1160
},
{
"epoch": 0.7840531561461794,
"grad_norm": 0.07016126066446304,
"learning_rate": 5.53582219988382e-06,
"loss": 0.4856,
"step": 1180
},
{
"epoch": 0.7973421926910299,
"grad_norm": 0.06945677101612091,
"learning_rate": 4.897975218999926e-06,
"loss": 0.4868,
"step": 1200
},
{
"epoch": 0.7973421926910299,
"eval_accuracy": 0.19665158843513314,
"eval_loss": 0.5205041170120239,
"eval_runtime": 14.8321,
"eval_samples_per_second": 123.651,
"eval_steps_per_second": 0.539,
"step": 1200
},
{
"epoch": 0.8106312292358804,
"grad_norm": 0.07045505195856094,
"learning_rate": 4.295160119383712e-06,
"loss": 0.4859,
"step": 1220
},
{
"epoch": 0.8239202657807309,
"grad_norm": 0.06839559227228165,
"learning_rate": 3.728427429388709e-06,
"loss": 0.4863,
"step": 1240
},
{
"epoch": 0.8372093023255814,
"grad_norm": 0.06684821844100952,
"learning_rate": 3.198764796404807e-06,
"loss": 0.4856,
"step": 1260
},
{
"epoch": 0.8504983388704319,
"grad_norm": 0.06731660664081573,
"learning_rate": 2.707095265681081e-06,
"loss": 0.4854,
"step": 1280
},
{
"epoch": 0.8637873754152824,
"grad_norm": 0.06780705600976944,
"learning_rate": 2.254275671731007e-06,
"loss": 0.4868,
"step": 1300
},
{
"epoch": 0.8770764119601329,
"grad_norm": 0.06815515458583832,
"learning_rate": 1.8410951451234533e-06,
"loss": 0.4854,
"step": 1320
},
{
"epoch": 0.8903654485049833,
"grad_norm": 0.0670180469751358,
"learning_rate": 1.4682737372615967e-06,
"loss": 0.485,
"step": 1340
},
{
"epoch": 0.9036544850498339,
"grad_norm": 0.06649608910083771,
"learning_rate": 1.1364611655463736e-06,
"loss": 0.4867,
"step": 1360
},
{
"epoch": 0.9169435215946844,
"grad_norm": 0.0674930214881897,
"learning_rate": 8.462356811112987e-07,
"loss": 0.4865,
"step": 1380
},
{
"epoch": 0.9302325581395349,
"grad_norm": 0.06808231770992279,
"learning_rate": 5.981030611018234e-07,
"loss": 0.4864,
"step": 1400
},
{
"epoch": 0.9302325581395349,
"eval_accuracy": 0.19667556159797644,
"eval_loss": 0.519675612449646,
"eval_runtime": 14.9507,
"eval_samples_per_second": 122.67,
"eval_steps_per_second": 0.535,
"step": 1400
},
{
"epoch": 0.9435215946843853,
"grad_norm": 0.06696037203073502,
"learning_rate": 3.9249572725543196e-07,
"loss": 0.4852,
"step": 1420
},
{
"epoch": 0.9568106312292359,
"grad_norm": 0.06675516068935394,
"learning_rate": 2.297719923185032e-07,
"loss": 0.4875,
"step": 1440
},
{
"epoch": 0.9700996677740864,
"grad_norm": 0.06678403913974762,
"learning_rate": 1.1021543561322012e-07,
"loss": 0.4852,
"step": 1460
},
{
"epoch": 0.9833887043189369,
"grad_norm": 0.0660882443189621,
"learning_rate": 3.403440884269526e-08,
"loss": 0.4848,
"step": 1480
},
{
"epoch": 0.9966777408637874,
"grad_norm": 0.06698651611804962,
"learning_rate": 1.3616729956228425e-09,
"loss": 0.4847,
"step": 1500
},
{
"epoch": 1.0,
"step": 1505,
"total_flos": 2.786803439690685e+19,
"train_loss": 0.0,
"train_runtime": 4.5361,
"train_samples_per_second": 339673.082,
"train_steps_per_second": 331.781
}
],
"logging_steps": 20,
"max_steps": 1505,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.786803439690685e+19,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}