|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.644453259288642, |
|
"eval_steps": 1000, |
|
"global_step": 20000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06611133148221605, |
|
"grad_norm": 1.0232534408569336, |
|
"learning_rate": 4.4080049369655294e-05, |
|
"loss": 5.6838, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1322226629644321, |
|
"grad_norm": 1.0166140794754028, |
|
"learning_rate": 8.816009873931059e-05, |
|
"loss": 3.8378, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.1322226629644321, |
|
"eval_accuracy": 0.4720710052887577, |
|
"eval_loss": 3.6431853771209717, |
|
"eval_runtime": 65.5904, |
|
"eval_samples_per_second": 28.053, |
|
"eval_steps_per_second": 1.174, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.19833399444664815, |
|
"grad_norm": 1.0467888116836548, |
|
"learning_rate": 9.830315009952811e-05, |
|
"loss": 3.3712, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.2644453259288642, |
|
"grad_norm": 1.1263777017593384, |
|
"learning_rate": 9.59831475011252e-05, |
|
"loss": 3.0922, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.2644453259288642, |
|
"eval_accuracy": 0.5138609524011809, |
|
"eval_loss": 3.076597213745117, |
|
"eval_runtime": 64.6228, |
|
"eval_samples_per_second": 28.473, |
|
"eval_steps_per_second": 1.192, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.33055665741108026, |
|
"grad_norm": 1.4438892602920532, |
|
"learning_rate": 9.366314490272228e-05, |
|
"loss": 2.9066, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.3966679888932963, |
|
"grad_norm": 1.3693314790725708, |
|
"learning_rate": 9.134314230431938e-05, |
|
"loss": 2.7993, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.3966679888932963, |
|
"eval_accuracy": 0.5319845054268176, |
|
"eval_loss": 2.84745454788208, |
|
"eval_runtime": 64.9029, |
|
"eval_samples_per_second": 28.35, |
|
"eval_steps_per_second": 1.186, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.46277932037551234, |
|
"grad_norm": 1.3279718160629272, |
|
"learning_rate": 8.902313970591646e-05, |
|
"loss": 2.7166, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.5288906518577284, |
|
"grad_norm": 1.465155839920044, |
|
"learning_rate": 8.670313710751356e-05, |
|
"loss": 2.7115, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.5288906518577284, |
|
"eval_accuracy": 0.5392130052462777, |
|
"eval_loss": 2.7528512477874756, |
|
"eval_runtime": 65.039, |
|
"eval_samples_per_second": 28.291, |
|
"eval_steps_per_second": 1.184, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.5950019833399445, |
|
"grad_norm": 2.4618444442749023, |
|
"learning_rate": 8.438313450911065e-05, |
|
"loss": 2.644, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.6611133148221605, |
|
"grad_norm": 3.049086093902588, |
|
"learning_rate": 8.206313191070773e-05, |
|
"loss": 2.6702, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.6611133148221605, |
|
"eval_accuracy": 0.5420291625071685, |
|
"eval_loss": 2.7150135040283203, |
|
"eval_runtime": 64.9223, |
|
"eval_samples_per_second": 28.342, |
|
"eval_steps_per_second": 1.186, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.7272246463043766, |
|
"grad_norm": 3.927698850631714, |
|
"learning_rate": 7.974312931230483e-05, |
|
"loss": 2.6029, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.7933359777865926, |
|
"grad_norm": 4.909026622772217, |
|
"learning_rate": 7.742312671390191e-05, |
|
"loss": 2.6484, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.7933359777865926, |
|
"eval_accuracy": 0.543187538497483, |
|
"eval_loss": 2.696218729019165, |
|
"eval_runtime": 64.8633, |
|
"eval_samples_per_second": 28.367, |
|
"eval_steps_per_second": 1.187, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.8594473092688086, |
|
"grad_norm": 10.72818660736084, |
|
"learning_rate": 7.510312411549901e-05, |
|
"loss": 2.6474, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.9255586407510247, |
|
"grad_norm": 12.435935020446777, |
|
"learning_rate": 7.278312151709609e-05, |
|
"loss": 2.6419, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.9255586407510247, |
|
"eval_accuracy": 0.5387701514411334, |
|
"eval_loss": 2.7223353385925293, |
|
"eval_runtime": 68.3123, |
|
"eval_samples_per_second": 26.935, |
|
"eval_steps_per_second": 1.127, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.9916699722332408, |
|
"grad_norm": 15.605013847351074, |
|
"learning_rate": 7.046311891869319e-05, |
|
"loss": 2.6239, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.0577813037154569, |
|
"grad_norm": 55.199256896972656, |
|
"learning_rate": 6.814311632029027e-05, |
|
"loss": 2.5853, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.0577813037154569, |
|
"eval_accuracy": 0.5401743803232727, |
|
"eval_loss": 2.7088677883148193, |
|
"eval_runtime": 66.8855, |
|
"eval_samples_per_second": 27.51, |
|
"eval_steps_per_second": 1.151, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.1238926351976728, |
|
"grad_norm": 19.066770553588867, |
|
"learning_rate": 6.582311372188736e-05, |
|
"loss": 2.616, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.190003966679889, |
|
"grad_norm": 25.54907989501953, |
|
"learning_rate": 6.350311112348446e-05, |
|
"loss": 2.6009, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.190003966679889, |
|
"eval_accuracy": 0.5401903103162634, |
|
"eval_loss": 2.703549861907959, |
|
"eval_runtime": 65.0156, |
|
"eval_samples_per_second": 28.301, |
|
"eval_steps_per_second": 1.184, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.256115298162105, |
|
"grad_norm": 24.64689826965332, |
|
"learning_rate": 6.118310852508154e-05, |
|
"loss": 2.622, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.322226629644321, |
|
"grad_norm": 39.75895309448242, |
|
"learning_rate": 5.886310592667864e-05, |
|
"loss": 2.6347, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.322226629644321, |
|
"eval_accuracy": 0.5368293472950872, |
|
"eval_loss": 2.7321841716766357, |
|
"eval_runtime": 70.4109, |
|
"eval_samples_per_second": 26.132, |
|
"eval_steps_per_second": 1.094, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.388337961126537, |
|
"grad_norm": 63.85321044921875, |
|
"learning_rate": 5.654310332827573e-05, |
|
"loss": 2.634, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.454449292608753, |
|
"grad_norm": 38.93082046508789, |
|
"learning_rate": 5.422310072987282e-05, |
|
"loss": 2.7407, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.454449292608753, |
|
"eval_accuracy": 0.5244159002570039, |
|
"eval_loss": 2.8357815742492676, |
|
"eval_runtime": 64.8895, |
|
"eval_samples_per_second": 28.356, |
|
"eval_steps_per_second": 1.187, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.5205606240909693, |
|
"grad_norm": 95.31773376464844, |
|
"learning_rate": 5.1903098131469904e-05, |
|
"loss": 2.787, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.5866719555731852, |
|
"grad_norm": 388.4613952636719, |
|
"learning_rate": 4.9583095533066995e-05, |
|
"loss": 2.8981, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.5866719555731852, |
|
"eval_accuracy": 0.5073068222849982, |
|
"eval_loss": 2.9791054725646973, |
|
"eval_runtime": 65.179, |
|
"eval_samples_per_second": 28.23, |
|
"eval_steps_per_second": 1.181, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.6527832870554013, |
|
"grad_norm": 612.2838745117188, |
|
"learning_rate": 4.7263092934664086e-05, |
|
"loss": 2.9835, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.7188946185376173, |
|
"grad_norm": 115.95877838134766, |
|
"learning_rate": 4.4943090336261176e-05, |
|
"loss": 3.1243, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.7188946185376173, |
|
"eval_accuracy": 0.4558367494318302, |
|
"eval_loss": 3.454159736633301, |
|
"eval_runtime": 64.9781, |
|
"eval_samples_per_second": 28.317, |
|
"eval_steps_per_second": 1.185, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.7850059500198334, |
|
"grad_norm": 152.87709045410156, |
|
"learning_rate": 4.262308773785827e-05, |
|
"loss": 3.2902, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.8511172815020496, |
|
"grad_norm": 235.900390625, |
|
"learning_rate": 4.030308513945535e-05, |
|
"loss": 3.2186, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.8511172815020496, |
|
"eval_accuracy": 0.4863910069879569, |
|
"eval_loss": 3.138493061065674, |
|
"eval_runtime": 65.7386, |
|
"eval_samples_per_second": 27.99, |
|
"eval_steps_per_second": 1.171, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.9172286129842655, |
|
"grad_norm": 22.522586822509766, |
|
"learning_rate": 3.798308254105244e-05, |
|
"loss": 3.0901, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.9833399444664814, |
|
"grad_norm": 186.2947235107422, |
|
"learning_rate": 3.566307994264953e-05, |
|
"loss": 2.9741, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.9833399444664814, |
|
"eval_accuracy": 0.4909350374885835, |
|
"eval_loss": 3.0912961959838867, |
|
"eval_runtime": 64.5786, |
|
"eval_samples_per_second": 28.492, |
|
"eval_steps_per_second": 1.192, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.0494512759486976, |
|
"grad_norm": 68.64359283447266, |
|
"learning_rate": 3.3343077344246624e-05, |
|
"loss": 2.8854, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 2.1155626074309137, |
|
"grad_norm": 95.0623550415039, |
|
"learning_rate": 3.1023074745843715e-05, |
|
"loss": 2.8322, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.1155626074309137, |
|
"eval_accuracy": 0.5131106497313142, |
|
"eval_loss": 2.899470329284668, |
|
"eval_runtime": 64.8181, |
|
"eval_samples_per_second": 28.387, |
|
"eval_steps_per_second": 1.188, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.18167393891313, |
|
"grad_norm": 34.978458404541016, |
|
"learning_rate": 2.8703072147440806e-05, |
|
"loss": 2.8632, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 2.2477852703953456, |
|
"grad_norm": 94.10592651367188, |
|
"learning_rate": 2.6383069549037897e-05, |
|
"loss": 2.8482, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.2477852703953456, |
|
"eval_accuracy": 0.505882149911854, |
|
"eval_loss": 2.944797992706299, |
|
"eval_runtime": 64.8393, |
|
"eval_samples_per_second": 28.378, |
|
"eval_steps_per_second": 1.188, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.3138966018775617, |
|
"grad_norm": 68.33843994140625, |
|
"learning_rate": 2.4063066950634984e-05, |
|
"loss": 2.8747, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.380007933359778, |
|
"grad_norm": 83.47583770751953, |
|
"learning_rate": 2.1743064352232075e-05, |
|
"loss": 2.8697, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.380007933359778, |
|
"eval_accuracy": 0.503569911429239, |
|
"eval_loss": 2.9733448028564453, |
|
"eval_runtime": 64.8844, |
|
"eval_samples_per_second": 28.358, |
|
"eval_steps_per_second": 1.187, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.446119264841994, |
|
"grad_norm": 132.79673767089844, |
|
"learning_rate": 1.9423061753829162e-05, |
|
"loss": 2.8369, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.51223059632421, |
|
"grad_norm": 189.53028869628906, |
|
"learning_rate": 1.7103059155426253e-05, |
|
"loss": 2.8289, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.51223059632421, |
|
"eval_accuracy": 0.5066844905588241, |
|
"eval_loss": 2.94022536277771, |
|
"eval_runtime": 65.2056, |
|
"eval_samples_per_second": 28.218, |
|
"eval_steps_per_second": 1.181, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.578341927806426, |
|
"grad_norm": 130.18309020996094, |
|
"learning_rate": 1.4783056557023344e-05, |
|
"loss": 2.8655, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.644453259288642, |
|
"grad_norm": 136.07675170898438, |
|
"learning_rate": 1.2463053958620433e-05, |
|
"loss": 2.8319, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.644453259288642, |
|
"eval_accuracy": 0.506187209277628, |
|
"eval_loss": 2.9402058124542236, |
|
"eval_runtime": 65.0148, |
|
"eval_samples_per_second": 28.301, |
|
"eval_steps_per_second": 1.184, |
|
"step": 20000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 22689, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2813992666018611e+17, |
|
"train_batch_size": 12, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|