|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.016548072149594573, |
|
"eval_steps": 5, |
|
"global_step": 50, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00033096144299189144, |
|
"grad_norm": 1.7213324308395386, |
|
"learning_rate": 1e-05, |
|
"loss": 5.8203, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00033096144299189144, |
|
"eval_loss": 1.4616162776947021, |
|
"eval_runtime": 255.2761, |
|
"eval_samples_per_second": 4.987, |
|
"eval_steps_per_second": 2.495, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0006619228859837829, |
|
"grad_norm": 1.3458422422409058, |
|
"learning_rate": 2e-05, |
|
"loss": 5.5279, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0009928843289756743, |
|
"grad_norm": 1.8355010747909546, |
|
"learning_rate": 3e-05, |
|
"loss": 6.4283, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0013238457719675658, |
|
"grad_norm": 1.6977440118789673, |
|
"learning_rate": 4e-05, |
|
"loss": 5.8083, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0016548072149594572, |
|
"grad_norm": 1.6104906797409058, |
|
"learning_rate": 5e-05, |
|
"loss": 5.7421, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0016548072149594572, |
|
"eval_loss": 1.4400510787963867, |
|
"eval_runtime": 257.5858, |
|
"eval_samples_per_second": 4.942, |
|
"eval_steps_per_second": 2.473, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0019857686579513485, |
|
"grad_norm": 1.9484833478927612, |
|
"learning_rate": 6e-05, |
|
"loss": 5.6831, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0023167301009432402, |
|
"grad_norm": 1.7339823246002197, |
|
"learning_rate": 7e-05, |
|
"loss": 5.6721, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0026476915439351315, |
|
"grad_norm": 2.2821269035339355, |
|
"learning_rate": 8e-05, |
|
"loss": 5.8779, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.002978652986927023, |
|
"grad_norm": 2.1454436779022217, |
|
"learning_rate": 9e-05, |
|
"loss": 5.5083, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0033096144299189145, |
|
"grad_norm": 1.9616013765335083, |
|
"learning_rate": 0.0001, |
|
"loss": 4.9761, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0033096144299189145, |
|
"eval_loss": 1.2095388174057007, |
|
"eval_runtime": 257.6811, |
|
"eval_samples_per_second": 4.94, |
|
"eval_steps_per_second": 2.472, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0036405758729108058, |
|
"grad_norm": 2.1629104614257812, |
|
"learning_rate": 9.98458666866564e-05, |
|
"loss": 5.3772, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.003971537315902697, |
|
"grad_norm": 2.3832602500915527, |
|
"learning_rate": 9.938441702975689e-05, |
|
"loss": 4.6844, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.004302498758894589, |
|
"grad_norm": 1.8890061378479004, |
|
"learning_rate": 9.861849601988383e-05, |
|
"loss": 4.5607, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0046334602018864805, |
|
"grad_norm": 2.4125006198883057, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 4.352, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.004964421644878371, |
|
"grad_norm": 2.396585464477539, |
|
"learning_rate": 9.619397662556435e-05, |
|
"loss": 4.6298, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.004964421644878371, |
|
"eval_loss": 0.9934529066085815, |
|
"eval_runtime": 256.971, |
|
"eval_samples_per_second": 4.954, |
|
"eval_steps_per_second": 2.479, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.005295383087870263, |
|
"grad_norm": 2.7325778007507324, |
|
"learning_rate": 9.45503262094184e-05, |
|
"loss": 3.8196, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.005626344530862155, |
|
"grad_norm": 2.316589832305908, |
|
"learning_rate": 9.263200821770461e-05, |
|
"loss": 4.0056, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.005957305973854046, |
|
"grad_norm": 2.2376809120178223, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 3.798, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.006288267416845937, |
|
"grad_norm": 2.7583084106445312, |
|
"learning_rate": 8.802029828000156e-05, |
|
"loss": 3.6498, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.006619228859837829, |
|
"grad_norm": 1.9092457294464111, |
|
"learning_rate": 8.535533905932738e-05, |
|
"loss": 3.6041, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.006619228859837829, |
|
"eval_loss": 0.8894630074501038, |
|
"eval_runtime": 257.2366, |
|
"eval_samples_per_second": 4.949, |
|
"eval_steps_per_second": 2.476, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.006950190302829721, |
|
"grad_norm": 2.0496015548706055, |
|
"learning_rate": 8.247240241650918e-05, |
|
"loss": 3.731, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.0072811517458216115, |
|
"grad_norm": 2.2720491886138916, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 3.4111, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.007612113188813503, |
|
"grad_norm": 2.0112087726593018, |
|
"learning_rate": 7.612492823579745e-05, |
|
"loss": 3.4615, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.007943074631805394, |
|
"grad_norm": 1.7832343578338623, |
|
"learning_rate": 7.269952498697734e-05, |
|
"loss": 3.4218, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.008274036074797287, |
|
"grad_norm": 2.1423120498657227, |
|
"learning_rate": 6.91341716182545e-05, |
|
"loss": 3.4775, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.008274036074797287, |
|
"eval_loss": 0.8443974852561951, |
|
"eval_runtime": 257.4444, |
|
"eval_samples_per_second": 4.945, |
|
"eval_steps_per_second": 2.474, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.008604997517789177, |
|
"grad_norm": 1.674811601638794, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 3.4598, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.008935958960781068, |
|
"grad_norm": 2.091567039489746, |
|
"learning_rate": 6.167226819279528e-05, |
|
"loss": 3.7612, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.009266920403772961, |
|
"grad_norm": 2.1018764972686768, |
|
"learning_rate": 5.782172325201155e-05, |
|
"loss": 3.6964, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.009597881846764852, |
|
"grad_norm": 1.832294225692749, |
|
"learning_rate": 5.392295478639225e-05, |
|
"loss": 3.6292, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.009928843289756743, |
|
"grad_norm": 2.0582776069641113, |
|
"learning_rate": 5e-05, |
|
"loss": 2.8792, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.009928843289756743, |
|
"eval_loss": 0.811681866645813, |
|
"eval_runtime": 257.1914, |
|
"eval_samples_per_second": 4.95, |
|
"eval_steps_per_second": 2.477, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.010259804732748635, |
|
"grad_norm": 1.9213557243347168, |
|
"learning_rate": 4.607704521360776e-05, |
|
"loss": 3.4662, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.010590766175740526, |
|
"grad_norm": 2.288318395614624, |
|
"learning_rate": 4.2178276747988446e-05, |
|
"loss": 3.3137, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.010921727618732417, |
|
"grad_norm": 1.9459996223449707, |
|
"learning_rate": 3.832773180720475e-05, |
|
"loss": 3.4205, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.01125268906172431, |
|
"grad_norm": 1.8996334075927734, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 3.5199, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0115836505047162, |
|
"grad_norm": 2.4128260612487793, |
|
"learning_rate": 3.086582838174551e-05, |
|
"loss": 3.3873, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0115836505047162, |
|
"eval_loss": 0.7911669611930847, |
|
"eval_runtime": 256.9175, |
|
"eval_samples_per_second": 4.955, |
|
"eval_steps_per_second": 2.479, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.011914611947708093, |
|
"grad_norm": 2.779157876968384, |
|
"learning_rate": 2.7300475013022663e-05, |
|
"loss": 3.2642, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.012245573390699984, |
|
"grad_norm": 2.04453444480896, |
|
"learning_rate": 2.3875071764202563e-05, |
|
"loss": 3.3588, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.012576534833691875, |
|
"grad_norm": 1.9138102531433105, |
|
"learning_rate": 2.061073738537635e-05, |
|
"loss": 3.1433, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.012907496276683767, |
|
"grad_norm": 2.5308830738067627, |
|
"learning_rate": 1.7527597583490822e-05, |
|
"loss": 3.2754, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.013238457719675658, |
|
"grad_norm": 2.0766677856445312, |
|
"learning_rate": 1.4644660940672627e-05, |
|
"loss": 3.1162, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.013238457719675658, |
|
"eval_loss": 0.7798255085945129, |
|
"eval_runtime": 257.4302, |
|
"eval_samples_per_second": 4.945, |
|
"eval_steps_per_second": 2.474, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.013569419162667549, |
|
"grad_norm": 2.0006625652313232, |
|
"learning_rate": 1.1979701719998453e-05, |
|
"loss": 3.2087, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.013900380605659441, |
|
"grad_norm": 2.1602704524993896, |
|
"learning_rate": 9.549150281252633e-06, |
|
"loss": 3.177, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.014231342048651332, |
|
"grad_norm": 1.935172438621521, |
|
"learning_rate": 7.367991782295391e-06, |
|
"loss": 2.6165, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.014562303491643223, |
|
"grad_norm": 1.9336456060409546, |
|
"learning_rate": 5.449673790581611e-06, |
|
"loss": 2.9167, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.014893264934635116, |
|
"grad_norm": 1.7702152729034424, |
|
"learning_rate": 3.8060233744356633e-06, |
|
"loss": 3.3973, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.014893264934635116, |
|
"eval_loss": 0.7749022245407104, |
|
"eval_runtime": 257.2066, |
|
"eval_samples_per_second": 4.949, |
|
"eval_steps_per_second": 2.477, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.015224226377627006, |
|
"grad_norm": 1.742775559425354, |
|
"learning_rate": 2.4471741852423237e-06, |
|
"loss": 3.1991, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.015555187820618897, |
|
"grad_norm": 2.1770782470703125, |
|
"learning_rate": 1.3815039801161721e-06, |
|
"loss": 2.8691, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.015886149263610788, |
|
"grad_norm": 2.1110646724700928, |
|
"learning_rate": 6.15582970243117e-07, |
|
"loss": 2.4055, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.016217110706602682, |
|
"grad_norm": 1.619126796722412, |
|
"learning_rate": 1.5413331334360182e-07, |
|
"loss": 3.075, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.016548072149594573, |
|
"grad_norm": 2.098407030105591, |
|
"learning_rate": 0.0, |
|
"loss": 2.9933, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.016548072149594573, |
|
"eval_loss": 0.7738050818443298, |
|
"eval_runtime": 256.8475, |
|
"eval_samples_per_second": 4.956, |
|
"eval_steps_per_second": 2.48, |
|
"step": 50 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 50, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.28697269714944e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|