|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 2544, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0589622641509434, |
|
"grad_norm": 1.1042355298995972, |
|
"learning_rate": 4.9017295597484283e-05, |
|
"loss": 0.6615, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1179245283018868, |
|
"grad_norm": 2.1375088691711426, |
|
"learning_rate": 4.803459119496855e-05, |
|
"loss": 0.6046, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.17688679245283018, |
|
"grad_norm": 2.163625717163086, |
|
"learning_rate": 4.705188679245283e-05, |
|
"loss": 0.5757, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2358490566037736, |
|
"grad_norm": 1.8961758613586426, |
|
"learning_rate": 4.606918238993711e-05, |
|
"loss": 0.5779, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.294811320754717, |
|
"grad_norm": 1.833200216293335, |
|
"learning_rate": 4.508647798742139e-05, |
|
"loss": 0.5505, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.35377358490566035, |
|
"grad_norm": 1.3514596223831177, |
|
"learning_rate": 4.410377358490566e-05, |
|
"loss": 0.5552, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.41273584905660377, |
|
"grad_norm": 1.499182105064392, |
|
"learning_rate": 4.312106918238994e-05, |
|
"loss": 0.5432, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4716981132075472, |
|
"grad_norm": 1.5426216125488281, |
|
"learning_rate": 4.213836477987422e-05, |
|
"loss": 0.528, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5306603773584906, |
|
"grad_norm": 2.095034122467041, |
|
"learning_rate": 4.115566037735849e-05, |
|
"loss": 0.5397, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.589622641509434, |
|
"grad_norm": 1.8828486204147339, |
|
"learning_rate": 4.017295597484277e-05, |
|
"loss": 0.5402, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6485849056603774, |
|
"grad_norm": 1.1266496181488037, |
|
"learning_rate": 3.9190251572327046e-05, |
|
"loss": 0.5325, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.7075471698113207, |
|
"grad_norm": 1.8232479095458984, |
|
"learning_rate": 3.820754716981133e-05, |
|
"loss": 0.5207, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7665094339622641, |
|
"grad_norm": 1.7382409572601318, |
|
"learning_rate": 3.7224842767295595e-05, |
|
"loss": 0.5174, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.8254716981132075, |
|
"grad_norm": 1.7190814018249512, |
|
"learning_rate": 3.6242138364779876e-05, |
|
"loss": 0.5138, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8844339622641509, |
|
"grad_norm": 1.7162833213806152, |
|
"learning_rate": 3.525943396226416e-05, |
|
"loss": 0.5117, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.9433962264150944, |
|
"grad_norm": 1.8863391876220703, |
|
"learning_rate": 3.4276729559748424e-05, |
|
"loss": 0.4996, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.7362162162162162, |
|
"eval_f1": 0.7330681253107907, |
|
"eval_loss": 0.49908825755119324, |
|
"eval_runtime": 54.2047, |
|
"eval_samples_per_second": 375.429, |
|
"eval_steps_per_second": 3.911, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 1.0023584905660377, |
|
"grad_norm": 1.9871962070465088, |
|
"learning_rate": 3.3294025157232705e-05, |
|
"loss": 0.501, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.0613207547169812, |
|
"grad_norm": 3.0909602642059326, |
|
"learning_rate": 3.2311320754716986e-05, |
|
"loss": 0.4468, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.1202830188679245, |
|
"grad_norm": 2.0430283546447754, |
|
"learning_rate": 3.132861635220126e-05, |
|
"loss": 0.4495, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.179245283018868, |
|
"grad_norm": 2.5914931297302246, |
|
"learning_rate": 3.0345911949685535e-05, |
|
"loss": 0.4515, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.2382075471698113, |
|
"grad_norm": 3.2264254093170166, |
|
"learning_rate": 2.9363207547169812e-05, |
|
"loss": 0.4505, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.2971698113207548, |
|
"grad_norm": 1.447571873664856, |
|
"learning_rate": 2.838050314465409e-05, |
|
"loss": 0.4422, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.3561320754716981, |
|
"grad_norm": 2.17903995513916, |
|
"learning_rate": 2.7397798742138364e-05, |
|
"loss": 0.4465, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.4150943396226414, |
|
"grad_norm": 2.608694076538086, |
|
"learning_rate": 2.641509433962264e-05, |
|
"loss": 0.4445, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.474056603773585, |
|
"grad_norm": 1.0498720407485962, |
|
"learning_rate": 2.543238993710692e-05, |
|
"loss": 0.4554, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.5330188679245285, |
|
"grad_norm": 2.429417133331299, |
|
"learning_rate": 2.4449685534591197e-05, |
|
"loss": 0.4569, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.5919811320754715, |
|
"grad_norm": 1.741003394126892, |
|
"learning_rate": 2.346698113207547e-05, |
|
"loss": 0.4484, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.650943396226415, |
|
"grad_norm": 1.6256601810455322, |
|
"learning_rate": 2.248427672955975e-05, |
|
"loss": 0.4494, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.7099056603773586, |
|
"grad_norm": 1.4815946817398071, |
|
"learning_rate": 2.1501572327044026e-05, |
|
"loss": 0.4444, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.7688679245283019, |
|
"grad_norm": 1.878029227256775, |
|
"learning_rate": 2.0518867924528304e-05, |
|
"loss": 0.4529, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.8278301886792452, |
|
"grad_norm": 1.870025873184204, |
|
"learning_rate": 1.9536163522012578e-05, |
|
"loss": 0.4634, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.8867924528301887, |
|
"grad_norm": 2.8423280715942383, |
|
"learning_rate": 1.8553459119496856e-05, |
|
"loss": 0.4517, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.9457547169811322, |
|
"grad_norm": 1.5151439905166626, |
|
"learning_rate": 1.7570754716981134e-05, |
|
"loss": 0.4226, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.742014742014742, |
|
"eval_f1": 0.7299938284303641, |
|
"eval_loss": 0.49465227127075195, |
|
"eval_runtime": 54.3383, |
|
"eval_samples_per_second": 374.505, |
|
"eval_steps_per_second": 3.901, |
|
"step": 1696 |
|
}, |
|
{ |
|
"epoch": 2.0047169811320753, |
|
"grad_norm": 1.8239262104034424, |
|
"learning_rate": 1.6588050314465408e-05, |
|
"loss": 0.4294, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.063679245283019, |
|
"grad_norm": 5.227139472961426, |
|
"learning_rate": 1.5605345911949685e-05, |
|
"loss": 0.3895, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.1226415094339623, |
|
"grad_norm": 3.1078689098358154, |
|
"learning_rate": 1.4622641509433963e-05, |
|
"loss": 0.3782, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.1816037735849054, |
|
"grad_norm": 2.545759916305542, |
|
"learning_rate": 1.363993710691824e-05, |
|
"loss": 0.3835, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.240566037735849, |
|
"grad_norm": 2.3632497787475586, |
|
"learning_rate": 1.2657232704402517e-05, |
|
"loss": 0.3899, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.2995283018867925, |
|
"grad_norm": 3.032485008239746, |
|
"learning_rate": 1.1674528301886793e-05, |
|
"loss": 0.3714, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.358490566037736, |
|
"grad_norm": 3.313594341278076, |
|
"learning_rate": 1.069182389937107e-05, |
|
"loss": 0.3809, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.417452830188679, |
|
"grad_norm": 2.6334567070007324, |
|
"learning_rate": 9.709119496855348e-06, |
|
"loss": 0.3752, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.4764150943396226, |
|
"grad_norm": 2.706216812133789, |
|
"learning_rate": 8.726415094339622e-06, |
|
"loss": 0.3917, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.535377358490566, |
|
"grad_norm": 3.003523588180542, |
|
"learning_rate": 7.7437106918239e-06, |
|
"loss": 0.3715, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.5943396226415096, |
|
"grad_norm": 3.0342845916748047, |
|
"learning_rate": 6.761006289308176e-06, |
|
"loss": 0.3875, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.6533018867924527, |
|
"grad_norm": 2.371635913848877, |
|
"learning_rate": 5.778301886792453e-06, |
|
"loss": 0.381, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.7122641509433962, |
|
"grad_norm": 2.7369866371154785, |
|
"learning_rate": 4.79559748427673e-06, |
|
"loss": 0.3785, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.7712264150943398, |
|
"grad_norm": 4.16819953918457, |
|
"learning_rate": 3.8128930817610063e-06, |
|
"loss": 0.3897, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.830188679245283, |
|
"grad_norm": 1.599187970161438, |
|
"learning_rate": 2.830188679245283e-06, |
|
"loss": 0.381, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.8891509433962264, |
|
"grad_norm": 1.6142021417617798, |
|
"learning_rate": 1.8474842767295599e-06, |
|
"loss": 0.3773, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.94811320754717, |
|
"grad_norm": 5.334159851074219, |
|
"learning_rate": 8.647798742138365e-07, |
|
"loss": 0.3814, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.7441769041769042, |
|
"eval_f1": 0.7459992193598751, |
|
"eval_loss": 0.5200024247169495, |
|
"eval_runtime": 54.3504, |
|
"eval_samples_per_second": 374.422, |
|
"eval_steps_per_second": 3.901, |
|
"step": 2544 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 2544, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.234814134959923e+16, |
|
"train_batch_size": 96, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|