|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9962546816479403, |
|
"eval_steps": 500, |
|
"global_step": 600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.049937578027465665, |
|
"grad_norm": 29.759422123901825, |
|
"learning_rate": 5e-06, |
|
"loss": 1.0298, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.09987515605493133, |
|
"grad_norm": 1.659620518169597, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9418, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.149812734082397, |
|
"grad_norm": 1.6655817038532692, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9067, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.19975031210986266, |
|
"grad_norm": 3.758532618440977, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8837, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.24968789013732834, |
|
"grad_norm": 1.0914089528556838, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8683, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.299625468164794, |
|
"grad_norm": 0.8217188574851603, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8554, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3495630461922597, |
|
"grad_norm": 2.6152137256768913, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8501, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.3995006242197253, |
|
"grad_norm": 0.6790645381435163, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8376, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.449438202247191, |
|
"grad_norm": 1.050547891408586, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8389, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.4993757802746567, |
|
"grad_norm": 0.61531936130051, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8283, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5493133583021224, |
|
"grad_norm": 0.7158756598167714, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8285, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.599250936329588, |
|
"grad_norm": 0.6741740850175894, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8208, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.6491885143570537, |
|
"grad_norm": 0.727395321147856, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8226, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.6991260923845194, |
|
"grad_norm": 0.5979029940330495, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8218, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.7490636704119851, |
|
"grad_norm": 0.7354628133399658, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8146, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.7990012484394506, |
|
"grad_norm": 1.0797869029852383, |
|
"learning_rate": 5e-06, |
|
"loss": 0.811, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.8489388264669163, |
|
"grad_norm": 0.5732338176348897, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8117, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.898876404494382, |
|
"grad_norm": 0.516647597175339, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8099, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.9488139825218477, |
|
"grad_norm": 0.595235164677505, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8062, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.9987515605493134, |
|
"grad_norm": 0.5491264032653016, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7992, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.9987515605493134, |
|
"eval_loss": 0.8015628457069397, |
|
"eval_runtime": 212.5855, |
|
"eval_samples_per_second": 25.387, |
|
"eval_steps_per_second": 0.4, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.048689138576779, |
|
"grad_norm": 1.1815779977106664, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8245, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.0986267166042447, |
|
"grad_norm": 0.8607004375758024, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7537, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.1485642946317103, |
|
"grad_norm": 0.6382531480247193, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7617, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.198501872659176, |
|
"grad_norm": 0.5848998545511357, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7635, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.2484394506866416, |
|
"grad_norm": 0.6856479277932508, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7623, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.2983770287141074, |
|
"grad_norm": 0.7819524787327043, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7567, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.348314606741573, |
|
"grad_norm": 0.6831313099201878, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7606, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.3982521847690386, |
|
"grad_norm": 0.6700856388131974, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7553, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.4481897627965044, |
|
"grad_norm": 0.5874295240823044, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7575, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.4981273408239701, |
|
"grad_norm": 0.6100148315517313, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7523, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.5480649188514357, |
|
"grad_norm": 0.6291672713518774, |
|
"learning_rate": 5e-06, |
|
"loss": 0.759, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.5980024968789013, |
|
"grad_norm": 0.7275448418797654, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7532, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.647940074906367, |
|
"grad_norm": 0.6798292981346045, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7652, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.6978776529338329, |
|
"grad_norm": 0.7320780258400261, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7554, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.7478152309612984, |
|
"grad_norm": 0.6107676047027211, |
|
"learning_rate": 5e-06, |
|
"loss": 0.757, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.797752808988764, |
|
"grad_norm": 0.5083613384732135, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7576, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.8476903870162298, |
|
"grad_norm": 0.5021025632111004, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7584, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.8976279650436954, |
|
"grad_norm": 0.6593302140861815, |
|
"learning_rate": 5e-06, |
|
"loss": 0.755, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.947565543071161, |
|
"grad_norm": 0.5669868340257436, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7478, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.9975031210986267, |
|
"grad_norm": 0.5745040341281294, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7538, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.9975031210986267, |
|
"eval_loss": 0.7874204516410828, |
|
"eval_runtime": 212.6782, |
|
"eval_samples_per_second": 25.376, |
|
"eval_steps_per_second": 0.4, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.0474406991260925, |
|
"grad_norm": 0.6952347883899184, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7611, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.097378277153558, |
|
"grad_norm": 0.6000479994459602, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7037, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.1473158551810236, |
|
"grad_norm": 0.6575266096005482, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7089, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.1972534332084894, |
|
"grad_norm": 0.7384159721059136, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7057, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.247191011235955, |
|
"grad_norm": 0.6735840214535883, |
|
"learning_rate": 5e-06, |
|
"loss": 0.706, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.2971285892634206, |
|
"grad_norm": 0.7153617513297972, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7064, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.3470661672908864, |
|
"grad_norm": 0.7396133098853745, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7049, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.397003745318352, |
|
"grad_norm": 0.6440383221784979, |
|
"learning_rate": 5e-06, |
|
"loss": 0.705, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.4469413233458175, |
|
"grad_norm": 0.5481603423583875, |
|
"learning_rate": 5e-06, |
|
"loss": 0.709, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.4968789013732833, |
|
"grad_norm": 0.6611697985224058, |
|
"learning_rate": 5e-06, |
|
"loss": 0.71, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.546816479400749, |
|
"grad_norm": 0.6252639550455323, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7128, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.596754057428215, |
|
"grad_norm": 0.578764019014536, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7116, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.6466916354556806, |
|
"grad_norm": 0.5718219886250622, |
|
"learning_rate": 5e-06, |
|
"loss": 0.711, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.696629213483146, |
|
"grad_norm": 0.6480951015929783, |
|
"learning_rate": 5e-06, |
|
"loss": 0.706, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.746566791510612, |
|
"grad_norm": 0.568128147930456, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7144, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.796504369538077, |
|
"grad_norm": 0.7016907742592169, |
|
"learning_rate": 5e-06, |
|
"loss": 0.708, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.846441947565543, |
|
"grad_norm": 0.6718047517989062, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7147, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.8963795255930087, |
|
"grad_norm": 0.7869266984488797, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7114, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.9463171036204745, |
|
"grad_norm": 0.6070316014377024, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7072, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.9962546816479403, |
|
"grad_norm": 0.5662761836861052, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7158, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.9962546816479403, |
|
"eval_loss": 0.7859531044960022, |
|
"eval_runtime": 213.866, |
|
"eval_samples_per_second": 25.235, |
|
"eval_steps_per_second": 0.397, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.9962546816479403, |
|
"step": 600, |
|
"total_flos": 1004812967608320.0, |
|
"train_loss": 0.773849273522695, |
|
"train_runtime": 35442.7976, |
|
"train_samples_per_second": 8.678, |
|
"train_steps_per_second": 0.017 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1004812967608320.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|