{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9954170485792853,
  "eval_steps": 500,
  "global_step": 816,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03666361136571952,
      "grad_norm": 10.920966734668648,
      "learning_rate": 5e-06,
      "loss": 1.033,
      "step": 10
    },
    {
      "epoch": 0.07332722273143905,
      "grad_norm": 2.445980198452297,
      "learning_rate": 5e-06,
      "loss": 0.9011,
      "step": 20
    },
    {
      "epoch": 0.10999083409715857,
      "grad_norm": 1.5424232386159482,
      "learning_rate": 5e-06,
      "loss": 0.8764,
      "step": 30
    },
    {
      "epoch": 0.1466544454628781,
      "grad_norm": 1.1797649838046136,
      "learning_rate": 5e-06,
      "loss": 0.8446,
      "step": 40
    },
    {
      "epoch": 0.18331805682859761,
      "grad_norm": 1.0295589020655365,
      "learning_rate": 5e-06,
      "loss": 0.8204,
      "step": 50
    },
    {
      "epoch": 0.21998166819431714,
      "grad_norm": 1.2160434357225554,
      "learning_rate": 5e-06,
      "loss": 0.8104,
      "step": 60
    },
    {
      "epoch": 0.2566452795600367,
      "grad_norm": 1.2135493715768004,
      "learning_rate": 5e-06,
      "loss": 0.7968,
      "step": 70
    },
    {
      "epoch": 0.2933088909257562,
      "grad_norm": 0.7594488712178804,
      "learning_rate": 5e-06,
      "loss": 0.7836,
      "step": 80
    },
    {
      "epoch": 0.32997250229147573,
      "grad_norm": 0.8913076302913621,
      "learning_rate": 5e-06,
      "loss": 0.7781,
      "step": 90
    },
    {
      "epoch": 0.36663611365719523,
      "grad_norm": 1.126183659145103,
      "learning_rate": 5e-06,
      "loss": 0.7732,
      "step": 100
    },
    {
      "epoch": 0.4032997250229148,
      "grad_norm": 0.7476760341544976,
      "learning_rate": 5e-06,
      "loss": 0.7711,
      "step": 110
    },
    {
      "epoch": 0.4399633363886343,
      "grad_norm": 0.828783632948725,
      "learning_rate": 5e-06,
      "loss": 0.7637,
      "step": 120
    },
    {
      "epoch": 0.4766269477543538,
      "grad_norm": 0.7005369874659794,
      "learning_rate": 5e-06,
      "loss": 0.7617,
      "step": 130
    },
    {
      "epoch": 0.5132905591200734,
      "grad_norm": 0.6781356553576761,
      "learning_rate": 5e-06,
      "loss": 0.7562,
      "step": 140
    },
    {
      "epoch": 0.5499541704857929,
      "grad_norm": 0.6643060954517749,
      "learning_rate": 5e-06,
      "loss": 0.7601,
      "step": 150
    },
    {
      "epoch": 0.5866177818515124,
      "grad_norm": 0.654862572470797,
      "learning_rate": 5e-06,
      "loss": 0.7561,
      "step": 160
    },
    {
      "epoch": 0.6232813932172319,
      "grad_norm": 0.7126834121476828,
      "learning_rate": 5e-06,
      "loss": 0.7549,
      "step": 170
    },
    {
      "epoch": 0.6599450045829515,
      "grad_norm": 0.5845932549413623,
      "learning_rate": 5e-06,
      "loss": 0.7525,
      "step": 180
    },
    {
      "epoch": 0.696608615948671,
      "grad_norm": 0.583642927450063,
      "learning_rate": 5e-06,
      "loss": 0.7507,
      "step": 190
    },
    {
      "epoch": 0.7332722273143905,
      "grad_norm": 0.5759630428428489,
      "learning_rate": 5e-06,
      "loss": 0.7492,
      "step": 200
    },
    {
      "epoch": 0.76993583868011,
      "grad_norm": 0.597809207757354,
      "learning_rate": 5e-06,
      "loss": 0.7446,
      "step": 210
    },
    {
      "epoch": 0.8065994500458296,
      "grad_norm": 0.6520665055230834,
      "learning_rate": 5e-06,
      "loss": 0.7512,
      "step": 220
    },
    {
      "epoch": 0.843263061411549,
      "grad_norm": 0.6521761800994458,
      "learning_rate": 5e-06,
      "loss": 0.744,
      "step": 230
    },
    {
      "epoch": 0.8799266727772685,
      "grad_norm": 0.6083361886529014,
      "learning_rate": 5e-06,
      "loss": 0.7431,
      "step": 240
    },
    {
      "epoch": 0.916590284142988,
      "grad_norm": 0.8966782629847545,
      "learning_rate": 5e-06,
      "loss": 0.7399,
      "step": 250
    },
    {
      "epoch": 0.9532538955087076,
      "grad_norm": 0.6584181334872885,
      "learning_rate": 5e-06,
      "loss": 0.7457,
      "step": 260
    },
    {
      "epoch": 0.9899175068744271,
      "grad_norm": 0.5614900416740534,
      "learning_rate": 5e-06,
      "loss": 0.7434,
      "step": 270
    },
    {
      "epoch": 0.997250229147571,
      "eval_loss": 0.743977963924408,
      "eval_runtime": 96.6447,
      "eval_samples_per_second": 76.052,
      "eval_steps_per_second": 0.6,
      "step": 272
    },
    {
      "epoch": 1.0284142988084326,
      "grad_norm": 0.6377526876986616,
      "learning_rate": 5e-06,
      "loss": 0.7593,
      "step": 280
    },
    {
      "epoch": 1.065077910174152,
      "grad_norm": 0.8312923337011684,
      "learning_rate": 5e-06,
      "loss": 0.6885,
      "step": 290
    },
    {
      "epoch": 1.1017415215398716,
      "grad_norm": 0.6499984381614756,
      "learning_rate": 5e-06,
      "loss": 0.6893,
      "step": 300
    },
    {
      "epoch": 1.138405132905591,
      "grad_norm": 0.658519279927457,
      "learning_rate": 5e-06,
      "loss": 0.6868,
      "step": 310
    },
    {
      "epoch": 1.1750687442713108,
      "grad_norm": 0.6307182099292118,
      "learning_rate": 5e-06,
      "loss": 0.6885,
      "step": 320
    },
    {
      "epoch": 1.2117323556370303,
      "grad_norm": 0.6191143311988347,
      "learning_rate": 5e-06,
      "loss": 0.6871,
      "step": 330
    },
    {
      "epoch": 1.2483959670027498,
      "grad_norm": 0.6735946598593434,
      "learning_rate": 5e-06,
      "loss": 0.6935,
      "step": 340
    },
    {
      "epoch": 1.2850595783684693,
      "grad_norm": 0.7213451984916242,
      "learning_rate": 5e-06,
      "loss": 0.6943,
      "step": 350
    },
    {
      "epoch": 1.3217231897341888,
      "grad_norm": 0.5841901070016948,
      "learning_rate": 5e-06,
      "loss": 0.6938,
      "step": 360
    },
    {
      "epoch": 1.3583868010999083,
      "grad_norm": 0.6609752377099979,
      "learning_rate": 5e-06,
      "loss": 0.6856,
      "step": 370
    },
    {
      "epoch": 1.3950504124656278,
      "grad_norm": 0.6004672142282963,
      "learning_rate": 5e-06,
      "loss": 0.69,
      "step": 380
    },
    {
      "epoch": 1.4317140238313475,
      "grad_norm": 0.7494020947088555,
      "learning_rate": 5e-06,
      "loss": 0.682,
      "step": 390
    },
    {
      "epoch": 1.468377635197067,
      "grad_norm": 0.6711006066177567,
      "learning_rate": 5e-06,
      "loss": 0.6917,
      "step": 400
    },
    {
      "epoch": 1.5050412465627865,
      "grad_norm": 0.6517430215570676,
      "learning_rate": 5e-06,
      "loss": 0.6871,
      "step": 410
    },
    {
      "epoch": 1.541704857928506,
      "grad_norm": 0.6180564693914907,
      "learning_rate": 5e-06,
      "loss": 0.6829,
      "step": 420
    },
    {
      "epoch": 1.5783684692942255,
      "grad_norm": 0.5764324092377354,
      "learning_rate": 5e-06,
      "loss": 0.6824,
      "step": 430
    },
    {
      "epoch": 1.615032080659945,
      "grad_norm": 0.7134204082562298,
      "learning_rate": 5e-06,
      "loss": 0.6822,
      "step": 440
    },
    {
      "epoch": 1.6516956920256645,
      "grad_norm": 0.7630512170385407,
      "learning_rate": 5e-06,
      "loss": 0.6879,
      "step": 450
    },
    {
      "epoch": 1.6883593033913842,
      "grad_norm": 0.6285437172539765,
      "learning_rate": 5e-06,
      "loss": 0.6804,
      "step": 460
    },
    {
      "epoch": 1.7250229147571035,
      "grad_norm": 0.5968789313484854,
      "learning_rate": 5e-06,
      "loss": 0.686,
      "step": 470
    },
    {
      "epoch": 1.7616865261228232,
      "grad_norm": 0.6425175740435289,
      "learning_rate": 5e-06,
      "loss": 0.6856,
      "step": 480
    },
    {
      "epoch": 1.7983501374885427,
      "grad_norm": 0.7614365266625939,
      "learning_rate": 5e-06,
      "loss": 0.6814,
      "step": 490
    },
    {
      "epoch": 1.8350137488542622,
      "grad_norm": 0.5496379357416068,
      "learning_rate": 5e-06,
      "loss": 0.6855,
      "step": 500
    },
    {
      "epoch": 1.8716773602199817,
      "grad_norm": 0.8494093367270151,
      "learning_rate": 5e-06,
      "loss": 0.6875,
      "step": 510
    },
    {
      "epoch": 1.9083409715857012,
      "grad_norm": 0.6756166103000668,
      "learning_rate": 5e-06,
      "loss": 0.6856,
      "step": 520
    },
    {
      "epoch": 1.9450045829514209,
      "grad_norm": 0.7228484772895967,
      "learning_rate": 5e-06,
      "loss": 0.6841,
      "step": 530
    },
    {
      "epoch": 1.9816681943171401,
      "grad_norm": 0.7786774729146112,
      "learning_rate": 5e-06,
      "loss": 0.6845,
      "step": 540
    },
    {
      "epoch": 1.996333638863428,
      "eval_loss": 0.7305116057395935,
      "eval_runtime": 96.1532,
      "eval_samples_per_second": 76.441,
      "eval_steps_per_second": 0.603,
      "step": 544
    },
    {
      "epoch": 2.020164986251146,
      "grad_norm": 1.0425759476709966,
      "learning_rate": 5e-06,
      "loss": 0.707,
      "step": 550
    },
    {
      "epoch": 2.056828597616865,
      "grad_norm": 0.8473344095764829,
      "learning_rate": 5e-06,
      "loss": 0.6313,
      "step": 560
    },
    {
      "epoch": 2.093492208982585,
      "grad_norm": 0.7205628261028438,
      "learning_rate": 5e-06,
      "loss": 0.6281,
      "step": 570
    },
    {
      "epoch": 2.130155820348304,
      "grad_norm": 0.6604987014823058,
      "learning_rate": 5e-06,
      "loss": 0.631,
      "step": 580
    },
    {
      "epoch": 2.166819431714024,
      "grad_norm": 0.6774961015973217,
      "learning_rate": 5e-06,
      "loss": 0.6351,
      "step": 590
    },
    {
      "epoch": 2.203483043079743,
      "grad_norm": 0.8519809292040578,
      "learning_rate": 5e-06,
      "loss": 0.634,
      "step": 600
    },
    {
      "epoch": 2.240146654445463,
      "grad_norm": 0.693823740633704,
      "learning_rate": 5e-06,
      "loss": 0.6327,
      "step": 610
    },
    {
      "epoch": 2.276810265811182,
      "grad_norm": 0.6448705487045298,
      "learning_rate": 5e-06,
      "loss": 0.6339,
      "step": 620
    },
    {
      "epoch": 2.313473877176902,
      "grad_norm": 0.5865817788059118,
      "learning_rate": 5e-06,
      "loss": 0.636,
      "step": 630
    },
    {
      "epoch": 2.3501374885426216,
      "grad_norm": 0.8116556137845999,
      "learning_rate": 5e-06,
      "loss": 0.6342,
      "step": 640
    },
    {
      "epoch": 2.386801099908341,
      "grad_norm": 0.6231657257473445,
      "learning_rate": 5e-06,
      "loss": 0.637,
      "step": 650
    },
    {
      "epoch": 2.4234647112740606,
      "grad_norm": 0.6250913266909794,
      "learning_rate": 5e-06,
      "loss": 0.63,
      "step": 660
    },
    {
      "epoch": 2.46012832263978,
      "grad_norm": 0.582068921531117,
      "learning_rate": 5e-06,
      "loss": 0.6288,
      "step": 670
    },
    {
      "epoch": 2.4967919340054996,
      "grad_norm": 0.6912367969819871,
      "learning_rate": 5e-06,
      "loss": 0.6381,
      "step": 680
    },
    {
      "epoch": 2.5334555453712193,
      "grad_norm": 0.7147652107920064,
      "learning_rate": 5e-06,
      "loss": 0.6332,
      "step": 690
    },
    {
      "epoch": 2.5701191567369386,
      "grad_norm": 0.5792260811836798,
      "learning_rate": 5e-06,
      "loss": 0.6351,
      "step": 700
    },
    {
      "epoch": 2.606782768102658,
      "grad_norm": 0.7963438662743851,
      "learning_rate": 5e-06,
      "loss": 0.6363,
      "step": 710
    },
    {
      "epoch": 2.6434463794683776,
      "grad_norm": 0.9276380358330181,
      "learning_rate": 5e-06,
      "loss": 0.6355,
      "step": 720
    },
    {
      "epoch": 2.6801099908340973,
      "grad_norm": 0.9313823270809661,
      "learning_rate": 5e-06,
      "loss": 0.6351,
      "step": 730
    },
    {
      "epoch": 2.7167736021998166,
      "grad_norm": 0.7304200587600748,
      "learning_rate": 5e-06,
      "loss": 0.638,
      "step": 740
    },
    {
      "epoch": 2.7534372135655363,
      "grad_norm": 0.6212966397528322,
      "learning_rate": 5e-06,
      "loss": 0.6388,
      "step": 750
    },
    {
      "epoch": 2.7901008249312556,
      "grad_norm": 0.6720686482466423,
      "learning_rate": 5e-06,
      "loss": 0.6364,
      "step": 760
    },
    {
      "epoch": 2.8267644362969753,
      "grad_norm": 0.6438467896193539,
      "learning_rate": 5e-06,
      "loss": 0.6421,
      "step": 770
    },
    {
      "epoch": 2.863428047662695,
      "grad_norm": 0.6043416931907646,
      "learning_rate": 5e-06,
      "loss": 0.6379,
      "step": 780
    },
    {
      "epoch": 2.9000916590284143,
      "grad_norm": 0.6496494693588303,
      "learning_rate": 5e-06,
      "loss": 0.6414,
      "step": 790
    },
    {
      "epoch": 2.936755270394134,
      "grad_norm": 0.8144443719589332,
      "learning_rate": 5e-06,
      "loss": 0.6361,
      "step": 800
    },
    {
      "epoch": 2.9734188817598532,
      "grad_norm": 0.7037764123768507,
      "learning_rate": 5e-06,
      "loss": 0.6373,
      "step": 810
    },
    {
      "epoch": 2.9954170485792853,
      "eval_loss": 0.7332214117050171,
      "eval_runtime": 94.4197,
      "eval_samples_per_second": 77.844,
      "eval_steps_per_second": 0.614,
      "step": 816
    },
    {
      "epoch": 2.9954170485792853,
      "step": 816,
      "total_flos": 1366411632967680.0,
      "train_loss": 0.7035682309491962,
      "train_runtime": 14220.3782,
      "train_samples_per_second": 29.46,
      "train_steps_per_second": 0.057
    }
  ],
  "logging_steps": 10,
  "max_steps": 816,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1366411632967680.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}