{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9954170485792853, "eval_steps": 500, "global_step": 816, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03666361136571952, "grad_norm": 10.920966734668648, "learning_rate": 5e-06, "loss": 1.033, "step": 10 }, { "epoch": 0.07332722273143905, "grad_norm": 2.445980198452297, "learning_rate": 5e-06, "loss": 0.9011, "step": 20 }, { "epoch": 0.10999083409715857, "grad_norm": 1.5424232386159482, "learning_rate": 5e-06, "loss": 0.8764, "step": 30 }, { "epoch": 0.1466544454628781, "grad_norm": 1.1797649838046136, "learning_rate": 5e-06, "loss": 0.8446, "step": 40 }, { "epoch": 0.18331805682859761, "grad_norm": 1.0295589020655365, "learning_rate": 5e-06, "loss": 0.8204, "step": 50 }, { "epoch": 0.21998166819431714, "grad_norm": 1.2160434357225554, "learning_rate": 5e-06, "loss": 0.8104, "step": 60 }, { "epoch": 0.2566452795600367, "grad_norm": 1.2135493715768004, "learning_rate": 5e-06, "loss": 0.7968, "step": 70 }, { "epoch": 0.2933088909257562, "grad_norm": 0.7594488712178804, "learning_rate": 5e-06, "loss": 0.7836, "step": 80 }, { "epoch": 0.32997250229147573, "grad_norm": 0.8913076302913621, "learning_rate": 5e-06, "loss": 0.7781, "step": 90 }, { "epoch": 0.36663611365719523, "grad_norm": 1.126183659145103, "learning_rate": 5e-06, "loss": 0.7732, "step": 100 }, { "epoch": 0.4032997250229148, "grad_norm": 0.7476760341544976, "learning_rate": 5e-06, "loss": 0.7711, "step": 110 }, { "epoch": 0.4399633363886343, "grad_norm": 0.828783632948725, "learning_rate": 5e-06, "loss": 0.7637, "step": 120 }, { "epoch": 0.4766269477543538, "grad_norm": 0.7005369874659794, "learning_rate": 5e-06, "loss": 0.7617, "step": 130 }, { "epoch": 0.5132905591200734, "grad_norm": 0.6781356553576761, "learning_rate": 5e-06, "loss": 0.7562, "step": 140 }, { "epoch": 0.5499541704857929, "grad_norm": 0.6643060954517749, "learning_rate": 5e-06, "loss": 0.7601, "step": 150 }, { "epoch": 0.5866177818515124, "grad_norm": 0.654862572470797, "learning_rate": 5e-06, "loss": 0.7561, "step": 160 }, { "epoch": 0.6232813932172319, "grad_norm": 0.7126834121476828, "learning_rate": 5e-06, "loss": 0.7549, "step": 170 }, { "epoch": 0.6599450045829515, "grad_norm": 0.5845932549413623, "learning_rate": 5e-06, "loss": 0.7525, "step": 180 }, { "epoch": 0.696608615948671, "grad_norm": 0.583642927450063, "learning_rate": 5e-06, "loss": 0.7507, "step": 190 }, { "epoch": 0.7332722273143905, "grad_norm": 0.5759630428428489, "learning_rate": 5e-06, "loss": 0.7492, "step": 200 }, { "epoch": 0.76993583868011, "grad_norm": 0.597809207757354, "learning_rate": 5e-06, "loss": 0.7446, "step": 210 }, { "epoch": 0.8065994500458296, "grad_norm": 0.6520665055230834, "learning_rate": 5e-06, "loss": 0.7512, "step": 220 }, { "epoch": 0.843263061411549, "grad_norm": 0.6521761800994458, "learning_rate": 5e-06, "loss": 0.744, "step": 230 }, { "epoch": 0.8799266727772685, "grad_norm": 0.6083361886529014, "learning_rate": 5e-06, "loss": 0.7431, "step": 240 }, { "epoch": 0.916590284142988, "grad_norm": 0.8966782629847545, "learning_rate": 5e-06, "loss": 0.7399, "step": 250 }, { "epoch": 0.9532538955087076, "grad_norm": 0.6584181334872885, "learning_rate": 5e-06, "loss": 0.7457, "step": 260 }, { "epoch": 0.9899175068744271, "grad_norm": 0.5614900416740534, "learning_rate": 5e-06, "loss": 0.7434, "step": 270 }, { "epoch": 0.997250229147571, "eval_loss": 0.743977963924408, "eval_runtime": 96.6447, "eval_samples_per_second": 76.052, 
"eval_steps_per_second": 0.6, "step": 272 }, { "epoch": 1.0284142988084326, "grad_norm": 0.6377526876986616, "learning_rate": 5e-06, "loss": 0.7593, "step": 280 }, { "epoch": 1.065077910174152, "grad_norm": 0.8312923337011684, "learning_rate": 5e-06, "loss": 0.6885, "step": 290 }, { "epoch": 1.1017415215398716, "grad_norm": 0.6499984381614756, "learning_rate": 5e-06, "loss": 0.6893, "step": 300 }, { "epoch": 1.138405132905591, "grad_norm": 0.658519279927457, "learning_rate": 5e-06, "loss": 0.6868, "step": 310 }, { "epoch": 1.1750687442713108, "grad_norm": 0.6307182099292118, "learning_rate": 5e-06, "loss": 0.6885, "step": 320 }, { "epoch": 1.2117323556370303, "grad_norm": 0.6191143311988347, "learning_rate": 5e-06, "loss": 0.6871, "step": 330 }, { "epoch": 1.2483959670027498, "grad_norm": 0.6735946598593434, "learning_rate": 5e-06, "loss": 0.6935, "step": 340 }, { "epoch": 1.2850595783684693, "grad_norm": 0.7213451984916242, "learning_rate": 5e-06, "loss": 0.6943, "step": 350 }, { "epoch": 1.3217231897341888, "grad_norm": 0.5841901070016948, "learning_rate": 5e-06, "loss": 0.6938, "step": 360 }, { "epoch": 1.3583868010999083, "grad_norm": 0.6609752377099979, "learning_rate": 5e-06, "loss": 0.6856, "step": 370 }, { "epoch": 1.3950504124656278, "grad_norm": 0.6004672142282963, "learning_rate": 5e-06, "loss": 0.69, "step": 380 }, { "epoch": 1.4317140238313475, "grad_norm": 0.7494020947088555, "learning_rate": 5e-06, "loss": 0.682, "step": 390 }, { "epoch": 1.468377635197067, "grad_norm": 0.6711006066177567, "learning_rate": 5e-06, "loss": 0.6917, "step": 400 }, { "epoch": 1.5050412465627865, "grad_norm": 0.6517430215570676, "learning_rate": 5e-06, "loss": 0.6871, "step": 410 }, { "epoch": 1.541704857928506, "grad_norm": 0.6180564693914907, "learning_rate": 5e-06, "loss": 0.6829, "step": 420 }, { "epoch": 1.5783684692942255, "grad_norm": 0.5764324092377354, "learning_rate": 5e-06, "loss": 0.6824, "step": 430 }, { "epoch": 1.615032080659945, "grad_norm": 0.7134204082562298, "learning_rate": 5e-06, "loss": 0.6822, "step": 440 }, { "epoch": 1.6516956920256645, "grad_norm": 0.7630512170385407, "learning_rate": 5e-06, "loss": 0.6879, "step": 450 }, { "epoch": 1.6883593033913842, "grad_norm": 0.6285437172539765, "learning_rate": 5e-06, "loss": 0.6804, "step": 460 }, { "epoch": 1.7250229147571035, "grad_norm": 0.5968789313484854, "learning_rate": 5e-06, "loss": 0.686, "step": 470 }, { "epoch": 1.7616865261228232, "grad_norm": 0.6425175740435289, "learning_rate": 5e-06, "loss": 0.6856, "step": 480 }, { "epoch": 1.7983501374885427, "grad_norm": 0.7614365266625939, "learning_rate": 5e-06, "loss": 0.6814, "step": 490 }, { "epoch": 1.8350137488542622, "grad_norm": 0.5496379357416068, "learning_rate": 5e-06, "loss": 0.6855, "step": 500 }, { "epoch": 1.8716773602199817, "grad_norm": 0.8494093367270151, "learning_rate": 5e-06, "loss": 0.6875, "step": 510 }, { "epoch": 1.9083409715857012, "grad_norm": 0.6756166103000668, "learning_rate": 5e-06, "loss": 0.6856, "step": 520 }, { "epoch": 1.9450045829514209, "grad_norm": 0.7228484772895967, "learning_rate": 5e-06, "loss": 0.6841, "step": 530 }, { "epoch": 1.9816681943171401, "grad_norm": 0.7786774729146112, "learning_rate": 5e-06, "loss": 0.6845, "step": 540 }, { "epoch": 1.996333638863428, "eval_loss": 0.7305116057395935, "eval_runtime": 96.1532, "eval_samples_per_second": 76.441, "eval_steps_per_second": 0.603, "step": 544 }, { "epoch": 2.020164986251146, "grad_norm": 1.0425759476709966, "learning_rate": 5e-06, "loss": 0.707, "step": 550 }, { "epoch": 
2.056828597616865, "grad_norm": 0.8473344095764829, "learning_rate": 5e-06, "loss": 0.6313, "step": 560 }, { "epoch": 2.093492208982585, "grad_norm": 0.7205628261028438, "learning_rate": 5e-06, "loss": 0.6281, "step": 570 }, { "epoch": 2.130155820348304, "grad_norm": 0.6604987014823058, "learning_rate": 5e-06, "loss": 0.631, "step": 580 }, { "epoch": 2.166819431714024, "grad_norm": 0.6774961015973217, "learning_rate": 5e-06, "loss": 0.6351, "step": 590 }, { "epoch": 2.203483043079743, "grad_norm": 0.8519809292040578, "learning_rate": 5e-06, "loss": 0.634, "step": 600 }, { "epoch": 2.240146654445463, "grad_norm": 0.693823740633704, "learning_rate": 5e-06, "loss": 0.6327, "step": 610 }, { "epoch": 2.276810265811182, "grad_norm": 0.6448705487045298, "learning_rate": 5e-06, "loss": 0.6339, "step": 620 }, { "epoch": 2.313473877176902, "grad_norm": 0.5865817788059118, "learning_rate": 5e-06, "loss": 0.636, "step": 630 }, { "epoch": 2.3501374885426216, "grad_norm": 0.8116556137845999, "learning_rate": 5e-06, "loss": 0.6342, "step": 640 }, { "epoch": 2.386801099908341, "grad_norm": 0.6231657257473445, "learning_rate": 5e-06, "loss": 0.637, "step": 650 }, { "epoch": 2.4234647112740606, "grad_norm": 0.6250913266909794, "learning_rate": 5e-06, "loss": 0.63, "step": 660 }, { "epoch": 2.46012832263978, "grad_norm": 0.582068921531117, "learning_rate": 5e-06, "loss": 0.6288, "step": 670 }, { "epoch": 2.4967919340054996, "grad_norm": 0.6912367969819871, "learning_rate": 5e-06, "loss": 0.6381, "step": 680 }, { "epoch": 2.5334555453712193, "grad_norm": 0.7147652107920064, "learning_rate": 5e-06, "loss": 0.6332, "step": 690 }, { "epoch": 2.5701191567369386, "grad_norm": 0.5792260811836798, "learning_rate": 5e-06, "loss": 0.6351, "step": 700 }, { "epoch": 2.606782768102658, "grad_norm": 0.7963438662743851, "learning_rate": 5e-06, "loss": 0.6363, "step": 710 }, { "epoch": 2.6434463794683776, "grad_norm": 0.9276380358330181, "learning_rate": 5e-06, "loss": 0.6355, "step": 720 }, { "epoch": 2.6801099908340973, "grad_norm": 0.9313823270809661, "learning_rate": 5e-06, "loss": 0.6351, "step": 730 }, { "epoch": 2.7167736021998166, "grad_norm": 0.7304200587600748, "learning_rate": 5e-06, "loss": 0.638, "step": 740 }, { "epoch": 2.7534372135655363, "grad_norm": 0.6212966397528322, "learning_rate": 5e-06, "loss": 0.6388, "step": 750 }, { "epoch": 2.7901008249312556, "grad_norm": 0.6720686482466423, "learning_rate": 5e-06, "loss": 0.6364, "step": 760 }, { "epoch": 2.8267644362969753, "grad_norm": 0.6438467896193539, "learning_rate": 5e-06, "loss": 0.6421, "step": 770 }, { "epoch": 2.863428047662695, "grad_norm": 0.6043416931907646, "learning_rate": 5e-06, "loss": 0.6379, "step": 780 }, { "epoch": 2.9000916590284143, "grad_norm": 0.6496494693588303, "learning_rate": 5e-06, "loss": 0.6414, "step": 790 }, { "epoch": 2.936755270394134, "grad_norm": 0.8144443719589332, "learning_rate": 5e-06, "loss": 0.6361, "step": 800 }, { "epoch": 2.9734188817598532, "grad_norm": 0.7037764123768507, "learning_rate": 5e-06, "loss": 0.6373, "step": 810 }, { "epoch": 2.9954170485792853, "eval_loss": 0.7332214117050171, "eval_runtime": 94.4197, "eval_samples_per_second": 77.844, "eval_steps_per_second": 0.614, "step": 816 }, { "epoch": 2.9954170485792853, "step": 816, "total_flos": 1366411632967680.0, "train_loss": 0.7035682309491962, "train_runtime": 14220.3782, "train_samples_per_second": 29.46, "train_steps_per_second": 0.057 } ], "logging_steps": 10, "max_steps": 816, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, 
"stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1366411632967680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }