{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 15010, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1665556295802798, "grad_norm": 1.6862213611602783, "learning_rate": 0.00048334443704197203, "loss": 2.122, "step": 500 }, { "epoch": 0.3331112591605596, "grad_norm": 0.9895781874656677, "learning_rate": 0.00046668887408394405, "loss": 1.6754, "step": 1000 }, { "epoch": 0.4996668887408394, "grad_norm": 1.4011154174804688, "learning_rate": 0.00045003331112591607, "loss": 1.5258, "step": 1500 }, { "epoch": 0.6662225183211192, "grad_norm": 1.263901948928833, "learning_rate": 0.0004333777481678881, "loss": 1.3949, "step": 2000 }, { "epoch": 0.832778147901399, "grad_norm": 0.9926736354827881, "learning_rate": 0.0004167221852098601, "loss": 1.323, "step": 2500 }, { "epoch": 0.9993337774816788, "grad_norm": 1.2104265689849854, "learning_rate": 0.0004000666222518321, "loss": 1.2978, "step": 3000 }, { "epoch": 1.0, "eval_loss": 1.1156065464019775, "eval_runtime": 37.2553, "eval_samples_per_second": 80.579, "eval_steps_per_second": 10.093, "step": 3002 }, { "epoch": 1.1658894070619588, "grad_norm": 0.9220499992370605, "learning_rate": 0.00038341105929380414, "loss": 0.964, "step": 3500 }, { "epoch": 1.3324450366422385, "grad_norm": 1.2684720754623413, "learning_rate": 0.00036675549633577616, "loss": 0.9784, "step": 4000 }, { "epoch": 1.4990006662225184, "grad_norm": 1.3914306163787842, "learning_rate": 0.0003500999333777481, "loss": 0.9265, "step": 4500 }, { "epoch": 1.6655562958027983, "grad_norm": 1.0393187999725342, "learning_rate": 0.0003334443704197202, "loss": 0.9401, "step": 5000 }, { "epoch": 1.832111925383078, "grad_norm": 1.1595275402069092, "learning_rate": 0.0003167888074616922, "loss": 0.9091, "step": 5500 }, { "epoch": 1.9986675549633577, "grad_norm": 1.2401949167251587, "learning_rate": 0.00030013324450366423, "loss": 0.9271, "step": 6000 }, { "epoch": 2.0, "eval_loss": 0.9488099217414856, "eval_runtime": 37.415, "eval_samples_per_second": 80.235, "eval_steps_per_second": 10.049, "step": 6004 }, { "epoch": 2.1652231845436374, "grad_norm": 0.6324372887611389, "learning_rate": 0.00028347768154563625, "loss": 0.6576, "step": 6500 }, { "epoch": 2.3317788141239175, "grad_norm": 1.377943992614746, "learning_rate": 0.00026682211858760827, "loss": 0.6499, "step": 7000 }, { "epoch": 2.498334443704197, "grad_norm": 0.9929794669151306, "learning_rate": 0.0002501665556295803, "loss": 0.6509, "step": 7500 }, { "epoch": 2.664890073284477, "grad_norm": 1.331009030342102, "learning_rate": 0.0002335109926715523, "loss": 0.6492, "step": 8000 }, { "epoch": 2.831445702864757, "grad_norm": 0.9260538816452026, "learning_rate": 0.00021685542971352432, "loss": 0.6765, "step": 8500 }, { "epoch": 2.9980013324450367, "grad_norm": 1.6844342947006226, "learning_rate": 0.00020019986675549634, "loss": 0.6452, "step": 9000 }, { "epoch": 3.0, "eval_loss": 0.9022971391677856, "eval_runtime": 37.548, "eval_samples_per_second": 79.951, "eval_steps_per_second": 10.014, "step": 9006 }, { "epoch": 3.1645569620253164, "grad_norm": 0.6371086835861206, "learning_rate": 0.00018354430379746836, "loss": 0.4646, "step": 9500 }, { "epoch": 3.331112591605596, "grad_norm": 1.005698323249817, "learning_rate": 0.00016688874083944038, "loss": 0.4706, "step": 10000 }, { "epoch": 3.497668221185876, "grad_norm": 0.990774393081665, "learning_rate": 0.0001502331778814124, "loss": 0.434, "step": 10500 }, { "epoch": 3.664223850766156, "grad_norm": 0.7444645762443542, "learning_rate": 0.00013357761492338441, "loss": 0.4682, "step": 11000 }, { "epoch": 3.8307794803464357, "grad_norm": 1.1938289403915405, "learning_rate": 0.00011692205196535643, "loss": 0.4428, "step": 11500 }, { "epoch": 3.9973351099267154, "grad_norm": 1.2232627868652344, "learning_rate": 0.00010026648900732845, "loss": 0.4446, "step": 12000 }, { "epoch": 4.0, "eval_loss": 0.9124976396560669, "eval_runtime": 37.6584, "eval_samples_per_second": 79.717, "eval_steps_per_second": 9.985, "step": 12008 }, { "epoch": 4.1638907395069955, "grad_norm": 1.035305380821228, "learning_rate": 8.361092604930047e-05, "loss": 0.3292, "step": 12500 }, { "epoch": 4.330446369087275, "grad_norm": 1.409875512123108, "learning_rate": 6.695536309127249e-05, "loss": 0.321, "step": 13000 }, { "epoch": 4.497001998667555, "grad_norm": 1.468259334564209, "learning_rate": 5.0299800133244506e-05, "loss": 0.3161, "step": 13500 }, { "epoch": 4.663557628247835, "grad_norm": 1.00761878490448, "learning_rate": 3.3644237175216524e-05, "loss": 0.3214, "step": 14000 }, { "epoch": 4.830113257828114, "grad_norm": 0.7190210223197937, "learning_rate": 1.698867421718854e-05, "loss": 0.3037, "step": 14500 }, { "epoch": 4.996668887408394, "grad_norm": 0.6449595093727112, "learning_rate": 3.3311125916055966e-07, "loss": 0.3045, "step": 15000 } ], "logging_steps": 500, "max_steps": 15010, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6083104659545088.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }