{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.4984384759525295, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.12492192379762648, "grad_norm": 0.27754032611846924, "learning_rate": 2e-05, "loss": 0.1442, "step": 100 }, { "epoch": 0.24984384759525297, "grad_norm": 0.2751781642436981, "learning_rate": 2e-05, "loss": 0.088, "step": 200 }, { "epoch": 0.3747657713928795, "grad_norm": 0.23674216866493225, "learning_rate": 2e-05, "loss": 0.0738, "step": 300 }, { "epoch": 0.49968769519050593, "grad_norm": 0.23040202260017395, "learning_rate": 2e-05, "loss": 0.0643, "step": 400 }, { "epoch": 0.6246096189881324, "grad_norm": 0.26208606362342834, "learning_rate": 2e-05, "loss": 0.0584, "step": 500 }, { "epoch": 0.749531542785759, "grad_norm": 0.2172231376171112, "learning_rate": 2e-05, "loss": 0.0534, "step": 600 }, { "epoch": 0.8744534665833854, "grad_norm": 0.21661953628063202, "learning_rate": 2e-05, "loss": 0.0502, "step": 700 }, { "epoch": 0.9993753903810119, "grad_norm": 0.186213880777359, "learning_rate": 2e-05, "loss": 0.0471, "step": 800 }, { "epoch": 1.1242973141786383, "grad_norm": 0.2086647003889084, "learning_rate": 2e-05, "loss": 0.0373, "step": 900 }, { "epoch": 1.2492192379762648, "grad_norm": 0.1900208592414856, "learning_rate": 2e-05, "loss": 0.0351, "step": 1000 }, { "epoch": 1.3741411617738915, "grad_norm": 0.21206732094287872, "learning_rate": 2e-05, "loss": 0.0349, "step": 1100 }, { "epoch": 1.499063085571518, "grad_norm": 0.20564615726470947, "learning_rate": 2e-05, "loss": 0.0332, "step": 1200 }, { "epoch": 1.6239850093691444, "grad_norm": 0.2596442997455597, "learning_rate": 2e-05, "loss": 0.0313, "step": 1300 }, { "epoch": 1.7489069331667708, "grad_norm": 0.18866442143917084, "learning_rate": 2e-05, "loss": 0.0309, "step": 1400 }, { "epoch": 1.8738288569643973, "grad_norm": 0.19815115630626678, "learning_rate": 2e-05, "loss": 0.0298, "step": 1500 }, { "epoch": 1.9987507807620237, "grad_norm": 0.21096043288707733, "learning_rate": 2e-05, "loss": 0.0281, "step": 1600 }, { "epoch": 2.12367270455965, "grad_norm": 0.1663718819618225, "learning_rate": 2e-05, "loss": 0.0191, "step": 1700 }, { "epoch": 2.2485946283572766, "grad_norm": 0.1737908571958542, "learning_rate": 2e-05, "loss": 0.0191, "step": 1800 }, { "epoch": 2.373516552154903, "grad_norm": 0.21257372200489044, "learning_rate": 2e-05, "loss": 0.0191, "step": 1900 }, { "epoch": 2.4984384759525295, "grad_norm": 0.21343937516212463, "learning_rate": 2e-05, "loss": 0.0185, "step": 2000 } ], "logging_steps": 100, "max_steps": 2400, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 94237318184960.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }