{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.992412746585736, "eval_steps": 500, "global_step": 246, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.12139605462822459, "grad_norm": 1.9068916730195116, "learning_rate": 5e-06, "loss": 0.7721, "step": 10 }, { "epoch": 0.24279210925644917, "grad_norm": 5.44810384092203, "learning_rate": 5e-06, "loss": 0.6638, "step": 20 }, { "epoch": 0.36418816388467373, "grad_norm": 0.8162292861303865, "learning_rate": 5e-06, "loss": 0.6289, "step": 30 }, { "epoch": 0.48558421851289835, "grad_norm": 0.724722400146013, "learning_rate": 5e-06, "loss": 0.6062, "step": 40 }, { "epoch": 0.6069802731411229, "grad_norm": 0.6325755456980601, "learning_rate": 5e-06, "loss": 0.5886, "step": 50 }, { "epoch": 0.7283763277693475, "grad_norm": 0.5102096530669636, "learning_rate": 5e-06, "loss": 0.5763, "step": 60 }, { "epoch": 0.849772382397572, "grad_norm": 0.6134528530146113, "learning_rate": 5e-06, "loss": 0.5635, "step": 70 }, { "epoch": 0.9711684370257967, "grad_norm": 0.6520975040339092, "learning_rate": 5e-06, "loss": 0.5578, "step": 80 }, { "epoch": 0.9954476479514416, "eval_loss": 0.5475569367408752, "eval_runtime": 31.1849, "eval_samples_per_second": 71.156, "eval_steps_per_second": 1.122, "step": 82 }, { "epoch": 1.095599393019727, "grad_norm": 0.9030012716394636, "learning_rate": 5e-06, "loss": 0.5739, "step": 90 }, { "epoch": 1.2169954476479514, "grad_norm": 0.5546978323548724, "learning_rate": 5e-06, "loss": 0.5098, "step": 100 }, { "epoch": 1.338391502276176, "grad_norm": 0.7373972665017838, "learning_rate": 5e-06, "loss": 0.5099, "step": 110 }, { "epoch": 1.4597875569044005, "grad_norm": 0.8335652060900699, "learning_rate": 5e-06, "loss": 0.5086, "step": 120 }, { "epoch": 1.5811836115326252, "grad_norm": 0.9482742766383457, "learning_rate": 5e-06, "loss": 0.4988, "step": 130 }, { "epoch": 1.7025796661608497, "grad_norm": 0.7107692585969188, "learning_rate": 5e-06, "loss": 0.4973, "step": 140 }, { "epoch": 1.8239757207890743, "grad_norm": 0.6956293579831972, "learning_rate": 5e-06, "loss": 0.5006, "step": 150 }, { "epoch": 1.945371775417299, "grad_norm": 0.5537353905530825, "learning_rate": 5e-06, "loss": 0.4962, "step": 160 }, { "epoch": 1.9939301972685888, "eval_loss": 0.5237926244735718, "eval_runtime": 31.2236, "eval_samples_per_second": 71.068, "eval_steps_per_second": 1.121, "step": 164 }, { "epoch": 2.069802731411229, "grad_norm": 0.9676846382246841, "learning_rate": 5e-06, "loss": 0.5181, "step": 170 }, { "epoch": 2.191198786039454, "grad_norm": 1.0604432739536909, "learning_rate": 5e-06, "loss": 0.4483, "step": 180 }, { "epoch": 2.3125948406676784, "grad_norm": 0.9072874578025836, "learning_rate": 5e-06, "loss": 0.4481, "step": 190 }, { "epoch": 2.433990895295903, "grad_norm": 0.93397296136386, "learning_rate": 5e-06, "loss": 0.4503, "step": 200 }, { "epoch": 2.5553869499241273, "grad_norm": 0.5608137627707893, "learning_rate": 5e-06, "loss": 0.4475, "step": 210 }, { "epoch": 2.676783004552352, "grad_norm": 0.7216376866533744, "learning_rate": 5e-06, "loss": 0.4491, "step": 220 }, { "epoch": 2.7981790591805766, "grad_norm": 0.7502162152741092, "learning_rate": 5e-06, "loss": 0.4475, "step": 230 }, { "epoch": 2.919575113808801, "grad_norm": 0.6902724753233441, "learning_rate": 5e-06, "loss": 0.4505, "step": 240 }, { "epoch": 2.992412746585736, "eval_loss": 0.5186718702316284, "eval_runtime": 31.6067, "eval_samples_per_second": 70.207, "eval_steps_per_second": 1.107, "step": 246 }, { "epoch": 2.992412746585736, "step": 246, "total_flos": 411849782722560.0, "train_loss": 0.5282489497487138, "train_runtime": 6430.2484, "train_samples_per_second": 19.666, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 246, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 411849782722560.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }