{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0896, "eval_steps": 500, "global_step": 14000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 1.7230606079101562, "learning_rate": 4.99208e-05, "loss": 1.2281, "step": 500 }, { "epoch": 0.0064, "grad_norm": 3.655383348464966, "learning_rate": 4.9840800000000006e-05, "loss": 0.7566, "step": 1000 }, { "epoch": 0.0096, "grad_norm": 1.2925927639007568, "learning_rate": 4.97608e-05, "loss": 0.6764, "step": 1500 }, { "epoch": 0.0128, "grad_norm": 1.286004900932312, "learning_rate": 4.968080000000001e-05, "loss": 0.6304, "step": 2000 }, { "epoch": 0.016, "grad_norm": 1.2140214443206787, "learning_rate": 4.96008e-05, "loss": 0.5981, "step": 2500 }, { "epoch": 0.0192, "grad_norm": 1.2525482177734375, "learning_rate": 4.95208e-05, "loss": 0.5767, "step": 3000 }, { "epoch": 0.0224, "grad_norm": 1.2310410737991333, "learning_rate": 4.94408e-05, "loss": 0.5597, "step": 3500 }, { "epoch": 0.0256, "grad_norm": 1.1735206842422485, "learning_rate": 4.9360800000000004e-05, "loss": 0.5418, "step": 4000 }, { "epoch": 0.0288, "grad_norm": 1.114688754081726, "learning_rate": 4.9280800000000004e-05, "loss": 0.5335, "step": 4500 }, { "epoch": 0.032, "grad_norm": 0.8874593377113342, "learning_rate": 4.9200800000000005e-05, "loss": 0.5237, "step": 5000 }, { "epoch": 0.0352, "grad_norm": 1.1261299848556519, "learning_rate": 4.91208e-05, "loss": 0.5135, "step": 5500 }, { "epoch": 0.0384, "grad_norm": 0.9994556307792664, "learning_rate": 4.9040800000000007e-05, "loss": 0.5059, "step": 6000 }, { "epoch": 0.0416, "grad_norm": 1.2349673509597778, "learning_rate": 4.89608e-05, "loss": 0.4939, "step": 6500 }, { "epoch": 0.0448, "grad_norm": 0.9770995378494263, "learning_rate": 4.88808e-05, "loss": 0.4824, "step": 7000 }, { "epoch": 0.048, "grad_norm": 0.981966495513916, "learning_rate": 4.88008e-05, "loss": 0.4875, "step": 7500 }, { "epoch": 0.0512, "grad_norm": 1.0177415609359741, "learning_rate": 4.87208e-05, "loss": 0.4785, "step": 8000 }, { "epoch": 0.0544, "grad_norm": 1.0521667003631592, "learning_rate": 4.8640800000000004e-05, "loss": 0.4731, "step": 8500 }, { "epoch": 0.0576, "grad_norm": 0.8560615181922913, "learning_rate": 4.85608e-05, "loss": 0.4633, "step": 9000 }, { "epoch": 0.0608, "grad_norm": 1.0170217752456665, "learning_rate": 4.8480800000000005e-05, "loss": 0.4576, "step": 9500 }, { "epoch": 0.064, "grad_norm": 0.9891325831413269, "learning_rate": 4.84008e-05, "loss": 0.4556, "step": 10000 }, { "epoch": 0.0672, "grad_norm": 1.0609711408615112, "learning_rate": 4.832080000000001e-05, "loss": 0.4493, "step": 10500 }, { "epoch": 0.0704, "grad_norm": 0.8623799681663513, "learning_rate": 4.82408e-05, "loss": 0.4459, "step": 11000 }, { "epoch": 0.0736, "grad_norm": 0.9587870240211487, "learning_rate": 4.81608e-05, "loss": 0.4418, "step": 11500 }, { "epoch": 0.0768, "grad_norm": 0.8939447999000549, "learning_rate": 4.80808e-05, "loss": 0.4327, "step": 12000 }, { "epoch": 0.08, "grad_norm": 0.9886033535003662, "learning_rate": 4.80008e-05, "loss": 0.438, "step": 12500 }, { "epoch": 0.0832, "grad_norm": 0.9157513976097107, "learning_rate": 4.7920800000000004e-05, "loss": 0.4323, "step": 13000 }, { "epoch": 0.0864, "grad_norm": 0.9085854887962341, "learning_rate": 4.7840800000000005e-05, "loss": 0.4303, "step": 13500 }, { "epoch": 0.0896, "grad_norm": 0.9123984575271606, "learning_rate": 4.77608e-05, "loss": 0.4247, "step": 14000 } ], "logging_steps": 500, 
"max_steps": 312500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.820328374272e+16, "train_batch_size": 64, "trial_name": null, "trial_params": null }