{ "best_metric": null, "best_model_checkpoint": null, "epoch": 16.0, "eval_steps": 500, "global_step": 336, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 328.8559875488281, "learning_rate": 5e-06, "loss": 127.4109, "step": 21 }, { "epoch": 1.0, "eval_loss": 478.246337890625, "eval_runtime": 5.1173, "eval_samples_per_second": 63.119, "eval_steps_per_second": 3.322, "step": 21 }, { "epoch": 2.0, "grad_norm": 174.37905883789062, "learning_rate": 1e-05, "loss": 36.0425, "step": 42 }, { "epoch": 2.0, "eval_loss": 390.6916198730469, "eval_runtime": 4.0686, "eval_samples_per_second": 79.389, "eval_steps_per_second": 4.178, "step": 42 }, { "epoch": 3.0, "grad_norm": 75.46469116210938, "learning_rate": 9.444444444444445e-06, "loss": 15.8166, "step": 63 }, { "epoch": 3.0, "eval_loss": 175.55548095703125, "eval_runtime": 4.1777, "eval_samples_per_second": 77.316, "eval_steps_per_second": 4.069, "step": 63 }, { "epoch": 4.0, "grad_norm": 47.89638900756836, "learning_rate": 8.888888888888888e-06, "loss": 8.0933, "step": 84 }, { "epoch": 4.0, "eval_loss": 119.244384765625, "eval_runtime": 4.1061, "eval_samples_per_second": 78.663, "eval_steps_per_second": 4.14, "step": 84 }, { "epoch": 5.0, "grad_norm": 39.76791000366211, "learning_rate": 8.333333333333334e-06, "loss": 5.7852, "step": 105 }, { "epoch": 5.0, "eval_loss": 102.83723449707031, "eval_runtime": 3.9939, "eval_samples_per_second": 80.873, "eval_steps_per_second": 4.256, "step": 105 }, { "epoch": 6.0, "grad_norm": 17.662046432495117, "learning_rate": 7.77777777777778e-06, "loss": 4.1072, "step": 126 }, { "epoch": 6.0, "eval_loss": 92.03376770019531, "eval_runtime": 3.9224, "eval_samples_per_second": 82.348, "eval_steps_per_second": 4.334, "step": 126 }, { "epoch": 7.0, "grad_norm": 22.999319076538086, "learning_rate": 7.222222222222223e-06, "loss": 3.0971, "step": 147 }, { "epoch": 7.0, "eval_loss": 84.36630249023438, "eval_runtime": 4.4688, "eval_samples_per_second": 72.278, "eval_steps_per_second": 3.804, "step": 147 }, { "epoch": 8.0, "grad_norm": 14.748945236206055, "learning_rate": 6.666666666666667e-06, "loss": 1.9596, "step": 168 }, { "epoch": 8.0, "eval_loss": 81.93232727050781, "eval_runtime": 4.7697, "eval_samples_per_second": 67.718, "eval_steps_per_second": 3.564, "step": 168 }, { "epoch": 9.0, "grad_norm": 55.0167350769043, "learning_rate": 6.111111111111112e-06, "loss": 1.2031, "step": 189 }, { "epoch": 9.0, "eval_loss": 80.85563659667969, "eval_runtime": 4.4821, "eval_samples_per_second": 72.064, "eval_steps_per_second": 3.793, "step": 189 }, { "epoch": 10.0, "grad_norm": 59.18910598754883, "learning_rate": 5.555555555555557e-06, "loss": 0.8863, "step": 210 }, { "epoch": 10.0, "eval_loss": 98.6752700805664, "eval_runtime": 4.4465, "eval_samples_per_second": 72.642, "eval_steps_per_second": 3.823, "step": 210 }, { "epoch": 11.0, "grad_norm": 1.2452582120895386, "learning_rate": 5e-06, "loss": 0.3703, "step": 231 }, { "epoch": 11.0, "eval_loss": 102.78790283203125, "eval_runtime": 4.4585, "eval_samples_per_second": 72.446, "eval_steps_per_second": 3.813, "step": 231 }, { "epoch": 12.0, "grad_norm": 51.24710464477539, "learning_rate": 4.444444444444444e-06, "loss": 0.2899, "step": 252 }, { "epoch": 12.0, "eval_loss": 105.00616455078125, "eval_runtime": 4.4633, "eval_samples_per_second": 72.368, "eval_steps_per_second": 3.809, "step": 252 }, { "epoch": 13.0, "grad_norm": 0.3481183648109436, "learning_rate": 3.88888888888889e-06, "loss": 0.193, "step": 273 }, { "epoch": 13.0, "eval_loss": 119.50952911376953, "eval_runtime": 4.454, "eval_samples_per_second": 72.519, "eval_steps_per_second": 3.817, "step": 273 }, { "epoch": 14.0, "grad_norm": 0.8411530256271362, "learning_rate": 3.3333333333333333e-06, "loss": 0.0836, "step": 294 }, { "epoch": 14.0, "eval_loss": 126.7031478881836, "eval_runtime": 4.2646, "eval_samples_per_second": 75.739, "eval_steps_per_second": 3.986, "step": 294 }, { "epoch": 15.0, "grad_norm": 16.207983016967773, "learning_rate": 2.7777777777777783e-06, "loss": 0.1528, "step": 315 }, { "epoch": 15.0, "eval_loss": 137.7791748046875, "eval_runtime": 4.1698, "eval_samples_per_second": 77.462, "eval_steps_per_second": 4.077, "step": 315 }, { "epoch": 16.0, "grad_norm": 0.5255839824676514, "learning_rate": 2.222222222222222e-06, "loss": 0.035, "step": 336 }, { "epoch": 16.0, "eval_loss": 146.33724975585938, "eval_runtime": 4.2416, "eval_samples_per_second": 76.151, "eval_steps_per_second": 4.008, "step": 336 } ], "logging_steps": 500, "max_steps": 420, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 20, "trial_name": null, "trial_params": null }