{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.010204081632653, "eval_steps": 9, "global_step": 99, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01020408163265306, "eval_loss": 2.5547358989715576, "eval_runtime": 10.9643, "eval_samples_per_second": 15.049, "eval_steps_per_second": 1.915, "step": 1 }, { "epoch": 0.030612244897959183, "grad_norm": 22.481250762939453, "learning_rate": 3e-05, "loss": 10.7284, "step": 3 }, { "epoch": 0.061224489795918366, "grad_norm": 8.63905143737793, "learning_rate": 6e-05, "loss": 9.5021, "step": 6 }, { "epoch": 0.09183673469387756, "grad_norm": 6.093153953552246, "learning_rate": 9e-05, "loss": 8.4335, "step": 9 }, { "epoch": 0.09183673469387756, "eval_loss": 1.867263674736023, "eval_runtime": 11.1356, "eval_samples_per_second": 14.817, "eval_steps_per_second": 1.886, "step": 9 }, { "epoch": 0.12244897959183673, "grad_norm": 5.729458332061768, "learning_rate": 9.987820251299122e-05, "loss": 7.2274, "step": 12 }, { "epoch": 0.15306122448979592, "grad_norm": 5.534293174743652, "learning_rate": 9.924038765061042e-05, "loss": 7.2317, "step": 15 }, { "epoch": 0.1836734693877551, "grad_norm": 6.290515899658203, "learning_rate": 9.806308479691595e-05, "loss": 7.4009, "step": 18 }, { "epoch": 0.1836734693877551, "eval_loss": 1.7399932146072388, "eval_runtime": 11.152, "eval_samples_per_second": 14.796, "eval_steps_per_second": 1.883, "step": 18 }, { "epoch": 0.21428571428571427, "grad_norm": 5.591012477874756, "learning_rate": 9.635919272833938e-05, "loss": 7.2483, "step": 21 }, { "epoch": 0.24489795918367346, "grad_norm": 4.81268310546875, "learning_rate": 9.414737964294636e-05, "loss": 7.0443, "step": 24 }, { "epoch": 0.2755102040816326, "grad_norm": 4.390398025512695, "learning_rate": 9.145187862775209e-05, "loss": 6.7129, "step": 27 }, { "epoch": 0.2755102040816326, "eval_loss": 1.676879644393921, "eval_runtime": 11.1798, "eval_samples_per_second": 14.759, "eval_steps_per_second": 1.878, "step": 27 }, { "epoch": 0.30612244897959184, "grad_norm": 4.704708099365234, "learning_rate": 8.83022221559489e-05, "loss": 6.8546, "step": 30 }, { "epoch": 0.336734693877551, "grad_norm": 4.343167304992676, "learning_rate": 8.473291852294987e-05, "loss": 6.9952, "step": 33 }, { "epoch": 0.3673469387755102, "grad_norm": 3.5825247764587402, "learning_rate": 8.07830737662829e-05, "loss": 6.6506, "step": 36 }, { "epoch": 0.3673469387755102, "eval_loss": 1.6432121992111206, "eval_runtime": 11.181, "eval_samples_per_second": 14.757, "eval_steps_per_second": 1.878, "step": 36 }, { "epoch": 0.3979591836734694, "grad_norm": 4.151194095611572, "learning_rate": 7.649596321166024e-05, "loss": 6.538, "step": 39 }, { "epoch": 0.42857142857142855, "grad_norm": 4.192036151885986, "learning_rate": 7.191855733945387e-05, "loss": 6.6947, "step": 42 }, { "epoch": 0.45918367346938777, "grad_norm": 4.50932502746582, "learning_rate": 6.710100716628344e-05, "loss": 6.848, "step": 45 }, { "epoch": 0.45918367346938777, "eval_loss": 1.622291922569275, "eval_runtime": 11.1576, "eval_samples_per_second": 14.788, "eval_steps_per_second": 1.882, "step": 45 }, { "epoch": 0.4897959183673469, "grad_norm": 3.9591734409332275, "learning_rate": 6.209609477998338e-05, "loss": 6.6632, "step": 48 }, { "epoch": 0.5204081632653061, "grad_norm": 3.3937931060791016, "learning_rate": 5.695865504800327e-05, "loss": 6.5888, "step": 51 }, { "epoch": 0.5510204081632653, "grad_norm": 3.470456123352051, "learning_rate": 
5.174497483512506e-05, "loss": 6.5541, "step": 54 }, { "epoch": 0.5510204081632653, "eval_loss": 1.6126476526260376, "eval_runtime": 11.1832, "eval_samples_per_second": 14.754, "eval_steps_per_second": 1.878, "step": 54 }, { "epoch": 0.5816326530612245, "grad_norm": 3.859346389770508, "learning_rate": 4.6512176312793736e-05, "loss": 6.6553, "step": 57 }, { "epoch": 0.6122448979591837, "grad_norm": 3.1429214477539062, "learning_rate": 4.131759111665349e-05, "loss": 6.4766, "step": 60 }, { "epoch": 0.6428571428571429, "grad_norm": 4.059041500091553, "learning_rate": 3.6218132209150045e-05, "loss": 6.416, "step": 63 }, { "epoch": 0.6428571428571429, "eval_loss": 1.5922267436981201, "eval_runtime": 11.174, "eval_samples_per_second": 14.766, "eval_steps_per_second": 1.879, "step": 63 }, { "epoch": 0.673469387755102, "grad_norm": 4.152980804443359, "learning_rate": 3.12696703292044e-05, "loss": 6.4115, "step": 66 }, { "epoch": 0.7040816326530612, "grad_norm": 3.4916810989379883, "learning_rate": 2.6526421860705473e-05, "loss": 6.4901, "step": 69 }, { "epoch": 0.7346938775510204, "grad_norm": 3.7989633083343506, "learning_rate": 2.2040354826462668e-05, "loss": 6.6644, "step": 72 }, { "epoch": 0.7346938775510204, "eval_loss": 1.5862826108932495, "eval_runtime": 11.1677, "eval_samples_per_second": 14.775, "eval_steps_per_second": 1.88, "step": 72 }, { "epoch": 0.7653061224489796, "grad_norm": 3.602689504623413, "learning_rate": 1.7860619515673033e-05, "loss": 6.447, "step": 75 }, { "epoch": 0.7959183673469388, "grad_norm": 3.743265390396118, "learning_rate": 1.4033009983067452e-05, "loss": 6.4277, "step": 78 }, { "epoch": 0.826530612244898, "grad_norm": 3.567052125930786, "learning_rate": 1.0599462319663905e-05, "loss": 6.7132, "step": 81 }, { "epoch": 0.826530612244898, "eval_loss": 1.5810420513153076, "eval_runtime": 11.1521, "eval_samples_per_second": 14.795, "eval_steps_per_second": 1.883, "step": 81 }, { "epoch": 0.8571428571428571, "grad_norm": 3.60418438911438, "learning_rate": 7.597595192178702e-06, "loss": 6.3228, "step": 84 }, { "epoch": 0.8877551020408163, "grad_norm": 3.443751573562622, "learning_rate": 5.060297685041659e-06, "loss": 6.6778, "step": 87 }, { "epoch": 0.9183673469387755, "grad_norm": 4.735818386077881, "learning_rate": 3.0153689607045845e-06, "loss": 6.7016, "step": 90 }, { "epoch": 0.9183673469387755, "eval_loss": 1.5796834230422974, "eval_runtime": 11.1726, "eval_samples_per_second": 14.768, "eval_steps_per_second": 1.88, "step": 90 }, { "epoch": 0.9489795918367347, "grad_norm": 3.8432347774505615, "learning_rate": 1.4852136862001764e-06, "loss": 6.5667, "step": 93 }, { "epoch": 0.9795918367346939, "grad_norm": 3.743124008178711, "learning_rate": 4.865965629214819e-07, "loss": 6.4069, "step": 96 }, { "epoch": 1.010204081632653, "grad_norm": 3.2074642181396484, "learning_rate": 3.04586490452119e-08, "loss": 6.4376, "step": 99 }, { "epoch": 1.010204081632653, "eval_loss": 1.579128384590149, "eval_runtime": 11.1546, "eval_samples_per_second": 14.792, "eval_steps_per_second": 1.883, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.43458510209024e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }