{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.4634146341463414, "eval_steps": 13, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00975609756097561, "eval_loss": 3.28096079826355, "eval_runtime": 4.8662, "eval_samples_per_second": 35.551, "eval_steps_per_second": 4.521, "step": 1 }, { "epoch": 0.02926829268292683, "grad_norm": 0.17861229181289673, "learning_rate": 1.5e-05, "loss": 3.1127, "step": 3 }, { "epoch": 0.05853658536585366, "grad_norm": 0.3140682876110077, "learning_rate": 3e-05, "loss": 3.4267, "step": 6 }, { "epoch": 0.08780487804878048, "grad_norm": 0.29320892691612244, "learning_rate": 4.5e-05, "loss": 3.371, "step": 9 }, { "epoch": 0.11707317073170732, "grad_norm": 0.17337766289710999, "learning_rate": 4.997482666353287e-05, "loss": 3.0004, "step": 12 }, { "epoch": 0.12682926829268293, "eval_loss": 3.250086545944214, "eval_runtime": 4.8803, "eval_samples_per_second": 35.448, "eval_steps_per_second": 4.508, "step": 13 }, { "epoch": 0.14634146341463414, "grad_norm": 0.4313404858112335, "learning_rate": 4.984280524733107e-05, "loss": 3.3864, "step": 15 }, { "epoch": 0.17560975609756097, "grad_norm": 0.2435210794210434, "learning_rate": 4.959823971496574e-05, "loss": 2.9856, "step": 18 }, { "epoch": 0.2048780487804878, "grad_norm": 0.35940220952033997, "learning_rate": 4.9242238009417175e-05, "loss": 3.1762, "step": 21 }, { "epoch": 0.23414634146341465, "grad_norm": 0.30893248319625854, "learning_rate": 4.877641290737884e-05, "loss": 2.8997, "step": 24 }, { "epoch": 0.25365853658536586, "eval_loss": 3.1296775341033936, "eval_runtime": 4.8635, "eval_samples_per_second": 35.571, "eval_steps_per_second": 4.523, "step": 26 }, { "epoch": 0.2634146341463415, "grad_norm": 0.22518818080425262, "learning_rate": 4.820287471297598e-05, "loss": 2.7146, "step": 27 }, { "epoch": 0.2926829268292683, "grad_norm": 0.14826013147830963, "learning_rate": 4.752422169756048e-05, "loss": 2.7433, "step": 30 }, { "epoch": 0.32195121951219513, "grad_norm": 0.16330775618553162, "learning_rate": 4.674352832889239e-05, "loss": 2.7485, "step": 33 }, { "epoch": 0.35121951219512193, "grad_norm": 0.2697175145149231, "learning_rate": 4.586433134303257e-05, "loss": 3.1459, "step": 36 }, { "epoch": 0.3804878048780488, "grad_norm": 0.1805887669324875, "learning_rate": 4.489061372204453e-05, "loss": 2.7904, "step": 39 }, { "epoch": 0.3804878048780488, "eval_loss": 2.998751163482666, "eval_runtime": 4.8535, "eval_samples_per_second": 35.644, "eval_steps_per_second": 4.533, "step": 39 }, { "epoch": 0.4097560975609756, "grad_norm": 0.3743424713611603, "learning_rate": 4.382678665009028e-05, "loss": 3.0961, "step": 42 }, { "epoch": 0.43902439024390244, "grad_norm": 0.21087686717510223, "learning_rate": 4.267766952966369e-05, "loss": 2.7703, "step": 45 }, { "epoch": 0.4682926829268293, "grad_norm": 0.23139822483062744, "learning_rate": 4.144846814849282e-05, "loss": 3.0125, "step": 48 }, { "epoch": 0.4975609756097561, "grad_norm": 0.23395372927188873, "learning_rate": 4.01447510960205e-05, "loss": 2.9427, "step": 51 }, { "epoch": 0.5073170731707317, "eval_loss": 2.8844926357269287, "eval_runtime": 4.864, "eval_samples_per_second": 35.567, "eval_steps_per_second": 4.523, "step": 52 }, { "epoch": 0.526829268292683, "grad_norm": 0.3578898310661316, "learning_rate": 3.8772424536302564e-05, "loss": 2.7399, "step": 54 }, { "epoch": 0.5560975609756098, "grad_norm": 0.301941454410553, "learning_rate": 3.7337705451608674e-05, "loss": 2.7642, "step": 57 }, { "epoch": 0.5853658536585366, "grad_norm": 0.21311171352863312, "learning_rate": 3.5847093477938956e-05, "loss": 2.8332, "step": 60 }, { "epoch": 0.6146341463414634, "grad_norm": 0.22062243521213531, "learning_rate": 3.4307341460048633e-05, "loss": 2.5688, "step": 63 }, { "epoch": 0.6341463414634146, "eval_loss": 2.812589645385742, "eval_runtime": 4.8706, "eval_samples_per_second": 35.519, "eval_steps_per_second": 4.517, "step": 65 }, { "epoch": 0.6439024390243903, "grad_norm": 0.24126602709293365, "learning_rate": 3.272542485937369e-05, "loss": 2.6723, "step": 66 }, { "epoch": 0.6731707317073171, "grad_norm": 0.20950078964233398, "learning_rate": 3.110851015344735e-05, "loss": 2.9801, "step": 69 }, { "epoch": 0.7024390243902439, "grad_norm": 0.1919126808643341, "learning_rate": 2.9463922369965917e-05, "loss": 2.7774, "step": 72 }, { "epoch": 0.7317073170731707, "grad_norm": 0.2704748809337616, "learning_rate": 2.7799111902582696e-05, "loss": 2.6039, "step": 75 }, { "epoch": 0.7609756097560976, "grad_norm": 0.24143126606941223, "learning_rate": 2.6121620758762877e-05, "loss": 2.9561, "step": 78 }, { "epoch": 0.7609756097560976, "eval_loss": 2.77209210395813, "eval_runtime": 4.8697, "eval_samples_per_second": 35.526, "eval_steps_per_second": 4.518, "step": 78 }, { "epoch": 0.7902439024390244, "grad_norm": 0.3689444661140442, "learning_rate": 2.443904839260488e-05, "loss": 2.7232, "step": 81 }, { "epoch": 0.8195121951219512, "grad_norm": 0.33407869935035706, "learning_rate": 2.2759017277414166e-05, "loss": 2.5639, "step": 84 }, { "epoch": 0.848780487804878, "grad_norm": 0.21137681603431702, "learning_rate": 2.1089138373994223e-05, "loss": 2.5741, "step": 87 }, { "epoch": 0.8780487804878049, "grad_norm": 0.2102205455303192, "learning_rate": 1.9436976651092144e-05, "loss": 2.7756, "step": 90 }, { "epoch": 0.8878048780487805, "eval_loss": 2.7437009811401367, "eval_runtime": 4.8619, "eval_samples_per_second": 35.583, "eval_steps_per_second": 4.525, "step": 91 }, { "epoch": 0.9073170731707317, "grad_norm": 0.2911361753940582, "learning_rate": 1.781001681419957e-05, "loss": 2.75, "step": 93 }, { "epoch": 0.9365853658536586, "grad_norm": 0.4980337619781494, "learning_rate": 1.621562939796643e-05, "loss": 2.612, "step": 96 }, { "epoch": 0.9658536585365853, "grad_norm": 0.3111054599285126, "learning_rate": 1.466103737583699e-05, "loss": 2.6542, "step": 99 }, { "epoch": 0.9951219512195122, "grad_norm": 0.2570768892765045, "learning_rate": 1.3153283438175034e-05, "loss": 3.0268, "step": 102 }, { "epoch": 1.0146341463414634, "eval_loss": 2.723863124847412, "eval_runtime": 4.857, "eval_samples_per_second": 35.619, "eval_steps_per_second": 4.53, "step": 104 }, { "epoch": 1.024390243902439, "grad_norm": 0.3360951542854309, "learning_rate": 1.1699198087116589e-05, "loss": 3.0774, "step": 105 }, { "epoch": 1.053658536585366, "grad_norm": 0.28752246499061584, "learning_rate": 1.0305368692688174e-05, "loss": 2.8053, "step": 108 }, { "epoch": 1.0829268292682928, "grad_norm": 0.2592686712741852, "learning_rate": 8.978109650374397e-06, "loss": 2.9168, "step": 111 }, { "epoch": 1.1121951219512196, "grad_norm": 0.1692250370979309, "learning_rate": 7.723433775328384e-06, "loss": 2.4492, "step": 114 }, { "epoch": 1.1414634146341462, "grad_norm": 0.2585885226726532, "learning_rate": 6.547025062816486e-06, "loss": 2.6056, "step": 117 }, { "epoch": 1.1414634146341462, "eval_loss": 2.7146248817443848, "eval_runtime": 4.869, "eval_samples_per_second": 35.531, "eval_steps_per_second": 4.518, "step": 117 }, { "epoch": 1.170731707317073, "grad_norm": 0.2549976706504822, "learning_rate": 5.454212938299255e-06, "loss": 2.8282, "step": 120 }, { "epoch": 1.2, "grad_norm": 0.31321239471435547, "learning_rate": 4.4499481138022544e-06, "loss": 2.5444, "step": 123 }, { "epoch": 1.2292682926829268, "grad_norm": 0.2727275490760803, "learning_rate": 3.5387801599533475e-06, "loss": 2.4951, "step": 126 }, { "epoch": 1.2585365853658537, "grad_norm": 0.26122474670410156, "learning_rate": 2.7248368952908053e-06, "loss": 2.5487, "step": 129 }, { "epoch": 1.2682926829268293, "eval_loss": 2.7085165977478027, "eval_runtime": 4.86, "eval_samples_per_second": 35.597, "eval_steps_per_second": 4.527, "step": 130 }, { "epoch": 1.2878048780487805, "grad_norm": 0.268062025308609, "learning_rate": 2.0118056862137357e-06, "loss": 3.0802, "step": 132 }, { "epoch": 1.3170731707317074, "grad_norm": 0.21112675964832306, "learning_rate": 1.4029167422908107e-06, "loss": 2.1312, "step": 135 }, { "epoch": 1.346341463414634, "grad_norm": 0.3374578356742859, "learning_rate": 9.009284826036691e-07, "loss": 2.6085, "step": 138 }, { "epoch": 1.3756097560975609, "grad_norm": 0.2656230330467224, "learning_rate": 5.08115039419113e-07, "loss": 2.8064, "step": 141 }, { "epoch": 1.395121951219512, "eval_loss": 2.7080576419830322, "eval_runtime": 4.8544, "eval_samples_per_second": 35.638, "eval_steps_per_second": 4.532, "step": 143 }, { "epoch": 1.4048780487804877, "grad_norm": 0.25628089904785156, "learning_rate": 2.262559558016325e-07, "loss": 2.3778, "step": 144 }, { "epoch": 1.4341463414634146, "grad_norm": 0.17649215459823608, "learning_rate": 5.662812383859795e-08, "loss": 2.3724, "step": 147 }, { "epoch": 1.4634146341463414, "grad_norm": 0.3153819739818573, "learning_rate": 0.0, "loss": 2.8507, "step": 150 } ], "logging_steps": 3, "max_steps": 150, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.115759104832307e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }