{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 237, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004219409282700422, "grad_norm": 41.47472723630441, "learning_rate": 1.2499999999999999e-05, "loss": 2.7746, "step": 1 }, { "epoch": 0.02109704641350211, "grad_norm": 64.04925382439093, "learning_rate": 6.25e-05, "loss": 7.0206, "step": 5 }, { "epoch": 0.04219409282700422, "grad_norm": 31.695867233164076, "learning_rate": 0.000125, "loss": 2.0729, "step": 10 }, { "epoch": 0.06329113924050633, "grad_norm": 20.697061646205608, "learning_rate": 0.00018749999999999998, "loss": 2.3708, "step": 15 }, { "epoch": 0.08438818565400844, "grad_norm": 27.19753629081583, "learning_rate": 0.00025, "loss": 2.0771, "step": 20 }, { "epoch": 0.10548523206751055, "grad_norm": 98.00456381942149, "learning_rate": 0.0002999836847426398, "loss": 4.1945, "step": 25 }, { "epoch": 0.12658227848101267, "grad_norm": 182.51493263483027, "learning_rate": 0.0002994130233112417, "loss": 3.4014, "step": 30 }, { "epoch": 0.14767932489451477, "grad_norm": 11.535478461037668, "learning_rate": 0.00029803014471122853, "loss": 2.4622, "step": 35 }, { "epoch": 0.16877637130801687, "grad_norm": 20.197984273278912, "learning_rate": 0.00029584256634447164, "loss": 2.6132, "step": 40 }, { "epoch": 0.189873417721519, "grad_norm": 8.178205900726194, "learning_rate": 0.0002928621800037197, "loss": 1.9102, "step": 45 }, { "epoch": 0.2109704641350211, "grad_norm": 5.9959330455646125, "learning_rate": 0.00028910518722817866, "loss": 1.7519, "step": 50 }, { "epoch": 0.2320675105485232, "grad_norm": 13.868140556663864, "learning_rate": 0.00028459201123108015, "loss": 1.7839, "step": 55 }, { "epoch": 0.25316455696202533, "grad_norm": 8.017800813910466, "learning_rate": 0.00027934718587800417, "loss": 1.6271, "step": 60 }, { "epoch": 0.2742616033755274, "grad_norm": 5.37043737910495, "learning_rate": 0.0002733992223194766, "loss": 1.5652, "step": 65 }, { "epoch": 0.29535864978902954, "grad_norm": 3.3275774087311523, "learning_rate": 0.00026678045400283336, "loss": 1.5245, "step": 70 }, { "epoch": 0.31645569620253167, "grad_norm": 4.741642465345151, "learning_rate": 0.0002595268609058751, "loss": 1.5238, "step": 75 }, { "epoch": 0.33755274261603374, "grad_norm": 4.030676855228214, "learning_rate": 0.00025167787394778763, "loss": 1.5316, "step": 80 }, { "epoch": 0.35864978902953587, "grad_norm": 2.119168421995465, "learning_rate": 0.000243276160640561, "loss": 1.4865, "step": 85 }, { "epoch": 0.379746835443038, "grad_norm": 9.504914395443073, "learning_rate": 0.0002343673931461171, "loss": 1.4966, "step": 90 }, { "epoch": 0.4008438818565401, "grad_norm": 1.9138438030380622, "learning_rate": 0.000225, "loss": 1.5385, "step": 95 }, { "epoch": 0.4219409282700422, "grad_norm": 2.0250123306044228, "learning_rate": 0.0002152249028512724, "loss": 1.4467, "step": 100 }, { "epoch": 0.4430379746835443, "grad_norm": 5.253723621821196, "learning_rate": 0.0002050952396497135, "loss": 1.4458, "step": 105 }, { "epoch": 0.4641350210970464, "grad_norm": 1.667081359954707, "learning_rate": 0.00019466607578508832, "loss": 1.4039, "step": 110 }, { "epoch": 0.48523206751054854, "grad_norm": 1.612910217878065, "learning_rate": 0.00018399410474875, "loss": 1.3915, "step": 115 }, { "epoch": 0.5063291139240507, "grad_norm": 4.664851099721208, "learning_rate": 0.00017313733994479531, "loss": 1.3946, "step": 120 }, { "epoch": 0.5274261603375527, "grad_norm": 1.1333666368766908, "learning_rate": 0.00016215479932610397, "loss": 1.3759, "step": 125 }, { "epoch": 0.5485232067510548, "grad_norm": 46.39351482577546, "learning_rate": 0.00015110618456959686, "loss": 1.3596, "step": 130 }, { "epoch": 0.569620253164557, "grad_norm": 11.304078659755062, "learning_rate": 0.00014005155653473443, "loss": 1.3708, "step": 135 }, { "epoch": 0.5907172995780591, "grad_norm": 1.4112479369616122, "learning_rate": 0.00012905100876947907, "loss": 1.3954, "step": 140 }, { "epoch": 0.6118143459915611, "grad_norm": 4.386921961578911, "learning_rate": 0.00011816434083856155, "loss": 1.3064, "step": 145 }, { "epoch": 0.6329113924050633, "grad_norm": 1.2097887430832492, "learning_rate": 0.00010745073324985548, "loss": 1.3082, "step": 150 }, { "epoch": 0.6540084388185654, "grad_norm": 1.1759289220096156, "learning_rate": 9.696842574597847e-05, "loss": 1.2311, "step": 155 }, { "epoch": 0.6751054852320675, "grad_norm": 2.762333419837543, "learning_rate": 8.677440070994279e-05, "loss": 1.2588, "step": 160 }, { "epoch": 0.6962025316455697, "grad_norm": 0.8634167427320883, "learning_rate": 7.692407340588055e-05, "loss": 1.1888, "step": 165 }, { "epoch": 0.7172995780590717, "grad_norm": 2.2322735412619386, "learning_rate": 6.747099073871015e-05, "loss": 1.2047, "step": 170 }, { "epoch": 0.7383966244725738, "grad_norm": 0.7808932928409826, "learning_rate": 5.8466540170303925e-05, "loss": 1.1594, "step": 175 }, { "epoch": 0.759493670886076, "grad_norm": 0.7823146836939375, "learning_rate": 4.995967037450238e-05, "loss": 1.1763, "step": 180 }, { "epoch": 0.7805907172995781, "grad_norm": 0.7925457205036505, "learning_rate": 4.199662514951142e-05, "loss": 1.1291, "step": 185 }, { "epoch": 0.8016877637130801, "grad_norm": 0.8477428122185438, "learning_rate": 3.4620692034146904e-05, "loss": 1.1162, "step": 190 }, { "epoch": 0.8227848101265823, "grad_norm": 0.7180062181554933, "learning_rate": 2.7871966994463884e-05, "loss": 1.1061, "step": 195 }, { "epoch": 0.8438818565400844, "grad_norm": 0.6932161729089131, "learning_rate": 2.1787136459944886e-05, "loss": 1.1264, "step": 200 }, { "epoch": 0.8649789029535865, "grad_norm": 0.7413528530014878, "learning_rate": 1.639927789411174e-05, "loss": 1.1001, "step": 205 }, { "epoch": 0.8860759493670886, "grad_norm": 0.7062510216168646, "learning_rate": 1.1737679983668258e-05, "loss": 1.0853, "step": 210 }, { "epoch": 0.9071729957805907, "grad_norm": 0.6294764868696124, "learning_rate": 7.8276834236364e-06, "loss": 1.0527, "step": 215 }, { "epoch": 0.9282700421940928, "grad_norm": 0.6759047685020939, "learning_rate": 4.690543163985771e-06, "loss": 1.0758, "step": 220 }, { "epoch": 0.9493670886075949, "grad_norm": 1.3620865902651844, "learning_rate": 2.343312866591163e-06, "loss": 1.0805, "step": 225 }, { "epoch": 0.9704641350210971, "grad_norm": 0.6839144968477266, "learning_rate": 7.987522006165426e-07, "loss": 1.0588, "step": 230 }, { "epoch": 0.9915611814345991, "grad_norm": 1.1867918696725697, "learning_rate": 6.525748027244593e-08, "loss": 1.0692, "step": 235 }, { "epoch": 1.0, "eval_loss": 2.265784978866577, "eval_runtime": 2.2863, "eval_samples_per_second": 0.875, "eval_steps_per_second": 0.437, "step": 237 }, { "epoch": 1.0, "step": 237, "total_flos": 12379572142080.0, "train_loss": 1.644804019968218, "train_runtime": 4500.3652, "train_samples_per_second": 1.682, "train_steps_per_second": 0.053 } ], "logging_steps": 5, "max_steps": 237, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 12379572142080.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }