{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 28366, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017626736233519003, "grad_norm": 5.204168796539307, "learning_rate": 4.9118663188324054e-05, "loss": 3.9535, "step": 500 }, { "epoch": 0.035253472467038006, "grad_norm": 5.162827968597412, "learning_rate": 4.82373263766481e-05, "loss": 3.761, "step": 1000 }, { "epoch": 0.052880208700557006, "grad_norm": 5.309798240661621, "learning_rate": 4.735598956497215e-05, "loss": 3.7096, "step": 1500 }, { "epoch": 0.07050694493407601, "grad_norm": 5.0922369956970215, "learning_rate": 4.64746527532962e-05, "loss": 3.6577, "step": 2000 }, { "epoch": 0.08813368116759501, "grad_norm": 5.067632675170898, "learning_rate": 4.559331594162025e-05, "loss": 3.6288, "step": 2500 }, { "epoch": 0.10576041740111401, "grad_norm": 5.3605475425720215, "learning_rate": 4.4711979129944304e-05, "loss": 3.6192, "step": 3000 }, { "epoch": 0.12338715363463301, "grad_norm": 5.510789394378662, "learning_rate": 4.383064231826835e-05, "loss": 3.559, "step": 3500 }, { "epoch": 0.14101388986815203, "grad_norm": 5.7333855628967285, "learning_rate": 4.29493055065924e-05, "loss": 3.5382, "step": 4000 }, { "epoch": 0.158640626101671, "grad_norm": 5.04295539855957, "learning_rate": 4.206796869491645e-05, "loss": 3.4962, "step": 4500 }, { "epoch": 0.17626736233519003, "grad_norm": 4.932398796081543, "learning_rate": 4.11866318832405e-05, "loss": 3.5339, "step": 5000 }, { "epoch": 0.193894098568709, "grad_norm": 5.262182235717773, "learning_rate": 4.0305295071564555e-05, "loss": 3.4758, "step": 5500 }, { "epoch": 0.21152083480222802, "grad_norm": 5.248316764831543, "learning_rate": 3.94239582598886e-05, "loss": 3.4524, "step": 6000 }, { "epoch": 0.229147571035747, "grad_norm": 5.176753520965576, "learning_rate": 3.854262144821265e-05, "loss": 3.4403, "step": 6500 }, { "epoch": 0.24677430726926602, "grad_norm": 5.396851539611816, "learning_rate": 3.76612846365367e-05, "loss": 3.4066, "step": 7000 }, { "epoch": 0.26440104350278504, "grad_norm": 4.905313968658447, "learning_rate": 3.677994782486075e-05, "loss": 3.4277, "step": 7500 }, { "epoch": 0.28202777973630405, "grad_norm": 5.581764221191406, "learning_rate": 3.58986110131848e-05, "loss": 3.3977, "step": 8000 }, { "epoch": 0.299654515969823, "grad_norm": 4.564020156860352, "learning_rate": 3.501727420150885e-05, "loss": 3.3739, "step": 8500 }, { "epoch": 0.317281252203342, "grad_norm": 5.451286315917969, "learning_rate": 3.41359373898329e-05, "loss": 3.3724, "step": 9000 }, { "epoch": 0.33490798843686104, "grad_norm": 5.060819149017334, "learning_rate": 3.325460057815695e-05, "loss": 3.3393, "step": 9500 }, { "epoch": 0.35253472467038005, "grad_norm": 5.474411487579346, "learning_rate": 3.2373263766481e-05, "loss": 3.3186, "step": 10000 }, { "epoch": 0.370161460903899, "grad_norm": 5.26786994934082, "learning_rate": 3.149192695480505e-05, "loss": 3.3223, "step": 10500 }, { "epoch": 0.387788197137418, "grad_norm": 5.467500686645508, "learning_rate": 3.06105901431291e-05, "loss": 3.3054, "step": 11000 }, { "epoch": 0.40541493337093704, "grad_norm": 5.263679027557373, "learning_rate": 2.972925333145315e-05, "loss": 3.3193, "step": 11500 }, { "epoch": 0.42304166960445605, "grad_norm": 4.835860729217529, "learning_rate": 2.88479165197772e-05, "loss": 3.2871, "step": 12000 }, { "epoch": 0.44066840583797506, "grad_norm": 4.88271951675415, "learning_rate": 2.7966579708101248e-05, "loss": 3.2783, "step": 12500 }, { "epoch": 0.458295142071494, "grad_norm": 5.228416442871094, "learning_rate": 2.70852428964253e-05, "loss": 3.2845, "step": 13000 }, { "epoch": 0.47592187830501304, "grad_norm": 5.097890853881836, "learning_rate": 2.6203906084749348e-05, "loss": 3.2731, "step": 13500 }, { "epoch": 0.49354861453853205, "grad_norm": 4.9926066398620605, "learning_rate": 2.53225692730734e-05, "loss": 3.27, "step": 14000 }, { "epoch": 0.511175350772051, "grad_norm": 5.329204559326172, "learning_rate": 2.4441232461397447e-05, "loss": 3.253, "step": 14500 }, { "epoch": 0.5288020870055701, "grad_norm": 4.740358352661133, "learning_rate": 2.35598956497215e-05, "loss": 3.2511, "step": 15000 }, { "epoch": 0.546428823239089, "grad_norm": 5.418153285980225, "learning_rate": 2.267855883804555e-05, "loss": 3.2315, "step": 15500 }, { "epoch": 0.5640555594726081, "grad_norm": 4.993420600891113, "learning_rate": 2.1797222026369598e-05, "loss": 3.2453, "step": 16000 }, { "epoch": 0.5816822957061271, "grad_norm": 5.474274635314941, "learning_rate": 2.091588521469365e-05, "loss": 3.2328, "step": 16500 }, { "epoch": 0.599309031939646, "grad_norm": 4.977609157562256, "learning_rate": 2.0034548403017698e-05, "loss": 3.2181, "step": 17000 }, { "epoch": 0.6169357681731651, "grad_norm": 4.982664585113525, "learning_rate": 1.915321159134175e-05, "loss": 3.2106, "step": 17500 }, { "epoch": 0.634562504406684, "grad_norm": 5.291051387786865, "learning_rate": 1.8271874779665797e-05, "loss": 3.2134, "step": 18000 }, { "epoch": 0.652189240640203, "grad_norm": 5.687000751495361, "learning_rate": 1.739053796798985e-05, "loss": 3.1905, "step": 18500 }, { "epoch": 0.6698159768737221, "grad_norm": 5.048547267913818, "learning_rate": 1.6509201156313897e-05, "loss": 3.2165, "step": 19000 }, { "epoch": 0.687442713107241, "grad_norm": 5.21890926361084, "learning_rate": 1.5627864344637945e-05, "loss": 3.216, "step": 19500 }, { "epoch": 0.7050694493407601, "grad_norm": 4.901352405548096, "learning_rate": 1.4746527532961998e-05, "loss": 3.1903, "step": 20000 }, { "epoch": 0.7226961855742791, "grad_norm": 5.835772514343262, "learning_rate": 1.3865190721286048e-05, "loss": 3.1971, "step": 20500 }, { "epoch": 0.740322921807798, "grad_norm": 4.900722503662109, "learning_rate": 1.2983853909610097e-05, "loss": 3.1832, "step": 21000 }, { "epoch": 0.7579496580413171, "grad_norm": 4.764721870422363, "learning_rate": 1.2102517097934147e-05, "loss": 3.1808, "step": 21500 }, { "epoch": 0.775576394274836, "grad_norm": 5.3555731773376465, "learning_rate": 1.1221180286258197e-05, "loss": 3.1847, "step": 22000 }, { "epoch": 0.7932031305083551, "grad_norm": 5.72691535949707, "learning_rate": 1.0339843474582247e-05, "loss": 3.1689, "step": 22500 }, { "epoch": 0.8108298667418741, "grad_norm": 5.263107776641846, "learning_rate": 9.458506662906296e-06, "loss": 3.1666, "step": 23000 }, { "epoch": 0.828456602975393, "grad_norm": 5.273736476898193, "learning_rate": 8.577169851230346e-06, "loss": 3.1583, "step": 23500 }, { "epoch": 0.8460833392089121, "grad_norm": 5.418051719665527, "learning_rate": 7.695833039554396e-06, "loss": 3.1429, "step": 24000 }, { "epoch": 0.8637100754424311, "grad_norm": 4.837016582489014, "learning_rate": 6.814496227878446e-06, "loss": 3.1831, "step": 24500 }, { "epoch": 0.8813368116759501, "grad_norm": 5.3440680503845215, "learning_rate": 5.933159416202496e-06, "loss": 3.151, "step": 25000 }, { "epoch": 0.8989635479094691, "grad_norm": 5.674468517303467, "learning_rate": 5.051822604526546e-06, "loss": 3.142, "step": 25500 }, { "epoch": 0.916590284142988, "grad_norm": 5.245038986206055, "learning_rate": 4.170485792850596e-06, "loss": 3.1537, "step": 26000 }, { "epoch": 0.9342170203765071, "grad_norm": 5.040459632873535, "learning_rate": 3.289148981174646e-06, "loss": 3.1496, "step": 26500 }, { "epoch": 0.9518437566100261, "grad_norm": 4.918792724609375, "learning_rate": 2.4078121694986958e-06, "loss": 3.1541, "step": 27000 }, { "epoch": 0.9694704928435451, "grad_norm": 5.169427394866943, "learning_rate": 1.5264753578227457e-06, "loss": 3.1609, "step": 27500 }, { "epoch": 0.9870972290770641, "grad_norm": 5.406129837036133, "learning_rate": 6.451385461467955e-07, "loss": 3.1467, "step": 28000 }, { "epoch": 1.0, "step": 28366, "total_flos": 1.5038202327662592e+16, "train_loss": 3.3210895868885175, "train_runtime": 6630.0799, "train_samples_per_second": 34.227, "train_steps_per_second": 4.278 } ], "logging_steps": 500, "max_steps": 28366, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5038202327662592e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }