{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 28366,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017626736233519003,
      "grad_norm": 5.204168796539307,
      "learning_rate": 4.9118663188324054e-05,
      "loss": 3.9535,
      "step": 500
    },
    {
      "epoch": 0.035253472467038006,
      "grad_norm": 5.162827968597412,
      "learning_rate": 4.82373263766481e-05,
      "loss": 3.761,
      "step": 1000
    },
    {
      "epoch": 0.052880208700557006,
      "grad_norm": 5.309798240661621,
      "learning_rate": 4.735598956497215e-05,
      "loss": 3.7096,
      "step": 1500
    },
    {
      "epoch": 0.07050694493407601,
      "grad_norm": 5.0922369956970215,
      "learning_rate": 4.64746527532962e-05,
      "loss": 3.6577,
      "step": 2000
    },
    {
      "epoch": 0.08813368116759501,
      "grad_norm": 5.067632675170898,
      "learning_rate": 4.559331594162025e-05,
      "loss": 3.6288,
      "step": 2500
    },
    {
      "epoch": 0.10576041740111401,
      "grad_norm": 5.3605475425720215,
      "learning_rate": 4.4711979129944304e-05,
      "loss": 3.6192,
      "step": 3000
    },
    {
      "epoch": 0.12338715363463301,
      "grad_norm": 5.510789394378662,
      "learning_rate": 4.383064231826835e-05,
      "loss": 3.559,
      "step": 3500
    },
    {
      "epoch": 0.14101388986815203,
      "grad_norm": 5.7333855628967285,
      "learning_rate": 4.29493055065924e-05,
      "loss": 3.5382,
      "step": 4000
    },
    {
      "epoch": 0.158640626101671,
      "grad_norm": 5.04295539855957,
      "learning_rate": 4.206796869491645e-05,
      "loss": 3.4962,
      "step": 4500
    },
    {
      "epoch": 0.17626736233519003,
      "grad_norm": 4.932398796081543,
      "learning_rate": 4.11866318832405e-05,
      "loss": 3.5339,
      "step": 5000
    },
    {
      "epoch": 0.193894098568709,
      "grad_norm": 5.262182235717773,
      "learning_rate": 4.0305295071564555e-05,
      "loss": 3.4758,
      "step": 5500
    },
    {
      "epoch": 0.21152083480222802,
      "grad_norm": 5.248316764831543,
      "learning_rate": 3.94239582598886e-05,
      "loss": 3.4524,
      "step": 6000
    },
    {
      "epoch": 0.229147571035747,
      "grad_norm": 5.176753520965576,
      "learning_rate": 3.854262144821265e-05,
      "loss": 3.4403,
      "step": 6500
    },
    {
      "epoch": 0.24677430726926602,
      "grad_norm": 5.396851539611816,
      "learning_rate": 3.76612846365367e-05,
      "loss": 3.4066,
      "step": 7000
    },
    {
      "epoch": 0.26440104350278504,
      "grad_norm": 4.905313968658447,
      "learning_rate": 3.677994782486075e-05,
      "loss": 3.4277,
      "step": 7500
    },
    {
      "epoch": 0.28202777973630405,
      "grad_norm": 5.581764221191406,
      "learning_rate": 3.58986110131848e-05,
      "loss": 3.3977,
      "step": 8000
    },
    {
      "epoch": 0.299654515969823,
      "grad_norm": 4.564020156860352,
      "learning_rate": 3.501727420150885e-05,
      "loss": 3.3739,
      "step": 8500
    },
    {
      "epoch": 0.317281252203342,
      "grad_norm": 5.451286315917969,
      "learning_rate": 3.41359373898329e-05,
      "loss": 3.3724,
      "step": 9000
    },
    {
      "epoch": 0.33490798843686104,
      "grad_norm": 5.060819149017334,
      "learning_rate": 3.325460057815695e-05,
      "loss": 3.3393,
      "step": 9500
    },
    {
      "epoch": 0.35253472467038005,
      "grad_norm": 5.474411487579346,
      "learning_rate": 3.2373263766481e-05,
      "loss": 3.3186,
      "step": 10000
    },
    {
      "epoch": 0.370161460903899,
      "grad_norm": 5.26786994934082,
      "learning_rate": 3.149192695480505e-05,
      "loss": 3.3223,
      "step": 10500
    },
    {
      "epoch": 0.387788197137418,
      "grad_norm": 5.467500686645508,
      "learning_rate": 3.06105901431291e-05,
      "loss": 3.3054,
      "step": 11000
    },
    {
      "epoch": 0.40541493337093704,
      "grad_norm": 5.263679027557373,
      "learning_rate": 2.972925333145315e-05,
      "loss": 3.3193,
      "step": 11500
    },
    {
      "epoch": 0.42304166960445605,
      "grad_norm": 4.835860729217529,
      "learning_rate": 2.88479165197772e-05,
      "loss": 3.2871,
      "step": 12000
    },
    {
      "epoch": 0.44066840583797506,
      "grad_norm": 4.88271951675415,
      "learning_rate": 2.7966579708101248e-05,
      "loss": 3.2783,
      "step": 12500
    },
    {
      "epoch": 0.458295142071494,
      "grad_norm": 5.228416442871094,
      "learning_rate": 2.70852428964253e-05,
      "loss": 3.2845,
      "step": 13000
    },
    {
      "epoch": 0.47592187830501304,
      "grad_norm": 5.097890853881836,
      "learning_rate": 2.6203906084749348e-05,
      "loss": 3.2731,
      "step": 13500
    },
    {
      "epoch": 0.49354861453853205,
      "grad_norm": 4.9926066398620605,
      "learning_rate": 2.53225692730734e-05,
      "loss": 3.27,
      "step": 14000
    },
    {
      "epoch": 0.511175350772051,
      "grad_norm": 5.329204559326172,
      "learning_rate": 2.4441232461397447e-05,
      "loss": 3.253,
      "step": 14500
    },
    {
      "epoch": 0.5288020870055701,
      "grad_norm": 4.740358352661133,
      "learning_rate": 2.35598956497215e-05,
      "loss": 3.2511,
      "step": 15000
    },
    {
      "epoch": 0.546428823239089,
      "grad_norm": 5.418153285980225,
      "learning_rate": 2.267855883804555e-05,
      "loss": 3.2315,
      "step": 15500
    },
    {
      "epoch": 0.5640555594726081,
      "grad_norm": 4.993420600891113,
      "learning_rate": 2.1797222026369598e-05,
      "loss": 3.2453,
      "step": 16000
    },
    {
      "epoch": 0.5816822957061271,
      "grad_norm": 5.474274635314941,
      "learning_rate": 2.091588521469365e-05,
      "loss": 3.2328,
      "step": 16500
    },
    {
      "epoch": 0.599309031939646,
      "grad_norm": 4.977609157562256,
      "learning_rate": 2.0034548403017698e-05,
      "loss": 3.2181,
      "step": 17000
    },
    {
      "epoch": 0.6169357681731651,
      "grad_norm": 4.982664585113525,
      "learning_rate": 1.915321159134175e-05,
      "loss": 3.2106,
      "step": 17500
    },
    {
      "epoch": 0.634562504406684,
      "grad_norm": 5.291051387786865,
      "learning_rate": 1.8271874779665797e-05,
      "loss": 3.2134,
      "step": 18000
    },
    {
      "epoch": 0.652189240640203,
      "grad_norm": 5.687000751495361,
      "learning_rate": 1.739053796798985e-05,
      "loss": 3.1905,
      "step": 18500
    },
    {
      "epoch": 0.6698159768737221,
      "grad_norm": 5.048547267913818,
      "learning_rate": 1.6509201156313897e-05,
      "loss": 3.2165,
      "step": 19000
    },
    {
      "epoch": 0.687442713107241,
      "grad_norm": 5.21890926361084,
      "learning_rate": 1.5627864344637945e-05,
      "loss": 3.216,
      "step": 19500
    },
    {
      "epoch": 0.7050694493407601,
      "grad_norm": 4.901352405548096,
      "learning_rate": 1.4746527532961998e-05,
      "loss": 3.1903,
      "step": 20000
    },
    {
      "epoch": 0.7226961855742791,
      "grad_norm": 5.835772514343262,
      "learning_rate": 1.3865190721286048e-05,
      "loss": 3.1971,
      "step": 20500
    },
    {
      "epoch": 0.740322921807798,
      "grad_norm": 4.900722503662109,
      "learning_rate": 1.2983853909610097e-05,
      "loss": 3.1832,
      "step": 21000
    },
    {
      "epoch": 0.7579496580413171,
      "grad_norm": 4.764721870422363,
      "learning_rate": 1.2102517097934147e-05,
      "loss": 3.1808,
      "step": 21500
    },
    {
      "epoch": 0.775576394274836,
      "grad_norm": 5.3555731773376465,
      "learning_rate": 1.1221180286258197e-05,
      "loss": 3.1847,
      "step": 22000
    },
    {
      "epoch": 0.7932031305083551,
      "grad_norm": 5.72691535949707,
      "learning_rate": 1.0339843474582247e-05,
      "loss": 3.1689,
      "step": 22500
    },
    {
      "epoch": 0.8108298667418741,
      "grad_norm": 5.263107776641846,
      "learning_rate": 9.458506662906296e-06,
      "loss": 3.1666,
      "step": 23000
    },
    {
      "epoch": 0.828456602975393,
      "grad_norm": 5.273736476898193,
      "learning_rate": 8.577169851230346e-06,
      "loss": 3.1583,
      "step": 23500
    },
    {
      "epoch": 0.8460833392089121,
      "grad_norm": 5.418051719665527,
      "learning_rate": 7.695833039554396e-06,
      "loss": 3.1429,
      "step": 24000
    },
    {
      "epoch": 0.8637100754424311,
      "grad_norm": 4.837016582489014,
      "learning_rate": 6.814496227878446e-06,
      "loss": 3.1831,
      "step": 24500
    },
    {
      "epoch": 0.8813368116759501,
      "grad_norm": 5.3440680503845215,
      "learning_rate": 5.933159416202496e-06,
      "loss": 3.151,
      "step": 25000
    },
    {
      "epoch": 0.8989635479094691,
      "grad_norm": 5.674468517303467,
      "learning_rate": 5.051822604526546e-06,
      "loss": 3.142,
      "step": 25500
    },
    {
      "epoch": 0.916590284142988,
      "grad_norm": 5.245038986206055,
      "learning_rate": 4.170485792850596e-06,
      "loss": 3.1537,
      "step": 26000
    },
    {
      "epoch": 0.9342170203765071,
      "grad_norm": 5.040459632873535,
      "learning_rate": 3.289148981174646e-06,
      "loss": 3.1496,
      "step": 26500
    },
    {
      "epoch": 0.9518437566100261,
      "grad_norm": 4.918792724609375,
      "learning_rate": 2.4078121694986958e-06,
      "loss": 3.1541,
      "step": 27000
    },
    {
      "epoch": 0.9694704928435451,
      "grad_norm": 5.169427394866943,
      "learning_rate": 1.5264753578227457e-06,
      "loss": 3.1609,
      "step": 27500
    },
    {
      "epoch": 0.9870972290770641,
      "grad_norm": 5.406129837036133,
      "learning_rate": 6.451385461467955e-07,
      "loss": 3.1467,
      "step": 28000
    },
    {
      "epoch": 1.0,
      "step": 28366,
      "total_flos": 1.5038202327662592e+16,
      "train_loss": 3.3210895868885175,
      "train_runtime": 6630.0799,
      "train_samples_per_second": 34.227,
      "train_steps_per_second": 4.278
    }
  ],
  "logging_steps": 500,
  "max_steps": 28366,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.5038202327662592e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}