|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 5746, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01740341106856944, |
|
"grad_norm": 0.046589791774749756, |
|
"learning_rate": 4.351610095735422e-07, |
|
"loss": 2.7999, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.03480682213713888, |
|
"grad_norm": 0.04616040736436844, |
|
"learning_rate": 8.703220191470844e-07, |
|
"loss": 2.7996, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05221023320570832, |
|
"grad_norm": 0.05549981817603111, |
|
"learning_rate": 1.305483028720627e-06, |
|
"loss": 2.797, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.06961364427427776, |
|
"grad_norm": 0.063571035861969, |
|
"learning_rate": 1.7406440382941688e-06, |
|
"loss": 2.7909, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.0870170553428472, |
|
"grad_norm": 0.08422163128852844, |
|
"learning_rate": 2.1758050478677113e-06, |
|
"loss": 2.7951, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.10442046641141664, |
|
"grad_norm": 0.09257014095783234, |
|
"learning_rate": 2.610966057441254e-06, |
|
"loss": 2.7803, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.12182387747998608, |
|
"grad_norm": 0.11055697500705719, |
|
"learning_rate": 3.046127067014796e-06, |
|
"loss": 2.7681, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.1392272885485555, |
|
"grad_norm": 0.10759040713310242, |
|
"learning_rate": 3.4812880765883376e-06, |
|
"loss": 2.7611, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.15663069961712495, |
|
"grad_norm": 0.12318646907806396, |
|
"learning_rate": 3.9164490861618806e-06, |
|
"loss": 2.7402, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.1740341106856944, |
|
"grad_norm": 0.12962989509105682, |
|
"learning_rate": 4.351610095735423e-06, |
|
"loss": 2.7451, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.19143752175426385, |
|
"grad_norm": 0.13981275260448456, |
|
"learning_rate": 4.786771105308965e-06, |
|
"loss": 2.735, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.20884093282283328, |
|
"grad_norm": 0.14711035788059235, |
|
"learning_rate": 5.221932114882508e-06, |
|
"loss": 2.7469, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.22624434389140272, |
|
"grad_norm": 0.15727241337299347, |
|
"learning_rate": 5.657093124456049e-06, |
|
"loss": 2.7327, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.24364775495997215, |
|
"grad_norm": 0.15055705606937408, |
|
"learning_rate": 6.092254134029592e-06, |
|
"loss": 2.7234, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.2610511660285416, |
|
"grad_norm": 0.16661331057548523, |
|
"learning_rate": 6.527415143603134e-06, |
|
"loss": 2.7174, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.278454577097111, |
|
"grad_norm": 0.17976854741573334, |
|
"learning_rate": 6.962576153176675e-06, |
|
"loss": 2.719, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.2958579881656805, |
|
"grad_norm": 0.1790621429681778, |
|
"learning_rate": 7.397737162750218e-06, |
|
"loss": 2.7173, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.3132613992342499, |
|
"grad_norm": 0.19079644978046417, |
|
"learning_rate": 7.832898172323761e-06, |
|
"loss": 2.7131, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.33066481030281936, |
|
"grad_norm": 0.19005636870861053, |
|
"learning_rate": 8.268059181897302e-06, |
|
"loss": 2.7168, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.3480682213713888, |
|
"grad_norm": 0.19910404086112976, |
|
"learning_rate": 8.703220191470845e-06, |
|
"loss": 2.7061, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3654716324399582, |
|
"grad_norm": 0.20510949194431305, |
|
"learning_rate": 9.138381201044387e-06, |
|
"loss": 2.6862, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.3828750435085277, |
|
"grad_norm": 0.20418143272399902, |
|
"learning_rate": 9.57354221061793e-06, |
|
"loss": 2.6802, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.4002784545770971, |
|
"grad_norm": 0.21713656187057495, |
|
"learning_rate": 1.000870322019147e-05, |
|
"loss": 2.6923, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.41768186564566656, |
|
"grad_norm": 0.2298802137374878, |
|
"learning_rate": 1.0443864229765015e-05, |
|
"loss": 2.6818, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.43508527671423597, |
|
"grad_norm": 0.2294008880853653, |
|
"learning_rate": 1.0879025239338557e-05, |
|
"loss": 2.6896, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.45248868778280543, |
|
"grad_norm": 0.21464629471302032, |
|
"learning_rate": 1.1314186248912098e-05, |
|
"loss": 2.6805, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.4698920988513749, |
|
"grad_norm": 0.25449061393737793, |
|
"learning_rate": 1.174934725848564e-05, |
|
"loss": 2.6806, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.4872955099199443, |
|
"grad_norm": 0.24079586565494537, |
|
"learning_rate": 1.2184508268059184e-05, |
|
"loss": 2.6844, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.5046989209885138, |
|
"grad_norm": 0.2414436638355255, |
|
"learning_rate": 1.2619669277632725e-05, |
|
"loss": 2.6817, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.5221023320570832, |
|
"grad_norm": 0.2530564069747925, |
|
"learning_rate": 1.3054830287206268e-05, |
|
"loss": 2.6556, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5395057431256526, |
|
"grad_norm": 0.26441535353660583, |
|
"learning_rate": 1.348999129677981e-05, |
|
"loss": 2.6749, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.556909154194222, |
|
"grad_norm": 0.2584131062030792, |
|
"learning_rate": 1.392515230635335e-05, |
|
"loss": 2.6575, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.5743125652627915, |
|
"grad_norm": 0.25025609135627747, |
|
"learning_rate": 1.4360313315926895e-05, |
|
"loss": 2.6658, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.591715976331361, |
|
"grad_norm": 0.26518625020980835, |
|
"learning_rate": 1.4795474325500436e-05, |
|
"loss": 2.6586, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.6091193873999304, |
|
"grad_norm": 0.26597312092781067, |
|
"learning_rate": 1.5230635335073978e-05, |
|
"loss": 2.6451, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6265227984684998, |
|
"grad_norm": 0.2725384831428528, |
|
"learning_rate": 1.5665796344647522e-05, |
|
"loss": 2.6521, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.6439262095370692, |
|
"grad_norm": 0.2752222716808319, |
|
"learning_rate": 1.6100957354221064e-05, |
|
"loss": 2.6398, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.6613296206056387, |
|
"grad_norm": 0.2558598518371582, |
|
"learning_rate": 1.6536118363794605e-05, |
|
"loss": 2.6486, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.6787330316742082, |
|
"grad_norm": 0.26938167214393616, |
|
"learning_rate": 1.697127937336815e-05, |
|
"loss": 2.641, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.6961364427427776, |
|
"grad_norm": 0.28793784976005554, |
|
"learning_rate": 1.740644038294169e-05, |
|
"loss": 2.6344, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.713539853811347, |
|
"grad_norm": 0.2677360773086548, |
|
"learning_rate": 1.7841601392515232e-05, |
|
"loss": 2.6542, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.7309432648799165, |
|
"grad_norm": 0.28143930435180664, |
|
"learning_rate": 1.8276762402088773e-05, |
|
"loss": 2.6446, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.7483466759484859, |
|
"grad_norm": 0.28870299458503723, |
|
"learning_rate": 1.8711923411662314e-05, |
|
"loss": 2.6243, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.7657500870170554, |
|
"grad_norm": 0.296633780002594, |
|
"learning_rate": 1.914708442123586e-05, |
|
"loss": 2.6306, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.7831534980856247, |
|
"grad_norm": 0.2806219160556793, |
|
"learning_rate": 1.95822454308094e-05, |
|
"loss": 2.6356, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.8005569091541942, |
|
"grad_norm": 0.2914940416812897, |
|
"learning_rate": 1.999940297883134e-05, |
|
"loss": 2.644, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.8179603202227637, |
|
"grad_norm": 0.28510311245918274, |
|
"learning_rate": 1.9599117132813187e-05, |
|
"loss": 2.6357, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.8353637312913331, |
|
"grad_norm": 0.3171123266220093, |
|
"learning_rate": 1.8486908682093175e-05, |
|
"loss": 2.6307, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.8527671423599026, |
|
"grad_norm": 0.2955775558948517, |
|
"learning_rate": 1.674526503944611e-05, |
|
"loss": 2.6315, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.8701705534284719, |
|
"grad_norm": 0.2767013907432556, |
|
"learning_rate": 1.450335594635761e-05, |
|
"loss": 2.6138, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.8875739644970414, |
|
"grad_norm": 0.27960339188575745, |
|
"learning_rate": 1.1927453544210397e-05, |
|
"loss": 2.6305, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.9049773755656109, |
|
"grad_norm": 0.31521016359329224, |
|
"learning_rate": 9.20860073020234e-06, |
|
"loss": 2.6249, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.9223807866341803, |
|
"grad_norm": 0.2640378773212433, |
|
"learning_rate": 6.548442379624425e-06, |
|
"loss": 2.6257, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.9397841977027498, |
|
"grad_norm": 0.28068870306015015, |
|
"learning_rate": 4.144270267924306e-06, |
|
"loss": 2.6261, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.9571876087713191, |
|
"grad_norm": 0.2999429702758789, |
|
"learning_rate": 2.1743908422712135e-06, |
|
"loss": 2.6245, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.9745910198398886, |
|
"grad_norm": 0.2793658971786499, |
|
"learning_rate": 7.849010480670938e-07, |
|
"loss": 2.6209, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.9919944309084581, |
|
"grad_norm": 0.30049070715904236, |
|
"learning_rate": 7.885298685522235e-08, |
|
"loss": 2.6215, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 5746, |
|
"total_flos": 8.372955480242258e+17, |
|
"train_loss": 2.6846868539578486, |
|
"train_runtime": 1624.688, |
|
"train_samples_per_second": 56.585, |
|
"train_steps_per_second": 3.537 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 5746, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.372955480242258e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|