|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 40.0, |
|
"eval_steps": 500, |
|
"global_step": 67600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.0696972981095314, |
|
"learning_rate": 3.1018860946745563e-05, |
|
"loss": 0.785, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.1446572095155716, |
|
"learning_rate": 3.0787721893491126e-05, |
|
"loss": 0.7706, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.2664627134799957, |
|
"learning_rate": 3.055658284023669e-05, |
|
"loss": 0.7614, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.2499350905418396, |
|
"learning_rate": 3.032544378698225e-05, |
|
"loss": 0.7472, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.37448567152023315, |
|
"learning_rate": 3.009430473372781e-05, |
|
"loss": 0.7371, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.5009666085243225, |
|
"learning_rate": 2.9863165680473374e-05, |
|
"loss": 0.7284, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 0.4846726357936859, |
|
"learning_rate": 2.9632026627218937e-05, |
|
"loss": 0.707, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.5636312365531921, |
|
"learning_rate": 2.94008875739645e-05, |
|
"loss": 0.692, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 0.9609221816062927, |
|
"learning_rate": 2.9169748520710063e-05, |
|
"loss": 0.6832, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.6074559092521667, |
|
"learning_rate": 2.893860946745562e-05, |
|
"loss": 0.6713, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.6442582011222839, |
|
"learning_rate": 2.8707470414201182e-05, |
|
"loss": 0.6624, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.52489173412323, |
|
"learning_rate": 2.8476331360946745e-05, |
|
"loss": 0.6533, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.6586915254592896, |
|
"learning_rate": 2.8245192307692307e-05, |
|
"loss": 0.6414, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 0.7074326276779175, |
|
"learning_rate": 2.801405325443787e-05, |
|
"loss": 0.632, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 0.7869051694869995, |
|
"learning_rate": 2.7782914201183433e-05, |
|
"loss": 0.625, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 0.7031483054161072, |
|
"learning_rate": 2.7551775147928993e-05, |
|
"loss": 0.6116, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 5.03, |
|
"grad_norm": 0.7165437340736389, |
|
"learning_rate": 2.7320636094674555e-05, |
|
"loss": 0.6067, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 5.33, |
|
"grad_norm": 0.6308349967002869, |
|
"learning_rate": 2.708949704142012e-05, |
|
"loss": 0.594, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 5.62, |
|
"grad_norm": 0.7305271625518799, |
|
"learning_rate": 2.685835798816568e-05, |
|
"loss": 0.5837, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"grad_norm": 0.8089825510978699, |
|
"learning_rate": 2.6627218934911244e-05, |
|
"loss": 0.5725, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 6.21, |
|
"grad_norm": 0.7770040035247803, |
|
"learning_rate": 2.6396079881656807e-05, |
|
"loss": 0.5674, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 6.51, |
|
"grad_norm": 0.7730346322059631, |
|
"learning_rate": 2.6164940828402366e-05, |
|
"loss": 0.5567, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"grad_norm": 0.6454223990440369, |
|
"learning_rate": 2.593380177514793e-05, |
|
"loss": 0.5486, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 0.5882906317710876, |
|
"learning_rate": 2.5702662721893492e-05, |
|
"loss": 0.5396, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"grad_norm": 0.8279200792312622, |
|
"learning_rate": 2.5471523668639055e-05, |
|
"loss": 0.5309, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"grad_norm": 0.8009528517723083, |
|
"learning_rate": 2.5240384615384618e-05, |
|
"loss": 0.5288, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 7.99, |
|
"grad_norm": 0.7412715554237366, |
|
"learning_rate": 2.500924556213018e-05, |
|
"loss": 0.5198, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 0.9230983853340149, |
|
"learning_rate": 2.4778106508875743e-05, |
|
"loss": 0.5163, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 8.58, |
|
"grad_norm": 0.7999468445777893, |
|
"learning_rate": 2.45469674556213e-05, |
|
"loss": 0.5131, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 8.88, |
|
"grad_norm": 0.7494385838508606, |
|
"learning_rate": 2.4315828402366862e-05, |
|
"loss": 0.506, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 9.17, |
|
"grad_norm": 0.7356762886047363, |
|
"learning_rate": 2.4084689349112425e-05, |
|
"loss": 0.5036, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 9.47, |
|
"grad_norm": 0.8011249303817749, |
|
"learning_rate": 2.3853550295857988e-05, |
|
"loss": 0.497, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 9.76, |
|
"grad_norm": 0.713610827922821, |
|
"learning_rate": 2.362241124260355e-05, |
|
"loss": 0.4967, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 10.06, |
|
"grad_norm": 0.8254227042198181, |
|
"learning_rate": 2.3391272189349114e-05, |
|
"loss": 0.4911, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 10.36, |
|
"grad_norm": 0.7040392756462097, |
|
"learning_rate": 2.3160133136094673e-05, |
|
"loss": 0.4859, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 10.65, |
|
"grad_norm": 0.7733869552612305, |
|
"learning_rate": 2.2928994082840236e-05, |
|
"loss": 0.4858, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 10.95, |
|
"grad_norm": 0.8573015928268433, |
|
"learning_rate": 2.26978550295858e-05, |
|
"loss": 0.4829, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 11.24, |
|
"grad_norm": 0.693286657333374, |
|
"learning_rate": 2.2466715976331362e-05, |
|
"loss": 0.4755, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 11.54, |
|
"grad_norm": 0.7536494135856628, |
|
"learning_rate": 2.2235576923076925e-05, |
|
"loss": 0.4768, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 11.83, |
|
"grad_norm": 0.6219621896743774, |
|
"learning_rate": 2.2004437869822487e-05, |
|
"loss": 0.4757, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 12.13, |
|
"grad_norm": 0.7244569063186646, |
|
"learning_rate": 2.1773298816568047e-05, |
|
"loss": 0.473, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 12.43, |
|
"grad_norm": 0.7847468852996826, |
|
"learning_rate": 2.154215976331361e-05, |
|
"loss": 0.4696, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 12.72, |
|
"grad_norm": 0.7616731524467468, |
|
"learning_rate": 2.1311020710059173e-05, |
|
"loss": 0.4696, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 13.02, |
|
"grad_norm": 0.7453758716583252, |
|
"learning_rate": 2.1079881656804735e-05, |
|
"loss": 0.4687, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 13.31, |
|
"grad_norm": 0.6706910729408264, |
|
"learning_rate": 2.0848742603550298e-05, |
|
"loss": 0.4624, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 13.61, |
|
"grad_norm": 0.819572389125824, |
|
"learning_rate": 2.061760355029586e-05, |
|
"loss": 0.4603, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 13.91, |
|
"grad_norm": 0.6898177266120911, |
|
"learning_rate": 2.0386464497041417e-05, |
|
"loss": 0.4599, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 14.2, |
|
"grad_norm": 0.6775723099708557, |
|
"learning_rate": 2.015532544378698e-05, |
|
"loss": 0.4622, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 14.5, |
|
"grad_norm": 0.7278532385826111, |
|
"learning_rate": 1.9924186390532543e-05, |
|
"loss": 0.4573, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 14.79, |
|
"grad_norm": 0.6195204257965088, |
|
"learning_rate": 1.9693047337278106e-05, |
|
"loss": 0.4536, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 15.09, |
|
"grad_norm": 0.6975180506706238, |
|
"learning_rate": 1.946190828402367e-05, |
|
"loss": 0.4532, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 15.38, |
|
"grad_norm": 0.7116599678993225, |
|
"learning_rate": 1.923076923076923e-05, |
|
"loss": 0.4521, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 15.68, |
|
"grad_norm": 0.6533932685852051, |
|
"learning_rate": 1.8999630177514794e-05, |
|
"loss": 0.4513, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 15.98, |
|
"grad_norm": 0.580528736114502, |
|
"learning_rate": 1.8768491124260354e-05, |
|
"loss": 0.4518, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 16.27, |
|
"grad_norm": 0.8283082842826843, |
|
"learning_rate": 1.8537352071005917e-05, |
|
"loss": 0.4473, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 16.57, |
|
"grad_norm": 0.6264183521270752, |
|
"learning_rate": 1.830621301775148e-05, |
|
"loss": 0.4466, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 16.86, |
|
"grad_norm": 0.6502621173858643, |
|
"learning_rate": 1.8075073964497042e-05, |
|
"loss": 0.446, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 17.16, |
|
"grad_norm": 0.6924391984939575, |
|
"learning_rate": 1.7843934911242605e-05, |
|
"loss": 0.4433, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 17.46, |
|
"grad_norm": 0.631476879119873, |
|
"learning_rate": 1.7612795857988168e-05, |
|
"loss": 0.4446, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 17.75, |
|
"grad_norm": 0.6945323348045349, |
|
"learning_rate": 1.7381656804733727e-05, |
|
"loss": 0.4451, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 18.05, |
|
"grad_norm": 0.6200039386749268, |
|
"learning_rate": 1.715051775147929e-05, |
|
"loss": 0.4434, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 18.34, |
|
"grad_norm": 0.6730862259864807, |
|
"learning_rate": 1.6919378698224853e-05, |
|
"loss": 0.4426, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 18.64, |
|
"grad_norm": 0.6520936489105225, |
|
"learning_rate": 1.6688239644970416e-05, |
|
"loss": 0.4401, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 18.93, |
|
"grad_norm": 0.7381883263587952, |
|
"learning_rate": 1.645710059171598e-05, |
|
"loss": 0.4381, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 19.23, |
|
"grad_norm": 0.6962296962738037, |
|
"learning_rate": 1.622596153846154e-05, |
|
"loss": 0.4399, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 19.53, |
|
"grad_norm": 0.5711750388145447, |
|
"learning_rate": 1.5994822485207098e-05, |
|
"loss": 0.4354, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 19.82, |
|
"grad_norm": 0.6115343570709229, |
|
"learning_rate": 1.576368343195266e-05, |
|
"loss": 0.437, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 20.12, |
|
"grad_norm": 0.6140381693840027, |
|
"learning_rate": 1.5532544378698223e-05, |
|
"loss": 0.4341, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 20.41, |
|
"grad_norm": 0.648704469203949, |
|
"learning_rate": 1.5301405325443786e-05, |
|
"loss": 0.4337, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 20.71, |
|
"grad_norm": 0.6556956171989441, |
|
"learning_rate": 1.507026627218935e-05, |
|
"loss": 0.4333, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 21.01, |
|
"grad_norm": 0.7024092674255371, |
|
"learning_rate": 1.483912721893491e-05, |
|
"loss": 0.4357, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 21.3, |
|
"grad_norm": 0.5994529128074646, |
|
"learning_rate": 1.4607988165680473e-05, |
|
"loss": 0.4311, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 21.6, |
|
"grad_norm": 0.599431037902832, |
|
"learning_rate": 1.4376849112426036e-05, |
|
"loss": 0.4334, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 21.89, |
|
"grad_norm": 0.6323761343955994, |
|
"learning_rate": 1.4145710059171597e-05, |
|
"loss": 0.4298, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 22.19, |
|
"grad_norm": 0.6665933132171631, |
|
"learning_rate": 1.391457100591716e-05, |
|
"loss": 0.4281, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 22.49, |
|
"grad_norm": 0.6103574633598328, |
|
"learning_rate": 1.3683431952662723e-05, |
|
"loss": 0.4311, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 22.78, |
|
"grad_norm": 0.5954911708831787, |
|
"learning_rate": 1.3452292899408284e-05, |
|
"loss": 0.4277, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 23.08, |
|
"grad_norm": 0.5706931352615356, |
|
"learning_rate": 1.3221153846153847e-05, |
|
"loss": 0.4278, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 23.37, |
|
"grad_norm": 0.5817924737930298, |
|
"learning_rate": 1.299001479289941e-05, |
|
"loss": 0.4239, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 23.67, |
|
"grad_norm": 0.591736912727356, |
|
"learning_rate": 1.2758875739644969e-05, |
|
"loss": 0.428, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 23.96, |
|
"grad_norm": 0.6267042756080627, |
|
"learning_rate": 1.2527736686390532e-05, |
|
"loss": 0.4275, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 24.26, |
|
"grad_norm": 0.5819630026817322, |
|
"learning_rate": 1.2296597633136095e-05, |
|
"loss": 0.4262, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 24.56, |
|
"grad_norm": 0.615161657333374, |
|
"learning_rate": 1.2065458579881656e-05, |
|
"loss": 0.4235, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 24.85, |
|
"grad_norm": 0.7147814631462097, |
|
"learning_rate": 1.1834319526627219e-05, |
|
"loss": 0.423, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 25.15, |
|
"grad_norm": 0.7751194834709167, |
|
"learning_rate": 1.1603180473372782e-05, |
|
"loss": 0.422, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 25.44, |
|
"grad_norm": 0.674323320388794, |
|
"learning_rate": 1.1372041420118345e-05, |
|
"loss": 0.4207, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 25.74, |
|
"grad_norm": 0.6965672969818115, |
|
"learning_rate": 1.1140902366863906e-05, |
|
"loss": 0.4244, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 26.04, |
|
"grad_norm": 0.6351442337036133, |
|
"learning_rate": 1.0909763313609469e-05, |
|
"loss": 0.4228, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 26.33, |
|
"grad_norm": 0.590655505657196, |
|
"learning_rate": 1.0678624260355031e-05, |
|
"loss": 0.4207, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 26.63, |
|
"grad_norm": 0.6553508639335632, |
|
"learning_rate": 1.044748520710059e-05, |
|
"loss": 0.422, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 26.92, |
|
"grad_norm": 0.6216753721237183, |
|
"learning_rate": 1.0216346153846154e-05, |
|
"loss": 0.4196, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 27.22, |
|
"grad_norm": 0.6628888249397278, |
|
"learning_rate": 9.985207100591717e-06, |
|
"loss": 0.4194, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 27.51, |
|
"grad_norm": 0.6111788749694824, |
|
"learning_rate": 9.754068047337278e-06, |
|
"loss": 0.4189, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 27.81, |
|
"grad_norm": 0.5751132965087891, |
|
"learning_rate": 9.52292899408284e-06, |
|
"loss": 0.4182, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 28.11, |
|
"grad_norm": 0.6333842873573303, |
|
"learning_rate": 9.291789940828403e-06, |
|
"loss": 0.4172, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 28.4, |
|
"grad_norm": 0.5846462845802307, |
|
"learning_rate": 9.060650887573965e-06, |
|
"loss": 0.417, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 28.7, |
|
"grad_norm": 0.5921066999435425, |
|
"learning_rate": 8.829511834319527e-06, |
|
"loss": 0.4178, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 28.99, |
|
"grad_norm": 0.6645215153694153, |
|
"learning_rate": 8.59837278106509e-06, |
|
"loss": 0.4166, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 29.29, |
|
"grad_norm": 0.6453720331192017, |
|
"learning_rate": 8.36723372781065e-06, |
|
"loss": 0.4142, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 29.59, |
|
"grad_norm": 0.6401262283325195, |
|
"learning_rate": 8.136094674556213e-06, |
|
"loss": 0.4152, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 29.88, |
|
"grad_norm": 0.6776517033576965, |
|
"learning_rate": 7.904955621301775e-06, |
|
"loss": 0.415, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 30.18, |
|
"grad_norm": 0.6697096228599548, |
|
"learning_rate": 7.673816568047338e-06, |
|
"loss": 0.4116, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 30.47, |
|
"grad_norm": 0.6276474595069885, |
|
"learning_rate": 7.442677514792899e-06, |
|
"loss": 0.4127, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 30.77, |
|
"grad_norm": 0.7491399049758911, |
|
"learning_rate": 7.211538461538462e-06, |
|
"loss": 0.4188, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 31.07, |
|
"grad_norm": 0.7292032837867737, |
|
"learning_rate": 6.980399408284024e-06, |
|
"loss": 0.4141, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 31.36, |
|
"grad_norm": 0.6432758569717407, |
|
"learning_rate": 6.749260355029585e-06, |
|
"loss": 0.4109, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 31.66, |
|
"grad_norm": 0.7142419815063477, |
|
"learning_rate": 6.518121301775148e-06, |
|
"loss": 0.4124, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 31.95, |
|
"grad_norm": 0.737123966217041, |
|
"learning_rate": 6.28698224852071e-06, |
|
"loss": 0.4106, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 32.25, |
|
"grad_norm": 0.6416926980018616, |
|
"learning_rate": 6.055843195266272e-06, |
|
"loss": 0.4131, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 32.54, |
|
"grad_norm": 0.8014604449272156, |
|
"learning_rate": 5.824704142011835e-06, |
|
"loss": 0.4113, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 32.84, |
|
"grad_norm": 0.745812714099884, |
|
"learning_rate": 5.593565088757396e-06, |
|
"loss": 0.4119, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 33.14, |
|
"grad_norm": 0.753264307975769, |
|
"learning_rate": 5.362426035502958e-06, |
|
"loss": 0.4109, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 33.43, |
|
"grad_norm": 0.7509620189666748, |
|
"learning_rate": 5.131286982248521e-06, |
|
"loss": 0.4094, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 33.73, |
|
"grad_norm": 0.6729797720909119, |
|
"learning_rate": 4.900147928994083e-06, |
|
"loss": 0.4086, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 34.02, |
|
"grad_norm": 0.6696120500564575, |
|
"learning_rate": 4.669008875739646e-06, |
|
"loss": 0.4109, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 34.32, |
|
"grad_norm": 0.591411828994751, |
|
"learning_rate": 4.437869822485207e-06, |
|
"loss": 0.4082, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 34.62, |
|
"grad_norm": 0.6425775289535522, |
|
"learning_rate": 4.206730769230769e-06, |
|
"loss": 0.4096, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 34.91, |
|
"grad_norm": 0.6289854645729065, |
|
"learning_rate": 3.975591715976332e-06, |
|
"loss": 0.4101, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 35.21, |
|
"grad_norm": 0.6215291023254395, |
|
"learning_rate": 3.7444526627218935e-06, |
|
"loss": 0.4094, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 35.5, |
|
"grad_norm": 0.7314534783363342, |
|
"learning_rate": 3.513313609467456e-06, |
|
"loss": 0.4062, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 35.8, |
|
"grad_norm": 0.6580629348754883, |
|
"learning_rate": 3.2821745562130175e-06, |
|
"loss": 0.4078, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 36.09, |
|
"grad_norm": 0.6824979186058044, |
|
"learning_rate": 3.05103550295858e-06, |
|
"loss": 0.4098, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 36.39, |
|
"grad_norm": 0.6403664946556091, |
|
"learning_rate": 2.8198964497041423e-06, |
|
"loss": 0.4078, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 36.69, |
|
"grad_norm": 0.6590360999107361, |
|
"learning_rate": 2.5887573964497043e-06, |
|
"loss": 0.4076, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 36.98, |
|
"grad_norm": 0.8094596862792969, |
|
"learning_rate": 2.3576183431952663e-06, |
|
"loss": 0.4059, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 37.28, |
|
"grad_norm": 0.6609135270118713, |
|
"learning_rate": 2.1264792899408283e-06, |
|
"loss": 0.4065, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 37.57, |
|
"grad_norm": 0.6913712024688721, |
|
"learning_rate": 1.8953402366863905e-06, |
|
"loss": 0.4033, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 37.87, |
|
"grad_norm": 0.6110714077949524, |
|
"learning_rate": 1.6642011834319528e-06, |
|
"loss": 0.4067, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 38.17, |
|
"grad_norm": 0.7397092580795288, |
|
"learning_rate": 1.4330621301775148e-06, |
|
"loss": 0.4056, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 38.46, |
|
"grad_norm": 0.6656752824783325, |
|
"learning_rate": 1.201923076923077e-06, |
|
"loss": 0.403, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 38.76, |
|
"grad_norm": 0.70158451795578, |
|
"learning_rate": 9.70784023668639e-07, |
|
"loss": 0.4063, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 39.05, |
|
"grad_norm": 0.7131240367889404, |
|
"learning_rate": 7.396449704142012e-07, |
|
"loss": 0.4058, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 39.35, |
|
"grad_norm": 0.7075223922729492, |
|
"learning_rate": 5.085059171597633e-07, |
|
"loss": 0.4058, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 39.64, |
|
"grad_norm": 0.7466796040534973, |
|
"learning_rate": 2.7736686390532544e-07, |
|
"loss": 0.4043, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 39.94, |
|
"grad_norm": 0.6051456332206726, |
|
"learning_rate": 4.6227810650887574e-08, |
|
"loss": 0.4047, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"step": 67600, |
|
"total_flos": 5.463929616806707e+19, |
|
"train_loss": 0.47529568801970173, |
|
"train_runtime": 7092.3542, |
|
"train_samples_per_second": 76.223, |
|
"train_steps_per_second": 9.531 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 67600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 40, |
|
"save_steps": 1000000000, |
|
"total_flos": 5.463929616806707e+19, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|