|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 389, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002570694087403599, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 1.7749, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005141388174807198, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 1.7549, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.007712082262210797, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5e-06, |
|
"loss": 1.7659, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.010282776349614395, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 1.7842, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.012853470437017995, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 1.8015, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.015424164524421594, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1e-05, |
|
"loss": 1.7534, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.017994858611825194, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1666666666666668e-05, |
|
"loss": 1.7747, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.02056555269922879, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 1.7783, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.02313624678663239, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 1.803, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.02570694087403599, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.7859, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.028277634961439587, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8333333333333333e-05, |
|
"loss": 1.7856, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.030848329048843187, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2e-05, |
|
"loss": 1.7825, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.033419023136246784, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9999652796146877e-05, |
|
"loss": 1.8127, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.03598971722365039, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9998611208697607e-05, |
|
"loss": 1.8115, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.038560411311053984, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9996875309980824e-05, |
|
"loss": 1.7544, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04113110539845758, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9994445220538678e-05, |
|
"loss": 1.7886, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.043701799485861184, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.999132110911845e-05, |
|
"loss": 1.7866, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.04627249357326478, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9987503192660842e-05, |
|
"loss": 1.7612, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.04884318766066838, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9982991736284914e-05, |
|
"loss": 1.7944, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.05141388174807198, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.997778705326968e-05, |
|
"loss": 1.7656, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05398457583547558, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9971889505032337e-05, |
|
"loss": 1.7554, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.056555269922879174, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9965299501103178e-05, |
|
"loss": 1.7637, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.05912596401028278, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.995801749909715e-05, |
|
"loss": 1.7803, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.061696658097686374, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.995004400468209e-05, |
|
"loss": 1.7402, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06426735218508997, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9941379571543597e-05, |
|
"loss": 1.7017, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06683804627249357, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9932024801346583e-05, |
|
"loss": 1.7671, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.06940874035989718, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.992198034369349e-05, |
|
"loss": 1.7014, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.07197943444730077, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.991124689607921e-05, |
|
"loss": 1.7532, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07455012853470437, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9899825203842613e-05, |
|
"loss": 1.7129, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.07712082262210797, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.988771606011481e-05, |
|
"loss": 1.7126, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07969151670951156, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.987492030576407e-05, |
|
"loss": 1.7393, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.08226221079691516, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.986143882933744e-05, |
|
"loss": 1.7742, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.08483290488431877, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9847272566999026e-05, |
|
"loss": 1.7483, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.08740359897172237, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9832422502465013e-05, |
|
"loss": 1.707, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.08997429305912596, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9816889666935318e-05, |
|
"loss": 1.7507, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.09254498714652956, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9800675139022006e-05, |
|
"loss": 1.7339, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.09511568123393316, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9783780044674402e-05, |
|
"loss": 1.748, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.09768637532133675, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.976620555710087e-05, |
|
"loss": 1.686, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.10025706940874037, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.974795289668737e-05, |
|
"loss": 1.7043, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.10282776349614396, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.972902333091271e-05, |
|
"loss": 1.7646, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10539845758354756, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9709418174260523e-05, |
|
"loss": 1.6802, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.10796915167095116, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9689138788127994e-05, |
|
"loss": 1.6775, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.11053984575835475, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.966818658073133e-05, |
|
"loss": 1.6633, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.11311053984575835, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9646563007007952e-05, |
|
"loss": 1.7637, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.11568123393316196, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9624269568515486e-05, |
|
"loss": 1.7087, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.11825192802056556, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.960130781332748e-05, |
|
"loss": 1.6562, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.12082262210796915, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.957767933592591e-05, |
|
"loss": 1.698, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.12339331619537275, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.955338577709046e-05, |
|
"loss": 1.7444, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.12596401028277635, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9528428823784567e-05, |
|
"loss": 1.6743, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.12853470437017994, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9502810209038302e-05, |
|
"loss": 1.6741, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13110539845758354, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9476531711828027e-05, |
|
"loss": 1.708, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.13367609254498714, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9449595156952827e-05, |
|
"loss": 1.6587, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.13624678663239073, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9422002414907837e-05, |
|
"loss": 1.6887, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.13881748071979436, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9393755401754324e-05, |
|
"loss": 1.6714, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.14138817480719795, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.936485607898665e-05, |
|
"loss": 1.7432, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.14395886889460155, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9335306453396066e-05, |
|
"loss": 1.6675, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.14652956298200515, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9305108576931336e-05, |
|
"loss": 1.6436, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.14910025706940874, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.927426454655627e-05, |
|
"loss": 1.6853, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.15167095115681234, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.924277650410412e-05, |
|
"loss": 1.6641, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.15424164524421594, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9210646636128805e-05, |
|
"loss": 1.7385, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.15681233933161953, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9177877173753127e-05, |
|
"loss": 1.7178, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.15938303341902313, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.91444703925138e-05, |
|
"loss": 1.6785, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.16195372750642673, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9110428612203463e-05, |
|
"loss": 1.6799, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.16452442159383032, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9075754196709574e-05, |
|
"loss": 1.7075, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.16709511568123395, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.904044955385026e-05, |
|
"loss": 1.6621, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.16966580976863754, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9004517135207127e-05, |
|
"loss": 1.6492, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.17223650385604114, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8967959435955027e-05, |
|
"loss": 1.7297, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.17480719794344474, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.893077899468876e-05, |
|
"loss": 1.6882, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.17737789203084833, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.889297839324682e-05, |
|
"loss": 1.6714, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.17994858611825193, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8854560256532098e-05, |
|
"loss": 1.6489, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.18251928020565553, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8815527252329624e-05, |
|
"loss": 1.6721, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.18508997429305912, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8775882091121282e-05, |
|
"loss": 1.6533, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.18766066838046272, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8735627525897618e-05, |
|
"loss": 1.6443, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.19023136246786632, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8694766351966665e-05, |
|
"loss": 1.6631, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.1928020565552699, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8653301406759827e-05, |
|
"loss": 1.6873, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.1953727506426735, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8611235569634852e-05, |
|
"loss": 1.7046, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.19794344473007713, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8568571761675893e-05, |
|
"loss": 1.7002, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.20051413881748073, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8525312945490647e-05, |
|
"loss": 1.698, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.20308483290488433, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8481462125004647e-05, |
|
"loss": 1.6765, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.20565552699228792, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8437022345252666e-05, |
|
"loss": 1.7185, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.20822622107969152, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8391996692167242e-05, |
|
"loss": 1.6653, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.21079691516709512, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8346388292364438e-05, |
|
"loss": 1.7129, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.2133676092544987, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8300200312926674e-05, |
|
"loss": 1.6709, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.2159383033419023, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8253435961182844e-05, |
|
"loss": 1.6597, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.2185089974293059, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8206098484485563e-05, |
|
"loss": 1.6812, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.2210796915167095, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8158191169985696e-05, |
|
"loss": 1.6792, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.2236503856041131, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.810971734440408e-05, |
|
"loss": 1.6404, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.2262210796915167, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.806068037380052e-05, |
|
"loss": 1.6528, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.22879177377892032, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.801108366334004e-05, |
|
"loss": 1.6775, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.23136246786632392, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.796093065705644e-05, |
|
"loss": 1.679, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.23393316195372751, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.791022483761312e-05, |
|
"loss": 1.658, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.2365038560411311, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7858969726061262e-05, |
|
"loss": 1.6277, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.2390745501285347, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7807168881595304e-05, |
|
"loss": 1.6602, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.2416452442159383, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7754825901305814e-05, |
|
"loss": 1.6758, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.2442159383033419, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7701944419929673e-05, |
|
"loss": 1.6353, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2467866323907455, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7648528109597704e-05, |
|
"loss": 1.6602, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2493573264781491, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7594580679579654e-05, |
|
"loss": 1.6404, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.2519280205655527, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7540105876026647e-05, |
|
"loss": 1.6365, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.2544987146529563, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7485107481711014e-05, |
|
"loss": 1.6353, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.2570694087403599, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7429589315763637e-05, |
|
"loss": 1.6541, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2596401028277635, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.737355523340875e-05, |
|
"loss": 1.6133, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.2622107969151671, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7317009125696208e-05, |
|
"loss": 1.6687, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.2647814910025707, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.725995491923131e-05, |
|
"loss": 1.636, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.26735218508997427, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7202396575902118e-05, |
|
"loss": 1.6497, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.2699228791773779, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.714433809260435e-05, |
|
"loss": 1.6458, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.27249357326478146, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7085783500963825e-05, |
|
"loss": 1.624, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.2750642673521851, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.702673686705651e-05, |
|
"loss": 1.6353, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.2776349614395887, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6967202291126174e-05, |
|
"loss": 1.6406, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2802056555269923, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.690718390729964e-05, |
|
"loss": 1.6323, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.2827763496143959, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.684668588329973e-05, |
|
"loss": 1.665, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2853470437017995, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6785712420155864e-05, |
|
"loss": 1.635, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.2879177377892031, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.67242677519123e-05, |
|
"loss": 1.6335, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.29048843187660667, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6662356145334158e-05, |
|
"loss": 1.6846, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.2930591259640103, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6599981899611103e-05, |
|
"loss": 1.6353, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.29562982005141386, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.653714934605883e-05, |
|
"loss": 1.6189, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.2982005141388175, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.647386284781828e-05, |
|
"loss": 1.7021, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.30077120822622105, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6410126799552653e-05, |
|
"loss": 1.6777, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.3033419023136247, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6345945627142264e-05, |
|
"loss": 1.6377, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.3059125964010283, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.628132378737718e-05, |
|
"loss": 1.6616, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.30848329048843187, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6216265767647756e-05, |
|
"loss": 1.616, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3110539845758355, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.615077608563302e-05, |
|
"loss": 1.6816, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.31362467866323906, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6084859288986957e-05, |
|
"loss": 1.6099, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.3161953727506427, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.601851995502272e-05, |
|
"loss": 1.6274, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.31876606683804626, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5951762690394788e-05, |
|
"loss": 1.6663, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3213367609254499, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5884592130779056e-05, |
|
"loss": 1.6494, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.32390745501285345, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.581701294055095e-05, |
|
"loss": 1.614, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.3264781491002571, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5749029812461515e-05, |
|
"loss": 1.6265, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.32904884318766064, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.568064746731156e-05, |
|
"loss": 1.5913, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.33161953727506427, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5611870653623826e-05, |
|
"loss": 1.5984, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.3341902313624679, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5542704147313257e-05, |
|
"loss": 1.6343, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.33676092544987146, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5473152751355353e-05, |
|
"loss": 1.6355, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.3393316195372751, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5403221295452647e-05, |
|
"loss": 1.647, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.34190231362467866, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5332914635699327e-05, |
|
"loss": 1.6191, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.3444730077120823, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5262237654244026e-05, |
|
"loss": 1.624, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.34704370179948585, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5191195258950804e-05, |
|
"loss": 1.6055, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.3496143958868895, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5119792383058338e-05, |
|
"loss": 1.6492, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.35218508997429304, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5048033984837352e-05, |
|
"loss": 1.6155, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.35475578406169667, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4975925047246319e-05, |
|
"loss": 1.6042, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.35732647814910024, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4903470577585433e-05, |
|
"loss": 1.6367, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.35989717223650386, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4830675607148899e-05, |
|
"loss": 1.5928, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.36246786632390743, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.475754519087557e-05, |
|
"loss": 1.6526, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.36503856041131105, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4684084406997903e-05, |
|
"loss": 1.6362, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.3676092544987147, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4610298356689341e-05, |
|
"loss": 1.6201, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.37017994858611825, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.453619216371008e-05, |
|
"loss": 1.6162, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.37275064267352187, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.446177097405127e-05, |
|
"loss": 1.6172, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.37532133676092544, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4387039955577668e-05, |
|
"loss": 1.6301, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.37789203084832906, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4312004297668791e-05, |
|
"loss": 1.6096, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.38046272493573263, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4236669210858544e-05, |
|
"loss": 1.6152, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.38303341902313626, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4161039926473412e-05, |
|
"loss": 1.6321, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.3856041131105398, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4085121696269185e-05, |
|
"loss": 1.5957, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.38817480719794345, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4008919792066273e-05, |
|
"loss": 1.6421, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.390745501285347, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3932439505383628e-05, |
|
"loss": 1.6189, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.39331619537275064, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.385568614707129e-05, |
|
"loss": 1.6106, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.39588688946015427, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3778665046941616e-05, |
|
"loss": 1.6321, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.39845758354755784, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3701381553399147e-05, |
|
"loss": 1.5796, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.40102827763496146, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3623841033069232e-05, |
|
"loss": 1.6555, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.40359897172236503, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3546048870425356e-05, |
|
"loss": 1.6028, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.40616966580976865, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3468010467415248e-05, |
|
"loss": 1.5969, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.4087403598971722, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3389731243085747e-05, |
|
"loss": 1.6077, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.41131105398457585, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3311216633206514e-05, |
|
"loss": 1.5762, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4138817480719794, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3232472089892567e-05, |
|
"loss": 1.6079, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.41645244215938304, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.315350308122567e-05, |
|
"loss": 1.5994, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.4190231362467866, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3074315090874652e-05, |
|
"loss": 1.5732, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.42159383033419023, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2994913617714573e-05, |
|
"loss": 1.5901, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.4241645244215938, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2915304175444929e-05, |
|
"loss": 1.6138, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.4267352185089974, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2835492292206735e-05, |
|
"loss": 1.5945, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.42930591259640105, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2755483510198668e-05, |
|
"loss": 1.6067, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.4318766066838046, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2675283385292212e-05, |
|
"loss": 1.5957, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.43444730077120824, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2594897486645836e-05, |
|
"loss": 1.6089, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.4370179948586118, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2514331396318298e-05, |
|
"loss": 1.6335, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.43958868894601544, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2433590708880991e-05, |
|
"loss": 1.6406, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.442159383033419, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2352681031029476e-05, |
|
"loss": 1.5759, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.44473007712082263, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2271607981194132e-05, |
|
"loss": 1.5955, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.4473007712082262, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2190377189150016e-05, |
|
"loss": 1.6069, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.4498714652956298, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2108994295625924e-05, |
|
"loss": 1.5796, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.4524421593830334, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2027464951912703e-05, |
|
"loss": 1.5952, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.455012853470437, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1945794819470805e-05, |
|
"loss": 1.6213, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.45758354755784064, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1863989569537165e-05, |
|
"loss": 1.5974, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.4601542416452442, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1782054882731377e-05, |
|
"loss": 1.5188, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.46272493573264784, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1699996448661242e-05, |
|
"loss": 1.5964, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4652956298200514, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.161781996552765e-05, |
|
"loss": 1.5681, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.46786632390745503, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1535531139728918e-05, |
|
"loss": 1.5938, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.4704370179948586, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1453135685464524e-05, |
|
"loss": 1.574, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.4730077120822622, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1370639324338313e-05, |
|
"loss": 1.5872, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.4755784061696658, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1288047784961166e-05, |
|
"loss": 1.5806, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.4781491002570694, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1205366802553231e-05, |
|
"loss": 1.5542, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.480719794344473, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1122602118545642e-05, |
|
"loss": 1.5723, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.4832904884318766, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1039759480181836e-05, |
|
"loss": 1.5645, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.48586118251928023, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0956844640118462e-05, |
|
"loss": 1.5884, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.4884318766066838, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0873863356025911e-05, |
|
"loss": 1.5559, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4910025706940874, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0790821390188493e-05, |
|
"loss": 1.5623, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.493573264781491, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0707724509104318e-05, |
|
"loss": 1.5916, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.4961439588688946, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.062457848308484e-05, |
|
"loss": 1.5696, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.4987146529562982, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0541389085854177e-05, |
|
"loss": 1.5913, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.5012853470437018, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0458162094148185e-05, |
|
"loss": 1.5439, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5038560411311054, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0374903287313307e-05, |
|
"loss": 1.6013, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.506426735218509, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.029161844690525e-05, |
|
"loss": 1.5813, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.5089974293059126, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0208313356287505e-05, |
|
"loss": 1.5757, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.5115681233933161, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0124993800229774e-05, |
|
"loss": 1.5508, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.5141388174807198, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.004166556450623e-05, |
|
"loss": 1.5774, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5167095115681234, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.958334435493776e-06, |
|
"loss": 1.594, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.519280205655527, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.87500619977023e-06, |
|
"loss": 1.5977, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.5218508997429306, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.791686643712498e-06, |
|
"loss": 1.5938, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.5244215938303342, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.708381553094754e-06, |
|
"loss": 1.5371, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5269922879177378, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.625096712686694e-06, |
|
"loss": 1.5315, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5295629820051414, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.541837905851817e-06, |
|
"loss": 1.5708, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.532133676092545, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.458610914145826e-06, |
|
"loss": 1.5691, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.5347043701799485, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.375421516915165e-06, |
|
"loss": 1.5881, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5372750642673522, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.292275490895685e-06, |
|
"loss": 1.5732, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.5398457583547558, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.209178609811509e-06, |
|
"loss": 1.5562, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5424164524421594, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.126136643974094e-06, |
|
"loss": 1.5603, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.5449871465295629, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.043155359881538e-06, |
|
"loss": 1.5352, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5475578406169666, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.960240519818167e-06, |
|
"loss": 1.5647, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.5501285347043702, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.877397881454358e-06, |
|
"loss": 1.5747, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.5526992287917738, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.79463319744677e-06, |
|
"loss": 1.5586, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.5552699228791774, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.711952215038837e-06, |
|
"loss": 1.5527, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5578406169665809, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.629360675661693e-06, |
|
"loss": 1.5374, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.5604113110539846, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.546864314535478e-06, |
|
"loss": 1.5647, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.5629820051413882, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.464468860271084e-06, |
|
"loss": 1.5356, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.5655526992287918, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.382180034472353e-06, |
|
"loss": 1.5483, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5681233933161953, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.30000355133876e-06, |
|
"loss": 1.5386, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.570694087403599, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.217945117268624e-06, |
|
"loss": 1.5552, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.5732647814910026, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.136010430462837e-06, |
|
"loss": 1.5635, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.5758354755784062, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.0542051805292e-06, |
|
"loss": 1.5657, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.5784061696658098, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.9725350480873e-06, |
|
"loss": 1.5386, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5809768637532133, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.89100570437408e-06, |
|
"loss": 1.6018, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.583547557840617, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.809622810849986e-06, |
|
"loss": 1.5396, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.5861182519280206, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.72839201880587e-06, |
|
"loss": 1.5474, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.5886889460154242, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.647318968970528e-06, |
|
"loss": 1.5654, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.5912596401028277, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.566409291119008e-06, |
|
"loss": 1.5732, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5938303341902313, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.485668603681706e-06, |
|
"loss": 1.5779, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.596401028277635, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.405102513354166e-06, |
|
"loss": 1.5449, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.5989717223650386, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.324716614707794e-06, |
|
"loss": 1.5408, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.6015424164524421, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.2445164898013345e-06, |
|
"loss": 1.5403, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.6041131105398457, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.1645077077932666e-06, |
|
"loss": 1.5159, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.6066838046272494, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.084695824555074e-06, |
|
"loss": 1.5557, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.609254498714653, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.005086382285426e-06, |
|
"loss": 1.5625, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.6118251928020566, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.925684909125354e-06, |
|
"loss": 1.552, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.6143958868894601, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.84649691877433e-06, |
|
"loss": 1.5488, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.6169665809768637, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.767527910107437e-06, |
|
"loss": 1.5181, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6195372750642674, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.688783366793488e-06, |
|
"loss": 1.5403, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.622107969151671, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.610268756914254e-06, |
|
"loss": 1.5662, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.6246786632390745, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.5319895325847535e-06, |
|
"loss": 1.5222, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.6272493573264781, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.453951129574644e-06, |
|
"loss": 1.5439, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.6298200514138818, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.3761589669307745e-06, |
|
"loss": 1.5312, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.6323907455012854, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.298618446600856e-06, |
|
"loss": 1.5383, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.6349614395886889, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.221334953058389e-06, |
|
"loss": 1.5393, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.6375321336760925, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.144313852928712e-06, |
|
"loss": 1.5247, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6401028277634961, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.067560494616374e-06, |
|
"loss": 1.5454, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.6426735218508998, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.9910802079337285e-06, |
|
"loss": 1.5215, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6452442159383034, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.9148783037308154e-06, |
|
"loss": 1.5427, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.6478149100257069, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.838960073526589e-06, |
|
"loss": 1.5427, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6503856041131105, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.763330789141457e-06, |
|
"loss": 1.5552, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.6529562982005142, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.687995702331211e-06, |
|
"loss": 1.5388, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.6555269922879178, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.612960044422335e-06, |
|
"loss": 1.5854, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.6580976863753213, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.538229025948729e-06, |
|
"loss": 1.5588, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6606683804627249, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.463807836289921e-06, |
|
"loss": 1.5217, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.6632390745501285, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.389701643310661e-06, |
|
"loss": 1.5066, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.6658097686375322, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.3159155930021e-06, |
|
"loss": 1.5327, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.6683804627249358, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.2424548091244334e-06, |
|
"loss": 1.5522, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6709511568123393, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.169324392851105e-06, |
|
"loss": 1.543, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.6735218508997429, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.096529422414571e-06, |
|
"loss": 1.5483, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.6760925449871465, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.0240749527536845e-06, |
|
"loss": 1.5234, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.6786632390745502, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.951966015162652e-06, |
|
"loss": 1.5315, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.6812339331619537, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.880207616941663e-06, |
|
"loss": 1.5193, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.6838046272493573, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.8088047410492e-06, |
|
"loss": 1.5586, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.6863753213367609, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.737762345755975e-06, |
|
"loss": 1.481, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.6889460154241646, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.667085364300678e-06, |
|
"loss": 1.5869, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.6915167095115681, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.596778704547359e-06, |
|
"loss": 1.5366, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.6940874035989717, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.526847248644652e-06, |
|
"loss": 1.5007, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6966580976863753, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.457295852686746e-06, |
|
"loss": 1.5352, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.699228791773779, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.388129346376177e-06, |
|
"loss": 1.5447, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.7017994858611826, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.319352532688444e-06, |
|
"loss": 1.5701, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.7043701799485861, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.250970187538484e-06, |
|
"loss": 1.5, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.7069408740359897, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.182987059449056e-06, |
|
"loss": 1.5513, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.7095115681233933, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.115407869220948e-06, |
|
"loss": 1.5007, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.712082262210797, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.048237309605216e-06, |
|
"loss": 1.5398, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.7146529562982005, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.981480044977284e-06, |
|
"loss": 1.5476, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.7172236503856041, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.915140711013044e-06, |
|
"loss": 1.5015, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.7197943444730077, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.849223914366981e-06, |
|
"loss": 1.5405, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7223650385604113, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.7837342323522454e-06, |
|
"loss": 1.5413, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.7249357326478149, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.7186762126228227e-06, |
|
"loss": 1.5874, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.7275064267352185, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.654054372857738e-06, |
|
"loss": 1.5122, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.7300771208226221, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.5898732004473523e-06, |
|
"loss": 1.55, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7326478149100257, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.5261371521817247e-06, |
|
"loss": 1.5337, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.7352185089974294, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.462850653941171e-06, |
|
"loss": 1.5159, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.7377892030848329, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.4000181003889e-06, |
|
"loss": 1.5139, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.7403598971722365, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.337643854665843e-06, |
|
"loss": 1.499, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7429305912596401, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.2757322480876996e-06, |
|
"loss": 1.5149, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.7455012853470437, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.2142875798441376e-06, |
|
"loss": 1.5098, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7480719794344473, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.15331411670027e-06, |
|
"loss": 1.5217, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.7506426735218509, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.092816092700366e-06, |
|
"loss": 1.5017, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.7532133676092545, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.032797708873828e-06, |
|
"loss": 1.5398, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.7557840616966581, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.97326313294349e-06, |
|
"loss": 1.4983, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.7583547557840618, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.914216499036178e-06, |
|
"loss": 1.5271, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.7609254498714653, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.855661907395655e-06, |
|
"loss": 1.5286, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7634961439588689, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.7976034240978834e-06, |
|
"loss": 1.4954, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.7660668380462725, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.740045080768694e-06, |
|
"loss": 1.4653, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.7686375321336761, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.6829908743037936e-06, |
|
"loss": 1.5271, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.7712082262210797, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.626444766591253e-06, |
|
"loss": 1.48, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7737789203084833, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.570410684236365e-06, |
|
"loss": 1.5093, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.7763496143958869, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.514892518288988e-06, |
|
"loss": 1.531, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.7789203084832905, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.4598941239733555e-06, |
|
"loss": 1.4795, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.781491002570694, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.4054193204203457e-06, |
|
"loss": 1.5056, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.7840616966580977, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.3514718904022993e-06, |
|
"loss": 1.4841, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.7866323907455013, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.2980555800703273e-06, |
|
"loss": 1.5337, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.7892030848329049, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.2451740986941905e-06, |
|
"loss": 1.5212, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.7917737789203085, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.1928311184046967e-06, |
|
"loss": 1.5308, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.794344473007712, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.1410302739387424e-06, |
|
"loss": 1.5159, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.7969151670951157, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.0897751623868833e-06, |
|
"loss": 1.5349, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7994858611825193, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.0390693429435626e-06, |
|
"loss": 1.5029, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.8020565552699229, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9889163366599607e-06, |
|
"loss": 1.519, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.8046272493573264, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.939319626199483e-06, |
|
"loss": 1.5054, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.8071979434447301, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.890282655595922e-06, |
|
"loss": 1.4736, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.8097686375321337, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8418088300143044e-06, |
|
"loss": 1.5242, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.8123393316195373, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7939015155144378e-06, |
|
"loss": 1.5208, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.8149100257069408, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7465640388171589e-06, |
|
"loss": 1.5332, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.8174807197943444, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6997996870733268e-06, |
|
"loss": 1.4978, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.8200514138817481, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6536117076355652e-06, |
|
"loss": 1.4961, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.8226221079691517, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6080033078327585e-06, |
|
"loss": 1.5559, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8251928020565553, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5629776547473397e-06, |
|
"loss": 1.5435, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.8277634961439588, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5185378749953538e-06, |
|
"loss": 1.4744, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.8303341902313625, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4746870545093528e-06, |
|
"loss": 1.4885, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.8329048843187661, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4314282383241097e-06, |
|
"loss": 1.5088, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.8354755784061697, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.388764430365147e-06, |
|
"loss": 1.4971, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.8380462724935732, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3466985932401743e-06, |
|
"loss": 1.5269, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.8406169665809768, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3052336480333372e-06, |
|
"loss": 1.5088, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.8431876606683805, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2643724741023845e-06, |
|
"loss": 1.5046, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.8457583547557841, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2241179088787192e-06, |
|
"loss": 1.5217, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.8483290488431876, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1844727476703776e-06, |
|
"loss": 1.4951, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8508997429305912, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1454397434679022e-06, |
|
"loss": 1.4941, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.8534704370179949, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1070216067531825e-06, |
|
"loss": 1.5122, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.8560411311053985, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0692210053112451e-06, |
|
"loss": 1.5427, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.8586118251928021, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.032040564044975e-06, |
|
"loss": 1.5278, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.8611825192802056, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.954828647928727e-07, |
|
"loss": 1.4768, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.8637532133676092, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.595504461497441e-07, |
|
"loss": 1.5066, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8663239074550129, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.242458032904311e-07, |
|
"loss": 1.4871, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.8688946015424165, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.895713877965373e-07, |
|
"loss": 1.5212, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.87146529562982, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.555296074861996e-07, |
|
"loss": 1.4919, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.8740359897172236, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.22122826246875e-07, |
|
"loss": 1.5476, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8766066838046273, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.89353363871197e-07, |
|
"loss": 1.5142, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.8791773778920309, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.572234958958846e-07, |
|
"loss": 1.5332, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.8817480719794345, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.2573545344373e-07, |
|
"loss": 1.4924, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.884318766066838, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.948914230686688e-07, |
|
"loss": 1.5181, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.8868894601542416, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.646935466039373e-07, |
|
"loss": 1.5137, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.8894601542416453, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.351439210133492e-07, |
|
"loss": 1.5056, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.8920308483290489, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.062445982456777e-07, |
|
"loss": 1.4688, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.8946015424164524, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.77997585092166e-07, |
|
"loss": 1.5146, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.897172236503856, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.504048430471753e-07, |
|
"loss": 1.4695, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.8997429305912596, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.234682881719766e-07, |
|
"loss": 1.5129, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9023136246786633, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.971897909616985e-07, |
|
"loss": 1.5061, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.9048843187660668, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.715711762154362e-07, |
|
"loss": 1.4722, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.9074550128534704, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.4661422290954495e-07, |
|
"loss": 1.5056, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.910025706940874, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.2232066407409067e-07, |
|
"loss": 1.5017, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.9125964010282777, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.986921866725202e-07, |
|
"loss": 1.5393, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.9151670951156813, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.7573043148451673e-07, |
|
"loss": 1.5034, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.9177377892030848, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.5343699299205003e-07, |
|
"loss": 1.5139, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.9203084832904884, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.3181341926867283e-07, |
|
"loss": 1.4788, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.922879177377892, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.1086121187200667e-07, |
|
"loss": 1.4746, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.9254498714652957, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.905818257394799e-07, |
|
"loss": 1.5112, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9280205655526992, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.7097666908729283e-07, |
|
"loss": 1.5071, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.9305912596401028, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.520471033126326e-07, |
|
"loss": 1.4773, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.9331619537275064, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.3379444289913344e-07, |
|
"loss": 1.5146, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.9357326478149101, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.1621995532559947e-07, |
|
"loss": 1.4978, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.9383033419023136, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9932486097799408e-07, |
|
"loss": 1.5183, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.9408740359897172, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8311033306468552e-07, |
|
"loss": 1.4761, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.9434447300771208, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6757749753498865e-07, |
|
"loss": 1.509, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.9460154241645244, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5272743300097316e-07, |
|
"loss": 1.5095, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9485861182519281, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3856117066256225e-07, |
|
"loss": 1.5361, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.9511568123393316, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2507969423593225e-07, |
|
"loss": 1.5051, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9537275064267352, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1228393988519381e-07, |
|
"loss": 1.5532, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.9562982005141388, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0017479615738957e-07, |
|
"loss": 1.553, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9588688946015425, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.875310392079118e-08, |
|
"loss": 1.5125, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.961439588688946, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.801965630651165e-08, |
|
"loss": 1.4321, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.9640102827763496, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.797519865342161e-08, |
|
"loss": 1.5005, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.9665809768637532, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.862042845640403e-08, |
|
"loss": 1.4973, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9691516709511568, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.9955995317908514e-08, |
|
"loss": 1.5449, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.9717223650385605, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.198250090284961e-08, |
|
"loss": 1.4795, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.974293059125964, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.47004988968247e-08, |
|
"loss": 1.5508, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.9768637532133676, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.8110494967664713e-08, |
|
"loss": 1.5095, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9794344473007712, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.221294673032004e-08, |
|
"loss": 1.5146, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.9820051413881749, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7008263715085904e-08, |
|
"loss": 1.5112, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.9845758354755784, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.24968073391607e-08, |
|
"loss": 1.5144, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.987146529562982, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.678890881552715e-09, |
|
"loss": 1.5459, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.9897172236503856, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.554779461323101e-09, |
|
"loss": 1.4885, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.9922879177377892, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.1246900191761463e-09, |
|
"loss": 1.4919, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.9948586118251928, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3887913023946652e-09, |
|
"loss": 1.5034, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.9974293059125964, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.4720385312492223e-10, |
|
"loss": 1.4812, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 1.3696, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 389, |
|
"total_flos": 1.4102482311698186e+18, |
|
"train_loss": 1.5950692380302056, |
|
"train_runtime": 5789.3639, |
|
"train_samples_per_second": 17.167, |
|
"train_steps_per_second": 0.067 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 389, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 3000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.4102482311698186e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|