{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.49747474747474746,
  "eval_steps": 98,
  "global_step": 197,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0025252525252525255, "grad_norm": 17.0, "learning_rate": 1.0000000000000002e-06, "loss": 1.9033, "step": 1 },
    { "epoch": 0.005050505050505051, "grad_norm": 19.5, "learning_rate": 2.0000000000000003e-06, "loss": 2.1028, "step": 2 },
    { "epoch": 0.007575757575757576, "grad_norm": 17.125, "learning_rate": 3e-06, "loss": 2.1133, "step": 3 },
    { "epoch": 0.010101010101010102, "grad_norm": 15.8125, "learning_rate": 4.000000000000001e-06, "loss": 2.0188, "step": 4 },
    { "epoch": 0.012626262626262626, "grad_norm": 14.375, "learning_rate": 5e-06, "loss": 1.9426, "step": 5 },
    { "epoch": 0.015151515151515152, "grad_norm": 11.0, "learning_rate": 6e-06, "loss": 1.7974, "step": 6 },
    { "epoch": 0.017676767676767676, "grad_norm": 9.1875, "learning_rate": 7e-06, "loss": 1.7561, "step": 7 },
    { "epoch": 0.020202020202020204, "grad_norm": 8.4375, "learning_rate": 8.000000000000001e-06, "loss": 1.7865, "step": 8 },
    { "epoch": 0.022727272727272728, "grad_norm": 9.5, "learning_rate": 9e-06, "loss": 1.7445, "step": 9 },
    { "epoch": 0.025252525252525252, "grad_norm": 8.6875, "learning_rate": 1e-05, "loss": 1.6558, "step": 10 },
    { "epoch": 0.027777777777777776, "grad_norm": 6.625, "learning_rate": 1.1000000000000001e-05, "loss": 1.2742, "step": 11 },
    { "epoch": 0.030303030303030304, "grad_norm": 6.28125, "learning_rate": 1.2e-05, "loss": 1.2157, "step": 12 },
    { "epoch": 0.03282828282828283, "grad_norm": 6.0, "learning_rate": 1.3000000000000001e-05, "loss": 1.24, "step": 13 },
    { "epoch": 0.03535353535353535, "grad_norm": 5.15625, "learning_rate": 1.4e-05, "loss": 1.1103, "step": 14 },
    { "epoch": 0.03787878787878788, "grad_norm": 4.0, "learning_rate": 1.5000000000000002e-05, "loss": 0.9098, "step": 15 },
    { "epoch": 0.04040404040404041, "grad_norm": 3.90625, "learning_rate": 1.6000000000000003e-05, "loss": 1.005, "step": 16 },
    { "epoch": 0.04292929292929293, "grad_norm": 3.203125, "learning_rate": 1.7e-05, "loss": 0.7924, "step": 17 },
    { "epoch": 0.045454545454545456, "grad_norm": 3.078125, "learning_rate": 1.8e-05, "loss": 0.8178, "step": 18 },
    { "epoch": 0.047979797979797977, "grad_norm": 2.46875, "learning_rate": 1.9e-05, "loss": 0.7241, "step": 19 },
    { "epoch": 0.050505050505050504, "grad_norm": 2.90625, "learning_rate": 2e-05, "loss": 0.9651, "step": 20 },
    { "epoch": 0.05303030303030303, "grad_norm": 2.53125, "learning_rate": 1.99468085106383e-05, "loss": 0.7474, "step": 21 },
    { "epoch": 0.05555555555555555, "grad_norm": 2.296875, "learning_rate": 1.9893617021276595e-05, "loss": 0.7503, "step": 22 },
    { "epoch": 0.05808080808080808, "grad_norm": 2.421875, "learning_rate": 1.9840425531914894e-05, "loss": 0.7411, "step": 23 },
    { "epoch": 0.06060606060606061, "grad_norm": 2.265625, "learning_rate": 1.9787234042553193e-05, "loss": 0.7489, "step": 24 },
    { "epoch": 0.06313131313131314, "grad_norm": 2.25, "learning_rate": 1.973404255319149e-05, "loss": 0.7545, "step": 25 },
    { "epoch": 0.06565656565656566, "grad_norm": 2.25, "learning_rate": 1.968085106382979e-05, "loss": 0.6775, "step": 26 },
    { "epoch": 0.06818181818181818, "grad_norm": 2.25, "learning_rate": 1.962765957446809e-05, "loss": 0.7888, "step": 27 },
    { "epoch": 0.0707070707070707, "grad_norm": 2.21875, "learning_rate": 1.9574468085106384e-05, "loss": 0.8352, "step": 28 },
    { "epoch": 0.07323232323232323, "grad_norm": 2.125, "learning_rate": 1.9521276595744682e-05, "loss": 0.6698, "step": 29 },
    { "epoch": 0.07575757575757576, "grad_norm": 2.5625, "learning_rate": 1.946808510638298e-05, "loss": 0.8309, "step": 30 },
    { "epoch": 0.07828282828282829, "grad_norm": 2.234375, "learning_rate": 1.9414893617021276e-05, "loss": 0.6307, "step": 31 },
    { "epoch": 0.08080808080808081, "grad_norm": 2.421875, "learning_rate": 1.9361702127659575e-05, "loss": 0.6557, "step": 32 },
    { "epoch": 0.08333333333333333, "grad_norm": 2.328125, "learning_rate": 1.9308510638297873e-05, "loss": 0.6325, "step": 33 },
    { "epoch": 0.08585858585858586, "grad_norm": 2.609375, "learning_rate": 1.9255319148936172e-05, "loss": 0.7254, "step": 34 },
    { "epoch": 0.08838383838383838, "grad_norm": 1.9453125, "learning_rate": 1.920212765957447e-05, "loss": 0.6736, "step": 35 },
    { "epoch": 0.09090909090909091, "grad_norm": 1.7265625, "learning_rate": 1.914893617021277e-05, "loss": 0.74, "step": 36 },
    { "epoch": 0.09343434343434344, "grad_norm": 1.6484375, "learning_rate": 1.9095744680851064e-05, "loss": 0.6725, "step": 37 },
    { "epoch": 0.09595959595959595, "grad_norm": 1.6640625, "learning_rate": 1.9042553191489363e-05, "loss": 0.631, "step": 38 },
    { "epoch": 0.09848484848484848, "grad_norm": 1.5546875, "learning_rate": 1.898936170212766e-05, "loss": 0.6208, "step": 39 },
    { "epoch": 0.10101010101010101, "grad_norm": 1.4765625, "learning_rate": 1.893617021276596e-05, "loss": 0.5535, "step": 40 },
    { "epoch": 0.10353535353535354, "grad_norm": 1.5859375, "learning_rate": 1.888297872340426e-05, "loss": 0.6425, "step": 41 },
    { "epoch": 0.10606060606060606, "grad_norm": 1.5, "learning_rate": 1.8829787234042557e-05, "loss": 0.5682, "step": 42 },
    { "epoch": 0.10858585858585859, "grad_norm": 1.5390625, "learning_rate": 1.8776595744680852e-05, "loss": 0.6053, "step": 43 },
    { "epoch": 0.1111111111111111, "grad_norm": 1.546875, "learning_rate": 1.872340425531915e-05, "loss": 0.5894, "step": 44 },
    { "epoch": 0.11363636363636363, "grad_norm": 1.6640625, "learning_rate": 1.8670212765957446e-05, "loss": 0.523, "step": 45 },
    { "epoch": 0.11616161616161616, "grad_norm": 1.5, "learning_rate": 1.8617021276595745e-05, "loss": 0.5638, "step": 46 },
    { "epoch": 0.11868686868686869, "grad_norm": 1.3515625, "learning_rate": 1.8563829787234043e-05, "loss": 0.533, "step": 47 },
    { "epoch": 0.12121212121212122, "grad_norm": 1.4453125, "learning_rate": 1.8510638297872342e-05, "loss": 0.5829, "step": 48 },
    { "epoch": 0.12373737373737374, "grad_norm": 1.484375, "learning_rate": 1.845744680851064e-05, "loss": 0.6252, "step": 49 },
    { "epoch": 0.12626262626262627, "grad_norm": 1.609375, "learning_rate": 1.840425531914894e-05, "loss": 0.641, "step": 50 },
    { "epoch": 0.12878787878787878, "grad_norm": 1.2734375, "learning_rate": 1.8351063829787234e-05, "loss": 0.4742, "step": 51 },
    { "epoch": 0.13131313131313133, "grad_norm": 1.4765625, "learning_rate": 1.8297872340425533e-05, "loss": 0.5409, "step": 52 },
    { "epoch": 0.13383838383838384, "grad_norm": 1.640625, "learning_rate": 1.824468085106383e-05, "loss": 0.6769, "step": 53 },
    { "epoch": 0.13636363636363635, "grad_norm": 1.4375, "learning_rate": 1.8191489361702127e-05, "loss": 0.5134, "step": 54 },
    { "epoch": 0.1388888888888889, "grad_norm": 1.5625, "learning_rate": 1.8138297872340425e-05, "loss": 0.6641, "step": 55 },
    { "epoch": 0.1414141414141414, "grad_norm": 1.546875, "learning_rate": 1.8085106382978724e-05, "loss": 0.6039, "step": 56 },
    { "epoch": 0.14393939393939395, "grad_norm": 1.40625, "learning_rate": 1.8031914893617023e-05, "loss": 0.5456, "step": 57 },
    { "epoch": 0.14646464646464646, "grad_norm": 1.515625, "learning_rate": 1.797872340425532e-05, "loss": 0.6185, "step": 58 },
    { "epoch": 0.14898989898989898, "grad_norm": 1.5703125, "learning_rate": 1.792553191489362e-05, "loss": 0.6234, "step": 59 },
    { "epoch": 0.15151515151515152, "grad_norm": 1.625, "learning_rate": 1.7872340425531915e-05, "loss": 0.6293, "step": 60 },
    { "epoch": 0.15404040404040403, "grad_norm": 1.4609375, "learning_rate": 1.7819148936170214e-05, "loss": 0.5785, "step": 61 },
    { "epoch": 0.15656565656565657, "grad_norm": 1.5, "learning_rate": 1.7765957446808512e-05, "loss": 0.5021, "step": 62 },
    { "epoch": 0.1590909090909091, "grad_norm": 1.5, "learning_rate": 1.7712765957446807e-05, "loss": 0.6317, "step": 63 },
    { "epoch": 0.16161616161616163, "grad_norm": 1.5234375, "learning_rate": 1.765957446808511e-05, "loss": 0.56, "step": 64 },
    { "epoch": 0.16414141414141414, "grad_norm": 1.4921875, "learning_rate": 1.7606382978723408e-05, "loss": 0.5755, "step": 65 },
    { "epoch": 0.16666666666666666, "grad_norm": 1.6171875, "learning_rate": 1.7553191489361703e-05, "loss": 0.6197, "step": 66 },
    { "epoch": 0.1691919191919192, "grad_norm": 1.453125, "learning_rate": 1.7500000000000002e-05, "loss": 0.5798, "step": 67 },
    { "epoch": 0.1717171717171717, "grad_norm": 1.3984375, "learning_rate": 1.74468085106383e-05, "loss": 0.5906, "step": 68 },
    { "epoch": 0.17424242424242425, "grad_norm": 1.6796875, "learning_rate": 1.7393617021276596e-05, "loss": 0.7807, "step": 69 },
    { "epoch": 0.17676767676767677, "grad_norm": 1.609375, "learning_rate": 1.7340425531914894e-05, "loss": 0.626, "step": 70 },
    { "epoch": 0.17929292929292928, "grad_norm": 1.4453125, "learning_rate": 1.7287234042553193e-05, "loss": 0.6278, "step": 71 },
    { "epoch": 0.18181818181818182, "grad_norm": 1.5546875, "learning_rate": 1.723404255319149e-05, "loss": 0.5956, "step": 72 },
    { "epoch": 0.18434343434343434, "grad_norm": 1.40625, "learning_rate": 1.718085106382979e-05, "loss": 0.5488, "step": 73 },
    { "epoch": 0.18686868686868688, "grad_norm": 1.53125, "learning_rate": 1.712765957446809e-05, "loss": 0.5977, "step": 74 },
    { "epoch": 0.1893939393939394, "grad_norm": 1.40625, "learning_rate": 1.7074468085106384e-05, "loss": 0.5672, "step": 75 },
    { "epoch": 0.1919191919191919, "grad_norm": 1.6875, "learning_rate": 1.7021276595744682e-05, "loss": 0.5855, "step": 76 },
    { "epoch": 0.19444444444444445, "grad_norm": 1.4453125, "learning_rate": 1.696808510638298e-05, "loss": 0.5899, "step": 77 },
    { "epoch": 0.19696969696969696, "grad_norm": 1.3125, "learning_rate": 1.6914893617021276e-05, "loss": 0.5164, "step": 78 },
    { "epoch": 0.1994949494949495, "grad_norm": 1.5078125, "learning_rate": 1.6861702127659575e-05, "loss": 0.7067, "step": 79 },
    { "epoch": 0.20202020202020202, "grad_norm": 1.828125, "learning_rate": 1.6808510638297873e-05, "loss": 0.7673, "step": 80 },
    { "epoch": 0.20454545454545456, "grad_norm": 1.5078125, "learning_rate": 1.6755319148936172e-05, "loss": 0.5957, "step": 81 },
    { "epoch": 0.20707070707070707, "grad_norm": 1.59375, "learning_rate": 1.670212765957447e-05, "loss": 0.593, "step": 82 },
    { "epoch": 0.20959595959595959, "grad_norm": 1.546875, "learning_rate": 1.664893617021277e-05, "loss": 0.6161, "step": 83 },
    { "epoch": 0.21212121212121213, "grad_norm": 1.4609375, "learning_rate": 1.6595744680851064e-05, "loss": 0.5788, "step": 84 },
    { "epoch": 0.21464646464646464, "grad_norm": 1.4453125, "learning_rate": 1.6542553191489363e-05, "loss": 0.5945, "step": 85 },
    { "epoch": 0.21717171717171718, "grad_norm": 1.4375, "learning_rate": 1.648936170212766e-05, "loss": 0.5936, "step": 86 },
    { "epoch": 0.2196969696969697, "grad_norm": 1.4609375, "learning_rate": 1.6436170212765957e-05, "loss": 0.5393, "step": 87 },
    { "epoch": 0.2222222222222222, "grad_norm": 1.453125, "learning_rate": 1.6382978723404255e-05, "loss": 0.5319, "step": 88 },
    { "epoch": 0.22474747474747475, "grad_norm": 1.734375, "learning_rate": 1.6329787234042554e-05, "loss": 0.7014, "step": 89 },
    { "epoch": 0.22727272727272727, "grad_norm": 1.546875, "learning_rate": 1.6276595744680853e-05, "loss": 0.6423, "step": 90 },
    { "epoch": 0.2297979797979798, "grad_norm": 1.546875, "learning_rate": 1.622340425531915e-05, "loss": 0.7358, "step": 91 },
    { "epoch": 0.23232323232323232, "grad_norm": 1.359375, "learning_rate": 1.6170212765957446e-05, "loss": 0.4773, "step": 92 },
    { "epoch": 0.23484848484848486, "grad_norm": 1.65625, "learning_rate": 1.6117021276595745e-05, "loss": 0.5955, "step": 93 },
    { "epoch": 0.23737373737373738, "grad_norm": 1.53125, "learning_rate": 1.6063829787234044e-05, "loss": 0.6716, "step": 94 },
    { "epoch": 0.2398989898989899, "grad_norm": 1.6328125, "learning_rate": 1.6010638297872342e-05, "loss": 0.6673, "step": 95 },
    { "epoch": 0.24242424242424243, "grad_norm": 1.90625, "learning_rate": 1.595744680851064e-05, "loss": 0.7431, "step": 96 },
    { "epoch": 0.24494949494949494, "grad_norm": 1.4140625, "learning_rate": 1.590425531914894e-05, "loss": 0.5161, "step": 97 },
    { "epoch": 0.2474747474747475, "grad_norm": 1.3984375, "learning_rate": 1.5851063829787235e-05, "loss": 0.5427, "step": 98 },
    { "epoch": 0.2474747474747475, "eval_loss": 0.4802989363670349, "eval_model_preparation_time": 0.0001, "eval_runtime": 4.4682, "eval_samples_per_second": 23.723, "eval_steps_per_second": 3.133, "step": 98 },
    { "epoch": 0.25, "grad_norm": 1.515625, "learning_rate": 1.5797872340425533e-05, "loss": 0.5947, "step": 99 },
    { "epoch": 0.25252525252525254, "grad_norm": 1.4140625, "learning_rate": 1.5744680851063832e-05, "loss": 0.6192, "step": 100 },
    { "epoch": 0.255050505050505, "grad_norm": 1.46875, "learning_rate": 1.5691489361702127e-05, "loss": 0.568, "step": 101 },
    { "epoch": 0.25757575757575757, "grad_norm": 1.65625, "learning_rate": 1.5638297872340426e-05, "loss": 0.6521, "step": 102 },
    { "epoch": 0.2601010101010101, "grad_norm": 1.4453125, "learning_rate": 1.5585106382978724e-05, "loss": 0.65, "step": 103 },
    { "epoch": 0.26262626262626265, "grad_norm": 1.390625, "learning_rate": 1.5531914893617023e-05, "loss": 0.4976, "step": 104 },
    { "epoch": 0.26515151515151514, "grad_norm": 1.40625, "learning_rate": 1.547872340425532e-05, "loss": 0.5354, "step": 105 },
    { "epoch": 0.2676767676767677, "grad_norm": 1.3203125, "learning_rate": 1.542553191489362e-05, "loss": 0.5232, "step": 106 },
    { "epoch": 0.2702020202020202, "grad_norm": 1.2734375, "learning_rate": 1.5372340425531915e-05, "loss": 0.4453, "step": 107 },
    { "epoch": 0.2727272727272727, "grad_norm": 1.5078125, "learning_rate": 1.5319148936170214e-05, "loss": 0.6146, "step": 108 },
    { "epoch": 0.27525252525252525, "grad_norm": 1.2578125, "learning_rate": 1.5265957446808512e-05, "loss": 0.4385, "step": 109 },
    { "epoch": 0.2777777777777778, "grad_norm": 1.5078125, "learning_rate": 1.521276595744681e-05, "loss": 0.5709, "step": 110 },
    { "epoch": 0.2803030303030303, "grad_norm": 1.3984375, "learning_rate": 1.5159574468085108e-05, "loss": 0.5547, "step": 111 },
    { "epoch": 0.2828282828282828, "grad_norm": 1.5, "learning_rate": 1.5106382978723407e-05, "loss": 0.5413, "step": 112 },
    { "epoch": 0.28535353535353536, "grad_norm": 1.3984375, "learning_rate": 1.5053191489361702e-05, "loss": 0.5048, "step": 113 },
    { "epoch": 0.2878787878787879, "grad_norm": 1.5390625, "learning_rate": 1.5000000000000002e-05, "loss": 0.6009, "step": 114 },
    { "epoch": 0.2904040404040404, "grad_norm": 1.28125, "learning_rate": 1.49468085106383e-05, "loss": 0.472, "step": 115 },
    { "epoch": 0.29292929292929293, "grad_norm": 1.4921875, "learning_rate": 1.4893617021276596e-05, "loss": 0.6086, "step": 116 },
    { "epoch": 0.29545454545454547, "grad_norm": 1.5859375, "learning_rate": 1.4840425531914894e-05, "loss": 0.5837, "step": 117 },
    { "epoch": 0.29797979797979796, "grad_norm": 1.4921875, "learning_rate": 1.4787234042553193e-05, "loss": 0.5546, "step": 118 },
    { "epoch": 0.3005050505050505, "grad_norm": 1.4453125, "learning_rate": 1.473404255319149e-05, "loss": 0.4623, "step": 119 },
    { "epoch": 0.30303030303030304, "grad_norm": 1.40625, "learning_rate": 1.4680851063829789e-05, "loss": 0.492, "step": 120 },
    { "epoch": 0.3055555555555556, "grad_norm": 1.5546875, "learning_rate": 1.4627659574468087e-05, "loss": 0.5731, "step": 121 },
    { "epoch": 0.30808080808080807, "grad_norm": 1.5390625, "learning_rate": 1.4574468085106384e-05, "loss": 0.5626, "step": 122 },
    { "epoch": 0.3106060606060606, "grad_norm": 1.3046875, "learning_rate": 1.4521276595744683e-05, "loss": 0.475, "step": 123 },
    { "epoch": 0.31313131313131315, "grad_norm": 1.3828125, "learning_rate": 1.4468085106382981e-05, "loss": 0.552, "step": 124 },
    { "epoch": 0.31565656565656564, "grad_norm": 1.5234375, "learning_rate": 1.4414893617021276e-05, "loss": 0.525, "step": 125 },
    { "epoch": 0.3181818181818182, "grad_norm": 1.5390625, "learning_rate": 1.4361702127659575e-05, "loss": 0.5764, "step": 126 },
    { "epoch": 0.3207070707070707, "grad_norm": 1.5703125, "learning_rate": 1.4308510638297874e-05, "loss": 0.5816, "step": 127 },
    { "epoch": 0.32323232323232326, "grad_norm": 1.453125, "learning_rate": 1.425531914893617e-05, "loss": 0.5908, "step": 128 },
    { "epoch": 0.32575757575757575, "grad_norm": 1.4140625, "learning_rate": 1.420212765957447e-05, "loss": 0.5943, "step": 129 },
    { "epoch": 0.3282828282828283, "grad_norm": 1.2890625, "learning_rate": 1.4148936170212768e-05, "loss": 0.4884, "step": 130 },
    { "epoch": 0.33080808080808083, "grad_norm": 1.6875, "learning_rate": 1.4095744680851065e-05, "loss": 0.5799, "step": 131 },
    { "epoch": 0.3333333333333333, "grad_norm": 1.4140625, "learning_rate": 1.4042553191489363e-05, "loss": 0.5285, "step": 132 },
    { "epoch": 0.33585858585858586, "grad_norm": 1.4765625, "learning_rate": 1.3989361702127662e-05, "loss": 0.4816, "step": 133 },
    { "epoch": 0.3383838383838384, "grad_norm": 1.5, "learning_rate": 1.3936170212765959e-05, "loss": 0.6294, "step": 134 },
    { "epoch": 0.3409090909090909, "grad_norm": 1.3046875, "learning_rate": 1.3882978723404257e-05, "loss": 0.4614, "step": 135 },
    { "epoch": 0.3434343434343434, "grad_norm": 1.4765625, "learning_rate": 1.3829787234042556e-05, "loss": 0.5877, "step": 136 },
    { "epoch": 0.34595959595959597, "grad_norm": 1.2265625, "learning_rate": 1.3776595744680851e-05, "loss": 0.4327, "step": 137 },
    { "epoch": 0.3484848484848485, "grad_norm": 1.4765625, "learning_rate": 1.372340425531915e-05, "loss": 0.5889, "step": 138 },
    { "epoch": 0.351010101010101, "grad_norm": 1.5, "learning_rate": 1.3670212765957447e-05, "loss": 0.5389, "step": 139 },
    { "epoch": 0.35353535353535354, "grad_norm": 1.21875, "learning_rate": 1.3617021276595745e-05, "loss": 0.4373, "step": 140 },
    { "epoch": 0.3560606060606061, "grad_norm": 1.5703125, "learning_rate": 1.3563829787234044e-05, "loss": 0.5761, "step": 141 },
    { "epoch": 0.35858585858585856, "grad_norm": 1.625, "learning_rate": 1.351063829787234e-05, "loss": 0.7717, "step": 142 },
    { "epoch": 0.3611111111111111, "grad_norm": 1.2578125, "learning_rate": 1.345744680851064e-05, "loss": 0.4974, "step": 143 },
    { "epoch": 0.36363636363636365, "grad_norm": 1.5078125, "learning_rate": 1.3404255319148938e-05, "loss": 0.5615, "step": 144 },
    { "epoch": 0.3661616161616162, "grad_norm": 1.34375, "learning_rate": 1.3351063829787235e-05, "loss": 0.5081, "step": 145 },
    { "epoch": 0.3686868686868687, "grad_norm": 1.421875, "learning_rate": 1.3297872340425533e-05, "loss": 0.486, "step": 146 },
    { "epoch": 0.3712121212121212, "grad_norm": 1.28125, "learning_rate": 1.3244680851063832e-05, "loss": 0.4847, "step": 147 },
    { "epoch": 0.37373737373737376, "grad_norm": 1.53125, "learning_rate": 1.3191489361702127e-05, "loss": 0.5955, "step": 148 },
    { "epoch": 0.37626262626262624, "grad_norm": 1.3515625, "learning_rate": 1.3138297872340426e-05, "loss": 0.46, "step": 149 },
    { "epoch": 0.3787878787878788, "grad_norm": 1.4375, "learning_rate": 1.3085106382978724e-05, "loss": 0.5995, "step": 150 },
    { "epoch": 0.3813131313131313, "grad_norm": 1.3515625, "learning_rate": 1.3031914893617021e-05, "loss": 0.468, "step": 151 },
    { "epoch": 0.3838383838383838, "grad_norm": 1.984375, "learning_rate": 1.297872340425532e-05, "loss": 0.5832, "step": 152 },
    { "epoch": 0.38636363636363635, "grad_norm": 1.40625, "learning_rate": 1.2925531914893619e-05, "loss": 0.6264, "step": 153 },
    { "epoch": 0.3888888888888889, "grad_norm": 1.515625, "learning_rate": 1.2872340425531915e-05, "loss": 0.5987, "step": 154 },
    { "epoch": 0.39141414141414144, "grad_norm": 1.390625, "learning_rate": 1.2819148936170214e-05, "loss": 0.5271, "step": 155 },
    { "epoch": 0.3939393939393939, "grad_norm": 1.28125, "learning_rate": 1.2765957446808513e-05, "loss": 0.4722, "step": 156 },
    { "epoch": 0.39646464646464646, "grad_norm": 1.484375, "learning_rate": 1.2712765957446808e-05, "loss": 0.6128, "step": 157 },
    { "epoch": 0.398989898989899, "grad_norm": 1.515625, "learning_rate": 1.2659574468085108e-05, "loss": 0.7296, "step": 158 },
    { "epoch": 0.4015151515151515, "grad_norm": 1.5703125, "learning_rate": 1.2606382978723407e-05, "loss": 0.658, "step": 159 },
    { "epoch": 0.40404040404040403, "grad_norm": 1.3828125, "learning_rate": 1.2553191489361702e-05, "loss": 0.5477, "step": 160 },
    { "epoch": 0.4065656565656566, "grad_norm": 1.453125, "learning_rate": 1.25e-05, "loss": 0.5191, "step": 161 },
    { "epoch": 0.4090909090909091, "grad_norm": 1.5859375, "learning_rate": 1.24468085106383e-05, "loss": 0.6534, "step": 162 },
    { "epoch": 0.4116161616161616, "grad_norm": 1.4140625, "learning_rate": 1.2393617021276596e-05, "loss": 0.5652, "step": 163 },
    { "epoch": 0.41414141414141414, "grad_norm": 1.4609375, "learning_rate": 1.2340425531914895e-05, "loss": 0.7201, "step": 164 },
    { "epoch": 0.4166666666666667, "grad_norm": 1.484375, "learning_rate": 1.2287234042553193e-05, "loss": 0.6026, "step": 165 },
    { "epoch": 0.41919191919191917, "grad_norm": 1.375, "learning_rate": 1.223404255319149e-05, "loss": 0.5286, "step": 166 },
    { "epoch": 0.4217171717171717, "grad_norm": 1.3359375, "learning_rate": 1.2180851063829789e-05, "loss": 0.5045, "step": 167 },
    { "epoch": 0.42424242424242425, "grad_norm": 1.3671875, "learning_rate": 1.2127659574468087e-05, "loss": 0.5062, "step": 168 },
    { "epoch": 0.42676767676767674, "grad_norm": 1.5546875, "learning_rate": 1.2074468085106383e-05, "loss": 0.5936, "step": 169 },
    { "epoch": 0.4292929292929293, "grad_norm": 1.3828125, "learning_rate": 1.2021276595744681e-05, "loss": 0.4753, "step": 170 },
    { "epoch": 0.4318181818181818, "grad_norm": 1.5859375, "learning_rate": 1.196808510638298e-05, "loss": 0.5516, "step": 171 },
    { "epoch": 0.43434343434343436, "grad_norm": 1.4375, "learning_rate": 1.1914893617021277e-05, "loss": 0.571, "step": 172 },
    { "epoch": 0.43686868686868685, "grad_norm": 1.546875, "learning_rate": 1.1861702127659575e-05, "loss": 0.5234, "step": 173 },
    { "epoch": 0.4393939393939394, "grad_norm": 1.46875, "learning_rate": 1.1808510638297874e-05, "loss": 0.5271, "step": 174 },
    { "epoch": 0.44191919191919193, "grad_norm": 1.890625, "learning_rate": 1.175531914893617e-05, "loss": 0.6933, "step": 175 },
    { "epoch": 0.4444444444444444, "grad_norm": 1.5, "learning_rate": 1.170212765957447e-05, "loss": 0.5311, "step": 176 },
    { "epoch": 0.44696969696969696, "grad_norm": 1.4765625, "learning_rate": 1.1648936170212768e-05, "loss": 0.4916, "step": 177 },
    { "epoch": 0.4494949494949495, "grad_norm": 1.3671875, "learning_rate": 1.1595744680851065e-05, "loss": 0.6005, "step": 178 },
    { "epoch": 0.45202020202020204, "grad_norm": 1.34375, "learning_rate": 1.1542553191489364e-05, "loss": 0.4982, "step": 179 },
    { "epoch": 0.45454545454545453, "grad_norm": 1.359375, "learning_rate": 1.1489361702127662e-05, "loss": 0.5284, "step": 180 },
    { "epoch": 0.45707070707070707, "grad_norm": 1.296875, "learning_rate": 1.1436170212765957e-05, "loss": 0.4811, "step": 181 },
    { "epoch": 0.4595959595959596, "grad_norm": 1.3671875, "learning_rate": 1.1382978723404256e-05, "loss": 0.4331, "step": 182 },
    { "epoch": 0.4621212121212121, "grad_norm": 1.609375, "learning_rate": 1.1329787234042555e-05, "loss": 0.5808, "step": 183 },
    { "epoch": 0.46464646464646464, "grad_norm": 1.71875, "learning_rate": 1.1276595744680851e-05, "loss": 0.7746, "step": 184 },
    { "epoch": 0.4671717171717172, "grad_norm": 1.4609375, "learning_rate": 1.122340425531915e-05, "loss": 0.5737, "step": 185 },
    { "epoch": 0.4696969696969697, "grad_norm": 1.5078125, "learning_rate": 1.1170212765957447e-05, "loss": 0.5221, "step": 186 },
    { "epoch": 0.4722222222222222, "grad_norm": 1.5703125, "learning_rate": 1.1117021276595746e-05, "loss": 0.6547, "step": 187 },
    { "epoch": 0.47474747474747475, "grad_norm": 1.375, "learning_rate": 1.1063829787234044e-05, "loss": 0.5264, "step": 188 },
    { "epoch": 0.4772727272727273, "grad_norm": 1.5625, "learning_rate": 1.1010638297872341e-05, "loss": 0.5393, "step": 189 },
    { "epoch": 0.4797979797979798, "grad_norm": 1.4609375, "learning_rate": 1.095744680851064e-05, "loss": 0.5669, "step": 190 },
    { "epoch": 0.4823232323232323, "grad_norm": 1.4453125, "learning_rate": 1.0904255319148938e-05, "loss": 0.5213, "step": 191 },
    { "epoch": 0.48484848484848486, "grad_norm": 1.5703125, "learning_rate": 1.0851063829787233e-05, "loss": 0.5955, "step": 192 },
    { "epoch": 0.48737373737373735, "grad_norm": 1.4765625, "learning_rate": 1.0797872340425532e-05, "loss": 0.5273, "step": 193 },
    { "epoch": 0.4898989898989899, "grad_norm": 1.3046875, "learning_rate": 1.074468085106383e-05, "loss": 0.4816, "step": 194 },
    { "epoch": 0.49242424242424243, "grad_norm": 2.0, "learning_rate": 1.0691489361702128e-05, "loss": 0.6121, "step": 195 },
    { "epoch": 0.494949494949495, "grad_norm": 1.4609375, "learning_rate": 1.0638297872340426e-05, "loss": 0.6325, "step": 196 },
    { "epoch": 0.494949494949495, "eval_loss": 0.45528531074523926, "eval_model_preparation_time": 0.0001, "eval_runtime": 4.4693, "eval_samples_per_second": 23.717, "eval_steps_per_second": 3.132, "step": 196 },
    { "epoch": 0.49747474747474746, "grad_norm": 1.296875, "learning_rate": 1.0585106382978725e-05, "loss": 0.4857, "step": 197 }
  ],
  "logging_steps": 1,
  "max_steps": 396,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 197,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.091750925875282e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}