|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 200, |
|
"global_step": 266, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.007518796992481203, |
|
"grad_norm": 0.08289683091006875, |
|
"learning_rate": 3.7037037037037037e-06, |
|
"loss": 0.2087, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.015037593984962405, |
|
"grad_norm": 0.03030546873337256, |
|
"learning_rate": 7.4074074074074075e-06, |
|
"loss": 0.1045, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.022556390977443608, |
|
"grad_norm": 0.04340875250649354, |
|
"learning_rate": 1.1111111111111112e-05, |
|
"loss": 0.1291, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.03007518796992481, |
|
"grad_norm": 0.04223285184390201, |
|
"learning_rate": 1.4814814814814815e-05, |
|
"loss": 0.1263, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.03759398496240601, |
|
"grad_norm": 0.04894801143939966, |
|
"learning_rate": 1.8518518518518518e-05, |
|
"loss": 0.1409, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.045112781954887216, |
|
"grad_norm": 0.22261274174154347, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 0.2362, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.05263157894736842, |
|
"grad_norm": 0.043978295203653116, |
|
"learning_rate": 2.5925925925925925e-05, |
|
"loss": 0.1195, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.06015037593984962, |
|
"grad_norm": 0.04381964595602848, |
|
"learning_rate": 2.962962962962963e-05, |
|
"loss": 0.1215, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.06766917293233082, |
|
"grad_norm": 0.03290036083527209, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.0881, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.07518796992481203, |
|
"grad_norm": 0.03303553719000837, |
|
"learning_rate": 3.7037037037037037e-05, |
|
"loss": 0.0923, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08270676691729323, |
|
"grad_norm": 0.12832751130733108, |
|
"learning_rate": 4.074074074074074e-05, |
|
"loss": 0.218, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.09022556390977443, |
|
"grad_norm": 0.08479076437214379, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 0.149, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.09774436090225563, |
|
"grad_norm": 0.13534422076541278, |
|
"learning_rate": 4.814814814814815e-05, |
|
"loss": 0.1835, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.10526315789473684, |
|
"grad_norm": 0.10341781138952844, |
|
"learning_rate": 5.185185185185185e-05, |
|
"loss": 0.1573, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.11278195488721804, |
|
"grad_norm": 0.12256701286625035, |
|
"learning_rate": 5.555555555555556e-05, |
|
"loss": 0.1701, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.12030075187969924, |
|
"grad_norm": 0.05708144315846648, |
|
"learning_rate": 5.925925925925926e-05, |
|
"loss": 0.0938, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.12781954887218044, |
|
"grad_norm": 0.0813311914428683, |
|
"learning_rate": 6.296296296296296e-05, |
|
"loss": 0.1221, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.13533834586466165, |
|
"grad_norm": 0.07670248585638807, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.1068, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 0.12777013083805186, |
|
"learning_rate": 7.037037037037038e-05, |
|
"loss": 0.1183, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.15037593984962405, |
|
"grad_norm": 0.07203864112993859, |
|
"learning_rate": 7.407407407407407e-05, |
|
"loss": 0.0941, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.15789473684210525, |
|
"grad_norm": 0.06622495246697525, |
|
"learning_rate": 7.777777777777778e-05, |
|
"loss": 0.0851, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.16541353383458646, |
|
"grad_norm": 0.05064660711733651, |
|
"learning_rate": 8.148148148148148e-05, |
|
"loss": 0.0672, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.17293233082706766, |
|
"grad_norm": 0.05569880144395339, |
|
"learning_rate": 8.518518518518518e-05, |
|
"loss": 0.0692, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.18045112781954886, |
|
"grad_norm": 0.06341922542018791, |
|
"learning_rate": 8.888888888888889e-05, |
|
"loss": 0.0719, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.18796992481203006, |
|
"grad_norm": 0.09483517480751269, |
|
"learning_rate": 9.25925925925926e-05, |
|
"loss": 0.1039, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.19548872180451127, |
|
"grad_norm": 0.06345422292566975, |
|
"learning_rate": 9.62962962962963e-05, |
|
"loss": 0.0642, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.20300751879699247, |
|
"grad_norm": 0.06565559978972503, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0806, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.21052631578947367, |
|
"grad_norm": 0.07234940226716612, |
|
"learning_rate": 9.999568045802217e-05, |
|
"loss": 0.0699, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.21804511278195488, |
|
"grad_norm": 0.09174614011055109, |
|
"learning_rate": 9.998272257842641e-05, |
|
"loss": 0.0797, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.22556390977443608, |
|
"grad_norm": 0.0799372037045221, |
|
"learning_rate": 9.996112860009688e-05, |
|
"loss": 0.0599, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.23308270676691728, |
|
"grad_norm": 0.07650243821697233, |
|
"learning_rate": 9.993090225407743e-05, |
|
"loss": 0.0673, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.24060150375939848, |
|
"grad_norm": 0.07437978624039222, |
|
"learning_rate": 9.989204876292688e-05, |
|
"loss": 0.063, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.24812030075187969, |
|
"grad_norm": 0.05826090837310029, |
|
"learning_rate": 9.984457483981669e-05, |
|
"loss": 0.0563, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.2556390977443609, |
|
"grad_norm": 0.046830358894256296, |
|
"learning_rate": 9.978848868737098e-05, |
|
"loss": 0.0449, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.2631578947368421, |
|
"grad_norm": 0.059942032653184, |
|
"learning_rate": 9.972379999624936e-05, |
|
"loss": 0.0492, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.2706766917293233, |
|
"grad_norm": 0.04559622889503948, |
|
"learning_rate": 9.96505199434725e-05, |
|
"loss": 0.0384, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.2781954887218045, |
|
"grad_norm": 0.08582556953299057, |
|
"learning_rate": 9.956866119049095e-05, |
|
"loss": 0.052, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 0.05879365562753825, |
|
"learning_rate": 9.947823788099753e-05, |
|
"loss": 0.0499, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.2932330827067669, |
|
"grad_norm": 0.07725729979493687, |
|
"learning_rate": 9.937926563848346e-05, |
|
"loss": 0.0382, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.3007518796992481, |
|
"grad_norm": 0.06791365316815774, |
|
"learning_rate": 9.927176156353899e-05, |
|
"loss": 0.0424, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.3082706766917293, |
|
"grad_norm": 0.06835456363607172, |
|
"learning_rate": 9.91557442308987e-05, |
|
"loss": 0.0477, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.3157894736842105, |
|
"grad_norm": 0.06785706541381617, |
|
"learning_rate": 9.903123368623216e-05, |
|
"loss": 0.0423, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.3233082706766917, |
|
"grad_norm": 0.037822284484082716, |
|
"learning_rate": 9.889825144268029e-05, |
|
"loss": 0.0373, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.3308270676691729, |
|
"grad_norm": 0.09335172889811039, |
|
"learning_rate": 9.875682047713846e-05, |
|
"loss": 0.0532, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.3383458646616541, |
|
"grad_norm": 0.03552601591664148, |
|
"learning_rate": 9.860696522628639e-05, |
|
"loss": 0.0302, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.3458646616541353, |
|
"grad_norm": 0.06792399841238587, |
|
"learning_rate": 9.844871158236591e-05, |
|
"loss": 0.043, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3533834586466165, |
|
"grad_norm": 0.07394708716985816, |
|
"learning_rate": 9.828208688870735e-05, |
|
"loss": 0.0414, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3609022556390977, |
|
"grad_norm": 0.07644206071621325, |
|
"learning_rate": 9.810711993500507e-05, |
|
"loss": 0.0442, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3684210526315789, |
|
"grad_norm": 0.04448780324279346, |
|
"learning_rate": 9.792384095234313e-05, |
|
"loss": 0.0397, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.37593984962406013, |
|
"grad_norm": 0.040299146373067786, |
|
"learning_rate": 9.773228160797188e-05, |
|
"loss": 0.0294, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.38345864661654133, |
|
"grad_norm": 0.04600091352431098, |
|
"learning_rate": 9.753247499983649e-05, |
|
"loss": 0.0388, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.39097744360902253, |
|
"grad_norm": 0.05174024689025062, |
|
"learning_rate": 9.732445565085824e-05, |
|
"loss": 0.0464, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.39849624060150374, |
|
"grad_norm": 0.06048290755695799, |
|
"learning_rate": 9.71082595029695e-05, |
|
"loss": 0.0441, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.40601503759398494, |
|
"grad_norm": 0.06909111905381797, |
|
"learning_rate": 9.688392391090373e-05, |
|
"loss": 0.0403, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.41353383458646614, |
|
"grad_norm": 0.10580098842980783, |
|
"learning_rate": 9.665148763574123e-05, |
|
"loss": 0.0414, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.42105263157894735, |
|
"grad_norm": 0.06004492721880413, |
|
"learning_rate": 9.64109908382119e-05, |
|
"loss": 0.0348, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.42857142857142855, |
|
"grad_norm": 0.05616302785838828, |
|
"learning_rate": 9.616247507175623e-05, |
|
"loss": 0.0353, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.43609022556390975, |
|
"grad_norm": 0.04963402332052172, |
|
"learning_rate": 9.590598327534564e-05, |
|
"loss": 0.0354, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.44360902255639095, |
|
"grad_norm": 0.09520890937208057, |
|
"learning_rate": 9.564155976606339e-05, |
|
"loss": 0.0436, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.45112781954887216, |
|
"grad_norm": 0.07317691578763187, |
|
"learning_rate": 9.536925023144742e-05, |
|
"loss": 0.0448, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.45864661654135336, |
|
"grad_norm": 0.0653903652099525, |
|
"learning_rate": 9.508910172159635e-05, |
|
"loss": 0.0456, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.46616541353383456, |
|
"grad_norm": 0.08533000644485912, |
|
"learning_rate": 9.480116264104011e-05, |
|
"loss": 0.0417, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.47368421052631576, |
|
"grad_norm": 0.07477194348090598, |
|
"learning_rate": 9.450548274037653e-05, |
|
"loss": 0.0427, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.48120300751879697, |
|
"grad_norm": 0.040320894825821886, |
|
"learning_rate": 9.420211310767533e-05, |
|
"loss": 0.0317, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.48872180451127817, |
|
"grad_norm": 0.04204333897095501, |
|
"learning_rate": 9.389110615965102e-05, |
|
"loss": 0.0308, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.49624060150375937, |
|
"grad_norm": 0.06435209558835227, |
|
"learning_rate": 9.35725156326063e-05, |
|
"loss": 0.0404, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.5037593984962406, |
|
"grad_norm": 0.05292300086818655, |
|
"learning_rate": 9.324639657314742e-05, |
|
"loss": 0.0383, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.5112781954887218, |
|
"grad_norm": 0.0533359959006372, |
|
"learning_rate": 9.291280532867302e-05, |
|
"loss": 0.0419, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.518796992481203, |
|
"grad_norm": 0.0421677134855151, |
|
"learning_rate": 9.257179953763845e-05, |
|
"loss": 0.0301, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.5263157894736842, |
|
"grad_norm": 0.047396091527240565, |
|
"learning_rate": 9.222343811959693e-05, |
|
"loss": 0.0355, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5338345864661654, |
|
"grad_norm": 0.05055865206409256, |
|
"learning_rate": 9.186778126501916e-05, |
|
"loss": 0.0379, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.5413533834586466, |
|
"grad_norm": 0.03922328494549794, |
|
"learning_rate": 9.150489042489367e-05, |
|
"loss": 0.03, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.5488721804511278, |
|
"grad_norm": 0.08580904921861318, |
|
"learning_rate": 9.113482830010918e-05, |
|
"loss": 0.038, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.556390977443609, |
|
"grad_norm": 0.04615991149700515, |
|
"learning_rate": 9.075765883062093e-05, |
|
"loss": 0.0321, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.5639097744360902, |
|
"grad_norm": 0.21688152384611062, |
|
"learning_rate": 9.037344718440322e-05, |
|
"loss": 0.0369, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 0.06709856743156827, |
|
"learning_rate": 8.99822597461894e-05, |
|
"loss": 0.0429, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.5789473684210527, |
|
"grad_norm": 0.07300506123989278, |
|
"learning_rate": 8.958416410600187e-05, |
|
"loss": 0.0351, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.5864661654135338, |
|
"grad_norm": 0.08415403445437179, |
|
"learning_rate": 8.917922904747384e-05, |
|
"loss": 0.0425, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.5939849624060151, |
|
"grad_norm": 0.043734956942212244, |
|
"learning_rate": 8.876752453596462e-05, |
|
"loss": 0.0322, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.6015037593984962, |
|
"grad_norm": 0.11340147288766998, |
|
"learning_rate": 8.834912170647101e-05, |
|
"loss": 0.0446, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6090225563909775, |
|
"grad_norm": 0.061288991507609664, |
|
"learning_rate": 8.792409285133642e-05, |
|
"loss": 0.0424, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.6165413533834586, |
|
"grad_norm": 0.043805649893633086, |
|
"learning_rate": 8.749251140776016e-05, |
|
"loss": 0.0342, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.6240601503759399, |
|
"grad_norm": 0.05953059965877648, |
|
"learning_rate": 8.705445194510868e-05, |
|
"loss": 0.0321, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.631578947368421, |
|
"grad_norm": 0.07945205955271631, |
|
"learning_rate": 8.66099901520315e-05, |
|
"loss": 0.0371, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.6390977443609023, |
|
"grad_norm": 0.04453806753518928, |
|
"learning_rate": 8.615920282338355e-05, |
|
"loss": 0.0349, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.6466165413533834, |
|
"grad_norm": 0.05196927124976879, |
|
"learning_rate": 8.570216784695637e-05, |
|
"loss": 0.0287, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.6541353383458647, |
|
"grad_norm": 0.08901603801098872, |
|
"learning_rate": 8.52389641900206e-05, |
|
"loss": 0.0379, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.6616541353383458, |
|
"grad_norm": 0.04173009472070016, |
|
"learning_rate": 8.476967188568188e-05, |
|
"loss": 0.0264, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.6691729323308271, |
|
"grad_norm": 0.06191267416598679, |
|
"learning_rate": 8.429437201905254e-05, |
|
"loss": 0.028, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.6766917293233082, |
|
"grad_norm": 0.05938205491417802, |
|
"learning_rate": 8.381314671324159e-05, |
|
"loss": 0.0353, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6842105263157895, |
|
"grad_norm": 0.06594155945203996, |
|
"learning_rate": 8.332607911516545e-05, |
|
"loss": 0.0423, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.6917293233082706, |
|
"grad_norm": 0.03727901580427709, |
|
"learning_rate": 8.283325338118153e-05, |
|
"loss": 0.0288, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.6992481203007519, |
|
"grad_norm": 0.039506792129091334, |
|
"learning_rate": 8.233475466254765e-05, |
|
"loss": 0.0319, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.706766917293233, |
|
"grad_norm": 0.10114676138905467, |
|
"learning_rate": 8.183066909070947e-05, |
|
"loss": 0.0413, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 0.0519720254987392, |
|
"learning_rate": 8.132108376241849e-05, |
|
"loss": 0.0319, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.7218045112781954, |
|
"grad_norm": 0.06828535688055823, |
|
"learning_rate": 8.08060867246834e-05, |
|
"loss": 0.0415, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.7293233082706767, |
|
"grad_norm": 0.04423778552147402, |
|
"learning_rate": 8.028576695955711e-05, |
|
"loss": 0.0307, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.7368421052631579, |
|
"grad_norm": 0.04301708267503238, |
|
"learning_rate": 7.97602143687623e-05, |
|
"loss": 0.0292, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.7443609022556391, |
|
"grad_norm": 0.07557692217243188, |
|
"learning_rate": 7.922951975815811e-05, |
|
"loss": 0.0304, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.7518796992481203, |
|
"grad_norm": 0.061041885279450855, |
|
"learning_rate": 7.869377482205042e-05, |
|
"loss": 0.0318, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7593984962406015, |
|
"grad_norm": 0.040342152719196084, |
|
"learning_rate": 7.815307212734888e-05, |
|
"loss": 0.027, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.7669172932330827, |
|
"grad_norm": 0.07790755826343725, |
|
"learning_rate": 7.760750509757298e-05, |
|
"loss": 0.0339, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.7744360902255639, |
|
"grad_norm": 0.05210408795431101, |
|
"learning_rate": 7.705716799671019e-05, |
|
"loss": 0.0228, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.7819548872180451, |
|
"grad_norm": 0.08000736959421384, |
|
"learning_rate": 7.650215591292888e-05, |
|
"loss": 0.0357, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.7894736842105263, |
|
"grad_norm": 0.05843028390975531, |
|
"learning_rate": 7.594256474214882e-05, |
|
"loss": 0.0285, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.7969924812030075, |
|
"grad_norm": 0.13537509841914472, |
|
"learning_rate": 7.537849117147212e-05, |
|
"loss": 0.0359, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.8045112781954887, |
|
"grad_norm": 0.08230566866298178, |
|
"learning_rate": 7.481003266247744e-05, |
|
"loss": 0.0367, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.8120300751879699, |
|
"grad_norm": 0.09678557492723187, |
|
"learning_rate": 7.423728743438048e-05, |
|
"loss": 0.0358, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.8195488721804511, |
|
"grad_norm": 0.049541914871144996, |
|
"learning_rate": 7.366035444706347e-05, |
|
"loss": 0.0329, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.8270676691729323, |
|
"grad_norm": 0.08823757922929092, |
|
"learning_rate": 7.307933338397667e-05, |
|
"loss": 0.0364, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8345864661654135, |
|
"grad_norm": 0.044744299992948704, |
|
"learning_rate": 7.249432463491498e-05, |
|
"loss": 0.0328, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.8421052631578947, |
|
"grad_norm": 0.03814585189064516, |
|
"learning_rate": 7.190542927867234e-05, |
|
"loss": 0.0242, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.849624060150376, |
|
"grad_norm": 0.03553642928460275, |
|
"learning_rate": 7.131274906557725e-05, |
|
"loss": 0.0277, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 0.044176381361140944, |
|
"learning_rate": 7.071638639991207e-05, |
|
"loss": 0.0282, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.8646616541353384, |
|
"grad_norm": 0.04113727259330019, |
|
"learning_rate": 7.011644432221958e-05, |
|
"loss": 0.0311, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.8721804511278195, |
|
"grad_norm": 0.060773829286428965, |
|
"learning_rate": 6.95130264914993e-05, |
|
"loss": 0.0414, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.8796992481203008, |
|
"grad_norm": 0.05757846085257315, |
|
"learning_rate": 6.890623716729724e-05, |
|
"loss": 0.0279, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.8872180451127819, |
|
"grad_norm": 0.08428255259620104, |
|
"learning_rate": 6.82961811916917e-05, |
|
"loss": 0.0298, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.8947368421052632, |
|
"grad_norm": 0.04529601746123181, |
|
"learning_rate": 6.768296397117848e-05, |
|
"loss": 0.0263, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.9022556390977443, |
|
"grad_norm": 0.0559976345746786, |
|
"learning_rate": 6.706669145845863e-05, |
|
"loss": 0.0331, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.9097744360902256, |
|
"grad_norm": 0.046985300077111235, |
|
"learning_rate": 6.644747013413168e-05, |
|
"loss": 0.0323, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.9172932330827067, |
|
"grad_norm": 0.06973194335422163, |
|
"learning_rate": 6.582540698829781e-05, |
|
"loss": 0.0356, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.924812030075188, |
|
"grad_norm": 0.0550307651636393, |
|
"learning_rate": 6.520060950207185e-05, |
|
"loss": 0.0374, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.9323308270676691, |
|
"grad_norm": 0.04136098377224926, |
|
"learning_rate": 6.457318562901256e-05, |
|
"loss": 0.0281, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.9398496240601504, |
|
"grad_norm": 0.04471839673788357, |
|
"learning_rate": 6.394324377647028e-05, |
|
"loss": 0.0344, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.9473684210526315, |
|
"grad_norm": 0.04057335071418551, |
|
"learning_rate": 6.331089278685599e-05, |
|
"loss": 0.0289, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.9548872180451128, |
|
"grad_norm": 0.036632585834280834, |
|
"learning_rate": 6.26762419188355e-05, |
|
"loss": 0.0254, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.9624060150375939, |
|
"grad_norm": 0.05253467833143005, |
|
"learning_rate": 6.203940082845144e-05, |
|
"loss": 0.0423, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.9699248120300752, |
|
"grad_norm": 0.05828434847478486, |
|
"learning_rate": 6.140047955017671e-05, |
|
"loss": 0.0331, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.9774436090225563, |
|
"grad_norm": 0.052528332979290625, |
|
"learning_rate": 6.075958847790262e-05, |
|
"loss": 0.0344, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.9849624060150376, |
|
"grad_norm": 0.039125799054480936, |
|
"learning_rate": 6.011683834586473e-05, |
|
"loss": 0.0264, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.9924812030075187, |
|
"grad_norm": 0.03707157930189228, |
|
"learning_rate": 5.947234020951015e-05, |
|
"loss": 0.0237, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.054189982183542575, |
|
"learning_rate": 5.882620542630901e-05, |
|
"loss": 0.0317, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.0075187969924813, |
|
"grad_norm": 0.04357846265860899, |
|
"learning_rate": 5.8178545636514145e-05, |
|
"loss": 0.0268, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.0150375939849625, |
|
"grad_norm": 0.056012933476124856, |
|
"learning_rate": 5.752947274387147e-05, |
|
"loss": 0.0223, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.0225563909774436, |
|
"grad_norm": 0.049689439936320044, |
|
"learning_rate": 5.687909889628529e-05, |
|
"loss": 0.0304, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.0300751879699248, |
|
"grad_norm": 0.04830994322048754, |
|
"learning_rate": 5.622753646644102e-05, |
|
"loss": 0.0278, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.037593984962406, |
|
"grad_norm": 0.04418639970975713, |
|
"learning_rate": 5.557489803238933e-05, |
|
"loss": 0.0259, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.045112781954887, |
|
"grad_norm": 0.042738363591787835, |
|
"learning_rate": 5.492129635809473e-05, |
|
"loss": 0.0198, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.0526315789473684, |
|
"grad_norm": 0.03885713180148723, |
|
"learning_rate": 5.426684437395196e-05, |
|
"loss": 0.0191, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.0601503759398496, |
|
"grad_norm": 0.04951650926676435, |
|
"learning_rate": 5.361165515727374e-05, |
|
"loss": 0.0214, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.0676691729323309, |
|
"grad_norm": 0.059968470212708236, |
|
"learning_rate": 5.295584191275308e-05, |
|
"loss": 0.0243, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.0751879699248121, |
|
"grad_norm": 0.0676386940224187, |
|
"learning_rate": 5.229951795290353e-05, |
|
"loss": 0.029, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.0827067669172932, |
|
"grad_norm": 0.04250436122379926, |
|
"learning_rate": 5.164279667848094e-05, |
|
"loss": 0.0204, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.0902255639097744, |
|
"grad_norm": 0.04124846102938738, |
|
"learning_rate": 5.0985791558889785e-05, |
|
"loss": 0.0209, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.0977443609022557, |
|
"grad_norm": 0.05914558229310168, |
|
"learning_rate": 5.032861611257783e-05, |
|
"loss": 0.0285, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.1052631578947367, |
|
"grad_norm": 0.0465029543723527, |
|
"learning_rate": 4.967138388742218e-05, |
|
"loss": 0.0204, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.112781954887218, |
|
"grad_norm": 0.06469458945659604, |
|
"learning_rate": 4.901420844111021e-05, |
|
"loss": 0.0314, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.1203007518796992, |
|
"grad_norm": 0.06440915952496404, |
|
"learning_rate": 4.835720332151907e-05, |
|
"loss": 0.0281, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.1278195488721805, |
|
"grad_norm": 0.0571757163158284, |
|
"learning_rate": 4.770048204709648e-05, |
|
"loss": 0.0248, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.1353383458646618, |
|
"grad_norm": 0.05910301690921271, |
|
"learning_rate": 4.7044158087246926e-05, |
|
"loss": 0.0311, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.1428571428571428, |
|
"grad_norm": 0.04613839631194596, |
|
"learning_rate": 4.6388344842726264e-05, |
|
"loss": 0.0218, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.150375939849624, |
|
"grad_norm": 0.05741866552084954, |
|
"learning_rate": 4.5733155626048036e-05, |
|
"loss": 0.0271, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.1578947368421053, |
|
"grad_norm": 0.04682544810113655, |
|
"learning_rate": 4.507870364190527e-05, |
|
"loss": 0.0264, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.1654135338345863, |
|
"grad_norm": 0.06282838577083374, |
|
"learning_rate": 4.4425101967610674e-05, |
|
"loss": 0.024, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.1729323308270676, |
|
"grad_norm": 0.05388737782363021, |
|
"learning_rate": 4.377246353355899e-05, |
|
"loss": 0.0271, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.1804511278195489, |
|
"grad_norm": 0.05086578069156835, |
|
"learning_rate": 4.312090110371473e-05, |
|
"loss": 0.0278, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.1879699248120301, |
|
"grad_norm": 0.05863572980738164, |
|
"learning_rate": 4.247052725612852e-05, |
|
"loss": 0.0292, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.1954887218045114, |
|
"grad_norm": 0.04227523648124146, |
|
"learning_rate": 4.1821454363485866e-05, |
|
"loss": 0.0234, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.2030075187969924, |
|
"grad_norm": 0.04268704545270105, |
|
"learning_rate": 4.1173794573690996e-05, |
|
"loss": 0.0206, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.2105263157894737, |
|
"grad_norm": 0.04778787432486908, |
|
"learning_rate": 4.052765979048986e-05, |
|
"loss": 0.0227, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.218045112781955, |
|
"grad_norm": 0.0459311125342993, |
|
"learning_rate": 3.988316165413528e-05, |
|
"loss": 0.0205, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.225563909774436, |
|
"grad_norm": 0.05603215690118315, |
|
"learning_rate": 3.924041152209739e-05, |
|
"loss": 0.029, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.2330827067669172, |
|
"grad_norm": 0.060179119443112154, |
|
"learning_rate": 3.859952044982329e-05, |
|
"loss": 0.0271, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.2406015037593985, |
|
"grad_norm": 0.04740279415347567, |
|
"learning_rate": 3.7960599171548574e-05, |
|
"loss": 0.0213, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.2481203007518797, |
|
"grad_norm": 0.052482110362426594, |
|
"learning_rate": 3.732375808116451e-05, |
|
"loss": 0.0258, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.255639097744361, |
|
"grad_norm": 0.04835120393099329, |
|
"learning_rate": 3.668910721314402e-05, |
|
"loss": 0.0229, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.263157894736842, |
|
"grad_norm": 0.08311507045185516, |
|
"learning_rate": 3.605675622352973e-05, |
|
"loss": 0.0265, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.2706766917293233, |
|
"grad_norm": 0.053563077833150494, |
|
"learning_rate": 3.542681437098745e-05, |
|
"loss": 0.0256, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.2781954887218046, |
|
"grad_norm": 0.05567682482783888, |
|
"learning_rate": 3.479939049792817e-05, |
|
"loss": 0.0213, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.2857142857142856, |
|
"grad_norm": 0.054588031712222006, |
|
"learning_rate": 3.417459301170219e-05, |
|
"loss": 0.0266, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.2932330827067668, |
|
"grad_norm": 0.07694344232267265, |
|
"learning_rate": 3.355252986586832e-05, |
|
"loss": 0.0193, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.300751879699248, |
|
"grad_norm": 0.05943952613035603, |
|
"learning_rate": 3.293330854154136e-05, |
|
"loss": 0.0258, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.3082706766917294, |
|
"grad_norm": 0.038766556860819104, |
|
"learning_rate": 3.2317036028821523e-05, |
|
"loss": 0.0159, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.3157894736842106, |
|
"grad_norm": 0.05092188135687549, |
|
"learning_rate": 3.1703818808308324e-05, |
|
"loss": 0.0215, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.3233082706766917, |
|
"grad_norm": 0.04779789780883562, |
|
"learning_rate": 3.109376283270277e-05, |
|
"loss": 0.0268, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.330827067669173, |
|
"grad_norm": 0.04433720319245774, |
|
"learning_rate": 3.0486973508500727e-05, |
|
"loss": 0.0238, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.3383458646616542, |
|
"grad_norm": 0.049878475563895956, |
|
"learning_rate": 2.988355567778043e-05, |
|
"loss": 0.0259, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.3458646616541352, |
|
"grad_norm": 0.05962755604807658, |
|
"learning_rate": 2.9283613600087933e-05, |
|
"loss": 0.025, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.3533834586466165, |
|
"grad_norm": 0.04955718527923681, |
|
"learning_rate": 2.8687250934422772e-05, |
|
"loss": 0.0194, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.3609022556390977, |
|
"grad_norm": 0.03676456890831394, |
|
"learning_rate": 2.8094570721327662e-05, |
|
"loss": 0.0189, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.368421052631579, |
|
"grad_norm": 0.04868946152583533, |
|
"learning_rate": 2.750567536508504e-05, |
|
"loss": 0.0243, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.3759398496240602, |
|
"grad_norm": 0.0555305400721802, |
|
"learning_rate": 2.6920666616023327e-05, |
|
"loss": 0.0257, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.3834586466165413, |
|
"grad_norm": 0.04963192556183434, |
|
"learning_rate": 2.6339645552936536e-05, |
|
"loss": 0.0275, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.3909774436090225, |
|
"grad_norm": 0.05542091349920839, |
|
"learning_rate": 2.5762712565619528e-05, |
|
"loss": 0.023, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.3984962406015038, |
|
"grad_norm": 0.0426183120843919, |
|
"learning_rate": 2.5189967337522573e-05, |
|
"loss": 0.0206, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.4060150375939848, |
|
"grad_norm": 0.05205246245376388, |
|
"learning_rate": 2.46215088285279e-05, |
|
"loss": 0.0229, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.413533834586466, |
|
"grad_norm": 0.04337666332691105, |
|
"learning_rate": 2.4057435257851175e-05, |
|
"loss": 0.019, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.4210526315789473, |
|
"grad_norm": 0.05985729489503263, |
|
"learning_rate": 2.349784408707112e-05, |
|
"loss": 0.0274, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 0.062032022184375604, |
|
"learning_rate": 2.2942832003289823e-05, |
|
"loss": 0.0271, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.4360902255639099, |
|
"grad_norm": 0.05773389436675615, |
|
"learning_rate": 2.2392494902427025e-05, |
|
"loss": 0.0263, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.443609022556391, |
|
"grad_norm": 0.048522536078850126, |
|
"learning_rate": 2.1846927872651137e-05, |
|
"loss": 0.0242, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.4511278195488722, |
|
"grad_norm": 0.05010560342148772, |
|
"learning_rate": 2.1306225177949585e-05, |
|
"loss": 0.024, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.4586466165413534, |
|
"grad_norm": 0.058011679310299026, |
|
"learning_rate": 2.07704802418419e-05, |
|
"loss": 0.0301, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.4661654135338344, |
|
"grad_norm": 0.052695628737558814, |
|
"learning_rate": 2.0239785631237705e-05, |
|
"loss": 0.0262, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.4736842105263157, |
|
"grad_norm": 0.0397195089948912, |
|
"learning_rate": 1.9714233040442915e-05, |
|
"loss": 0.0179, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.481203007518797, |
|
"grad_norm": 0.05532938780742867, |
|
"learning_rate": 1.9193913275316626e-05, |
|
"loss": 0.0234, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.4887218045112782, |
|
"grad_norm": 0.07349266479809795, |
|
"learning_rate": 1.8678916237581522e-05, |
|
"loss": 0.0236, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.4962406015037595, |
|
"grad_norm": 0.03995824607041351, |
|
"learning_rate": 1.816933090929055e-05, |
|
"loss": 0.0176, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.5037593984962405, |
|
"grad_norm": 0.07166373724308431, |
|
"learning_rate": 1.7665245337452368e-05, |
|
"loss": 0.0258, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.5037593984962405, |
|
"eval_loss": 0.029665347188711166, |
|
"eval_runtime": 6.5066, |
|
"eval_samples_per_second": 0.922, |
|
"eval_steps_per_second": 0.307, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.5112781954887218, |
|
"grad_norm": 0.048692577901512116, |
|
"learning_rate": 1.716674661881848e-05, |
|
"loss": 0.0224, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.518796992481203, |
|
"grad_norm": 0.04675059057360818, |
|
"learning_rate": 1.667392088483456e-05, |
|
"loss": 0.0223, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.526315789473684, |
|
"grad_norm": 0.05459458244813264, |
|
"learning_rate": 1.6186853286758397e-05, |
|
"loss": 0.0242, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.5338345864661656, |
|
"grad_norm": 0.051543551392068274, |
|
"learning_rate": 1.570562798094747e-05, |
|
"loss": 0.025, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.5413533834586466, |
|
"grad_norm": 0.14671926401344376, |
|
"learning_rate": 1.5230328114318127e-05, |
|
"loss": 0.0241, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.5488721804511278, |
|
"grad_norm": 0.058979726559234814, |
|
"learning_rate": 1.4761035809979395e-05, |
|
"loss": 0.0253, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.556390977443609, |
|
"grad_norm": 0.06494643885270886, |
|
"learning_rate": 1.4297832153043656e-05, |
|
"loss": 0.0236, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.5639097744360901, |
|
"grad_norm": 0.06627104647345526, |
|
"learning_rate": 1.3840797176616466e-05, |
|
"loss": 0.0278, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.5714285714285714, |
|
"grad_norm": 0.06190650675134399, |
|
"learning_rate": 1.3390009847968504e-05, |
|
"loss": 0.0255, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.5789473684210527, |
|
"grad_norm": 0.06250699899282167, |
|
"learning_rate": 1.2945548054891321e-05, |
|
"loss": 0.0254, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.5864661654135337, |
|
"grad_norm": 0.06214391708977836, |
|
"learning_rate": 1.2507488592239847e-05, |
|
"loss": 0.0233, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.5939849624060152, |
|
"grad_norm": 0.054608347620115995, |
|
"learning_rate": 1.2075907148663579e-05, |
|
"loss": 0.024, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.6015037593984962, |
|
"grad_norm": 0.05333683650123989, |
|
"learning_rate": 1.1650878293528994e-05, |
|
"loss": 0.0261, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.6090225563909775, |
|
"grad_norm": 0.047407562918454, |
|
"learning_rate": 1.1232475464035385e-05, |
|
"loss": 0.0192, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.6165413533834587, |
|
"grad_norm": 0.06549580580637923, |
|
"learning_rate": 1.0820770952526155e-05, |
|
"loss": 0.0192, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.6240601503759398, |
|
"grad_norm": 0.0582730317262946, |
|
"learning_rate": 1.0415835893998116e-05, |
|
"loss": 0.0267, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.631578947368421, |
|
"grad_norm": 0.06724858724013988, |
|
"learning_rate": 1.0017740253810609e-05, |
|
"loss": 0.0244, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.6390977443609023, |
|
"grad_norm": 0.07353126997097047, |
|
"learning_rate": 9.62655281559679e-06, |
|
"loss": 0.0265, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.6466165413533833, |
|
"grad_norm": 0.057567868642984674, |
|
"learning_rate": 9.242341169379076e-06, |
|
"loss": 0.0239, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.6541353383458648, |
|
"grad_norm": 0.06325334373179048, |
|
"learning_rate": 8.865171699890834e-06, |
|
"loss": 0.023, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.6616541353383458, |
|
"grad_norm": 0.057849806459398294, |
|
"learning_rate": 8.49510957510633e-06, |
|
"loss": 0.0286, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.669172932330827, |
|
"grad_norm": 0.06257054012996921, |
|
"learning_rate": 8.132218734980852e-06, |
|
"loss": 0.0205, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.6766917293233083, |
|
"grad_norm": 0.053291552200528655, |
|
"learning_rate": 7.776561880403072e-06, |
|
"loss": 0.0222, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.6842105263157894, |
|
"grad_norm": 0.055884993872003165, |
|
"learning_rate": 7.4282004623615396e-06, |
|
"loss": 0.0257, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.6917293233082706, |
|
"grad_norm": 0.04781226703104293, |
|
"learning_rate": 7.0871946713269856e-06, |
|
"loss": 0.021, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.699248120300752, |
|
"grad_norm": 0.04617454207758738, |
|
"learning_rate": 6.753603426852589e-06, |
|
"loss": 0.0206, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.706766917293233, |
|
"grad_norm": 0.05934488856386534, |
|
"learning_rate": 6.427484367393699e-06, |
|
"loss": 0.0221, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.7142857142857144, |
|
"grad_norm": 0.0563063349000768, |
|
"learning_rate": 6.108893840348995e-06, |
|
"loss": 0.0217, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.7218045112781954, |
|
"grad_norm": 0.058919681414065804, |
|
"learning_rate": 5.797886892324694e-06, |
|
"loss": 0.0241, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.7293233082706767, |
|
"grad_norm": 0.04652279001651371, |
|
"learning_rate": 5.494517259623477e-06, |
|
"loss": 0.023, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.736842105263158, |
|
"grad_norm": 0.05206753304811755, |
|
"learning_rate": 5.198837358959901e-06, |
|
"loss": 0.0247, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.744360902255639, |
|
"grad_norm": 0.05759411719610633, |
|
"learning_rate": 4.910898278403669e-06, |
|
"loss": 0.0275, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.7518796992481203, |
|
"grad_norm": 0.05493938568305548, |
|
"learning_rate": 4.630749768552589e-06, |
|
"loss": 0.0236, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.7593984962406015, |
|
"grad_norm": 0.045214515268897214, |
|
"learning_rate": 4.358440233936617e-06, |
|
"loss": 0.0196, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.7669172932330826, |
|
"grad_norm": 0.08670874372319154, |
|
"learning_rate": 4.094016724654359e-06, |
|
"loss": 0.0292, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.774436090225564, |
|
"grad_norm": 0.049117351787292686, |
|
"learning_rate": 3.837524928243774e-06, |
|
"loss": 0.0224, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.781954887218045, |
|
"grad_norm": 0.058397389390063136, |
|
"learning_rate": 3.589009161788104e-06, |
|
"loss": 0.0278, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.7894736842105263, |
|
"grad_norm": 0.05422155962388968, |
|
"learning_rate": 3.3485123642587658e-06, |
|
"loss": 0.0243, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.7969924812030076, |
|
"grad_norm": 0.07090059571835504, |
|
"learning_rate": 3.116076089096265e-06, |
|
"loss": 0.027, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.8045112781954886, |
|
"grad_norm": 0.05963059250846481, |
|
"learning_rate": 2.8917404970305097e-06, |
|
"loss": 0.0288, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.8120300751879699, |
|
"grad_norm": 0.06946365704174999, |
|
"learning_rate": 2.675544349141779e-06, |
|
"loss": 0.0259, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.8195488721804511, |
|
"grad_norm": 0.06143740644726876, |
|
"learning_rate": 2.4675250001635232e-06, |
|
"loss": 0.0247, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.8270676691729322, |
|
"grad_norm": 0.04728168437977354, |
|
"learning_rate": 2.2677183920281343e-06, |
|
"loss": 0.0193, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.8345864661654137, |
|
"grad_norm": 0.07042127314230426, |
|
"learning_rate": 2.076159047656889e-06, |
|
"loss": 0.0227, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.8421052631578947, |
|
"grad_norm": 0.05266415047166696, |
|
"learning_rate": 1.892880064994934e-06, |
|
"loss": 0.0256, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.849624060150376, |
|
"grad_norm": 0.05204878417509025, |
|
"learning_rate": 1.7179131112926627e-06, |
|
"loss": 0.024, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.8571428571428572, |
|
"grad_norm": 0.04727065912696429, |
|
"learning_rate": 1.551288417634106e-06, |
|
"loss": 0.0159, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.8646616541353382, |
|
"grad_norm": 0.049637487718030344, |
|
"learning_rate": 1.3930347737136196e-06, |
|
"loss": 0.0209, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.8721804511278195, |
|
"grad_norm": 0.0505669836884092, |
|
"learning_rate": 1.2431795228615372e-06, |
|
"loss": 0.0206, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.8796992481203008, |
|
"grad_norm": 0.07557073448805833, |
|
"learning_rate": 1.101748557319715e-06, |
|
"loss": 0.0315, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.8872180451127818, |
|
"grad_norm": 0.04855407299966349, |
|
"learning_rate": 9.687663137678604e-07, |
|
"loss": 0.0193, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.8947368421052633, |
|
"grad_norm": 0.05981871688003821, |
|
"learning_rate": 8.442557691013043e-07, |
|
"loss": 0.0245, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.9022556390977443, |
|
"grad_norm": 0.055297053623164526, |
|
"learning_rate": 7.282384364610206e-07, |
|
"loss": 0.0242, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.9097744360902256, |
|
"grad_norm": 0.05097924138111233, |
|
"learning_rate": 6.207343615165561e-07, |
|
"loss": 0.0207, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.9172932330827068, |
|
"grad_norm": 0.05870296620626846, |
|
"learning_rate": 5.217621190024779e-07, |
|
"loss": 0.0259, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.9248120300751879, |
|
"grad_norm": 0.05289043509456049, |
|
"learning_rate": 4.3133880950905205e-07, |
|
"loss": 0.0217, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.9323308270676691, |
|
"grad_norm": 0.05040687502136238, |
|
"learning_rate": 3.494800565275125e-07, |
|
"loss": 0.0226, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.9398496240601504, |
|
"grad_norm": 0.05483598628420617, |
|
"learning_rate": 2.762000037506485e-07, |
|
"loss": 0.0226, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.9473684210526314, |
|
"grad_norm": 0.052171052589092846, |
|
"learning_rate": 2.115113126290258e-07, |
|
"loss": 0.0224, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.954887218045113, |
|
"grad_norm": 0.060066029686361856, |
|
"learning_rate": 1.554251601833201e-07, |
|
"loss": 0.0242, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.962406015037594, |
|
"grad_norm": 0.04560282840465627, |
|
"learning_rate": 1.0795123707312283e-07, |
|
"loss": 0.0199, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.9699248120300752, |
|
"grad_norm": 0.055943707431487216, |
|
"learning_rate": 6.909774592258056e-08, |
|
"loss": 0.0218, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.9774436090225564, |
|
"grad_norm": 0.057987573660367824, |
|
"learning_rate": 3.8871399903134265e-08, |
|
"loss": 0.0242, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.9849624060150375, |
|
"grad_norm": 0.05438020219150765, |
|
"learning_rate": 1.7277421573608232e-08, |
|
"loss": 0.0278, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.9924812030075187, |
|
"grad_norm": 0.05442881774912085, |
|
"learning_rate": 4.319541977831909e-09, |
|
"loss": 0.0193, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.059490023866208885, |
|
"learning_rate": 0.0, |
|
"loss": 0.0224, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 266, |
|
"total_flos": 673614818967552.0, |
|
"train_loss": 0.039493271835932604, |
|
"train_runtime": 2026.6163, |
|
"train_samples_per_second": 0.522, |
|
"train_steps_per_second": 0.131 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 266, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 300, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 673614818967552.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|