|
{ |
|
"best_metric": 0.01230713166296482, |
|
"best_model_checkpoint": "/home/paperspace/Data/models/centime/llm3br256/checkpoint-400", |
|
"epoch": 4.992143658810326, |
|
"eval_steps": 5, |
|
"global_step": 555, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008978675645342313, |
|
"grad_norm": 0.12415610253810883, |
|
"learning_rate": 1.7857142857142857e-06, |
|
"loss": 0.0585, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.017957351290684626, |
|
"grad_norm": 0.11934095621109009, |
|
"learning_rate": 3.5714285714285714e-06, |
|
"loss": 0.0573, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.026936026936026935, |
|
"grad_norm": 0.12277644127607346, |
|
"learning_rate": 5.357142857142857e-06, |
|
"loss": 0.0583, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.03591470258136925, |
|
"grad_norm": 0.11886874586343765, |
|
"learning_rate": 7.142857142857143e-06, |
|
"loss": 0.0575, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.04489337822671156, |
|
"grad_norm": 0.12854638695716858, |
|
"learning_rate": 8.92857142857143e-06, |
|
"loss": 0.0612, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.04489337822671156, |
|
"eval_loss": 0.05599946156144142, |
|
"eval_runtime": 8.137, |
|
"eval_samples_per_second": 6.145, |
|
"eval_steps_per_second": 1.598, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.05387205387205387, |
|
"grad_norm": 0.09200441092252731, |
|
"learning_rate": 1.0714285714285714e-05, |
|
"loss": 0.05, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.06285072951739619, |
|
"grad_norm": 0.0996675118803978, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.0515, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0718294051627385, |
|
"grad_norm": 0.08751285076141357, |
|
"learning_rate": 1.4285714285714285e-05, |
|
"loss": 0.0436, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.08080808080808081, |
|
"grad_norm": 0.07102574408054352, |
|
"learning_rate": 1.6071428571428572e-05, |
|
"loss": 0.044, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.08978675645342311, |
|
"grad_norm": 0.05904633551836014, |
|
"learning_rate": 1.785714285714286e-05, |
|
"loss": 0.0411, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08978675645342311, |
|
"eval_loss": 0.035706527531147, |
|
"eval_runtime": 6.2599, |
|
"eval_samples_per_second": 7.987, |
|
"eval_steps_per_second": 2.077, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.09876543209876543, |
|
"grad_norm": 0.07080914080142975, |
|
"learning_rate": 1.9642857142857145e-05, |
|
"loss": 0.036, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.10774410774410774, |
|
"grad_norm": 0.07890115678310394, |
|
"learning_rate": 2.1428571428571428e-05, |
|
"loss": 0.0345, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.11672278338945005, |
|
"grad_norm": 0.06593428552150726, |
|
"learning_rate": 2.3214285714285715e-05, |
|
"loss": 0.0351, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.12570145903479238, |
|
"grad_norm": 0.04619583487510681, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.0338, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.13468013468013468, |
|
"grad_norm": 0.04647090658545494, |
|
"learning_rate": 2.6785714285714288e-05, |
|
"loss": 0.0353, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.13468013468013468, |
|
"eval_loss": 0.03013807348906994, |
|
"eval_runtime": 6.2491, |
|
"eval_samples_per_second": 8.001, |
|
"eval_steps_per_second": 2.08, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.143658810325477, |
|
"grad_norm": 0.04043864831328392, |
|
"learning_rate": 2.857142857142857e-05, |
|
"loss": 0.0304, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.1526374859708193, |
|
"grad_norm": 0.043566230684518814, |
|
"learning_rate": 3.0357142857142857e-05, |
|
"loss": 0.0303, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.16161616161616163, |
|
"grad_norm": 0.042150504887104034, |
|
"learning_rate": 3.2142857142857144e-05, |
|
"loss": 0.0327, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.17059483726150393, |
|
"grad_norm": 0.03820006549358368, |
|
"learning_rate": 3.392857142857143e-05, |
|
"loss": 0.0271, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.17957351290684623, |
|
"grad_norm": 0.03358590975403786, |
|
"learning_rate": 3.571428571428572e-05, |
|
"loss": 0.0286, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.17957351290684623, |
|
"eval_loss": 0.026352621614933014, |
|
"eval_runtime": 6.261, |
|
"eval_samples_per_second": 7.986, |
|
"eval_steps_per_second": 2.076, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.18855218855218855, |
|
"grad_norm": 0.03568700700998306, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.0293, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.19753086419753085, |
|
"grad_norm": 0.033443234860897064, |
|
"learning_rate": 3.928571428571429e-05, |
|
"loss": 0.027, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.20650953984287318, |
|
"grad_norm": 0.03302915766835213, |
|
"learning_rate": 4.107142857142857e-05, |
|
"loss": 0.027, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.21548821548821548, |
|
"grad_norm": 0.03412705287337303, |
|
"learning_rate": 4.2857142857142856e-05, |
|
"loss": 0.0249, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.2244668911335578, |
|
"grad_norm": 0.03498664125800133, |
|
"learning_rate": 4.464285714285715e-05, |
|
"loss": 0.0282, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.2244668911335578, |
|
"eval_loss": 0.023908844217658043, |
|
"eval_runtime": 6.2612, |
|
"eval_samples_per_second": 7.986, |
|
"eval_steps_per_second": 2.076, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.2334455667789001, |
|
"grad_norm": 0.02876153774559498, |
|
"learning_rate": 4.642857142857143e-05, |
|
"loss": 0.0249, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.24242424242424243, |
|
"grad_norm": 0.03279321268200874, |
|
"learning_rate": 4.8214285714285716e-05, |
|
"loss": 0.0254, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.25140291806958476, |
|
"grad_norm": 0.03252970054745674, |
|
"learning_rate": 5e-05, |
|
"loss": 0.024, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.26038159371492703, |
|
"grad_norm": 0.02586694434285164, |
|
"learning_rate": 5.1785714285714296e-05, |
|
"loss": 0.0251, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.26936026936026936, |
|
"grad_norm": 0.028388267382979393, |
|
"learning_rate": 5.3571428571428575e-05, |
|
"loss": 0.0223, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.26936026936026936, |
|
"eval_loss": 0.02239508368074894, |
|
"eval_runtime": 6.2554, |
|
"eval_samples_per_second": 7.993, |
|
"eval_steps_per_second": 2.078, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2783389450056117, |
|
"grad_norm": 0.040059663355350494, |
|
"learning_rate": 5.535714285714286e-05, |
|
"loss": 0.0281, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.287317620650954, |
|
"grad_norm": 0.02905772626399994, |
|
"learning_rate": 5.714285714285714e-05, |
|
"loss": 0.0243, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.2962962962962963, |
|
"grad_norm": 0.028802093118429184, |
|
"learning_rate": 5.8928571428571435e-05, |
|
"loss": 0.0224, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.3052749719416386, |
|
"grad_norm": 0.031860049813985825, |
|
"learning_rate": 6.0714285714285715e-05, |
|
"loss": 0.0246, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.31425364758698093, |
|
"grad_norm": 0.029610324651002884, |
|
"learning_rate": 6.25e-05, |
|
"loss": 0.0242, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.31425364758698093, |
|
"eval_loss": 0.020946728065609932, |
|
"eval_runtime": 6.2517, |
|
"eval_samples_per_second": 7.998, |
|
"eval_steps_per_second": 2.079, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.32323232323232326, |
|
"grad_norm": 0.025372274219989777, |
|
"learning_rate": 6.428571428571429e-05, |
|
"loss": 0.0257, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.33221099887766553, |
|
"grad_norm": 0.030153121799230576, |
|
"learning_rate": 6.607142857142857e-05, |
|
"loss": 0.0221, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.34118967452300786, |
|
"grad_norm": 0.023654770106077194, |
|
"learning_rate": 6.785714285714286e-05, |
|
"loss": 0.0201, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.3501683501683502, |
|
"grad_norm": 0.026218950748443604, |
|
"learning_rate": 6.964285714285715e-05, |
|
"loss": 0.0207, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.35914702581369246, |
|
"grad_norm": 0.02605343423783779, |
|
"learning_rate": 7.142857142857143e-05, |
|
"loss": 0.0211, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.35914702581369246, |
|
"eval_loss": 0.020273756235837936, |
|
"eval_runtime": 6.2552, |
|
"eval_samples_per_second": 7.993, |
|
"eval_steps_per_second": 2.078, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.3681257014590348, |
|
"grad_norm": 0.026552610099315643, |
|
"learning_rate": 7.321428571428571e-05, |
|
"loss": 0.0226, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.3771043771043771, |
|
"grad_norm": 0.020305411890149117, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.0187, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.38608305274971944, |
|
"grad_norm": 0.023941006511449814, |
|
"learning_rate": 7.67857142857143e-05, |
|
"loss": 0.0213, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.3950617283950617, |
|
"grad_norm": 0.024746833369135857, |
|
"learning_rate": 7.857142857142858e-05, |
|
"loss": 0.0214, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.40404040404040403, |
|
"grad_norm": 0.018778987228870392, |
|
"learning_rate": 8.035714285714287e-05, |
|
"loss": 0.0178, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.40404040404040403, |
|
"eval_loss": 0.0200771763920784, |
|
"eval_runtime": 6.2549, |
|
"eval_samples_per_second": 7.994, |
|
"eval_steps_per_second": 2.078, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.41301907968574636, |
|
"grad_norm": 0.02559836022555828, |
|
"learning_rate": 8.214285714285714e-05, |
|
"loss": 0.0186, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.4219977553310887, |
|
"grad_norm": 0.02603279985487461, |
|
"learning_rate": 8.392857142857144e-05, |
|
"loss": 0.0205, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.43097643097643096, |
|
"grad_norm": 0.023479154333472252, |
|
"learning_rate": 8.571428571428571e-05, |
|
"loss": 0.0207, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.4399551066217733, |
|
"grad_norm": 0.027948766946792603, |
|
"learning_rate": 8.75e-05, |
|
"loss": 0.0199, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.4489337822671156, |
|
"grad_norm": 0.028703948482871056, |
|
"learning_rate": 8.92857142857143e-05, |
|
"loss": 0.0206, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.4489337822671156, |
|
"eval_loss": 0.019621608778834343, |
|
"eval_runtime": 6.2561, |
|
"eval_samples_per_second": 7.992, |
|
"eval_steps_per_second": 2.078, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.45791245791245794, |
|
"grad_norm": 0.021676093339920044, |
|
"learning_rate": 9.107142857142857e-05, |
|
"loss": 0.0203, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.4668911335578002, |
|
"grad_norm": 0.02454349212348461, |
|
"learning_rate": 9.285714285714286e-05, |
|
"loss": 0.0209, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.47586980920314254, |
|
"grad_norm": 0.02334459312260151, |
|
"learning_rate": 9.464285714285715e-05, |
|
"loss": 0.0197, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.48484848484848486, |
|
"grad_norm": 0.03568955883383751, |
|
"learning_rate": 9.642857142857143e-05, |
|
"loss": 0.018, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.49382716049382713, |
|
"grad_norm": 0.03706267848610878, |
|
"learning_rate": 9.821428571428572e-05, |
|
"loss": 0.0196, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.49382716049382713, |
|
"eval_loss": 0.01931421272456646, |
|
"eval_runtime": 6.2564, |
|
"eval_samples_per_second": 7.992, |
|
"eval_steps_per_second": 2.078, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5028058361391695, |
|
"grad_norm": 0.033131491392850876, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0195, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.5117845117845118, |
|
"grad_norm": 0.025286365300416946, |
|
"learning_rate": 9.999900908311602e-05, |
|
"loss": 0.0195, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.5207631874298541, |
|
"grad_norm": 0.03591889888048172, |
|
"learning_rate": 9.999603637174071e-05, |
|
"loss": 0.0195, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.5297418630751964, |
|
"grad_norm": 0.025091370567679405, |
|
"learning_rate": 9.999108198370249e-05, |
|
"loss": 0.0195, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.5387205387205387, |
|
"grad_norm": 0.030630730092525482, |
|
"learning_rate": 9.998414611537681e-05, |
|
"loss": 0.0173, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5387205387205387, |
|
"eval_loss": 0.019306931644678116, |
|
"eval_runtime": 6.2911, |
|
"eval_samples_per_second": 7.948, |
|
"eval_steps_per_second": 2.066, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.547699214365881, |
|
"grad_norm": 0.03352862969040871, |
|
"learning_rate": 9.997522904167844e-05, |
|
"loss": 0.0199, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.5566778900112234, |
|
"grad_norm": 0.029540032148361206, |
|
"learning_rate": 9.996433111605052e-05, |
|
"loss": 0.0211, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.5656565656565656, |
|
"grad_norm": 0.03420255333185196, |
|
"learning_rate": 9.995145277045061e-05, |
|
"loss": 0.0181, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.574635241301908, |
|
"grad_norm": 0.026620274409651756, |
|
"learning_rate": 9.993659451533353e-05, |
|
"loss": 0.0206, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.5836139169472503, |
|
"grad_norm": 0.02483481727540493, |
|
"learning_rate": 9.991975693963107e-05, |
|
"loss": 0.0184, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.5836139169472503, |
|
"eval_loss": 0.019347084686160088, |
|
"eval_runtime": 6.2549, |
|
"eval_samples_per_second": 7.994, |
|
"eval_steps_per_second": 2.078, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.5925925925925926, |
|
"grad_norm": 0.0339241661131382, |
|
"learning_rate": 9.990094071072877e-05, |
|
"loss": 0.0193, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.6015712682379349, |
|
"grad_norm": 0.0241928081959486, |
|
"learning_rate": 9.988014657443941e-05, |
|
"loss": 0.0193, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.6105499438832772, |
|
"grad_norm": 0.029822858050465584, |
|
"learning_rate": 9.985737535497337e-05, |
|
"loss": 0.0184, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.6195286195286195, |
|
"grad_norm": 0.023182636126875877, |
|
"learning_rate": 9.983262795490613e-05, |
|
"loss": 0.0183, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.6285072951739619, |
|
"grad_norm": 0.021179169416427612, |
|
"learning_rate": 9.980590535514233e-05, |
|
"loss": 0.0194, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6285072951739619, |
|
"eval_loss": 0.01905701868236065, |
|
"eval_runtime": 6.2492, |
|
"eval_samples_per_second": 8.001, |
|
"eval_steps_per_second": 2.08, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6374859708193041, |
|
"grad_norm": 0.02753937430679798, |
|
"learning_rate": 9.9777208614877e-05, |
|
"loss": 0.0189, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.6464646464646465, |
|
"grad_norm": 0.027653615921735764, |
|
"learning_rate": 9.97465388715535e-05, |
|
"loss": 0.0191, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.6554433221099888, |
|
"grad_norm": 0.020188456401228905, |
|
"learning_rate": 9.971389734081848e-05, |
|
"loss": 0.0187, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.6644219977553311, |
|
"grad_norm": 0.029147446155548096, |
|
"learning_rate": 9.967928531647374e-05, |
|
"loss": 0.0177, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.6734006734006734, |
|
"grad_norm": 0.023492921143770218, |
|
"learning_rate": 9.96427041704248e-05, |
|
"loss": 0.0182, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.6734006734006734, |
|
"eval_loss": 0.018457170575857162, |
|
"eval_runtime": 6.2525, |
|
"eval_samples_per_second": 7.997, |
|
"eval_steps_per_second": 2.079, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.6823793490460157, |
|
"grad_norm": 0.016839003190398216, |
|
"learning_rate": 9.960415535262671e-05, |
|
"loss": 0.0169, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.691358024691358, |
|
"grad_norm": 0.024092335253953934, |
|
"learning_rate": 9.956364039102642e-05, |
|
"loss": 0.0184, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.7003367003367004, |
|
"grad_norm": 0.026985522359609604, |
|
"learning_rate": 9.952116089150232e-05, |
|
"loss": 0.0187, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.7093153759820426, |
|
"grad_norm": 0.01973740942776203, |
|
"learning_rate": 9.947671853780054e-05, |
|
"loss": 0.0166, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.7182940516273849, |
|
"grad_norm": 0.01972176879644394, |
|
"learning_rate": 9.943031509146825e-05, |
|
"loss": 0.0169, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.7182940516273849, |
|
"eval_loss": 0.01826069876551628, |
|
"eval_runtime": 6.2512, |
|
"eval_samples_per_second": 7.998, |
|
"eval_steps_per_second": 2.08, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 0.020164869725704193, |
|
"learning_rate": 9.938195239178374e-05, |
|
"loss": 0.0172, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.7362514029180696, |
|
"grad_norm": 0.02163533680140972, |
|
"learning_rate": 9.933163235568367e-05, |
|
"loss": 0.0183, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.745230078563412, |
|
"grad_norm": 0.018531063571572304, |
|
"learning_rate": 9.927935697768698e-05, |
|
"loss": 0.0171, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.7542087542087542, |
|
"grad_norm": 0.027429502457380295, |
|
"learning_rate": 9.922512832981584e-05, |
|
"loss": 0.0214, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.7631874298540965, |
|
"grad_norm": 0.019089698791503906, |
|
"learning_rate": 9.916894856151357e-05, |
|
"loss": 0.0176, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.7631874298540965, |
|
"eval_loss": 0.017831869423389435, |
|
"eval_runtime": 6.2838, |
|
"eval_samples_per_second": 7.957, |
|
"eval_steps_per_second": 2.069, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.7721661054994389, |
|
"grad_norm": 0.018925415351986885, |
|
"learning_rate": 9.91108198995594e-05, |
|
"loss": 0.0156, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.7811447811447811, |
|
"grad_norm": 0.021234937012195587, |
|
"learning_rate": 9.905074464798024e-05, |
|
"loss": 0.0186, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.7901234567901234, |
|
"grad_norm": 0.018775297328829765, |
|
"learning_rate": 9.898872518795932e-05, |
|
"loss": 0.0163, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.7991021324354658, |
|
"grad_norm": 0.016489777714014053, |
|
"learning_rate": 9.892476397774186e-05, |
|
"loss": 0.0173, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.8080808080808081, |
|
"grad_norm": 0.018042977899312973, |
|
"learning_rate": 9.885886355253758e-05, |
|
"loss": 0.0158, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.8080808080808081, |
|
"eval_loss": 0.017583945766091347, |
|
"eval_runtime": 6.2497, |
|
"eval_samples_per_second": 8.0, |
|
"eval_steps_per_second": 2.08, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.8170594837261503, |
|
"grad_norm": 0.020747726783156395, |
|
"learning_rate": 9.879102652442024e-05, |
|
"loss": 0.0178, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.8260381593714927, |
|
"grad_norm": 0.024066736921668053, |
|
"learning_rate": 9.872125558222409e-05, |
|
"loss": 0.0158, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.835016835016835, |
|
"grad_norm": 0.015819448977708817, |
|
"learning_rate": 9.864955349143734e-05, |
|
"loss": 0.0162, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.8439955106621774, |
|
"grad_norm": 0.018356909975409508, |
|
"learning_rate": 9.857592309409247e-05, |
|
"loss": 0.0154, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.8529741863075196, |
|
"grad_norm": 0.027337217703461647, |
|
"learning_rate": 9.850036730865364e-05, |
|
"loss": 0.02, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.8529741863075196, |
|
"eval_loss": 0.017212502658367157, |
|
"eval_runtime": 6.3032, |
|
"eval_samples_per_second": 7.932, |
|
"eval_steps_per_second": 2.062, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.8619528619528619, |
|
"grad_norm": 0.019369499757885933, |
|
"learning_rate": 9.842288912990096e-05, |
|
"loss": 0.0194, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.8709315375982043, |
|
"grad_norm": 0.0162061657756567, |
|
"learning_rate": 9.83434916288119e-05, |
|
"loss": 0.0152, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.8799102132435466, |
|
"grad_norm": 0.023736393079161644, |
|
"learning_rate": 9.82621779524394e-05, |
|
"loss": 0.017, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.01927589811384678, |
|
"learning_rate": 9.817895132378725e-05, |
|
"loss": 0.0158, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.8978675645342312, |
|
"grad_norm": 0.025313647463917732, |
|
"learning_rate": 9.809381504168234e-05, |
|
"loss": 0.0165, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8978675645342312, |
|
"eval_loss": 0.017287207767367363, |
|
"eval_runtime": 6.2491, |
|
"eval_samples_per_second": 8.001, |
|
"eval_steps_per_second": 2.08, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.9068462401795735, |
|
"grad_norm": 0.021926885470747948, |
|
"learning_rate": 9.800677248064382e-05, |
|
"loss": 0.0169, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.9158249158249159, |
|
"grad_norm": 0.02485627494752407, |
|
"learning_rate": 9.791782709074944e-05, |
|
"loss": 0.0152, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.9248035914702581, |
|
"grad_norm": 0.021951181814074516, |
|
"learning_rate": 9.782698239749873e-05, |
|
"loss": 0.017, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.9337822671156004, |
|
"grad_norm": 0.02424493059515953, |
|
"learning_rate": 9.77342420016733e-05, |
|
"loss": 0.0172, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.9427609427609428, |
|
"grad_norm": 0.028901271522045135, |
|
"learning_rate": 9.763960957919413e-05, |
|
"loss": 0.0181, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.9427609427609428, |
|
"eval_loss": 0.016786355525255203, |
|
"eval_runtime": 6.2498, |
|
"eval_samples_per_second": 8.0, |
|
"eval_steps_per_second": 2.08, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.9517396184062851, |
|
"grad_norm": 0.017801107838749886, |
|
"learning_rate": 9.754308888097583e-05, |
|
"loss": 0.0165, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.9607182940516273, |
|
"grad_norm": 0.037076808512210846, |
|
"learning_rate": 9.744468373277797e-05, |
|
"loss": 0.0162, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.9696969696969697, |
|
"grad_norm": 0.019457461312413216, |
|
"learning_rate": 9.734439803505345e-05, |
|
"loss": 0.0158, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.978675645342312, |
|
"grad_norm": 0.02191469632089138, |
|
"learning_rate": 9.724223576279395e-05, |
|
"loss": 0.0163, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.9876543209876543, |
|
"grad_norm": 0.024242915213108063, |
|
"learning_rate": 9.713820096537225e-05, |
|
"loss": 0.0176, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.9876543209876543, |
|
"eval_loss": 0.016797909513115883, |
|
"eval_runtime": 6.2548, |
|
"eval_samples_per_second": 7.994, |
|
"eval_steps_per_second": 2.078, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.9966329966329966, |
|
"grad_norm": 0.02136295475065708, |
|
"learning_rate": 9.703229776638185e-05, |
|
"loss": 0.0166, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.0078563411896746, |
|
"grad_norm": 0.08904732018709183, |
|
"learning_rate": 9.692453036347351e-05, |
|
"loss": 0.0323, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.0168350168350169, |
|
"grad_norm": 0.031474873423576355, |
|
"learning_rate": 9.681490302818874e-05, |
|
"loss": 0.0159, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.0258136924803591, |
|
"grad_norm": 0.02854473888874054, |
|
"learning_rate": 9.670342010579065e-05, |
|
"loss": 0.0141, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.0347923681257014, |
|
"grad_norm": 0.029452061280608177, |
|
"learning_rate": 9.659008601509168e-05, |
|
"loss": 0.0184, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.0347923681257014, |
|
"eval_loss": 0.0183156318962574, |
|
"eval_runtime": 6.2458, |
|
"eval_samples_per_second": 8.005, |
|
"eval_steps_per_second": 2.081, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.0437710437710437, |
|
"grad_norm": 0.02467503771185875, |
|
"learning_rate": 9.647490524827834e-05, |
|
"loss": 0.0157, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.0527497194163862, |
|
"grad_norm": 0.021488968282938004, |
|
"learning_rate": 9.635788237073334e-05, |
|
"loss": 0.0152, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.0617283950617284, |
|
"grad_norm": 0.02372926101088524, |
|
"learning_rate": 9.623902202085444e-05, |
|
"loss": 0.0176, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.0707070707070707, |
|
"grad_norm": 0.01808401569724083, |
|
"learning_rate": 9.611832890987076e-05, |
|
"loss": 0.0156, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.079685746352413, |
|
"grad_norm": 0.021259043365716934, |
|
"learning_rate": 9.599580782165598e-05, |
|
"loss": 0.0162, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.079685746352413, |
|
"eval_loss": 0.017910869792103767, |
|
"eval_runtime": 6.2515, |
|
"eval_samples_per_second": 7.998, |
|
"eval_steps_per_second": 2.079, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.0886644219977553, |
|
"grad_norm": 0.024560727179050446, |
|
"learning_rate": 9.587146361253868e-05, |
|
"loss": 0.0161, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.0976430976430978, |
|
"grad_norm": 0.02424251101911068, |
|
"learning_rate": 9.57453012111099e-05, |
|
"loss": 0.0178, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.10662177328844, |
|
"grad_norm": 0.021675804629921913, |
|
"learning_rate": 9.561732561802778e-05, |
|
"loss": 0.017, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.1156004489337823, |
|
"grad_norm": 0.019624771550297737, |
|
"learning_rate": 9.548754190581939e-05, |
|
"loss": 0.017, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.1245791245791246, |
|
"grad_norm": 0.019135547801852226, |
|
"learning_rate": 9.53559552186796e-05, |
|
"loss": 0.017, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.1245791245791246, |
|
"eval_loss": 0.016845999285578728, |
|
"eval_runtime": 6.2653, |
|
"eval_samples_per_second": 7.98, |
|
"eval_steps_per_second": 2.075, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.1335578002244668, |
|
"grad_norm": 0.012162208557128906, |
|
"learning_rate": 9.522257077226717e-05, |
|
"loss": 0.0123, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.142536475869809, |
|
"grad_norm": 0.017864948138594627, |
|
"learning_rate": 9.508739385349812e-05, |
|
"loss": 0.017, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.1515151515151516, |
|
"grad_norm": 0.017155688256025314, |
|
"learning_rate": 9.49504298203361e-05, |
|
"loss": 0.0156, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.1604938271604939, |
|
"grad_norm": 0.016472933813929558, |
|
"learning_rate": 9.481168410158003e-05, |
|
"loss": 0.0154, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.1694725028058361, |
|
"grad_norm": 0.014728706330060959, |
|
"learning_rate": 9.467116219664894e-05, |
|
"loss": 0.0143, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.1694725028058361, |
|
"eval_loss": 0.016655858606100082, |
|
"eval_runtime": 6.2511, |
|
"eval_samples_per_second": 7.999, |
|
"eval_steps_per_second": 2.08, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.1784511784511784, |
|
"grad_norm": 0.014423094689846039, |
|
"learning_rate": 9.45288696753639e-05, |
|
"loss": 0.015, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.1874298540965207, |
|
"grad_norm": 0.01742757484316826, |
|
"learning_rate": 9.438481217772744e-05, |
|
"loss": 0.016, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.1964085297418632, |
|
"grad_norm": 0.01627536118030548, |
|
"learning_rate": 9.423899541369978e-05, |
|
"loss": 0.0131, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.2053872053872055, |
|
"grad_norm": 0.017055079340934753, |
|
"learning_rate": 9.409142516297269e-05, |
|
"loss": 0.016, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.2143658810325477, |
|
"grad_norm": 0.01879395917057991, |
|
"learning_rate": 9.394210727474028e-05, |
|
"loss": 0.0177, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.2143658810325477, |
|
"eval_loss": 0.016584472730755806, |
|
"eval_runtime": 6.2484, |
|
"eval_samples_per_second": 8.002, |
|
"eval_steps_per_second": 2.081, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.22334455667789, |
|
"grad_norm": 0.01821809820830822, |
|
"learning_rate": 9.379104766746722e-05, |
|
"loss": 0.0163, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.2323232323232323, |
|
"grad_norm": 0.018335649743676186, |
|
"learning_rate": 9.363825232865413e-05, |
|
"loss": 0.0138, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.2413019079685745, |
|
"grad_norm": 0.014240071177482605, |
|
"learning_rate": 9.348372731460023e-05, |
|
"loss": 0.0119, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.250280583613917, |
|
"grad_norm": 0.02089606784284115, |
|
"learning_rate": 9.332747875016332e-05, |
|
"loss": 0.0166, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.2592592592592593, |
|
"grad_norm": 0.023216476663947105, |
|
"learning_rate": 9.316951282851707e-05, |
|
"loss": 0.0138, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.2592592592592593, |
|
"eval_loss": 0.01608719676733017, |
|
"eval_runtime": 6.3168, |
|
"eval_samples_per_second": 7.915, |
|
"eval_steps_per_second": 2.058, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.2682379349046016, |
|
"grad_norm": 0.01914984919130802, |
|
"learning_rate": 9.300983581090541e-05, |
|
"loss": 0.0157, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.2772166105499438, |
|
"grad_norm": 0.020874816924333572, |
|
"learning_rate": 9.284845402639446e-05, |
|
"loss": 0.0142, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.2861952861952861, |
|
"grad_norm": 0.017665155231952667, |
|
"learning_rate": 9.26853738716216e-05, |
|
"loss": 0.0148, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.2951739618406286, |
|
"grad_norm": 0.01927882246673107, |
|
"learning_rate": 9.2520601810542e-05, |
|
"loss": 0.0153, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.3041526374859709, |
|
"grad_norm": 0.018328847363591194, |
|
"learning_rate": 9.235414437417234e-05, |
|
"loss": 0.0149, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.3041526374859709, |
|
"eval_loss": 0.01574772223830223, |
|
"eval_runtime": 6.2751, |
|
"eval_samples_per_second": 7.968, |
|
"eval_steps_per_second": 2.072, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.3131313131313131, |
|
"grad_norm": 0.016565755009651184, |
|
"learning_rate": 9.2186008160332e-05, |
|
"loss": 0.0124, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.3221099887766554, |
|
"grad_norm": 0.023094868287444115, |
|
"learning_rate": 9.201619983338153e-05, |
|
"loss": 0.0188, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.3310886644219977, |
|
"grad_norm": 0.022368893027305603, |
|
"learning_rate": 9.18447261239584e-05, |
|
"loss": 0.0128, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.34006734006734, |
|
"grad_norm": 0.013615472242236137, |
|
"learning_rate": 9.167159382871039e-05, |
|
"loss": 0.0142, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.3490460157126825, |
|
"grad_norm": 0.021693168208003044, |
|
"learning_rate": 9.149680981002609e-05, |
|
"loss": 0.0162, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.3490460157126825, |
|
"eval_loss": 0.0159834623336792, |
|
"eval_runtime": 6.2576, |
|
"eval_samples_per_second": 7.99, |
|
"eval_steps_per_second": 2.077, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.3580246913580247, |
|
"grad_norm": 0.016352703794836998, |
|
"learning_rate": 9.13203809957629e-05, |
|
"loss": 0.0149, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.367003367003367, |
|
"grad_norm": 0.019222285598516464, |
|
"learning_rate": 9.114231437897244e-05, |
|
"loss": 0.0166, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.3759820426487093, |
|
"grad_norm": 0.014903879724442959, |
|
"learning_rate": 9.096261701762342e-05, |
|
"loss": 0.0146, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.3849607182940518, |
|
"grad_norm": 0.01564696989953518, |
|
"learning_rate": 9.078129603432181e-05, |
|
"loss": 0.0141, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.393939393939394, |
|
"grad_norm": 0.020336154848337173, |
|
"learning_rate": 9.059835861602853e-05, |
|
"loss": 0.0148, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.393939393939394, |
|
"eval_loss": 0.015588033944368362, |
|
"eval_runtime": 6.2569, |
|
"eval_samples_per_second": 7.991, |
|
"eval_steps_per_second": 2.078, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.4029180695847363, |
|
"grad_norm": 0.01628415659070015, |
|
"learning_rate": 9.041381201377468e-05, |
|
"loss": 0.0152, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.4118967452300786, |
|
"grad_norm": 0.01812385767698288, |
|
"learning_rate": 9.0227663542374e-05, |
|
"loss": 0.0146, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.4208754208754208, |
|
"grad_norm": 0.023446347564458847, |
|
"learning_rate": 9.003992058013302e-05, |
|
"loss": 0.015, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.4298540965207631, |
|
"grad_norm": 0.02031407319009304, |
|
"learning_rate": 8.985059056855858e-05, |
|
"loss": 0.0144, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.4388327721661054, |
|
"grad_norm": 0.022712191566824913, |
|
"learning_rate": 8.965968101206291e-05, |
|
"loss": 0.0168, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.4388327721661054, |
|
"eval_loss": 0.015444611199200153, |
|
"eval_runtime": 6.2545, |
|
"eval_samples_per_second": 7.994, |
|
"eval_steps_per_second": 2.079, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.4478114478114479, |
|
"grad_norm": 0.018817342817783356, |
|
"learning_rate": 8.94671994776661e-05, |
|
"loss": 0.0147, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.4567901234567902, |
|
"grad_norm": 0.014623799361288548, |
|
"learning_rate": 8.927315359469626e-05, |
|
"loss": 0.0129, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.4657687991021324, |
|
"grad_norm": 0.02344674989581108, |
|
"learning_rate": 8.907755105448704e-05, |
|
"loss": 0.019, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.4747474747474747, |
|
"grad_norm": 0.02390502393245697, |
|
"learning_rate": 8.888039961007282e-05, |
|
"loss": 0.0157, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.4837261503928172, |
|
"grad_norm": 0.017417486757040024, |
|
"learning_rate": 8.868170707588142e-05, |
|
"loss": 0.0148, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.4837261503928172, |
|
"eval_loss": 0.015270690433681011, |
|
"eval_runtime": 6.2526, |
|
"eval_samples_per_second": 7.997, |
|
"eval_steps_per_second": 2.079, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.4927048260381595, |
|
"grad_norm": 0.01864814944565296, |
|
"learning_rate": 8.848148132742431e-05, |
|
"loss": 0.0133, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.5016835016835017, |
|
"grad_norm": 0.014610537327826023, |
|
"learning_rate": 8.827973030098448e-05, |
|
"loss": 0.0135, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.510662177328844, |
|
"grad_norm": 0.0179497878998518, |
|
"learning_rate": 8.807646199330187e-05, |
|
"loss": 0.0159, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.5196408529741863, |
|
"grad_norm": 0.024205263704061508, |
|
"learning_rate": 8.787168446125638e-05, |
|
"loss": 0.0129, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.5286195286195285, |
|
"grad_norm": 0.018040824681520462, |
|
"learning_rate": 8.766540582154859e-05, |
|
"loss": 0.0146, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.5286195286195285, |
|
"eval_loss": 0.015371887013316154, |
|
"eval_runtime": 6.2476, |
|
"eval_samples_per_second": 8.003, |
|
"eval_steps_per_second": 2.081, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.5375982042648708, |
|
"grad_norm": 0.017247065901756287, |
|
"learning_rate": 8.745763425037797e-05, |
|
"loss": 0.015, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.546576879910213, |
|
"grad_norm": 0.019299406558275223, |
|
"learning_rate": 8.724837798311882e-05, |
|
"loss": 0.0153, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.5555555555555556, |
|
"grad_norm": 0.015991205349564552, |
|
"learning_rate": 8.703764531399392e-05, |
|
"loss": 0.0122, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.5645342312008979, |
|
"grad_norm": 0.015190811827778816, |
|
"learning_rate": 8.682544459574562e-05, |
|
"loss": 0.0144, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.5735129068462403, |
|
"grad_norm": 0.01828867383301258, |
|
"learning_rate": 8.661178423930491e-05, |
|
"loss": 0.0137, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.5735129068462403, |
|
"eval_loss": 0.014978926628828049, |
|
"eval_runtime": 6.2611, |
|
"eval_samples_per_second": 7.986, |
|
"eval_steps_per_second": 2.076, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.5824915824915826, |
|
"grad_norm": 0.01734633930027485, |
|
"learning_rate": 8.639667271345798e-05, |
|
"loss": 0.0177, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.5914702581369249, |
|
"grad_norm": 0.019932016730308533, |
|
"learning_rate": 8.618011854451056e-05, |
|
"loss": 0.0115, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.6004489337822672, |
|
"grad_norm": 0.026310300454497337, |
|
"learning_rate": 8.596213031594991e-05, |
|
"loss": 0.0167, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.6094276094276094, |
|
"grad_norm": 0.024424167349934578, |
|
"learning_rate": 8.57427166681047e-05, |
|
"loss": 0.0153, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.6184062850729517, |
|
"grad_norm": 0.016459695994853973, |
|
"learning_rate": 8.552188629780244e-05, |
|
"loss": 0.0144, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.6184062850729517, |
|
"eval_loss": 0.014994567260146141, |
|
"eval_runtime": 6.2493, |
|
"eval_samples_per_second": 8.001, |
|
"eval_steps_per_second": 2.08, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.627384960718294, |
|
"grad_norm": 0.021496936678886414, |
|
"learning_rate": 8.529964795802485e-05, |
|
"loss": 0.0125, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.6363636363636362, |
|
"grad_norm": 0.019307784736156464, |
|
"learning_rate": 8.507601045756085e-05, |
|
"loss": 0.0152, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.6453423120089785, |
|
"grad_norm": 0.016401201486587524, |
|
"learning_rate": 8.485098266065744e-05, |
|
"loss": 0.0125, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.654320987654321, |
|
"grad_norm": 0.023525064811110497, |
|
"learning_rate": 8.462457348666835e-05, |
|
"loss": 0.0163, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.6632996632996633, |
|
"grad_norm": 0.020476635545492172, |
|
"learning_rate": 8.439679190970052e-05, |
|
"loss": 0.0129, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.6632996632996633, |
|
"eval_loss": 0.014757846482098103, |
|
"eval_runtime": 6.3022, |
|
"eval_samples_per_second": 7.934, |
|
"eval_steps_per_second": 2.063, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.6722783389450058, |
|
"grad_norm": 0.027425814419984818, |
|
"learning_rate": 8.416764695825835e-05, |
|
"loss": 0.015, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.681257014590348, |
|
"grad_norm": 0.019989849999547005, |
|
"learning_rate": 8.39371477148859e-05, |
|
"loss": 0.0166, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.6902356902356903, |
|
"grad_norm": 0.015978503972291946, |
|
"learning_rate": 8.370530331580686e-05, |
|
"loss": 0.0131, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.6992143658810326, |
|
"grad_norm": 0.02169989049434662, |
|
"learning_rate": 8.347212295056239e-05, |
|
"loss": 0.0158, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.7081930415263749, |
|
"grad_norm": 0.02388261817395687, |
|
"learning_rate": 8.323761586164695e-05, |
|
"loss": 0.0139, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.7081930415263749, |
|
"eval_loss": 0.014534353278577328, |
|
"eval_runtime": 6.2541, |
|
"eval_samples_per_second": 7.995, |
|
"eval_steps_per_second": 2.079, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.7171717171717171, |
|
"grad_norm": 0.014390088617801666, |
|
"learning_rate": 8.300179134414188e-05, |
|
"loss": 0.0116, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.7261503928170594, |
|
"grad_norm": 0.022068368270993233, |
|
"learning_rate": 8.276465874534702e-05, |
|
"loss": 0.0127, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.7351290684624017, |
|
"grad_norm": 0.020960543304681778, |
|
"learning_rate": 8.252622746441021e-05, |
|
"loss": 0.0142, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.7441077441077442, |
|
"grad_norm": 0.01721160300076008, |
|
"learning_rate": 8.228650695195472e-05, |
|
"loss": 0.0151, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.7530864197530864, |
|
"grad_norm": 0.014986108988523483, |
|
"learning_rate": 8.204550670970469e-05, |
|
"loss": 0.013, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.7530864197530864, |
|
"eval_loss": 0.014544461853802204, |
|
"eval_runtime": 6.2711, |
|
"eval_samples_per_second": 7.973, |
|
"eval_steps_per_second": 2.073, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.7620650953984287, |
|
"grad_norm": 0.01918155886232853, |
|
"learning_rate": 8.180323629010848e-05, |
|
"loss": 0.0146, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.7710437710437712, |
|
"grad_norm": 0.015731407329440117, |
|
"learning_rate": 8.155970529596006e-05, |
|
"loss": 0.0147, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.7800224466891135, |
|
"grad_norm": 0.01721978560090065, |
|
"learning_rate": 8.131492338001839e-05, |
|
"loss": 0.0132, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.7890011223344557, |
|
"grad_norm": 0.019031619653105736, |
|
"learning_rate": 8.106890024462481e-05, |
|
"loss": 0.0149, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.797979797979798, |
|
"grad_norm": 0.017630157992243767, |
|
"learning_rate": 8.082164564131845e-05, |
|
"loss": 0.013, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.797979797979798, |
|
"eval_loss": 0.01437403354793787, |
|
"eval_runtime": 6.2492, |
|
"eval_samples_per_second": 8.001, |
|
"eval_steps_per_second": 2.08, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.8069584736251403, |
|
"grad_norm": 0.02445027232170105, |
|
"learning_rate": 8.057316937044977e-05, |
|
"loss": 0.018, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.8159371492704826, |
|
"grad_norm": 0.021661758422851562, |
|
"learning_rate": 8.032348128079203e-05, |
|
"loss": 0.0151, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.8249158249158248, |
|
"grad_norm": 0.01885199546813965, |
|
"learning_rate": 8.0072591269151e-05, |
|
"loss": 0.0135, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.833894500561167, |
|
"grad_norm": 0.020552242174744606, |
|
"learning_rate": 7.982050927997264e-05, |
|
"loss": 0.0141, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.8428731762065096, |
|
"grad_norm": 0.016624536365270615, |
|
"learning_rate": 7.956724530494887e-05, |
|
"loss": 0.0124, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.8428731762065096, |
|
"eval_loss": 0.014373213052749634, |
|
"eval_runtime": 6.2577, |
|
"eval_samples_per_second": 7.99, |
|
"eval_steps_per_second": 2.077, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.8518518518518519, |
|
"grad_norm": 0.023108718916773796, |
|
"learning_rate": 7.931280938262169e-05, |
|
"loss": 0.0166, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.8608305274971941, |
|
"grad_norm": 0.014059687964618206, |
|
"learning_rate": 7.905721159798513e-05, |
|
"loss": 0.0129, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.8698092031425366, |
|
"grad_norm": 0.019422784447669983, |
|
"learning_rate": 7.880046208208563e-05, |
|
"loss": 0.0149, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.878787878787879, |
|
"grad_norm": 0.01858607865869999, |
|
"learning_rate": 7.854257101162037e-05, |
|
"loss": 0.0134, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.8877665544332212, |
|
"grad_norm": 0.019326094537973404, |
|
"learning_rate": 7.828354860853399e-05, |
|
"loss": 0.0135, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.8877665544332212, |
|
"eval_loss": 0.014275193214416504, |
|
"eval_runtime": 6.2658, |
|
"eval_samples_per_second": 7.98, |
|
"eval_steps_per_second": 2.075, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.8967452300785634, |
|
"grad_norm": 0.013561426661908627, |
|
"learning_rate": 7.802340513961342e-05, |
|
"loss": 0.012, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.9057239057239057, |
|
"grad_norm": 0.01908908411860466, |
|
"learning_rate": 7.776215091608085e-05, |
|
"loss": 0.0132, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.914702581369248, |
|
"grad_norm": 0.016201447695493698, |
|
"learning_rate": 7.749979629318516e-05, |
|
"loss": 0.0126, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.9236812570145903, |
|
"grad_norm": 0.015391174703836441, |
|
"learning_rate": 7.723635166979133e-05, |
|
"loss": 0.0135, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.9326599326599325, |
|
"grad_norm": 0.016166241839528084, |
|
"learning_rate": 7.697182748796841e-05, |
|
"loss": 0.0128, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.9326599326599325, |
|
"eval_loss": 0.014707864262163639, |
|
"eval_runtime": 6.2641, |
|
"eval_samples_per_second": 7.982, |
|
"eval_steps_per_second": 2.075, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.941638608305275, |
|
"grad_norm": 0.02329264022409916, |
|
"learning_rate": 7.670623423257548e-05, |
|
"loss": 0.0131, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.9506172839506173, |
|
"grad_norm": 0.01891166903078556, |
|
"learning_rate": 7.64395824308462e-05, |
|
"loss": 0.0137, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.9595959595959596, |
|
"grad_norm": 0.015786990523338318, |
|
"learning_rate": 7.617188265197148e-05, |
|
"loss": 0.0128, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.968574635241302, |
|
"grad_norm": 0.021409448236227036, |
|
"learning_rate": 7.590314550668054e-05, |
|
"loss": 0.0142, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.9775533108866443, |
|
"grad_norm": 0.025052543729543686, |
|
"learning_rate": 7.563338164682036e-05, |
|
"loss": 0.0149, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.9775533108866443, |
|
"eval_loss": 0.01433955691754818, |
|
"eval_runtime": 6.2639, |
|
"eval_samples_per_second": 7.982, |
|
"eval_steps_per_second": 2.075, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.9865319865319866, |
|
"grad_norm": 0.013938682153820992, |
|
"learning_rate": 7.536260176493348e-05, |
|
"loss": 0.0143, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.9955106621773289, |
|
"grad_norm": 0.01827586442232132, |
|
"learning_rate": 7.509081659383417e-05, |
|
"loss": 0.0134, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.006734006734007, |
|
"grad_norm": 0.04617556184530258, |
|
"learning_rate": 7.481803690618304e-05, |
|
"loss": 0.0255, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 2.015712682379349, |
|
"grad_norm": 0.01754753105342388, |
|
"learning_rate": 7.454427351405999e-05, |
|
"loss": 0.0154, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.0246913580246915, |
|
"grad_norm": 0.025898197665810585, |
|
"learning_rate": 7.426953726853574e-05, |
|
"loss": 0.0138, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.0246913580246915, |
|
"eval_loss": 0.014397691935300827, |
|
"eval_runtime": 6.2513, |
|
"eval_samples_per_second": 7.998, |
|
"eval_steps_per_second": 2.08, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.0336700336700337, |
|
"grad_norm": 0.01992025040090084, |
|
"learning_rate": 7.399383905924165e-05, |
|
"loss": 0.0113, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 2.042648709315376, |
|
"grad_norm": 0.014167600311338902, |
|
"learning_rate": 7.371718981393815e-05, |
|
"loss": 0.0108, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 2.0516273849607183, |
|
"grad_norm": 0.021143754944205284, |
|
"learning_rate": 7.343960049808156e-05, |
|
"loss": 0.0136, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.0606060606060606, |
|
"grad_norm": 0.025208963081240654, |
|
"learning_rate": 7.316108211438945e-05, |
|
"loss": 0.0129, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 2.069584736251403, |
|
"grad_norm": 0.017190365120768547, |
|
"learning_rate": 7.288164570240463e-05, |
|
"loss": 0.0127, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.069584736251403, |
|
"eval_loss": 0.014279232360422611, |
|
"eval_runtime": 6.2544, |
|
"eval_samples_per_second": 7.994, |
|
"eval_steps_per_second": 2.079, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.078563411896745, |
|
"grad_norm": 0.01911742240190506, |
|
"learning_rate": 7.26013023380574e-05, |
|
"loss": 0.0121, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.0875420875420874, |
|
"grad_norm": 0.022378170862793922, |
|
"learning_rate": 7.232006313322667e-05, |
|
"loss": 0.013, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 2.0965207631874296, |
|
"grad_norm": 0.020126372575759888, |
|
"learning_rate": 7.203793923529956e-05, |
|
"loss": 0.0127, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 2.1054994388327724, |
|
"grad_norm": 0.018375013023614883, |
|
"learning_rate": 7.175494182672939e-05, |
|
"loss": 0.0141, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.1144781144781146, |
|
"grad_norm": 0.016569405794143677, |
|
"learning_rate": 7.147108212459257e-05, |
|
"loss": 0.0116, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.1144781144781146, |
|
"eval_loss": 0.0142152588814497, |
|
"eval_runtime": 6.2529, |
|
"eval_samples_per_second": 7.996, |
|
"eval_steps_per_second": 2.079, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.123456790123457, |
|
"grad_norm": 0.013866296038031578, |
|
"learning_rate": 7.118637138014396e-05, |
|
"loss": 0.011, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 2.132435465768799, |
|
"grad_norm": 0.0164170078933239, |
|
"learning_rate": 7.090082087837092e-05, |
|
"loss": 0.0137, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 2.1414141414141414, |
|
"grad_norm": 0.014653601683676243, |
|
"learning_rate": 7.061444193754596e-05, |
|
"loss": 0.012, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 2.1503928170594837, |
|
"grad_norm": 0.020761555060744286, |
|
"learning_rate": 7.032724590877821e-05, |
|
"loss": 0.0119, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 2.159371492704826, |
|
"grad_norm": 0.019674135372042656, |
|
"learning_rate": 7.003924417556343e-05, |
|
"loss": 0.0128, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.159371492704826, |
|
"eval_loss": 0.014257782138884068, |
|
"eval_runtime": 6.307, |
|
"eval_samples_per_second": 7.928, |
|
"eval_steps_per_second": 2.061, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.1683501683501682, |
|
"grad_norm": 0.017319727689027786, |
|
"learning_rate": 6.975044815333282e-05, |
|
"loss": 0.0109, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 2.1773288439955105, |
|
"grad_norm": 0.02167445980012417, |
|
"learning_rate": 6.946086928900054e-05, |
|
"loss": 0.0132, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 2.186307519640853, |
|
"grad_norm": 0.016814757138490677, |
|
"learning_rate": 6.917051906051006e-05, |
|
"loss": 0.0106, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.1952861952861955, |
|
"grad_norm": 0.012457667849957943, |
|
"learning_rate": 6.887940897637908e-05, |
|
"loss": 0.0103, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.204264870931538, |
|
"grad_norm": 0.022774742916226387, |
|
"learning_rate": 6.858755057524354e-05, |
|
"loss": 0.0145, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.204264870931538, |
|
"eval_loss": 0.014115707948803902, |
|
"eval_runtime": 6.2503, |
|
"eval_samples_per_second": 8.0, |
|
"eval_steps_per_second": 2.08, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.21324354657688, |
|
"grad_norm": 0.018725769594311714, |
|
"learning_rate": 6.829495542540013e-05, |
|
"loss": 0.0118, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 0.019993796944618225, |
|
"learning_rate": 6.80016351243478e-05, |
|
"loss": 0.0142, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 2.2312008978675646, |
|
"grad_norm": 0.016696954146027565, |
|
"learning_rate": 6.77076012983281e-05, |
|
"loss": 0.0119, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.240179573512907, |
|
"grad_norm": 0.016207238659262657, |
|
"learning_rate": 6.741286560186437e-05, |
|
"loss": 0.0096, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.249158249158249, |
|
"grad_norm": 0.019286343827843666, |
|
"learning_rate": 6.711743971729967e-05, |
|
"loss": 0.0147, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.249158249158249, |
|
"eval_loss": 0.013896584510803223, |
|
"eval_runtime": 6.2584, |
|
"eval_samples_per_second": 7.989, |
|
"eval_steps_per_second": 2.077, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.2581369248035914, |
|
"grad_norm": 0.013443589210510254, |
|
"learning_rate": 6.682133535433393e-05, |
|
"loss": 0.0102, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 2.2671156004489337, |
|
"grad_norm": 0.016910936683416367, |
|
"learning_rate": 6.652456424955963e-05, |
|
"loss": 0.0147, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.276094276094276, |
|
"grad_norm": 0.01767115481197834, |
|
"learning_rate": 6.622713816599673e-05, |
|
"loss": 0.0112, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 2.285072951739618, |
|
"grad_norm": 0.017718922346830368, |
|
"learning_rate": 6.592906889262632e-05, |
|
"loss": 0.013, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.2940516273849605, |
|
"grad_norm": 0.01557993981987238, |
|
"learning_rate": 6.563036824392344e-05, |
|
"loss": 0.0114, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.2940516273849605, |
|
"eval_loss": 0.013858611695468426, |
|
"eval_runtime": 6.2582, |
|
"eval_samples_per_second": 7.99, |
|
"eval_steps_per_second": 2.077, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.303030303030303, |
|
"grad_norm": 0.015414374880492687, |
|
"learning_rate": 6.533104805938873e-05, |
|
"loss": 0.0114, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.3120089786756455, |
|
"grad_norm": 0.01631319336593151, |
|
"learning_rate": 6.503112020307916e-05, |
|
"loss": 0.0116, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 2.3209876543209877, |
|
"grad_norm": 0.02034182660281658, |
|
"learning_rate": 6.473059656313782e-05, |
|
"loss": 0.0133, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.32996632996633, |
|
"grad_norm": 0.01835636980831623, |
|
"learning_rate": 6.442948905132266e-05, |
|
"loss": 0.0127, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.3389450056116723, |
|
"grad_norm": 0.01679323986172676, |
|
"learning_rate": 6.412780960253436e-05, |
|
"loss": 0.0114, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.3389450056116723, |
|
"eval_loss": 0.01390094868838787, |
|
"eval_runtime": 6.2592, |
|
"eval_samples_per_second": 7.988, |
|
"eval_steps_per_second": 2.077, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.3479236812570146, |
|
"grad_norm": 0.016404911875724792, |
|
"learning_rate": 6.382557017434332e-05, |
|
"loss": 0.0122, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.356902356902357, |
|
"grad_norm": 0.012671472504734993, |
|
"learning_rate": 6.352278274651561e-05, |
|
"loss": 0.0091, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.365881032547699, |
|
"grad_norm": 0.01708405278623104, |
|
"learning_rate": 6.321945932053822e-05, |
|
"loss": 0.0125, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 2.3748597081930414, |
|
"grad_norm": 0.019615929573774338, |
|
"learning_rate": 6.291561191914333e-05, |
|
"loss": 0.0125, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.3838383838383836, |
|
"grad_norm": 0.014311819337308407, |
|
"learning_rate": 6.261125258583171e-05, |
|
"loss": 0.0112, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.3838383838383836, |
|
"eval_loss": 0.013734661974012852, |
|
"eval_runtime": 6.2541, |
|
"eval_samples_per_second": 7.995, |
|
"eval_steps_per_second": 2.079, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.3928170594837264, |
|
"grad_norm": 0.01536188181489706, |
|
"learning_rate": 6.230639338439549e-05, |
|
"loss": 0.0134, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.4017957351290686, |
|
"grad_norm": 0.01723441854119301, |
|
"learning_rate": 6.200104639843985e-05, |
|
"loss": 0.0125, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.410774410774411, |
|
"grad_norm": 0.013469617813825607, |
|
"learning_rate": 6.169522373090412e-05, |
|
"loss": 0.0117, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.419753086419753, |
|
"grad_norm": 0.016679449006915092, |
|
"learning_rate": 6.138893750358212e-05, |
|
"loss": 0.012, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 2.4287317620650954, |
|
"grad_norm": 0.014688557013869286, |
|
"learning_rate": 6.108219985664161e-05, |
|
"loss": 0.0105, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.4287317620650954, |
|
"eval_loss": 0.013788803480565548, |
|
"eval_runtime": 6.2591, |
|
"eval_samples_per_second": 7.988, |
|
"eval_steps_per_second": 2.077, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.4377104377104377, |
|
"grad_norm": 0.01410532183945179, |
|
"learning_rate": 6.0775022948143115e-05, |
|
"loss": 0.0137, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 2.44668911335578, |
|
"grad_norm": 0.018949788063764572, |
|
"learning_rate": 6.046741895355802e-05, |
|
"loss": 0.0117, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.4556677890011223, |
|
"grad_norm": 0.01841340772807598, |
|
"learning_rate": 6.015940006528602e-05, |
|
"loss": 0.0108, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.4646464646464645, |
|
"grad_norm": 0.015688767656683922, |
|
"learning_rate": 5.9850978492171794e-05, |
|
"loss": 0.011, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.473625140291807, |
|
"grad_norm": 0.017808249220252037, |
|
"learning_rate": 5.954216645902109e-05, |
|
"loss": 0.0129, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.473625140291807, |
|
"eval_loss": 0.01363787055015564, |
|
"eval_runtime": 6.2939, |
|
"eval_samples_per_second": 7.944, |
|
"eval_steps_per_second": 2.066, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.482603815937149, |
|
"grad_norm": 0.016168171539902687, |
|
"learning_rate": 5.923297620611623e-05, |
|
"loss": 0.0107, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.4915824915824913, |
|
"grad_norm": 0.016473444178700447, |
|
"learning_rate": 5.892341998873089e-05, |
|
"loss": 0.0137, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.500561167227834, |
|
"grad_norm": 0.019177664071321487, |
|
"learning_rate": 5.861351007664434e-05, |
|
"loss": 0.0127, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.5095398428731763, |
|
"grad_norm": 0.01716047339141369, |
|
"learning_rate": 5.83032587536552e-05, |
|
"loss": 0.0127, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.5185185185185186, |
|
"grad_norm": 0.019998129457235336, |
|
"learning_rate": 5.799267831709442e-05, |
|
"loss": 0.014, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.5185185185185186, |
|
"eval_loss": 0.013543778099119663, |
|
"eval_runtime": 6.2505, |
|
"eval_samples_per_second": 7.999, |
|
"eval_steps_per_second": 2.08, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.527497194163861, |
|
"grad_norm": 0.013933787122368813, |
|
"learning_rate": 5.7681781077337905e-05, |
|
"loss": 0.0096, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.536475869809203, |
|
"grad_norm": 0.015824446454644203, |
|
"learning_rate": 5.737057935731868e-05, |
|
"loss": 0.0092, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.5454545454545454, |
|
"grad_norm": 0.01764868013560772, |
|
"learning_rate": 5.705908549203823e-05, |
|
"loss": 0.0126, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.5544332210998877, |
|
"grad_norm": 0.022920994088053703, |
|
"learning_rate": 5.674731182807781e-05, |
|
"loss": 0.0122, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.56341189674523, |
|
"grad_norm": 0.01605917327105999, |
|
"learning_rate": 5.643527072310891e-05, |
|
"loss": 0.0124, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.56341189674523, |
|
"eval_loss": 0.013581929728388786, |
|
"eval_runtime": 6.2549, |
|
"eval_samples_per_second": 7.994, |
|
"eval_steps_per_second": 2.078, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.5723905723905722, |
|
"grad_norm": 0.022532224655151367, |
|
"learning_rate": 5.612297454540352e-05, |
|
"loss": 0.0134, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.581369248035915, |
|
"grad_norm": 0.01806466281414032, |
|
"learning_rate": 5.581043567334383e-05, |
|
"loss": 0.0105, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.590347923681257, |
|
"grad_norm": 0.019253233447670937, |
|
"learning_rate": 5.5497666494931654e-05, |
|
"loss": 0.0116, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.5993265993265995, |
|
"grad_norm": 0.016167184337973595, |
|
"learning_rate": 5.518467940729739e-05, |
|
"loss": 0.0127, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.6083052749719418, |
|
"grad_norm": 0.019589709118008614, |
|
"learning_rate": 5.487148681620862e-05, |
|
"loss": 0.0128, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.6083052749719418, |
|
"eval_loss": 0.01327795721590519, |
|
"eval_runtime": 6.258, |
|
"eval_samples_per_second": 7.99, |
|
"eval_steps_per_second": 2.077, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.617283950617284, |
|
"grad_norm": 0.017755387350916862, |
|
"learning_rate": 5.455810113557839e-05, |
|
"loss": 0.0129, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.6262626262626263, |
|
"grad_norm": 0.01898750476539135, |
|
"learning_rate": 5.4244534786973214e-05, |
|
"loss": 0.0113, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.6352413019079686, |
|
"grad_norm": 0.014957334846258163, |
|
"learning_rate": 5.3930800199120616e-05, |
|
"loss": 0.0132, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 2.644219977553311, |
|
"grad_norm": 0.014244873076677322, |
|
"learning_rate": 5.361690980741663e-05, |
|
"loss": 0.0111, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.653198653198653, |
|
"grad_norm": 0.013911189511418343, |
|
"learning_rate": 5.330287605343279e-05, |
|
"loss": 0.0106, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.653198653198653, |
|
"eval_loss": 0.01291807834059, |
|
"eval_runtime": 6.2514, |
|
"eval_samples_per_second": 7.998, |
|
"eval_steps_per_second": 2.08, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.6621773288439954, |
|
"grad_norm": 0.017059607431292534, |
|
"learning_rate": 5.298871138442307e-05, |
|
"loss": 0.0127, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.6711560044893377, |
|
"grad_norm": 0.0140462601557374, |
|
"learning_rate": 5.267442825283048e-05, |
|
"loss": 0.0123, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 2.68013468013468, |
|
"grad_norm": 0.019492056220769882, |
|
"learning_rate": 5.236003911579345e-05, |
|
"loss": 0.0138, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 2.689113355780022, |
|
"grad_norm": 0.018796566873788834, |
|
"learning_rate": 5.204555643465215e-05, |
|
"loss": 0.011, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 2.698092031425365, |
|
"grad_norm": 0.012939135544002056, |
|
"learning_rate": 5.173099267445451e-05, |
|
"loss": 0.0099, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.698092031425365, |
|
"eval_loss": 0.012918239459395409, |
|
"eval_runtime": 6.2607, |
|
"eval_samples_per_second": 7.986, |
|
"eval_steps_per_second": 2.076, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.707070707070707, |
|
"grad_norm": 0.01702980510890484, |
|
"learning_rate": 5.1416360303462206e-05, |
|
"loss": 0.0116, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 2.7160493827160495, |
|
"grad_norm": 0.021528059616684914, |
|
"learning_rate": 5.110167179265636e-05, |
|
"loss": 0.0134, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 2.7250280583613917, |
|
"grad_norm": 0.013685944490134716, |
|
"learning_rate": 5.078693961524329e-05, |
|
"loss": 0.0109, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 2.734006734006734, |
|
"grad_norm": 0.016435936093330383, |
|
"learning_rate": 5.0472176246160184e-05, |
|
"loss": 0.0121, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 2.7429854096520763, |
|
"grad_norm": 0.019511640071868896, |
|
"learning_rate": 5.01573941615805e-05, |
|
"loss": 0.0111, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.7429854096520763, |
|
"eval_loss": 0.012916718609631062, |
|
"eval_runtime": 6.27, |
|
"eval_samples_per_second": 7.974, |
|
"eval_steps_per_second": 2.073, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.7519640852974185, |
|
"grad_norm": 0.01732565462589264, |
|
"learning_rate": 4.984260583841953e-05, |
|
"loss": 0.0115, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 2.760942760942761, |
|
"grad_norm": 0.021635642275214195, |
|
"learning_rate": 4.9527823753839834e-05, |
|
"loss": 0.0135, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 2.7699214365881035, |
|
"grad_norm": 0.012034310959279537, |
|
"learning_rate": 4.9213060384756716e-05, |
|
"loss": 0.009, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.778900112233446, |
|
"grad_norm": 0.020500048995018005, |
|
"learning_rate": 4.8898328207343666e-05, |
|
"loss": 0.0123, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.787878787878788, |
|
"grad_norm": 0.01599585823714733, |
|
"learning_rate": 4.858363969653781e-05, |
|
"loss": 0.0129, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.787878787878788, |
|
"eval_loss": 0.012887900695204735, |
|
"eval_runtime": 6.2513, |
|
"eval_samples_per_second": 7.998, |
|
"eval_steps_per_second": 2.08, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.7968574635241303, |
|
"grad_norm": 0.01829446479678154, |
|
"learning_rate": 4.8269007325545506e-05, |
|
"loss": 0.0127, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.8058361391694726, |
|
"grad_norm": 0.014843763783574104, |
|
"learning_rate": 4.7954443565347865e-05, |
|
"loss": 0.0104, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.814814814814815, |
|
"grad_norm": 0.018556272611021996, |
|
"learning_rate": 4.7639960884206576e-05, |
|
"loss": 0.0132, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.823793490460157, |
|
"grad_norm": 0.016161100938916206, |
|
"learning_rate": 4.7325571747169545e-05, |
|
"loss": 0.0106, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.8327721661054994, |
|
"grad_norm": 0.01768597401678562, |
|
"learning_rate": 4.7011288615576934e-05, |
|
"loss": 0.0088, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.8327721661054994, |
|
"eval_loss": 0.012852279469370842, |
|
"eval_runtime": 6.2725, |
|
"eval_samples_per_second": 7.971, |
|
"eval_steps_per_second": 2.073, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.8417508417508417, |
|
"grad_norm": 0.022071367129683495, |
|
"learning_rate": 4.6697123946567227e-05, |
|
"loss": 0.0159, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.850729517396184, |
|
"grad_norm": 0.012250754982233047, |
|
"learning_rate": 4.63830901925834e-05, |
|
"loss": 0.0098, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.8597081930415262, |
|
"grad_norm": 0.015096920542418957, |
|
"learning_rate": 4.60691998008794e-05, |
|
"loss": 0.011, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.8686868686868685, |
|
"grad_norm": 0.016319630667567253, |
|
"learning_rate": 4.575546521302681e-05, |
|
"loss": 0.0115, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.877665544332211, |
|
"grad_norm": 0.01583506353199482, |
|
"learning_rate": 4.544189886442162e-05, |
|
"loss": 0.0092, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.877665544332211, |
|
"eval_loss": 0.012964904308319092, |
|
"eval_runtime": 6.2575, |
|
"eval_samples_per_second": 7.99, |
|
"eval_steps_per_second": 2.077, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.886644219977553, |
|
"grad_norm": 0.020128175616264343, |
|
"learning_rate": 4.5128513183791386e-05, |
|
"loss": 0.0141, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.8956228956228958, |
|
"grad_norm": 0.019213715568184853, |
|
"learning_rate": 4.481532059270262e-05, |
|
"loss": 0.0115, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.904601571268238, |
|
"grad_norm": 0.024485057219862938, |
|
"learning_rate": 4.450233350506836e-05, |
|
"loss": 0.0135, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.9135802469135803, |
|
"grad_norm": 0.019438456743955612, |
|
"learning_rate": 4.418956432665618e-05, |
|
"loss": 0.0108, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.9225589225589226, |
|
"grad_norm": 0.013679172843694687, |
|
"learning_rate": 4.387702545459649e-05, |
|
"loss": 0.0086, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.9225589225589226, |
|
"eval_loss": 0.012879169546067715, |
|
"eval_runtime": 6.2664, |
|
"eval_samples_per_second": 7.979, |
|
"eval_steps_per_second": 2.075, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.931537598204265, |
|
"grad_norm": 0.018316788598895073, |
|
"learning_rate": 4.356472927689109e-05, |
|
"loss": 0.0112, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.940516273849607, |
|
"grad_norm": 0.01569702848792076, |
|
"learning_rate": 4.32526881719222e-05, |
|
"loss": 0.0129, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.9494949494949494, |
|
"grad_norm": 0.01372100692242384, |
|
"learning_rate": 4.2940914507961775e-05, |
|
"loss": 0.0104, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.9584736251402917, |
|
"grad_norm": 0.015811212360858917, |
|
"learning_rate": 4.262942064268134e-05, |
|
"loss": 0.0123, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.9674523007856344, |
|
"grad_norm": 0.018124472349882126, |
|
"learning_rate": 4.23182189226621e-05, |
|
"loss": 0.0132, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.9674523007856344, |
|
"eval_loss": 0.012610589154064655, |
|
"eval_runtime": 6.2539, |
|
"eval_samples_per_second": 7.995, |
|
"eval_steps_per_second": 2.079, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.9764309764309766, |
|
"grad_norm": 0.015013212338089943, |
|
"learning_rate": 4.20073216829056e-05, |
|
"loss": 0.0124, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.985409652076319, |
|
"grad_norm": 0.013129116035997868, |
|
"learning_rate": 4.169674124634481e-05, |
|
"loss": 0.009, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.994388327721661, |
|
"grad_norm": 0.01896873489022255, |
|
"learning_rate": 4.138648992335566e-05, |
|
"loss": 0.0131, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 3.005611672278339, |
|
"grad_norm": 0.03602827712893486, |
|
"learning_rate": 4.107658001126913e-05, |
|
"loss": 0.0173, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 3.014590347923681, |
|
"grad_norm": 0.021112319082021713, |
|
"learning_rate": 4.0767023793883785e-05, |
|
"loss": 0.0126, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 3.014590347923681, |
|
"eval_loss": 0.012994157150387764, |
|
"eval_runtime": 6.2538, |
|
"eval_samples_per_second": 7.995, |
|
"eval_steps_per_second": 2.079, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 3.0235690235690234, |
|
"grad_norm": 0.01987135224044323, |
|
"learning_rate": 4.045783354097893e-05, |
|
"loss": 0.0092, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 3.032547699214366, |
|
"grad_norm": 0.025037603452801704, |
|
"learning_rate": 4.0149021507828224e-05, |
|
"loss": 0.0131, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 3.0415263748597083, |
|
"grad_norm": 0.017175879329442978, |
|
"learning_rate": 3.984059993471399e-05, |
|
"loss": 0.0086, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 3.0505050505050506, |
|
"grad_norm": 0.018214913085103035, |
|
"learning_rate": 3.9532581046442e-05, |
|
"loss": 0.0104, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 3.059483726150393, |
|
"grad_norm": 0.016980677843093872, |
|
"learning_rate": 3.9224977051856904e-05, |
|
"loss": 0.0117, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.059483726150393, |
|
"eval_loss": 0.013331728056073189, |
|
"eval_runtime": 6.2785, |
|
"eval_samples_per_second": 7.964, |
|
"eval_steps_per_second": 2.071, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.068462401795735, |
|
"grad_norm": 0.016574935987591743, |
|
"learning_rate": 3.8917800143358404e-05, |
|
"loss": 0.0077, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 3.0774410774410774, |
|
"grad_norm": 0.020380405709147453, |
|
"learning_rate": 3.861106249641789e-05, |
|
"loss": 0.0097, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 3.0864197530864197, |
|
"grad_norm": 0.026123059913516045, |
|
"learning_rate": 3.830477626909589e-05, |
|
"loss": 0.0125, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 3.095398428731762, |
|
"grad_norm": 0.02141587808728218, |
|
"learning_rate": 3.7998953601560175e-05, |
|
"loss": 0.0111, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 3.1043771043771042, |
|
"grad_norm": 0.021793678402900696, |
|
"learning_rate": 3.769360661560453e-05, |
|
"loss": 0.0102, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 3.1043771043771042, |
|
"eval_loss": 0.013248049654066563, |
|
"eval_runtime": 6.2478, |
|
"eval_samples_per_second": 8.003, |
|
"eval_steps_per_second": 2.081, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 3.1133557800224465, |
|
"grad_norm": 0.013916864059865475, |
|
"learning_rate": 3.73887474141683e-05, |
|
"loss": 0.0088, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 3.122334455667789, |
|
"grad_norm": 0.015638204291462898, |
|
"learning_rate": 3.708438808085668e-05, |
|
"loss": 0.01, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 3.1313131313131315, |
|
"grad_norm": 0.017211005091667175, |
|
"learning_rate": 3.6780540679461784e-05, |
|
"loss": 0.0091, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 3.1402918069584738, |
|
"grad_norm": 0.02173846773803234, |
|
"learning_rate": 3.64772172534844e-05, |
|
"loss": 0.0118, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 3.149270482603816, |
|
"grad_norm": 0.012991265393793583, |
|
"learning_rate": 3.6174429825656685e-05, |
|
"loss": 0.0074, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 3.149270482603816, |
|
"eval_loss": 0.013174821622669697, |
|
"eval_runtime": 6.296, |
|
"eval_samples_per_second": 7.942, |
|
"eval_steps_per_second": 2.065, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 3.1582491582491583, |
|
"grad_norm": 0.02047666721045971, |
|
"learning_rate": 3.587219039746564e-05, |
|
"loss": 0.0124, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 3.1672278338945006, |
|
"grad_norm": 0.0238481592386961, |
|
"learning_rate": 3.557051094867735e-05, |
|
"loss": 0.0122, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 3.176206509539843, |
|
"grad_norm": 0.01623239740729332, |
|
"learning_rate": 3.5269403436862175e-05, |
|
"loss": 0.0089, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 3.185185185185185, |
|
"grad_norm": 0.019198792055249214, |
|
"learning_rate": 3.496887979692084e-05, |
|
"loss": 0.0111, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 3.1941638608305274, |
|
"grad_norm": 0.019949574023485184, |
|
"learning_rate": 3.466895194061128e-05, |
|
"loss": 0.0105, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 3.1941638608305274, |
|
"eval_loss": 0.01294049434363842, |
|
"eval_runtime": 6.2659, |
|
"eval_samples_per_second": 7.98, |
|
"eval_steps_per_second": 2.075, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 3.2031425364758697, |
|
"grad_norm": 0.01906234584748745, |
|
"learning_rate": 3.436963175607656e-05, |
|
"loss": 0.0096, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 3.212121212121212, |
|
"grad_norm": 0.019578171893954277, |
|
"learning_rate": 3.4070931107373675e-05, |
|
"loss": 0.0092, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 3.221099887766554, |
|
"grad_norm": 0.015484723262488842, |
|
"learning_rate": 3.377286183400328e-05, |
|
"loss": 0.011, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 3.230078563411897, |
|
"grad_norm": 0.017633194103837013, |
|
"learning_rate": 3.3475435750440356e-05, |
|
"loss": 0.0101, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 3.239057239057239, |
|
"grad_norm": 0.020478319376707077, |
|
"learning_rate": 3.3178664645666066e-05, |
|
"loss": 0.0117, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.239057239057239, |
|
"eval_loss": 0.012872601859271526, |
|
"eval_runtime": 6.2587, |
|
"eval_samples_per_second": 7.989, |
|
"eval_steps_per_second": 2.077, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.2480359147025815, |
|
"grad_norm": 0.015297391451895237, |
|
"learning_rate": 3.2882560282700336e-05, |
|
"loss": 0.0096, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 3.2570145903479237, |
|
"grad_norm": 0.01740657351911068, |
|
"learning_rate": 3.258713439813566e-05, |
|
"loss": 0.0105, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 3.265993265993266, |
|
"grad_norm": 0.019270438700914383, |
|
"learning_rate": 3.229239870167191e-05, |
|
"loss": 0.0103, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 3.2749719416386083, |
|
"grad_norm": 0.017028817906975746, |
|
"learning_rate": 3.199836487565222e-05, |
|
"loss": 0.0109, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 3.2839506172839505, |
|
"grad_norm": 0.0167669877409935, |
|
"learning_rate": 3.170504457459989e-05, |
|
"loss": 0.0107, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 3.2839506172839505, |
|
"eval_loss": 0.012733125127851963, |
|
"eval_runtime": 6.2533, |
|
"eval_samples_per_second": 7.996, |
|
"eval_steps_per_second": 2.079, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 3.292929292929293, |
|
"grad_norm": 0.01785757951438427, |
|
"learning_rate": 3.1412449424756474e-05, |
|
"loss": 0.0097, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 3.301907968574635, |
|
"grad_norm": 0.01732640527188778, |
|
"learning_rate": 3.112059102362093e-05, |
|
"loss": 0.0106, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 3.3108866442199774, |
|
"grad_norm": 0.015373657457530499, |
|
"learning_rate": 3.082948093948997e-05, |
|
"loss": 0.0094, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 3.31986531986532, |
|
"grad_norm": 0.01795523799955845, |
|
"learning_rate": 3.053913071099947e-05, |
|
"loss": 0.0096, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 3.3288439955106623, |
|
"grad_norm": 0.017400693148374557, |
|
"learning_rate": 3.0249551846667207e-05, |
|
"loss": 0.0098, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 3.3288439955106623, |
|
"eval_loss": 0.012761359103024006, |
|
"eval_runtime": 6.2866, |
|
"eval_samples_per_second": 7.953, |
|
"eval_steps_per_second": 2.068, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 3.3378226711560046, |
|
"grad_norm": 0.02055426687002182, |
|
"learning_rate": 2.996075582443658e-05, |
|
"loss": 0.0115, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 3.346801346801347, |
|
"grad_norm": 0.014372746460139751, |
|
"learning_rate": 2.9672754091221805e-05, |
|
"loss": 0.0086, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 3.355780022446689, |
|
"grad_norm": 0.01651662401854992, |
|
"learning_rate": 2.938555806245406e-05, |
|
"loss": 0.0106, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 3.3647586980920314, |
|
"grad_norm": 0.016991253942251205, |
|
"learning_rate": 2.9099179121629117e-05, |
|
"loss": 0.0115, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 3.3737373737373737, |
|
"grad_norm": 0.016502108424901962, |
|
"learning_rate": 2.881362861985606e-05, |
|
"loss": 0.0092, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 3.3737373737373737, |
|
"eval_loss": 0.012749603018164635, |
|
"eval_runtime": 6.2727, |
|
"eval_samples_per_second": 7.971, |
|
"eval_steps_per_second": 2.072, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 3.382716049382716, |
|
"grad_norm": 0.015984639525413513, |
|
"learning_rate": 2.8528917875407433e-05, |
|
"loss": 0.0094, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 3.3916947250280582, |
|
"grad_norm": 0.03028254769742489, |
|
"learning_rate": 2.8245058173270622e-05, |
|
"loss": 0.0102, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 3.4006734006734005, |
|
"grad_norm": 0.021088914945721626, |
|
"learning_rate": 2.796206076470044e-05, |
|
"loss": 0.0082, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 3.409652076318743, |
|
"grad_norm": 0.019139107316732407, |
|
"learning_rate": 2.7679936866773315e-05, |
|
"loss": 0.0095, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 3.418630751964085, |
|
"grad_norm": 0.018426954746246338, |
|
"learning_rate": 2.739869766194263e-05, |
|
"loss": 0.0114, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.418630751964085, |
|
"eval_loss": 0.012560264207422733, |
|
"eval_runtime": 6.2597, |
|
"eval_samples_per_second": 7.988, |
|
"eval_steps_per_second": 2.077, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.4276094276094278, |
|
"grad_norm": 0.019736235961318016, |
|
"learning_rate": 2.7118354297595396e-05, |
|
"loss": 0.01, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 3.43658810325477, |
|
"grad_norm": 0.015529734082520008, |
|
"learning_rate": 2.683891788561055e-05, |
|
"loss": 0.0109, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 3.4455667789001123, |
|
"grad_norm": 0.01170337200164795, |
|
"learning_rate": 2.6560399501918465e-05, |
|
"loss": 0.0083, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 3.4545454545454546, |
|
"grad_norm": 0.016096895560622215, |
|
"learning_rate": 2.6282810186061862e-05, |
|
"loss": 0.0095, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 3.463524130190797, |
|
"grad_norm": 0.022902794182300568, |
|
"learning_rate": 2.600616094075835e-05, |
|
"loss": 0.0118, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 3.463524130190797, |
|
"eval_loss": 0.012463411316275597, |
|
"eval_runtime": 6.256, |
|
"eval_samples_per_second": 7.992, |
|
"eval_steps_per_second": 2.078, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 3.472502805836139, |
|
"grad_norm": 0.01635258086025715, |
|
"learning_rate": 2.5730462731464273e-05, |
|
"loss": 0.0106, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 3.4814814814814814, |
|
"grad_norm": 0.019135868176817894, |
|
"learning_rate": 2.5455726485940012e-05, |
|
"loss": 0.0088, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 3.4904601571268237, |
|
"grad_norm": 0.015336276032030582, |
|
"learning_rate": 2.5181963093816962e-05, |
|
"loss": 0.0086, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 3.499438832772166, |
|
"grad_norm": 0.019149543717503548, |
|
"learning_rate": 2.4909183406165836e-05, |
|
"loss": 0.0095, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 3.5084175084175087, |
|
"grad_norm": 0.01899711787700653, |
|
"learning_rate": 2.4637398235066527e-05, |
|
"loss": 0.0108, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.5084175084175087, |
|
"eval_loss": 0.012311117723584175, |
|
"eval_runtime": 6.2629, |
|
"eval_samples_per_second": 7.983, |
|
"eval_steps_per_second": 2.076, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.517396184062851, |
|
"grad_norm": 0.025319568812847137, |
|
"learning_rate": 2.4366618353179644e-05, |
|
"loss": 0.0128, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 3.526374859708193, |
|
"grad_norm": 0.014999724924564362, |
|
"learning_rate": 2.4096854493319477e-05, |
|
"loss": 0.0089, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 3.5353535353535355, |
|
"grad_norm": 0.0133373336866498, |
|
"learning_rate": 2.3828117348028528e-05, |
|
"loss": 0.0088, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 3.5443322109988777, |
|
"grad_norm": 0.016392188146710396, |
|
"learning_rate": 2.3560417569153796e-05, |
|
"loss": 0.0096, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 3.55331088664422, |
|
"grad_norm": 0.017216209322214127, |
|
"learning_rate": 2.3293765767424537e-05, |
|
"loss": 0.0092, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 3.55331088664422, |
|
"eval_loss": 0.012257438153028488, |
|
"eval_runtime": 6.2604, |
|
"eval_samples_per_second": 7.987, |
|
"eval_steps_per_second": 2.077, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 3.5622895622895623, |
|
"grad_norm": 0.021824954077601433, |
|
"learning_rate": 2.3028172512031604e-05, |
|
"loss": 0.0101, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 3.5712682379349046, |
|
"grad_norm": 0.017453951761126518, |
|
"learning_rate": 2.276364833020868e-05, |
|
"loss": 0.0103, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 3.580246913580247, |
|
"grad_norm": 0.014409264549612999, |
|
"learning_rate": 2.2500203706814856e-05, |
|
"loss": 0.0089, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 3.589225589225589, |
|
"grad_norm": 0.015458152629435062, |
|
"learning_rate": 2.2237849083919142e-05, |
|
"loss": 0.0097, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 3.5982042648709314, |
|
"grad_norm": 0.01887071318924427, |
|
"learning_rate": 2.1976594860386597e-05, |
|
"loss": 0.0085, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.5982042648709314, |
|
"eval_loss": 0.01230713166296482, |
|
"eval_runtime": 6.2496, |
|
"eval_samples_per_second": 8.0, |
|
"eval_steps_per_second": 2.08, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.6071829405162736, |
|
"grad_norm": 0.019031310454010963, |
|
"learning_rate": 2.1716451391466008e-05, |
|
"loss": 0.01, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 3.616161616161616, |
|
"grad_norm": 0.016506759449839592, |
|
"learning_rate": 2.1457428988379635e-05, |
|
"loss": 0.01, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 3.6251402918069586, |
|
"grad_norm": 0.016758006066083908, |
|
"learning_rate": 2.1199537917914386e-05, |
|
"loss": 0.01, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 3.634118967452301, |
|
"grad_norm": 0.02015763334929943, |
|
"learning_rate": 2.0942788402014867e-05, |
|
"loss": 0.0097, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 3.643097643097643, |
|
"grad_norm": 0.021632149815559387, |
|
"learning_rate": 2.068719061737831e-05, |
|
"loss": 0.0088, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 3.643097643097643, |
|
"eval_loss": 0.012639960274100304, |
|
"eval_runtime": 6.255, |
|
"eval_samples_per_second": 7.994, |
|
"eval_steps_per_second": 2.078, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 3.6520763187429854, |
|
"grad_norm": 0.020619019865989685, |
|
"learning_rate": 2.0432754695051136e-05, |
|
"loss": 0.0112, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 3.6610549943883277, |
|
"grad_norm": 0.018945058807730675, |
|
"learning_rate": 2.0179490720027372e-05, |
|
"loss": 0.0104, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 3.67003367003367, |
|
"grad_norm": 0.016077589243650436, |
|
"learning_rate": 1.992740873084899e-05, |
|
"loss": 0.0084, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 3.6790123456790123, |
|
"grad_norm": 0.01626184582710266, |
|
"learning_rate": 1.9676518719207977e-05, |
|
"loss": 0.0093, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 3.6879910213243545, |
|
"grad_norm": 0.01499089039862156, |
|
"learning_rate": 1.9426830629550242e-05, |
|
"loss": 0.0095, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.6879910213243545, |
|
"eval_loss": 0.012408388778567314, |
|
"eval_runtime": 6.2471, |
|
"eval_samples_per_second": 8.004, |
|
"eval_steps_per_second": 2.081, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.6969696969696972, |
|
"grad_norm": 0.01810368523001671, |
|
"learning_rate": 1.917835435868155e-05, |
|
"loss": 0.0101, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 3.7059483726150395, |
|
"grad_norm": 0.016352152451872826, |
|
"learning_rate": 1.8931099755375203e-05, |
|
"loss": 0.0085, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 3.714927048260382, |
|
"grad_norm": 0.01950286328792572, |
|
"learning_rate": 1.8685076619981608e-05, |
|
"loss": 0.0102, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 3.723905723905724, |
|
"grad_norm": 0.016881579533219337, |
|
"learning_rate": 1.844029470403993e-05, |
|
"loss": 0.0097, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 3.7328843995510663, |
|
"grad_norm": 0.020003410056233406, |
|
"learning_rate": 1.8196763709891524e-05, |
|
"loss": 0.0072, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.7328843995510663, |
|
"eval_loss": 0.012383312918245792, |
|
"eval_runtime": 6.2483, |
|
"eval_samples_per_second": 8.002, |
|
"eval_steps_per_second": 2.081, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.7418630751964086, |
|
"grad_norm": 0.021129626780748367, |
|
"learning_rate": 1.795449329029531e-05, |
|
"loss": 0.0103, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 3.750841750841751, |
|
"grad_norm": 0.02026401087641716, |
|
"learning_rate": 1.7713493048045294e-05, |
|
"loss": 0.0108, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 3.759820426487093, |
|
"grad_norm": 0.019643913954496384, |
|
"learning_rate": 1.747377253558982e-05, |
|
"loss": 0.0102, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 3.7687991021324354, |
|
"grad_norm": 0.02257615514099598, |
|
"learning_rate": 1.7235341254653005e-05, |
|
"loss": 0.0103, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 3.7777777777777777, |
|
"grad_norm": 0.018804267048835754, |
|
"learning_rate": 1.6998208655858137e-05, |
|
"loss": 0.0105, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.7777777777777777, |
|
"eval_loss": 0.012317318469285965, |
|
"eval_runtime": 6.2923, |
|
"eval_samples_per_second": 7.946, |
|
"eval_steps_per_second": 2.066, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.78675645342312, |
|
"grad_norm": 0.01885891705751419, |
|
"learning_rate": 1.6762384138353078e-05, |
|
"loss": 0.011, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 3.795735129068462, |
|
"grad_norm": 0.016811871901154518, |
|
"learning_rate": 1.6527877049437622e-05, |
|
"loss": 0.0082, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 3.8047138047138045, |
|
"grad_norm": 0.01912999525666237, |
|
"learning_rate": 1.6294696684193154e-05, |
|
"loss": 0.0095, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 3.8136924803591468, |
|
"grad_norm": 0.01893465779721737, |
|
"learning_rate": 1.6062852285114123e-05, |
|
"loss": 0.0091, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 3.8226711560044895, |
|
"grad_norm": 0.019678136333823204, |
|
"learning_rate": 1.583235304174167e-05, |
|
"loss": 0.0115, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.8226711560044895, |
|
"eval_loss": 0.01215137168765068, |
|
"eval_runtime": 6.2539, |
|
"eval_samples_per_second": 7.995, |
|
"eval_steps_per_second": 2.079, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.8316498316498318, |
|
"grad_norm": 0.017283251509070396, |
|
"learning_rate": 1.5603208090299498e-05, |
|
"loss": 0.0082, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 3.840628507295174, |
|
"grad_norm": 0.018386101350188255, |
|
"learning_rate": 1.537542651333167e-05, |
|
"loss": 0.0098, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 3.8496071829405163, |
|
"grad_norm": 0.02193758450448513, |
|
"learning_rate": 1.5149017339342574e-05, |
|
"loss": 0.0105, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 3.8585858585858586, |
|
"grad_norm": 0.025852926075458527, |
|
"learning_rate": 1.4923989542439159e-05, |
|
"loss": 0.0108, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 3.867564534231201, |
|
"grad_norm": 0.015607142820954323, |
|
"learning_rate": 1.4700352041975168e-05, |
|
"loss": 0.007, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.867564534231201, |
|
"eval_loss": 0.012098519131541252, |
|
"eval_runtime": 6.2731, |
|
"eval_samples_per_second": 7.971, |
|
"eval_steps_per_second": 2.072, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.876543209876543, |
|
"grad_norm": 0.018707241863012314, |
|
"learning_rate": 1.447811370219757e-05, |
|
"loss": 0.01, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 3.8855218855218854, |
|
"grad_norm": 0.017066778615117073, |
|
"learning_rate": 1.4257283331895315e-05, |
|
"loss": 0.0082, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 3.894500561167228, |
|
"grad_norm": 0.02140822634100914, |
|
"learning_rate": 1.4037869684050115e-05, |
|
"loss": 0.0117, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 3.9034792368125704, |
|
"grad_norm": 0.020129108801484108, |
|
"learning_rate": 1.3819881455489458e-05, |
|
"loss": 0.0085, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 3.9124579124579126, |
|
"grad_norm": 0.020222559571266174, |
|
"learning_rate": 1.3603327286542023e-05, |
|
"loss": 0.0112, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 3.9124579124579126, |
|
"eval_loss": 0.012134820222854614, |
|
"eval_runtime": 6.2796, |
|
"eval_samples_per_second": 7.962, |
|
"eval_steps_per_second": 2.07, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 3.921436588103255, |
|
"grad_norm": 0.019684189930558205, |
|
"learning_rate": 1.33882157606951e-05, |
|
"loss": 0.0092, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 3.930415263748597, |
|
"grad_norm": 0.01632900908589363, |
|
"learning_rate": 1.317455540425439e-05, |
|
"loss": 0.0074, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 3.9393939393939394, |
|
"grad_norm": 0.021984471008181572, |
|
"learning_rate": 1.2962354686006084e-05, |
|
"loss": 0.0115, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 3.9483726150392817, |
|
"grad_norm": 0.014504092745482922, |
|
"learning_rate": 1.2751622016881182e-05, |
|
"loss": 0.0085, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 3.957351290684624, |
|
"grad_norm": 0.027196183800697327, |
|
"learning_rate": 1.2542365749622049e-05, |
|
"loss": 0.0103, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.957351290684624, |
|
"eval_loss": 0.012064680457115173, |
|
"eval_runtime": 6.2674, |
|
"eval_samples_per_second": 7.978, |
|
"eval_steps_per_second": 2.074, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.9663299663299663, |
|
"grad_norm": 0.02008941024541855, |
|
"learning_rate": 1.2334594178451425e-05, |
|
"loss": 0.0088, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 3.9753086419753085, |
|
"grad_norm": 0.017460942268371582, |
|
"learning_rate": 1.2128315538743646e-05, |
|
"loss": 0.0074, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 3.984287317620651, |
|
"grad_norm": 0.01718473620712757, |
|
"learning_rate": 1.1923538006698154e-05, |
|
"loss": 0.0099, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 3.993265993265993, |
|
"grad_norm": 0.017572911456227303, |
|
"learning_rate": 1.172026969901553e-05, |
|
"loss": 0.0089, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 4.004489337822672, |
|
"grad_norm": 0.05128241330385208, |
|
"learning_rate": 1.1518518672575701e-05, |
|
"loss": 0.0162, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 4.004489337822672, |
|
"eval_loss": 0.0121712451800704, |
|
"eval_runtime": 6.2683, |
|
"eval_samples_per_second": 7.977, |
|
"eval_steps_per_second": 2.074, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 4.013468013468014, |
|
"grad_norm": 0.017392810434103012, |
|
"learning_rate": 1.1318292924118584e-05, |
|
"loss": 0.0092, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 4.022446689113356, |
|
"grad_norm": 0.014832595363259315, |
|
"learning_rate": 1.1119600389927182e-05, |
|
"loss": 0.0089, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 4.031425364758698, |
|
"grad_norm": 0.014501234516501427, |
|
"learning_rate": 1.092244894551298e-05, |
|
"loss": 0.0076, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 4.040404040404041, |
|
"grad_norm": 0.01787727326154709, |
|
"learning_rate": 1.0726846405303754e-05, |
|
"loss": 0.0091, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 4.049382716049383, |
|
"grad_norm": 0.018708007410168648, |
|
"learning_rate": 1.0532800522333902e-05, |
|
"loss": 0.0079, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 4.049382716049383, |
|
"eval_loss": 0.0124581940472126, |
|
"eval_runtime": 6.2521, |
|
"eval_samples_per_second": 7.997, |
|
"eval_steps_per_second": 2.079, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 4.058361391694725, |
|
"grad_norm": 0.017758728936314583, |
|
"learning_rate": 1.0340318987937097e-05, |
|
"loss": 0.0086, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 4.0673400673400675, |
|
"grad_norm": 0.02003531903028488, |
|
"learning_rate": 1.014940943144142e-05, |
|
"loss": 0.0099, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 4.07631874298541, |
|
"grad_norm": 0.020393820479512215, |
|
"learning_rate": 9.960079419866985e-06, |
|
"loss": 0.0067, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 4.085297418630752, |
|
"grad_norm": 0.015872273594141006, |
|
"learning_rate": 9.772336457626014e-06, |
|
"loss": 0.0065, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 4.094276094276094, |
|
"grad_norm": 0.027125949040055275, |
|
"learning_rate": 9.586187986225325e-06, |
|
"loss": 0.0102, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 4.094276094276094, |
|
"eval_loss": 0.012580028735101223, |
|
"eval_runtime": 6.2573, |
|
"eval_samples_per_second": 7.991, |
|
"eval_steps_per_second": 2.078, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 4.103254769921437, |
|
"grad_norm": 0.023796912282705307, |
|
"learning_rate": 9.401641383971477e-06, |
|
"loss": 0.01, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 4.112233445566779, |
|
"grad_norm": 0.02073628082871437, |
|
"learning_rate": 9.218703965678204e-06, |
|
"loss": 0.0101, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 4.121212121212121, |
|
"grad_norm": 0.01744106411933899, |
|
"learning_rate": 9.03738298237658e-06, |
|
"loss": 0.0063, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 4.130190796857463, |
|
"grad_norm": 0.023079855367541313, |
|
"learning_rate": 8.857685621027568e-06, |
|
"loss": 0.0078, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 4.139169472502806, |
|
"grad_norm": 0.020268626511096954, |
|
"learning_rate": 8.67961900423711e-06, |
|
"loss": 0.0087, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 4.139169472502806, |
|
"eval_loss": 0.012577519752085209, |
|
"eval_runtime": 6.2777, |
|
"eval_samples_per_second": 7.965, |
|
"eval_steps_per_second": 2.071, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 4.148148148148148, |
|
"grad_norm": 0.020592128857970238, |
|
"learning_rate": 8.503190189973914e-06, |
|
"loss": 0.0099, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 4.15712682379349, |
|
"grad_norm": 0.016794390976428986, |
|
"learning_rate": 8.328406171289621e-06, |
|
"loss": 0.008, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 4.1661054994388325, |
|
"grad_norm": 0.016793973743915558, |
|
"learning_rate": 8.155273876041614e-06, |
|
"loss": 0.0076, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 4.175084175084175, |
|
"grad_norm": 0.01984918676316738, |
|
"learning_rate": 7.983800166618482e-06, |
|
"loss": 0.0087, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 4.184062850729517, |
|
"grad_norm": 0.034957610070705414, |
|
"learning_rate": 7.813991839667995e-06, |
|
"loss": 0.0107, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 4.184062850729517, |
|
"eval_loss": 0.012637750245630741, |
|
"eval_runtime": 6.3143, |
|
"eval_samples_per_second": 7.919, |
|
"eval_steps_per_second": 2.059, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 4.193041526374859, |
|
"grad_norm": 0.0235477052628994, |
|
"learning_rate": 7.645855625827658e-06, |
|
"loss": 0.007, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 4.202020202020202, |
|
"grad_norm": 0.021280469372868538, |
|
"learning_rate": 7.4793981894580034e-06, |
|
"loss": 0.0071, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 4.210998877665545, |
|
"grad_norm": 0.022229325026273727, |
|
"learning_rate": 7.3146261283784104e-06, |
|
"loss": 0.0086, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 4.219977553310887, |
|
"grad_norm": 0.020385632291436195, |
|
"learning_rate": 7.1515459736055505e-06, |
|
"loss": 0.0071, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 4.228956228956229, |
|
"grad_norm": 0.018814127892255783, |
|
"learning_rate": 6.990164189094589e-06, |
|
"loss": 0.0105, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 4.228956228956229, |
|
"eval_loss": 0.012454288080334663, |
|
"eval_runtime": 6.259, |
|
"eval_samples_per_second": 7.988, |
|
"eval_steps_per_second": 2.077, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 4.2379349046015715, |
|
"grad_norm": 0.022755559533834457, |
|
"learning_rate": 6.830487171482935e-06, |
|
"loss": 0.0085, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 4.246913580246914, |
|
"grad_norm": 0.019691364839673042, |
|
"learning_rate": 6.6725212498366885e-06, |
|
"loss": 0.0087, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 4.255892255892256, |
|
"grad_norm": 0.021712522953748703, |
|
"learning_rate": 6.516272685399793e-06, |
|
"loss": 0.009, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 4.264870931537598, |
|
"grad_norm": 0.015157987363636494, |
|
"learning_rate": 6.36174767134588e-06, |
|
"loss": 0.0056, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 4.273849607182941, |
|
"grad_norm": 0.01883016712963581, |
|
"learning_rate": 6.208952332532786e-06, |
|
"loss": 0.0089, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 4.273849607182941, |
|
"eval_loss": 0.012438948266208172, |
|
"eval_runtime": 6.2582, |
|
"eval_samples_per_second": 7.99, |
|
"eval_steps_per_second": 2.077, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 4.282828282828283, |
|
"grad_norm": 0.018075603991746902, |
|
"learning_rate": 6.057892725259717e-06, |
|
"loss": 0.0079, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 4.291806958473625, |
|
"grad_norm": 0.021707097068428993, |
|
"learning_rate": 5.908574837027309e-06, |
|
"loss": 0.0086, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 4.300785634118967, |
|
"grad_norm": 0.029295941814780235, |
|
"learning_rate": 5.761004586300234e-06, |
|
"loss": 0.0092, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 4.30976430976431, |
|
"grad_norm": 0.015479645691812038, |
|
"learning_rate": 5.615187822272583e-06, |
|
"loss": 0.0073, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 4.318742985409652, |
|
"grad_norm": 0.017396703362464905, |
|
"learning_rate": 5.4711303246361144e-06, |
|
"loss": 0.0061, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 4.318742985409652, |
|
"eval_loss": 0.012455189600586891, |
|
"eval_runtime": 6.2775, |
|
"eval_samples_per_second": 7.965, |
|
"eval_steps_per_second": 2.071, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 4.327721661054994, |
|
"grad_norm": 0.019893964752554893, |
|
"learning_rate": 5.328837803351083e-06, |
|
"loss": 0.008, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 4.3367003367003365, |
|
"grad_norm": 0.01733791083097458, |
|
"learning_rate": 5.188315898419971e-06, |
|
"loss": 0.0085, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 4.345679012345679, |
|
"grad_norm": 0.03376675769686699, |
|
"learning_rate": 5.04957017966391e-06, |
|
"loss": 0.0079, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 4.354657687991021, |
|
"grad_norm": 0.01885765977203846, |
|
"learning_rate": 4.912606146501886e-06, |
|
"loss": 0.0103, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 4.363636363636363, |
|
"grad_norm": 0.023781761527061462, |
|
"learning_rate": 4.777429227732844e-06, |
|
"loss": 0.0074, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 4.363636363636363, |
|
"eval_loss": 0.012579456903040409, |
|
"eval_runtime": 6.2532, |
|
"eval_samples_per_second": 7.996, |
|
"eval_steps_per_second": 2.079, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 4.372615039281706, |
|
"grad_norm": 0.020697975531220436, |
|
"learning_rate": 4.644044781320422e-06, |
|
"loss": 0.0082, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 4.381593714927048, |
|
"grad_norm": 0.01883615553379059, |
|
"learning_rate": 4.5124580941806165e-06, |
|
"loss": 0.0066, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 4.390572390572391, |
|
"grad_norm": 0.0223472248762846, |
|
"learning_rate": 4.382674381972224e-06, |
|
"loss": 0.0094, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 4.399551066217733, |
|
"grad_norm": 0.02023017778992653, |
|
"learning_rate": 4.254698788890127e-06, |
|
"loss": 0.0084, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 4.408529741863076, |
|
"grad_norm": 0.01804838329553604, |
|
"learning_rate": 4.12853638746134e-06, |
|
"loss": 0.008, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 4.408529741863076, |
|
"eval_loss": 0.01255893800407648, |
|
"eval_runtime": 6.2608, |
|
"eval_samples_per_second": 7.986, |
|
"eval_steps_per_second": 2.076, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 4.417508417508418, |
|
"grad_norm": 0.02165042981505394, |
|
"learning_rate": 4.004192178344029e-06, |
|
"loss": 0.0089, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 4.42648709315376, |
|
"grad_norm": 0.018235059455037117, |
|
"learning_rate": 3.881671090129247e-06, |
|
"loss": 0.0074, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 4.435465768799102, |
|
"grad_norm": 0.01595655083656311, |
|
"learning_rate": 3.7609779791455744e-06, |
|
"loss": 0.0077, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"grad_norm": 0.01976379007101059, |
|
"learning_rate": 3.6421176292666783e-06, |
|
"loss": 0.0094, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 4.453423120089787, |
|
"grad_norm": 0.02034621126949787, |
|
"learning_rate": 3.5250947517216637e-06, |
|
"loss": 0.0092, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 4.453423120089787, |
|
"eval_loss": 0.012510711327195168, |
|
"eval_runtime": 6.2609, |
|
"eval_samples_per_second": 7.986, |
|
"eval_steps_per_second": 2.076, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 4.462401795735129, |
|
"grad_norm": 0.017096711322665215, |
|
"learning_rate": 3.4099139849083307e-06, |
|
"loss": 0.0068, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 4.4713804713804715, |
|
"grad_norm": 0.01568697765469551, |
|
"learning_rate": 3.296579894209345e-06, |
|
"loss": 0.006, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 4.480359147025814, |
|
"grad_norm": 0.03203177452087402, |
|
"learning_rate": 3.1850969718112745e-06, |
|
"loss": 0.0101, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 4.489337822671156, |
|
"grad_norm": 0.021872689947485924, |
|
"learning_rate": 3.0754696365265068e-06, |
|
"loss": 0.0071, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 4.498316498316498, |
|
"grad_norm": 0.02181348390877247, |
|
"learning_rate": 2.9677022336181413e-06, |
|
"loss": 0.0092, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.498316498316498, |
|
"eval_loss": 0.012468446046113968, |
|
"eval_runtime": 6.2655, |
|
"eval_samples_per_second": 7.98, |
|
"eval_steps_per_second": 2.075, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.5072951739618405, |
|
"grad_norm": 0.01611742191016674, |
|
"learning_rate": 2.8617990346277657e-06, |
|
"loss": 0.0057, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 4.516273849607183, |
|
"grad_norm": 0.020343905314803123, |
|
"learning_rate": 2.7577642372060673e-06, |
|
"loss": 0.0086, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 4.525252525252525, |
|
"grad_norm": 0.021957550197839737, |
|
"learning_rate": 2.6556019649465525e-06, |
|
"loss": 0.0103, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 4.534231200897867, |
|
"grad_norm": 0.017281070351600647, |
|
"learning_rate": 2.5553162672220465e-06, |
|
"loss": 0.008, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 4.54320987654321, |
|
"grad_norm": 0.015391329303383827, |
|
"learning_rate": 2.45691111902418e-06, |
|
"loss": 0.0061, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 4.54320987654321, |
|
"eval_loss": 0.012444855645298958, |
|
"eval_runtime": 6.2516, |
|
"eval_samples_per_second": 7.998, |
|
"eval_steps_per_second": 2.079, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 4.552188552188552, |
|
"grad_norm": 0.02002757415175438, |
|
"learning_rate": 2.360390420805869e-06, |
|
"loss": 0.0099, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 4.561167227833894, |
|
"grad_norm": 0.019125554710626602, |
|
"learning_rate": 2.2657579983267064e-06, |
|
"loss": 0.0059, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 4.570145903479236, |
|
"grad_norm": 0.017187196761369705, |
|
"learning_rate": 2.1730176025012816e-06, |
|
"loss": 0.0081, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 4.57912457912458, |
|
"grad_norm": 0.017918284982442856, |
|
"learning_rate": 2.082172909250568e-06, |
|
"loss": 0.0082, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 4.588103254769921, |
|
"grad_norm": 0.02295687235891819, |
|
"learning_rate": 1.993227519356189e-06, |
|
"loss": 0.0089, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 4.588103254769921, |
|
"eval_loss": 0.012445243075489998, |
|
"eval_runtime": 6.2516, |
|
"eval_samples_per_second": 7.998, |
|
"eval_steps_per_second": 2.079, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 4.597081930415264, |
|
"grad_norm": 0.017204539850354195, |
|
"learning_rate": 1.906184958317664e-06, |
|
"loss": 0.0069, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 4.606060606060606, |
|
"grad_norm": 0.019010348245501518, |
|
"learning_rate": 1.8210486762127499e-06, |
|
"loss": 0.0084, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 4.615039281705949, |
|
"grad_norm": 0.017709147185087204, |
|
"learning_rate": 1.737822047560611e-06, |
|
"loss": 0.0076, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 4.624017957351291, |
|
"grad_norm": 0.017709005624055862, |
|
"learning_rate": 1.656508371188109e-06, |
|
"loss": 0.0088, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 4.632996632996633, |
|
"grad_norm": 0.01932976208627224, |
|
"learning_rate": 1.5771108700990412e-06, |
|
"loss": 0.01, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 4.632996632996633, |
|
"eval_loss": 0.012414975091814995, |
|
"eval_runtime": 6.2511, |
|
"eval_samples_per_second": 7.999, |
|
"eval_steps_per_second": 2.08, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 4.6419753086419755, |
|
"grad_norm": 0.015371643006801605, |
|
"learning_rate": 1.4996326913463754e-06, |
|
"loss": 0.0058, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 4.650953984287318, |
|
"grad_norm": 0.02080700546503067, |
|
"learning_rate": 1.4240769059075342e-06, |
|
"loss": 0.0089, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 4.65993265993266, |
|
"grad_norm": 0.019819900393486023, |
|
"learning_rate": 1.3504465085626638e-06, |
|
"loss": 0.0059, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 4.668911335578002, |
|
"grad_norm": 0.023678896948695183, |
|
"learning_rate": 1.2787444177759068e-06, |
|
"loss": 0.0075, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 4.677890011223345, |
|
"grad_norm": 0.019294695928692818, |
|
"learning_rate": 1.208973475579761e-06, |
|
"loss": 0.0081, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 4.677890011223345, |
|
"eval_loss": 0.012423361651599407, |
|
"eval_runtime": 6.2709, |
|
"eval_samples_per_second": 7.973, |
|
"eval_steps_per_second": 2.073, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 4.686868686868687, |
|
"grad_norm": 0.019210556522011757, |
|
"learning_rate": 1.1411364474624264e-06, |
|
"loss": 0.007, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 4.695847362514029, |
|
"grad_norm": 0.020206844434142113, |
|
"learning_rate": 1.075236022258147e-06, |
|
"loss": 0.009, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 4.704826038159371, |
|
"grad_norm": 0.018495453521609306, |
|
"learning_rate": 1.0112748120406856e-06, |
|
"loss": 0.0092, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 4.713804713804714, |
|
"grad_norm": 0.015705464407801628, |
|
"learning_rate": 9.492553520197733e-07, |
|
"loss": 0.0072, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 4.722783389450056, |
|
"grad_norm": 0.020354999229311943, |
|
"learning_rate": 8.891801004406119e-07, |
|
"loss": 0.0072, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 4.722783389450056, |
|
"eval_loss": 0.01243152841925621, |
|
"eval_runtime": 6.2577, |
|
"eval_samples_per_second": 7.99, |
|
"eval_steps_per_second": 2.077, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 4.731762065095398, |
|
"grad_norm": 0.022362370043992996, |
|
"learning_rate": 8.31051438486441e-07, |
|
"loss": 0.0089, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 4.7407407407407405, |
|
"grad_norm": 0.022488482296466827, |
|
"learning_rate": 7.748716701841685e-07, |
|
"loss": 0.0097, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 4.749719416386083, |
|
"grad_norm": 0.017749127000570297, |
|
"learning_rate": 7.206430223130278e-07, |
|
"loss": 0.0054, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 4.758698092031425, |
|
"grad_norm": 0.023475971072912216, |
|
"learning_rate": 6.683676443163311e-07, |
|
"loss": 0.0114, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 4.767676767676767, |
|
"grad_norm": 0.020731423050165176, |
|
"learning_rate": 6.180476082162656e-07, |
|
"loss": 0.0078, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 4.767676767676767, |
|
"eval_loss": 0.012432167306542397, |
|
"eval_runtime": 6.2519, |
|
"eval_samples_per_second": 7.998, |
|
"eval_steps_per_second": 2.079, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 4.77665544332211, |
|
"grad_norm": 0.018397077918052673, |
|
"learning_rate": 5.696849085317646e-07, |
|
"loss": 0.0068, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 4.785634118967453, |
|
"grad_norm": 0.01895746774971485, |
|
"learning_rate": 5.232814621994598e-07, |
|
"loss": 0.0084, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 4.794612794612795, |
|
"grad_norm": 0.020780237391591072, |
|
"learning_rate": 4.788391084976862e-07, |
|
"loss": 0.0097, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 4.803591470258137, |
|
"grad_norm": 0.018094424158334732, |
|
"learning_rate": 4.363596089735911e-07, |
|
"loss": 0.0075, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 4.8125701459034795, |
|
"grad_norm": 0.02267824485898018, |
|
"learning_rate": 3.958446473733002e-07, |
|
"loss": 0.009, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 4.8125701459034795, |
|
"eval_loss": 0.012407775036990643, |
|
"eval_runtime": 6.269, |
|
"eval_samples_per_second": 7.976, |
|
"eval_steps_per_second": 2.074, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 4.821548821548822, |
|
"grad_norm": 0.016193151473999023, |
|
"learning_rate": 3.572958295752049e-07, |
|
"loss": 0.0066, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 4.830527497194164, |
|
"grad_norm": 0.016786446794867516, |
|
"learning_rate": 3.207146835262742e-07, |
|
"loss": 0.0063, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 4.839506172839506, |
|
"grad_norm": 0.019592830911278725, |
|
"learning_rate": 2.8610265918151414e-07, |
|
"loss": 0.0093, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 4.848484848484849, |
|
"grad_norm": 0.017766350880265236, |
|
"learning_rate": 2.534611284465083e-07, |
|
"loss": 0.0076, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 4.857463524130191, |
|
"grad_norm": 0.02071945182979107, |
|
"learning_rate": 2.2279138512300567e-07, |
|
"loss": 0.0106, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 4.857463524130191, |
|
"eval_loss": 0.01241106167435646, |
|
"eval_runtime": 6.2616, |
|
"eval_samples_per_second": 7.985, |
|
"eval_steps_per_second": 2.076, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 4.866442199775533, |
|
"grad_norm": 0.017140503972768784, |
|
"learning_rate": 1.940946448576675e-07, |
|
"loss": 0.0063, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 4.875420875420875, |
|
"grad_norm": 0.017304055392742157, |
|
"learning_rate": 1.6737204509387206e-07, |
|
"loss": 0.0075, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 4.884399551066218, |
|
"grad_norm": 0.018793294206261635, |
|
"learning_rate": 1.4262464502663443e-07, |
|
"loss": 0.0099, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 4.89337822671156, |
|
"grad_norm": 0.018352854996919632, |
|
"learning_rate": 1.1985342556060652e-07, |
|
"loss": 0.0074, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 4.902356902356902, |
|
"grad_norm": 0.018267234787344933, |
|
"learning_rate": 9.905928927123609e-08, |
|
"loss": 0.0079, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 4.902356902356902, |
|
"eval_loss": 0.012396564707159996, |
|
"eval_runtime": 6.2557, |
|
"eval_samples_per_second": 7.993, |
|
"eval_steps_per_second": 2.078, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 4.9113355780022445, |
|
"grad_norm": 0.02004612796008587, |
|
"learning_rate": 8.02430603689397e-08, |
|
"loss": 0.0079, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 4.920314253647587, |
|
"grad_norm": 0.017602894455194473, |
|
"learning_rate": 6.340548466648443e-08, |
|
"loss": 0.0084, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 4.929292929292929, |
|
"grad_norm": 0.017070379108190536, |
|
"learning_rate": 4.8547229549383844e-08, |
|
"loss": 0.0093, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 4.938271604938271, |
|
"grad_norm": 0.015886155888438225, |
|
"learning_rate": 3.566888394948009e-08, |
|
"loss": 0.0074, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 4.947250280583614, |
|
"grad_norm": 0.018648440018296242, |
|
"learning_rate": 2.4770958321568283e-08, |
|
"loss": 0.0082, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 4.947250280583614, |
|
"eval_loss": 0.01241134200245142, |
|
"eval_runtime": 6.2543, |
|
"eval_samples_per_second": 7.995, |
|
"eval_steps_per_second": 2.079, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 4.956228956228956, |
|
"grad_norm": 0.01602097600698471, |
|
"learning_rate": 1.5853884623195925e-08, |
|
"loss": 0.0063, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 4.965207631874298, |
|
"grad_norm": 0.02373385988175869, |
|
"learning_rate": 8.918016297515541e-09, |
|
"loss": 0.0098, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 4.974186307519641, |
|
"grad_norm": 0.021371588110923767, |
|
"learning_rate": 3.963628259290308e-09, |
|
"loss": 0.0088, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 4.983164983164983, |
|
"grad_norm": 0.017986396327614784, |
|
"learning_rate": 9.90916883986115e-10, |
|
"loss": 0.0077, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 4.992143658810326, |
|
"grad_norm": 0.023325249552726746, |
|
"learning_rate": 0.0, |
|
"loss": 0.0082, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 4.992143658810326, |
|
"eval_loss": 0.012434104457497597, |
|
"eval_runtime": 6.2582, |
|
"eval_samples_per_second": 7.989, |
|
"eval_steps_per_second": 2.077, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 4.992143658810326, |
|
"step": 555, |
|
"total_flos": 9.281622886148997e+17, |
|
"train_loss": 0.013788488566002868, |
|
"train_runtime": 7381.5879, |
|
"train_samples_per_second": 2.412, |
|
"train_steps_per_second": 0.075 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 555, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.281622886148997e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|