|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9991154356479433, |
|
"eval_steps": 142, |
|
"global_step": 1130, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 4.680586814880371, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3182, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 3.3362529277801514, |
|
"eval_runtime": 14.4477, |
|
"eval_samples_per_second": 33.016, |
|
"eval_steps_per_second": 8.306, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 4.609802722930908, |
|
"learning_rate": 2e-05, |
|
"loss": 3.2788, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 4.793943405151367, |
|
"learning_rate": 3e-05, |
|
"loss": 3.3432, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 4.687256336212158, |
|
"learning_rate": 4e-05, |
|
"loss": 3.2521, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 4.676945209503174, |
|
"learning_rate": 5e-05, |
|
"loss": 3.1085, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 4.490086078643799, |
|
"learning_rate": 6e-05, |
|
"loss": 2.8093, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 4.042544364929199, |
|
"learning_rate": 7e-05, |
|
"loss": 2.3501, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.4973549842834473, |
|
"learning_rate": 8e-05, |
|
"loss": 1.6118, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.9255049228668213, |
|
"learning_rate": 9e-05, |
|
"loss": 0.9938, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.9458708763122559, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4821, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.3317277431488037, |
|
"learning_rate": 9.999991309598974e-05, |
|
"loss": 0.3336, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.7918155193328857, |
|
"learning_rate": 9.999965238426104e-05, |
|
"loss": 0.1707, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.779201090335846, |
|
"learning_rate": 9.999921786572015e-05, |
|
"loss": 0.1089, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.8333582878112793, |
|
"learning_rate": 9.999860954187756e-05, |
|
"loss": 0.1829, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.7508969902992249, |
|
"learning_rate": 9.999782741484788e-05, |
|
"loss": 0.1284, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.6024438142776489, |
|
"learning_rate": 9.999687148734995e-05, |
|
"loss": 0.1321, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.2141278237104416, |
|
"learning_rate": 9.999574176270667e-05, |
|
"loss": 0.1294, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.8326414227485657, |
|
"learning_rate": 9.999443824484519e-05, |
|
"loss": 0.1414, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.25254812836647034, |
|
"learning_rate": 9.999296093829672e-05, |
|
"loss": 0.1389, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.24321849644184113, |
|
"learning_rate": 9.999130984819662e-05, |
|
"loss": 0.1354, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.27592459321022034, |
|
"learning_rate": 9.998948498028435e-05, |
|
"loss": 0.1032, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.5619893670082092, |
|
"learning_rate": 9.998748634090344e-05, |
|
"loss": 0.1264, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.7668437361717224, |
|
"learning_rate": 9.998531393700148e-05, |
|
"loss": 0.1223, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.38381776213645935, |
|
"learning_rate": 9.99829677761301e-05, |
|
"loss": 0.0988, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.8528074026107788, |
|
"learning_rate": 9.998044786644491e-05, |
|
"loss": 0.1421, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.61940336227417, |
|
"learning_rate": 9.997775421670556e-05, |
|
"loss": 0.2738, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.5642948150634766, |
|
"learning_rate": 9.997488683627559e-05, |
|
"loss": 0.1113, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.31713128089904785, |
|
"learning_rate": 9.997184573512245e-05, |
|
"loss": 0.0593, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.540770411491394, |
|
"learning_rate": 9.996863092381752e-05, |
|
"loss": 0.2008, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.5343081951141357, |
|
"learning_rate": 9.9965242413536e-05, |
|
"loss": 0.1141, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.3453178107738495, |
|
"learning_rate": 9.99616802160569e-05, |
|
"loss": 0.1137, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.24307988584041595, |
|
"learning_rate": 9.995794434376297e-05, |
|
"loss": 0.0971, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.1228247806429863, |
|
"learning_rate": 9.995403480964072e-05, |
|
"loss": 0.1246, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.28919702768325806, |
|
"learning_rate": 9.994995162728029e-05, |
|
"loss": 0.1485, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.16266010701656342, |
|
"learning_rate": 9.994569481087552e-05, |
|
"loss": 0.1196, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.13942277431488037, |
|
"learning_rate": 9.994126437522375e-05, |
|
"loss": 0.1266, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.22137008607387543, |
|
"learning_rate": 9.99366603357259e-05, |
|
"loss": 0.1226, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.4418635070323944, |
|
"learning_rate": 9.993188270838635e-05, |
|
"loss": 0.1577, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.1568412482738495, |
|
"learning_rate": 9.992693150981292e-05, |
|
"loss": 0.1205, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.33117881417274475, |
|
"learning_rate": 9.992180675721672e-05, |
|
"loss": 0.1179, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.18135391175746918, |
|
"learning_rate": 9.991650846841226e-05, |
|
"loss": 0.098, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.23688584566116333, |
|
"learning_rate": 9.99110366618172e-05, |
|
"loss": 0.0839, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.40056225657463074, |
|
"learning_rate": 9.990539135645245e-05, |
|
"loss": 0.1648, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.16901437938213348, |
|
"learning_rate": 9.9899572571942e-05, |
|
"loss": 0.052, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.6529514193534851, |
|
"learning_rate": 9.989358032851284e-05, |
|
"loss": 0.1448, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.25050362944602966, |
|
"learning_rate": 9.9887414646995e-05, |
|
"loss": 0.083, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.38230955600738525, |
|
"learning_rate": 9.988107554882138e-05, |
|
"loss": 0.0912, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.24738825857639313, |
|
"learning_rate": 9.987456305602769e-05, |
|
"loss": 0.1337, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.22692906856536865, |
|
"learning_rate": 9.986787719125241e-05, |
|
"loss": 0.0924, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.6348592638969421, |
|
"learning_rate": 9.986101797773667e-05, |
|
"loss": 0.1401, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.27844250202178955, |
|
"learning_rate": 9.985398543932421e-05, |
|
"loss": 0.1028, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.4353552460670471, |
|
"learning_rate": 9.984677960046123e-05, |
|
"loss": 0.1245, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.3049747943878174, |
|
"learning_rate": 9.98394004861964e-05, |
|
"loss": 0.0936, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.34531188011169434, |
|
"learning_rate": 9.983184812218072e-05, |
|
"loss": 0.0775, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.252056360244751, |
|
"learning_rate": 9.98241225346674e-05, |
|
"loss": 0.1082, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.14545601606369019, |
|
"learning_rate": 9.981622375051183e-05, |
|
"loss": 0.0857, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.313376784324646, |
|
"learning_rate": 9.980815179717145e-05, |
|
"loss": 0.0798, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.3164367079734802, |
|
"learning_rate": 9.979990670270564e-05, |
|
"loss": 0.1103, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.30412447452545166, |
|
"learning_rate": 9.979148849577572e-05, |
|
"loss": 0.0889, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.2582318186759949, |
|
"learning_rate": 9.978289720564471e-05, |
|
"loss": 0.0844, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.2912735939025879, |
|
"learning_rate": 9.977413286217728e-05, |
|
"loss": 0.077, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.3888057768344879, |
|
"learning_rate": 9.976519549583974e-05, |
|
"loss": 0.1386, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.42121654748916626, |
|
"learning_rate": 9.975608513769976e-05, |
|
"loss": 0.0823, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.4052259624004364, |
|
"learning_rate": 9.974680181942645e-05, |
|
"loss": 0.0846, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.2273682802915573, |
|
"learning_rate": 9.973734557329009e-05, |
|
"loss": 0.0589, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5051669478416443, |
|
"learning_rate": 9.972771643216212e-05, |
|
"loss": 0.1111, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.2672370970249176, |
|
"learning_rate": 9.971791442951497e-05, |
|
"loss": 0.0819, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.28557881712913513, |
|
"learning_rate": 9.970793959942198e-05, |
|
"loss": 0.0912, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.34148702025413513, |
|
"learning_rate": 9.969779197655726e-05, |
|
"loss": 0.1036, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.3550778925418854, |
|
"learning_rate": 9.968747159619556e-05, |
|
"loss": 0.0833, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.3434258699417114, |
|
"learning_rate": 9.967697849421221e-05, |
|
"loss": 0.1186, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.23545867204666138, |
|
"learning_rate": 9.966631270708287e-05, |
|
"loss": 0.1185, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.28094542026519775, |
|
"learning_rate": 9.965547427188357e-05, |
|
"loss": 0.052, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.13517600297927856, |
|
"learning_rate": 9.964446322629043e-05, |
|
"loss": 0.0695, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.16696467995643616, |
|
"learning_rate": 9.963327960857962e-05, |
|
"loss": 0.1003, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.18569788336753845, |
|
"learning_rate": 9.962192345762717e-05, |
|
"loss": 0.0495, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.19817449152469635, |
|
"learning_rate": 9.961039481290888e-05, |
|
"loss": 0.067, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.14672966301441193, |
|
"learning_rate": 9.959869371450021e-05, |
|
"loss": 0.0737, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.302121639251709, |
|
"learning_rate": 9.958682020307601e-05, |
|
"loss": 0.0779, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.35970303416252136, |
|
"learning_rate": 9.957477431991054e-05, |
|
"loss": 0.134, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.308292955160141, |
|
"learning_rate": 9.956255610687719e-05, |
|
"loss": 0.1006, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.27124735713005066, |
|
"learning_rate": 9.955016560644847e-05, |
|
"loss": 0.0572, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.1820615977048874, |
|
"learning_rate": 9.953760286169571e-05, |
|
"loss": 0.0595, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.36385104060173035, |
|
"learning_rate": 9.952486791628905e-05, |
|
"loss": 0.0874, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.848340630531311, |
|
"learning_rate": 9.95119608144972e-05, |
|
"loss": 0.1178, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.7947489023208618, |
|
"learning_rate": 9.94988816011873e-05, |
|
"loss": 0.1115, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.33932074904441833, |
|
"learning_rate": 9.94856303218248e-05, |
|
"loss": 0.0546, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.26873940229415894, |
|
"learning_rate": 9.947220702247329e-05, |
|
"loss": 0.0873, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.3373044431209564, |
|
"learning_rate": 9.945861174979429e-05, |
|
"loss": 0.1051, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.24391719698905945, |
|
"learning_rate": 9.944484455104717e-05, |
|
"loss": 0.0986, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.4301680326461792, |
|
"learning_rate": 9.943090547408888e-05, |
|
"loss": 0.1524, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.4246341288089752, |
|
"learning_rate": 9.941679456737394e-05, |
|
"loss": 0.1619, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.2680893838405609, |
|
"learning_rate": 9.940251187995411e-05, |
|
"loss": 0.1187, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.18920297920703888, |
|
"learning_rate": 9.938805746147827e-05, |
|
"loss": 0.105, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.22168701887130737, |
|
"learning_rate": 9.937343136219233e-05, |
|
"loss": 0.0856, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.2235199213027954, |
|
"learning_rate": 9.935863363293896e-05, |
|
"loss": 0.1026, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.16578496992588043, |
|
"learning_rate": 9.93436643251574e-05, |
|
"loss": 0.0777, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.15994016826152802, |
|
"learning_rate": 9.932852349088342e-05, |
|
"loss": 0.0957, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.18692170083522797, |
|
"learning_rate": 9.931321118274897e-05, |
|
"loss": 0.0913, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.15477485954761505, |
|
"learning_rate": 9.929772745398206e-05, |
|
"loss": 0.0911, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.28473320603370667, |
|
"learning_rate": 9.928207235840664e-05, |
|
"loss": 0.1283, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.38557159900665283, |
|
"learning_rate": 9.926624595044234e-05, |
|
"loss": 0.1125, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.13523289561271667, |
|
"learning_rate": 9.925024828510427e-05, |
|
"loss": 0.0555, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.15305563807487488, |
|
"learning_rate": 9.923407941800291e-05, |
|
"loss": 0.1003, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.21130621433258057, |
|
"learning_rate": 9.921773940534382e-05, |
|
"loss": 0.0945, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.2301904559135437, |
|
"learning_rate": 9.920122830392748e-05, |
|
"loss": 0.1019, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.16425654292106628, |
|
"learning_rate": 9.918454617114918e-05, |
|
"loss": 0.0781, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.2672991454601288, |
|
"learning_rate": 9.916769306499866e-05, |
|
"loss": 0.085, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.3746218681335449, |
|
"learning_rate": 9.915066904406e-05, |
|
"loss": 0.1698, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.1691233068704605, |
|
"learning_rate": 9.913347416751148e-05, |
|
"loss": 0.046, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.3089153468608856, |
|
"learning_rate": 9.91161084951252e-05, |
|
"loss": 0.131, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.1581045240163803, |
|
"learning_rate": 9.909857208726705e-05, |
|
"loss": 0.0654, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.2545772194862366, |
|
"learning_rate": 9.908086500489637e-05, |
|
"loss": 0.1021, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.2257249355316162, |
|
"learning_rate": 9.906298730956586e-05, |
|
"loss": 0.0636, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.17862719297409058, |
|
"learning_rate": 9.904493906342123e-05, |
|
"loss": 0.0942, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.23423053324222565, |
|
"learning_rate": 9.902672032920106e-05, |
|
"loss": 0.0676, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.26653358340263367, |
|
"learning_rate": 9.900833117023664e-05, |
|
"loss": 0.0918, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.4517073631286621, |
|
"learning_rate": 9.89897716504516e-05, |
|
"loss": 0.1102, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.20187437534332275, |
|
"learning_rate": 9.897104183436183e-05, |
|
"loss": 0.0713, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5759711861610413, |
|
"learning_rate": 9.895214178707516e-05, |
|
"loss": 0.0837, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.43704915046691895, |
|
"learning_rate": 9.89330715742912e-05, |
|
"loss": 0.0868, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.30784374475479126, |
|
"learning_rate": 9.891383126230104e-05, |
|
"loss": 0.1171, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.23538921773433685, |
|
"learning_rate": 9.889442091798712e-05, |
|
"loss": 0.055, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.27727362513542175, |
|
"learning_rate": 9.887484060882291e-05, |
|
"loss": 0.041, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.21666617691516876, |
|
"learning_rate": 9.885509040287268e-05, |
|
"loss": 0.0624, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.2829636335372925, |
|
"learning_rate": 9.883517036879132e-05, |
|
"loss": 0.0946, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.34035512804985046, |
|
"learning_rate": 9.88150805758241e-05, |
|
"loss": 0.0635, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.44064444303512573, |
|
"learning_rate": 9.879482109380634e-05, |
|
"loss": 0.0931, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5164741277694702, |
|
"learning_rate": 9.877439199316323e-05, |
|
"loss": 0.0891, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5549228191375732, |
|
"learning_rate": 9.875379334490962e-05, |
|
"loss": 0.1144, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.41133901476860046, |
|
"learning_rate": 9.873302522064972e-05, |
|
"loss": 0.1022, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5204330682754517, |
|
"learning_rate": 9.871208769257685e-05, |
|
"loss": 0.0867, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.3383274972438812, |
|
"learning_rate": 9.869098083347323e-05, |
|
"loss": 0.0558, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.7084139585494995, |
|
"learning_rate": 9.866970471670967e-05, |
|
"loss": 0.1208, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.2974587380886078, |
|
"learning_rate": 9.864825941624537e-05, |
|
"loss": 0.1199, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.1811504065990448, |
|
"learning_rate": 9.862664500662764e-05, |
|
"loss": 0.1025, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.23748300969600677, |
|
"learning_rate": 9.860486156299164e-05, |
|
"loss": 0.0864, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.21784676611423492, |
|
"learning_rate": 9.85829091610601e-05, |
|
"loss": 0.095, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.26979225873947144, |
|
"learning_rate": 9.856078787714309e-05, |
|
"loss": 0.0864, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.1479984074831009, |
|
"learning_rate": 9.853849778813777e-05, |
|
"loss": 0.0904, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.17924343049526215, |
|
"learning_rate": 9.851603897152803e-05, |
|
"loss": 0.0752, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.16448016464710236, |
|
"learning_rate": 9.849341150538434e-05, |
|
"loss": 0.0781, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 0.0848281979560852, |
|
"eval_runtime": 14.6961, |
|
"eval_samples_per_second": 32.458, |
|
"eval_steps_per_second": 8.165, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.14405055344104767, |
|
"learning_rate": 9.847061546836339e-05, |
|
"loss": 0.1007, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.20907168090343475, |
|
"learning_rate": 9.844765093970787e-05, |
|
"loss": 0.1126, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.1777975857257843, |
|
"learning_rate": 9.842451799924616e-05, |
|
"loss": 0.0928, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.1817995309829712, |
|
"learning_rate": 9.840121672739208e-05, |
|
"loss": 0.046, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.2099136859178543, |
|
"learning_rate": 9.837774720514457e-05, |
|
"loss": 0.1032, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.19467169046401978, |
|
"learning_rate": 9.835410951408748e-05, |
|
"loss": 0.0913, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.24700500071048737, |
|
"learning_rate": 9.833030373638919e-05, |
|
"loss": 0.1101, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.22854459285736084, |
|
"learning_rate": 9.830632995480242e-05, |
|
"loss": 0.0729, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.206742063164711, |
|
"learning_rate": 9.828218825266388e-05, |
|
"loss": 0.0861, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.40378740429878235, |
|
"learning_rate": 9.8257878713894e-05, |
|
"loss": 0.0948, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5055291652679443, |
|
"learning_rate": 9.823340142299662e-05, |
|
"loss": 0.193, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.3036790192127228, |
|
"learning_rate": 9.820875646505874e-05, |
|
"loss": 0.0859, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.1878231018781662, |
|
"learning_rate": 9.818394392575019e-05, |
|
"loss": 0.0702, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.17990007996559143, |
|
"learning_rate": 9.815896389132333e-05, |
|
"loss": 0.0967, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.22680750489234924, |
|
"learning_rate": 9.813381644861277e-05, |
|
"loss": 0.0959, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.299663782119751, |
|
"learning_rate": 9.810850168503506e-05, |
|
"loss": 0.0801, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.3132835924625397, |
|
"learning_rate": 9.808301968858837e-05, |
|
"loss": 0.1151, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.16891297698020935, |
|
"learning_rate": 9.805737054785222e-05, |
|
"loss": 0.0799, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.23542983829975128, |
|
"learning_rate": 9.803155435198712e-05, |
|
"loss": 0.0645, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.1784803569316864, |
|
"learning_rate": 9.800557119073433e-05, |
|
"loss": 0.0475, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.37001606822013855, |
|
"learning_rate": 9.797942115441545e-05, |
|
"loss": 0.1331, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.18926851451396942, |
|
"learning_rate": 9.795310433393226e-05, |
|
"loss": 0.0744, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.15572589635849, |
|
"learning_rate": 9.792662082076618e-05, |
|
"loss": 0.0551, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.2562514841556549, |
|
"learning_rate": 9.789997070697821e-05, |
|
"loss": 0.106, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.4756919741630554, |
|
"learning_rate": 9.787315408520838e-05, |
|
"loss": 0.1229, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.2322833091020584, |
|
"learning_rate": 9.78461710486756e-05, |
|
"loss": 0.1212, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.28180065751075745, |
|
"learning_rate": 9.78190216911772e-05, |
|
"loss": 0.0855, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.2058788239955902, |
|
"learning_rate": 9.779170610708872e-05, |
|
"loss": 0.0445, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.16885802149772644, |
|
"learning_rate": 9.776422439136352e-05, |
|
"loss": 0.0619, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.2562006413936615, |
|
"learning_rate": 9.773657663953243e-05, |
|
"loss": 0.1111, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.2394249439239502, |
|
"learning_rate": 9.770876294770349e-05, |
|
"loss": 0.0562, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.214800164103508, |
|
"learning_rate": 9.768078341256155e-05, |
|
"loss": 0.0428, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.30056923627853394, |
|
"learning_rate": 9.765263813136796e-05, |
|
"loss": 0.1173, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.3174525499343872, |
|
"learning_rate": 9.762432720196024e-05, |
|
"loss": 0.0871, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.31560906767845154, |
|
"learning_rate": 9.75958507227517e-05, |
|
"loss": 0.1133, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.21752357482910156, |
|
"learning_rate": 9.756720879273117e-05, |
|
"loss": 0.0421, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.3215218484401703, |
|
"learning_rate": 9.753840151146259e-05, |
|
"loss": 0.0596, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.3161137104034424, |
|
"learning_rate": 9.750942897908468e-05, |
|
"loss": 0.122, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.4206744134426117, |
|
"learning_rate": 9.748029129631062e-05, |
|
"loss": 0.0966, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.28242579102516174, |
|
"learning_rate": 9.745098856442768e-05, |
|
"loss": 0.0853, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.24647079408168793, |
|
"learning_rate": 9.742152088529684e-05, |
|
"loss": 0.1077, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.29940977692604065, |
|
"learning_rate": 9.739188836135247e-05, |
|
"loss": 0.0837, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.21811984479427338, |
|
"learning_rate": 9.7362091095602e-05, |
|
"loss": 0.1, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.24434742331504822, |
|
"learning_rate": 9.733212919162549e-05, |
|
"loss": 0.0839, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.1656690537929535, |
|
"learning_rate": 9.730200275357535e-05, |
|
"loss": 0.0894, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.16984042525291443, |
|
"learning_rate": 9.727171188617587e-05, |
|
"loss": 0.0732, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.19889003038406372, |
|
"learning_rate": 9.7241256694723e-05, |
|
"loss": 0.0832, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.1639273464679718, |
|
"learning_rate": 9.721063728508383e-05, |
|
"loss": 0.0912, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.26211172342300415, |
|
"learning_rate": 9.717985376369639e-05, |
|
"loss": 0.0986, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.12403538823127747, |
|
"learning_rate": 9.714890623756912e-05, |
|
"loss": 0.0844, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.1882586032152176, |
|
"learning_rate": 9.711779481428057e-05, |
|
"loss": 0.1163, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.1780715435743332, |
|
"learning_rate": 9.708651960197904e-05, |
|
"loss": 0.1038, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.1291002333164215, |
|
"learning_rate": 9.705508070938218e-05, |
|
"loss": 0.0746, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.2384466975927353, |
|
"learning_rate": 9.702347824577666e-05, |
|
"loss": 0.0909, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.25463247299194336, |
|
"learning_rate": 9.699171232101768e-05, |
|
"loss": 0.0977, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.19303986430168152, |
|
"learning_rate": 9.69597830455287e-05, |
|
"loss": 0.1137, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.21899022161960602, |
|
"learning_rate": 9.692769053030099e-05, |
|
"loss": 0.0671, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.16923530399799347, |
|
"learning_rate": 9.689543488689332e-05, |
|
"loss": 0.0776, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.260955810546875, |
|
"learning_rate": 9.686301622743144e-05, |
|
"loss": 0.092, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.1771455854177475, |
|
"learning_rate": 9.683043466460782e-05, |
|
"loss": 0.06, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.18851810693740845, |
|
"learning_rate": 9.67976903116812e-05, |
|
"loss": 0.0965, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.589522659778595, |
|
"learning_rate": 9.676478328247622e-05, |
|
"loss": 0.1673, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.21747058629989624, |
|
"learning_rate": 9.673171369138296e-05, |
|
"loss": 0.0997, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.4168107807636261, |
|
"learning_rate": 9.669848165335666e-05, |
|
"loss": 0.0795, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6520416140556335, |
|
"learning_rate": 9.666508728391719e-05, |
|
"loss": 0.1177, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.3752453029155731, |
|
"learning_rate": 9.663153069914875e-05, |
|
"loss": 0.0871, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.26546868681907654, |
|
"learning_rate": 9.65978120156994e-05, |
|
"loss": 0.0647, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.20044176280498505, |
|
"learning_rate": 9.656393135078068e-05, |
|
"loss": 0.1072, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.25033503770828247, |
|
"learning_rate": 9.652988882216724e-05, |
|
"loss": 0.1326, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.24569682776927948, |
|
"learning_rate": 9.649568454819637e-05, |
|
"loss": 0.0931, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5623157620429993, |
|
"learning_rate": 9.64613186477676e-05, |
|
"loss": 0.2157, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.15052182972431183, |
|
"learning_rate": 9.642679124034233e-05, |
|
"loss": 0.1236, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.14209671318531036, |
|
"learning_rate": 9.639210244594334e-05, |
|
"loss": 0.0971, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.1627768725156784, |
|
"learning_rate": 9.635725238515445e-05, |
|
"loss": 0.1161, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.10190293937921524, |
|
"learning_rate": 9.63222411791201e-05, |
|
"loss": 0.0999, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.11575043201446533, |
|
"learning_rate": 9.62870689495448e-05, |
|
"loss": 0.0986, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.09842410683631897, |
|
"learning_rate": 9.62517358186929e-05, |
|
"loss": 0.1176, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.14816004037857056, |
|
"learning_rate": 9.621624190938803e-05, |
|
"loss": 0.0833, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.11311839520931244, |
|
"learning_rate": 9.618058734501269e-05, |
|
"loss": 0.0815, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.17481163144111633, |
|
"learning_rate": 9.614477224950789e-05, |
|
"loss": 0.0678, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.2277013659477234, |
|
"learning_rate": 9.610879674737264e-05, |
|
"loss": 0.0941, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.14689870178699493, |
|
"learning_rate": 9.607266096366352e-05, |
|
"loss": 0.0991, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.24558769166469574, |
|
"learning_rate": 9.603636502399436e-05, |
|
"loss": 0.0878, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.1541660875082016, |
|
"learning_rate": 9.599990905453567e-05, |
|
"loss": 0.0784, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.12188339233398438, |
|
"learning_rate": 9.59632931820142e-05, |
|
"loss": 0.0464, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.32710394263267517, |
|
"learning_rate": 9.592651753371265e-05, |
|
"loss": 0.0541, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.3118465840816498, |
|
"learning_rate": 9.588958223746903e-05, |
|
"loss": 0.0845, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.26805219054222107, |
|
"learning_rate": 9.585248742167639e-05, |
|
"loss": 0.0485, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.7972936630249023, |
|
"learning_rate": 9.581523321528223e-05, |
|
"loss": 0.1013, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.6285438537597656, |
|
"learning_rate": 9.577781974778817e-05, |
|
"loss": 0.0767, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.6384493708610535, |
|
"learning_rate": 9.57402471492494e-05, |
|
"loss": 0.1855, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5759001970291138, |
|
"learning_rate": 9.570251555027432e-05, |
|
"loss": 0.1585, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.42002353072166443, |
|
"learning_rate": 9.566462508202402e-05, |
|
"loss": 0.1479, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.18405884504318237, |
|
"learning_rate": 9.562657587621184e-05, |
|
"loss": 0.09, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.20893922448158264, |
|
"learning_rate": 9.558836806510291e-05, |
|
"loss": 0.0685, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.31388092041015625, |
|
"learning_rate": 9.555000178151374e-05, |
|
"loss": 0.0983, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.20344533026218414, |
|
"learning_rate": 9.551147715881166e-05, |
|
"loss": 0.0944, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.1582648903131485, |
|
"learning_rate": 9.547279433091446e-05, |
|
"loss": 0.0662, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.16737405955791473, |
|
"learning_rate": 9.543395343228983e-05, |
|
"loss": 0.1565, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.21974924206733704, |
|
"learning_rate": 9.539495459795499e-05, |
|
"loss": 0.1243, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.1147058829665184, |
|
"learning_rate": 9.535579796347612e-05, |
|
"loss": 0.0727, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.13460345566272736, |
|
"learning_rate": 9.531648366496799e-05, |
|
"loss": 0.0691, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.1404263824224472, |
|
"learning_rate": 9.527701183909336e-05, |
|
"loss": 0.0975, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.17380090057849884, |
|
"learning_rate": 9.523738262306269e-05, |
|
"loss": 0.0873, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.13862797617912292, |
|
"learning_rate": 9.519759615463346e-05, |
|
"loss": 0.0738, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.17551685869693756, |
|
"learning_rate": 9.51576525721098e-05, |
|
"loss": 0.0676, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.20715269446372986, |
|
"learning_rate": 9.511755201434205e-05, |
|
"loss": 0.0737, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.14763356745243073, |
|
"learning_rate": 9.507729462072614e-05, |
|
"loss": 0.07, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.317452073097229, |
|
"learning_rate": 9.503688053120327e-05, |
|
"loss": 0.1252, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.21908459067344666, |
|
"learning_rate": 9.499630988625925e-05, |
|
"loss": 0.0877, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.3233601450920105, |
|
"learning_rate": 9.49555828269242e-05, |
|
"loss": 0.0891, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.4098372161388397, |
|
"learning_rate": 9.491469949477187e-05, |
|
"loss": 0.0805, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.40573808550834656, |
|
"learning_rate": 9.487366003191931e-05, |
|
"loss": 0.1284, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.3391616940498352, |
|
"learning_rate": 9.483246458102625e-05, |
|
"loss": 0.0901, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.1822938323020935, |
|
"learning_rate": 9.479111328529473e-05, |
|
"loss": 0.0398, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.4700302183628082, |
|
"learning_rate": 9.474960628846843e-05, |
|
"loss": 0.1509, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.20210890471935272, |
|
"learning_rate": 9.470794373483236e-05, |
|
"loss": 0.0765, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.28329914808273315, |
|
"learning_rate": 9.466612576921223e-05, |
|
"loss": 0.0666, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.41083166003227234, |
|
"learning_rate": 9.462415253697401e-05, |
|
"loss": 0.1248, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.17644570767879486, |
|
"learning_rate": 9.458202418402338e-05, |
|
"loss": 0.0532, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.2667219936847687, |
|
"learning_rate": 9.453974085680526e-05, |
|
"loss": 0.0937, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.20900332927703857, |
|
"learning_rate": 9.449730270230326e-05, |
|
"loss": 0.0853, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.26425743103027344, |
|
"learning_rate": 9.445470986803922e-05, |
|
"loss": 0.12, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.1956167221069336, |
|
"learning_rate": 9.441196250207267e-05, |
|
"loss": 0.0965, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.21896903216838837, |
|
"learning_rate": 9.436906075300032e-05, |
|
"loss": 0.0867, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.2082919031381607, |
|
"learning_rate": 9.432600476995551e-05, |
|
"loss": 0.0847, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.1674569696187973, |
|
"learning_rate": 9.428279470260776e-05, |
|
"loss": 0.0846, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.23109744489192963, |
|
"learning_rate": 9.423943070116218e-05, |
|
"loss": 0.136, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.21344415843486786, |
|
"learning_rate": 9.4195912916359e-05, |
|
"loss": 0.1091, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.16391590237617493, |
|
"learning_rate": 9.415224149947306e-05, |
|
"loss": 0.0901, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.2023243010044098, |
|
"learning_rate": 9.410841660231315e-05, |
|
"loss": 0.0635, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.1723608821630478, |
|
"learning_rate": 9.406443837722168e-05, |
|
"loss": 0.1001, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.1470147669315338, |
|
"learning_rate": 9.402030697707398e-05, |
|
"loss": 0.0721, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.15082985162734985, |
|
"learning_rate": 9.397602255527791e-05, |
|
"loss": 0.0698, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.16322006285190582, |
|
"learning_rate": 9.393158526577323e-05, |
|
"loss": 0.0809, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.10098633915185928, |
|
"learning_rate": 9.388699526303105e-05, |
|
"loss": 0.0386, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.19049708545207977, |
|
"learning_rate": 9.38422527020534e-05, |
|
"loss": 0.0559, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.22742775082588196, |
|
"learning_rate": 9.37973577383726e-05, |
|
"loss": 0.0802, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.2055177539587021, |
|
"learning_rate": 9.375231052805072e-05, |
|
"loss": 0.1048, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.1366245150566101, |
|
"learning_rate": 9.370711122767913e-05, |
|
"loss": 0.0204, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.3235447406768799, |
|
"learning_rate": 9.36617599943778e-05, |
|
"loss": 0.0974, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.09579204767942429, |
|
"learning_rate": 9.361625698579493e-05, |
|
"loss": 0.0151, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 0.07987037301063538, |
|
"eval_runtime": 14.6437, |
|
"eval_samples_per_second": 32.574, |
|
"eval_steps_per_second": 8.195, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.22850771248340607, |
|
"learning_rate": 9.357060236010625e-05, |
|
"loss": 0.0458, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.4980478882789612, |
|
"learning_rate": 9.352479627601457e-05, |
|
"loss": 0.1306, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.18770304322242737, |
|
"learning_rate": 9.347883889274923e-05, |
|
"loss": 0.0218, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.6386083364486694, |
|
"learning_rate": 9.34327303700654e-05, |
|
"loss": 0.0912, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.4997164309024811, |
|
"learning_rate": 9.338647086824372e-05, |
|
"loss": 0.1083, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.31682559847831726, |
|
"learning_rate": 9.334006054808966e-05, |
|
"loss": 0.0947, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.28325051069259644, |
|
"learning_rate": 9.329349957093292e-05, |
|
"loss": 0.0794, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.5778185725212097, |
|
"learning_rate": 9.324678809862695e-05, |
|
"loss": 0.1223, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.2953624725341797, |
|
"learning_rate": 9.319992629354828e-05, |
|
"loss": 0.0747, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.28283196687698364, |
|
"learning_rate": 9.31529143185961e-05, |
|
"loss": 0.1099, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.36138102412223816, |
|
"learning_rate": 9.310575233719154e-05, |
|
"loss": 0.1303, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.16202205419540405, |
|
"learning_rate": 9.305844051327725e-05, |
|
"loss": 0.0805, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.11523901671171188, |
|
"learning_rate": 9.30109790113167e-05, |
|
"loss": 0.0775, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.10756238549947739, |
|
"learning_rate": 9.296336799629369e-05, |
|
"loss": 0.0795, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.18142195045948029, |
|
"learning_rate": 9.291560763371173e-05, |
|
"loss": 0.0833, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.14596430957317352, |
|
"learning_rate": 9.28676980895935e-05, |
|
"loss": 0.0904, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.10054739564657211, |
|
"learning_rate": 9.28196395304803e-05, |
|
"loss": 0.0898, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.24579764902591705, |
|
"learning_rate": 9.277143212343134e-05, |
|
"loss": 0.145, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.13506978750228882, |
|
"learning_rate": 9.272307603602334e-05, |
|
"loss": 0.0847, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.17480792105197906, |
|
"learning_rate": 9.267457143634979e-05, |
|
"loss": 0.125, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.31458401679992676, |
|
"learning_rate": 9.262591849302048e-05, |
|
"loss": 0.1047, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.17494355142116547, |
|
"learning_rate": 9.257711737516082e-05, |
|
"loss": 0.0576, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.2996468245983124, |
|
"learning_rate": 9.252816825241134e-05, |
|
"loss": 0.1012, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.19122976064682007, |
|
"learning_rate": 9.247907129492707e-05, |
|
"loss": 0.0878, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.16079925000667572, |
|
"learning_rate": 9.242982667337685e-05, |
|
"loss": 0.0778, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.2628028392791748, |
|
"learning_rate": 9.238043455894293e-05, |
|
"loss": 0.0987, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.318097859621048, |
|
"learning_rate": 9.23308951233202e-05, |
|
"loss": 0.1108, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.2207389920949936, |
|
"learning_rate": 9.228120853871571e-05, |
|
"loss": 0.0826, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.34375905990600586, |
|
"learning_rate": 9.223137497784797e-05, |
|
"loss": 0.1174, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.16714760661125183, |
|
"learning_rate": 9.218139461394644e-05, |
|
"loss": 0.0883, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.24213539063930511, |
|
"learning_rate": 9.213126762075088e-05, |
|
"loss": 0.0686, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.2654499113559723, |
|
"learning_rate": 9.208099417251077e-05, |
|
"loss": 0.1185, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.1353083997964859, |
|
"learning_rate": 9.203057444398469e-05, |
|
"loss": 0.0806, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.30304938554763794, |
|
"learning_rate": 9.198000861043967e-05, |
|
"loss": 0.0817, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.14495517313480377, |
|
"learning_rate": 9.192929684765067e-05, |
|
"loss": 0.0436, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.2180556207895279, |
|
"learning_rate": 9.187843933189995e-05, |
|
"loss": 0.1255, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.3051697611808777, |
|
"learning_rate": 9.182743623997634e-05, |
|
"loss": 0.1241, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.42936787009239197, |
|
"learning_rate": 9.17762877491748e-05, |
|
"loss": 0.1847, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.20895107090473175, |
|
"learning_rate": 9.172499403729566e-05, |
|
"loss": 0.0939, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.15273532271385193, |
|
"learning_rate": 9.167355528264414e-05, |
|
"loss": 0.1012, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.15428248047828674, |
|
"learning_rate": 9.162197166402956e-05, |
|
"loss": 0.061, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.13089029490947723, |
|
"learning_rate": 9.157024336076487e-05, |
|
"loss": 0.089, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.12000248581171036, |
|
"learning_rate": 9.151837055266594e-05, |
|
"loss": 0.0813, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.12965545058250427, |
|
"learning_rate": 9.146635342005099e-05, |
|
"loss": 0.113, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.12225235253572464, |
|
"learning_rate": 9.14141921437399e-05, |
|
"loss": 0.0968, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.11869696527719498, |
|
"learning_rate": 9.136188690505363e-05, |
|
"loss": 0.0752, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.22600843012332916, |
|
"learning_rate": 9.130943788581359e-05, |
|
"loss": 0.1049, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.13381795585155487, |
|
"learning_rate": 9.125684526834099e-05, |
|
"loss": 0.0917, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.12936879694461823, |
|
"learning_rate": 9.120410923545619e-05, |
|
"loss": 0.0782, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.14804388582706451, |
|
"learning_rate": 9.115122997047811e-05, |
|
"loss": 0.0959, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.18504676222801208, |
|
"learning_rate": 9.109820765722357e-05, |
|
"loss": 0.1126, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.1957363486289978, |
|
"learning_rate": 9.10450424800066e-05, |
|
"loss": 0.101, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.15677915513515472, |
|
"learning_rate": 9.099173462363792e-05, |
|
"loss": 0.0775, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.124906025826931, |
|
"learning_rate": 9.093828427342418e-05, |
|
"loss": 0.07, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.12451624125242233, |
|
"learning_rate": 9.088469161516735e-05, |
|
"loss": 0.0588, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.509678304195404, |
|
"learning_rate": 9.083095683516414e-05, |
|
"loss": 0.1563, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.2245551496744156, |
|
"learning_rate": 9.077708012020524e-05, |
|
"loss": 0.1029, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.2735763192176819, |
|
"learning_rate": 9.072306165757476e-05, |
|
"loss": 0.0958, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.2062731385231018, |
|
"learning_rate": 9.066890163504955e-05, |
|
"loss": 0.0638, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.1664024293422699, |
|
"learning_rate": 9.061460024089853e-05, |
|
"loss": 0.0555, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.15788845717906952, |
|
"learning_rate": 9.056015766388205e-05, |
|
"loss": 0.0509, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.185616135597229, |
|
"learning_rate": 9.050557409325125e-05, |
|
"loss": 0.1196, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.24650661647319794, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 0.0723, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.23959776759147644, |
|
"learning_rate": 9.039598473060113e-05, |
|
"loss": 0.1139, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.24370582401752472, |
|
"learning_rate": 9.034097931953201e-05, |
|
"loss": 0.0559, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.11590461432933807, |
|
"learning_rate": 9.028583367674765e-05, |
|
"loss": 0.0285, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.21419131755828857, |
|
"learning_rate": 9.023054799394316e-05, |
|
"loss": 0.0686, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.2115790992975235, |
|
"learning_rate": 9.017512246330042e-05, |
|
"loss": 0.071, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.2025454044342041, |
|
"learning_rate": 9.011955727748748e-05, |
|
"loss": 0.0993, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.22743502259254456, |
|
"learning_rate": 9.006385262965786e-05, |
|
"loss": 0.0705, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.16963045299053192, |
|
"learning_rate": 9.00080087134498e-05, |
|
"loss": 0.0569, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.12319042533636093, |
|
"learning_rate": 8.995202572298576e-05, |
|
"loss": 0.0427, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.15424852073192596, |
|
"learning_rate": 8.989590385287155e-05, |
|
"loss": 0.0564, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.306594580411911, |
|
"learning_rate": 8.983964329819583e-05, |
|
"loss": 0.095, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.13876177370548248, |
|
"learning_rate": 8.978324425452931e-05, |
|
"loss": 0.0641, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.19561870396137238, |
|
"learning_rate": 8.972670691792409e-05, |
|
"loss": 0.0635, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.3458711504936218, |
|
"learning_rate": 8.967003148491304e-05, |
|
"loss": 0.1328, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.1129189059138298, |
|
"learning_rate": 8.961321815250905e-05, |
|
"loss": 0.0205, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.3680332601070404, |
|
"learning_rate": 8.955626711820438e-05, |
|
"loss": 0.1302, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.2695287764072418, |
|
"learning_rate": 8.949917857996996e-05, |
|
"loss": 0.0511, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.17332953214645386, |
|
"learning_rate": 8.94419527362547e-05, |
|
"loss": 0.0494, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.11610284447669983, |
|
"learning_rate": 8.938458978598483e-05, |
|
"loss": 0.0381, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.23595061898231506, |
|
"learning_rate": 8.932708992856315e-05, |
|
"loss": 0.0802, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.26876452565193176, |
|
"learning_rate": 8.926945336386838e-05, |
|
"loss": 0.0461, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.19504375755786896, |
|
"learning_rate": 8.921168029225448e-05, |
|
"loss": 0.0317, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.4416268467903137, |
|
"learning_rate": 8.915377091454992e-05, |
|
"loss": 0.0952, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.32519325613975525, |
|
"learning_rate": 8.909572543205698e-05, |
|
"loss": 0.1027, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.3939536511898041, |
|
"learning_rate": 8.903754404655106e-05, |
|
"loss": 0.1718, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.20514678955078125, |
|
"learning_rate": 8.897922696027999e-05, |
|
"loss": 0.06, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.3049127459526062, |
|
"learning_rate": 8.892077437596332e-05, |
|
"loss": 0.1014, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.19251297414302826, |
|
"learning_rate": 8.88621864967916e-05, |
|
"loss": 0.048, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.21226820349693298, |
|
"learning_rate": 8.880346352642575e-05, |
|
"loss": 0.0652, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.42633509635925293, |
|
"learning_rate": 8.874460566899616e-05, |
|
"loss": 0.1083, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.24073313176631927, |
|
"learning_rate": 8.868561312910221e-05, |
|
"loss": 0.0851, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.3047339618206024, |
|
"learning_rate": 8.862648611181145e-05, |
|
"loss": 0.086, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.2227114737033844, |
|
"learning_rate": 8.856722482265886e-05, |
|
"loss": 0.1196, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.1860799938440323, |
|
"learning_rate": 8.850782946764619e-05, |
|
"loss": 0.0779, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.2109043002128601, |
|
"learning_rate": 8.844830025324122e-05, |
|
"loss": 0.076, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.1998620331287384, |
|
"learning_rate": 8.838863738637706e-05, |
|
"loss": 0.1027, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.12607474625110626, |
|
"learning_rate": 8.832884107445139e-05, |
|
"loss": 0.0436, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.2890150845050812, |
|
"learning_rate": 8.826891152532579e-05, |
|
"loss": 0.0966, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.4496447443962097, |
|
"learning_rate": 8.820884894732497e-05, |
|
"loss": 0.1575, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.19411596655845642, |
|
"learning_rate": 8.814865354923613e-05, |
|
"loss": 0.1201, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.19513021409511566, |
|
"learning_rate": 8.808832554030808e-05, |
|
"loss": 0.0747, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.14038780331611633, |
|
"learning_rate": 8.802786513025068e-05, |
|
"loss": 0.0608, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.14907363057136536, |
|
"learning_rate": 8.796727252923402e-05, |
|
"loss": 0.0843, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.18512780964374542, |
|
"learning_rate": 8.790654794788769e-05, |
|
"loss": 0.0988, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.17880797386169434, |
|
"learning_rate": 8.784569159730007e-05, |
|
"loss": 0.079, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.16263402998447418, |
|
"learning_rate": 8.778470368901762e-05, |
|
"loss": 0.0704, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.27071598172187805, |
|
"learning_rate": 8.772358443504405e-05, |
|
"loss": 0.0983, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.23446398973464966, |
|
"learning_rate": 8.766233404783974e-05, |
|
"loss": 0.0577, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.2932927906513214, |
|
"learning_rate": 8.760095274032083e-05, |
|
"loss": 0.0946, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.29224956035614014, |
|
"learning_rate": 8.75394407258586e-05, |
|
"loss": 0.078, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.15467233955860138, |
|
"learning_rate": 8.747779821827868e-05, |
|
"loss": 0.0779, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.1883499026298523, |
|
"learning_rate": 8.741602543186032e-05, |
|
"loss": 0.0721, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.25179481506347656, |
|
"learning_rate": 8.735412258133562e-05, |
|
"loss": 0.0875, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.23851999640464783, |
|
"learning_rate": 8.729208988188881e-05, |
|
"loss": 0.0959, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.2662704885005951, |
|
"learning_rate": 8.722992754915554e-05, |
|
"loss": 0.1025, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.17909982800483704, |
|
"learning_rate": 8.716763579922204e-05, |
|
"loss": 0.0504, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.17002324759960175, |
|
"learning_rate": 8.710521484862439e-05, |
|
"loss": 0.0856, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.2229025810956955, |
|
"learning_rate": 8.704266491434788e-05, |
|
"loss": 0.0591, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.1444559544324875, |
|
"learning_rate": 8.697998621382607e-05, |
|
"loss": 0.0297, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.2677093744277954, |
|
"learning_rate": 8.69171789649402e-05, |
|
"loss": 0.0543, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5875506401062012, |
|
"learning_rate": 8.685424338601834e-05, |
|
"loss": 0.1199, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.34576529264450073, |
|
"learning_rate": 8.679117969583464e-05, |
|
"loss": 0.1003, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.2765222489833832, |
|
"learning_rate": 8.672798811360863e-05, |
|
"loss": 0.0358, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.17154745757579803, |
|
"learning_rate": 8.666466885900438e-05, |
|
"loss": 0.0736, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.1607416719198227, |
|
"learning_rate": 8.660122215212977e-05, |
|
"loss": 0.0678, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.1216413602232933, |
|
"learning_rate": 8.653764821353573e-05, |
|
"loss": 0.0341, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.3713608980178833, |
|
"learning_rate": 8.647394726421547e-05, |
|
"loss": 0.118, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.48576387763023376, |
|
"learning_rate": 8.641011952560371e-05, |
|
"loss": 0.0931, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.14704179763793945, |
|
"learning_rate": 8.63461652195759e-05, |
|
"loss": 0.0285, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.3306657671928406, |
|
"learning_rate": 8.628208456844747e-05, |
|
"loss": 0.0737, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.39767885208129883, |
|
"learning_rate": 8.621787779497305e-05, |
|
"loss": 0.097, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.3198534846305847, |
|
"learning_rate": 8.615354512234569e-05, |
|
"loss": 0.0731, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.19191338121891022, |
|
"learning_rate": 8.608908677419606e-05, |
|
"loss": 0.0697, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.26490989327430725, |
|
"learning_rate": 8.602450297459172e-05, |
|
"loss": 0.1214, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.3545917868614197, |
|
"learning_rate": 8.595979394803634e-05, |
|
"loss": 0.0933, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.28239014744758606, |
|
"learning_rate": 8.589495991946885e-05, |
|
"loss": 0.0707, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.48272502422332764, |
|
"learning_rate": 8.583000111426276e-05, |
|
"loss": 0.0831, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.15938633680343628, |
|
"learning_rate": 8.576491775822527e-05, |
|
"loss": 0.0899, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.2616162896156311, |
|
"learning_rate": 8.569971007759657e-05, |
|
"loss": 0.118, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 0.0783080980181694, |
|
"eval_runtime": 14.6414, |
|
"eval_samples_per_second": 32.579, |
|
"eval_steps_per_second": 8.196, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.19521737098693848, |
|
"learning_rate": 8.563437829904903e-05, |
|
"loss": 0.0814, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.194011390209198, |
|
"learning_rate": 8.55689226496864e-05, |
|
"loss": 0.0799, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.2743787169456482, |
|
"learning_rate": 8.550334335704298e-05, |
|
"loss": 0.0869, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.1369010955095291, |
|
"learning_rate": 8.543764064908295e-05, |
|
"loss": 0.0435, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.24237819015979767, |
|
"learning_rate": 8.537181475419944e-05, |
|
"loss": 0.1148, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.14511409401893616, |
|
"learning_rate": 8.530586590121383e-05, |
|
"loss": 0.0764, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.15356196463108063, |
|
"learning_rate": 8.523979431937492e-05, |
|
"loss": 0.05, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.21860916912555695, |
|
"learning_rate": 8.51736002383581e-05, |
|
"loss": 0.0971, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.23724305629730225, |
|
"learning_rate": 8.510728388826463e-05, |
|
"loss": 0.1049, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.25301918387413025, |
|
"learning_rate": 8.50408454996208e-05, |
|
"loss": 0.0848, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.22409550845623016, |
|
"learning_rate": 8.497428530337706e-05, |
|
"loss": 0.101, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.1324710100889206, |
|
"learning_rate": 8.490760353090737e-05, |
|
"loss": 0.0723, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.1362515389919281, |
|
"learning_rate": 8.484080041400826e-05, |
|
"loss": 0.0709, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.16375669836997986, |
|
"learning_rate": 8.477387618489807e-05, |
|
"loss": 0.0405, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.21752700209617615, |
|
"learning_rate": 8.470683107621616e-05, |
|
"loss": 0.0455, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.1989530771970749, |
|
"learning_rate": 8.463966532102207e-05, |
|
"loss": 0.0704, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.192123144865036, |
|
"learning_rate": 8.457237915279476e-05, |
|
"loss": 0.063, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.1221012994647026, |
|
"learning_rate": 8.450497280543174e-05, |
|
"loss": 0.0302, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.5705539584159851, |
|
"learning_rate": 8.443744651324827e-05, |
|
"loss": 0.1531, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.21490426361560822, |
|
"learning_rate": 8.436980051097659e-05, |
|
"loss": 0.0626, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.2654309570789337, |
|
"learning_rate": 8.430203503376505e-05, |
|
"loss": 0.0838, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.22457195818424225, |
|
"learning_rate": 8.423415031717733e-05, |
|
"loss": 0.0309, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.10934180021286011, |
|
"learning_rate": 8.416614659719157e-05, |
|
"loss": 0.0132, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.32622861862182617, |
|
"learning_rate": 8.409802411019963e-05, |
|
"loss": 0.107, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.24298590421676636, |
|
"learning_rate": 8.40297830930062e-05, |
|
"loss": 0.1268, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.29994437098503113, |
|
"learning_rate": 8.396142378282798e-05, |
|
"loss": 0.0747, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.16668649017810822, |
|
"learning_rate": 8.389294641729293e-05, |
|
"loss": 0.0479, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.26706060767173767, |
|
"learning_rate": 8.382435123443934e-05, |
|
"loss": 0.1116, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.1750030219554901, |
|
"learning_rate": 8.375563847271506e-05, |
|
"loss": 0.0597, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.2318125069141388, |
|
"learning_rate": 8.36868083709767e-05, |
|
"loss": 0.0909, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.1834569126367569, |
|
"learning_rate": 8.361786116848872e-05, |
|
"loss": 0.0813, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.27685895562171936, |
|
"learning_rate": 8.354879710492264e-05, |
|
"loss": 0.1301, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.16120545566082, |
|
"learning_rate": 8.347961642035624e-05, |
|
"loss": 0.0717, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.17625439167022705, |
|
"learning_rate": 8.341031935527267e-05, |
|
"loss": 0.0867, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.2321135252714157, |
|
"learning_rate": 8.334090615055966e-05, |
|
"loss": 0.1122, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.12720270454883575, |
|
"learning_rate": 8.327137704750862e-05, |
|
"loss": 0.0375, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.2046743929386139, |
|
"learning_rate": 8.320173228781389e-05, |
|
"loss": 0.0808, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.16513489186763763, |
|
"learning_rate": 8.313197211357181e-05, |
|
"loss": 0.0825, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.12492749094963074, |
|
"learning_rate": 8.306209676727994e-05, |
|
"loss": 0.0876, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.1343008577823639, |
|
"learning_rate": 8.299210649183619e-05, |
|
"loss": 0.0852, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.13951613008975983, |
|
"learning_rate": 8.2922001530538e-05, |
|
"loss": 0.1003, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.16553768515586853, |
|
"learning_rate": 8.285178212708143e-05, |
|
"loss": 0.0662, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.15311822295188904, |
|
"learning_rate": 8.278144852556042e-05, |
|
"loss": 0.0785, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.1948017179965973, |
|
"learning_rate": 8.271100097046584e-05, |
|
"loss": 0.0898, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.11078551411628723, |
|
"learning_rate": 8.264043970668469e-05, |
|
"loss": 0.0386, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.1605585515499115, |
|
"learning_rate": 8.256976497949924e-05, |
|
"loss": 0.0497, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.1617887318134308, |
|
"learning_rate": 8.249897703458619e-05, |
|
"loss": 0.0624, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.1274091750383377, |
|
"learning_rate": 8.242807611801578e-05, |
|
"loss": 0.0578, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.15953154861927032, |
|
"learning_rate": 8.235706247625098e-05, |
|
"loss": 0.042, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.27984094619750977, |
|
"learning_rate": 8.228593635614659e-05, |
|
"loss": 0.1037, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.1895013153553009, |
|
"learning_rate": 8.22146980049484e-05, |
|
"loss": 0.0728, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.1580246388912201, |
|
"learning_rate": 8.214334767029239e-05, |
|
"loss": 0.0398, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.2391231805086136, |
|
"learning_rate": 8.207188560020373e-05, |
|
"loss": 0.0707, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.25975751876831055, |
|
"learning_rate": 8.200031204309603e-05, |
|
"loss": 0.1369, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.32591861486434937, |
|
"learning_rate": 8.192862724777051e-05, |
|
"loss": 0.0878, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.32488566637039185, |
|
"learning_rate": 8.185683146341496e-05, |
|
"loss": 0.0692, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.1918002963066101, |
|
"learning_rate": 8.178492493960309e-05, |
|
"loss": 0.0942, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.43140751123428345, |
|
"learning_rate": 8.171290792629347e-05, |
|
"loss": 0.0979, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.1771157830953598, |
|
"learning_rate": 8.164078067382882e-05, |
|
"loss": 0.0894, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.25012728571891785, |
|
"learning_rate": 8.1568543432935e-05, |
|
"loss": 0.0734, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.19328337907791138, |
|
"learning_rate": 8.149619645472031e-05, |
|
"loss": 0.0869, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.18180640041828156, |
|
"learning_rate": 8.142373999067439e-05, |
|
"loss": 0.0897, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.25753355026245117, |
|
"learning_rate": 8.135117429266757e-05, |
|
"loss": 0.0883, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.23837833106517792, |
|
"learning_rate": 8.127849961294984e-05, |
|
"loss": 0.0549, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.25032365322113037, |
|
"learning_rate": 8.120571620415006e-05, |
|
"loss": 0.0976, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.30728307366371155, |
|
"learning_rate": 8.113282431927502e-05, |
|
"loss": 0.0709, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.1391928791999817, |
|
"learning_rate": 8.10598242117086e-05, |
|
"loss": 0.0378, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.1786775141954422, |
|
"learning_rate": 8.098671613521089e-05, |
|
"loss": 0.0671, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.21963584423065186, |
|
"learning_rate": 8.091350034391732e-05, |
|
"loss": 0.0936, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.13954299688339233, |
|
"learning_rate": 8.084017709233767e-05, |
|
"loss": 0.052, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.19656923413276672, |
|
"learning_rate": 8.076674663535537e-05, |
|
"loss": 0.0584, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.14330637454986572, |
|
"learning_rate": 8.069320922822643e-05, |
|
"loss": 0.0786, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.19019991159439087, |
|
"learning_rate": 8.061956512657871e-05, |
|
"loss": 0.0837, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.2079285979270935, |
|
"learning_rate": 8.05458145864109e-05, |
|
"loss": 0.0459, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.3516862392425537, |
|
"learning_rate": 8.047195786409172e-05, |
|
"loss": 0.191, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.192392036318779, |
|
"learning_rate": 8.039799521635896e-05, |
|
"loss": 0.1072, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.1787678301334381, |
|
"learning_rate": 8.032392690031867e-05, |
|
"loss": 0.0649, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.21046535670757294, |
|
"learning_rate": 8.024975317344421e-05, |
|
"loss": 0.1065, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.1215684562921524, |
|
"learning_rate": 8.017547429357532e-05, |
|
"loss": 0.0433, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.1342051476240158, |
|
"learning_rate": 8.010109051891731e-05, |
|
"loss": 0.0774, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.10215850174427032, |
|
"learning_rate": 8.002660210804011e-05, |
|
"loss": 0.0338, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.23539598286151886, |
|
"learning_rate": 7.995200931987743e-05, |
|
"loss": 0.0516, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.15601155161857605, |
|
"learning_rate": 7.987731241372572e-05, |
|
"loss": 0.0559, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.165851429104805, |
|
"learning_rate": 7.98025116492434e-05, |
|
"loss": 0.0372, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.21045421063899994, |
|
"learning_rate": 7.972760728644996e-05, |
|
"loss": 0.086, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.14814500510692596, |
|
"learning_rate": 7.965259958572496e-05, |
|
"loss": 0.0587, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.15543898940086365, |
|
"learning_rate": 7.95774888078072e-05, |
|
"loss": 0.0682, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.13527697324752808, |
|
"learning_rate": 7.950227521379382e-05, |
|
"loss": 0.0468, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.35773295164108276, |
|
"learning_rate": 7.94269590651393e-05, |
|
"loss": 0.1273, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.22433511912822723, |
|
"learning_rate": 7.935154062365467e-05, |
|
"loss": 0.0438, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.1453983038663864, |
|
"learning_rate": 7.927602015150655e-05, |
|
"loss": 0.0367, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.2343645691871643, |
|
"learning_rate": 7.920039791121617e-05, |
|
"loss": 0.128, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.25178173184394836, |
|
"learning_rate": 7.912467416565861e-05, |
|
"loss": 0.1094, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.3170076012611389, |
|
"learning_rate": 7.904884917806174e-05, |
|
"loss": 0.1323, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.15917453169822693, |
|
"learning_rate": 7.897292321200538e-05, |
|
"loss": 0.036, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.31578320264816284, |
|
"learning_rate": 7.889689653142036e-05, |
|
"loss": 0.0909, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.16602741181850433, |
|
"learning_rate": 7.882076940058764e-05, |
|
"loss": 0.0371, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.3235325813293457, |
|
"learning_rate": 7.874454208413731e-05, |
|
"loss": 0.1561, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.1256486028432846, |
|
"learning_rate": 7.866821484704776e-05, |
|
"loss": 0.0364, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.2234162986278534, |
|
"learning_rate": 7.859178795464472e-05, |
|
"loss": 0.0883, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.1564294993877411, |
|
"learning_rate": 7.851526167260034e-05, |
|
"loss": 0.0679, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.16309525072574615, |
|
"learning_rate": 7.84386362669322e-05, |
|
"loss": 0.0912, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.21584004163742065, |
|
"learning_rate": 7.836191200400255e-05, |
|
"loss": 0.0695, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.15948422253131866, |
|
"learning_rate": 7.828508915051724e-05, |
|
"loss": 0.0459, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.24016940593719482, |
|
"learning_rate": 7.82081679735248e-05, |
|
"loss": 0.1127, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.2894397974014282, |
|
"learning_rate": 7.813114874041557e-05, |
|
"loss": 0.0584, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.20707662403583527, |
|
"learning_rate": 7.805403171892079e-05, |
|
"loss": 0.1045, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.23427248001098633, |
|
"learning_rate": 7.797681717711161e-05, |
|
"loss": 0.1345, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.13141866028308868, |
|
"learning_rate": 7.789950538339812e-05, |
|
"loss": 0.052, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.21118536591529846, |
|
"learning_rate": 7.782209660652855e-05, |
|
"loss": 0.1272, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.15485352277755737, |
|
"learning_rate": 7.77445911155882e-05, |
|
"loss": 0.0686, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.11380946636199951, |
|
"learning_rate": 7.766698917999861e-05, |
|
"loss": 0.0735, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.26798170804977417, |
|
"learning_rate": 7.758929106951656e-05, |
|
"loss": 0.0934, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.23003587126731873, |
|
"learning_rate": 7.751149705423312e-05, |
|
"loss": 0.0816, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.2122953236103058, |
|
"learning_rate": 7.743360740457278e-05, |
|
"loss": 0.0827, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.22673499584197998, |
|
"learning_rate": 7.735562239129247e-05, |
|
"loss": 0.1232, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.24960415065288544, |
|
"learning_rate": 7.727754228548058e-05, |
|
"loss": 0.1124, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.10405872017145157, |
|
"learning_rate": 7.719936735855611e-05, |
|
"loss": 0.0687, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.16980154812335968, |
|
"learning_rate": 7.712109788226762e-05, |
|
"loss": 0.0874, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.1486412137746811, |
|
"learning_rate": 7.704273412869238e-05, |
|
"loss": 0.0815, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.14432762563228607, |
|
"learning_rate": 7.696427637023538e-05, |
|
"loss": 0.0752, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.2627028822898865, |
|
"learning_rate": 7.688572487962835e-05, |
|
"loss": 0.0982, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.16832011938095093, |
|
"learning_rate": 7.680707992992888e-05, |
|
"loss": 0.0895, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.14999301731586456, |
|
"learning_rate": 7.672834179451942e-05, |
|
"loss": 0.0544, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.14237482845783234, |
|
"learning_rate": 7.664951074710638e-05, |
|
"loss": 0.0623, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.1694159060716629, |
|
"learning_rate": 7.657058706171911e-05, |
|
"loss": 0.0784, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.1470886617898941, |
|
"learning_rate": 7.649157101270902e-05, |
|
"loss": 0.0635, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.16492018103599548, |
|
"learning_rate": 7.641246287474855e-05, |
|
"loss": 0.0669, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.195392444729805, |
|
"learning_rate": 7.633326292283028e-05, |
|
"loss": 0.0387, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.17653177678585052, |
|
"learning_rate": 7.625397143226596e-05, |
|
"loss": 0.0592, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.23455718159675598, |
|
"learning_rate": 7.617458867868553e-05, |
|
"loss": 0.0882, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.3588998317718506, |
|
"learning_rate": 7.609511493803616e-05, |
|
"loss": 0.107, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.2767946720123291, |
|
"learning_rate": 7.601555048658134e-05, |
|
"loss": 0.1609, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.22181196510791779, |
|
"learning_rate": 7.593589560089985e-05, |
|
"loss": 0.0598, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.30335313081741333, |
|
"learning_rate": 7.585615055788484e-05, |
|
"loss": 0.0825, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.19477833807468414, |
|
"learning_rate": 7.577631563474291e-05, |
|
"loss": 0.0446, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.11036123335361481, |
|
"learning_rate": 7.569639110899303e-05, |
|
"loss": 0.025, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.19955220818519592, |
|
"learning_rate": 7.561637725846568e-05, |
|
"loss": 0.0484, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.20293684303760529, |
|
"learning_rate": 7.553627436130183e-05, |
|
"loss": 0.0689, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.18100765347480774, |
|
"learning_rate": 7.545608269595202e-05, |
|
"loss": 0.0371, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.43053922057151794, |
|
"learning_rate": 7.537580254117531e-05, |
|
"loss": 0.0901, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.3413926959037781, |
|
"learning_rate": 7.529543417603844e-05, |
|
"loss": 0.1088, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.09266742318868637, |
|
"eval_runtime": 14.642, |
|
"eval_samples_per_second": 32.578, |
|
"eval_steps_per_second": 8.196, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.1796027272939682, |
|
"learning_rate": 7.521497787991471e-05, |
|
"loss": 0.0244, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.30515041947364807, |
|
"learning_rate": 7.513443393248312e-05, |
|
"loss": 0.0682, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.3712550103664398, |
|
"learning_rate": 7.505380261372734e-05, |
|
"loss": 0.0921, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.4219339191913605, |
|
"learning_rate": 7.497308420393477e-05, |
|
"loss": 0.0785, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.24129725992679596, |
|
"learning_rate": 7.489227898369559e-05, |
|
"loss": 0.0851, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.24595998227596283, |
|
"learning_rate": 7.481138723390164e-05, |
|
"loss": 0.1143, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.13906948268413544, |
|
"learning_rate": 7.473040923574567e-05, |
|
"loss": 0.0402, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.1885530650615692, |
|
"learning_rate": 7.464934527072016e-05, |
|
"loss": 0.0384, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.13116823136806488, |
|
"learning_rate": 7.456819562061649e-05, |
|
"loss": 0.0447, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.23953841626644135, |
|
"learning_rate": 7.448696056752383e-05, |
|
"loss": 0.0602, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.17374739050865173, |
|
"learning_rate": 7.440564039382827e-05, |
|
"loss": 0.0657, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.20921552181243896, |
|
"learning_rate": 7.432423538221178e-05, |
|
"loss": 0.0757, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.10258325189352036, |
|
"learning_rate": 7.424274581565123e-05, |
|
"loss": 0.0237, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.31752172112464905, |
|
"learning_rate": 7.416117197741742e-05, |
|
"loss": 0.0625, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.229179248213768, |
|
"learning_rate": 7.407951415107413e-05, |
|
"loss": 0.0792, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.16059361398220062, |
|
"learning_rate": 7.3997772620477e-05, |
|
"loss": 0.0718, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.1626499593257904, |
|
"learning_rate": 7.391594766977277e-05, |
|
"loss": 0.0457, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.1549261212348938, |
|
"learning_rate": 7.383403958339807e-05, |
|
"loss": 0.0544, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.1588374525308609, |
|
"learning_rate": 7.375204864607852e-05, |
|
"loss": 0.0342, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.09883646667003632, |
|
"learning_rate": 7.366997514282782e-05, |
|
"loss": 0.0292, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.3421178460121155, |
|
"learning_rate": 7.358781935894659e-05, |
|
"loss": 0.0999, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.12105683982372284, |
|
"learning_rate": 7.350558158002154e-05, |
|
"loss": 0.023, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.15255074203014374, |
|
"learning_rate": 7.342326209192435e-05, |
|
"loss": 0.0423, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.18337713181972504, |
|
"learning_rate": 7.33408611808108e-05, |
|
"loss": 0.0305, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.12969495356082916, |
|
"learning_rate": 7.325837913311966e-05, |
|
"loss": 0.0175, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.2849477231502533, |
|
"learning_rate": 7.317581623557177e-05, |
|
"loss": 0.0878, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.24307942390441895, |
|
"learning_rate": 7.3093172775169e-05, |
|
"loss": 0.037, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.2612784206867218, |
|
"learning_rate": 7.301044903919325e-05, |
|
"loss": 0.097, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.2548207640647888, |
|
"learning_rate": 7.292764531520553e-05, |
|
"loss": 0.0928, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.3221377432346344, |
|
"learning_rate": 7.284476189104485e-05, |
|
"loss": 0.0874, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.20441681146621704, |
|
"learning_rate": 7.27617990548273e-05, |
|
"loss": 0.035, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.26378926634788513, |
|
"learning_rate": 7.267875709494499e-05, |
|
"loss": 0.0494, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.5505862832069397, |
|
"learning_rate": 7.259563630006512e-05, |
|
"loss": 0.1241, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.13375498354434967, |
|
"learning_rate": 7.251243695912886e-05, |
|
"loss": 0.0241, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.19569019973278046, |
|
"learning_rate": 7.242915936135051e-05, |
|
"loss": 0.0698, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.28432735800743103, |
|
"learning_rate": 7.234580379621637e-05, |
|
"loss": 0.0641, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.13998962938785553, |
|
"learning_rate": 7.22623705534837e-05, |
|
"loss": 0.0365, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.1772097498178482, |
|
"learning_rate": 7.217885992317985e-05, |
|
"loss": 0.081, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.4318295121192932, |
|
"learning_rate": 7.209527219560119e-05, |
|
"loss": 0.0532, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.2505156695842743, |
|
"learning_rate": 7.201160766131207e-05, |
|
"loss": 0.0667, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.13024090230464935, |
|
"learning_rate": 7.192786661114384e-05, |
|
"loss": 0.0234, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.2824789583683014, |
|
"learning_rate": 7.184404933619377e-05, |
|
"loss": 0.095, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.18059489130973816, |
|
"learning_rate": 7.17601561278242e-05, |
|
"loss": 0.0471, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.2839769124984741, |
|
"learning_rate": 7.167618727766138e-05, |
|
"loss": 0.0783, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.1342955082654953, |
|
"learning_rate": 7.159214307759448e-05, |
|
"loss": 0.0453, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.1336507499217987, |
|
"learning_rate": 7.150802381977464e-05, |
|
"loss": 0.0431, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.2953212559223175, |
|
"learning_rate": 7.142382979661386e-05, |
|
"loss": 0.0705, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.17532870173454285, |
|
"learning_rate": 7.133956130078412e-05, |
|
"loss": 0.0666, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.17404836416244507, |
|
"learning_rate": 7.12552186252162e-05, |
|
"loss": 0.0522, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.25231000781059265, |
|
"learning_rate": 7.117080206309878e-05, |
|
"loss": 0.0854, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.2264215499162674, |
|
"learning_rate": 7.108631190787735e-05, |
|
"loss": 0.0692, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.3555202782154083, |
|
"learning_rate": 7.100174845325327e-05, |
|
"loss": 0.074, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.34550729393959045, |
|
"learning_rate": 7.091711199318264e-05, |
|
"loss": 0.0831, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.14560338854789734, |
|
"learning_rate": 7.083240282187543e-05, |
|
"loss": 0.0404, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.23464788496494293, |
|
"learning_rate": 7.074762123379423e-05, |
|
"loss": 0.0699, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.22587832808494568, |
|
"learning_rate": 7.066276752365352e-05, |
|
"loss": 0.0887, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.17183855175971985, |
|
"learning_rate": 7.057784198641834e-05, |
|
"loss": 0.0373, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.19148162007331848, |
|
"learning_rate": 7.049284491730354e-05, |
|
"loss": 0.0289, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.26134082674980164, |
|
"learning_rate": 7.040777661177251e-05, |
|
"loss": 0.0367, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.5379131436347961, |
|
"learning_rate": 7.032263736553635e-05, |
|
"loss": 0.1049, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.13634662330150604, |
|
"learning_rate": 7.023742747455276e-05, |
|
"loss": 0.018, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.28767991065979004, |
|
"learning_rate": 7.015214723502496e-05, |
|
"loss": 0.06, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.2551933825016022, |
|
"learning_rate": 7.006679694340073e-05, |
|
"loss": 0.0407, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.41325151920318604, |
|
"learning_rate": 6.998137689637142e-05, |
|
"loss": 0.046, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.25655174255371094, |
|
"learning_rate": 6.989588739087078e-05, |
|
"loss": 0.0398, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.3917771279811859, |
|
"learning_rate": 6.981032872407405e-05, |
|
"loss": 0.1072, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.20738206803798676, |
|
"learning_rate": 6.972470119339691e-05, |
|
"loss": 0.0457, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.1865154653787613, |
|
"learning_rate": 6.963900509649434e-05, |
|
"loss": 0.0258, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.282071590423584, |
|
"learning_rate": 6.955324073125979e-05, |
|
"loss": 0.07, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.27442115545272827, |
|
"learning_rate": 6.946740839582388e-05, |
|
"loss": 0.0875, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.2635151445865631, |
|
"learning_rate": 6.938150838855359e-05, |
|
"loss": 0.0332, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.16783182322978973, |
|
"learning_rate": 6.929554100805118e-05, |
|
"loss": 0.0405, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.3328685760498047, |
|
"learning_rate": 6.920950655315297e-05, |
|
"loss": 0.1076, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.20146729052066803, |
|
"learning_rate": 6.91234053229286e-05, |
|
"loss": 0.0481, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.21599121391773224, |
|
"learning_rate": 6.903723761667973e-05, |
|
"loss": 0.0502, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.16269706189632416, |
|
"learning_rate": 6.895100373393913e-05, |
|
"loss": 0.0652, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.3716180622577667, |
|
"learning_rate": 6.886470397446958e-05, |
|
"loss": 0.0914, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.18003414571285248, |
|
"learning_rate": 6.877833863826295e-05, |
|
"loss": 0.0484, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.2745915949344635, |
|
"learning_rate": 6.869190802553894e-05, |
|
"loss": 0.1057, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.2507147490978241, |
|
"learning_rate": 6.860541243674426e-05, |
|
"loss": 0.0587, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.19874247908592224, |
|
"learning_rate": 6.851885217255145e-05, |
|
"loss": 0.0452, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.21256215870380402, |
|
"learning_rate": 6.843222753385786e-05, |
|
"loss": 0.0434, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.13569054007530212, |
|
"learning_rate": 6.834553882178463e-05, |
|
"loss": 0.0275, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.26869267225265503, |
|
"learning_rate": 6.825878633767563e-05, |
|
"loss": 0.1006, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.21900776028633118, |
|
"learning_rate": 6.817197038309644e-05, |
|
"loss": 0.0564, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.13306765258312225, |
|
"learning_rate": 6.80850912598332e-05, |
|
"loss": 0.0304, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.09863998740911484, |
|
"learning_rate": 6.79981492698917e-05, |
|
"loss": 0.0178, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.2170545756816864, |
|
"learning_rate": 6.791114471549627e-05, |
|
"loss": 0.0754, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.26661446690559387, |
|
"learning_rate": 6.782407789908863e-05, |
|
"loss": 0.1083, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.2747049629688263, |
|
"learning_rate": 6.773694912332707e-05, |
|
"loss": 0.0758, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.252560019493103, |
|
"learning_rate": 6.764975869108514e-05, |
|
"loss": 0.0681, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.2867240905761719, |
|
"learning_rate": 6.756250690545079e-05, |
|
"loss": 0.095, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.09760677814483643, |
|
"learning_rate": 6.747519406972524e-05, |
|
"loss": 0.0123, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.17589041590690613, |
|
"learning_rate": 6.738782048742187e-05, |
|
"loss": 0.0437, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.29266613721847534, |
|
"learning_rate": 6.730038646226532e-05, |
|
"loss": 0.0706, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.1055804044008255, |
|
"learning_rate": 6.721289229819024e-05, |
|
"loss": 0.0343, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.2909635305404663, |
|
"learning_rate": 6.712533829934042e-05, |
|
"loss": 0.0817, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.2604895532131195, |
|
"learning_rate": 6.703772477006757e-05, |
|
"loss": 0.0452, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.10520771890878677, |
|
"learning_rate": 6.695005201493038e-05, |
|
"loss": 0.0215, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.10080817341804504, |
|
"learning_rate": 6.686232033869344e-05, |
|
"loss": 0.0188, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.3340647220611572, |
|
"learning_rate": 6.677453004632608e-05, |
|
"loss": 0.0612, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.29719796776771545, |
|
"learning_rate": 6.668668144300149e-05, |
|
"loss": 0.1014, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.2131602168083191, |
|
"learning_rate": 6.659877483409545e-05, |
|
"loss": 0.0621, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.1867963820695877, |
|
"learning_rate": 6.65108105251855e-05, |
|
"loss": 0.0312, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.4250008463859558, |
|
"learning_rate": 6.642278882204963e-05, |
|
"loss": 0.0684, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.20828047394752502, |
|
"learning_rate": 6.633471003066543e-05, |
|
"loss": 0.0421, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.23356445133686066, |
|
"learning_rate": 6.62465744572089e-05, |
|
"loss": 0.0277, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.42427390813827515, |
|
"learning_rate": 6.615838240805344e-05, |
|
"loss": 0.0745, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.23298533260822296, |
|
"learning_rate": 6.607013418976874e-05, |
|
"loss": 0.047, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.5681192278862, |
|
"learning_rate": 6.598183010911978e-05, |
|
"loss": 0.1032, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.15370431542396545, |
|
"learning_rate": 6.589347047306571e-05, |
|
"loss": 0.0224, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.2974132001399994, |
|
"learning_rate": 6.580505558875877e-05, |
|
"loss": 0.0908, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.12158460170030594, |
|
"learning_rate": 6.571658576354333e-05, |
|
"loss": 0.0212, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.32594335079193115, |
|
"learning_rate": 6.562806130495467e-05, |
|
"loss": 0.1016, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.3316996097564697, |
|
"learning_rate": 6.5539482520718e-05, |
|
"loss": 0.0639, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.21660655736923218, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 0.043, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.350033164024353, |
|
"learning_rate": 6.536216320714466e-05, |
|
"loss": 0.0752, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.30745336413383484, |
|
"learning_rate": 6.527342329419837e-05, |
|
"loss": 0.0927, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.24984771013259888, |
|
"learning_rate": 6.51846302883827e-05, |
|
"loss": 0.0685, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.07773179560899734, |
|
"learning_rate": 6.509578449835636e-05, |
|
"loss": 0.0152, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.1620987057685852, |
|
"learning_rate": 6.500688623296159e-05, |
|
"loss": 0.0514, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.1917831003665924, |
|
"learning_rate": 6.491793580122301e-05, |
|
"loss": 0.066, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.21920029819011688, |
|
"learning_rate": 6.482893351234658e-05, |
|
"loss": 0.0547, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.29076483845710754, |
|
"learning_rate": 6.473987967571856e-05, |
|
"loss": 0.079, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.30292215943336487, |
|
"learning_rate": 6.46507746009043e-05, |
|
"loss": 0.0957, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.14139439165592194, |
|
"learning_rate": 6.456161859764744e-05, |
|
"loss": 0.0346, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.22850438952445984, |
|
"learning_rate": 6.447241197586847e-05, |
|
"loss": 0.0744, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.48915836215019226, |
|
"learning_rate": 6.438315504566397e-05, |
|
"loss": 0.0953, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.17644958198070526, |
|
"learning_rate": 6.429384811730528e-05, |
|
"loss": 0.046, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.2039819210767746, |
|
"learning_rate": 6.420449150123767e-05, |
|
"loss": 0.1052, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.17715586721897125, |
|
"learning_rate": 6.411508550807906e-05, |
|
"loss": 0.0447, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.16100600361824036, |
|
"learning_rate": 6.4025630448619e-05, |
|
"loss": 0.0344, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.22480256855487823, |
|
"learning_rate": 6.393612663381763e-05, |
|
"loss": 0.0495, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.12992677092552185, |
|
"learning_rate": 6.384657437480458e-05, |
|
"loss": 0.0409, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.1325366348028183, |
|
"learning_rate": 6.375697398287787e-05, |
|
"loss": 0.0257, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.16241514682769775, |
|
"learning_rate": 6.366732576950284e-05, |
|
"loss": 0.0427, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.21476183831691742, |
|
"learning_rate": 6.357763004631104e-05, |
|
"loss": 0.0451, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.32039332389831543, |
|
"learning_rate": 6.34878871250992e-05, |
|
"loss": 0.0545, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.3203076124191284, |
|
"learning_rate": 6.33980973178281e-05, |
|
"loss": 0.0917, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.25006967782974243, |
|
"learning_rate": 6.330826093662156e-05, |
|
"loss": 0.1028, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.52630215883255, |
|
"learning_rate": 6.32183782937652e-05, |
|
"loss": 0.0889, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.33741331100463867, |
|
"learning_rate": 6.31284497017055e-05, |
|
"loss": 0.0725, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"eval_loss": 0.07285241782665253, |
|
"eval_runtime": 14.6756, |
|
"eval_samples_per_second": 32.503, |
|
"eval_steps_per_second": 8.177, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.40746867656707764, |
|
"learning_rate": 6.303847547304873e-05, |
|
"loss": 0.0945, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.22757941484451294, |
|
"learning_rate": 6.294845592055967e-05, |
|
"loss": 0.0532, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.19006334245204926, |
|
"learning_rate": 6.285839135716079e-05, |
|
"loss": 0.0484, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.48126357793807983, |
|
"learning_rate": 6.27682820959309e-05, |
|
"loss": 0.0967, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.23766569793224335, |
|
"learning_rate": 6.26781284501043e-05, |
|
"loss": 0.106, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.16818860173225403, |
|
"learning_rate": 6.258793073306949e-05, |
|
"loss": 0.0494, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.28579115867614746, |
|
"learning_rate": 6.249768925836822e-05, |
|
"loss": 0.0937, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.16623319685459137, |
|
"learning_rate": 6.240740433969432e-05, |
|
"loss": 0.0301, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.1560198813676834, |
|
"learning_rate": 6.231707629089262e-05, |
|
"loss": 0.0384, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.2002251148223877, |
|
"learning_rate": 6.2226705425958e-05, |
|
"loss": 0.0545, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.16213096678256989, |
|
"learning_rate": 6.2136292059034e-05, |
|
"loss": 0.0433, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.27064821124076843, |
|
"learning_rate": 6.204583650441201e-05, |
|
"loss": 0.0796, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.11131159216165543, |
|
"learning_rate": 6.195533907653004e-05, |
|
"loss": 0.0229, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.22354401648044586, |
|
"learning_rate": 6.18648000899717e-05, |
|
"loss": 0.0475, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.19944117963314056, |
|
"learning_rate": 6.177421985946499e-05, |
|
"loss": 0.0413, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.32458746433258057, |
|
"learning_rate": 6.168359869988134e-05, |
|
"loss": 0.1205, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.19088833034038544, |
|
"learning_rate": 6.159293692623443e-05, |
|
"loss": 0.0626, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.2114744633436203, |
|
"learning_rate": 6.150223485367914e-05, |
|
"loss": 0.048, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.11308068782091141, |
|
"learning_rate": 6.141149279751043e-05, |
|
"loss": 0.0286, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.22453975677490234, |
|
"learning_rate": 6.13207110731622e-05, |
|
"loss": 0.0279, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.274513840675354, |
|
"learning_rate": 6.122988999620634e-05, |
|
"loss": 0.0553, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.2700372636318207, |
|
"learning_rate": 6.113902988235145e-05, |
|
"loss": 0.0973, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.3287579119205475, |
|
"learning_rate": 6.104813104744188e-05, |
|
"loss": 0.0853, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.37582048773765564, |
|
"learning_rate": 6.095719380745654e-05, |
|
"loss": 0.088, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.2009502649307251, |
|
"learning_rate": 6.086621847850788e-05, |
|
"loss": 0.0525, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.1417909413576126, |
|
"learning_rate": 6.077520537684072e-05, |
|
"loss": 0.0311, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.18328174948692322, |
|
"learning_rate": 6.068415481883122e-05, |
|
"loss": 0.0379, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.1513252556324005, |
|
"learning_rate": 6.059306712098571e-05, |
|
"loss": 0.0319, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.2544059753417969, |
|
"learning_rate": 6.0501942599939666e-05, |
|
"loss": 0.0593, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.4964008331298828, |
|
"learning_rate": 6.0410781572456486e-05, |
|
"loss": 0.0367, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.14776591956615448, |
|
"learning_rate": 6.031958435542659e-05, |
|
"loss": 0.0284, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.20590472221374512, |
|
"learning_rate": 6.022835126586609e-05, |
|
"loss": 0.0359, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.2493211179971695, |
|
"learning_rate": 6.0137082620915863e-05, |
|
"loss": 0.0424, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.31557443737983704, |
|
"learning_rate": 6.0045778737840344e-05, |
|
"loss": 0.0563, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.14257828891277313, |
|
"learning_rate": 5.995443993402647e-05, |
|
"loss": 0.024, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.21385452151298523, |
|
"learning_rate": 5.9863066526982605e-05, |
|
"loss": 0.0721, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.17539048194885254, |
|
"learning_rate": 5.977165883433734e-05, |
|
"loss": 0.025, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.28508231043815613, |
|
"learning_rate": 5.9680217173838494e-05, |
|
"loss": 0.0595, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.30929744243621826, |
|
"learning_rate": 5.9588741863351924e-05, |
|
"loss": 0.112, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.439656525850296, |
|
"learning_rate": 5.949723322086053e-05, |
|
"loss": 0.0427, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.2300054430961609, |
|
"learning_rate": 5.940569156446298e-05, |
|
"loss": 0.0437, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.4155109226703644, |
|
"learning_rate": 5.931411721237279e-05, |
|
"loss": 0.0569, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.25196224451065063, |
|
"learning_rate": 5.922251048291707e-05, |
|
"loss": 0.0413, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.5078486204147339, |
|
"learning_rate": 5.913087169453554e-05, |
|
"loss": 0.0988, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.26931652426719666, |
|
"learning_rate": 5.9039201165779315e-05, |
|
"loss": 0.0578, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.2641213834285736, |
|
"learning_rate": 5.8947499215309834e-05, |
|
"loss": 0.0362, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.23865339159965515, |
|
"learning_rate": 5.8855766161897805e-05, |
|
"loss": 0.0375, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.2594137191772461, |
|
"learning_rate": 5.876400232442205e-05, |
|
"loss": 0.0489, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.2721590995788574, |
|
"learning_rate": 5.867220802186837e-05, |
|
"loss": 0.0407, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.3681499660015106, |
|
"learning_rate": 5.85803835733285e-05, |
|
"loss": 0.0554, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.3132595121860504, |
|
"learning_rate": 5.848852929799894e-05, |
|
"loss": 0.0486, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.16972127556800842, |
|
"learning_rate": 5.8396645515179884e-05, |
|
"loss": 0.0473, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.30628886818885803, |
|
"learning_rate": 5.83047325442741e-05, |
|
"loss": 0.0664, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.3327179551124573, |
|
"learning_rate": 5.8212790704785824e-05, |
|
"loss": 0.0605, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.3301398754119873, |
|
"learning_rate": 5.812082031631966e-05, |
|
"loss": 0.0477, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.23960134387016296, |
|
"learning_rate": 5.8028821698579385e-05, |
|
"loss": 0.0376, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.2526357173919678, |
|
"learning_rate": 5.7936795171367e-05, |
|
"loss": 0.0712, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.32746273279190063, |
|
"learning_rate": 5.784474105458143e-05, |
|
"loss": 0.0542, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.10859230905771255, |
|
"learning_rate": 5.77526596682176e-05, |
|
"loss": 0.019, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.2908915877342224, |
|
"learning_rate": 5.766055133236513e-05, |
|
"loss": 0.142, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.26869770884513855, |
|
"learning_rate": 5.7568416367207404e-05, |
|
"loss": 0.0774, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.39681994915008545, |
|
"learning_rate": 5.7476255093020326e-05, |
|
"loss": 0.0632, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.14335761964321136, |
|
"learning_rate": 5.7384067830171274e-05, |
|
"loss": 0.03, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.1377771943807602, |
|
"learning_rate": 5.729185489911797e-05, |
|
"loss": 0.0263, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.19834232330322266, |
|
"learning_rate": 5.719961662040733e-05, |
|
"loss": 0.0506, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.14378659427165985, |
|
"learning_rate": 5.710735331467444e-05, |
|
"loss": 0.0285, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.25368401408195496, |
|
"learning_rate": 5.701506530264132e-05, |
|
"loss": 0.0584, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.12339203804731369, |
|
"learning_rate": 5.692275290511592e-05, |
|
"loss": 0.0282, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.203715518116951, |
|
"learning_rate": 5.683041644299093e-05, |
|
"loss": 0.0849, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.1526814103126526, |
|
"learning_rate": 5.673805623724272e-05, |
|
"loss": 0.0256, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.18840323388576508, |
|
"learning_rate": 5.664567260893019e-05, |
|
"loss": 0.048, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.15979206562042236, |
|
"learning_rate": 5.6553265879193606e-05, |
|
"loss": 0.0237, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.1128401905298233, |
|
"learning_rate": 5.6460836369253624e-05, |
|
"loss": 0.0213, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.1648949831724167, |
|
"learning_rate": 5.6368384400410035e-05, |
|
"loss": 0.0348, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.31091129779815674, |
|
"learning_rate": 5.627591029404071e-05, |
|
"loss": 0.0685, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.2921251654624939, |
|
"learning_rate": 5.6183414371600496e-05, |
|
"loss": 0.045, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.3398689925670624, |
|
"learning_rate": 5.609089695462002e-05, |
|
"loss": 0.0546, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.21610289812088013, |
|
"learning_rate": 5.599835836470469e-05, |
|
"loss": 0.0322, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.3218781054019928, |
|
"learning_rate": 5.5905798923533484e-05, |
|
"loss": 0.0331, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.5338783860206604, |
|
"learning_rate": 5.581321895285787e-05, |
|
"loss": 0.0764, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.2539553940296173, |
|
"learning_rate": 5.5720618774500675e-05, |
|
"loss": 0.0553, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.22199298441410065, |
|
"learning_rate": 5.5627998710354957e-05, |
|
"loss": 0.0304, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.13213643431663513, |
|
"learning_rate": 5.5535359082382944e-05, |
|
"loss": 0.0115, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.3686007857322693, |
|
"learning_rate": 5.544270021261483e-05, |
|
"loss": 0.0371, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.08815140277147293, |
|
"learning_rate": 5.535002242314772e-05, |
|
"loss": 0.0089, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.40321916341781616, |
|
"learning_rate": 5.525732603614444e-05, |
|
"loss": 0.0653, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.44097116589546204, |
|
"learning_rate": 5.5164611373832544e-05, |
|
"loss": 0.0555, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.5565125942230225, |
|
"learning_rate": 5.5071878758503046e-05, |
|
"loss": 0.0646, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.8579866290092468, |
|
"learning_rate": 5.49791285125094e-05, |
|
"loss": 0.1532, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.550639271736145, |
|
"learning_rate": 5.488636095826636e-05, |
|
"loss": 0.0574, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.07725897431373596, |
|
"learning_rate": 5.479357641824877e-05, |
|
"loss": 0.0087, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.25981655716896057, |
|
"learning_rate": 5.470077521499063e-05, |
|
"loss": 0.0328, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.07465404272079468, |
|
"learning_rate": 5.4607957671083786e-05, |
|
"loss": 0.0117, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.22613628208637238, |
|
"learning_rate": 5.4515124109176904e-05, |
|
"loss": 0.0596, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.20493067800998688, |
|
"learning_rate": 5.442227485197435e-05, |
|
"loss": 0.0394, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.2182394117116928, |
|
"learning_rate": 5.4329410222235034e-05, |
|
"loss": 0.0491, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.16270771622657776, |
|
"learning_rate": 5.42365305427713e-05, |
|
"loss": 0.0333, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.3527982234954834, |
|
"learning_rate": 5.414363613644782e-05, |
|
"loss": 0.1369, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.38832610845565796, |
|
"learning_rate": 5.405072732618043e-05, |
|
"loss": 0.0719, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.18581318855285645, |
|
"learning_rate": 5.395780443493508e-05, |
|
"loss": 0.0305, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.298115611076355, |
|
"learning_rate": 5.386486778572665e-05, |
|
"loss": 0.0676, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.15877433121204376, |
|
"learning_rate": 5.3771917701617827e-05, |
|
"loss": 0.0343, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.14393776655197144, |
|
"learning_rate": 5.367895450571801e-05, |
|
"loss": 0.0395, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.25177934765815735, |
|
"learning_rate": 5.358597852118219e-05, |
|
"loss": 0.0757, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.16125288605690002, |
|
"learning_rate": 5.3492990071209806e-05, |
|
"loss": 0.0432, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.21766537427902222, |
|
"learning_rate": 5.3399989479043624e-05, |
|
"loss": 0.087, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.18221743404865265, |
|
"learning_rate": 5.3306977067968614e-05, |
|
"loss": 0.0403, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.2517869472503662, |
|
"learning_rate": 5.3213953161310825e-05, |
|
"loss": 0.0666, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.20983122289180756, |
|
"learning_rate": 5.3120918082436314e-05, |
|
"loss": 0.0664, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.17914025485515594, |
|
"learning_rate": 5.3027872154749915e-05, |
|
"loss": 0.0382, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.12290598452091217, |
|
"learning_rate": 5.2934815701694204e-05, |
|
"loss": 0.0282, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.17699532210826874, |
|
"learning_rate": 5.2841749046748345e-05, |
|
"loss": 0.0413, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.3889511227607727, |
|
"learning_rate": 5.274867251342694e-05, |
|
"loss": 0.0758, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.24286973476409912, |
|
"learning_rate": 5.2655586425278966e-05, |
|
"loss": 0.0532, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.14911137521266937, |
|
"learning_rate": 5.256249110588659e-05, |
|
"loss": 0.0277, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.31271466612815857, |
|
"learning_rate": 5.246938687886409e-05, |
|
"loss": 0.0726, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.2684333920478821, |
|
"learning_rate": 5.237627406785667e-05, |
|
"loss": 0.0993, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.14797139167785645, |
|
"learning_rate": 5.228315299653942e-05, |
|
"loss": 0.0198, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.29548555612564087, |
|
"learning_rate": 5.2190023988616113e-05, |
|
"loss": 0.0562, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.29321712255477905, |
|
"learning_rate": 5.2096887367818105e-05, |
|
"loss": 0.1208, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.18331380188465118, |
|
"learning_rate": 5.2003743457903256e-05, |
|
"loss": 0.0256, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.21740898489952087, |
|
"learning_rate": 5.1910592582654715e-05, |
|
"loss": 0.057, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.2625051736831665, |
|
"learning_rate": 5.181743506587989e-05, |
|
"loss": 0.0667, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.2670525908470154, |
|
"learning_rate": 5.172427123140923e-05, |
|
"loss": 0.0883, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.33282265067100525, |
|
"learning_rate": 5.1631101403095184e-05, |
|
"loss": 0.0424, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.21608753502368927, |
|
"learning_rate": 5.1537925904811004e-05, |
|
"loss": 0.049, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.10450909286737442, |
|
"learning_rate": 5.144474506044968e-05, |
|
"loss": 0.0158, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.3188491761684418, |
|
"learning_rate": 5.135155919392279e-05, |
|
"loss": 0.0547, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.24398969113826752, |
|
"learning_rate": 5.125836862915934e-05, |
|
"loss": 0.053, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.1743936687707901, |
|
"learning_rate": 5.116517369010466e-05, |
|
"loss": 0.0239, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.180791437625885, |
|
"learning_rate": 5.1071974700719326e-05, |
|
"loss": 0.0864, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.19678902626037598, |
|
"learning_rate": 5.0978771984978003e-05, |
|
"loss": 0.0376, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.230797678232193, |
|
"learning_rate": 5.0885565866868227e-05, |
|
"loss": 0.0597, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.4890972971916199, |
|
"learning_rate": 5.079235667038944e-05, |
|
"loss": 0.0832, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.20508797466754913, |
|
"learning_rate": 5.069914471955178e-05, |
|
"loss": 0.0349, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.21593628823757172, |
|
"learning_rate": 5.060593033837493e-05, |
|
"loss": 0.0354, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.2712628245353699, |
|
"learning_rate": 5.051271385088702e-05, |
|
"loss": 0.0311, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.11844774335622787, |
|
"learning_rate": 5.041949558112351e-05, |
|
"loss": 0.0109, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.1798882633447647, |
|
"learning_rate": 5.032627585312608e-05, |
|
"loss": 0.0196, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.290019690990448, |
|
"learning_rate": 5.023305499094144e-05, |
|
"loss": 0.0667, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.24924272298812866, |
|
"learning_rate": 5.013983331862027e-05, |
|
"loss": 0.0556, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.22597135603427887, |
|
"learning_rate": 5.004661116021605e-05, |
|
"loss": 0.0495, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"eval_loss": 0.07545028626918793, |
|
"eval_runtime": 14.6561, |
|
"eval_samples_per_second": 32.546, |
|
"eval_steps_per_second": 8.188, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.15348747372627258, |
|
"learning_rate": 4.9953388839783954e-05, |
|
"loss": 0.0204, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.6507572531700134, |
|
"learning_rate": 4.9860166681379745e-05, |
|
"loss": 0.076, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.13102935254573822, |
|
"learning_rate": 4.976694500905857e-05, |
|
"loss": 0.0143, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.43004414439201355, |
|
"learning_rate": 4.967372414687393e-05, |
|
"loss": 0.0675, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.38339918851852417, |
|
"learning_rate": 4.95805044188765e-05, |
|
"loss": 0.0747, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.4646240472793579, |
|
"learning_rate": 4.9487286149112986e-05, |
|
"loss": 0.0883, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.2721651792526245, |
|
"learning_rate": 4.9394069661625076e-05, |
|
"loss": 0.062, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.356275349855423, |
|
"learning_rate": 4.930085528044823e-05, |
|
"loss": 0.0321, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.3066048324108124, |
|
"learning_rate": 4.9207643329610556e-05, |
|
"loss": 0.0525, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.12254035472869873, |
|
"learning_rate": 4.911443413313179e-05, |
|
"loss": 0.0106, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.19596797227859497, |
|
"learning_rate": 4.9021228015022015e-05, |
|
"loss": 0.0242, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.39066678285598755, |
|
"learning_rate": 4.892802529928067e-05, |
|
"loss": 0.0558, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.17401085793972015, |
|
"learning_rate": 4.883482630989535e-05, |
|
"loss": 0.0203, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.28903472423553467, |
|
"learning_rate": 4.874163137084068e-05, |
|
"loss": 0.0819, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.2746363580226898, |
|
"learning_rate": 4.8648440806077226e-05, |
|
"loss": 0.0696, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.40534642338752747, |
|
"learning_rate": 4.8555254939550324e-05, |
|
"loss": 0.111, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.41272208094596863, |
|
"learning_rate": 4.8462074095188994e-05, |
|
"loss": 0.1089, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.40718454122543335, |
|
"learning_rate": 4.8368898596904834e-05, |
|
"loss": 0.1339, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.14493143558502197, |
|
"learning_rate": 4.827572876859078e-05, |
|
"loss": 0.0227, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.20390640199184418, |
|
"learning_rate": 4.8182564934120115e-05, |
|
"loss": 0.0464, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.12927311658859253, |
|
"learning_rate": 4.80894074173453e-05, |
|
"loss": 0.0253, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.17653903365135193, |
|
"learning_rate": 4.799625654209675e-05, |
|
"loss": 0.0509, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.34687289595603943, |
|
"learning_rate": 4.790311263218191e-05, |
|
"loss": 0.0916, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.22851605713367462, |
|
"learning_rate": 4.7809976011383905e-05, |
|
"loss": 0.0857, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.27485382556915283, |
|
"learning_rate": 4.771684700346059e-05, |
|
"loss": 0.0623, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.21887461841106415, |
|
"learning_rate": 4.762372593214335e-05, |
|
"loss": 0.0573, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.2466115951538086, |
|
"learning_rate": 4.753061312113592e-05, |
|
"loss": 0.1039, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.344625860452652, |
|
"learning_rate": 4.743750889411342e-05, |
|
"loss": 0.0637, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.1676146686077118, |
|
"learning_rate": 4.7344413574721046e-05, |
|
"loss": 0.0372, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.14225785434246063, |
|
"learning_rate": 4.725132748657307e-05, |
|
"loss": 0.0506, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.19915729761123657, |
|
"learning_rate": 4.715825095325168e-05, |
|
"loss": 0.0459, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.20955249667167664, |
|
"learning_rate": 4.70651842983058e-05, |
|
"loss": 0.0539, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.171535924077034, |
|
"learning_rate": 4.697212784525008e-05, |
|
"loss": 0.0346, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.11981090158224106, |
|
"learning_rate": 4.687908191756369e-05, |
|
"loss": 0.0378, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.18210795521736145, |
|
"learning_rate": 4.678604683868918e-05, |
|
"loss": 0.0563, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.18884742259979248, |
|
"learning_rate": 4.669302293203142e-05, |
|
"loss": 0.0393, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.21338443458080292, |
|
"learning_rate": 4.660001052095639e-05, |
|
"loss": 0.054, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.16022799909114838, |
|
"learning_rate": 4.65070099287902e-05, |
|
"loss": 0.0497, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.30642077326774597, |
|
"learning_rate": 4.641402147881782e-05, |
|
"loss": 0.0702, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.24659690260887146, |
|
"learning_rate": 4.6321045494282e-05, |
|
"loss": 0.0986, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.4151371419429779, |
|
"learning_rate": 4.62280822983822e-05, |
|
"loss": 0.1064, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.19555744528770447, |
|
"learning_rate": 4.613513221427337e-05, |
|
"loss": 0.034, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.3575385510921478, |
|
"learning_rate": 4.604219556506492e-05, |
|
"loss": 0.0563, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.33982524275779724, |
|
"learning_rate": 4.594927267381958e-05, |
|
"loss": 0.1152, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.40054503083229065, |
|
"learning_rate": 4.58563638635522e-05, |
|
"loss": 0.0684, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.16741478443145752, |
|
"learning_rate": 4.5763469457228695e-05, |
|
"loss": 0.0221, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.30603042244911194, |
|
"learning_rate": 4.5670589777764984e-05, |
|
"loss": 0.0725, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.345217227935791, |
|
"learning_rate": 4.5577725148025646e-05, |
|
"loss": 0.062, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.4248473048210144, |
|
"learning_rate": 4.54848758908231e-05, |
|
"loss": 0.1482, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.25596097111701965, |
|
"learning_rate": 4.5392042328916226e-05, |
|
"loss": 0.0417, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.14021873474121094, |
|
"learning_rate": 4.5299224785009374e-05, |
|
"loss": 0.0242, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.16665437817573547, |
|
"learning_rate": 4.5206423581751245e-05, |
|
"loss": 0.0569, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.29362550377845764, |
|
"learning_rate": 4.511363904173366e-05, |
|
"loss": 0.068, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.44577184319496155, |
|
"learning_rate": 4.5020871487490604e-05, |
|
"loss": 0.0787, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.20594125986099243, |
|
"learning_rate": 4.492812124149696e-05, |
|
"loss": 0.0868, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.2906559109687805, |
|
"learning_rate": 4.483538862616747e-05, |
|
"loss": 0.0592, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.17545486986637115, |
|
"learning_rate": 4.4742673963855576e-05, |
|
"loss": 0.0225, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.18305286765098572, |
|
"learning_rate": 4.46499775768523e-05, |
|
"loss": 0.0483, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.2249644249677658, |
|
"learning_rate": 4.455729978738517e-05, |
|
"loss": 0.0383, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.3094448149204254, |
|
"learning_rate": 4.446464091761706e-05, |
|
"loss": 0.0533, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.22453933954238892, |
|
"learning_rate": 4.437200128964504e-05, |
|
"loss": 0.0435, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.1814616322517395, |
|
"learning_rate": 4.4279381225499344e-05, |
|
"loss": 0.0245, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.20599542558193207, |
|
"learning_rate": 4.418678104714214e-05, |
|
"loss": 0.0321, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.27197298407554626, |
|
"learning_rate": 4.409420107646652e-05, |
|
"loss": 0.0512, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.35009968280792236, |
|
"learning_rate": 4.400164163529532e-05, |
|
"loss": 0.0717, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.17196977138519287, |
|
"learning_rate": 4.390910304537999e-05, |
|
"loss": 0.033, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.1884760707616806, |
|
"learning_rate": 4.381658562839953e-05, |
|
"loss": 0.0526, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.4165942966938019, |
|
"learning_rate": 4.3724089705959305e-05, |
|
"loss": 0.0824, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.36213231086730957, |
|
"learning_rate": 4.363161559958996e-05, |
|
"loss": 0.0524, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.22675907611846924, |
|
"learning_rate": 4.353916363074638e-05, |
|
"loss": 0.0367, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.29561713337898254, |
|
"learning_rate": 4.34467341208064e-05, |
|
"loss": 0.0364, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.22054970264434814, |
|
"learning_rate": 4.3354327391069826e-05, |
|
"loss": 0.025, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.06693907827138901, |
|
"learning_rate": 4.3261943762757287e-05, |
|
"loss": 0.0104, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.339631050825119, |
|
"learning_rate": 4.3169583557009064e-05, |
|
"loss": 0.0732, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.2175193578004837, |
|
"learning_rate": 4.307724709488409e-05, |
|
"loss": 0.0464, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.23093104362487793, |
|
"learning_rate": 4.298493469735869e-05, |
|
"loss": 0.0335, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.38875579833984375, |
|
"learning_rate": 4.289264668532557e-05, |
|
"loss": 0.0327, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.05294647812843323, |
|
"learning_rate": 4.280038337959268e-05, |
|
"loss": 0.007, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.36184802651405334, |
|
"learning_rate": 4.270814510088203e-05, |
|
"loss": 0.0688, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.531517744064331, |
|
"learning_rate": 4.2615932169828744e-05, |
|
"loss": 0.1305, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.3471108376979828, |
|
"learning_rate": 4.2523744906979686e-05, |
|
"loss": 0.0236, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.2624709904193878, |
|
"learning_rate": 4.24315836327926e-05, |
|
"loss": 0.0272, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.3648707866668701, |
|
"learning_rate": 4.233944866763489e-05, |
|
"loss": 0.0384, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.2199166864156723, |
|
"learning_rate": 4.224734033178241e-05, |
|
"loss": 0.0347, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.44493308663368225, |
|
"learning_rate": 4.2155258945418566e-05, |
|
"loss": 0.0405, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.4102453291416168, |
|
"learning_rate": 4.206320482863301e-05, |
|
"loss": 0.0849, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.33510318398475647, |
|
"learning_rate": 4.1971178301420613e-05, |
|
"loss": 0.052, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.40965744853019714, |
|
"learning_rate": 4.187917968368036e-05, |
|
"loss": 0.0848, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.2755095064640045, |
|
"learning_rate": 4.178720929521418e-05, |
|
"loss": 0.0391, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.32818931341171265, |
|
"learning_rate": 4.16952674557259e-05, |
|
"loss": 0.0327, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.39538365602493286, |
|
"learning_rate": 4.1603354484820134e-05, |
|
"loss": 0.043, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.2221785932779312, |
|
"learning_rate": 4.1511470702001074e-05, |
|
"loss": 0.0288, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.2112448811531067, |
|
"learning_rate": 4.141961642667152e-05, |
|
"loss": 0.025, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.15548011660575867, |
|
"learning_rate": 4.132779197813164e-05, |
|
"loss": 0.0694, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.10790842771530151, |
|
"learning_rate": 4.1235997675577956e-05, |
|
"loss": 0.0124, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.42269936203956604, |
|
"learning_rate": 4.11442338381022e-05, |
|
"loss": 0.0937, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.36002832651138306, |
|
"learning_rate": 4.105250078469018e-05, |
|
"loss": 0.1242, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.22437407076358795, |
|
"learning_rate": 4.0960798834220704e-05, |
|
"loss": 0.0638, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.3967968225479126, |
|
"learning_rate": 4.086912830546448e-05, |
|
"loss": 0.0439, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.20550177991390228, |
|
"learning_rate": 4.077748951708292e-05, |
|
"loss": 0.0347, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.2020653337240219, |
|
"learning_rate": 4.068588278762723e-05, |
|
"loss": 0.0376, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.19614431262016296, |
|
"learning_rate": 4.0594308435537024e-05, |
|
"loss": 0.032, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.08721073716878891, |
|
"learning_rate": 4.0502766779139484e-05, |
|
"loss": 0.012, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.2303171306848526, |
|
"learning_rate": 4.041125813664808e-05, |
|
"loss": 0.0268, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.2725672721862793, |
|
"learning_rate": 4.031978282616151e-05, |
|
"loss": 0.0413, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.28184202313423157, |
|
"learning_rate": 4.0228341165662685e-05, |
|
"loss": 0.0383, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.28514358401298523, |
|
"learning_rate": 4.0136933473017407e-05, |
|
"loss": 0.044, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.374714732170105, |
|
"learning_rate": 4.004556006597353e-05, |
|
"loss": 0.044, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.19936969876289368, |
|
"learning_rate": 3.9954221262159674e-05, |
|
"loss": 0.0334, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.3212338984012604, |
|
"learning_rate": 3.986291737908414e-05, |
|
"loss": 0.0473, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.28045183420181274, |
|
"learning_rate": 3.9771648734133906e-05, |
|
"loss": 0.0321, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.34167036414146423, |
|
"learning_rate": 3.968041564457342e-05, |
|
"loss": 0.0696, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.5529135465621948, |
|
"learning_rate": 3.958921842754351e-05, |
|
"loss": 0.131, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.275803804397583, |
|
"learning_rate": 3.949805740006036e-05, |
|
"loss": 0.0436, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.3003288209438324, |
|
"learning_rate": 3.94069328790143e-05, |
|
"loss": 0.073, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.20078504085540771, |
|
"learning_rate": 3.9315845181168784e-05, |
|
"loss": 0.0425, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.3904169797897339, |
|
"learning_rate": 3.9224794623159294e-05, |
|
"loss": 0.0668, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.212997168302536, |
|
"learning_rate": 3.913378152149214e-05, |
|
"loss": 0.0436, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.04039880260825157, |
|
"learning_rate": 3.904280619254348e-05, |
|
"loss": 0.0077, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.21076536178588867, |
|
"learning_rate": 3.895186895255814e-05, |
|
"loss": 0.0677, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.37436169385910034, |
|
"learning_rate": 3.886097011764856e-05, |
|
"loss": 0.0294, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.26611942052841187, |
|
"learning_rate": 3.877011000379367e-05, |
|
"loss": 0.057, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.32198566198349, |
|
"learning_rate": 3.8679288926837804e-05, |
|
"loss": 0.0583, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.2785477340221405, |
|
"learning_rate": 3.8588507202489586e-05, |
|
"loss": 0.0913, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.20920749008655548, |
|
"learning_rate": 3.8497765146320876e-05, |
|
"loss": 0.0454, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.31738653779029846, |
|
"learning_rate": 3.840706307376557e-05, |
|
"loss": 0.0464, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.1887190192937851, |
|
"learning_rate": 3.8316401300118675e-05, |
|
"loss": 0.026, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.22016988694667816, |
|
"learning_rate": 3.8225780140535025e-05, |
|
"loss": 0.0375, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.2261650264263153, |
|
"learning_rate": 3.813519991002831e-05, |
|
"loss": 0.0368, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.3108493983745575, |
|
"learning_rate": 3.804466092346997e-05, |
|
"loss": 0.0539, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.23392857611179352, |
|
"learning_rate": 3.7954163495587995e-05, |
|
"loss": 0.0363, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.23699642717838287, |
|
"learning_rate": 3.786370794096603e-05, |
|
"loss": 0.0362, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.29672032594680786, |
|
"learning_rate": 3.777329457404202e-05, |
|
"loss": 0.0388, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.25258907675743103, |
|
"learning_rate": 3.768292370910737e-05, |
|
"loss": 0.0278, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.40179169178009033, |
|
"learning_rate": 3.759259566030571e-05, |
|
"loss": 0.1118, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.3777885437011719, |
|
"learning_rate": 3.750231074163179e-05, |
|
"loss": 0.073, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.39896661043167114, |
|
"learning_rate": 3.7412069266930516e-05, |
|
"loss": 0.0428, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.26577284932136536, |
|
"learning_rate": 3.7321871549895714e-05, |
|
"loss": 0.0335, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.5360684990882874, |
|
"learning_rate": 3.7231717904069094e-05, |
|
"loss": 0.0979, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.21000511944293976, |
|
"learning_rate": 3.714160864283923e-05, |
|
"loss": 0.0156, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.11425631493330002, |
|
"learning_rate": 3.7051544079440336e-05, |
|
"loss": 0.0143, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.20283763110637665, |
|
"learning_rate": 3.696152452695128e-05, |
|
"loss": 0.0627, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"eval_loss": 0.07780980318784714, |
|
"eval_runtime": 14.6775, |
|
"eval_samples_per_second": 32.499, |
|
"eval_steps_per_second": 8.176, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.13721764087677002, |
|
"learning_rate": 3.68715502982945e-05, |
|
"loss": 0.0154, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.397158145904541, |
|
"learning_rate": 3.678162170623481e-05, |
|
"loss": 0.0494, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.47730910778045654, |
|
"learning_rate": 3.669173906337846e-05, |
|
"loss": 0.0897, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.19200514256954193, |
|
"learning_rate": 3.6601902682171894e-05, |
|
"loss": 0.0145, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.39415818452835083, |
|
"learning_rate": 3.65121128749008e-05, |
|
"loss": 0.0778, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.17673304677009583, |
|
"learning_rate": 3.642236995368897e-05, |
|
"loss": 0.0211, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.3190731108188629, |
|
"learning_rate": 3.633267423049717e-05, |
|
"loss": 0.0856, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.3833164572715759, |
|
"learning_rate": 3.624302601712213e-05, |
|
"loss": 0.0687, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.2938999831676483, |
|
"learning_rate": 3.6153425625195425e-05, |
|
"loss": 0.0717, |
|
"step": 1003 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.2645472586154938, |
|
"learning_rate": 3.606387336618237e-05, |
|
"loss": 0.0341, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.1330021619796753, |
|
"learning_rate": 3.597436955138102e-05, |
|
"loss": 0.0244, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.34308159351348877, |
|
"learning_rate": 3.588491449192096e-05, |
|
"loss": 0.091, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.2861696481704712, |
|
"learning_rate": 3.579550849876233e-05, |
|
"loss": 0.0601, |
|
"step": 1007 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.32016104459762573, |
|
"learning_rate": 3.570615188269473e-05, |
|
"loss": 0.0699, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.2865599989891052, |
|
"learning_rate": 3.561684495433605e-05, |
|
"loss": 0.0742, |
|
"step": 1009 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.2045123279094696, |
|
"learning_rate": 3.5527588024131544e-05, |
|
"loss": 0.0323, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.13562075793743134, |
|
"learning_rate": 3.5438381402352574e-05, |
|
"loss": 0.0167, |
|
"step": 1011 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.2536921799182892, |
|
"learning_rate": 3.534922539909569e-05, |
|
"loss": 0.047, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.193417489528656, |
|
"learning_rate": 3.5260120324281474e-05, |
|
"loss": 0.0349, |
|
"step": 1013 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.18863309919834137, |
|
"learning_rate": 3.517106648765343e-05, |
|
"loss": 0.0261, |
|
"step": 1014 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.5035936832427979, |
|
"learning_rate": 3.5082064198777e-05, |
|
"loss": 0.0963, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.33512285351753235, |
|
"learning_rate": 3.499311376703842e-05, |
|
"loss": 0.0534, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.14217574894428253, |
|
"learning_rate": 3.4904215501643646e-05, |
|
"loss": 0.0246, |
|
"step": 1017 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.3124421536922455, |
|
"learning_rate": 3.4815369711617316e-05, |
|
"loss": 0.0498, |
|
"step": 1018 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.22936655580997467, |
|
"learning_rate": 3.4726576705801636e-05, |
|
"loss": 0.0249, |
|
"step": 1019 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.5534436106681824, |
|
"learning_rate": 3.463783679285535e-05, |
|
"loss": 0.1696, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.3127197027206421, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 0.045, |
|
"step": 1021 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.13819609582424164, |
|
"learning_rate": 3.446051747928202e-05, |
|
"loss": 0.0203, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.35015806555747986, |
|
"learning_rate": 3.4371938695045346e-05, |
|
"loss": 0.0608, |
|
"step": 1023 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.3767643868923187, |
|
"learning_rate": 3.428341423645668e-05, |
|
"loss": 0.0686, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.3282710611820221, |
|
"learning_rate": 3.419494441124121e-05, |
|
"loss": 0.0697, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.24474768340587616, |
|
"learning_rate": 3.4106529526934306e-05, |
|
"loss": 0.0583, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.24781620502471924, |
|
"learning_rate": 3.4018169890880225e-05, |
|
"loss": 0.0327, |
|
"step": 1027 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.2474932074546814, |
|
"learning_rate": 3.392986581023126e-05, |
|
"loss": 0.0679, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.39474180340766907, |
|
"learning_rate": 3.384161759194658e-05, |
|
"loss": 0.0713, |
|
"step": 1029 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.13963012397289276, |
|
"learning_rate": 3.375342554279111e-05, |
|
"loss": 0.0179, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.32144859433174133, |
|
"learning_rate": 3.3665289969334585e-05, |
|
"loss": 0.0447, |
|
"step": 1031 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.14767055213451385, |
|
"learning_rate": 3.3577211177950385e-05, |
|
"loss": 0.017, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.3088414967060089, |
|
"learning_rate": 3.348918947481452e-05, |
|
"loss": 0.0483, |
|
"step": 1033 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.33189231157302856, |
|
"learning_rate": 3.340122516590456e-05, |
|
"loss": 0.0382, |
|
"step": 1034 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.4593893885612488, |
|
"learning_rate": 3.3313318556998526e-05, |
|
"loss": 0.0523, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.253412127494812, |
|
"learning_rate": 3.322546995367394e-05, |
|
"loss": 0.0212, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.42397648096084595, |
|
"learning_rate": 3.3137679661306576e-05, |
|
"loss": 0.087, |
|
"step": 1037 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.40718910098075867, |
|
"learning_rate": 3.3049947985069616e-05, |
|
"loss": 0.0965, |
|
"step": 1038 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.3604757487773895, |
|
"learning_rate": 3.2962275229932446e-05, |
|
"loss": 0.0973, |
|
"step": 1039 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.2129022628068924, |
|
"learning_rate": 3.287466170065959e-05, |
|
"loss": 0.0384, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.20552971959114075, |
|
"learning_rate": 3.2787107701809754e-05, |
|
"loss": 0.0305, |
|
"step": 1041 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.41909754276275635, |
|
"learning_rate": 3.269961353773469e-05, |
|
"loss": 0.0725, |
|
"step": 1042 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.2766873240470886, |
|
"learning_rate": 3.261217951257813e-05, |
|
"loss": 0.0649, |
|
"step": 1043 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.1571783572435379, |
|
"learning_rate": 3.252480593027478e-05, |
|
"loss": 0.0238, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.2809221148490906, |
|
"learning_rate": 3.243749309454922e-05, |
|
"loss": 0.0613, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.4999224841594696, |
|
"learning_rate": 3.235024130891487e-05, |
|
"loss": 0.0806, |
|
"step": 1046 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.1895889937877655, |
|
"learning_rate": 3.226305087667295e-05, |
|
"loss": 0.026, |
|
"step": 1047 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.220509871840477, |
|
"learning_rate": 3.217592210091137e-05, |
|
"loss": 0.0681, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.19226831197738647, |
|
"learning_rate": 3.208885528450376e-05, |
|
"loss": 0.0232, |
|
"step": 1049 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.30779534578323364, |
|
"learning_rate": 3.200185073010831e-05, |
|
"loss": 0.0547, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.16252338886260986, |
|
"learning_rate": 3.1914908740166795e-05, |
|
"loss": 0.0237, |
|
"step": 1051 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.4130539000034332, |
|
"learning_rate": 3.182802961690357e-05, |
|
"loss": 0.0437, |
|
"step": 1052 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.20889542996883392, |
|
"learning_rate": 3.1741213662324365e-05, |
|
"loss": 0.0493, |
|
"step": 1053 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.27447709441185, |
|
"learning_rate": 3.165446117821538e-05, |
|
"loss": 0.0859, |
|
"step": 1054 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.07740370184183121, |
|
"learning_rate": 3.1567772466142156e-05, |
|
"loss": 0.011, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.1462429016828537, |
|
"learning_rate": 3.148114782744855e-05, |
|
"loss": 0.0228, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.34325939416885376, |
|
"learning_rate": 3.139458756325576e-05, |
|
"loss": 0.0928, |
|
"step": 1057 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.2712673246860504, |
|
"learning_rate": 3.130809197446106e-05, |
|
"loss": 0.0408, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.24180567264556885, |
|
"learning_rate": 3.122166136173706e-05, |
|
"loss": 0.0309, |
|
"step": 1059 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.2045336663722992, |
|
"learning_rate": 3.113529602553042e-05, |
|
"loss": 0.0391, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.09178590029478073, |
|
"learning_rate": 3.104899626606088e-05, |
|
"loss": 0.0132, |
|
"step": 1061 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.300592303276062, |
|
"learning_rate": 3.0962762383320285e-05, |
|
"loss": 0.0787, |
|
"step": 1062 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.31034108996391296, |
|
"learning_rate": 3.08765946770714e-05, |
|
"loss": 0.0307, |
|
"step": 1063 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.1603342443704605, |
|
"learning_rate": 3.0790493446847024e-05, |
|
"loss": 0.0209, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.23990066349506378, |
|
"learning_rate": 3.070445899194885e-05, |
|
"loss": 0.0361, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.15933218598365784, |
|
"learning_rate": 3.061849161144641e-05, |
|
"loss": 0.0137, |
|
"step": 1066 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.4810096025466919, |
|
"learning_rate": 3.053259160417613e-05, |
|
"loss": 0.1255, |
|
"step": 1067 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.4099353849887848, |
|
"learning_rate": 3.0446759268740233e-05, |
|
"loss": 0.0718, |
|
"step": 1068 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.39221569895744324, |
|
"learning_rate": 3.0360994903505653e-05, |
|
"loss": 0.0721, |
|
"step": 1069 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.48040878772735596, |
|
"learning_rate": 3.02752988066031e-05, |
|
"loss": 0.1077, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.1548435539007187, |
|
"learning_rate": 3.018967127592595e-05, |
|
"loss": 0.0273, |
|
"step": 1071 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.37816908955574036, |
|
"learning_rate": 3.010411260912922e-05, |
|
"loss": 0.0421, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.2902359366416931, |
|
"learning_rate": 3.0018623103628596e-05, |
|
"loss": 0.0645, |
|
"step": 1073 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.32010090351104736, |
|
"learning_rate": 2.9933203056599275e-05, |
|
"loss": 0.0444, |
|
"step": 1074 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.47379517555236816, |
|
"learning_rate": 2.984785276497507e-05, |
|
"loss": 0.0423, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.3198046088218689, |
|
"learning_rate": 2.9762572525447262e-05, |
|
"loss": 0.0678, |
|
"step": 1076 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.33741793036460876, |
|
"learning_rate": 2.9677362634463647e-05, |
|
"loss": 0.0643, |
|
"step": 1077 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.2264060080051422, |
|
"learning_rate": 2.9592223388227503e-05, |
|
"loss": 0.0275, |
|
"step": 1078 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.3069595694541931, |
|
"learning_rate": 2.9507155082696482e-05, |
|
"loss": 0.0481, |
|
"step": 1079 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.26178085803985596, |
|
"learning_rate": 2.9422158013581658e-05, |
|
"loss": 0.0298, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.4191998243331909, |
|
"learning_rate": 2.93372324763465e-05, |
|
"loss": 0.1156, |
|
"step": 1081 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.21308496594429016, |
|
"learning_rate": 2.9252378766205758e-05, |
|
"loss": 0.0478, |
|
"step": 1082 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.20033082365989685, |
|
"learning_rate": 2.9167597178124585e-05, |
|
"loss": 0.0262, |
|
"step": 1083 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.21688255667686462, |
|
"learning_rate": 2.9082888006817365e-05, |
|
"loss": 0.0767, |
|
"step": 1084 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.300791472196579, |
|
"learning_rate": 2.899825154674674e-05, |
|
"loss": 0.0464, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.2416476011276245, |
|
"learning_rate": 2.8913688092122664e-05, |
|
"loss": 0.0605, |
|
"step": 1086 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.2521096169948578, |
|
"learning_rate": 2.8829197936901232e-05, |
|
"loss": 0.0293, |
|
"step": 1087 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.3229115605354309, |
|
"learning_rate": 2.8744781374783813e-05, |
|
"loss": 0.0435, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.0780615508556366, |
|
"learning_rate": 2.8660438699215898e-05, |
|
"loss": 0.0109, |
|
"step": 1089 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.2879962623119354, |
|
"learning_rate": 2.8576170203386143e-05, |
|
"loss": 0.0665, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.19985683262348175, |
|
"learning_rate": 2.8491976180225388e-05, |
|
"loss": 0.0378, |
|
"step": 1091 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.15360887348651886, |
|
"learning_rate": 2.840785692240553e-05, |
|
"loss": 0.0257, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.19224387407302856, |
|
"learning_rate": 2.832381272233864e-05, |
|
"loss": 0.0529, |
|
"step": 1093 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.2037738561630249, |
|
"learning_rate": 2.8239843872175814e-05, |
|
"loss": 0.0228, |
|
"step": 1094 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.380874902009964, |
|
"learning_rate": 2.8155950663806235e-05, |
|
"loss": 0.0525, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.28522253036499023, |
|
"learning_rate": 2.8072133388856192e-05, |
|
"loss": 0.0615, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.1098146066069603, |
|
"learning_rate": 2.7988392338687926e-05, |
|
"loss": 0.0159, |
|
"step": 1097 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.2615334987640381, |
|
"learning_rate": 2.7904727804398812e-05, |
|
"loss": 0.0353, |
|
"step": 1098 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.2041955441236496, |
|
"learning_rate": 2.7821140076820162e-05, |
|
"loss": 0.0187, |
|
"step": 1099 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.1846192330121994, |
|
"learning_rate": 2.773762944651632e-05, |
|
"loss": 0.0554, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.17711102962493896, |
|
"learning_rate": 2.765419620378366e-05, |
|
"loss": 0.0342, |
|
"step": 1101 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.3703756332397461, |
|
"learning_rate": 2.7570840638649486e-05, |
|
"loss": 0.0378, |
|
"step": 1102 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.4282096326351166, |
|
"learning_rate": 2.7487563040871145e-05, |
|
"loss": 0.0789, |
|
"step": 1103 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.32506605982780457, |
|
"learning_rate": 2.740436369993491e-05, |
|
"loss": 0.0337, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.250688374042511, |
|
"learning_rate": 2.7321242905055013e-05, |
|
"loss": 0.0554, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.3557257354259491, |
|
"learning_rate": 2.7238200945172698e-05, |
|
"loss": 0.0356, |
|
"step": 1106 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.3472774028778076, |
|
"learning_rate": 2.715523810895515e-05, |
|
"loss": 0.0348, |
|
"step": 1107 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.07373315095901489, |
|
"learning_rate": 2.707235468479449e-05, |
|
"loss": 0.0088, |
|
"step": 1108 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.21439437568187714, |
|
"learning_rate": 2.6989550960806768e-05, |
|
"loss": 0.0222, |
|
"step": 1109 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.2730681002140045, |
|
"learning_rate": 2.690682722483102e-05, |
|
"loss": 0.068, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.41124334931373596, |
|
"learning_rate": 2.6824183764428224e-05, |
|
"loss": 0.086, |
|
"step": 1111 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.6637737154960632, |
|
"learning_rate": 2.6741620866880335e-05, |
|
"loss": 0.0365, |
|
"step": 1112 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.425441712141037, |
|
"learning_rate": 2.665913881918921e-05, |
|
"loss": 0.095, |
|
"step": 1113 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.5520187020301819, |
|
"learning_rate": 2.6576737908075668e-05, |
|
"loss": 0.0514, |
|
"step": 1114 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.5284621119499207, |
|
"learning_rate": 2.6494418419978482e-05, |
|
"loss": 0.0593, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.08148845285177231, |
|
"learning_rate": 2.641218064105341e-05, |
|
"loss": 0.0084, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.8642109036445618, |
|
"learning_rate": 2.6330024857172192e-05, |
|
"loss": 0.0766, |
|
"step": 1117 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.40509146451950073, |
|
"learning_rate": 2.6247951353921485e-05, |
|
"loss": 0.1148, |
|
"step": 1118 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.2981242537498474, |
|
"learning_rate": 2.616596041660194e-05, |
|
"loss": 0.0666, |
|
"step": 1119 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.21514151990413666, |
|
"learning_rate": 2.6084052330227238e-05, |
|
"loss": 0.0363, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.10281267762184143, |
|
"learning_rate": 2.6002227379522992e-05, |
|
"loss": 0.0169, |
|
"step": 1121 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.3236760199069977, |
|
"learning_rate": 2.5920485848925913e-05, |
|
"loss": 0.0296, |
|
"step": 1122 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.22741632163524628, |
|
"learning_rate": 2.5838828022582594e-05, |
|
"loss": 0.023, |
|
"step": 1123 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.3826078772544861, |
|
"learning_rate": 2.5757254184348778e-05, |
|
"loss": 0.0744, |
|
"step": 1124 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.226307213306427, |
|
"learning_rate": 2.5675764617788234e-05, |
|
"loss": 0.0297, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.31913021206855774, |
|
"learning_rate": 2.5594359606171724e-05, |
|
"loss": 0.0793, |
|
"step": 1126 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.2947479486465454, |
|
"learning_rate": 2.5513039432476193e-05, |
|
"loss": 0.1363, |
|
"step": 1127 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.26046791672706604, |
|
"learning_rate": 2.5431804379383523e-05, |
|
"loss": 0.0727, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.1183793917298317, |
|
"learning_rate": 2.535065472927983e-05, |
|
"loss": 0.0139, |
|
"step": 1129 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.23370495438575745, |
|
"learning_rate": 2.526959076425434e-05, |
|
"loss": 0.0503, |
|
"step": 1130 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1695, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 565, |
|
"total_flos": 1.0339891388035891e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|