|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 200, |
|
"global_step": 692, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002890173410404624, |
|
"grad_norm": 3.413911899913347, |
|
"learning_rate": 9.999948473953725e-06, |
|
"loss": 0.0577, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005780346820809248, |
|
"grad_norm": 2.897400778584227, |
|
"learning_rate": 9.999793896876868e-06, |
|
"loss": 0.0552, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.008670520231213872, |
|
"grad_norm": 4.246403961679219, |
|
"learning_rate": 9.99953627195533e-06, |
|
"loss": 0.0695, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.011560693641618497, |
|
"grad_norm": 3.544275760472012, |
|
"learning_rate": 9.999175604498867e-06, |
|
"loss": 0.054, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.014450867052023121, |
|
"grad_norm": 6.6935720376972565, |
|
"learning_rate": 9.998711901940989e-06, |
|
"loss": 0.0875, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.017341040462427744, |
|
"grad_norm": 7.418656211317936, |
|
"learning_rate": 9.998145173838796e-06, |
|
"loss": 0.1044, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.02023121387283237, |
|
"grad_norm": 5.845467274560381, |
|
"learning_rate": 9.997475431872795e-06, |
|
"loss": 0.0894, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.023121387283236993, |
|
"grad_norm": 8.046971007820595, |
|
"learning_rate": 9.996702689846645e-06, |
|
"loss": 0.1113, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.02601156069364162, |
|
"grad_norm": 5.557087876304546, |
|
"learning_rate": 9.995826963686883e-06, |
|
"loss": 0.0899, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.028901734104046242, |
|
"grad_norm": 7.013405088920962, |
|
"learning_rate": 9.994848271442595e-06, |
|
"loss": 0.0947, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.031791907514450865, |
|
"grad_norm": 5.148671016900013, |
|
"learning_rate": 9.993766633285033e-06, |
|
"loss": 0.081, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.03468208092485549, |
|
"grad_norm": 4.719125802513109, |
|
"learning_rate": 9.992582071507217e-06, |
|
"loss": 0.0795, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03757225433526012, |
|
"grad_norm": 4.926080178684607, |
|
"learning_rate": 9.991294610523456e-06, |
|
"loss": 0.1108, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.04046242774566474, |
|
"grad_norm": 4.8632538049944705, |
|
"learning_rate": 9.989904276868865e-06, |
|
"loss": 0.118, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.04335260115606936, |
|
"grad_norm": 4.49166064777025, |
|
"learning_rate": 9.988411099198797e-06, |
|
"loss": 0.1029, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.046242774566473986, |
|
"grad_norm": 3.7145419864840066, |
|
"learning_rate": 9.986815108288273e-06, |
|
"loss": 0.0744, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.049132947976878616, |
|
"grad_norm": 4.099867309174663, |
|
"learning_rate": 9.98511633703133e-06, |
|
"loss": 0.0914, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.05202312138728324, |
|
"grad_norm": 3.437121075753801, |
|
"learning_rate": 9.98331482044036e-06, |
|
"loss": 0.0803, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.05491329479768786, |
|
"grad_norm": 3.9662109797189267, |
|
"learning_rate": 9.981410595645369e-06, |
|
"loss": 0.0788, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.057803468208092484, |
|
"grad_norm": 4.8667640721708105, |
|
"learning_rate": 9.979403701893226e-06, |
|
"loss": 0.1069, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06069364161849711, |
|
"grad_norm": 5.834375526681122, |
|
"learning_rate": 9.977294180546857e-06, |
|
"loss": 0.1179, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.06358381502890173, |
|
"grad_norm": 4.402795466175506, |
|
"learning_rate": 9.975082075084375e-06, |
|
"loss": 0.1312, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.06647398843930635, |
|
"grad_norm": 4.425118866581384, |
|
"learning_rate": 9.9727674310982e-06, |
|
"loss": 0.1073, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.06936416184971098, |
|
"grad_norm": 4.074778629199514, |
|
"learning_rate": 9.970350296294114e-06, |
|
"loss": 0.1029, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.07225433526011561, |
|
"grad_norm": 4.72700192719594, |
|
"learning_rate": 9.967830720490277e-06, |
|
"loss": 0.1005, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.07514450867052024, |
|
"grad_norm": 4.663929794890595, |
|
"learning_rate": 9.9652087556162e-06, |
|
"loss": 0.111, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.07803468208092486, |
|
"grad_norm": 4.922655803987667, |
|
"learning_rate": 9.962484455711679e-06, |
|
"loss": 0.1043, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.08092485549132948, |
|
"grad_norm": 3.881077548927933, |
|
"learning_rate": 9.959657876925671e-06, |
|
"loss": 0.0856, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0838150289017341, |
|
"grad_norm": 4.677532571658142, |
|
"learning_rate": 9.956729077515151e-06, |
|
"loss": 0.1028, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.08670520231213873, |
|
"grad_norm": 3.6049503639471796, |
|
"learning_rate": 9.9536981178439e-06, |
|
"loss": 0.0743, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08959537572254335, |
|
"grad_norm": 4.217598474174594, |
|
"learning_rate": 9.950565060381264e-06, |
|
"loss": 0.0983, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.09248554913294797, |
|
"grad_norm": 5.093429475733392, |
|
"learning_rate": 9.94732996970087e-06, |
|
"loss": 0.1133, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0953757225433526, |
|
"grad_norm": 5.129318191872631, |
|
"learning_rate": 9.94399291247929e-06, |
|
"loss": 0.0888, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.09826589595375723, |
|
"grad_norm": 3.6982076421332923, |
|
"learning_rate": 9.940553957494669e-06, |
|
"loss": 0.0831, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.10115606936416185, |
|
"grad_norm": 4.087472683580998, |
|
"learning_rate": 9.937013175625313e-06, |
|
"loss": 0.0972, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.10404624277456648, |
|
"grad_norm": 4.077053797813939, |
|
"learning_rate": 9.93337063984821e-06, |
|
"loss": 0.0939, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.1069364161849711, |
|
"grad_norm": 4.459254832264914, |
|
"learning_rate": 9.929626425237555e-06, |
|
"loss": 0.1073, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.10982658959537572, |
|
"grad_norm": 3.653561777734679, |
|
"learning_rate": 9.925780608963173e-06, |
|
"loss": 0.0787, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.11271676300578035, |
|
"grad_norm": 4.207379444042341, |
|
"learning_rate": 9.92183327028895e-06, |
|
"loss": 0.0881, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.11560693641618497, |
|
"grad_norm": 5.28282743437778, |
|
"learning_rate": 9.917784490571188e-06, |
|
"loss": 0.0981, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.11849710982658959, |
|
"grad_norm": 3.958835693054809, |
|
"learning_rate": 9.913634353256926e-06, |
|
"loss": 0.0747, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.12138728323699421, |
|
"grad_norm": 3.815694424138344, |
|
"learning_rate": 9.909382943882238e-06, |
|
"loss": 0.0962, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.12427745664739884, |
|
"grad_norm": 4.616222762096817, |
|
"learning_rate": 9.905030350070446e-06, |
|
"loss": 0.09, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.12716763005780346, |
|
"grad_norm": 4.8258864571124835, |
|
"learning_rate": 9.900576661530334e-06, |
|
"loss": 0.1068, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.13005780346820808, |
|
"grad_norm": 4.98128890149854, |
|
"learning_rate": 9.896021970054282e-06, |
|
"loss": 0.0908, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1329479768786127, |
|
"grad_norm": 3.2157418735511456, |
|
"learning_rate": 9.89136636951639e-06, |
|
"loss": 0.0771, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.13583815028901733, |
|
"grad_norm": 5.174195271609547, |
|
"learning_rate": 9.886609955870536e-06, |
|
"loss": 0.1144, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.13872832369942195, |
|
"grad_norm": 3.799076912387004, |
|
"learning_rate": 9.881752827148391e-06, |
|
"loss": 0.105, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.1416184971098266, |
|
"grad_norm": 4.478882016025104, |
|
"learning_rate": 9.876795083457414e-06, |
|
"loss": 0.1084, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.14450867052023122, |
|
"grad_norm": 4.130383884149315, |
|
"learning_rate": 9.871736826978776e-06, |
|
"loss": 0.1028, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.14739884393063585, |
|
"grad_norm": 4.120395848476724, |
|
"learning_rate": 9.866578161965259e-06, |
|
"loss": 0.086, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.15028901734104047, |
|
"grad_norm": 4.4606365855568795, |
|
"learning_rate": 9.861319194739109e-06, |
|
"loss": 0.1048, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1531791907514451, |
|
"grad_norm": 4.235827775616701, |
|
"learning_rate": 9.855960033689843e-06, |
|
"loss": 0.1011, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.15606936416184972, |
|
"grad_norm": 3.863096874826039, |
|
"learning_rate": 9.85050078927201e-06, |
|
"loss": 0.0787, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.15895953757225434, |
|
"grad_norm": 4.930472249007545, |
|
"learning_rate": 9.844941574002927e-06, |
|
"loss": 0.113, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.16184971098265896, |
|
"grad_norm": 4.9133176516275, |
|
"learning_rate": 9.83928250246034e-06, |
|
"loss": 0.1126, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.16473988439306358, |
|
"grad_norm": 4.334161886190985, |
|
"learning_rate": 9.83352369128009e-06, |
|
"loss": 0.1163, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.1676300578034682, |
|
"grad_norm": 3.952135073411162, |
|
"learning_rate": 9.82766525915368e-06, |
|
"loss": 0.0997, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.17052023121387283, |
|
"grad_norm": 5.407354621789884, |
|
"learning_rate": 9.821707326825849e-06, |
|
"loss": 0.1464, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.17341040462427745, |
|
"grad_norm": 4.710011204025672, |
|
"learning_rate": 9.815650017092078e-06, |
|
"loss": 0.0966, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.17630057803468208, |
|
"grad_norm": 3.6046217871002897, |
|
"learning_rate": 9.809493454796051e-06, |
|
"loss": 0.0808, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.1791907514450867, |
|
"grad_norm": 4.333327639577169, |
|
"learning_rate": 9.803237766827098e-06, |
|
"loss": 0.1058, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.18208092485549132, |
|
"grad_norm": 3.7757843201095094, |
|
"learning_rate": 9.796883082117565e-06, |
|
"loss": 0.092, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.18497109826589594, |
|
"grad_norm": 3.292155410682368, |
|
"learning_rate": 9.790429531640163e-06, |
|
"loss": 0.0833, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.18786127167630057, |
|
"grad_norm": 3.1069406572396354, |
|
"learning_rate": 9.783877248405266e-06, |
|
"loss": 0.0836, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.1907514450867052, |
|
"grad_norm": 3.4381561878895925, |
|
"learning_rate": 9.77722636745818e-06, |
|
"loss": 0.0927, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.1936416184971098, |
|
"grad_norm": 4.516150027751648, |
|
"learning_rate": 9.770477025876338e-06, |
|
"loss": 0.1158, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.19653179190751446, |
|
"grad_norm": 4.593877711768214, |
|
"learning_rate": 9.763629362766495e-06, |
|
"loss": 0.1085, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.1994219653179191, |
|
"grad_norm": 3.9274365769789865, |
|
"learning_rate": 9.75668351926186e-06, |
|
"loss": 0.0954, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.2023121387283237, |
|
"grad_norm": 3.8806786830392466, |
|
"learning_rate": 9.749639638519167e-06, |
|
"loss": 0.0929, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.20520231213872833, |
|
"grad_norm": 4.019652719431334, |
|
"learning_rate": 9.742497865715752e-06, |
|
"loss": 0.1106, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.20809248554913296, |
|
"grad_norm": 4.432334879926278, |
|
"learning_rate": 9.735258348046538e-06, |
|
"loss": 0.1175, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.21098265895953758, |
|
"grad_norm": 3.4711831909967295, |
|
"learning_rate": 9.727921234721013e-06, |
|
"loss": 0.0852, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.2138728323699422, |
|
"grad_norm": 3.2064291726730256, |
|
"learning_rate": 9.720486676960157e-06, |
|
"loss": 0.0728, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.21676300578034682, |
|
"grad_norm": 3.808224427132749, |
|
"learning_rate": 9.712954827993314e-06, |
|
"loss": 0.1021, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.21965317919075145, |
|
"grad_norm": 3.607343192971514, |
|
"learning_rate": 9.705325843055045e-06, |
|
"loss": 0.0859, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.22254335260115607, |
|
"grad_norm": 5.194995083585211, |
|
"learning_rate": 9.69759987938192e-06, |
|
"loss": 0.1043, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.2254335260115607, |
|
"grad_norm": 4.054952478603542, |
|
"learning_rate": 9.689777096209287e-06, |
|
"loss": 0.103, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.22832369942196531, |
|
"grad_norm": 3.640845125492553, |
|
"learning_rate": 9.681857654767978e-06, |
|
"loss": 0.0797, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.23121387283236994, |
|
"grad_norm": 3.8972080333317765, |
|
"learning_rate": 9.673841718281e-06, |
|
"loss": 0.0896, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.23410404624277456, |
|
"grad_norm": 4.07281569833404, |
|
"learning_rate": 9.665729451960152e-06, |
|
"loss": 0.1053, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.23699421965317918, |
|
"grad_norm": 4.030872577083022, |
|
"learning_rate": 9.657521023002644e-06, |
|
"loss": 0.1148, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.2398843930635838, |
|
"grad_norm": 4.704073079780987, |
|
"learning_rate": 9.64921660058763e-06, |
|
"loss": 0.0992, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.24277456647398843, |
|
"grad_norm": 5.250684980442873, |
|
"learning_rate": 9.64081635587273e-06, |
|
"loss": 0.1066, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.24566473988439305, |
|
"grad_norm": 4.1597493954101035, |
|
"learning_rate": 9.632320461990505e-06, |
|
"loss": 0.1029, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.24855491329479767, |
|
"grad_norm": 4.76611183508526, |
|
"learning_rate": 9.623729094044882e-06, |
|
"loss": 0.0927, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.2514450867052023, |
|
"grad_norm": 4.328180556711655, |
|
"learning_rate": 9.615042429107554e-06, |
|
"loss": 0.105, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.2543352601156069, |
|
"grad_norm": 3.7978576663551658, |
|
"learning_rate": 9.606260646214314e-06, |
|
"loss": 0.0974, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.25722543352601157, |
|
"grad_norm": 3.9006554035006027, |
|
"learning_rate": 9.597383926361388e-06, |
|
"loss": 0.0806, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.26011560693641617, |
|
"grad_norm": 3.841382258509325, |
|
"learning_rate": 9.588412452501686e-06, |
|
"loss": 0.0972, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2630057803468208, |
|
"grad_norm": 4.180653104918261, |
|
"learning_rate": 9.579346409541037e-06, |
|
"loss": 0.1034, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.2658959537572254, |
|
"grad_norm": 4.7454554804737885, |
|
"learning_rate": 9.570185984334383e-06, |
|
"loss": 0.1285, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.26878612716763006, |
|
"grad_norm": 4.10415672097511, |
|
"learning_rate": 9.56093136568192e-06, |
|
"loss": 0.0958, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.27167630057803466, |
|
"grad_norm": 3.5059005548760984, |
|
"learning_rate": 9.551582744325213e-06, |
|
"loss": 0.0866, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.2745664739884393, |
|
"grad_norm": 3.6654919235269645, |
|
"learning_rate": 9.542140312943257e-06, |
|
"loss": 0.091, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2774566473988439, |
|
"grad_norm": 3.637246259933994, |
|
"learning_rate": 9.532604266148521e-06, |
|
"loss": 0.0885, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.28034682080924855, |
|
"grad_norm": 3.0790903920072865, |
|
"learning_rate": 9.522974800482914e-06, |
|
"loss": 0.08, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.2832369942196532, |
|
"grad_norm": 3.478040295573873, |
|
"learning_rate": 9.513252114413756e-06, |
|
"loss": 0.0874, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.2861271676300578, |
|
"grad_norm": 4.810373026536352, |
|
"learning_rate": 9.503436408329677e-06, |
|
"loss": 0.1175, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.28901734104046245, |
|
"grad_norm": 5.0600900263791315, |
|
"learning_rate": 9.493527884536487e-06, |
|
"loss": 0.1118, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.29190751445086704, |
|
"grad_norm": 5.024451679855302, |
|
"learning_rate": 9.483526747253004e-06, |
|
"loss": 0.1313, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.2947976878612717, |
|
"grad_norm": 4.266039616349381, |
|
"learning_rate": 9.473433202606859e-06, |
|
"loss": 0.0923, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.2976878612716763, |
|
"grad_norm": 4.948633226230769, |
|
"learning_rate": 9.46324745863023e-06, |
|
"loss": 0.1035, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.30057803468208094, |
|
"grad_norm": 4.444955459378445, |
|
"learning_rate": 9.452969725255558e-06, |
|
"loss": 0.1014, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.30346820809248554, |
|
"grad_norm": 4.110235690060319, |
|
"learning_rate": 9.442600214311236e-06, |
|
"loss": 0.1158, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.3063583815028902, |
|
"grad_norm": 4.086004629721548, |
|
"learning_rate": 9.432139139517222e-06, |
|
"loss": 0.1039, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.3092485549132948, |
|
"grad_norm": 3.7533923262128157, |
|
"learning_rate": 9.421586716480645e-06, |
|
"loss": 0.0902, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.31213872832369943, |
|
"grad_norm": 3.7885314784566892, |
|
"learning_rate": 9.410943162691359e-06, |
|
"loss": 0.0904, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.315028901734104, |
|
"grad_norm": 4.407963100332848, |
|
"learning_rate": 9.400208697517463e-06, |
|
"loss": 0.1011, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.3179190751445087, |
|
"grad_norm": 3.3317597063787194, |
|
"learning_rate": 9.389383542200779e-06, |
|
"loss": 0.0793, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3208092485549133, |
|
"grad_norm": 3.6021995713337085, |
|
"learning_rate": 9.378467919852285e-06, |
|
"loss": 0.1021, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.3236994219653179, |
|
"grad_norm": 3.060883940108233, |
|
"learning_rate": 9.367462055447528e-06, |
|
"loss": 0.0782, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.3265895953757225, |
|
"grad_norm": 3.6322429756958443, |
|
"learning_rate": 9.356366175821977e-06, |
|
"loss": 0.1152, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.32947976878612717, |
|
"grad_norm": 3.5862125438438186, |
|
"learning_rate": 9.34518050966636e-06, |
|
"loss": 0.0958, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.33236994219653176, |
|
"grad_norm": 3.5665497104723944, |
|
"learning_rate": 9.333905287521933e-06, |
|
"loss": 0.0847, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.3352601156069364, |
|
"grad_norm": 4.102595062306423, |
|
"learning_rate": 9.322540741775745e-06, |
|
"loss": 0.1247, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.33815028901734107, |
|
"grad_norm": 3.6212682400747638, |
|
"learning_rate": 9.311087106655838e-06, |
|
"loss": 0.1005, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.34104046242774566, |
|
"grad_norm": 3.5539806452557445, |
|
"learning_rate": 9.299544618226428e-06, |
|
"loss": 0.0961, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.3439306358381503, |
|
"grad_norm": 4.240074088752261, |
|
"learning_rate": 9.287913514383031e-06, |
|
"loss": 0.0975, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.3468208092485549, |
|
"grad_norm": 3.065068340901538, |
|
"learning_rate": 9.276194034847565e-06, |
|
"loss": 0.092, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.34971098265895956, |
|
"grad_norm": 4.031446686004421, |
|
"learning_rate": 9.26438642116341e-06, |
|
"loss": 0.0945, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.35260115606936415, |
|
"grad_norm": 3.489698753275862, |
|
"learning_rate": 9.252490916690422e-06, |
|
"loss": 0.0848, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.3554913294797688, |
|
"grad_norm": 3.608823390219841, |
|
"learning_rate": 9.240507766599928e-06, |
|
"loss": 0.0801, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.3583815028901734, |
|
"grad_norm": 4.639617732442707, |
|
"learning_rate": 9.228437217869668e-06, |
|
"loss": 0.122, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.36127167630057805, |
|
"grad_norm": 3.661308322897671, |
|
"learning_rate": 9.2162795192787e-06, |
|
"loss": 0.0935, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.36416184971098264, |
|
"grad_norm": 4.1280140168078665, |
|
"learning_rate": 9.204034921402282e-06, |
|
"loss": 0.1184, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.3670520231213873, |
|
"grad_norm": 4.901427060033713, |
|
"learning_rate": 9.191703676606702e-06, |
|
"loss": 0.091, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.3699421965317919, |
|
"grad_norm": 3.796410122108623, |
|
"learning_rate": 9.179286039044072e-06, |
|
"loss": 0.1042, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.37283236994219654, |
|
"grad_norm": 4.575336333649151, |
|
"learning_rate": 9.166782264647105e-06, |
|
"loss": 0.1111, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.37572254335260113, |
|
"grad_norm": 4.177294896339385, |
|
"learning_rate": 9.15419261112382e-06, |
|
"loss": 0.1028, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3786127167630058, |
|
"grad_norm": 3.772689277647484, |
|
"learning_rate": 9.141517337952243e-06, |
|
"loss": 0.0874, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.3815028901734104, |
|
"grad_norm": 3.9602527742947404, |
|
"learning_rate": 9.128756706375065e-06, |
|
"loss": 0.0966, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.38439306358381503, |
|
"grad_norm": 3.859417212423571, |
|
"learning_rate": 9.115910979394238e-06, |
|
"loss": 0.0921, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.3872832369942196, |
|
"grad_norm": 5.107869845200311, |
|
"learning_rate": 9.102980421765575e-06, |
|
"loss": 0.105, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.3901734104046243, |
|
"grad_norm": 4.5458122953341125, |
|
"learning_rate": 9.089965299993278e-06, |
|
"loss": 0.1022, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.3930635838150289, |
|
"grad_norm": 3.8501993337102327, |
|
"learning_rate": 9.076865882324453e-06, |
|
"loss": 0.0763, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.3959537572254335, |
|
"grad_norm": 4.28944054979154, |
|
"learning_rate": 9.063682438743582e-06, |
|
"loss": 0.0964, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.3988439306358382, |
|
"grad_norm": 4.587040074279113, |
|
"learning_rate": 9.050415240966953e-06, |
|
"loss": 0.1169, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.40173410404624277, |
|
"grad_norm": 4.826886123603603, |
|
"learning_rate": 9.037064562437068e-06, |
|
"loss": 0.1162, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.4046242774566474, |
|
"grad_norm": 4.451491462388894, |
|
"learning_rate": 9.023630678316994e-06, |
|
"loss": 0.0869, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.407514450867052, |
|
"grad_norm": 4.142756867333979, |
|
"learning_rate": 9.01011386548471e-06, |
|
"loss": 0.1083, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.41040462427745666, |
|
"grad_norm": 4.058664610851421, |
|
"learning_rate": 8.996514402527383e-06, |
|
"loss": 0.093, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.41329479768786126, |
|
"grad_norm": 3.9517689050770874, |
|
"learning_rate": 8.982832569735635e-06, |
|
"loss": 0.1049, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.4161849710982659, |
|
"grad_norm": 4.3831397108335075, |
|
"learning_rate": 8.969068649097766e-06, |
|
"loss": 0.0973, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.4190751445086705, |
|
"grad_norm": 3.3204794647179052, |
|
"learning_rate": 8.955222924293943e-06, |
|
"loss": 0.0893, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.42196531791907516, |
|
"grad_norm": 3.9958981086720726, |
|
"learning_rate": 8.941295680690347e-06, |
|
"loss": 0.0937, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.42485549132947975, |
|
"grad_norm": 3.560039411695779, |
|
"learning_rate": 8.9272872053333e-06, |
|
"loss": 0.0922, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.4277456647398844, |
|
"grad_norm": 3.866129097726797, |
|
"learning_rate": 8.913197786943335e-06, |
|
"loss": 0.0871, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.430635838150289, |
|
"grad_norm": 3.9338404603221293, |
|
"learning_rate": 8.89902771590927e-06, |
|
"loss": 0.1027, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.43352601156069365, |
|
"grad_norm": 4.362521573474016, |
|
"learning_rate": 8.884777284282193e-06, |
|
"loss": 0.0956, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.43641618497109824, |
|
"grad_norm": 3.945559735382797, |
|
"learning_rate": 8.870446785769468e-06, |
|
"loss": 0.0991, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.4393063583815029, |
|
"grad_norm": 3.180223515345157, |
|
"learning_rate": 8.856036515728666e-06, |
|
"loss": 0.0909, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.4421965317919075, |
|
"grad_norm": 3.7371844752520365, |
|
"learning_rate": 8.84154677116148e-06, |
|
"loss": 0.1071, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.44508670520231214, |
|
"grad_norm": 4.754557509579758, |
|
"learning_rate": 8.826977850707612e-06, |
|
"loss": 0.1067, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.4479768786127168, |
|
"grad_norm": 4.678190152571308, |
|
"learning_rate": 8.812330054638611e-06, |
|
"loss": 0.1177, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.4508670520231214, |
|
"grad_norm": 3.2121865098638027, |
|
"learning_rate": 8.797603684851685e-06, |
|
"loss": 0.0874, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.45375722543352603, |
|
"grad_norm": 3.17679106481082, |
|
"learning_rate": 8.782799044863475e-06, |
|
"loss": 0.0864, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.45664739884393063, |
|
"grad_norm": 3.2522042838865213, |
|
"learning_rate": 8.767916439803808e-06, |
|
"loss": 0.0916, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.4595375722543353, |
|
"grad_norm": 3.216689798967245, |
|
"learning_rate": 8.752956176409404e-06, |
|
"loss": 0.0893, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.4624277456647399, |
|
"grad_norm": 3.8865315971560404, |
|
"learning_rate": 8.737918563017553e-06, |
|
"loss": 0.0963, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4653179190751445, |
|
"grad_norm": 3.563126634285472, |
|
"learning_rate": 8.722803909559758e-06, |
|
"loss": 0.1016, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.4682080924855491, |
|
"grad_norm": 6.544416998936724, |
|
"learning_rate": 8.707612527555356e-06, |
|
"loss": 0.1252, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.47109826589595377, |
|
"grad_norm": 3.9078238788443995, |
|
"learning_rate": 8.692344730105084e-06, |
|
"loss": 0.1006, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.47398843930635837, |
|
"grad_norm": 3.465202393031836, |
|
"learning_rate": 8.677000831884639e-06, |
|
"loss": 0.0974, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.476878612716763, |
|
"grad_norm": 3.4753816819973915, |
|
"learning_rate": 8.661581149138185e-06, |
|
"loss": 0.0859, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.4797687861271676, |
|
"grad_norm": 4.370748869146705, |
|
"learning_rate": 8.646085999671838e-06, |
|
"loss": 0.1195, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.48265895953757226, |
|
"grad_norm": 4.025212498814417, |
|
"learning_rate": 8.630515702847109e-06, |
|
"loss": 0.0968, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.48554913294797686, |
|
"grad_norm": 4.497291267391845, |
|
"learning_rate": 8.614870579574338e-06, |
|
"loss": 0.1209, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.4884393063583815, |
|
"grad_norm": 3.4760700610759536, |
|
"learning_rate": 8.599150952306058e-06, |
|
"loss": 0.0977, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.4913294797687861, |
|
"grad_norm": 3.4995292892009386, |
|
"learning_rate": 8.58335714503037e-06, |
|
"loss": 0.0972, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.49421965317919075, |
|
"grad_norm": 4.073017918693432, |
|
"learning_rate": 8.567489483264247e-06, |
|
"loss": 0.1049, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.49710982658959535, |
|
"grad_norm": 3.8679218357162233, |
|
"learning_rate": 8.551548294046843e-06, |
|
"loss": 0.1094, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 3.7071156064360493, |
|
"learning_rate": 8.535533905932739e-06, |
|
"loss": 0.0883, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.5028901734104047, |
|
"grad_norm": 4.851312118928478, |
|
"learning_rate": 8.519446648985173e-06, |
|
"loss": 0.1139, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.5057803468208093, |
|
"grad_norm": 5.218623175111435, |
|
"learning_rate": 8.503286854769247e-06, |
|
"loss": 0.0963, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.5086705202312138, |
|
"grad_norm": 4.199494517228212, |
|
"learning_rate": 8.487054856345081e-06, |
|
"loss": 0.1028, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.5115606936416185, |
|
"grad_norm": 3.7907330310051166, |
|
"learning_rate": 8.470750988260956e-06, |
|
"loss": 0.0986, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.5144508670520231, |
|
"grad_norm": 3.4270783123645487, |
|
"learning_rate": 8.454375586546418e-06, |
|
"loss": 0.0905, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.5173410404624278, |
|
"grad_norm": 4.290100611914985, |
|
"learning_rate": 8.437928988705346e-06, |
|
"loss": 0.1138, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.5202312138728323, |
|
"grad_norm": 4.172228203124501, |
|
"learning_rate": 8.42141153370901e-06, |
|
"loss": 0.1122, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.523121387283237, |
|
"grad_norm": 3.1661691393240514, |
|
"learning_rate": 8.404823561989063e-06, |
|
"loss": 0.0659, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.5260115606936416, |
|
"grad_norm": 3.606488236503898, |
|
"learning_rate": 8.388165415430551e-06, |
|
"loss": 0.0967, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.5289017341040463, |
|
"grad_norm": 4.0351274821452225, |
|
"learning_rate": 8.371437437364844e-06, |
|
"loss": 0.1112, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.5317919075144508, |
|
"grad_norm": 3.578234343255972, |
|
"learning_rate": 8.35463997256257e-06, |
|
"loss": 0.0879, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.5346820809248555, |
|
"grad_norm": 2.881061742935368, |
|
"learning_rate": 8.337773367226509e-06, |
|
"loss": 0.0777, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.5375722543352601, |
|
"grad_norm": 3.522910118232901, |
|
"learning_rate": 8.320837968984456e-06, |
|
"loss": 0.0919, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.5404624277456648, |
|
"grad_norm": 3.7797177948481506, |
|
"learning_rate": 8.303834126882056e-06, |
|
"loss": 0.0948, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.5433526011560693, |
|
"grad_norm": 3.42150444228012, |
|
"learning_rate": 8.28676219137561e-06, |
|
"loss": 0.0842, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.546242774566474, |
|
"grad_norm": 3.536980226076555, |
|
"learning_rate": 8.269622514324856e-06, |
|
"loss": 0.0799, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.5491329479768786, |
|
"grad_norm": 4.0285830647679886, |
|
"learning_rate": 8.25241544898571e-06, |
|
"loss": 0.1136, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5520231213872833, |
|
"grad_norm": 3.6610676809643143, |
|
"learning_rate": 8.23514135000299e-06, |
|
"loss": 0.0897, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.5549132947976878, |
|
"grad_norm": 3.848269323679635, |
|
"learning_rate": 8.217800573403105e-06, |
|
"loss": 0.0968, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.5578034682080925, |
|
"grad_norm": 3.167566563939419, |
|
"learning_rate": 8.20039347658672e-06, |
|
"loss": 0.0829, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.5606936416184971, |
|
"grad_norm": 3.837712170703205, |
|
"learning_rate": 8.18292041832138e-06, |
|
"loss": 0.0955, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.5635838150289018, |
|
"grad_norm": 4.098712711661327, |
|
"learning_rate": 8.165381758734134e-06, |
|
"loss": 0.1147, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5664739884393064, |
|
"grad_norm": 3.6241971235179102, |
|
"learning_rate": 8.147777859304095e-06, |
|
"loss": 0.1017, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.569364161849711, |
|
"grad_norm": 3.830171016987476, |
|
"learning_rate": 8.130109082854998e-06, |
|
"loss": 0.0945, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.5722543352601156, |
|
"grad_norm": 3.578104803720523, |
|
"learning_rate": 8.112375793547718e-06, |
|
"loss": 0.0893, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.5751445086705202, |
|
"grad_norm": 4.371585616818501, |
|
"learning_rate": 8.09457835687277e-06, |
|
"loss": 0.0933, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.5780346820809249, |
|
"grad_norm": 3.909778241746307, |
|
"learning_rate": 8.076717139642775e-06, |
|
"loss": 0.11, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5780346820809249, |
|
"eval_loss": 0.09941592067480087, |
|
"eval_runtime": 0.9324, |
|
"eval_samples_per_second": 30.029, |
|
"eval_steps_per_second": 7.507, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5809248554913294, |
|
"grad_norm": 3.935272505639913, |
|
"learning_rate": 8.058792509984893e-06, |
|
"loss": 0.1024, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.5838150289017341, |
|
"grad_norm": 4.654720715350939, |
|
"learning_rate": 8.040804837333243e-06, |
|
"loss": 0.1019, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.5867052023121387, |
|
"grad_norm": 4.277556402582181, |
|
"learning_rate": 8.022754492421284e-06, |
|
"loss": 0.1083, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.5895953757225434, |
|
"grad_norm": 3.9171978917704458, |
|
"learning_rate": 8.004641847274182e-06, |
|
"loss": 0.1078, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5924855491329479, |
|
"grad_norm": 3.5396394939537963, |
|
"learning_rate": 7.986467275201135e-06, |
|
"loss": 0.0841, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5953757225433526, |
|
"grad_norm": 3.3936510531339703, |
|
"learning_rate": 7.968231150787674e-06, |
|
"loss": 0.092, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.5982658959537572, |
|
"grad_norm": 3.5159415187918794, |
|
"learning_rate": 7.949933849887963e-06, |
|
"loss": 0.0827, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.6011560693641619, |
|
"grad_norm": 3.6681764621931303, |
|
"learning_rate": 7.931575749617027e-06, |
|
"loss": 0.0665, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.6040462427745664, |
|
"grad_norm": 3.6083828563308042, |
|
"learning_rate": 7.913157228342994e-06, |
|
"loss": 0.106, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.6069364161849711, |
|
"grad_norm": 4.2477637082450315, |
|
"learning_rate": 7.894678665679298e-06, |
|
"loss": 0.1117, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6098265895953757, |
|
"grad_norm": 4.6492747986553695, |
|
"learning_rate": 7.876140442476847e-06, |
|
"loss": 0.1062, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.6127167630057804, |
|
"grad_norm": 3.647527727939632, |
|
"learning_rate": 7.857542940816183e-06, |
|
"loss": 0.0853, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.615606936416185, |
|
"grad_norm": 3.93578958304064, |
|
"learning_rate": 7.838886543999596e-06, |
|
"loss": 0.0936, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.6184971098265896, |
|
"grad_norm": 4.072364355852493, |
|
"learning_rate": 7.820171636543233e-06, |
|
"loss": 0.0806, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.6213872832369942, |
|
"grad_norm": 4.441885738921091, |
|
"learning_rate": 7.80139860416917e-06, |
|
"loss": 0.1067, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.6242774566473989, |
|
"grad_norm": 4.310946247204338, |
|
"learning_rate": 7.782567833797458e-06, |
|
"loss": 0.1195, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.6271676300578035, |
|
"grad_norm": 3.7113356442945116, |
|
"learning_rate": 7.763679713538158e-06, |
|
"loss": 0.0826, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.630057803468208, |
|
"grad_norm": 4.057810111847526, |
|
"learning_rate": 7.744734632683332e-06, |
|
"loss": 0.0739, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.6329479768786127, |
|
"grad_norm": 3.9634738814885324, |
|
"learning_rate": 7.725732981699028e-06, |
|
"loss": 0.1089, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.6358381502890174, |
|
"grad_norm": 4.185189494202386, |
|
"learning_rate": 7.70667515221722e-06, |
|
"loss": 0.1013, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.638728323699422, |
|
"grad_norm": 4.1930722895881205, |
|
"learning_rate": 7.687561537027754e-06, |
|
"loss": 0.0989, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.6416184971098265, |
|
"grad_norm": 3.4917142430832544, |
|
"learning_rate": 7.668392530070238e-06, |
|
"loss": 0.0901, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.6445086705202312, |
|
"grad_norm": 4.746355802069406, |
|
"learning_rate": 7.649168526425924e-06, |
|
"loss": 0.1189, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.6473988439306358, |
|
"grad_norm": 3.8007941418147455, |
|
"learning_rate": 7.629889922309576e-06, |
|
"loss": 0.1021, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.6502890173410405, |
|
"grad_norm": 3.7915979937662794, |
|
"learning_rate": 7.610557115061292e-06, |
|
"loss": 0.0942, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.653179190751445, |
|
"grad_norm": 3.4975651641815757, |
|
"learning_rate": 7.5911705031383235e-06, |
|
"loss": 0.0758, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.6560693641618497, |
|
"grad_norm": 3.8976041216491972, |
|
"learning_rate": 7.571730486106849e-06, |
|
"loss": 0.0966, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.6589595375722543, |
|
"grad_norm": 3.5215116107725346, |
|
"learning_rate": 7.55223746463376e-06, |
|
"loss": 0.0868, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.661849710982659, |
|
"grad_norm": 3.217072576489954, |
|
"learning_rate": 7.532691840478388e-06, |
|
"loss": 0.0848, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.6647398843930635, |
|
"grad_norm": 3.4340150681487556, |
|
"learning_rate": 7.513094016484225e-06, |
|
"loss": 0.0977, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6676300578034682, |
|
"grad_norm": 3.69714853322805, |
|
"learning_rate": 7.493444396570625e-06, |
|
"loss": 0.0865, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.6705202312138728, |
|
"grad_norm": 4.042244071122905, |
|
"learning_rate": 7.473743385724478e-06, |
|
"loss": 0.1144, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.6734104046242775, |
|
"grad_norm": 3.518643461499892, |
|
"learning_rate": 7.453991389991864e-06, |
|
"loss": 0.0772, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.6763005780346821, |
|
"grad_norm": 3.615509903388122, |
|
"learning_rate": 7.434188816469681e-06, |
|
"loss": 0.0926, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.6791907514450867, |
|
"grad_norm": 3.6364313649144626, |
|
"learning_rate": 7.414336073297255e-06, |
|
"loss": 0.0861, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.6820809248554913, |
|
"grad_norm": 3.1761480310047863, |
|
"learning_rate": 7.394433569647935e-06, |
|
"loss": 0.0858, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.684971098265896, |
|
"grad_norm": 3.284548753029961, |
|
"learning_rate": 7.374481715720647e-06, |
|
"loss": 0.086, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.6878612716763006, |
|
"grad_norm": 3.3199374348167265, |
|
"learning_rate": 7.354480922731454e-06, |
|
"loss": 0.0788, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.6907514450867052, |
|
"grad_norm": 3.8232040800857936, |
|
"learning_rate": 7.334431602905068e-06, |
|
"loss": 0.0829, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.6936416184971098, |
|
"grad_norm": 3.989063308502132, |
|
"learning_rate": 7.3143341694663604e-06, |
|
"loss": 0.0951, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6965317919075145, |
|
"grad_norm": 3.598886651406255, |
|
"learning_rate": 7.294189036631847e-06, |
|
"loss": 0.0975, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.6994219653179191, |
|
"grad_norm": 3.790070882387216, |
|
"learning_rate": 7.273996619601146e-06, |
|
"loss": 0.0916, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.7023121387283237, |
|
"grad_norm": 3.301945711159591, |
|
"learning_rate": 7.253757334548424e-06, |
|
"loss": 0.0873, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.7052023121387283, |
|
"grad_norm": 3.368321053717709, |
|
"learning_rate": 7.233471598613815e-06, |
|
"loss": 0.0881, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.708092485549133, |
|
"grad_norm": 4.69106656832974, |
|
"learning_rate": 7.213139829894826e-06, |
|
"loss": 0.0953, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.7109826589595376, |
|
"grad_norm": 3.7249528749087997, |
|
"learning_rate": 7.192762447437722e-06, |
|
"loss": 0.0791, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.7138728323699421, |
|
"grad_norm": 4.7512868447958425, |
|
"learning_rate": 7.17233987122888e-06, |
|
"loss": 0.1132, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.7167630057803468, |
|
"grad_norm": 3.9162600055471595, |
|
"learning_rate": 7.151872522186147e-06, |
|
"loss": 0.0914, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.7196531791907514, |
|
"grad_norm": 3.015385891196794, |
|
"learning_rate": 7.131360822150147e-06, |
|
"loss": 0.064, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.7225433526011561, |
|
"grad_norm": 3.525616281685469, |
|
"learning_rate": 7.110805193875607e-06, |
|
"loss": 0.0847, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7254335260115607, |
|
"grad_norm": 3.7840741766879997, |
|
"learning_rate": 7.090206061022628e-06, |
|
"loss": 0.0892, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.7283236994219653, |
|
"grad_norm": 3.005699776026508, |
|
"learning_rate": 7.0695638481479565e-06, |
|
"loss": 0.0813, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.7312138728323699, |
|
"grad_norm": 4.468302540719233, |
|
"learning_rate": 7.048878980696241e-06, |
|
"loss": 0.0923, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.7341040462427746, |
|
"grad_norm": 3.970063283349485, |
|
"learning_rate": 7.028151884991254e-06, |
|
"loss": 0.0887, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.7369942196531792, |
|
"grad_norm": 3.748663847925155, |
|
"learning_rate": 7.007382988227116e-06, |
|
"loss": 0.0769, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.7398843930635838, |
|
"grad_norm": 3.2014963138639123, |
|
"learning_rate": 6.986572718459479e-06, |
|
"loss": 0.0746, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.7427745664739884, |
|
"grad_norm": 3.8984665123074347, |
|
"learning_rate": 6.965721504596712e-06, |
|
"loss": 0.0839, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.7456647398843931, |
|
"grad_norm": 4.449372024694936, |
|
"learning_rate": 6.94482977639106e-06, |
|
"loss": 0.1147, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.7485549132947977, |
|
"grad_norm": 3.5860135752034252, |
|
"learning_rate": 6.923897964429784e-06, |
|
"loss": 0.0987, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.7514450867052023, |
|
"grad_norm": 3.9680857005528023, |
|
"learning_rate": 6.902926500126292e-06, |
|
"loss": 0.0952, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7543352601156069, |
|
"grad_norm": 3.5008251567033857, |
|
"learning_rate": 6.881915815711235e-06, |
|
"loss": 0.0935, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.7572254335260116, |
|
"grad_norm": 3.5150218688906354, |
|
"learning_rate": 6.8608663442236156e-06, |
|
"loss": 0.0913, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.7601156069364162, |
|
"grad_norm": 3.4487002898741763, |
|
"learning_rate": 6.839778519501848e-06, |
|
"loss": 0.0827, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.7630057803468208, |
|
"grad_norm": 4.109991878752816, |
|
"learning_rate": 6.818652776174828e-06, |
|
"loss": 0.0861, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.7658959537572254, |
|
"grad_norm": 3.6854040146116307, |
|
"learning_rate": 6.797489549652965e-06, |
|
"loss": 0.0848, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.7687861271676301, |
|
"grad_norm": 4.48017404399493, |
|
"learning_rate": 6.776289276119214e-06, |
|
"loss": 0.1077, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.7716763005780347, |
|
"grad_norm": 3.5935256781587985, |
|
"learning_rate": 6.7550523925200876e-06, |
|
"loss": 0.0836, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.7745664739884393, |
|
"grad_norm": 3.949250387337809, |
|
"learning_rate": 6.733779336556643e-06, |
|
"loss": 0.082, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.7774566473988439, |
|
"grad_norm": 3.394592216776318, |
|
"learning_rate": 6.712470546675467e-06, |
|
"loss": 0.0714, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.7803468208092486, |
|
"grad_norm": 4.50351141769006, |
|
"learning_rate": 6.691126462059636e-06, |
|
"loss": 0.098, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7832369942196532, |
|
"grad_norm": 3.891769583944111, |
|
"learning_rate": 6.669747522619668e-06, |
|
"loss": 0.097, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.7861271676300579, |
|
"grad_norm": 3.5621684716248687, |
|
"learning_rate": 6.648334168984452e-06, |
|
"loss": 0.0808, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.7890173410404624, |
|
"grad_norm": 3.4588584589807474, |
|
"learning_rate": 6.626886842492168e-06, |
|
"loss": 0.0892, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.791907514450867, |
|
"grad_norm": 3.7827135135353647, |
|
"learning_rate": 6.60540598518119e-06, |
|
"loss": 0.0883, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.7947976878612717, |
|
"grad_norm": 3.417524992238082, |
|
"learning_rate": 6.583892039780979e-06, |
|
"loss": 0.0845, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.7976878612716763, |
|
"grad_norm": 3.956403018811007, |
|
"learning_rate": 6.562345449702952e-06, |
|
"loss": 0.0866, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.8005780346820809, |
|
"grad_norm": 3.467743823738221, |
|
"learning_rate": 6.540766659031348e-06, |
|
"loss": 0.085, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.8034682080924855, |
|
"grad_norm": 2.9058940217436264, |
|
"learning_rate": 6.519156112514074e-06, |
|
"loss": 0.0622, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.8063583815028902, |
|
"grad_norm": 3.5477458403494913, |
|
"learning_rate": 6.497514255553538e-06, |
|
"loss": 0.0852, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.8092485549132948, |
|
"grad_norm": 3.3180443483421684, |
|
"learning_rate": 6.4758415341974705e-06, |
|
"loss": 0.0813, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8121387283236994, |
|
"grad_norm": 3.6388404590362704, |
|
"learning_rate": 6.454138395129727e-06, |
|
"loss": 0.0771, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.815028901734104, |
|
"grad_norm": 4.510025043476393, |
|
"learning_rate": 6.432405285661087e-06, |
|
"loss": 0.1043, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.8179190751445087, |
|
"grad_norm": 3.7069972674864777, |
|
"learning_rate": 6.410642653720033e-06, |
|
"loss": 0.0955, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.8208092485549133, |
|
"grad_norm": 3.762263730027283, |
|
"learning_rate": 6.388850947843517e-06, |
|
"loss": 0.1028, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.8236994219653179, |
|
"grad_norm": 3.8877343897591334, |
|
"learning_rate": 6.367030617167717e-06, |
|
"loss": 0.0934, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.8265895953757225, |
|
"grad_norm": 4.226473140728079, |
|
"learning_rate": 6.345182111418781e-06, |
|
"loss": 0.0918, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.8294797687861272, |
|
"grad_norm": 3.621018489335686, |
|
"learning_rate": 6.323305880903555e-06, |
|
"loss": 0.0836, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.8323699421965318, |
|
"grad_norm": 3.8331628689604775, |
|
"learning_rate": 6.301402376500306e-06, |
|
"loss": 0.0924, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.8352601156069365, |
|
"grad_norm": 3.940039042927964, |
|
"learning_rate": 6.279472049649426e-06, |
|
"loss": 0.1116, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.838150289017341, |
|
"grad_norm": 4.441094589237161, |
|
"learning_rate": 6.257515352344131e-06, |
|
"loss": 0.1003, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8410404624277457, |
|
"grad_norm": 3.6897875138985903, |
|
"learning_rate": 6.2355327371211404e-06, |
|
"loss": 0.0877, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.8439306358381503, |
|
"grad_norm": 2.9510789132394253, |
|
"learning_rate": 6.213524657051354e-06, |
|
"loss": 0.0762, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.846820809248555, |
|
"grad_norm": 4.173200678288769, |
|
"learning_rate": 6.191491565730512e-06, |
|
"loss": 0.0981, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.8497109826589595, |
|
"grad_norm": 3.591972918555686, |
|
"learning_rate": 6.16943391726985e-06, |
|
"loss": 0.0768, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.8526011560693642, |
|
"grad_norm": 3.636329925569424, |
|
"learning_rate": 6.147352166286731e-06, |
|
"loss": 0.0865, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.8554913294797688, |
|
"grad_norm": 3.643462204481586, |
|
"learning_rate": 6.125246767895287e-06, |
|
"loss": 0.0889, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.8583815028901735, |
|
"grad_norm": 3.563652514109132, |
|
"learning_rate": 6.103118177697027e-06, |
|
"loss": 0.0793, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.861271676300578, |
|
"grad_norm": 3.781227516799238, |
|
"learning_rate": 6.0809668517714615e-06, |
|
"loss": 0.0891, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.8641618497109826, |
|
"grad_norm": 3.512843215775068, |
|
"learning_rate": 6.0587932466666825e-06, |
|
"loss": 0.0691, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.8670520231213873, |
|
"grad_norm": 3.505371533644849, |
|
"learning_rate": 6.036597819389972e-06, |
|
"loss": 0.0796, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.869942196531792, |
|
"grad_norm": 3.654392241450492, |
|
"learning_rate": 6.014381027398379e-06, |
|
"loss": 0.0825, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.8728323699421965, |
|
"grad_norm": 3.5879948178102854, |
|
"learning_rate": 5.992143328589282e-06, |
|
"loss": 0.0855, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.8757225433526011, |
|
"grad_norm": 4.276864949222476, |
|
"learning_rate": 5.96988518129096e-06, |
|
"loss": 0.0989, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.8786127167630058, |
|
"grad_norm": 3.680706587097863, |
|
"learning_rate": 5.947607044253142e-06, |
|
"loss": 0.0877, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.8815028901734104, |
|
"grad_norm": 3.1181713444360697, |
|
"learning_rate": 5.92530937663756e-06, |
|
"loss": 0.0782, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.884393063583815, |
|
"grad_norm": 3.6130639841107275, |
|
"learning_rate": 5.902992638008475e-06, |
|
"loss": 0.0773, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.8872832369942196, |
|
"grad_norm": 2.9053534072517544, |
|
"learning_rate": 5.880657288323207e-06, |
|
"loss": 0.0674, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.8901734104046243, |
|
"grad_norm": 4.138796949543857, |
|
"learning_rate": 5.858303787922663e-06, |
|
"loss": 0.0937, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.8930635838150289, |
|
"grad_norm": 4.554927322190945, |
|
"learning_rate": 5.835932597521839e-06, |
|
"loss": 0.0887, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.8959537572254336, |
|
"grad_norm": 4.130255984143273, |
|
"learning_rate": 5.8135441782003354e-06, |
|
"loss": 0.0915, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8988439306358381, |
|
"grad_norm": 3.703954373509824, |
|
"learning_rate": 5.791138991392843e-06, |
|
"loss": 0.0754, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.9017341040462428, |
|
"grad_norm": 4.873227420557636, |
|
"learning_rate": 5.768717498879635e-06, |
|
"loss": 0.1212, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.9046242774566474, |
|
"grad_norm": 3.2713591661227936, |
|
"learning_rate": 5.746280162777061e-06, |
|
"loss": 0.0843, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.9075144508670521, |
|
"grad_norm": 3.9663264457968803, |
|
"learning_rate": 5.723827445528003e-06, |
|
"loss": 0.0763, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.9104046242774566, |
|
"grad_norm": 4.199805628532218, |
|
"learning_rate": 5.701359809892367e-06, |
|
"loss": 0.1101, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.9132947976878613, |
|
"grad_norm": 3.7370869573089895, |
|
"learning_rate": 5.67887771893752e-06, |
|
"loss": 0.0752, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.9161849710982659, |
|
"grad_norm": 3.182039257660417, |
|
"learning_rate": 5.656381636028769e-06, |
|
"loss": 0.0812, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.9190751445086706, |
|
"grad_norm": 3.3708995105429858, |
|
"learning_rate": 5.633872024819796e-06, |
|
"loss": 0.0726, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.9219653179190751, |
|
"grad_norm": 3.384533244992893, |
|
"learning_rate": 5.6113493492431105e-06, |
|
"loss": 0.0628, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.9248554913294798, |
|
"grad_norm": 3.3156425908108105, |
|
"learning_rate": 5.588814073500481e-06, |
|
"loss": 0.0733, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.9277456647398844, |
|
"grad_norm": 3.638157508219399, |
|
"learning_rate": 5.56626666205337e-06, |
|
"loss": 0.0764, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.930635838150289, |
|
"grad_norm": 3.3598544316213013, |
|
"learning_rate": 5.543707579613367e-06, |
|
"loss": 0.084, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.9335260115606936, |
|
"grad_norm": 3.959366527529141, |
|
"learning_rate": 5.5211372911326e-06, |
|
"loss": 0.0854, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.9364161849710982, |
|
"grad_norm": 3.81567718746474, |
|
"learning_rate": 5.498556261794161e-06, |
|
"loss": 0.0898, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.9393063583815029, |
|
"grad_norm": 3.32242644124862, |
|
"learning_rate": 5.475964957002516e-06, |
|
"loss": 0.0858, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.9421965317919075, |
|
"grad_norm": 4.384037573727284, |
|
"learning_rate": 5.45336384237391e-06, |
|
"loss": 0.0929, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.9450867052023122, |
|
"grad_norm": 3.0790304811954656, |
|
"learning_rate": 5.430753383726776e-06, |
|
"loss": 0.0773, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.9479768786127167, |
|
"grad_norm": 3.4124598879380987, |
|
"learning_rate": 5.4081340470721286e-06, |
|
"loss": 0.0797, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.9508670520231214, |
|
"grad_norm": 4.1220306324446785, |
|
"learning_rate": 5.385506298603962e-06, |
|
"loss": 0.0907, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.953757225433526, |
|
"grad_norm": 3.3556166825004814, |
|
"learning_rate": 5.362870604689643e-06, |
|
"loss": 0.0771, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.9566473988439307, |
|
"grad_norm": 3.2419310606822345, |
|
"learning_rate": 5.340227431860295e-06, |
|
"loss": 0.0684, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.9595375722543352, |
|
"grad_norm": 3.655597904932805, |
|
"learning_rate": 5.31757724680119e-06, |
|
"loss": 0.0846, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.9624277456647399, |
|
"grad_norm": 4.960497588181285, |
|
"learning_rate": 5.294920516342117e-06, |
|
"loss": 0.0808, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.9653179190751445, |
|
"grad_norm": 3.185250487634402, |
|
"learning_rate": 5.272257707447776e-06, |
|
"loss": 0.0813, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.9682080924855492, |
|
"grad_norm": 4.274523790048614, |
|
"learning_rate": 5.24958928720814e-06, |
|
"loss": 0.0979, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.9710982658959537, |
|
"grad_norm": 3.4623164735925385, |
|
"learning_rate": 5.22691572282884e-06, |
|
"loss": 0.0831, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.9739884393063584, |
|
"grad_norm": 4.255813187391108, |
|
"learning_rate": 5.2042374816215265e-06, |
|
"loss": 0.0855, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.976878612716763, |
|
"grad_norm": 3.6733563161934204, |
|
"learning_rate": 5.18155503099424e-06, |
|
"loss": 0.0924, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.9797687861271677, |
|
"grad_norm": 3.767726660475069, |
|
"learning_rate": 5.1588688384417816e-06, |
|
"loss": 0.0845, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.9826589595375722, |
|
"grad_norm": 3.3855145820664743, |
|
"learning_rate": 5.136179371536076e-06, |
|
"loss": 0.0793, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9855491329479769, |
|
"grad_norm": 3.408221038715105, |
|
"learning_rate": 5.113487097916531e-06, |
|
"loss": 0.0632, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.9884393063583815, |
|
"grad_norm": 3.3541043863328843, |
|
"learning_rate": 5.090792485280401e-06, |
|
"loss": 0.0717, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.9913294797687862, |
|
"grad_norm": 3.8506033223008624, |
|
"learning_rate": 5.068096001373152e-06, |
|
"loss": 0.0862, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.9942196531791907, |
|
"grad_norm": 3.478313323623953, |
|
"learning_rate": 5.045398113978816e-06, |
|
"loss": 0.0682, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.9971098265895953, |
|
"grad_norm": 3.5967316347838914, |
|
"learning_rate": 5.022699290910351e-06, |
|
"loss": 0.0864, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 4.094478069223065, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0859, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.0028901734104045, |
|
"grad_norm": 3.191994252444766, |
|
"learning_rate": 4.9773007090896505e-06, |
|
"loss": 0.059, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 1.0057803468208093, |
|
"grad_norm": 2.592704679368971, |
|
"learning_rate": 4.9546018860211845e-06, |
|
"loss": 0.046, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.0086705202312138, |
|
"grad_norm": 2.4513026804496008, |
|
"learning_rate": 4.931903998626851e-06, |
|
"loss": 0.0386, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.0115606936416186, |
|
"grad_norm": 2.3175335938744572, |
|
"learning_rate": 4.9092075147196005e-06, |
|
"loss": 0.0282, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.0144508670520231, |
|
"grad_norm": 2.573974359707393, |
|
"learning_rate": 4.886512902083471e-06, |
|
"loss": 0.0388, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.0173410404624277, |
|
"grad_norm": 2.9332611203985923, |
|
"learning_rate": 4.863820628463925e-06, |
|
"loss": 0.0418, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.0202312138728324, |
|
"grad_norm": 1.973596205721184, |
|
"learning_rate": 4.8411311615582176e-06, |
|
"loss": 0.0295, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.023121387283237, |
|
"grad_norm": 1.8137644492843443, |
|
"learning_rate": 4.818444969005762e-06, |
|
"loss": 0.0259, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.0260115606936415, |
|
"grad_norm": 2.654278918599863, |
|
"learning_rate": 4.795762518378476e-06, |
|
"loss": 0.0442, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.0289017341040463, |
|
"grad_norm": 2.0967426909755353, |
|
"learning_rate": 4.773084277171161e-06, |
|
"loss": 0.0251, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.0317919075144508, |
|
"grad_norm": 2.442874432065364, |
|
"learning_rate": 4.750410712791862e-06, |
|
"loss": 0.0371, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.0346820809248556, |
|
"grad_norm": 2.5120312556057245, |
|
"learning_rate": 4.727742292552225e-06, |
|
"loss": 0.0377, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.0375722543352601, |
|
"grad_norm": 3.340524695416432, |
|
"learning_rate": 4.705079483657885e-06, |
|
"loss": 0.0443, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.0404624277456647, |
|
"grad_norm": 2.575030053289034, |
|
"learning_rate": 4.682422753198812e-06, |
|
"loss": 0.0299, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.0433526011560694, |
|
"grad_norm": 2.685175007956344, |
|
"learning_rate": 4.659772568139706e-06, |
|
"loss": 0.0398, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.046242774566474, |
|
"grad_norm": 2.6972818012368687, |
|
"learning_rate": 4.637129395310359e-06, |
|
"loss": 0.0399, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.0491329479768785, |
|
"grad_norm": 3.005217593332792, |
|
"learning_rate": 4.614493701396041e-06, |
|
"loss": 0.0407, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.0520231213872833, |
|
"grad_norm": 2.5154458128124024, |
|
"learning_rate": 4.591865952927873e-06, |
|
"loss": 0.0342, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.0549132947976878, |
|
"grad_norm": 4.439366071837881, |
|
"learning_rate": 4.569246616273225e-06, |
|
"loss": 0.0442, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.0578034682080926, |
|
"grad_norm": 3.0953883001786724, |
|
"learning_rate": 4.546636157626091e-06, |
|
"loss": 0.0421, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.060693641618497, |
|
"grad_norm": 2.2860089112442847, |
|
"learning_rate": 4.524035042997485e-06, |
|
"loss": 0.0273, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.0635838150289016, |
|
"grad_norm": 1.9635223901429677, |
|
"learning_rate": 4.501443738205841e-06, |
|
"loss": 0.0253, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.0664739884393064, |
|
"grad_norm": 3.600763598734996, |
|
"learning_rate": 4.478862708867401e-06, |
|
"loss": 0.0457, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.069364161849711, |
|
"grad_norm": 2.933454423722625, |
|
"learning_rate": 4.456292420386635e-06, |
|
"loss": 0.0353, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.0722543352601157, |
|
"grad_norm": 3.2964567748192946, |
|
"learning_rate": 4.43373333794663e-06, |
|
"loss": 0.0438, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.0751445086705202, |
|
"grad_norm": 2.4663446624125, |
|
"learning_rate": 4.41118592649952e-06, |
|
"loss": 0.0255, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.0780346820809248, |
|
"grad_norm": 2.9695281540539007, |
|
"learning_rate": 4.388650650756891e-06, |
|
"loss": 0.034, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.0809248554913296, |
|
"grad_norm": 3.213673243729433, |
|
"learning_rate": 4.366127975180204e-06, |
|
"loss": 0.0373, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.083815028901734, |
|
"grad_norm": 2.9297645443450344, |
|
"learning_rate": 4.3436183639712326e-06, |
|
"loss": 0.0345, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.0867052023121386, |
|
"grad_norm": 3.4897778865158835, |
|
"learning_rate": 4.321122281062481e-06, |
|
"loss": 0.0498, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.0895953757225434, |
|
"grad_norm": 3.83716948783189, |
|
"learning_rate": 4.298640190107634e-06, |
|
"loss": 0.0492, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.092485549132948, |
|
"grad_norm": 3.2428592157194007, |
|
"learning_rate": 4.276172554471998e-06, |
|
"loss": 0.0369, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.0953757225433527, |
|
"grad_norm": 2.724934147808616, |
|
"learning_rate": 4.25371983722294e-06, |
|
"loss": 0.0257, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.0982658959537572, |
|
"grad_norm": 2.7411102239022243, |
|
"learning_rate": 4.231282501120366e-06, |
|
"loss": 0.0355, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.1011560693641618, |
|
"grad_norm": 3.306500748639738, |
|
"learning_rate": 4.20886100860716e-06, |
|
"loss": 0.0311, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.1040462427745665, |
|
"grad_norm": 2.2266738254387515, |
|
"learning_rate": 4.1864558217996645e-06, |
|
"loss": 0.0353, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.106936416184971, |
|
"grad_norm": 3.1531866052818893, |
|
"learning_rate": 4.164067402478162e-06, |
|
"loss": 0.0424, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.1098265895953756, |
|
"grad_norm": 3.651012462730916, |
|
"learning_rate": 4.14169621207734e-06, |
|
"loss": 0.0429, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.1127167630057804, |
|
"grad_norm": 3.580488177101031, |
|
"learning_rate": 4.119342711676794e-06, |
|
"loss": 0.0459, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.115606936416185, |
|
"grad_norm": 2.964620530840838, |
|
"learning_rate": 4.0970073619915264e-06, |
|
"loss": 0.0437, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.1184971098265897, |
|
"grad_norm": 3.433449491668461, |
|
"learning_rate": 4.074690623362439e-06, |
|
"loss": 0.0457, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.1213872832369942, |
|
"grad_norm": 3.56646677299915, |
|
"learning_rate": 4.05239295574686e-06, |
|
"loss": 0.0552, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.1242774566473988, |
|
"grad_norm": 4.135460144949851, |
|
"learning_rate": 4.030114818709044e-06, |
|
"loss": 0.0541, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.1271676300578035, |
|
"grad_norm": 3.4522750039792927, |
|
"learning_rate": 4.00785667141072e-06, |
|
"loss": 0.0436, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.130057803468208, |
|
"grad_norm": 2.36010831219563, |
|
"learning_rate": 3.985618972601622e-06, |
|
"loss": 0.0307, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.1329479768786128, |
|
"grad_norm": 2.67796155921764, |
|
"learning_rate": 3.963402180610028e-06, |
|
"loss": 0.0331, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.1358381502890174, |
|
"grad_norm": 3.3768235978909726, |
|
"learning_rate": 3.941206753333319e-06, |
|
"loss": 0.0362, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.138728323699422, |
|
"grad_norm": 2.476496213479462, |
|
"learning_rate": 3.919033148228542e-06, |
|
"loss": 0.0279, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.1416184971098267, |
|
"grad_norm": 2.995513684870669, |
|
"learning_rate": 3.896881822302973e-06, |
|
"loss": 0.0431, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.1445086705202312, |
|
"grad_norm": 2.5680489305606264, |
|
"learning_rate": 3.874753232104714e-06, |
|
"loss": 0.0317, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.147398843930636, |
|
"grad_norm": 2.5370547981396423, |
|
"learning_rate": 3.852647833713271e-06, |
|
"loss": 0.0281, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.1502890173410405, |
|
"grad_norm": 2.9610590499557903, |
|
"learning_rate": 3.830566082730151e-06, |
|
"loss": 0.0332, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.153179190751445, |
|
"grad_norm": 3.249858884007628, |
|
"learning_rate": 3.8085084342694894e-06, |
|
"loss": 0.0421, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.1560693641618498, |
|
"grad_norm": 2.2116083552742563, |
|
"learning_rate": 3.7864753429486475e-06, |
|
"loss": 0.033, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.1560693641618498, |
|
"eval_loss": 0.09519536048173904, |
|
"eval_runtime": 0.9252, |
|
"eval_samples_per_second": 30.263, |
|
"eval_steps_per_second": 7.566, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.1589595375722543, |
|
"grad_norm": 2.3890215101503744, |
|
"learning_rate": 3.764467262878861e-06, |
|
"loss": 0.0248, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.1618497109826589, |
|
"grad_norm": 2.5670031649481944, |
|
"learning_rate": 3.7424846476558716e-06, |
|
"loss": 0.0351, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.1647398843930636, |
|
"grad_norm": 2.260051259828975, |
|
"learning_rate": 3.7205279503505744e-06, |
|
"loss": 0.03, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 1.1676300578034682, |
|
"grad_norm": 2.892119519137359, |
|
"learning_rate": 3.6985976234996957e-06, |
|
"loss": 0.036, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.1705202312138727, |
|
"grad_norm": 2.5690145184191255, |
|
"learning_rate": 3.676694119096446e-06, |
|
"loss": 0.0276, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.1734104046242775, |
|
"grad_norm": 3.8499230666390303, |
|
"learning_rate": 3.6548178885812203e-06, |
|
"loss": 0.0648, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.176300578034682, |
|
"grad_norm": 2.700191616579353, |
|
"learning_rate": 3.6329693828322843e-06, |
|
"loss": 0.0395, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.1791907514450868, |
|
"grad_norm": 2.684567031546241, |
|
"learning_rate": 3.611149052156483e-06, |
|
"loss": 0.0314, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.1820809248554913, |
|
"grad_norm": 2.665135353367264, |
|
"learning_rate": 3.5893573462799685e-06, |
|
"loss": 0.0315, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.1849710982658959, |
|
"grad_norm": 3.352352872583342, |
|
"learning_rate": 3.5675947143389144e-06, |
|
"loss": 0.0381, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.1878612716763006, |
|
"grad_norm": 2.669792807397523, |
|
"learning_rate": 3.545861604870274e-06, |
|
"loss": 0.032, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.1907514450867052, |
|
"grad_norm": 4.631333627989559, |
|
"learning_rate": 3.524158465802531e-06, |
|
"loss": 0.0596, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.19364161849711, |
|
"grad_norm": 2.6548804106561814, |
|
"learning_rate": 3.502485744446462e-06, |
|
"loss": 0.0394, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.1965317919075145, |
|
"grad_norm": 3.0215989136881425, |
|
"learning_rate": 3.4808438874859274e-06, |
|
"loss": 0.0346, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.199421965317919, |
|
"grad_norm": 3.2761815094391795, |
|
"learning_rate": 3.459233340968654e-06, |
|
"loss": 0.0443, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.2023121387283238, |
|
"grad_norm": 2.640717210980499, |
|
"learning_rate": 3.437654550297049e-06, |
|
"loss": 0.0339, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.2052023121387283, |
|
"grad_norm": 2.9354149855485194, |
|
"learning_rate": 3.4161079602190227e-06, |
|
"loss": 0.032, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.208092485549133, |
|
"grad_norm": 3.254143923157989, |
|
"learning_rate": 3.3945940148188117e-06, |
|
"loss": 0.0388, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.2109826589595376, |
|
"grad_norm": 2.331283759083418, |
|
"learning_rate": 3.3731131575078337e-06, |
|
"loss": 0.0255, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.2138728323699421, |
|
"grad_norm": 2.5839607337424977, |
|
"learning_rate": 3.3516658310155493e-06, |
|
"loss": 0.0289, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.216763005780347, |
|
"grad_norm": 2.051445554385587, |
|
"learning_rate": 3.3302524773803326e-06, |
|
"loss": 0.0234, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.2196531791907514, |
|
"grad_norm": 3.1864371805455827, |
|
"learning_rate": 3.3088735379403648e-06, |
|
"loss": 0.0465, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.222543352601156, |
|
"grad_norm": 2.8343323344432156, |
|
"learning_rate": 3.2875294533245355e-06, |
|
"loss": 0.033, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.2254335260115607, |
|
"grad_norm": 3.101107885328744, |
|
"learning_rate": 3.266220663443358e-06, |
|
"loss": 0.0381, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.2283236994219653, |
|
"grad_norm": 3.605841286653843, |
|
"learning_rate": 3.2449476074799137e-06, |
|
"loss": 0.0471, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.2312138728323698, |
|
"grad_norm": 2.732444910454241, |
|
"learning_rate": 3.223710723880786e-06, |
|
"loss": 0.0361, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.2341040462427746, |
|
"grad_norm": 2.1721496892014516, |
|
"learning_rate": 3.202510450347036e-06, |
|
"loss": 0.0251, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.2369942196531791, |
|
"grad_norm": 3.0081441814863386, |
|
"learning_rate": 3.1813472238251742e-06, |
|
"loss": 0.0324, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.239884393063584, |
|
"grad_norm": 2.192339362377574, |
|
"learning_rate": 3.160221480498153e-06, |
|
"loss": 0.0265, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.2427745664739884, |
|
"grad_norm": 3.01343269094507, |
|
"learning_rate": 3.139133655776386e-06, |
|
"loss": 0.0414, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.245664739884393, |
|
"grad_norm": 3.325885420350197, |
|
"learning_rate": 3.1180841842887667e-06, |
|
"loss": 0.0414, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.2485549132947977, |
|
"grad_norm": 3.0528391154930774, |
|
"learning_rate": 3.0970734998737095e-06, |
|
"loss": 0.0356, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.2514450867052023, |
|
"grad_norm": 2.850794422724131, |
|
"learning_rate": 3.0761020355702166e-06, |
|
"loss": 0.0413, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.254335260115607, |
|
"grad_norm": 2.770056911108068, |
|
"learning_rate": 3.055170223608941e-06, |
|
"loss": 0.0351, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.2572254335260116, |
|
"grad_norm": 2.5955998288718716, |
|
"learning_rate": 3.0342784954032893e-06, |
|
"loss": 0.0276, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.260115606936416, |
|
"grad_norm": 3.701530230528004, |
|
"learning_rate": 3.013427281540523e-06, |
|
"loss": 0.0417, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.2630057803468209, |
|
"grad_norm": 3.0698772434133543, |
|
"learning_rate": 2.992617011772885e-06, |
|
"loss": 0.0345, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.2658959537572254, |
|
"grad_norm": 3.9760034819905274, |
|
"learning_rate": 2.9718481150087475e-06, |
|
"loss": 0.0561, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.2687861271676302, |
|
"grad_norm": 3.133018364068792, |
|
"learning_rate": 2.9511210193037614e-06, |
|
"loss": 0.0376, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.2716763005780347, |
|
"grad_norm": 2.9993982887005575, |
|
"learning_rate": 2.9304361518520447e-06, |
|
"loss": 0.0302, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.2745664739884393, |
|
"grad_norm": 2.8666702412666525, |
|
"learning_rate": 2.9097939389773734e-06, |
|
"loss": 0.0293, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.2774566473988438, |
|
"grad_norm": 2.497499578695883, |
|
"learning_rate": 2.8891948061243925e-06, |
|
"loss": 0.0374, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.2803468208092486, |
|
"grad_norm": 3.324055869265504, |
|
"learning_rate": 2.8686391778498536e-06, |
|
"loss": 0.0376, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.2832369942196533, |
|
"grad_norm": 4.425783556605509, |
|
"learning_rate": 2.8481274778138567e-06, |
|
"loss": 0.0487, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.2861271676300579, |
|
"grad_norm": 3.10467455625404, |
|
"learning_rate": 2.827660128771119e-06, |
|
"loss": 0.0369, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.2890173410404624, |
|
"grad_norm": 2.8080755687640417, |
|
"learning_rate": 2.80723755256228e-06, |
|
"loss": 0.034, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.291907514450867, |
|
"grad_norm": 3.2116525331760095, |
|
"learning_rate": 2.786860170105174e-06, |
|
"loss": 0.0392, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.2947976878612717, |
|
"grad_norm": 2.397691984436016, |
|
"learning_rate": 2.766528401386187e-06, |
|
"loss": 0.0309, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.2976878612716762, |
|
"grad_norm": 3.2789036120305117, |
|
"learning_rate": 2.7462426654515797e-06, |
|
"loss": 0.0405, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 1.300578034682081, |
|
"grad_norm": 2.69063461395359, |
|
"learning_rate": 2.726003380398854e-06, |
|
"loss": 0.0408, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.3034682080924855, |
|
"grad_norm": 3.1072958464943072, |
|
"learning_rate": 2.705810963368154e-06, |
|
"loss": 0.0387, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 1.30635838150289, |
|
"grad_norm": 3.5267578595885736, |
|
"learning_rate": 2.685665830533642e-06, |
|
"loss": 0.0383, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.3092485549132948, |
|
"grad_norm": 2.903217987827139, |
|
"learning_rate": 2.665568397094934e-06, |
|
"loss": 0.0297, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.3121387283236994, |
|
"grad_norm": 2.532011759421094, |
|
"learning_rate": 2.6455190772685463e-06, |
|
"loss": 0.0303, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.3150289017341041, |
|
"grad_norm": 3.4412261236059747, |
|
"learning_rate": 2.6255182842793514e-06, |
|
"loss": 0.0386, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.3179190751445087, |
|
"grad_norm": 2.26659906351238, |
|
"learning_rate": 2.6055664303520655e-06, |
|
"loss": 0.0265, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.3208092485549132, |
|
"grad_norm": 2.5513912795757583, |
|
"learning_rate": 2.5856639267027463e-06, |
|
"loss": 0.0314, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 1.323699421965318, |
|
"grad_norm": 2.4879494991424096, |
|
"learning_rate": 2.5658111835303206e-06, |
|
"loss": 0.0325, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.3265895953757225, |
|
"grad_norm": 2.8723586176408302, |
|
"learning_rate": 2.5460086100081366e-06, |
|
"loss": 0.0344, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.3294797687861273, |
|
"grad_norm": 4.178177986784516, |
|
"learning_rate": 2.526256614275524e-06, |
|
"loss": 0.0427, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.3323699421965318, |
|
"grad_norm": 3.2580659867039077, |
|
"learning_rate": 2.506555603429377e-06, |
|
"loss": 0.0408, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 1.3352601156069364, |
|
"grad_norm": 1.9694897719040088, |
|
"learning_rate": 2.486905983515778e-06, |
|
"loss": 0.0211, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.3381502890173411, |
|
"grad_norm": 2.653638767366116, |
|
"learning_rate": 2.4673081595216136e-06, |
|
"loss": 0.0345, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 1.3410404624277457, |
|
"grad_norm": 2.4307989679026547, |
|
"learning_rate": 2.44776253536624e-06, |
|
"loss": 0.0271, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.3439306358381504, |
|
"grad_norm": 3.199779022608156, |
|
"learning_rate": 2.428269513893153e-06, |
|
"loss": 0.0427, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.346820809248555, |
|
"grad_norm": 2.9276329342731935, |
|
"learning_rate": 2.408829496861679e-06, |
|
"loss": 0.0328, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 1.3497109826589595, |
|
"grad_norm": 3.0746209540433793, |
|
"learning_rate": 2.389442884938709e-06, |
|
"loss": 0.037, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 1.352601156069364, |
|
"grad_norm": 2.509196565821076, |
|
"learning_rate": 2.370110077690425e-06, |
|
"loss": 0.0313, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.3554913294797688, |
|
"grad_norm": 2.560118707356469, |
|
"learning_rate": 2.3508314735740763e-06, |
|
"loss": 0.0302, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 1.3583815028901733, |
|
"grad_norm": 2.0668214382886405, |
|
"learning_rate": 2.331607469929765e-06, |
|
"loss": 0.0243, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.361271676300578, |
|
"grad_norm": 2.479561489138274, |
|
"learning_rate": 2.312438462972246e-06, |
|
"loss": 0.0303, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 1.3641618497109826, |
|
"grad_norm": 2.726381817824829, |
|
"learning_rate": 2.2933248477827814e-06, |
|
"loss": 0.037, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.3670520231213872, |
|
"grad_norm": 3.33995350193323, |
|
"learning_rate": 2.274267018300974e-06, |
|
"loss": 0.0389, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 1.369942196531792, |
|
"grad_norm": 2.832996915893155, |
|
"learning_rate": 2.2552653673166676e-06, |
|
"loss": 0.0294, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.3728323699421965, |
|
"grad_norm": 3.400276604343586, |
|
"learning_rate": 2.2363202864618432e-06, |
|
"loss": 0.0302, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.3757225433526012, |
|
"grad_norm": 3.747613052965299, |
|
"learning_rate": 2.2174321662025427e-06, |
|
"loss": 0.045, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.3786127167630058, |
|
"grad_norm": 2.352541139134834, |
|
"learning_rate": 2.1986013958308327e-06, |
|
"loss": 0.0298, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 1.3815028901734103, |
|
"grad_norm": 2.6690961854018447, |
|
"learning_rate": 2.179828363456768e-06, |
|
"loss": 0.0281, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 1.384393063583815, |
|
"grad_norm": 2.564512639491241, |
|
"learning_rate": 2.1611134560004045e-06, |
|
"loss": 0.0242, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 1.3872832369942196, |
|
"grad_norm": 3.3760774282609063, |
|
"learning_rate": 2.1424570591838184e-06, |
|
"loss": 0.037, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.3901734104046244, |
|
"grad_norm": 2.943402455691774, |
|
"learning_rate": 2.123859557523153e-06, |
|
"loss": 0.035, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 1.393063583815029, |
|
"grad_norm": 2.6158536911307824, |
|
"learning_rate": 2.1053213343207045e-06, |
|
"loss": 0.0335, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 1.3959537572254335, |
|
"grad_norm": 2.472107173937846, |
|
"learning_rate": 2.0868427716570078e-06, |
|
"loss": 0.0291, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 1.3988439306358382, |
|
"grad_norm": 2.574302293095002, |
|
"learning_rate": 2.068424250382974e-06, |
|
"loss": 0.0339, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.4017341040462428, |
|
"grad_norm": 2.6025674697204755, |
|
"learning_rate": 2.0500661501120378e-06, |
|
"loss": 0.0309, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.4046242774566475, |
|
"grad_norm": 3.2228489798100224, |
|
"learning_rate": 2.031768849212326e-06, |
|
"loss": 0.0274, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.407514450867052, |
|
"grad_norm": 2.5345873926010776, |
|
"learning_rate": 2.013532724798867e-06, |
|
"loss": 0.0275, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 1.4104046242774566, |
|
"grad_norm": 2.9597299639439254, |
|
"learning_rate": 1.995358152725818e-06, |
|
"loss": 0.0359, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 1.4132947976878611, |
|
"grad_norm": 3.1633383402105415, |
|
"learning_rate": 1.977245507578716e-06, |
|
"loss": 0.0414, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 1.416184971098266, |
|
"grad_norm": 2.190197891627958, |
|
"learning_rate": 1.959195162666759e-06, |
|
"loss": 0.0224, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.4190751445086704, |
|
"grad_norm": 2.5140382935279506, |
|
"learning_rate": 1.9412074900151094e-06, |
|
"loss": 0.0343, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 1.4219653179190752, |
|
"grad_norm": 3.1439956653511385, |
|
"learning_rate": 1.9232828603572255e-06, |
|
"loss": 0.034, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 1.4248554913294798, |
|
"grad_norm": 2.5477953734275802, |
|
"learning_rate": 1.9054216431272293e-06, |
|
"loss": 0.0229, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 1.4277456647398843, |
|
"grad_norm": 2.9742747539659162, |
|
"learning_rate": 1.8876242064522833e-06, |
|
"loss": 0.0345, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 1.430635838150289, |
|
"grad_norm": 2.5865848063902637, |
|
"learning_rate": 1.869890917145003e-06, |
|
"loss": 0.0321, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.4335260115606936, |
|
"grad_norm": 3.124734907713117, |
|
"learning_rate": 1.8522221406959063e-06, |
|
"loss": 0.0426, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 1.4364161849710984, |
|
"grad_norm": 2.4623442770984463, |
|
"learning_rate": 1.8346182412658665e-06, |
|
"loss": 0.0368, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 1.439306358381503, |
|
"grad_norm": 4.111782764544109, |
|
"learning_rate": 1.8170795816786202e-06, |
|
"loss": 0.0525, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 1.4421965317919074, |
|
"grad_norm": 3.239712158794509, |
|
"learning_rate": 1.7996065234132836e-06, |
|
"loss": 0.0263, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 1.4450867052023122, |
|
"grad_norm": 2.334313474090217, |
|
"learning_rate": 1.7821994265968962e-06, |
|
"loss": 0.0271, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.4479768786127167, |
|
"grad_norm": 2.437665058243356, |
|
"learning_rate": 1.7648586499970123e-06, |
|
"loss": 0.0233, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 1.4508670520231215, |
|
"grad_norm": 3.3205593098358976, |
|
"learning_rate": 1.747584551014291e-06, |
|
"loss": 0.0455, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 1.453757225433526, |
|
"grad_norm": 3.517378382534404, |
|
"learning_rate": 1.7303774856751443e-06, |
|
"loss": 0.0377, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 1.4566473988439306, |
|
"grad_norm": 2.737219012324894, |
|
"learning_rate": 1.7132378086243907e-06, |
|
"loss": 0.0317, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 1.4595375722543353, |
|
"grad_norm": 2.3603945703999885, |
|
"learning_rate": 1.6961658731179452e-06, |
|
"loss": 0.0227, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.4624277456647399, |
|
"grad_norm": 3.12571466433377, |
|
"learning_rate": 1.679162031015546e-06, |
|
"loss": 0.0243, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 1.4653179190751446, |
|
"grad_norm": 2.9355074878435428, |
|
"learning_rate": 1.662226632773492e-06, |
|
"loss": 0.0304, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 1.4682080924855492, |
|
"grad_norm": 3.2151717972495195, |
|
"learning_rate": 1.64536002743743e-06, |
|
"loss": 0.044, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 1.4710982658959537, |
|
"grad_norm": 2.5493514288687327, |
|
"learning_rate": 1.628562562635157e-06, |
|
"loss": 0.0277, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 1.4739884393063583, |
|
"grad_norm": 2.925418140628061, |
|
"learning_rate": 1.6118345845694489e-06, |
|
"loss": 0.0326, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.476878612716763, |
|
"grad_norm": 4.364624930763201, |
|
"learning_rate": 1.5951764380109374e-06, |
|
"loss": 0.0388, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 1.4797687861271676, |
|
"grad_norm": 2.422471626390923, |
|
"learning_rate": 1.5785884662909917e-06, |
|
"loss": 0.0256, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 1.4826589595375723, |
|
"grad_norm": 2.8693832519867573, |
|
"learning_rate": 1.5620710112946536e-06, |
|
"loss": 0.0344, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 1.4855491329479769, |
|
"grad_norm": 2.4815608656291386, |
|
"learning_rate": 1.5456244134535836e-06, |
|
"loss": 0.0245, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 1.4884393063583814, |
|
"grad_norm": 1.4984532129341321, |
|
"learning_rate": 1.5292490117390457e-06, |
|
"loss": 0.017, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.4913294797687862, |
|
"grad_norm": 2.805421146569415, |
|
"learning_rate": 1.5129451436549203e-06, |
|
"loss": 0.0379, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 1.4942196531791907, |
|
"grad_norm": 2.212196592484195, |
|
"learning_rate": 1.4967131452307537e-06, |
|
"loss": 0.0284, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 1.4971098265895955, |
|
"grad_norm": 3.1407969224113783, |
|
"learning_rate": 1.4805533510148268e-06, |
|
"loss": 0.0357, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 3.2595411846223183, |
|
"learning_rate": 1.4644660940672628e-06, |
|
"loss": 0.0356, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 1.5028901734104045, |
|
"grad_norm": 3.28488525309383, |
|
"learning_rate": 1.4484517059531588e-06, |
|
"loss": 0.0355, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.5057803468208093, |
|
"grad_norm": 3.0922561562434017, |
|
"learning_rate": 1.4325105167357545e-06, |
|
"loss": 0.0339, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 1.5086705202312138, |
|
"grad_norm": 2.3544575695219043, |
|
"learning_rate": 1.416642854969632e-06, |
|
"loss": 0.0226, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 1.5115606936416186, |
|
"grad_norm": 3.346063294192223, |
|
"learning_rate": 1.4008490476939423e-06, |
|
"loss": 0.0385, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 1.5144508670520231, |
|
"grad_norm": 3.1038077146303085, |
|
"learning_rate": 1.3851294204256638e-06, |
|
"loss": 0.0335, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 1.5173410404624277, |
|
"grad_norm": 2.8284054263519147, |
|
"learning_rate": 1.3694842971528927e-06, |
|
"loss": 0.0268, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.5202312138728322, |
|
"grad_norm": 3.85505423838039, |
|
"learning_rate": 1.3539140003281647e-06, |
|
"loss": 0.0297, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 1.523121387283237, |
|
"grad_norm": 2.610739320006818, |
|
"learning_rate": 1.3384188508618157e-06, |
|
"loss": 0.0303, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 1.5260115606936417, |
|
"grad_norm": 3.1615204643844095, |
|
"learning_rate": 1.3229991681153632e-06, |
|
"loss": 0.0384, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.5289017341040463, |
|
"grad_norm": 2.5500870773853053, |
|
"learning_rate": 1.3076552698949175e-06, |
|
"loss": 0.022, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.5317919075144508, |
|
"grad_norm": 3.148763012910272, |
|
"learning_rate": 1.2923874724446472e-06, |
|
"loss": 0.0348, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.5346820809248554, |
|
"grad_norm": 3.3175768415192475, |
|
"learning_rate": 1.277196090440243e-06, |
|
"loss": 0.0337, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 1.5375722543352601, |
|
"grad_norm": 3.644399114002089, |
|
"learning_rate": 1.262081436982448e-06, |
|
"loss": 0.038, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 1.5404624277456649, |
|
"grad_norm": 2.585122401857608, |
|
"learning_rate": 1.2470438235905975e-06, |
|
"loss": 0.0288, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 1.5433526011560694, |
|
"grad_norm": 3.1018957028076097, |
|
"learning_rate": 1.2320835601961928e-06, |
|
"loss": 0.0358, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 1.546242774566474, |
|
"grad_norm": 2.238711021090234, |
|
"learning_rate": 1.217200955136527e-06, |
|
"loss": 0.0247, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.5491329479768785, |
|
"grad_norm": 2.6791734885813394, |
|
"learning_rate": 1.2023963151483165e-06, |
|
"loss": 0.0296, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 1.5520231213872833, |
|
"grad_norm": 2.841826452346151, |
|
"learning_rate": 1.1876699453613883e-06, |
|
"loss": 0.0404, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 1.5549132947976878, |
|
"grad_norm": 2.4678639583105433, |
|
"learning_rate": 1.1730221492923882e-06, |
|
"loss": 0.0273, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 1.5578034682080926, |
|
"grad_norm": 2.4973929272338644, |
|
"learning_rate": 1.1584532288385209e-06, |
|
"loss": 0.0256, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 1.560693641618497, |
|
"grad_norm": 2.341033650504072, |
|
"learning_rate": 1.1439634842713371e-06, |
|
"loss": 0.0227, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.5635838150289016, |
|
"grad_norm": 3.065988190194671, |
|
"learning_rate": 1.1295532142305332e-06, |
|
"loss": 0.0313, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 1.5664739884393064, |
|
"grad_norm": 2.1835285043969828, |
|
"learning_rate": 1.115222715717807e-06, |
|
"loss": 0.0208, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 1.569364161849711, |
|
"grad_norm": 3.500157776002619, |
|
"learning_rate": 1.1009722840907316e-06, |
|
"loss": 0.0319, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 1.5722543352601157, |
|
"grad_norm": 3.2214524688064574, |
|
"learning_rate": 1.0868022130566652e-06, |
|
"loss": 0.0416, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 1.5751445086705202, |
|
"grad_norm": 3.04668472610985, |
|
"learning_rate": 1.0727127946667032e-06, |
|
"loss": 0.0306, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.5780346820809248, |
|
"grad_norm": 2.679583691046011, |
|
"learning_rate": 1.0587043193096535e-06, |
|
"loss": 0.0308, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.5809248554913293, |
|
"grad_norm": 2.4842548396047226, |
|
"learning_rate": 1.0447770757060571e-06, |
|
"loss": 0.0328, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 1.583815028901734, |
|
"grad_norm": 3.2742820682763085, |
|
"learning_rate": 1.030931350902235e-06, |
|
"loss": 0.0352, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 1.5867052023121389, |
|
"grad_norm": 2.4207210641270485, |
|
"learning_rate": 1.017167430264368e-06, |
|
"loss": 0.0279, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 1.5895953757225434, |
|
"grad_norm": 2.152127329332973, |
|
"learning_rate": 1.0034855974726194e-06, |
|
"loss": 0.0201, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.592485549132948, |
|
"grad_norm": 3.0074798195082653, |
|
"learning_rate": 9.89886134515291e-07, |
|
"loss": 0.0338, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 1.5953757225433525, |
|
"grad_norm": 3.184876769094888, |
|
"learning_rate": 9.763693216830055e-07, |
|
"loss": 0.038, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 1.5982658959537572, |
|
"grad_norm": 2.869846326145178, |
|
"learning_rate": 9.629354375629341e-07, |
|
"loss": 0.0276, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 1.601156069364162, |
|
"grad_norm": 2.585461539082119, |
|
"learning_rate": 9.495847590330486e-07, |
|
"loss": 0.0324, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 1.6040462427745665, |
|
"grad_norm": 2.742836913299859, |
|
"learning_rate": 9.363175612564202e-07, |
|
"loss": 0.0302, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.606936416184971, |
|
"grad_norm": 3.2924360152641494, |
|
"learning_rate": 9.231341176755487e-07, |
|
"loss": 0.0297, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 1.6098265895953756, |
|
"grad_norm": 2.686788179114119, |
|
"learning_rate": 9.10034700006725e-07, |
|
"loss": 0.0321, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 1.6127167630057804, |
|
"grad_norm": 3.0218421958689063, |
|
"learning_rate": 8.970195782344266e-07, |
|
"loss": 0.0332, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 1.6156069364161851, |
|
"grad_norm": 2.3212479985232077, |
|
"learning_rate": 8.840890206057634e-07, |
|
"loss": 0.0261, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 1.6184971098265897, |
|
"grad_norm": 2.8736201021277377, |
|
"learning_rate": 8.712432936249365e-07, |
|
"loss": 0.03, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.6213872832369942, |
|
"grad_norm": 2.3595839968196866, |
|
"learning_rate": 8.584826620477566e-07, |
|
"loss": 0.0408, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 1.6242774566473988, |
|
"grad_norm": 3.0347610178615674, |
|
"learning_rate": 8.458073888761826e-07, |
|
"loss": 0.0362, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 1.6271676300578035, |
|
"grad_norm": 2.38603129609543, |
|
"learning_rate": 8.332177353528964e-07, |
|
"loss": 0.0251, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 1.630057803468208, |
|
"grad_norm": 2.6055073766794687, |
|
"learning_rate": 8.207139609559284e-07, |
|
"loss": 0.0253, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 1.6329479768786128, |
|
"grad_norm": 2.583083431789858, |
|
"learning_rate": 8.082963233932995e-07, |
|
"loss": 0.0259, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.6358381502890174, |
|
"grad_norm": 2.0515263932259518, |
|
"learning_rate": 7.959650785977179e-07, |
|
"loss": 0.0217, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 1.638728323699422, |
|
"grad_norm": 2.746779673644074, |
|
"learning_rate": 7.837204807213017e-07, |
|
"loss": 0.0314, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 1.6416184971098264, |
|
"grad_norm": 2.3296657216334102, |
|
"learning_rate": 7.71562782130334e-07, |
|
"loss": 0.0273, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 1.6445086705202312, |
|
"grad_norm": 2.3206553449940284, |
|
"learning_rate": 7.594922334000738e-07, |
|
"loss": 0.0202, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 1.647398843930636, |
|
"grad_norm": 3.801639203355854, |
|
"learning_rate": 7.475090833095799e-07, |
|
"loss": 0.0451, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.6502890173410405, |
|
"grad_norm": 2.3365922810094872, |
|
"learning_rate": 7.356135788365915e-07, |
|
"loss": 0.0216, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 1.653179190751445, |
|
"grad_norm": 2.7038418344399258, |
|
"learning_rate": 7.238059651524354e-07, |
|
"loss": 0.0238, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 1.6560693641618496, |
|
"grad_norm": 2.9959245625608184, |
|
"learning_rate": 7.120864856169696e-07, |
|
"loss": 0.0305, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 1.6589595375722543, |
|
"grad_norm": 2.3080167609962046, |
|
"learning_rate": 7.004553817735732e-07, |
|
"loss": 0.0217, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 1.661849710982659, |
|
"grad_norm": 2.4821282413657033, |
|
"learning_rate": 6.88912893344163e-07, |
|
"loss": 0.0267, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.6647398843930636, |
|
"grad_norm": 2.653018792418636, |
|
"learning_rate": 6.774592582242567e-07, |
|
"loss": 0.0298, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 1.6676300578034682, |
|
"grad_norm": 2.642689505666662, |
|
"learning_rate": 6.660947124780686e-07, |
|
"loss": 0.0254, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 1.6705202312138727, |
|
"grad_norm": 3.169845791000904, |
|
"learning_rate": 6.548194903336408e-07, |
|
"loss": 0.031, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 1.6734104046242775, |
|
"grad_norm": 2.9465397732730363, |
|
"learning_rate": 6.436338241780227e-07, |
|
"loss": 0.0338, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 1.6763005780346822, |
|
"grad_norm": 2.2311471874361373, |
|
"learning_rate": 6.325379445524732e-07, |
|
"loss": 0.0229, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.6791907514450868, |
|
"grad_norm": 3.081369160616298, |
|
"learning_rate": 6.215320801477154e-07, |
|
"loss": 0.0363, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 1.6820809248554913, |
|
"grad_norm": 2.421411126342786, |
|
"learning_rate": 6.106164577992224e-07, |
|
"loss": 0.0271, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 1.6849710982658959, |
|
"grad_norm": 2.9947960888538767, |
|
"learning_rate": 5.99791302482538e-07, |
|
"loss": 0.0337, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 1.6878612716763006, |
|
"grad_norm": 2.227042908760082, |
|
"learning_rate": 5.890568373086425e-07, |
|
"loss": 0.0211, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 1.6907514450867052, |
|
"grad_norm": 3.712493485936692, |
|
"learning_rate": 5.784132835193562e-07, |
|
"loss": 0.0334, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.69364161849711, |
|
"grad_norm": 2.725787651475256, |
|
"learning_rate": 5.678608604827784e-07, |
|
"loss": 0.0297, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 1.6965317919075145, |
|
"grad_norm": 2.1287017206524106, |
|
"learning_rate": 5.573997856887642e-07, |
|
"loss": 0.0227, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 1.699421965317919, |
|
"grad_norm": 2.0962651279994864, |
|
"learning_rate": 5.470302747444428e-07, |
|
"loss": 0.0222, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 1.7023121387283235, |
|
"grad_norm": 2.2102726315322005, |
|
"learning_rate": 5.367525413697716e-07, |
|
"loss": 0.0198, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 1.7052023121387283, |
|
"grad_norm": 3.081800122703875, |
|
"learning_rate": 5.265667973931416e-07, |
|
"loss": 0.0293, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.708092485549133, |
|
"grad_norm": 2.5073112696737234, |
|
"learning_rate": 5.164732527469968e-07, |
|
"loss": 0.0255, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 1.7109826589595376, |
|
"grad_norm": 2.9631917652655515, |
|
"learning_rate": 5.064721154635155e-07, |
|
"loss": 0.0238, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 1.7138728323699421, |
|
"grad_norm": 2.9449876977836835, |
|
"learning_rate": 4.965635916703248e-07, |
|
"loss": 0.0266, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 1.7167630057803467, |
|
"grad_norm": 2.8389071960037877, |
|
"learning_rate": 4.86747885586244e-07, |
|
"loss": 0.0289, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 1.7196531791907514, |
|
"grad_norm": 3.7075338546107095, |
|
"learning_rate": 4.770251995170871e-07, |
|
"loss": 0.0354, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.7225433526011562, |
|
"grad_norm": 2.8275729208383127, |
|
"learning_rate": 4.673957338514812e-07, |
|
"loss": 0.0284, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 1.7254335260115607, |
|
"grad_norm": 2.394113671682943, |
|
"learning_rate": 4.5785968705674255e-07, |
|
"loss": 0.026, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 1.7283236994219653, |
|
"grad_norm": 2.6354299148272378, |
|
"learning_rate": 4.48417255674789e-07, |
|
"loss": 0.0285, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 1.7312138728323698, |
|
"grad_norm": 2.487760980340726, |
|
"learning_rate": 4.3906863431808e-07, |
|
"loss": 0.0258, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 1.7341040462427746, |
|
"grad_norm": 2.7589247543697226, |
|
"learning_rate": 4.298140156656178e-07, |
|
"loss": 0.0286, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.7341040462427746, |
|
"eval_loss": 0.08402061462402344, |
|
"eval_runtime": 0.9305, |
|
"eval_samples_per_second": 30.092, |
|
"eval_steps_per_second": 7.523, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.7369942196531793, |
|
"grad_norm": 3.2204676092038107, |
|
"learning_rate": 4.2065359045896427e-07, |
|
"loss": 0.0302, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 1.739884393063584, |
|
"grad_norm": 4.899074590727239, |
|
"learning_rate": 4.115875474983161e-07, |
|
"loss": 0.0279, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 1.7427745664739884, |
|
"grad_norm": 3.1374643269567724, |
|
"learning_rate": 4.0261607363861365e-07, |
|
"loss": 0.0344, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 1.745664739884393, |
|
"grad_norm": 2.749091436221168, |
|
"learning_rate": 3.937393537856871e-07, |
|
"loss": 0.0305, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 1.7485549132947977, |
|
"grad_norm": 2.741639181393763, |
|
"learning_rate": 3.84957570892448e-07, |
|
"loss": 0.0308, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.7514450867052023, |
|
"grad_norm": 3.8598686607362223, |
|
"learning_rate": 3.762709059551184e-07, |
|
"loss": 0.0467, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 1.754335260115607, |
|
"grad_norm": 2.3025890101533317, |
|
"learning_rate": 3.6767953800949554e-07, |
|
"loss": 0.0226, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 1.7572254335260116, |
|
"grad_norm": 1.902682668166002, |
|
"learning_rate": 3.5918364412727004e-07, |
|
"loss": 0.0181, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 1.760115606936416, |
|
"grad_norm": 3.3068575007443783, |
|
"learning_rate": 3.5078339941237107e-07, |
|
"loss": 0.0309, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 1.7630057803468207, |
|
"grad_norm": 2.476320454660619, |
|
"learning_rate": 3.4247897699735575e-07, |
|
"loss": 0.0339, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.7658959537572254, |
|
"grad_norm": 2.8791371798867584, |
|
"learning_rate": 3.3427054803984784e-07, |
|
"loss": 0.0279, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 1.7687861271676302, |
|
"grad_norm": 2.0036596494241343, |
|
"learning_rate": 3.2615828171900234e-07, |
|
"loss": 0.0206, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 1.7716763005780347, |
|
"grad_norm": 2.103998944647541, |
|
"learning_rate": 3.181423452320209e-07, |
|
"loss": 0.0194, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 1.7745664739884393, |
|
"grad_norm": 2.995779745958891, |
|
"learning_rate": 3.102229037907134e-07, |
|
"loss": 0.0281, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 1.7774566473988438, |
|
"grad_norm": 3.388218734539243, |
|
"learning_rate": 3.024001206180799e-07, |
|
"loss": 0.031, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.7803468208092486, |
|
"grad_norm": 2.8090278682040197, |
|
"learning_rate": 2.946741569449563e-07, |
|
"loss": 0.0262, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 1.7832369942196533, |
|
"grad_norm": 2.4275238476413845, |
|
"learning_rate": 2.8704517200668746e-07, |
|
"loss": 0.0246, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 1.7861271676300579, |
|
"grad_norm": 2.8947907296916604, |
|
"learning_rate": 2.7951332303984335e-07, |
|
"loss": 0.0273, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 1.7890173410404624, |
|
"grad_norm": 2.842966900220881, |
|
"learning_rate": 2.7207876527898746e-07, |
|
"loss": 0.0345, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 1.791907514450867, |
|
"grad_norm": 2.7282199212357816, |
|
"learning_rate": 2.6474165195346346e-07, |
|
"loss": 0.0285, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.7947976878612717, |
|
"grad_norm": 2.9452491096470603, |
|
"learning_rate": 2.575021342842493e-07, |
|
"loss": 0.0303, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 1.7976878612716765, |
|
"grad_norm": 2.595963079044866, |
|
"learning_rate": 2.5036036148083367e-07, |
|
"loss": 0.029, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 1.800578034682081, |
|
"grad_norm": 3.1034369862005313, |
|
"learning_rate": 2.4331648073814107e-07, |
|
"loss": 0.0362, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 1.8034682080924855, |
|
"grad_norm": 2.0293576730904404, |
|
"learning_rate": 2.363706372335045e-07, |
|
"loss": 0.0214, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 1.80635838150289, |
|
"grad_norm": 3.0290797901523447, |
|
"learning_rate": 2.2952297412366432e-07, |
|
"loss": 0.0285, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.8092485549132948, |
|
"grad_norm": 2.4301291771331375, |
|
"learning_rate": 2.2277363254182228e-07, |
|
"loss": 0.0306, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 1.8121387283236994, |
|
"grad_norm": 2.0209004900593452, |
|
"learning_rate": 2.161227515947334e-07, |
|
"loss": 0.0219, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 1.8150289017341041, |
|
"grad_norm": 2.272922638009751, |
|
"learning_rate": 2.0957046835983764e-07, |
|
"loss": 0.0261, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 1.8179190751445087, |
|
"grad_norm": 2.267867365649924, |
|
"learning_rate": 2.0311691788243548e-07, |
|
"loss": 0.0226, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 1.8208092485549132, |
|
"grad_norm": 2.6833144758951826, |
|
"learning_rate": 1.9676223317290245e-07, |
|
"loss": 0.0292, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.8236994219653178, |
|
"grad_norm": 2.7577394624568763, |
|
"learning_rate": 1.905065452039495e-07, |
|
"loss": 0.0328, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 1.8265895953757225, |
|
"grad_norm": 2.507658068123955, |
|
"learning_rate": 1.8434998290792373e-07, |
|
"loss": 0.0273, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 1.8294797687861273, |
|
"grad_norm": 2.0490162702647226, |
|
"learning_rate": 1.7829267317415188e-07, |
|
"loss": 0.0242, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 1.8323699421965318, |
|
"grad_norm": 2.7696464782973256, |
|
"learning_rate": 1.7233474084632107e-07, |
|
"loss": 0.0282, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 1.8352601156069364, |
|
"grad_norm": 3.3311577875403917, |
|
"learning_rate": 1.6647630871991116e-07, |
|
"loss": 0.0263, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.838150289017341, |
|
"grad_norm": 2.4750883996948674, |
|
"learning_rate": 1.6071749753965914e-07, |
|
"loss": 0.0274, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 1.8410404624277457, |
|
"grad_norm": 3.173715006325098, |
|
"learning_rate": 1.5505842599707442e-07, |
|
"loss": 0.0343, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 1.8439306358381504, |
|
"grad_norm": 3.489741224468283, |
|
"learning_rate": 1.4949921072798967e-07, |
|
"loss": 0.035, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 1.846820809248555, |
|
"grad_norm": 2.882500598594475, |
|
"learning_rate": 1.440399663101577e-07, |
|
"loss": 0.0277, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 1.8497109826589595, |
|
"grad_norm": 3.233374202089709, |
|
"learning_rate": 1.386808052608918e-07, |
|
"loss": 0.0391, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.852601156069364, |
|
"grad_norm": 2.7034183642322347, |
|
"learning_rate": 1.334218380347424e-07, |
|
"loss": 0.0292, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 1.8554913294797688, |
|
"grad_norm": 2.197194297058419, |
|
"learning_rate": 1.282631730212258e-07, |
|
"loss": 0.0249, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 1.8583815028901736, |
|
"grad_norm": 2.9036298407234313, |
|
"learning_rate": 1.2320491654258803e-07, |
|
"loss": 0.0298, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 1.861271676300578, |
|
"grad_norm": 2.8094195877802393, |
|
"learning_rate": 1.1824717285160992e-07, |
|
"loss": 0.0298, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 1.8641618497109826, |
|
"grad_norm": 2.717689585519079, |
|
"learning_rate": 1.1339004412946553e-07, |
|
"loss": 0.0301, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.8670520231213872, |
|
"grad_norm": 3.2884617474113127, |
|
"learning_rate": 1.0863363048360942e-07, |
|
"loss": 0.042, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 1.869942196531792, |
|
"grad_norm": 2.142509289269094, |
|
"learning_rate": 1.0397802994571826e-07, |
|
"loss": 0.0212, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 1.8728323699421965, |
|
"grad_norm": 3.6837900459823167, |
|
"learning_rate": 9.942333846966745e-08, |
|
"loss": 0.0443, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 1.8757225433526012, |
|
"grad_norm": 3.331072208277014, |
|
"learning_rate": 9.496964992955382e-08, |
|
"loss": 0.03, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 1.8786127167630058, |
|
"grad_norm": 2.231335330345806, |
|
"learning_rate": 9.061705611776273e-08, |
|
"loss": 0.0285, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.8815028901734103, |
|
"grad_norm": 2.6317094385765016, |
|
"learning_rate": 8.636564674307402e-08, |
|
"loss": 0.0334, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 1.8843930635838149, |
|
"grad_norm": 3.3131881744849396, |
|
"learning_rate": 8.221550942881406e-08, |
|
"loss": 0.0342, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 1.8872832369942196, |
|
"grad_norm": 3.150237302456506, |
|
"learning_rate": 7.816672971105055e-08, |
|
"loss": 0.0276, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 1.8901734104046244, |
|
"grad_norm": 1.9693826852564784, |
|
"learning_rate": 7.421939103682662e-08, |
|
"loss": 0.0209, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 1.893063583815029, |
|
"grad_norm": 2.337524129982516, |
|
"learning_rate": 7.037357476244566e-08, |
|
"loss": 0.0247, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.8959537572254335, |
|
"grad_norm": 2.8829233540951984, |
|
"learning_rate": 6.662936015178978e-08, |
|
"loss": 0.0313, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 1.898843930635838, |
|
"grad_norm": 3.3070223112681676, |
|
"learning_rate": 6.298682437468895e-08, |
|
"loss": 0.0351, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 1.9017341040462428, |
|
"grad_norm": 1.9799003747289656, |
|
"learning_rate": 5.9446042505330594e-08, |
|
"loss": 0.0235, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 1.9046242774566475, |
|
"grad_norm": 2.8571160793919206, |
|
"learning_rate": 5.600708752071082e-08, |
|
"loss": 0.0359, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 1.907514450867052, |
|
"grad_norm": 2.7689616678406055, |
|
"learning_rate": 5.267003029913065e-08, |
|
"loss": 0.0236, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.9104046242774566, |
|
"grad_norm": 2.762628793592965, |
|
"learning_rate": 4.943493961873658e-08, |
|
"loss": 0.023, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 1.9132947976878611, |
|
"grad_norm": 2.930260556434489, |
|
"learning_rate": 4.630188215610065e-08, |
|
"loss": 0.0235, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 1.916184971098266, |
|
"grad_norm": 2.358664696088892, |
|
"learning_rate": 4.327092248484932e-08, |
|
"loss": 0.0272, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 1.9190751445086707, |
|
"grad_norm": 3.09076782112662, |
|
"learning_rate": 4.03421230743295e-08, |
|
"loss": 0.0343, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 1.9219653179190752, |
|
"grad_norm": 1.7621073890549188, |
|
"learning_rate": 3.751554428832238e-08, |
|
"loss": 0.0177, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.9248554913294798, |
|
"grad_norm": 2.1762751213497933, |
|
"learning_rate": 3.4791244383799994e-08, |
|
"loss": 0.0185, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 1.9277456647398843, |
|
"grad_norm": 2.5431146597843997, |
|
"learning_rate": 3.216927950972393e-08, |
|
"loss": 0.0322, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 1.930635838150289, |
|
"grad_norm": 2.417193867852855, |
|
"learning_rate": 2.964970370588738e-08, |
|
"loss": 0.025, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 1.9335260115606936, |
|
"grad_norm": 2.2455101858667565, |
|
"learning_rate": 2.7232568901801592e-08, |
|
"loss": 0.0277, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 1.9364161849710984, |
|
"grad_norm": 2.4713814937235385, |
|
"learning_rate": 2.4917924915626725e-08, |
|
"loss": 0.0252, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.939306358381503, |
|
"grad_norm": 2.7741728341467775, |
|
"learning_rate": 2.2705819453144316e-08, |
|
"loss": 0.0233, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 1.9421965317919074, |
|
"grad_norm": 3.2682376836372056, |
|
"learning_rate": 2.0596298106774214e-08, |
|
"loss": 0.0342, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 1.9450867052023122, |
|
"grad_norm": 2.582049310045216, |
|
"learning_rate": 1.8589404354632523e-08, |
|
"loss": 0.03, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 1.9479768786127167, |
|
"grad_norm": 2.429361450682195, |
|
"learning_rate": 1.6685179559641217e-08, |
|
"loss": 0.0233, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 1.9508670520231215, |
|
"grad_norm": 3.132745641356759, |
|
"learning_rate": 1.4883662968669387e-08, |
|
"loss": 0.0407, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.953757225433526, |
|
"grad_norm": 2.3239626051059386, |
|
"learning_rate": 1.3184891711727766e-08, |
|
"loss": 0.0247, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 1.9566473988439306, |
|
"grad_norm": 2.477156881588747, |
|
"learning_rate": 1.1588900801203229e-08, |
|
"loss": 0.0228, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 1.9595375722543351, |
|
"grad_norm": 2.139721681837424, |
|
"learning_rate": 1.0095723131136603e-08, |
|
"loss": 0.0252, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 1.9624277456647399, |
|
"grad_norm": 3.426665247902062, |
|
"learning_rate": 8.705389476543758e-09, |
|
"loss": 0.0267, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 1.9653179190751446, |
|
"grad_norm": 2.8650595430397288, |
|
"learning_rate": 7.417928492784443e-09, |
|
"loss": 0.0323, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.9682080924855492, |
|
"grad_norm": 2.7291850369640613, |
|
"learning_rate": 6.233366714967215e-09, |
|
"loss": 0.026, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 1.9710982658959537, |
|
"grad_norm": 3.359249209071856, |
|
"learning_rate": 5.151728557406532e-09, |
|
"loss": 0.0357, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 1.9739884393063583, |
|
"grad_norm": 2.6983693308255314, |
|
"learning_rate": 4.173036313117607e-09, |
|
"loss": 0.0268, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 1.976878612716763, |
|
"grad_norm": 2.2142359024895644, |
|
"learning_rate": 3.2973101533567698e-09, |
|
"loss": 0.0277, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 1.9797687861271678, |
|
"grad_norm": 2.3357628732709474, |
|
"learning_rate": 2.5245681272068057e-09, |
|
"loss": 0.0229, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.9826589595375723, |
|
"grad_norm": 2.1678730353523386, |
|
"learning_rate": 1.8548261612050255e-09, |
|
"loss": 0.0236, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 1.9855491329479769, |
|
"grad_norm": 2.90668259735446, |
|
"learning_rate": 1.2880980590124214e-09, |
|
"loss": 0.0325, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 1.9884393063583814, |
|
"grad_norm": 1.775137098330101, |
|
"learning_rate": 8.243955011333349e-10, |
|
"loss": 0.0188, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 1.9913294797687862, |
|
"grad_norm": 3.1118509748289944, |
|
"learning_rate": 4.637280446712078e-10, |
|
"loss": 0.034, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 1.9942196531791907, |
|
"grad_norm": 2.7054381917248325, |
|
"learning_rate": 2.0610312313318336e-10, |
|
"loss": 0.0344, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.9971098265895955, |
|
"grad_norm": 2.7307481010776367, |
|
"learning_rate": 5.152604627634006e-11, |
|
"loss": 0.0265, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.7811358836387599, |
|
"learning_rate": 0.0, |
|
"loss": 0.0164, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 692, |
|
"total_flos": 1412134244352.0, |
|
"train_loss": 0.0628656179461133, |
|
"train_runtime": 422.1909, |
|
"train_samples_per_second": 13.103, |
|
"train_steps_per_second": 1.639 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 692, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1412134244352.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|