diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14213 @@ +{ + "best_metric": 0.15097101032733917, + "best_model_checkpoint": "saved_model/lop_sep_2024/checkpoint-13492", + "epoch": 2.999888827126181, + "eval_steps": 500, + "global_step": 20238, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": NaN, + "learning_rate": 0.0, + "loss": 70.4771, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 18.768062591552734, + "learning_rate": 3.5000000000000004e-06, + "loss": 70.9315, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 19.307415008544922, + "learning_rate": 8.000000000000001e-06, + "loss": 70.6744, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 22.3438663482666, + "learning_rate": 1.2e-05, + "loss": 68.4448, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 27.788856506347656, + "learning_rate": 1.7000000000000003e-05, + "loss": 64.7802, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 34.86759948730469, + "learning_rate": 2.15e-05, + "loss": 56.6582, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 43.42910385131836, + "learning_rate": 2.6500000000000004e-05, + "loss": 42.3126, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 14.587575912475586, + "learning_rate": 3.15e-05, + "loss": 20.2735, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 12.488475799560547, + "learning_rate": 3.65e-05, + "loss": 6.4981, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 3.4391798973083496, + "learning_rate": 4.15e-05, + "loss": 0.9414, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 1.6826833486557007, + "learning_rate": 4.6500000000000005e-05, + "loss": 0.4819, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 2.1236026287078857, + "learning_rate": 5.1500000000000005e-05, + "loss": 0.4228, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 2.420114278793335, + "learning_rate": 5.65e-05, + "loss": 0.4401, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 2.8946385383605957, + "learning_rate": 6.15e-05, + "loss": 0.6266, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 12.989470481872559, + "learning_rate": 6.65e-05, + "loss": 0.5741, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 2.2886247634887695, + "learning_rate": 7.15e-05, + "loss": 0.4404, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 3.232952833175659, + "learning_rate": 7.65e-05, + "loss": 0.4768, + "step": 160 + }, + { + "epoch": 0.03, + "grad_norm": 2.457054615020752, + "learning_rate": 8.15e-05, + "loss": 0.4633, + "step": 170 + }, + { + "epoch": 0.03, + "grad_norm": 1.454648733139038, + "learning_rate": 8.65e-05, + "loss": 0.4295, + "step": 180 + }, + { + "epoch": 0.03, + "grad_norm": 3.4959018230438232, + "learning_rate": 9.15e-05, + "loss": 0.4526, + "step": 190 + }, + { + "epoch": 0.03, + "grad_norm": 1.8352223634719849, + "learning_rate": 9.65e-05, + "loss": 0.3898, + "step": 200 + }, + { + "epoch": 0.03, + "grad_norm": 2.780531406402588, + "learning_rate": 9.999553969669938e-05, + "loss": 0.3962, + "step": 210 + }, + { + "epoch": 0.03, + "grad_norm": 2.455747127532959, + "learning_rate": 9.998067201903063e-05, + "loss": 0.4071, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 1.8026726245880127, + "learning_rate": 9.996580434136188e-05, + "loss": 0.383, + "step": 230 + }, + { + "epoch": 0.04, + "grad_norm": 1.4635368585586548, + "learning_rate": 9.995093666369314e-05, + "loss": 0.4433, + "step": 240 + }, + { + "epoch": 0.04, + "grad_norm": 4.1103901863098145, + "learning_rate": 9.993606898602439e-05, + "loss": 0.3985, + "step": 250 + }, + { + "epoch": 0.04, + "grad_norm": 4.443718910217285, + "learning_rate": 9.992120130835564e-05, + "loss": 0.3978, + "step": 260 + }, + { + "epoch": 0.04, + "grad_norm": 5.011767387390137, + "learning_rate": 9.990633363068688e-05, + "loss": 0.4696, + "step": 270 + }, + { + "epoch": 0.04, + "grad_norm": 5.497318744659424, + "learning_rate": 9.989146595301815e-05, + "loss": 0.4204, + "step": 280 + }, + { + "epoch": 0.04, + "grad_norm": 1.5428732633590698, + "learning_rate": 9.98765982753494e-05, + "loss": 0.4107, + "step": 290 + }, + { + "epoch": 0.04, + "grad_norm": 2.399179220199585, + "learning_rate": 9.986173059768065e-05, + "loss": 0.3831, + "step": 300 + }, + { + "epoch": 0.05, + "grad_norm": 1.3291690349578857, + "learning_rate": 9.98468629200119e-05, + "loss": 0.3904, + "step": 310 + }, + { + "epoch": 0.05, + "grad_norm": 1.5392258167266846, + "learning_rate": 9.983199524234314e-05, + "loss": 0.3349, + "step": 320 + }, + { + "epoch": 0.05, + "grad_norm": 4.950677871704102, + "learning_rate": 9.981712756467441e-05, + "loss": 0.3969, + "step": 330 + }, + { + "epoch": 0.05, + "grad_norm": 5.227260112762451, + "learning_rate": 9.980225988700565e-05, + "loss": 0.4026, + "step": 340 + }, + { + "epoch": 0.05, + "grad_norm": 1.5111877918243408, + "learning_rate": 9.97873922093369e-05, + "loss": 0.3014, + "step": 350 + }, + { + "epoch": 0.05, + "grad_norm": 2.647972583770752, + "learning_rate": 9.977252453166816e-05, + "loss": 0.2976, + "step": 360 + }, + { + "epoch": 0.05, + "grad_norm": 3.595974922180176, + "learning_rate": 9.975765685399942e-05, + "loss": 0.3304, + "step": 370 + }, + { + "epoch": 0.06, + "grad_norm": 3.389302968978882, + "learning_rate": 9.974278917633067e-05, + "loss": 0.2689, + "step": 380 + }, + { + "epoch": 0.06, + "grad_norm": 2.902669906616211, + "learning_rate": 9.972792149866191e-05, + "loss": 0.2924, + "step": 390 + }, + { + "epoch": 0.06, + "grad_norm": 3.135059118270874, + "learning_rate": 9.971305382099317e-05, + "loss": 0.3061, + "step": 400 + }, + { + "epoch": 0.06, + "grad_norm": 3.143860101699829, + "learning_rate": 9.969818614332441e-05, + "loss": 0.2687, + "step": 410 + }, + { + "epoch": 0.06, + "grad_norm": 2.1663131713867188, + "learning_rate": 9.968331846565568e-05, + "loss": 0.2633, + "step": 420 + }, + { + "epoch": 0.06, + "grad_norm": 4.806302547454834, + "learning_rate": 9.966845078798692e-05, + "loss": 0.2626, + "step": 430 + }, + { + "epoch": 0.07, + "grad_norm": 1.9977257251739502, + "learning_rate": 9.965358311031817e-05, + "loss": 0.2876, + "step": 440 + }, + { + "epoch": 0.07, + "grad_norm": 1.0454623699188232, + "learning_rate": 9.963871543264943e-05, + "loss": 0.2663, + "step": 450 + }, + { + "epoch": 0.07, + "grad_norm": 2.6900148391723633, + "learning_rate": 9.962384775498068e-05, + "loss": 0.2593, + "step": 460 + }, + { + "epoch": 0.07, + "grad_norm": 4.413218975067139, + "learning_rate": 9.960898007731194e-05, + "loss": 0.2523, + "step": 470 + }, + { + "epoch": 0.07, + "grad_norm": 4.643527984619141, + "learning_rate": 9.959411239964318e-05, + "loss": 0.272, + "step": 480 + }, + { + "epoch": 0.07, + "grad_norm": 0.9696474075317383, + "learning_rate": 9.957924472197443e-05, + "loss": 0.2435, + "step": 490 + }, + { + "epoch": 0.07, + "grad_norm": 1.4355976581573486, + "learning_rate": 9.956437704430567e-05, + "loss": 0.2267, + "step": 500 + }, + { + "epoch": 0.08, + "grad_norm": 2.6311492919921875, + "learning_rate": 9.954950936663694e-05, + "loss": 0.2622, + "step": 510 + }, + { + "epoch": 0.08, + "grad_norm": 2.0157711505889893, + "learning_rate": 9.953464168896818e-05, + "loss": 0.2461, + "step": 520 + }, + { + "epoch": 0.08, + "grad_norm": 1.7786158323287964, + "learning_rate": 9.951977401129944e-05, + "loss": 0.2369, + "step": 530 + }, + { + "epoch": 0.08, + "grad_norm": 2.6747984886169434, + "learning_rate": 9.950490633363069e-05, + "loss": 0.2115, + "step": 540 + }, + { + "epoch": 0.08, + "grad_norm": 0.9853987693786621, + "learning_rate": 9.949003865596195e-05, + "loss": 0.2139, + "step": 550 + }, + { + "epoch": 0.08, + "grad_norm": 2.4679114818573, + "learning_rate": 9.94751709782932e-05, + "loss": 0.262, + "step": 560 + }, + { + "epoch": 0.08, + "grad_norm": 0.7220086455345154, + "learning_rate": 9.946030330062444e-05, + "loss": 0.2262, + "step": 570 + }, + { + "epoch": 0.09, + "grad_norm": 1.3707178831100464, + "learning_rate": 9.94454356229557e-05, + "loss": 0.2501, + "step": 580 + }, + { + "epoch": 0.09, + "grad_norm": 0.6298980712890625, + "learning_rate": 9.943056794528694e-05, + "loss": 0.2237, + "step": 590 + }, + { + "epoch": 0.09, + "grad_norm": 5.072420597076416, + "learning_rate": 9.941570026761821e-05, + "loss": 0.2358, + "step": 600 + }, + { + "epoch": 0.09, + "grad_norm": 0.9326792359352112, + "learning_rate": 9.940083258994945e-05, + "loss": 0.2013, + "step": 610 + }, + { + "epoch": 0.09, + "grad_norm": 0.6766129732131958, + "learning_rate": 9.93859649122807e-05, + "loss": 0.1931, + "step": 620 + }, + { + "epoch": 0.09, + "grad_norm": 4.741708755493164, + "learning_rate": 9.937109723461196e-05, + "loss": 0.2234, + "step": 630 + }, + { + "epoch": 0.09, + "grad_norm": 4.426298141479492, + "learning_rate": 9.935622955694321e-05, + "loss": 0.2425, + "step": 640 + }, + { + "epoch": 0.1, + "grad_norm": 0.8037042021751404, + "learning_rate": 9.934136187927447e-05, + "loss": 0.2243, + "step": 650 + }, + { + "epoch": 0.1, + "grad_norm": 2.159052610397339, + "learning_rate": 9.932649420160571e-05, + "loss": 0.2101, + "step": 660 + }, + { + "epoch": 0.1, + "grad_norm": 0.6584820747375488, + "learning_rate": 9.931162652393696e-05, + "loss": 0.2234, + "step": 670 + }, + { + "epoch": 0.1, + "grad_norm": 4.1978840827941895, + "learning_rate": 9.929675884626822e-05, + "loss": 0.2182, + "step": 680 + }, + { + "epoch": 0.1, + "grad_norm": 6.110688209533691, + "learning_rate": 9.928189116859947e-05, + "loss": 0.2312, + "step": 690 + }, + { + "epoch": 0.1, + "grad_norm": 1.9383026361465454, + "learning_rate": 9.926702349093073e-05, + "loss": 0.2148, + "step": 700 + }, + { + "epoch": 0.11, + "grad_norm": 0.8565307855606079, + "learning_rate": 9.925215581326197e-05, + "loss": 0.2262, + "step": 710 + }, + { + "epoch": 0.11, + "grad_norm": 0.8206549286842346, + "learning_rate": 9.923728813559322e-05, + "loss": 0.2025, + "step": 720 + }, + { + "epoch": 0.11, + "grad_norm": 1.799793004989624, + "learning_rate": 9.922242045792448e-05, + "loss": 0.2057, + "step": 730 + }, + { + "epoch": 0.11, + "grad_norm": 2.572327136993408, + "learning_rate": 9.920755278025573e-05, + "loss": 0.1932, + "step": 740 + }, + { + "epoch": 0.11, + "grad_norm": 3.779280185699463, + "learning_rate": 9.919268510258697e-05, + "loss": 0.2238, + "step": 750 + }, + { + "epoch": 0.11, + "grad_norm": 4.892170429229736, + "learning_rate": 9.917781742491823e-05, + "loss": 0.2243, + "step": 760 + }, + { + "epoch": 0.11, + "grad_norm": 0.9165033102035522, + "learning_rate": 9.916294974724948e-05, + "loss": 0.2015, + "step": 770 + }, + { + "epoch": 0.12, + "grad_norm": 1.2273848056793213, + "learning_rate": 9.914808206958074e-05, + "loss": 0.1935, + "step": 780 + }, + { + "epoch": 0.12, + "grad_norm": 0.9774864315986633, + "learning_rate": 9.9133214391912e-05, + "loss": 0.1876, + "step": 790 + }, + { + "epoch": 0.12, + "grad_norm": 1.4223579168319702, + "learning_rate": 9.911834671424323e-05, + "loss": 0.1963, + "step": 800 + }, + { + "epoch": 0.12, + "grad_norm": 0.9810081124305725, + "learning_rate": 9.910347903657449e-05, + "loss": 0.1806, + "step": 810 + }, + { + "epoch": 0.12, + "grad_norm": 0.8502489328384399, + "learning_rate": 9.908861135890574e-05, + "loss": 0.2476, + "step": 820 + }, + { + "epoch": 0.12, + "grad_norm": 1.8983548879623413, + "learning_rate": 9.9073743681237e-05, + "loss": 0.1994, + "step": 830 + }, + { + "epoch": 0.12, + "grad_norm": 0.5797375440597534, + "learning_rate": 9.905887600356824e-05, + "loss": 0.1977, + "step": 840 + }, + { + "epoch": 0.13, + "grad_norm": 0.8001552224159241, + "learning_rate": 9.90440083258995e-05, + "loss": 0.2037, + "step": 850 + }, + { + "epoch": 0.13, + "grad_norm": 1.5331403017044067, + "learning_rate": 9.902914064823075e-05, + "loss": 0.197, + "step": 860 + }, + { + "epoch": 0.13, + "grad_norm": 0.6793760061264038, + "learning_rate": 9.9014272970562e-05, + "loss": 0.2199, + "step": 870 + }, + { + "epoch": 0.13, + "grad_norm": 0.6174043416976929, + "learning_rate": 9.899940529289326e-05, + "loss": 0.2152, + "step": 880 + }, + { + "epoch": 0.13, + "grad_norm": 1.187853455543518, + "learning_rate": 9.89845376152245e-05, + "loss": 0.2005, + "step": 890 + }, + { + "epoch": 0.13, + "grad_norm": 2.541826009750366, + "learning_rate": 9.896966993755576e-05, + "loss": 0.2007, + "step": 900 + }, + { + "epoch": 0.13, + "grad_norm": 0.699213981628418, + "learning_rate": 9.895480225988701e-05, + "loss": 0.2046, + "step": 910 + }, + { + "epoch": 0.14, + "grad_norm": 0.9006096124649048, + "learning_rate": 9.893993458221826e-05, + "loss": 0.196, + "step": 920 + }, + { + "epoch": 0.14, + "grad_norm": 0.516926646232605, + "learning_rate": 9.89250669045495e-05, + "loss": 0.1857, + "step": 930 + }, + { + "epoch": 0.14, + "grad_norm": 1.140449047088623, + "learning_rate": 9.891019922688076e-05, + "loss": 0.1988, + "step": 940 + }, + { + "epoch": 0.14, + "grad_norm": 1.6413800716400146, + "learning_rate": 9.889533154921203e-05, + "loss": 0.1958, + "step": 950 + }, + { + "epoch": 0.14, + "grad_norm": 0.7474754452705383, + "learning_rate": 9.888046387154327e-05, + "loss": 0.1907, + "step": 960 + }, + { + "epoch": 0.14, + "grad_norm": 0.7224648594856262, + "learning_rate": 9.886559619387453e-05, + "loss": 0.1969, + "step": 970 + }, + { + "epoch": 0.15, + "grad_norm": 0.9668368697166443, + "learning_rate": 9.885072851620577e-05, + "loss": 0.2143, + "step": 980 + }, + { + "epoch": 0.15, + "grad_norm": 1.3923943042755127, + "learning_rate": 9.883586083853702e-05, + "loss": 0.1999, + "step": 990 + }, + { + "epoch": 0.15, + "grad_norm": 2.00828218460083, + "learning_rate": 9.882099316086828e-05, + "loss": 0.1885, + "step": 1000 + }, + { + "epoch": 0.15, + "grad_norm": 0.7937036156654358, + "learning_rate": 9.880612548319953e-05, + "loss": 0.1842, + "step": 1010 + }, + { + "epoch": 0.15, + "grad_norm": 0.9639225602149963, + "learning_rate": 9.879125780553079e-05, + "loss": 0.1884, + "step": 1020 + }, + { + "epoch": 0.15, + "grad_norm": 2.1198477745056152, + "learning_rate": 9.877639012786203e-05, + "loss": 0.1958, + "step": 1030 + }, + { + "epoch": 0.15, + "grad_norm": 0.7752804756164551, + "learning_rate": 9.87615224501933e-05, + "loss": 0.1889, + "step": 1040 + }, + { + "epoch": 0.16, + "grad_norm": 0.39463403820991516, + "learning_rate": 9.874665477252454e-05, + "loss": 0.1738, + "step": 1050 + }, + { + "epoch": 0.16, + "grad_norm": 1.6980741024017334, + "learning_rate": 9.873178709485579e-05, + "loss": 0.1875, + "step": 1060 + }, + { + "epoch": 0.16, + "grad_norm": 0.7736172080039978, + "learning_rate": 9.871691941718703e-05, + "loss": 0.1893, + "step": 1070 + }, + { + "epoch": 0.16, + "grad_norm": 0.968677282333374, + "learning_rate": 9.870205173951829e-05, + "loss": 0.1737, + "step": 1080 + }, + { + "epoch": 0.16, + "grad_norm": 1.1933799982070923, + "learning_rate": 9.868718406184954e-05, + "loss": 0.1706, + "step": 1090 + }, + { + "epoch": 0.16, + "grad_norm": 2.9177980422973633, + "learning_rate": 9.86723163841808e-05, + "loss": 0.1942, + "step": 1100 + }, + { + "epoch": 0.16, + "grad_norm": 0.6118002533912659, + "learning_rate": 9.865744870651205e-05, + "loss": 0.172, + "step": 1110 + }, + { + "epoch": 0.17, + "grad_norm": 1.0720818042755127, + "learning_rate": 9.864258102884329e-05, + "loss": 0.1899, + "step": 1120 + }, + { + "epoch": 0.17, + "grad_norm": 1.246082067489624, + "learning_rate": 9.862771335117456e-05, + "loss": 0.1944, + "step": 1130 + }, + { + "epoch": 0.17, + "grad_norm": 2.5863025188446045, + "learning_rate": 9.86128456735058e-05, + "loss": 0.1836, + "step": 1140 + }, + { + "epoch": 0.17, + "grad_norm": 1.1113203763961792, + "learning_rate": 9.859797799583706e-05, + "loss": 0.1969, + "step": 1150 + }, + { + "epoch": 0.17, + "grad_norm": 1.720690369606018, + "learning_rate": 9.85831103181683e-05, + "loss": 0.2078, + "step": 1160 + }, + { + "epoch": 0.17, + "grad_norm": 2.5786492824554443, + "learning_rate": 9.856824264049957e-05, + "loss": 0.1941, + "step": 1170 + }, + { + "epoch": 0.17, + "grad_norm": 2.1644389629364014, + "learning_rate": 9.855337496283081e-05, + "loss": 0.1853, + "step": 1180 + }, + { + "epoch": 0.18, + "grad_norm": 0.7992768287658691, + "learning_rate": 9.853850728516206e-05, + "loss": 0.1937, + "step": 1190 + }, + { + "epoch": 0.18, + "grad_norm": 2.3800251483917236, + "learning_rate": 9.852363960749332e-05, + "loss": 0.1991, + "step": 1200 + }, + { + "epoch": 0.18, + "grad_norm": 3.22444224357605, + "learning_rate": 9.850877192982456e-05, + "loss": 0.1876, + "step": 1210 + }, + { + "epoch": 0.18, + "grad_norm": 0.721161961555481, + "learning_rate": 9.849390425215583e-05, + "loss": 0.1776, + "step": 1220 + }, + { + "epoch": 0.18, + "grad_norm": 2.3023221492767334, + "learning_rate": 9.847903657448707e-05, + "loss": 0.1912, + "step": 1230 + }, + { + "epoch": 0.18, + "grad_norm": 0.44242483377456665, + "learning_rate": 9.846416889681832e-05, + "loss": 0.1716, + "step": 1240 + }, + { + "epoch": 0.19, + "grad_norm": 1.6946061849594116, + "learning_rate": 9.844930121914956e-05, + "loss": 0.1822, + "step": 1250 + }, + { + "epoch": 0.19, + "grad_norm": 2.3526597023010254, + "learning_rate": 9.843443354148083e-05, + "loss": 0.1931, + "step": 1260 + }, + { + "epoch": 0.19, + "grad_norm": 0.595988392829895, + "learning_rate": 9.841956586381207e-05, + "loss": 0.1764, + "step": 1270 + }, + { + "epoch": 0.19, + "grad_norm": 1.5539588928222656, + "learning_rate": 9.840469818614333e-05, + "loss": 0.181, + "step": 1280 + }, + { + "epoch": 0.19, + "grad_norm": 0.45790284872055054, + "learning_rate": 9.838983050847458e-05, + "loss": 0.1831, + "step": 1290 + }, + { + "epoch": 0.19, + "grad_norm": 1.5096701383590698, + "learning_rate": 9.837496283080582e-05, + "loss": 0.1648, + "step": 1300 + }, + { + "epoch": 0.19, + "grad_norm": 0.4358065128326416, + "learning_rate": 9.836009515313709e-05, + "loss": 0.1892, + "step": 1310 + }, + { + "epoch": 0.2, + "grad_norm": 0.45344677567481995, + "learning_rate": 9.834522747546833e-05, + "loss": 0.1882, + "step": 1320 + }, + { + "epoch": 0.2, + "grad_norm": 1.9001879692077637, + "learning_rate": 9.833035979779959e-05, + "loss": 0.182, + "step": 1330 + }, + { + "epoch": 0.2, + "grad_norm": 1.2151376008987427, + "learning_rate": 9.831549212013084e-05, + "loss": 0.2068, + "step": 1340 + }, + { + "epoch": 0.2, + "grad_norm": 2.489504814147949, + "learning_rate": 9.83006244424621e-05, + "loss": 0.1874, + "step": 1350 + }, + { + "epoch": 0.2, + "grad_norm": 2.392239809036255, + "learning_rate": 9.828575676479335e-05, + "loss": 0.2012, + "step": 1360 + }, + { + "epoch": 0.2, + "grad_norm": 0.8293458819389343, + "learning_rate": 9.82708890871246e-05, + "loss": 0.1756, + "step": 1370 + }, + { + "epoch": 0.2, + "grad_norm": 0.4222114086151123, + "learning_rate": 9.825602140945585e-05, + "loss": 0.1781, + "step": 1380 + }, + { + "epoch": 0.21, + "grad_norm": 0.6707155704498291, + "learning_rate": 9.824115373178709e-05, + "loss": 0.1687, + "step": 1390 + }, + { + "epoch": 0.21, + "grad_norm": 1.2496354579925537, + "learning_rate": 9.822628605411836e-05, + "loss": 0.1698, + "step": 1400 + }, + { + "epoch": 0.21, + "grad_norm": 0.7204919457435608, + "learning_rate": 9.82114183764496e-05, + "loss": 0.1756, + "step": 1410 + }, + { + "epoch": 0.21, + "grad_norm": 1.3436416387557983, + "learning_rate": 9.819655069878085e-05, + "loss": 0.1813, + "step": 1420 + }, + { + "epoch": 0.21, + "grad_norm": 1.1300007104873657, + "learning_rate": 9.818168302111211e-05, + "loss": 0.1865, + "step": 1430 + }, + { + "epoch": 0.21, + "grad_norm": 0.4229620695114136, + "learning_rate": 9.816681534344336e-05, + "loss": 0.1765, + "step": 1440 + }, + { + "epoch": 0.21, + "grad_norm": 1.7664425373077393, + "learning_rate": 9.815194766577462e-05, + "loss": 0.19, + "step": 1450 + }, + { + "epoch": 0.22, + "grad_norm": 0.6647424697875977, + "learning_rate": 9.813707998810586e-05, + "loss": 0.1759, + "step": 1460 + }, + { + "epoch": 0.22, + "grad_norm": 1.3758875131607056, + "learning_rate": 9.812221231043711e-05, + "loss": 0.1771, + "step": 1470 + }, + { + "epoch": 0.22, + "grad_norm": 1.0963157415390015, + "learning_rate": 9.810734463276836e-05, + "loss": 0.181, + "step": 1480 + }, + { + "epoch": 0.22, + "grad_norm": 1.6835664510726929, + "learning_rate": 9.809247695509962e-05, + "loss": 0.1668, + "step": 1490 + }, + { + "epoch": 0.22, + "grad_norm": 2.1593921184539795, + "learning_rate": 9.807760927743087e-05, + "loss": 0.1816, + "step": 1500 + }, + { + "epoch": 0.22, + "grad_norm": 1.253033995628357, + "learning_rate": 9.806274159976212e-05, + "loss": 0.1721, + "step": 1510 + }, + { + "epoch": 0.23, + "grad_norm": 2.998976945877075, + "learning_rate": 9.804787392209337e-05, + "loss": 0.1867, + "step": 1520 + }, + { + "epoch": 0.23, + "grad_norm": 1.396804928779602, + "learning_rate": 9.803300624442463e-05, + "loss": 0.1721, + "step": 1530 + }, + { + "epoch": 0.23, + "grad_norm": 0.6815312504768372, + "learning_rate": 9.801813856675588e-05, + "loss": 0.1826, + "step": 1540 + }, + { + "epoch": 0.23, + "grad_norm": 0.8488101959228516, + "learning_rate": 9.800327088908713e-05, + "loss": 0.1872, + "step": 1550 + }, + { + "epoch": 0.23, + "grad_norm": 1.0025837421417236, + "learning_rate": 9.798840321141838e-05, + "loss": 0.1683, + "step": 1560 + }, + { + "epoch": 0.23, + "grad_norm": 1.0327823162078857, + "learning_rate": 9.797353553374962e-05, + "loss": 0.1722, + "step": 1570 + }, + { + "epoch": 0.23, + "grad_norm": 1.4573004245758057, + "learning_rate": 9.795866785608089e-05, + "loss": 0.1717, + "step": 1580 + }, + { + "epoch": 0.24, + "grad_norm": 0.6835706233978271, + "learning_rate": 9.794380017841213e-05, + "loss": 0.1797, + "step": 1590 + }, + { + "epoch": 0.24, + "grad_norm": 1.981534481048584, + "learning_rate": 9.792893250074339e-05, + "loss": 0.1733, + "step": 1600 + }, + { + "epoch": 0.24, + "grad_norm": 0.49460187554359436, + "learning_rate": 9.791406482307464e-05, + "loss": 0.175, + "step": 1610 + }, + { + "epoch": 0.24, + "grad_norm": 1.1679047346115112, + "learning_rate": 9.78991971454059e-05, + "loss": 0.1754, + "step": 1620 + }, + { + "epoch": 0.24, + "grad_norm": 0.691370964050293, + "learning_rate": 9.788432946773715e-05, + "loss": 0.167, + "step": 1630 + }, + { + "epoch": 0.24, + "grad_norm": 0.409994900226593, + "learning_rate": 9.786946179006839e-05, + "loss": 0.1688, + "step": 1640 + }, + { + "epoch": 0.24, + "grad_norm": 0.7671188116073608, + "learning_rate": 9.785459411239965e-05, + "loss": 0.1626, + "step": 1650 + }, + { + "epoch": 0.25, + "grad_norm": 0.6950274109840393, + "learning_rate": 9.78397264347309e-05, + "loss": 0.1717, + "step": 1660 + }, + { + "epoch": 0.25, + "grad_norm": 1.0336863994598389, + "learning_rate": 9.782485875706216e-05, + "loss": 0.1906, + "step": 1670 + }, + { + "epoch": 0.25, + "grad_norm": 3.111928939819336, + "learning_rate": 9.780999107939341e-05, + "loss": 0.2204, + "step": 1680 + }, + { + "epoch": 0.25, + "grad_norm": 0.817621648311615, + "learning_rate": 9.779512340172465e-05, + "loss": 0.192, + "step": 1690 + }, + { + "epoch": 0.25, + "grad_norm": 2.2715766429901123, + "learning_rate": 9.77802557240559e-05, + "loss": 0.1728, + "step": 1700 + }, + { + "epoch": 0.25, + "grad_norm": 0.4694221019744873, + "learning_rate": 9.776538804638716e-05, + "loss": 0.1751, + "step": 1710 + }, + { + "epoch": 0.25, + "grad_norm": 2.2308688163757324, + "learning_rate": 9.775052036871842e-05, + "loss": 0.1749, + "step": 1720 + }, + { + "epoch": 0.26, + "grad_norm": 1.3650729656219482, + "learning_rate": 9.773565269104966e-05, + "loss": 0.1786, + "step": 1730 + }, + { + "epoch": 0.26, + "grad_norm": 0.3981892466545105, + "learning_rate": 9.772078501338091e-05, + "loss": 0.1724, + "step": 1740 + }, + { + "epoch": 0.26, + "grad_norm": 1.6411010026931763, + "learning_rate": 9.770591733571217e-05, + "loss": 0.1774, + "step": 1750 + }, + { + "epoch": 0.26, + "grad_norm": 0.6380265951156616, + "learning_rate": 9.769104965804342e-05, + "loss": 0.1793, + "step": 1760 + }, + { + "epoch": 0.26, + "grad_norm": 0.4115835130214691, + "learning_rate": 9.767618198037468e-05, + "loss": 0.1636, + "step": 1770 + }, + { + "epoch": 0.26, + "grad_norm": 0.61713707447052, + "learning_rate": 9.766131430270592e-05, + "loss": 0.1769, + "step": 1780 + }, + { + "epoch": 0.27, + "grad_norm": 1.0060006380081177, + "learning_rate": 9.764644662503717e-05, + "loss": 0.1691, + "step": 1790 + }, + { + "epoch": 0.27, + "grad_norm": 0.9878020286560059, + "learning_rate": 9.763157894736843e-05, + "loss": 0.1749, + "step": 1800 + }, + { + "epoch": 0.27, + "grad_norm": 2.226569414138794, + "learning_rate": 9.761671126969968e-05, + "loss": 0.1671, + "step": 1810 + }, + { + "epoch": 0.27, + "grad_norm": 0.7667270302772522, + "learning_rate": 9.760184359203092e-05, + "loss": 0.1822, + "step": 1820 + }, + { + "epoch": 0.27, + "grad_norm": 1.7759143114089966, + "learning_rate": 9.758697591436218e-05, + "loss": 0.175, + "step": 1830 + }, + { + "epoch": 0.27, + "grad_norm": 1.309430718421936, + "learning_rate": 9.757210823669343e-05, + "loss": 0.1683, + "step": 1840 + }, + { + "epoch": 0.27, + "grad_norm": 0.3665989339351654, + "learning_rate": 9.755724055902469e-05, + "loss": 0.1748, + "step": 1850 + }, + { + "epoch": 0.28, + "grad_norm": 0.5607463121414185, + "learning_rate": 9.754237288135594e-05, + "loss": 0.1646, + "step": 1860 + }, + { + "epoch": 0.28, + "grad_norm": 2.0324952602386475, + "learning_rate": 9.752750520368718e-05, + "loss": 0.1818, + "step": 1870 + }, + { + "epoch": 0.28, + "grad_norm": 1.5004349946975708, + "learning_rate": 9.751263752601844e-05, + "loss": 0.1795, + "step": 1880 + }, + { + "epoch": 0.28, + "grad_norm": 0.5794762372970581, + "learning_rate": 9.749776984834969e-05, + "loss": 0.1877, + "step": 1890 + }, + { + "epoch": 0.28, + "grad_norm": 0.6873047947883606, + "learning_rate": 9.748290217068095e-05, + "loss": 0.1872, + "step": 1900 + }, + { + "epoch": 0.28, + "grad_norm": 0.3563670814037323, + "learning_rate": 9.746803449301219e-05, + "loss": 0.173, + "step": 1910 + }, + { + "epoch": 0.28, + "grad_norm": 0.47983822226524353, + "learning_rate": 9.745316681534344e-05, + "loss": 0.1654, + "step": 1920 + }, + { + "epoch": 0.29, + "grad_norm": 0.31588858366012573, + "learning_rate": 9.74382991376747e-05, + "loss": 0.1588, + "step": 1930 + }, + { + "epoch": 0.29, + "grad_norm": 1.5894639492034912, + "learning_rate": 9.742343146000595e-05, + "loss": 0.1778, + "step": 1940 + }, + { + "epoch": 0.29, + "grad_norm": 0.5094777941703796, + "learning_rate": 9.740856378233721e-05, + "loss": 0.1869, + "step": 1950 + }, + { + "epoch": 0.29, + "grad_norm": 0.4982427656650543, + "learning_rate": 9.739369610466845e-05, + "loss": 0.182, + "step": 1960 + }, + { + "epoch": 0.29, + "grad_norm": 0.5018313527107239, + "learning_rate": 9.73788284269997e-05, + "loss": 0.1621, + "step": 1970 + }, + { + "epoch": 0.29, + "grad_norm": 0.4199548363685608, + "learning_rate": 9.736396074933096e-05, + "loss": 0.1654, + "step": 1980 + }, + { + "epoch": 0.29, + "grad_norm": 1.6279269456863403, + "learning_rate": 9.734909307166221e-05, + "loss": 0.1802, + "step": 1990 + }, + { + "epoch": 0.3, + "grad_norm": 1.3614368438720703, + "learning_rate": 9.733422539399347e-05, + "loss": 0.1636, + "step": 2000 + }, + { + "epoch": 0.3, + "grad_norm": 0.6040695905685425, + "learning_rate": 9.731935771632471e-05, + "loss": 0.1694, + "step": 2010 + }, + { + "epoch": 0.3, + "grad_norm": 0.44964730739593506, + "learning_rate": 9.730449003865598e-05, + "loss": 0.164, + "step": 2020 + }, + { + "epoch": 0.3, + "grad_norm": 1.9905171394348145, + "learning_rate": 9.728962236098722e-05, + "loss": 0.1692, + "step": 2030 + }, + { + "epoch": 0.3, + "grad_norm": 1.676896572113037, + "learning_rate": 9.727475468331847e-05, + "loss": 0.1734, + "step": 2040 + }, + { + "epoch": 0.3, + "grad_norm": 1.833280086517334, + "learning_rate": 9.725988700564971e-05, + "loss": 0.1842, + "step": 2050 + }, + { + "epoch": 0.31, + "grad_norm": 1.3920565843582153, + "learning_rate": 9.724501932798098e-05, + "loss": 0.1834, + "step": 2060 + }, + { + "epoch": 0.31, + "grad_norm": 0.7751169800758362, + "learning_rate": 9.723015165031222e-05, + "loss": 0.1755, + "step": 2070 + }, + { + "epoch": 0.31, + "grad_norm": 1.8627113103866577, + "learning_rate": 9.721528397264348e-05, + "loss": 0.1695, + "step": 2080 + }, + { + "epoch": 0.31, + "grad_norm": 0.434849351644516, + "learning_rate": 9.720041629497473e-05, + "loss": 0.1675, + "step": 2090 + }, + { + "epoch": 0.31, + "grad_norm": 1.202000617980957, + "learning_rate": 9.718554861730597e-05, + "loss": 0.1749, + "step": 2100 + }, + { + "epoch": 0.31, + "grad_norm": 0.7374542951583862, + "learning_rate": 9.717068093963724e-05, + "loss": 0.169, + "step": 2110 + }, + { + "epoch": 0.31, + "grad_norm": 0.6856274604797363, + "learning_rate": 9.715581326196848e-05, + "loss": 0.1724, + "step": 2120 + }, + { + "epoch": 0.32, + "grad_norm": 0.8698721528053284, + "learning_rate": 9.714094558429974e-05, + "loss": 0.1705, + "step": 2130 + }, + { + "epoch": 0.32, + "grad_norm": 1.2129219770431519, + "learning_rate": 9.712607790663098e-05, + "loss": 0.1704, + "step": 2140 + }, + { + "epoch": 0.32, + "grad_norm": 0.3243730962276459, + "learning_rate": 9.711121022896225e-05, + "loss": 0.1753, + "step": 2150 + }, + { + "epoch": 0.32, + "grad_norm": 1.1494802236557007, + "learning_rate": 9.709634255129349e-05, + "loss": 0.1702, + "step": 2160 + }, + { + "epoch": 0.32, + "grad_norm": 0.7077322602272034, + "learning_rate": 9.708147487362474e-05, + "loss": 0.176, + "step": 2170 + }, + { + "epoch": 0.32, + "grad_norm": 3.5321290493011475, + "learning_rate": 9.7066607195956e-05, + "loss": 0.1799, + "step": 2180 + }, + { + "epoch": 0.32, + "grad_norm": 0.40482622385025024, + "learning_rate": 9.705173951828724e-05, + "loss": 0.1817, + "step": 2190 + }, + { + "epoch": 0.33, + "grad_norm": 2.4115407466888428, + "learning_rate": 9.703687184061851e-05, + "loss": 0.1863, + "step": 2200 + }, + { + "epoch": 0.33, + "grad_norm": 2.6012375354766846, + "learning_rate": 9.702200416294975e-05, + "loss": 0.1697, + "step": 2210 + }, + { + "epoch": 0.33, + "grad_norm": 3.5311667919158936, + "learning_rate": 9.7007136485281e-05, + "loss": 0.1756, + "step": 2220 + }, + { + "epoch": 0.33, + "grad_norm": 2.0460686683654785, + "learning_rate": 9.699226880761225e-05, + "loss": 0.1733, + "step": 2230 + }, + { + "epoch": 0.33, + "grad_norm": 2.7502028942108154, + "learning_rate": 9.697740112994351e-05, + "loss": 0.177, + "step": 2240 + }, + { + "epoch": 0.33, + "grad_norm": 1.4361170530319214, + "learning_rate": 9.696253345227476e-05, + "loss": 0.1798, + "step": 2250 + }, + { + "epoch": 0.34, + "grad_norm": 2.131711006164551, + "learning_rate": 9.694766577460601e-05, + "loss": 0.1678, + "step": 2260 + }, + { + "epoch": 0.34, + "grad_norm": 0.6454190611839294, + "learning_rate": 9.693279809693727e-05, + "loss": 0.1575, + "step": 2270 + }, + { + "epoch": 0.34, + "grad_norm": 0.5098270773887634, + "learning_rate": 9.69179304192685e-05, + "loss": 0.1788, + "step": 2280 + }, + { + "epoch": 0.34, + "grad_norm": 0.5451599359512329, + "learning_rate": 9.690306274159977e-05, + "loss": 0.1641, + "step": 2290 + }, + { + "epoch": 0.34, + "grad_norm": 0.5445219874382019, + "learning_rate": 9.688819506393102e-05, + "loss": 0.1667, + "step": 2300 + }, + { + "epoch": 0.34, + "grad_norm": 0.9343221187591553, + "learning_rate": 9.687332738626227e-05, + "loss": 0.1676, + "step": 2310 + }, + { + "epoch": 0.34, + "grad_norm": 0.7586581707000732, + "learning_rate": 9.685845970859351e-05, + "loss": 0.1726, + "step": 2320 + }, + { + "epoch": 0.35, + "grad_norm": 0.4477677643299103, + "learning_rate": 9.684359203092478e-05, + "loss": 0.1703, + "step": 2330 + }, + { + "epoch": 0.35, + "grad_norm": 0.32315364480018616, + "learning_rate": 9.682872435325604e-05, + "loss": 0.1541, + "step": 2340 + }, + { + "epoch": 0.35, + "grad_norm": 0.7400575876235962, + "learning_rate": 9.681385667558728e-05, + "loss": 0.1718, + "step": 2350 + }, + { + "epoch": 0.35, + "grad_norm": 0.6638018488883972, + "learning_rate": 9.679898899791853e-05, + "loss": 0.1673, + "step": 2360 + }, + { + "epoch": 0.35, + "grad_norm": 0.43184348940849304, + "learning_rate": 9.678412132024977e-05, + "loss": 0.1853, + "step": 2370 + }, + { + "epoch": 0.35, + "grad_norm": 1.706583023071289, + "learning_rate": 9.676925364258104e-05, + "loss": 0.169, + "step": 2380 + }, + { + "epoch": 0.35, + "grad_norm": 2.9006285667419434, + "learning_rate": 9.675438596491228e-05, + "loss": 0.1705, + "step": 2390 + }, + { + "epoch": 0.36, + "grad_norm": 0.9855265617370605, + "learning_rate": 9.673951828724354e-05, + "loss": 0.1694, + "step": 2400 + }, + { + "epoch": 0.36, + "grad_norm": 1.0507304668426514, + "learning_rate": 9.672465060957479e-05, + "loss": 0.1595, + "step": 2410 + }, + { + "epoch": 0.36, + "grad_norm": 1.4149112701416016, + "learning_rate": 9.670978293190605e-05, + "loss": 0.1763, + "step": 2420 + }, + { + "epoch": 0.36, + "grad_norm": 0.5968409180641174, + "learning_rate": 9.66949152542373e-05, + "loss": 0.1552, + "step": 2430 + }, + { + "epoch": 0.36, + "grad_norm": 0.7032540440559387, + "learning_rate": 9.668004757656854e-05, + "loss": 0.159, + "step": 2440 + }, + { + "epoch": 0.36, + "grad_norm": 1.025107979774475, + "learning_rate": 9.66651798988998e-05, + "loss": 0.1634, + "step": 2450 + }, + { + "epoch": 0.36, + "grad_norm": 0.37000682950019836, + "learning_rate": 9.665031222123104e-05, + "loss": 0.1653, + "step": 2460 + }, + { + "epoch": 0.37, + "grad_norm": 0.8552184700965881, + "learning_rate": 9.66354445435623e-05, + "loss": 0.1566, + "step": 2470 + }, + { + "epoch": 0.37, + "grad_norm": 0.3340959846973419, + "learning_rate": 9.662057686589355e-05, + "loss": 0.1654, + "step": 2480 + }, + { + "epoch": 0.37, + "grad_norm": 0.7219660878181458, + "learning_rate": 9.66057091882248e-05, + "loss": 0.1668, + "step": 2490 + }, + { + "epoch": 0.37, + "grad_norm": 0.3376657962799072, + "learning_rate": 9.659084151055606e-05, + "loss": 0.1631, + "step": 2500 + }, + { + "epoch": 0.37, + "grad_norm": 0.40849828720092773, + "learning_rate": 9.657597383288731e-05, + "loss": 0.1714, + "step": 2510 + }, + { + "epoch": 0.37, + "grad_norm": 4.0708909034729, + "learning_rate": 9.656110615521857e-05, + "loss": 0.1748, + "step": 2520 + }, + { + "epoch": 0.38, + "grad_norm": 0.8569905757904053, + "learning_rate": 9.654623847754981e-05, + "loss": 0.1759, + "step": 2530 + }, + { + "epoch": 0.38, + "grad_norm": 0.6083622574806213, + "learning_rate": 9.653137079988106e-05, + "loss": 0.1771, + "step": 2540 + }, + { + "epoch": 0.38, + "grad_norm": 0.9755212068557739, + "learning_rate": 9.651650312221232e-05, + "loss": 0.1653, + "step": 2550 + }, + { + "epoch": 0.38, + "grad_norm": 0.2756294906139374, + "learning_rate": 9.650163544454357e-05, + "loss": 0.161, + "step": 2560 + }, + { + "epoch": 0.38, + "grad_norm": 0.36292827129364014, + "learning_rate": 9.648676776687481e-05, + "loss": 0.1627, + "step": 2570 + }, + { + "epoch": 0.38, + "grad_norm": 0.31777092814445496, + "learning_rate": 9.647190008920607e-05, + "loss": 0.1635, + "step": 2580 + }, + { + "epoch": 0.38, + "grad_norm": 0.4227207601070404, + "learning_rate": 9.645703241153732e-05, + "loss": 0.1693, + "step": 2590 + }, + { + "epoch": 0.39, + "grad_norm": 0.3526834547519684, + "learning_rate": 9.644216473386858e-05, + "loss": 0.1567, + "step": 2600 + }, + { + "epoch": 0.39, + "grad_norm": 0.9331281185150146, + "learning_rate": 9.642729705619983e-05, + "loss": 0.1691, + "step": 2610 + }, + { + "epoch": 0.39, + "grad_norm": 1.4683771133422852, + "learning_rate": 9.641242937853107e-05, + "loss": 0.1773, + "step": 2620 + }, + { + "epoch": 0.39, + "grad_norm": 2.2021124362945557, + "learning_rate": 9.639756170086233e-05, + "loss": 0.1749, + "step": 2630 + }, + { + "epoch": 0.39, + "grad_norm": 0.63923180103302, + "learning_rate": 9.638269402319358e-05, + "loss": 0.1755, + "step": 2640 + }, + { + "epoch": 0.39, + "grad_norm": 0.34077224135398865, + "learning_rate": 9.636782634552484e-05, + "loss": 0.1595, + "step": 2650 + }, + { + "epoch": 0.39, + "grad_norm": 0.8533111810684204, + "learning_rate": 9.635295866785608e-05, + "loss": 0.1765, + "step": 2660 + }, + { + "epoch": 0.4, + "grad_norm": 2.3589627742767334, + "learning_rate": 9.633809099018733e-05, + "loss": 0.1564, + "step": 2670 + }, + { + "epoch": 0.4, + "grad_norm": 0.4352911412715912, + "learning_rate": 9.632322331251859e-05, + "loss": 0.1645, + "step": 2680 + }, + { + "epoch": 0.4, + "grad_norm": 0.36782145500183105, + "learning_rate": 9.630835563484984e-05, + "loss": 0.1651, + "step": 2690 + }, + { + "epoch": 0.4, + "grad_norm": 0.4545479714870453, + "learning_rate": 9.62934879571811e-05, + "loss": 0.168, + "step": 2700 + }, + { + "epoch": 0.4, + "grad_norm": 1.7421327829360962, + "learning_rate": 9.627862027951234e-05, + "loss": 0.1643, + "step": 2710 + }, + { + "epoch": 0.4, + "grad_norm": 0.7703805565834045, + "learning_rate": 9.62637526018436e-05, + "loss": 0.1613, + "step": 2720 + }, + { + "epoch": 0.4, + "grad_norm": 0.7927514314651489, + "learning_rate": 9.624888492417485e-05, + "loss": 0.1542, + "step": 2730 + }, + { + "epoch": 0.41, + "grad_norm": 0.9497324824333191, + "learning_rate": 9.62340172465061e-05, + "loss": 0.169, + "step": 2740 + }, + { + "epoch": 0.41, + "grad_norm": 0.6463251113891602, + "learning_rate": 9.621914956883736e-05, + "loss": 0.1545, + "step": 2750 + }, + { + "epoch": 0.41, + "grad_norm": 0.5164654850959778, + "learning_rate": 9.62042818911686e-05, + "loss": 0.1746, + "step": 2760 + }, + { + "epoch": 0.41, + "grad_norm": 1.879035472869873, + "learning_rate": 9.618941421349985e-05, + "loss": 0.1617, + "step": 2770 + }, + { + "epoch": 0.41, + "grad_norm": 1.6450806856155396, + "learning_rate": 9.617454653583111e-05, + "loss": 0.1694, + "step": 2780 + }, + { + "epoch": 0.41, + "grad_norm": 0.4430038630962372, + "learning_rate": 9.615967885816236e-05, + "loss": 0.1618, + "step": 2790 + }, + { + "epoch": 0.42, + "grad_norm": 2.111541509628296, + "learning_rate": 9.61448111804936e-05, + "loss": 0.1537, + "step": 2800 + }, + { + "epoch": 0.42, + "grad_norm": 1.177138090133667, + "learning_rate": 9.612994350282486e-05, + "loss": 0.171, + "step": 2810 + }, + { + "epoch": 0.42, + "grad_norm": 2.1342763900756836, + "learning_rate": 9.611507582515611e-05, + "loss": 0.1701, + "step": 2820 + }, + { + "epoch": 0.42, + "grad_norm": 0.7735350131988525, + "learning_rate": 9.610020814748737e-05, + "loss": 0.1588, + "step": 2830 + }, + { + "epoch": 0.42, + "grad_norm": 0.3419579267501831, + "learning_rate": 9.608534046981862e-05, + "loss": 0.1573, + "step": 2840 + }, + { + "epoch": 0.42, + "grad_norm": 0.48323726654052734, + "learning_rate": 9.607047279214987e-05, + "loss": 0.1593, + "step": 2850 + }, + { + "epoch": 0.42, + "grad_norm": 0.437145471572876, + "learning_rate": 9.605560511448112e-05, + "loss": 0.1691, + "step": 2860 + }, + { + "epoch": 0.43, + "grad_norm": 0.7413353323936462, + "learning_rate": 9.604073743681237e-05, + "loss": 0.1665, + "step": 2870 + }, + { + "epoch": 0.43, + "grad_norm": 0.7211183905601501, + "learning_rate": 9.602586975914363e-05, + "loss": 0.155, + "step": 2880 + }, + { + "epoch": 0.43, + "grad_norm": 0.7412411570549011, + "learning_rate": 9.601100208147487e-05, + "loss": 0.1627, + "step": 2890 + }, + { + "epoch": 0.43, + "grad_norm": 0.9689399003982544, + "learning_rate": 9.599613440380613e-05, + "loss": 0.1734, + "step": 2900 + }, + { + "epoch": 0.43, + "grad_norm": 0.9807812571525574, + "learning_rate": 9.598126672613738e-05, + "loss": 0.1668, + "step": 2910 + }, + { + "epoch": 0.43, + "grad_norm": 2.6782937049865723, + "learning_rate": 9.596639904846864e-05, + "loss": 0.1802, + "step": 2920 + }, + { + "epoch": 0.43, + "grad_norm": 0.54747074842453, + "learning_rate": 9.595153137079989e-05, + "loss": 0.1727, + "step": 2930 + }, + { + "epoch": 0.44, + "grad_norm": 1.2045674324035645, + "learning_rate": 9.593666369313113e-05, + "loss": 0.1677, + "step": 2940 + }, + { + "epoch": 0.44, + "grad_norm": 1.2668368816375732, + "learning_rate": 9.592179601546239e-05, + "loss": 0.1652, + "step": 2950 + }, + { + "epoch": 0.44, + "grad_norm": 0.7421100735664368, + "learning_rate": 9.590692833779364e-05, + "loss": 0.1552, + "step": 2960 + }, + { + "epoch": 0.44, + "grad_norm": 0.7410224676132202, + "learning_rate": 9.58920606601249e-05, + "loss": 0.1569, + "step": 2970 + }, + { + "epoch": 0.44, + "grad_norm": 0.5380434989929199, + "learning_rate": 9.587719298245614e-05, + "loss": 0.1676, + "step": 2980 + }, + { + "epoch": 0.44, + "grad_norm": 1.3664904832839966, + "learning_rate": 9.586232530478739e-05, + "loss": 0.1629, + "step": 2990 + }, + { + "epoch": 0.44, + "grad_norm": 1.5946515798568726, + "learning_rate": 9.584745762711866e-05, + "loss": 0.1779, + "step": 3000 + }, + { + "epoch": 0.45, + "grad_norm": 1.7002317905426025, + "learning_rate": 9.58325899494499e-05, + "loss": 0.1769, + "step": 3010 + }, + { + "epoch": 0.45, + "grad_norm": 0.5792291164398193, + "learning_rate": 9.581772227178116e-05, + "loss": 0.173, + "step": 3020 + }, + { + "epoch": 0.45, + "grad_norm": 2.637342691421509, + "learning_rate": 9.58028545941124e-05, + "loss": 0.184, + "step": 3030 + }, + { + "epoch": 0.45, + "grad_norm": 0.315929651260376, + "learning_rate": 9.578798691644367e-05, + "loss": 0.1626, + "step": 3040 + }, + { + "epoch": 0.45, + "grad_norm": 1.4628597497940063, + "learning_rate": 9.57731192387749e-05, + "loss": 0.1585, + "step": 3050 + }, + { + "epoch": 0.45, + "grad_norm": 1.037359356880188, + "learning_rate": 9.575825156110616e-05, + "loss": 0.1659, + "step": 3060 + }, + { + "epoch": 0.46, + "grad_norm": 0.7535769939422607, + "learning_rate": 9.574338388343742e-05, + "loss": 0.1552, + "step": 3070 + }, + { + "epoch": 0.46, + "grad_norm": 1.6997020244598389, + "learning_rate": 9.572851620576866e-05, + "loss": 0.161, + "step": 3080 + }, + { + "epoch": 0.46, + "grad_norm": 0.6227778196334839, + "learning_rate": 9.571364852809993e-05, + "loss": 0.1664, + "step": 3090 + }, + { + "epoch": 0.46, + "grad_norm": 0.9602690935134888, + "learning_rate": 9.569878085043117e-05, + "loss": 0.1588, + "step": 3100 + }, + { + "epoch": 0.46, + "grad_norm": 0.9173592925071716, + "learning_rate": 9.568391317276242e-05, + "loss": 0.1628, + "step": 3110 + }, + { + "epoch": 0.46, + "grad_norm": 1.1894303560256958, + "learning_rate": 9.566904549509366e-05, + "loss": 0.1629, + "step": 3120 + }, + { + "epoch": 0.46, + "grad_norm": 0.7936124205589294, + "learning_rate": 9.565417781742493e-05, + "loss": 0.169, + "step": 3130 + }, + { + "epoch": 0.47, + "grad_norm": 0.7208412289619446, + "learning_rate": 9.563931013975617e-05, + "loss": 0.1766, + "step": 3140 + }, + { + "epoch": 0.47, + "grad_norm": 1.2436951398849487, + "learning_rate": 9.562444246208743e-05, + "loss": 0.1621, + "step": 3150 + }, + { + "epoch": 0.47, + "grad_norm": 1.0052576065063477, + "learning_rate": 9.560957478441868e-05, + "loss": 0.1607, + "step": 3160 + }, + { + "epoch": 0.47, + "grad_norm": 2.342569589614868, + "learning_rate": 9.559470710674992e-05, + "loss": 0.1655, + "step": 3170 + }, + { + "epoch": 0.47, + "grad_norm": 0.5531949400901794, + "learning_rate": 9.557983942908119e-05, + "loss": 0.1684, + "step": 3180 + }, + { + "epoch": 0.47, + "grad_norm": 0.5959020256996155, + "learning_rate": 9.556497175141243e-05, + "loss": 0.1592, + "step": 3190 + }, + { + "epoch": 0.47, + "grad_norm": 0.7381748557090759, + "learning_rate": 9.555010407374369e-05, + "loss": 0.1689, + "step": 3200 + }, + { + "epoch": 0.48, + "grad_norm": 1.3665748834609985, + "learning_rate": 9.553523639607493e-05, + "loss": 0.157, + "step": 3210 + }, + { + "epoch": 0.48, + "grad_norm": 1.744511365890503, + "learning_rate": 9.55203687184062e-05, + "loss": 0.1715, + "step": 3220 + }, + { + "epoch": 0.48, + "grad_norm": 1.2675743103027344, + "learning_rate": 9.550550104073744e-05, + "loss": 0.1627, + "step": 3230 + }, + { + "epoch": 0.48, + "grad_norm": 0.7240175008773804, + "learning_rate": 9.549063336306869e-05, + "loss": 0.1652, + "step": 3240 + }, + { + "epoch": 0.48, + "grad_norm": 1.61540687084198, + "learning_rate": 9.547576568539995e-05, + "loss": 0.182, + "step": 3250 + }, + { + "epoch": 0.48, + "grad_norm": 1.7293829917907715, + "learning_rate": 9.546089800773119e-05, + "loss": 0.173, + "step": 3260 + }, + { + "epoch": 0.48, + "grad_norm": 1.0945603847503662, + "learning_rate": 9.544603033006246e-05, + "loss": 0.1598, + "step": 3270 + }, + { + "epoch": 0.49, + "grad_norm": 0.4362248182296753, + "learning_rate": 9.54311626523937e-05, + "loss": 0.1634, + "step": 3280 + }, + { + "epoch": 0.49, + "grad_norm": 0.6548261642456055, + "learning_rate": 9.541629497472495e-05, + "loss": 0.1594, + "step": 3290 + }, + { + "epoch": 0.49, + "grad_norm": 0.3333025872707367, + "learning_rate": 9.54014272970562e-05, + "loss": 0.1636, + "step": 3300 + }, + { + "epoch": 0.49, + "grad_norm": 1.0312073230743408, + "learning_rate": 9.538655961938746e-05, + "loss": 0.1623, + "step": 3310 + }, + { + "epoch": 0.49, + "grad_norm": 1.9303184747695923, + "learning_rate": 9.53716919417187e-05, + "loss": 0.1667, + "step": 3320 + }, + { + "epoch": 0.49, + "grad_norm": 0.3857884109020233, + "learning_rate": 9.535682426404996e-05, + "loss": 0.1604, + "step": 3330 + }, + { + "epoch": 0.5, + "grad_norm": 0.8410763144493103, + "learning_rate": 9.534195658638121e-05, + "loss": 0.1536, + "step": 3340 + }, + { + "epoch": 0.5, + "grad_norm": 0.9080554842948914, + "learning_rate": 9.532708890871245e-05, + "loss": 0.1625, + "step": 3350 + }, + { + "epoch": 0.5, + "grad_norm": 1.0813413858413696, + "learning_rate": 9.531222123104372e-05, + "loss": 0.1587, + "step": 3360 + }, + { + "epoch": 0.5, + "grad_norm": 0.5331724286079407, + "learning_rate": 9.529735355337496e-05, + "loss": 0.1606, + "step": 3370 + }, + { + "epoch": 0.5, + "grad_norm": 2.220775842666626, + "learning_rate": 9.528248587570622e-05, + "loss": 0.1642, + "step": 3380 + }, + { + "epoch": 0.5, + "grad_norm": 1.435058355331421, + "learning_rate": 9.526761819803747e-05, + "loss": 0.1704, + "step": 3390 + }, + { + "epoch": 0.5, + "grad_norm": 1.1016638278961182, + "learning_rate": 9.525275052036873e-05, + "loss": 0.1602, + "step": 3400 + }, + { + "epoch": 0.51, + "grad_norm": 1.0263643264770508, + "learning_rate": 9.523788284269998e-05, + "loss": 0.1734, + "step": 3410 + }, + { + "epoch": 0.51, + "grad_norm": 1.1863274574279785, + "learning_rate": 9.522301516503122e-05, + "loss": 0.1804, + "step": 3420 + }, + { + "epoch": 0.51, + "grad_norm": 0.5040327310562134, + "learning_rate": 9.520814748736248e-05, + "loss": 0.1688, + "step": 3430 + }, + { + "epoch": 0.51, + "grad_norm": 0.4307524561882019, + "learning_rate": 9.519327980969372e-05, + "loss": 0.162, + "step": 3440 + }, + { + "epoch": 0.51, + "grad_norm": 1.9712070226669312, + "learning_rate": 9.517841213202499e-05, + "loss": 0.171, + "step": 3450 + }, + { + "epoch": 0.51, + "grad_norm": 0.4332945942878723, + "learning_rate": 9.516354445435623e-05, + "loss": 0.1603, + "step": 3460 + }, + { + "epoch": 0.51, + "grad_norm": 0.5506505966186523, + "learning_rate": 9.514867677668748e-05, + "loss": 0.156, + "step": 3470 + }, + { + "epoch": 0.52, + "grad_norm": 1.394182562828064, + "learning_rate": 9.513380909901874e-05, + "loss": 0.1645, + "step": 3480 + }, + { + "epoch": 0.52, + "grad_norm": 0.7404809594154358, + "learning_rate": 9.511894142135e-05, + "loss": 0.1652, + "step": 3490 + }, + { + "epoch": 0.52, + "grad_norm": 2.453972339630127, + "learning_rate": 9.510407374368125e-05, + "loss": 0.17, + "step": 3500 + }, + { + "epoch": 0.52, + "grad_norm": 1.8291536569595337, + "learning_rate": 9.508920606601249e-05, + "loss": 0.1711, + "step": 3510 + }, + { + "epoch": 0.52, + "grad_norm": 1.329593539237976, + "learning_rate": 9.507433838834374e-05, + "loss": 0.1689, + "step": 3520 + }, + { + "epoch": 0.52, + "grad_norm": 1.166062593460083, + "learning_rate": 9.5059470710675e-05, + "loss": 0.1697, + "step": 3530 + }, + { + "epoch": 0.52, + "grad_norm": 0.4099891185760498, + "learning_rate": 9.504460303300625e-05, + "loss": 0.1536, + "step": 3540 + }, + { + "epoch": 0.53, + "grad_norm": 0.6420068740844727, + "learning_rate": 9.50297353553375e-05, + "loss": 0.1588, + "step": 3550 + }, + { + "epoch": 0.53, + "grad_norm": 1.9775470495224, + "learning_rate": 9.501486767766875e-05, + "loss": 0.1709, + "step": 3560 + }, + { + "epoch": 0.53, + "grad_norm": 1.8352495431900024, + "learning_rate": 9.5e-05, + "loss": 0.1669, + "step": 3570 + }, + { + "epoch": 0.53, + "grad_norm": 1.752253532409668, + "learning_rate": 9.498513232233126e-05, + "loss": 0.1556, + "step": 3580 + }, + { + "epoch": 0.53, + "grad_norm": 1.41259765625, + "learning_rate": 9.497026464466251e-05, + "loss": 0.1606, + "step": 3590 + }, + { + "epoch": 0.53, + "grad_norm": 0.6287984251976013, + "learning_rate": 9.495539696699376e-05, + "loss": 0.1553, + "step": 3600 + }, + { + "epoch": 0.54, + "grad_norm": 0.47083550691604614, + "learning_rate": 9.494052928932501e-05, + "loss": 0.1577, + "step": 3610 + }, + { + "epoch": 0.54, + "grad_norm": 0.32023587822914124, + "learning_rate": 9.492566161165627e-05, + "loss": 0.158, + "step": 3620 + }, + { + "epoch": 0.54, + "grad_norm": 2.142852544784546, + "learning_rate": 9.491079393398752e-05, + "loss": 0.167, + "step": 3630 + }, + { + "epoch": 0.54, + "grad_norm": 0.24139684438705444, + "learning_rate": 9.489592625631876e-05, + "loss": 0.1673, + "step": 3640 + }, + { + "epoch": 0.54, + "grad_norm": 0.6434214115142822, + "learning_rate": 9.488105857865002e-05, + "loss": 0.1657, + "step": 3650 + }, + { + "epoch": 0.54, + "grad_norm": 0.7096123695373535, + "learning_rate": 9.486619090098127e-05, + "loss": 0.1685, + "step": 3660 + }, + { + "epoch": 0.54, + "grad_norm": 0.4420715868473053, + "learning_rate": 9.485132322331253e-05, + "loss": 0.1735, + "step": 3670 + }, + { + "epoch": 0.55, + "grad_norm": 1.7800606489181519, + "learning_rate": 9.483645554564378e-05, + "loss": 0.1563, + "step": 3680 + }, + { + "epoch": 0.55, + "grad_norm": 0.5434880256652832, + "learning_rate": 9.482158786797502e-05, + "loss": 0.1669, + "step": 3690 + }, + { + "epoch": 0.55, + "grad_norm": 0.879857063293457, + "learning_rate": 9.480672019030628e-05, + "loss": 0.1715, + "step": 3700 + }, + { + "epoch": 0.55, + "grad_norm": 1.1706616878509521, + "learning_rate": 9.479185251263753e-05, + "loss": 0.168, + "step": 3710 + }, + { + "epoch": 0.55, + "grad_norm": 1.0965744256973267, + "learning_rate": 9.477698483496879e-05, + "loss": 0.1684, + "step": 3720 + }, + { + "epoch": 0.55, + "grad_norm": 0.4567830562591553, + "learning_rate": 9.476211715730004e-05, + "loss": 0.1566, + "step": 3730 + }, + { + "epoch": 0.55, + "grad_norm": 0.6394124031066895, + "learning_rate": 9.474724947963128e-05, + "loss": 0.1629, + "step": 3740 + }, + { + "epoch": 0.56, + "grad_norm": 0.6630517244338989, + "learning_rate": 9.473238180196254e-05, + "loss": 0.1519, + "step": 3750 + }, + { + "epoch": 0.56, + "grad_norm": 1.7115375995635986, + "learning_rate": 9.471751412429379e-05, + "loss": 0.1602, + "step": 3760 + }, + { + "epoch": 0.56, + "grad_norm": 0.2985321581363678, + "learning_rate": 9.470264644662505e-05, + "loss": 0.154, + "step": 3770 + }, + { + "epoch": 0.56, + "grad_norm": 1.459088921546936, + "learning_rate": 9.468777876895629e-05, + "loss": 0.1651, + "step": 3780 + }, + { + "epoch": 0.56, + "grad_norm": 0.6432485580444336, + "learning_rate": 9.467291109128754e-05, + "loss": 0.1618, + "step": 3790 + }, + { + "epoch": 0.56, + "grad_norm": 0.6554367542266846, + "learning_rate": 9.46580434136188e-05, + "loss": 0.1554, + "step": 3800 + }, + { + "epoch": 0.56, + "grad_norm": 0.821692705154419, + "learning_rate": 9.464317573595005e-05, + "loss": 0.1665, + "step": 3810 + }, + { + "epoch": 0.57, + "grad_norm": 0.4133671522140503, + "learning_rate": 9.46283080582813e-05, + "loss": 0.1574, + "step": 3820 + }, + { + "epoch": 0.57, + "grad_norm": 1.609708547592163, + "learning_rate": 9.461344038061255e-05, + "loss": 0.1658, + "step": 3830 + }, + { + "epoch": 0.57, + "grad_norm": 0.29960155487060547, + "learning_rate": 9.45985727029438e-05, + "loss": 0.1638, + "step": 3840 + }, + { + "epoch": 0.57, + "grad_norm": 0.37204432487487793, + "learning_rate": 9.458370502527506e-05, + "loss": 0.1502, + "step": 3850 + }, + { + "epoch": 0.57, + "grad_norm": 1.4778790473937988, + "learning_rate": 9.456883734760631e-05, + "loss": 0.1585, + "step": 3860 + }, + { + "epoch": 0.57, + "grad_norm": 0.30051159858703613, + "learning_rate": 9.455396966993755e-05, + "loss": 0.1595, + "step": 3870 + }, + { + "epoch": 0.58, + "grad_norm": 0.3415975570678711, + "learning_rate": 9.453910199226881e-05, + "loss": 0.1556, + "step": 3880 + }, + { + "epoch": 0.58, + "grad_norm": 2.3571910858154297, + "learning_rate": 9.452423431460006e-05, + "loss": 0.1657, + "step": 3890 + }, + { + "epoch": 0.58, + "grad_norm": 1.502201795578003, + "learning_rate": 9.450936663693132e-05, + "loss": 0.1525, + "step": 3900 + }, + { + "epoch": 0.58, + "grad_norm": 0.4980350434780121, + "learning_rate": 9.449449895926257e-05, + "loss": 0.1575, + "step": 3910 + }, + { + "epoch": 0.58, + "grad_norm": 0.7586767077445984, + "learning_rate": 9.447963128159381e-05, + "loss": 0.1685, + "step": 3920 + }, + { + "epoch": 0.58, + "grad_norm": 0.33721524477005005, + "learning_rate": 9.446476360392507e-05, + "loss": 0.1616, + "step": 3930 + }, + { + "epoch": 0.58, + "grad_norm": 0.36988070607185364, + "learning_rate": 9.444989592625632e-05, + "loss": 0.1645, + "step": 3940 + }, + { + "epoch": 0.59, + "grad_norm": 0.6627219915390015, + "learning_rate": 9.443502824858758e-05, + "loss": 0.1507, + "step": 3950 + }, + { + "epoch": 0.59, + "grad_norm": 0.4597422778606415, + "learning_rate": 9.442016057091882e-05, + "loss": 0.1492, + "step": 3960 + }, + { + "epoch": 0.59, + "grad_norm": 0.6863605380058289, + "learning_rate": 9.440529289325007e-05, + "loss": 0.1649, + "step": 3970 + }, + { + "epoch": 0.59, + "grad_norm": 0.924536943435669, + "learning_rate": 9.439042521558133e-05, + "loss": 0.1605, + "step": 3980 + }, + { + "epoch": 0.59, + "grad_norm": 1.3096588850021362, + "learning_rate": 9.437555753791258e-05, + "loss": 0.1642, + "step": 3990 + }, + { + "epoch": 0.59, + "grad_norm": 0.33845776319503784, + "learning_rate": 9.436068986024384e-05, + "loss": 0.1631, + "step": 4000 + }, + { + "epoch": 0.59, + "grad_norm": 0.4413401782512665, + "learning_rate": 9.434582218257508e-05, + "loss": 0.1574, + "step": 4010 + }, + { + "epoch": 0.6, + "grad_norm": 3.155444622039795, + "learning_rate": 9.433095450490635e-05, + "loss": 0.1685, + "step": 4020 + }, + { + "epoch": 0.6, + "grad_norm": 0.26439428329467773, + "learning_rate": 9.431608682723759e-05, + "loss": 0.1585, + "step": 4030 + }, + { + "epoch": 0.6, + "grad_norm": 0.3483470380306244, + "learning_rate": 9.430121914956884e-05, + "loss": 0.1636, + "step": 4040 + }, + { + "epoch": 0.6, + "grad_norm": 0.4697681665420532, + "learning_rate": 9.42863514719001e-05, + "loss": 0.1528, + "step": 4050 + }, + { + "epoch": 0.6, + "grad_norm": 0.4681277871131897, + "learning_rate": 9.427148379423134e-05, + "loss": 0.1492, + "step": 4060 + }, + { + "epoch": 0.6, + "grad_norm": 1.5307562351226807, + "learning_rate": 9.425661611656261e-05, + "loss": 0.1599, + "step": 4070 + }, + { + "epoch": 0.6, + "grad_norm": 0.5165395736694336, + "learning_rate": 9.424174843889385e-05, + "loss": 0.1605, + "step": 4080 + }, + { + "epoch": 0.61, + "grad_norm": 1.7521703243255615, + "learning_rate": 9.42268807612251e-05, + "loss": 0.1564, + "step": 4090 + }, + { + "epoch": 0.61, + "grad_norm": 1.0230251550674438, + "learning_rate": 9.421201308355635e-05, + "loss": 0.1552, + "step": 4100 + }, + { + "epoch": 0.61, + "grad_norm": 1.3919179439544678, + "learning_rate": 9.419714540588761e-05, + "loss": 0.1598, + "step": 4110 + }, + { + "epoch": 0.61, + "grad_norm": 1.1220026016235352, + "learning_rate": 9.418227772821885e-05, + "loss": 0.1674, + "step": 4120 + }, + { + "epoch": 0.61, + "grad_norm": 1.90169358253479, + "learning_rate": 9.416741005055011e-05, + "loss": 0.1635, + "step": 4130 + }, + { + "epoch": 0.61, + "grad_norm": 2.0692789554595947, + "learning_rate": 9.415254237288136e-05, + "loss": 0.1698, + "step": 4140 + }, + { + "epoch": 0.62, + "grad_norm": 0.22812828421592712, + "learning_rate": 9.41376746952126e-05, + "loss": 0.1542, + "step": 4150 + }, + { + "epoch": 0.62, + "grad_norm": 1.1693663597106934, + "learning_rate": 9.412280701754387e-05, + "loss": 0.1615, + "step": 4160 + }, + { + "epoch": 0.62, + "grad_norm": 0.3354921042919159, + "learning_rate": 9.410793933987511e-05, + "loss": 0.1671, + "step": 4170 + }, + { + "epoch": 0.62, + "grad_norm": 0.6962907314300537, + "learning_rate": 9.409307166220637e-05, + "loss": 0.1613, + "step": 4180 + }, + { + "epoch": 0.62, + "grad_norm": 0.6421184539794922, + "learning_rate": 9.407820398453761e-05, + "loss": 0.1618, + "step": 4190 + }, + { + "epoch": 0.62, + "grad_norm": 0.7653113007545471, + "learning_rate": 9.406333630686888e-05, + "loss": 0.1654, + "step": 4200 + }, + { + "epoch": 0.62, + "grad_norm": 2.4837770462036133, + "learning_rate": 9.404846862920012e-05, + "loss": 0.1732, + "step": 4210 + }, + { + "epoch": 0.63, + "grad_norm": 0.7182124853134155, + "learning_rate": 9.403360095153138e-05, + "loss": 0.1543, + "step": 4220 + }, + { + "epoch": 0.63, + "grad_norm": 0.36238598823547363, + "learning_rate": 9.401873327386263e-05, + "loss": 0.1632, + "step": 4230 + }, + { + "epoch": 0.63, + "grad_norm": 0.8226011991500854, + "learning_rate": 9.400386559619387e-05, + "loss": 0.1523, + "step": 4240 + }, + { + "epoch": 0.63, + "grad_norm": 1.411160945892334, + "learning_rate": 9.398899791852514e-05, + "loss": 0.1557, + "step": 4250 + }, + { + "epoch": 0.63, + "grad_norm": 1.1795895099639893, + "learning_rate": 9.397413024085638e-05, + "loss": 0.166, + "step": 4260 + }, + { + "epoch": 0.63, + "grad_norm": 0.4530087113380432, + "learning_rate": 9.395926256318764e-05, + "loss": 0.1567, + "step": 4270 + }, + { + "epoch": 0.63, + "grad_norm": 2.1539831161499023, + "learning_rate": 9.394439488551888e-05, + "loss": 0.1627, + "step": 4280 + }, + { + "epoch": 0.64, + "grad_norm": 0.4007501006126404, + "learning_rate": 9.392952720785015e-05, + "loss": 0.1609, + "step": 4290 + }, + { + "epoch": 0.64, + "grad_norm": 0.5847845673561096, + "learning_rate": 9.391465953018139e-05, + "loss": 0.16, + "step": 4300 + }, + { + "epoch": 0.64, + "grad_norm": 0.2719428241252899, + "learning_rate": 9.389979185251264e-05, + "loss": 0.1712, + "step": 4310 + }, + { + "epoch": 0.64, + "grad_norm": 1.8159836530685425, + "learning_rate": 9.38849241748439e-05, + "loss": 0.1618, + "step": 4320 + }, + { + "epoch": 0.64, + "grad_norm": 2.801260232925415, + "learning_rate": 9.387005649717514e-05, + "loss": 0.1607, + "step": 4330 + }, + { + "epoch": 0.64, + "grad_norm": 1.856196641921997, + "learning_rate": 9.38551888195064e-05, + "loss": 0.1585, + "step": 4340 + }, + { + "epoch": 0.64, + "grad_norm": 0.8878040909767151, + "learning_rate": 9.384032114183765e-05, + "loss": 0.1576, + "step": 4350 + }, + { + "epoch": 0.65, + "grad_norm": 0.753246009349823, + "learning_rate": 9.38254534641689e-05, + "loss": 0.1705, + "step": 4360 + }, + { + "epoch": 0.65, + "grad_norm": 0.5893975496292114, + "learning_rate": 9.381058578650014e-05, + "loss": 0.1597, + "step": 4370 + }, + { + "epoch": 0.65, + "grad_norm": 1.1142498254776, + "learning_rate": 9.379571810883141e-05, + "loss": 0.1548, + "step": 4380 + }, + { + "epoch": 0.65, + "grad_norm": 0.9535636901855469, + "learning_rate": 9.378085043116267e-05, + "loss": 0.1747, + "step": 4390 + }, + { + "epoch": 0.65, + "grad_norm": 1.1514941453933716, + "learning_rate": 9.376598275349391e-05, + "loss": 0.1619, + "step": 4400 + }, + { + "epoch": 0.65, + "grad_norm": 0.8565515875816345, + "learning_rate": 9.375111507582516e-05, + "loss": 0.1619, + "step": 4410 + }, + { + "epoch": 0.66, + "grad_norm": 0.49910810589790344, + "learning_rate": 9.37362473981564e-05, + "loss": 0.1742, + "step": 4420 + }, + { + "epoch": 0.66, + "grad_norm": 0.327064573764801, + "learning_rate": 9.372137972048767e-05, + "loss": 0.15, + "step": 4430 + }, + { + "epoch": 0.66, + "grad_norm": 0.32551905512809753, + "learning_rate": 9.370651204281891e-05, + "loss": 0.1617, + "step": 4440 + }, + { + "epoch": 0.66, + "grad_norm": 0.39346516132354736, + "learning_rate": 9.369164436515017e-05, + "loss": 0.1597, + "step": 4450 + }, + { + "epoch": 0.66, + "grad_norm": 0.9522442817687988, + "learning_rate": 9.367677668748142e-05, + "loss": 0.1538, + "step": 4460 + }, + { + "epoch": 0.66, + "grad_norm": 1.2629470825195312, + "learning_rate": 9.366190900981268e-05, + "loss": 0.1596, + "step": 4470 + }, + { + "epoch": 0.66, + "grad_norm": 0.3080795109272003, + "learning_rate": 9.364704133214393e-05, + "loss": 0.1606, + "step": 4480 + }, + { + "epoch": 0.67, + "grad_norm": 0.3222214877605438, + "learning_rate": 9.363217365447517e-05, + "loss": 0.1502, + "step": 4490 + }, + { + "epoch": 0.67, + "grad_norm": 1.2670872211456299, + "learning_rate": 9.361730597680643e-05, + "loss": 0.1571, + "step": 4500 + }, + { + "epoch": 0.67, + "grad_norm": 1.1200741529464722, + "learning_rate": 9.360243829913768e-05, + "loss": 0.1644, + "step": 4510 + }, + { + "epoch": 0.67, + "grad_norm": 0.3835075795650482, + "learning_rate": 9.358757062146894e-05, + "loss": 0.1548, + "step": 4520 + }, + { + "epoch": 0.67, + "grad_norm": 1.4065500497817993, + "learning_rate": 9.357270294380018e-05, + "loss": 0.1622, + "step": 4530 + }, + { + "epoch": 0.67, + "grad_norm": 1.0452622175216675, + "learning_rate": 9.355783526613143e-05, + "loss": 0.1646, + "step": 4540 + }, + { + "epoch": 0.67, + "grad_norm": 0.7965924739837646, + "learning_rate": 9.354296758846269e-05, + "loss": 0.1689, + "step": 4550 + }, + { + "epoch": 0.68, + "grad_norm": 1.2334436178207397, + "learning_rate": 9.352809991079394e-05, + "loss": 0.1754, + "step": 4560 + }, + { + "epoch": 0.68, + "grad_norm": 1.1238471269607544, + "learning_rate": 9.35132322331252e-05, + "loss": 0.1593, + "step": 4570 + }, + { + "epoch": 0.68, + "grad_norm": 0.39519649744033813, + "learning_rate": 9.349836455545644e-05, + "loss": 0.1664, + "step": 4580 + }, + { + "epoch": 0.68, + "grad_norm": 0.2505687177181244, + "learning_rate": 9.348349687778769e-05, + "loss": 0.1644, + "step": 4590 + }, + { + "epoch": 0.68, + "grad_norm": 1.1792103052139282, + "learning_rate": 9.346862920011895e-05, + "loss": 0.1644, + "step": 4600 + }, + { + "epoch": 0.68, + "grad_norm": 0.8339989185333252, + "learning_rate": 9.34537615224502e-05, + "loss": 0.158, + "step": 4610 + }, + { + "epoch": 0.68, + "grad_norm": 0.2600146532058716, + "learning_rate": 9.343889384478144e-05, + "loss": 0.1657, + "step": 4620 + }, + { + "epoch": 0.69, + "grad_norm": 1.7673107385635376, + "learning_rate": 9.34240261671127e-05, + "loss": 0.1624, + "step": 4630 + }, + { + "epoch": 0.69, + "grad_norm": 0.3657977283000946, + "learning_rate": 9.340915848944395e-05, + "loss": 0.165, + "step": 4640 + }, + { + "epoch": 0.69, + "grad_norm": 0.5369674563407898, + "learning_rate": 9.339429081177521e-05, + "loss": 0.1643, + "step": 4650 + }, + { + "epoch": 0.69, + "grad_norm": 1.0297385454177856, + "learning_rate": 9.337942313410646e-05, + "loss": 0.1551, + "step": 4660 + }, + { + "epoch": 0.69, + "grad_norm": 1.2391917705535889, + "learning_rate": 9.33645554564377e-05, + "loss": 0.1541, + "step": 4670 + }, + { + "epoch": 0.69, + "grad_norm": 1.3810855150222778, + "learning_rate": 9.334968777876896e-05, + "loss": 0.1597, + "step": 4680 + }, + { + "epoch": 0.7, + "grad_norm": 1.7452163696289062, + "learning_rate": 9.333482010110021e-05, + "loss": 0.1672, + "step": 4690 + }, + { + "epoch": 0.7, + "grad_norm": 1.4795212745666504, + "learning_rate": 9.331995242343147e-05, + "loss": 0.1686, + "step": 4700 + }, + { + "epoch": 0.7, + "grad_norm": 1.1324572563171387, + "learning_rate": 9.330508474576271e-05, + "loss": 0.1568, + "step": 4710 + }, + { + "epoch": 0.7, + "grad_norm": 1.038382887840271, + "learning_rate": 9.329021706809396e-05, + "loss": 0.1605, + "step": 4720 + }, + { + "epoch": 0.7, + "grad_norm": 0.45325565338134766, + "learning_rate": 9.327534939042522e-05, + "loss": 0.1497, + "step": 4730 + }, + { + "epoch": 0.7, + "grad_norm": 0.37989452481269836, + "learning_rate": 9.326048171275647e-05, + "loss": 0.1581, + "step": 4740 + }, + { + "epoch": 0.7, + "grad_norm": 0.8576381802558899, + "learning_rate": 9.324561403508773e-05, + "loss": 0.1636, + "step": 4750 + }, + { + "epoch": 0.71, + "grad_norm": 0.400477796792984, + "learning_rate": 9.323074635741897e-05, + "loss": 0.1526, + "step": 4760 + }, + { + "epoch": 0.71, + "grad_norm": 0.37354516983032227, + "learning_rate": 9.321587867975022e-05, + "loss": 0.1575, + "step": 4770 + }, + { + "epoch": 0.71, + "grad_norm": 2.289604425430298, + "learning_rate": 9.320101100208148e-05, + "loss": 0.1655, + "step": 4780 + }, + { + "epoch": 0.71, + "grad_norm": 1.4728387594223022, + "learning_rate": 9.318614332441273e-05, + "loss": 0.1626, + "step": 4790 + }, + { + "epoch": 0.71, + "grad_norm": 1.617341160774231, + "learning_rate": 9.317127564674399e-05, + "loss": 0.1641, + "step": 4800 + }, + { + "epoch": 0.71, + "grad_norm": 1.5229002237319946, + "learning_rate": 9.315640796907523e-05, + "loss": 0.1575, + "step": 4810 + }, + { + "epoch": 0.71, + "grad_norm": 0.7396848797798157, + "learning_rate": 9.314154029140648e-05, + "loss": 0.1692, + "step": 4820 + }, + { + "epoch": 0.72, + "grad_norm": 1.8826942443847656, + "learning_rate": 9.312667261373774e-05, + "loss": 0.154, + "step": 4830 + }, + { + "epoch": 0.72, + "grad_norm": 0.8908949494361877, + "learning_rate": 9.3111804936069e-05, + "loss": 0.1576, + "step": 4840 + }, + { + "epoch": 0.72, + "grad_norm": 0.829403281211853, + "learning_rate": 9.309693725840024e-05, + "loss": 0.1626, + "step": 4850 + }, + { + "epoch": 0.72, + "grad_norm": 1.9040987491607666, + "learning_rate": 9.308206958073149e-05, + "loss": 0.1617, + "step": 4860 + }, + { + "epoch": 0.72, + "grad_norm": 0.24965029954910278, + "learning_rate": 9.306720190306275e-05, + "loss": 0.1677, + "step": 4870 + }, + { + "epoch": 0.72, + "grad_norm": 0.3442169427871704, + "learning_rate": 9.3052334225394e-05, + "loss": 0.1563, + "step": 4880 + }, + { + "epoch": 0.72, + "grad_norm": 0.4237022399902344, + "learning_rate": 9.303746654772525e-05, + "loss": 0.1645, + "step": 4890 + }, + { + "epoch": 0.73, + "grad_norm": 0.734087347984314, + "learning_rate": 9.30225988700565e-05, + "loss": 0.1523, + "step": 4900 + }, + { + "epoch": 0.73, + "grad_norm": 0.896193265914917, + "learning_rate": 9.300773119238775e-05, + "loss": 0.167, + "step": 4910 + }, + { + "epoch": 0.73, + "grad_norm": 0.47176918387413025, + "learning_rate": 9.2992863514719e-05, + "loss": 0.1584, + "step": 4920 + }, + { + "epoch": 0.73, + "grad_norm": 1.4008644819259644, + "learning_rate": 9.297799583705026e-05, + "loss": 0.1584, + "step": 4930 + }, + { + "epoch": 0.73, + "grad_norm": 1.161672830581665, + "learning_rate": 9.29631281593815e-05, + "loss": 0.1653, + "step": 4940 + }, + { + "epoch": 0.73, + "grad_norm": 1.0139836072921753, + "learning_rate": 9.294826048171276e-05, + "loss": 0.1574, + "step": 4950 + }, + { + "epoch": 0.74, + "grad_norm": 0.755262017250061, + "learning_rate": 9.293339280404401e-05, + "loss": 0.1549, + "step": 4960 + }, + { + "epoch": 0.74, + "grad_norm": 0.763871431350708, + "learning_rate": 9.291852512637527e-05, + "loss": 0.1598, + "step": 4970 + }, + { + "epoch": 0.74, + "grad_norm": 0.4599681496620178, + "learning_rate": 9.290365744870652e-05, + "loss": 0.1581, + "step": 4980 + }, + { + "epoch": 0.74, + "grad_norm": 0.6009973287582397, + "learning_rate": 9.288878977103776e-05, + "loss": 0.1627, + "step": 4990 + }, + { + "epoch": 0.74, + "grad_norm": 0.40760254859924316, + "learning_rate": 9.287392209336903e-05, + "loss": 0.1633, + "step": 5000 + }, + { + "epoch": 0.74, + "grad_norm": 0.41892722249031067, + "learning_rate": 9.285905441570027e-05, + "loss": 0.1606, + "step": 5010 + }, + { + "epoch": 0.74, + "grad_norm": 0.4405099153518677, + "learning_rate": 9.284418673803153e-05, + "loss": 0.1565, + "step": 5020 + }, + { + "epoch": 0.75, + "grad_norm": 0.49205201864242554, + "learning_rate": 9.282931906036277e-05, + "loss": 0.1631, + "step": 5030 + }, + { + "epoch": 0.75, + "grad_norm": 0.3461616635322571, + "learning_rate": 9.281445138269402e-05, + "loss": 0.1629, + "step": 5040 + }, + { + "epoch": 0.75, + "grad_norm": 0.33874744176864624, + "learning_rate": 9.279958370502529e-05, + "loss": 0.1497, + "step": 5050 + }, + { + "epoch": 0.75, + "grad_norm": 0.9239627122879028, + "learning_rate": 9.278471602735653e-05, + "loss": 0.1568, + "step": 5060 + }, + { + "epoch": 0.75, + "grad_norm": 1.629723310470581, + "learning_rate": 9.276984834968779e-05, + "loss": 0.1583, + "step": 5070 + }, + { + "epoch": 0.75, + "grad_norm": 0.38796573877334595, + "learning_rate": 9.275498067201903e-05, + "loss": 0.1595, + "step": 5080 + }, + { + "epoch": 0.75, + "grad_norm": 1.2880231142044067, + "learning_rate": 9.27401129943503e-05, + "loss": 0.1649, + "step": 5090 + }, + { + "epoch": 0.76, + "grad_norm": 0.4222949147224426, + "learning_rate": 9.272524531668154e-05, + "loss": 0.1614, + "step": 5100 + }, + { + "epoch": 0.76, + "grad_norm": 1.258013367652893, + "learning_rate": 9.271037763901279e-05, + "loss": 0.1645, + "step": 5110 + }, + { + "epoch": 0.76, + "grad_norm": 1.2159521579742432, + "learning_rate": 9.269550996134405e-05, + "loss": 0.1536, + "step": 5120 + }, + { + "epoch": 0.76, + "grad_norm": 0.3227091133594513, + "learning_rate": 9.268064228367529e-05, + "loss": 0.1621, + "step": 5130 + }, + { + "epoch": 0.76, + "grad_norm": 1.7758958339691162, + "learning_rate": 9.266577460600656e-05, + "loss": 0.1545, + "step": 5140 + }, + { + "epoch": 0.76, + "grad_norm": 1.6320562362670898, + "learning_rate": 9.26509069283378e-05, + "loss": 0.1676, + "step": 5150 + }, + { + "epoch": 0.76, + "grad_norm": 0.9210827350616455, + "learning_rate": 9.263603925066905e-05, + "loss": 0.157, + "step": 5160 + }, + { + "epoch": 0.77, + "grad_norm": 0.6037706732749939, + "learning_rate": 9.26211715730003e-05, + "loss": 0.1535, + "step": 5170 + }, + { + "epoch": 0.77, + "grad_norm": 2.3255116939544678, + "learning_rate": 9.260630389533156e-05, + "loss": 0.1681, + "step": 5180 + }, + { + "epoch": 0.77, + "grad_norm": 0.8426361680030823, + "learning_rate": 9.25914362176628e-05, + "loss": 0.1633, + "step": 5190 + }, + { + "epoch": 0.77, + "grad_norm": 0.29783931374549866, + "learning_rate": 9.257656853999406e-05, + "loss": 0.1557, + "step": 5200 + }, + { + "epoch": 0.77, + "grad_norm": 0.6732050776481628, + "learning_rate": 9.256170086232531e-05, + "loss": 0.1604, + "step": 5210 + }, + { + "epoch": 0.77, + "grad_norm": 0.9958044290542603, + "learning_rate": 9.254683318465655e-05, + "loss": 0.1654, + "step": 5220 + }, + { + "epoch": 0.78, + "grad_norm": 1.254502534866333, + "learning_rate": 9.253196550698782e-05, + "loss": 0.1559, + "step": 5230 + }, + { + "epoch": 0.78, + "grad_norm": 0.2834548056125641, + "learning_rate": 9.251709782931906e-05, + "loss": 0.1625, + "step": 5240 + }, + { + "epoch": 0.78, + "grad_norm": 0.3441438376903534, + "learning_rate": 9.250223015165032e-05, + "loss": 0.1511, + "step": 5250 + }, + { + "epoch": 0.78, + "grad_norm": 0.31368038058280945, + "learning_rate": 9.248736247398156e-05, + "loss": 0.1587, + "step": 5260 + }, + { + "epoch": 0.78, + "grad_norm": 0.5394402146339417, + "learning_rate": 9.247249479631283e-05, + "loss": 0.1582, + "step": 5270 + }, + { + "epoch": 0.78, + "grad_norm": 0.29884475469589233, + "learning_rate": 9.245762711864407e-05, + "loss": 0.1552, + "step": 5280 + }, + { + "epoch": 0.78, + "grad_norm": 1.530449628829956, + "learning_rate": 9.244275944097532e-05, + "loss": 0.1703, + "step": 5290 + }, + { + "epoch": 0.79, + "grad_norm": 2.9983556270599365, + "learning_rate": 9.242789176330658e-05, + "loss": 0.1549, + "step": 5300 + }, + { + "epoch": 0.79, + "grad_norm": 0.7032681703567505, + "learning_rate": 9.241302408563782e-05, + "loss": 0.1609, + "step": 5310 + }, + { + "epoch": 0.79, + "grad_norm": 1.7556791305541992, + "learning_rate": 9.239815640796909e-05, + "loss": 0.1576, + "step": 5320 + }, + { + "epoch": 0.79, + "grad_norm": 0.46143102645874023, + "learning_rate": 9.238328873030033e-05, + "loss": 0.1725, + "step": 5330 + }, + { + "epoch": 0.79, + "grad_norm": 0.6132158041000366, + "learning_rate": 9.236842105263158e-05, + "loss": 0.1578, + "step": 5340 + }, + { + "epoch": 0.79, + "grad_norm": 0.317751407623291, + "learning_rate": 9.235355337496282e-05, + "loss": 0.1548, + "step": 5350 + }, + { + "epoch": 0.79, + "grad_norm": 0.5321698188781738, + "learning_rate": 9.23386856972941e-05, + "loss": 0.1616, + "step": 5360 + }, + { + "epoch": 0.8, + "grad_norm": 1.509246826171875, + "learning_rate": 9.232381801962533e-05, + "loss": 0.1582, + "step": 5370 + }, + { + "epoch": 0.8, + "grad_norm": 0.4150841534137726, + "learning_rate": 9.230895034195659e-05, + "loss": 0.1588, + "step": 5380 + }, + { + "epoch": 0.8, + "grad_norm": 0.5779004693031311, + "learning_rate": 9.229408266428784e-05, + "loss": 0.1571, + "step": 5390 + }, + { + "epoch": 0.8, + "grad_norm": 0.8876129984855652, + "learning_rate": 9.227921498661909e-05, + "loss": 0.1528, + "step": 5400 + }, + { + "epoch": 0.8, + "grad_norm": 0.6326704025268555, + "learning_rate": 9.226434730895035e-05, + "loss": 0.1632, + "step": 5410 + }, + { + "epoch": 0.8, + "grad_norm": 1.0380185842514038, + "learning_rate": 9.22494796312816e-05, + "loss": 0.1577, + "step": 5420 + }, + { + "epoch": 0.8, + "grad_norm": 1.268033504486084, + "learning_rate": 9.223461195361285e-05, + "loss": 0.1608, + "step": 5430 + }, + { + "epoch": 0.81, + "grad_norm": 0.5740419626235962, + "learning_rate": 9.22197442759441e-05, + "loss": 0.1648, + "step": 5440 + }, + { + "epoch": 0.81, + "grad_norm": 0.36971914768218994, + "learning_rate": 9.220487659827536e-05, + "loss": 0.1601, + "step": 5450 + }, + { + "epoch": 0.81, + "grad_norm": 0.39549487829208374, + "learning_rate": 9.219000892060661e-05, + "loss": 0.154, + "step": 5460 + }, + { + "epoch": 0.81, + "grad_norm": 0.9269174933433533, + "learning_rate": 9.217514124293785e-05, + "loss": 0.1581, + "step": 5470 + }, + { + "epoch": 0.81, + "grad_norm": 1.122969388961792, + "learning_rate": 9.216027356526911e-05, + "loss": 0.154, + "step": 5480 + }, + { + "epoch": 0.81, + "grad_norm": 0.5427556037902832, + "learning_rate": 9.214540588760036e-05, + "loss": 0.1555, + "step": 5490 + }, + { + "epoch": 0.82, + "grad_norm": 0.5179612636566162, + "learning_rate": 9.213053820993162e-05, + "loss": 0.1569, + "step": 5500 + }, + { + "epoch": 0.82, + "grad_norm": 0.7659315466880798, + "learning_rate": 9.211567053226286e-05, + "loss": 0.1667, + "step": 5510 + }, + { + "epoch": 0.82, + "grad_norm": 1.04986572265625, + "learning_rate": 9.210080285459412e-05, + "loss": 0.1646, + "step": 5520 + }, + { + "epoch": 0.82, + "grad_norm": 1.455209493637085, + "learning_rate": 9.208593517692537e-05, + "loss": 0.1579, + "step": 5530 + }, + { + "epoch": 0.82, + "grad_norm": 1.3870110511779785, + "learning_rate": 9.207106749925662e-05, + "loss": 0.1526, + "step": 5540 + }, + { + "epoch": 0.82, + "grad_norm": 0.7180333733558655, + "learning_rate": 9.205619982158788e-05, + "loss": 0.16, + "step": 5550 + }, + { + "epoch": 0.82, + "grad_norm": 0.5366026759147644, + "learning_rate": 9.204133214391912e-05, + "loss": 0.1571, + "step": 5560 + }, + { + "epoch": 0.83, + "grad_norm": 1.4958914518356323, + "learning_rate": 9.202646446625038e-05, + "loss": 0.155, + "step": 5570 + }, + { + "epoch": 0.83, + "grad_norm": 0.368592768907547, + "learning_rate": 9.201159678858163e-05, + "loss": 0.1559, + "step": 5580 + }, + { + "epoch": 0.83, + "grad_norm": 0.8154517412185669, + "learning_rate": 9.199672911091289e-05, + "loss": 0.1526, + "step": 5590 + }, + { + "epoch": 0.83, + "grad_norm": 0.32349810004234314, + "learning_rate": 9.198186143324413e-05, + "loss": 0.1689, + "step": 5600 + }, + { + "epoch": 0.83, + "grad_norm": 0.8520846366882324, + "learning_rate": 9.196699375557538e-05, + "loss": 0.1599, + "step": 5610 + }, + { + "epoch": 0.83, + "grad_norm": 0.8153768181800842, + "learning_rate": 9.195212607790664e-05, + "loss": 0.1583, + "step": 5620 + }, + { + "epoch": 0.83, + "grad_norm": 1.0507680177688599, + "learning_rate": 9.193725840023789e-05, + "loss": 0.1553, + "step": 5630 + }, + { + "epoch": 0.84, + "grad_norm": 0.34485527873039246, + "learning_rate": 9.192239072256915e-05, + "loss": 0.1521, + "step": 5640 + }, + { + "epoch": 0.84, + "grad_norm": 0.3874395787715912, + "learning_rate": 9.190752304490039e-05, + "loss": 0.159, + "step": 5650 + }, + { + "epoch": 0.84, + "grad_norm": 1.9768149852752686, + "learning_rate": 9.189265536723164e-05, + "loss": 0.1572, + "step": 5660 + }, + { + "epoch": 0.84, + "grad_norm": 0.3996683359146118, + "learning_rate": 9.18777876895629e-05, + "loss": 0.1579, + "step": 5670 + }, + { + "epoch": 0.84, + "grad_norm": 0.6045333743095398, + "learning_rate": 9.186292001189415e-05, + "loss": 0.1532, + "step": 5680 + }, + { + "epoch": 0.84, + "grad_norm": 1.318505883216858, + "learning_rate": 9.184805233422539e-05, + "loss": 0.1589, + "step": 5690 + }, + { + "epoch": 0.84, + "grad_norm": 0.7289694547653198, + "learning_rate": 9.183318465655665e-05, + "loss": 0.168, + "step": 5700 + }, + { + "epoch": 0.85, + "grad_norm": 1.2598559856414795, + "learning_rate": 9.18183169788879e-05, + "loss": 0.1595, + "step": 5710 + }, + { + "epoch": 0.85, + "grad_norm": 1.0090540647506714, + "learning_rate": 9.180344930121916e-05, + "loss": 0.1579, + "step": 5720 + }, + { + "epoch": 0.85, + "grad_norm": 1.2339668273925781, + "learning_rate": 9.178858162355041e-05, + "loss": 0.1568, + "step": 5730 + }, + { + "epoch": 0.85, + "grad_norm": 1.9758799076080322, + "learning_rate": 9.177371394588165e-05, + "loss": 0.1568, + "step": 5740 + }, + { + "epoch": 0.85, + "grad_norm": 1.6045335531234741, + "learning_rate": 9.175884626821291e-05, + "loss": 0.1564, + "step": 5750 + }, + { + "epoch": 0.85, + "grad_norm": 0.40253713726997375, + "learning_rate": 9.174397859054416e-05, + "loss": 0.1584, + "step": 5760 + }, + { + "epoch": 0.86, + "grad_norm": 0.332417756319046, + "learning_rate": 9.172911091287542e-05, + "loss": 0.1492, + "step": 5770 + }, + { + "epoch": 0.86, + "grad_norm": 0.3337450623512268, + "learning_rate": 9.171424323520667e-05, + "loss": 0.1496, + "step": 5780 + }, + { + "epoch": 0.86, + "grad_norm": 0.6006415486335754, + "learning_rate": 9.169937555753791e-05, + "loss": 0.1664, + "step": 5790 + }, + { + "epoch": 0.86, + "grad_norm": 1.1703321933746338, + "learning_rate": 9.168450787986917e-05, + "loss": 0.1515, + "step": 5800 + }, + { + "epoch": 0.86, + "grad_norm": 1.246114730834961, + "learning_rate": 9.166964020220042e-05, + "loss": 0.1569, + "step": 5810 + }, + { + "epoch": 0.86, + "grad_norm": 1.7468287944793701, + "learning_rate": 9.165477252453168e-05, + "loss": 0.1523, + "step": 5820 + }, + { + "epoch": 0.86, + "grad_norm": 0.8291600942611694, + "learning_rate": 9.163990484686292e-05, + "loss": 0.1642, + "step": 5830 + }, + { + "epoch": 0.87, + "grad_norm": 0.5658857226371765, + "learning_rate": 9.162503716919417e-05, + "loss": 0.151, + "step": 5840 + }, + { + "epoch": 0.87, + "grad_norm": 1.1321245431900024, + "learning_rate": 9.161016949152543e-05, + "loss": 0.1544, + "step": 5850 + }, + { + "epoch": 0.87, + "grad_norm": 0.9204747080802917, + "learning_rate": 9.159530181385668e-05, + "loss": 0.1565, + "step": 5860 + }, + { + "epoch": 0.87, + "grad_norm": 0.616126298904419, + "learning_rate": 9.158043413618794e-05, + "loss": 0.1665, + "step": 5870 + }, + { + "epoch": 0.87, + "grad_norm": 0.37067389488220215, + "learning_rate": 9.156556645851918e-05, + "loss": 0.1609, + "step": 5880 + }, + { + "epoch": 0.87, + "grad_norm": 0.9909270405769348, + "learning_rate": 9.155069878085043e-05, + "loss": 0.1481, + "step": 5890 + }, + { + "epoch": 0.87, + "grad_norm": 1.0309427976608276, + "learning_rate": 9.153583110318169e-05, + "loss": 0.1523, + "step": 5900 + }, + { + "epoch": 0.88, + "grad_norm": 1.7706396579742432, + "learning_rate": 9.152096342551294e-05, + "loss": 0.1608, + "step": 5910 + }, + { + "epoch": 0.88, + "grad_norm": 2.229520082473755, + "learning_rate": 9.150609574784418e-05, + "loss": 0.152, + "step": 5920 + }, + { + "epoch": 0.88, + "grad_norm": 0.4807567298412323, + "learning_rate": 9.149122807017544e-05, + "loss": 0.156, + "step": 5930 + }, + { + "epoch": 0.88, + "grad_norm": 0.37514257431030273, + "learning_rate": 9.14763603925067e-05, + "loss": 0.1587, + "step": 5940 + }, + { + "epoch": 0.88, + "grad_norm": 2.0821585655212402, + "learning_rate": 9.146149271483795e-05, + "loss": 0.151, + "step": 5950 + }, + { + "epoch": 0.88, + "grad_norm": 0.8426055908203125, + "learning_rate": 9.14466250371692e-05, + "loss": 0.1617, + "step": 5960 + }, + { + "epoch": 0.88, + "grad_norm": 0.3103870451450348, + "learning_rate": 9.143175735950044e-05, + "loss": 0.1565, + "step": 5970 + }, + { + "epoch": 0.89, + "grad_norm": 0.8296404480934143, + "learning_rate": 9.141688968183171e-05, + "loss": 0.1533, + "step": 5980 + }, + { + "epoch": 0.89, + "grad_norm": 1.0698027610778809, + "learning_rate": 9.140202200416295e-05, + "loss": 0.1537, + "step": 5990 + }, + { + "epoch": 0.89, + "grad_norm": 1.0414214134216309, + "learning_rate": 9.138715432649421e-05, + "loss": 0.1532, + "step": 6000 + }, + { + "epoch": 0.89, + "grad_norm": 0.6497471332550049, + "learning_rate": 9.137228664882545e-05, + "loss": 0.1606, + "step": 6010 + }, + { + "epoch": 0.89, + "grad_norm": 0.3416892886161804, + "learning_rate": 9.13574189711567e-05, + "loss": 0.1499, + "step": 6020 + }, + { + "epoch": 0.89, + "grad_norm": 0.4196206033229828, + "learning_rate": 9.134255129348796e-05, + "loss": 0.1449, + "step": 6030 + }, + { + "epoch": 0.9, + "grad_norm": 0.3455292582511902, + "learning_rate": 9.132768361581921e-05, + "loss": 0.149, + "step": 6040 + }, + { + "epoch": 0.9, + "grad_norm": 0.3500816822052002, + "learning_rate": 9.131281593815047e-05, + "loss": 0.1543, + "step": 6050 + }, + { + "epoch": 0.9, + "grad_norm": 0.5100831389427185, + "learning_rate": 9.129794826048171e-05, + "loss": 0.1535, + "step": 6060 + }, + { + "epoch": 0.9, + "grad_norm": 1.426044225692749, + "learning_rate": 9.128308058281298e-05, + "loss": 0.1532, + "step": 6070 + }, + { + "epoch": 0.9, + "grad_norm": 0.7652605772018433, + "learning_rate": 9.126821290514422e-05, + "loss": 0.1612, + "step": 6080 + }, + { + "epoch": 0.9, + "grad_norm": 0.6746863126754761, + "learning_rate": 9.125334522747547e-05, + "loss": 0.1595, + "step": 6090 + }, + { + "epoch": 0.9, + "grad_norm": 0.6860318779945374, + "learning_rate": 9.123847754980673e-05, + "loss": 0.156, + "step": 6100 + }, + { + "epoch": 0.91, + "grad_norm": 1.8772507905960083, + "learning_rate": 9.122360987213797e-05, + "loss": 0.1549, + "step": 6110 + }, + { + "epoch": 0.91, + "grad_norm": 0.5276684761047363, + "learning_rate": 9.120874219446924e-05, + "loss": 0.1692, + "step": 6120 + }, + { + "epoch": 0.91, + "grad_norm": 1.8537567853927612, + "learning_rate": 9.119387451680048e-05, + "loss": 0.1646, + "step": 6130 + }, + { + "epoch": 0.91, + "grad_norm": 1.389715552330017, + "learning_rate": 9.117900683913173e-05, + "loss": 0.1606, + "step": 6140 + }, + { + "epoch": 0.91, + "grad_norm": 0.4574753940105438, + "learning_rate": 9.116413916146298e-05, + "loss": 0.1483, + "step": 6150 + }, + { + "epoch": 0.91, + "grad_norm": 0.41414326429367065, + "learning_rate": 9.114927148379424e-05, + "loss": 0.1575, + "step": 6160 + }, + { + "epoch": 0.91, + "grad_norm": 0.36833006143569946, + "learning_rate": 9.113440380612549e-05, + "loss": 0.1583, + "step": 6170 + }, + { + "epoch": 0.92, + "grad_norm": 0.7688261866569519, + "learning_rate": 9.111953612845674e-05, + "loss": 0.1553, + "step": 6180 + }, + { + "epoch": 0.92, + "grad_norm": 0.27714991569519043, + "learning_rate": 9.1104668450788e-05, + "loss": 0.1607, + "step": 6190 + }, + { + "epoch": 0.92, + "grad_norm": 0.7245810627937317, + "learning_rate": 9.108980077311924e-05, + "loss": 0.1495, + "step": 6200 + }, + { + "epoch": 0.92, + "grad_norm": 2.5172832012176514, + "learning_rate": 9.10749330954505e-05, + "loss": 0.17, + "step": 6210 + }, + { + "epoch": 0.92, + "grad_norm": 0.39452874660491943, + "learning_rate": 9.106006541778175e-05, + "loss": 0.152, + "step": 6220 + }, + { + "epoch": 0.92, + "grad_norm": 1.6295764446258545, + "learning_rate": 9.1045197740113e-05, + "loss": 0.155, + "step": 6230 + }, + { + "epoch": 0.92, + "grad_norm": 0.8026036620140076, + "learning_rate": 9.103033006244424e-05, + "loss": 0.149, + "step": 6240 + }, + { + "epoch": 0.93, + "grad_norm": 1.2626301050186157, + "learning_rate": 9.101546238477551e-05, + "loss": 0.1527, + "step": 6250 + }, + { + "epoch": 0.93, + "grad_norm": 0.3312925398349762, + "learning_rate": 9.100059470710675e-05, + "loss": 0.147, + "step": 6260 + }, + { + "epoch": 0.93, + "grad_norm": 1.411736249923706, + "learning_rate": 9.0985727029438e-05, + "loss": 0.1511, + "step": 6270 + }, + { + "epoch": 0.93, + "grad_norm": 0.4057709872722626, + "learning_rate": 9.097085935176926e-05, + "loss": 0.1557, + "step": 6280 + }, + { + "epoch": 0.93, + "grad_norm": 1.2932243347167969, + "learning_rate": 9.09559916741005e-05, + "loss": 0.1552, + "step": 6290 + }, + { + "epoch": 0.93, + "grad_norm": 1.286341667175293, + "learning_rate": 9.094112399643177e-05, + "loss": 0.1472, + "step": 6300 + }, + { + "epoch": 0.94, + "grad_norm": 1.9994767904281616, + "learning_rate": 9.092625631876301e-05, + "loss": 0.1651, + "step": 6310 + }, + { + "epoch": 0.94, + "grad_norm": 1.4747790098190308, + "learning_rate": 9.091138864109427e-05, + "loss": 0.1657, + "step": 6320 + }, + { + "epoch": 0.94, + "grad_norm": 0.8761974573135376, + "learning_rate": 9.089652096342551e-05, + "loss": 0.1469, + "step": 6330 + }, + { + "epoch": 0.94, + "grad_norm": 0.5455211997032166, + "learning_rate": 9.088165328575678e-05, + "loss": 0.1611, + "step": 6340 + }, + { + "epoch": 0.94, + "grad_norm": 1.3673850297927856, + "learning_rate": 9.086678560808802e-05, + "loss": 0.1655, + "step": 6350 + }, + { + "epoch": 0.94, + "grad_norm": 0.8073334693908691, + "learning_rate": 9.085191793041927e-05, + "loss": 0.1555, + "step": 6360 + }, + { + "epoch": 0.94, + "grad_norm": 0.519607424736023, + "learning_rate": 9.083705025275053e-05, + "loss": 0.1597, + "step": 6370 + }, + { + "epoch": 0.95, + "grad_norm": 0.8752426505088806, + "learning_rate": 9.082218257508177e-05, + "loss": 0.1519, + "step": 6380 + }, + { + "epoch": 0.95, + "grad_norm": 0.3730453550815582, + "learning_rate": 9.080731489741304e-05, + "loss": 0.1535, + "step": 6390 + }, + { + "epoch": 0.95, + "grad_norm": 1.3529815673828125, + "learning_rate": 9.079244721974428e-05, + "loss": 0.1618, + "step": 6400 + }, + { + "epoch": 0.95, + "grad_norm": 1.1450254917144775, + "learning_rate": 9.077757954207553e-05, + "loss": 0.1618, + "step": 6410 + }, + { + "epoch": 0.95, + "grad_norm": 0.6045798659324646, + "learning_rate": 9.076271186440677e-05, + "loss": 0.1594, + "step": 6420 + }, + { + "epoch": 0.95, + "grad_norm": 0.7981701493263245, + "learning_rate": 9.074784418673804e-05, + "loss": 0.1568, + "step": 6430 + }, + { + "epoch": 0.95, + "grad_norm": 0.3604912757873535, + "learning_rate": 9.07329765090693e-05, + "loss": 0.1493, + "step": 6440 + }, + { + "epoch": 0.96, + "grad_norm": 0.8800011873245239, + "learning_rate": 9.071810883140054e-05, + "loss": 0.149, + "step": 6450 + }, + { + "epoch": 0.96, + "grad_norm": 1.3312394618988037, + "learning_rate": 9.070324115373179e-05, + "loss": 0.1507, + "step": 6460 + }, + { + "epoch": 0.96, + "grad_norm": 1.0276165008544922, + "learning_rate": 9.068837347606305e-05, + "loss": 0.1539, + "step": 6470 + }, + { + "epoch": 0.96, + "grad_norm": 1.2565714120864868, + "learning_rate": 9.06735057983943e-05, + "loss": 0.1615, + "step": 6480 + }, + { + "epoch": 0.96, + "grad_norm": 1.5447033643722534, + "learning_rate": 9.065863812072554e-05, + "loss": 0.1467, + "step": 6490 + }, + { + "epoch": 0.96, + "grad_norm": 1.4500904083251953, + "learning_rate": 9.06437704430568e-05, + "loss": 0.1514, + "step": 6500 + }, + { + "epoch": 0.96, + "grad_norm": 1.5841881036758423, + "learning_rate": 9.062890276538805e-05, + "loss": 0.1576, + "step": 6510 + }, + { + "epoch": 0.97, + "grad_norm": 0.9153670072555542, + "learning_rate": 9.061403508771931e-05, + "loss": 0.15, + "step": 6520 + }, + { + "epoch": 0.97, + "grad_norm": 0.758307695388794, + "learning_rate": 9.059916741005056e-05, + "loss": 0.1564, + "step": 6530 + }, + { + "epoch": 0.97, + "grad_norm": 0.8557407855987549, + "learning_rate": 9.05842997323818e-05, + "loss": 0.1545, + "step": 6540 + }, + { + "epoch": 0.97, + "grad_norm": 1.1173782348632812, + "learning_rate": 9.056943205471306e-05, + "loss": 0.1519, + "step": 6550 + }, + { + "epoch": 0.97, + "grad_norm": 1.6467628479003906, + "learning_rate": 9.055456437704431e-05, + "loss": 0.1522, + "step": 6560 + }, + { + "epoch": 0.97, + "grad_norm": 0.2692776620388031, + "learning_rate": 9.053969669937557e-05, + "loss": 0.1558, + "step": 6570 + }, + { + "epoch": 0.98, + "grad_norm": 0.31524816155433655, + "learning_rate": 9.052482902170681e-05, + "loss": 0.1534, + "step": 6580 + }, + { + "epoch": 0.98, + "grad_norm": 0.44416341185569763, + "learning_rate": 9.050996134403806e-05, + "loss": 0.1574, + "step": 6590 + }, + { + "epoch": 0.98, + "grad_norm": 1.8063576221466064, + "learning_rate": 9.049509366636932e-05, + "loss": 0.1636, + "step": 6600 + }, + { + "epoch": 0.98, + "grad_norm": 0.8080287575721741, + "learning_rate": 9.048022598870057e-05, + "loss": 0.1518, + "step": 6610 + }, + { + "epoch": 0.98, + "grad_norm": 0.5311675071716309, + "learning_rate": 9.046535831103183e-05, + "loss": 0.1546, + "step": 6620 + }, + { + "epoch": 0.98, + "grad_norm": 0.27562418580055237, + "learning_rate": 9.045049063336307e-05, + "loss": 0.1472, + "step": 6630 + }, + { + "epoch": 0.98, + "grad_norm": 1.1780297756195068, + "learning_rate": 9.043562295569432e-05, + "loss": 0.1484, + "step": 6640 + }, + { + "epoch": 0.99, + "grad_norm": 1.2727899551391602, + "learning_rate": 9.042075527802558e-05, + "loss": 0.153, + "step": 6650 + }, + { + "epoch": 0.99, + "grad_norm": 2.0948357582092285, + "learning_rate": 9.040588760035683e-05, + "loss": 0.1465, + "step": 6660 + }, + { + "epoch": 0.99, + "grad_norm": 0.42854198813438416, + "learning_rate": 9.039101992268807e-05, + "loss": 0.1606, + "step": 6670 + }, + { + "epoch": 0.99, + "grad_norm": 1.9154927730560303, + "learning_rate": 9.037615224501933e-05, + "loss": 0.1568, + "step": 6680 + }, + { + "epoch": 0.99, + "grad_norm": 0.49313873052597046, + "learning_rate": 9.036128456735058e-05, + "loss": 0.15, + "step": 6690 + }, + { + "epoch": 0.99, + "grad_norm": 1.1896381378173828, + "learning_rate": 9.034641688968184e-05, + "loss": 0.1519, + "step": 6700 + }, + { + "epoch": 0.99, + "grad_norm": 1.2026231288909912, + "learning_rate": 9.03315492120131e-05, + "loss": 0.1514, + "step": 6710 + }, + { + "epoch": 1.0, + "grad_norm": 0.7389522194862366, + "learning_rate": 9.031668153434433e-05, + "loss": 0.1506, + "step": 6720 + }, + { + "epoch": 1.0, + "grad_norm": 1.0212032794952393, + "learning_rate": 9.030181385667559e-05, + "loss": 0.1541, + "step": 6730 + }, + { + "epoch": 1.0, + "grad_norm": 0.937395453453064, + "learning_rate": 9.028694617900684e-05, + "loss": 0.1568, + "step": 6740 + }, + { + "epoch": 1.0, + "eval_loss": 0.15145954489707947, + "eval_runtime": 2482.4757, + "eval_samples_per_second": 235.119, + "eval_steps_per_second": 3.674, + "step": 6746 + }, + { + "epoch": 1.0, + "grad_norm": 1.5865614414215088, + "learning_rate": 9.02720785013381e-05, + "loss": 0.1487, + "step": 6750 + }, + { + "epoch": 1.0, + "grad_norm": 2.532686471939087, + "learning_rate": 9.025721082366934e-05, + "loss": 0.1525, + "step": 6760 + }, + { + "epoch": 1.0, + "grad_norm": 0.690150797367096, + "learning_rate": 9.02423431460006e-05, + "loss": 0.1543, + "step": 6770 + }, + { + "epoch": 1.01, + "grad_norm": 0.32673001289367676, + "learning_rate": 9.022747546833185e-05, + "loss": 0.1401, + "step": 6780 + }, + { + "epoch": 1.01, + "grad_norm": 0.432534396648407, + "learning_rate": 9.02126077906631e-05, + "loss": 0.1482, + "step": 6790 + }, + { + "epoch": 1.01, + "grad_norm": 1.469577431678772, + "learning_rate": 9.019774011299436e-05, + "loss": 0.1517, + "step": 6800 + }, + { + "epoch": 1.01, + "grad_norm": 1.8778477907180786, + "learning_rate": 9.01828724353256e-05, + "loss": 0.143, + "step": 6810 + }, + { + "epoch": 1.01, + "grad_norm": 0.7702934741973877, + "learning_rate": 9.016800475765686e-05, + "loss": 0.1547, + "step": 6820 + }, + { + "epoch": 1.01, + "grad_norm": 1.1953880786895752, + "learning_rate": 9.015313707998811e-05, + "loss": 0.1537, + "step": 6830 + }, + { + "epoch": 1.01, + "grad_norm": 1.4178327322006226, + "learning_rate": 9.013826940231936e-05, + "loss": 0.1463, + "step": 6840 + }, + { + "epoch": 1.02, + "grad_norm": 1.5760785341262817, + "learning_rate": 9.012340172465062e-05, + "loss": 0.1661, + "step": 6850 + }, + { + "epoch": 1.02, + "grad_norm": 0.34104621410369873, + "learning_rate": 9.010853404698186e-05, + "loss": 0.156, + "step": 6860 + }, + { + "epoch": 1.02, + "grad_norm": 0.4194226562976837, + "learning_rate": 9.009366636931312e-05, + "loss": 0.1533, + "step": 6870 + }, + { + "epoch": 1.02, + "grad_norm": 1.504073143005371, + "learning_rate": 9.007879869164437e-05, + "loss": 0.1447, + "step": 6880 + }, + { + "epoch": 1.02, + "grad_norm": 1.8167688846588135, + "learning_rate": 9.006393101397563e-05, + "loss": 0.1501, + "step": 6890 + }, + { + "epoch": 1.02, + "grad_norm": 0.4795800447463989, + "learning_rate": 9.004906333630687e-05, + "loss": 0.1444, + "step": 6900 + }, + { + "epoch": 1.02, + "grad_norm": 0.4590786397457123, + "learning_rate": 9.003419565863812e-05, + "loss": 0.1471, + "step": 6910 + }, + { + "epoch": 1.03, + "grad_norm": 0.7949062585830688, + "learning_rate": 9.001932798096938e-05, + "loss": 0.149, + "step": 6920 + }, + { + "epoch": 1.03, + "grad_norm": 0.5231783390045166, + "learning_rate": 9.000446030330063e-05, + "loss": 0.1459, + "step": 6930 + }, + { + "epoch": 1.03, + "grad_norm": 1.8959499597549438, + "learning_rate": 8.998959262563189e-05, + "loss": 0.1585, + "step": 6940 + }, + { + "epoch": 1.03, + "grad_norm": 0.5581046342849731, + "learning_rate": 8.997472494796313e-05, + "loss": 0.1516, + "step": 6950 + }, + { + "epoch": 1.03, + "grad_norm": 2.33567476272583, + "learning_rate": 8.99598572702944e-05, + "loss": 0.1512, + "step": 6960 + }, + { + "epoch": 1.03, + "grad_norm": 0.262761652469635, + "learning_rate": 8.994498959262564e-05, + "loss": 0.1537, + "step": 6970 + }, + { + "epoch": 1.03, + "grad_norm": 0.34269747138023376, + "learning_rate": 8.993012191495689e-05, + "loss": 0.1539, + "step": 6980 + }, + { + "epoch": 1.04, + "grad_norm": 0.5534690618515015, + "learning_rate": 8.991525423728813e-05, + "loss": 0.1462, + "step": 6990 + }, + { + "epoch": 1.04, + "grad_norm": 0.38825082778930664, + "learning_rate": 8.990038655961939e-05, + "loss": 0.1413, + "step": 7000 + }, + { + "epoch": 1.04, + "grad_norm": 0.9814270734786987, + "learning_rate": 8.988551888195064e-05, + "loss": 0.1545, + "step": 7010 + }, + { + "epoch": 1.04, + "grad_norm": 0.6726862192153931, + "learning_rate": 8.98706512042819e-05, + "loss": 0.1528, + "step": 7020 + }, + { + "epoch": 1.04, + "grad_norm": 1.0980238914489746, + "learning_rate": 8.985578352661315e-05, + "loss": 0.1505, + "step": 7030 + }, + { + "epoch": 1.04, + "grad_norm": 1.7269726991653442, + "learning_rate": 8.984091584894439e-05, + "loss": 0.1519, + "step": 7040 + }, + { + "epoch": 1.05, + "grad_norm": 0.8081973195075989, + "learning_rate": 8.982604817127566e-05, + "loss": 0.1487, + "step": 7050 + }, + { + "epoch": 1.05, + "grad_norm": 0.884931206703186, + "learning_rate": 8.98111804936069e-05, + "loss": 0.1451, + "step": 7060 + }, + { + "epoch": 1.05, + "grad_norm": 0.8839534521102905, + "learning_rate": 8.979631281593816e-05, + "loss": 0.1534, + "step": 7070 + }, + { + "epoch": 1.05, + "grad_norm": 0.30532771348953247, + "learning_rate": 8.97814451382694e-05, + "loss": 0.1534, + "step": 7080 + }, + { + "epoch": 1.05, + "grad_norm": 1.0127397775650024, + "learning_rate": 8.976657746060065e-05, + "loss": 0.1516, + "step": 7090 + }, + { + "epoch": 1.05, + "grad_norm": 1.871941328048706, + "learning_rate": 8.975170978293192e-05, + "loss": 0.1472, + "step": 7100 + }, + { + "epoch": 1.05, + "grad_norm": 1.0577542781829834, + "learning_rate": 8.973684210526316e-05, + "loss": 0.1484, + "step": 7110 + }, + { + "epoch": 1.06, + "grad_norm": 0.9246624112129211, + "learning_rate": 8.972197442759442e-05, + "loss": 0.1467, + "step": 7120 + }, + { + "epoch": 1.06, + "grad_norm": 1.204312801361084, + "learning_rate": 8.970710674992566e-05, + "loss": 0.1523, + "step": 7130 + }, + { + "epoch": 1.06, + "grad_norm": 0.6742764115333557, + "learning_rate": 8.969223907225693e-05, + "loss": 0.1497, + "step": 7140 + }, + { + "epoch": 1.06, + "grad_norm": 1.5196964740753174, + "learning_rate": 8.967737139458817e-05, + "loss": 0.1512, + "step": 7150 + }, + { + "epoch": 1.06, + "grad_norm": 0.3582424521446228, + "learning_rate": 8.966250371691942e-05, + "loss": 0.1618, + "step": 7160 + }, + { + "epoch": 1.06, + "grad_norm": 0.34651896357536316, + "learning_rate": 8.964763603925068e-05, + "loss": 0.1511, + "step": 7170 + }, + { + "epoch": 1.06, + "grad_norm": 1.343338966369629, + "learning_rate": 8.963276836158192e-05, + "loss": 0.1431, + "step": 7180 + }, + { + "epoch": 1.07, + "grad_norm": 1.5551012754440308, + "learning_rate": 8.961790068391319e-05, + "loss": 0.1451, + "step": 7190 + }, + { + "epoch": 1.07, + "grad_norm": 0.4299207627773285, + "learning_rate": 8.960303300624443e-05, + "loss": 0.1464, + "step": 7200 + }, + { + "epoch": 1.07, + "grad_norm": 0.44814541935920715, + "learning_rate": 8.958816532857568e-05, + "loss": 0.1508, + "step": 7210 + }, + { + "epoch": 1.07, + "grad_norm": 0.5933278799057007, + "learning_rate": 8.957329765090692e-05, + "loss": 0.1558, + "step": 7220 + }, + { + "epoch": 1.07, + "grad_norm": 0.4118923842906952, + "learning_rate": 8.955842997323819e-05, + "loss": 0.1492, + "step": 7230 + }, + { + "epoch": 1.07, + "grad_norm": 1.303074836730957, + "learning_rate": 8.954356229556943e-05, + "loss": 0.1451, + "step": 7240 + }, + { + "epoch": 1.07, + "grad_norm": 1.5382298231124878, + "learning_rate": 8.952869461790069e-05, + "loss": 0.154, + "step": 7250 + }, + { + "epoch": 1.08, + "grad_norm": 1.7457761764526367, + "learning_rate": 8.951382694023194e-05, + "loss": 0.1494, + "step": 7260 + }, + { + "epoch": 1.08, + "grad_norm": 0.7700926661491394, + "learning_rate": 8.949895926256318e-05, + "loss": 0.1629, + "step": 7270 + }, + { + "epoch": 1.08, + "grad_norm": 0.2998206615447998, + "learning_rate": 8.948409158489445e-05, + "loss": 0.1459, + "step": 7280 + }, + { + "epoch": 1.08, + "grad_norm": 0.41362789273262024, + "learning_rate": 8.94692239072257e-05, + "loss": 0.1468, + "step": 7290 + }, + { + "epoch": 1.08, + "grad_norm": 1.507246494293213, + "learning_rate": 8.945435622955695e-05, + "loss": 0.1526, + "step": 7300 + }, + { + "epoch": 1.08, + "grad_norm": 1.7437586784362793, + "learning_rate": 8.943948855188819e-05, + "loss": 0.1632, + "step": 7310 + }, + { + "epoch": 1.09, + "grad_norm": 0.2855551242828369, + "learning_rate": 8.942462087421946e-05, + "loss": 0.1485, + "step": 7320 + }, + { + "epoch": 1.09, + "grad_norm": 0.5423638820648193, + "learning_rate": 8.94097531965507e-05, + "loss": 0.1572, + "step": 7330 + }, + { + "epoch": 1.09, + "grad_norm": 0.9757463932037354, + "learning_rate": 8.939488551888195e-05, + "loss": 0.1523, + "step": 7340 + }, + { + "epoch": 1.09, + "grad_norm": 0.25639691948890686, + "learning_rate": 8.938001784121321e-05, + "loss": 0.1576, + "step": 7350 + }, + { + "epoch": 1.09, + "grad_norm": 0.7795055508613586, + "learning_rate": 8.936515016354445e-05, + "loss": 0.1516, + "step": 7360 + }, + { + "epoch": 1.09, + "grad_norm": 0.48449787497520447, + "learning_rate": 8.935028248587572e-05, + "loss": 0.158, + "step": 7370 + }, + { + "epoch": 1.09, + "grad_norm": 1.0721737146377563, + "learning_rate": 8.933541480820696e-05, + "loss": 0.1523, + "step": 7380 + }, + { + "epoch": 1.1, + "grad_norm": 1.2508677244186401, + "learning_rate": 8.932054713053821e-05, + "loss": 0.1454, + "step": 7390 + }, + { + "epoch": 1.1, + "grad_norm": 0.39935556054115295, + "learning_rate": 8.930567945286946e-05, + "loss": 0.1543, + "step": 7400 + }, + { + "epoch": 1.1, + "grad_norm": 0.5586414337158203, + "learning_rate": 8.929081177520072e-05, + "loss": 0.1596, + "step": 7410 + }, + { + "epoch": 1.1, + "grad_norm": 1.2489172220230103, + "learning_rate": 8.927594409753197e-05, + "loss": 0.1548, + "step": 7420 + }, + { + "epoch": 1.1, + "grad_norm": 2.5162367820739746, + "learning_rate": 8.926107641986322e-05, + "loss": 0.15, + "step": 7430 + }, + { + "epoch": 1.1, + "grad_norm": 1.1505943536758423, + "learning_rate": 8.924620874219447e-05, + "loss": 0.1472, + "step": 7440 + }, + { + "epoch": 1.1, + "grad_norm": 1.3584840297698975, + "learning_rate": 8.923134106452573e-05, + "loss": 0.1535, + "step": 7450 + }, + { + "epoch": 1.11, + "grad_norm": 0.40715011954307556, + "learning_rate": 8.921647338685698e-05, + "loss": 0.1556, + "step": 7460 + }, + { + "epoch": 1.11, + "grad_norm": 1.03001070022583, + "learning_rate": 8.920160570918823e-05, + "loss": 0.1571, + "step": 7470 + }, + { + "epoch": 1.11, + "grad_norm": 0.2230408489704132, + "learning_rate": 8.918673803151948e-05, + "loss": 0.147, + "step": 7480 + }, + { + "epoch": 1.11, + "grad_norm": 1.4231066703796387, + "learning_rate": 8.917187035385073e-05, + "loss": 0.157, + "step": 7490 + }, + { + "epoch": 1.11, + "grad_norm": 0.9302521347999573, + "learning_rate": 8.915700267618199e-05, + "loss": 0.1538, + "step": 7500 + }, + { + "epoch": 1.11, + "grad_norm": 1.3577553033828735, + "learning_rate": 8.914213499851324e-05, + "loss": 0.1447, + "step": 7510 + }, + { + "epoch": 1.11, + "grad_norm": 2.897783041000366, + "learning_rate": 8.912726732084449e-05, + "loss": 0.1476, + "step": 7520 + }, + { + "epoch": 1.12, + "grad_norm": 0.3453262448310852, + "learning_rate": 8.911239964317574e-05, + "loss": 0.1603, + "step": 7530 + }, + { + "epoch": 1.12, + "grad_norm": 1.2519605159759521, + "learning_rate": 8.9097531965507e-05, + "loss": 0.1551, + "step": 7540 + }, + { + "epoch": 1.12, + "grad_norm": 1.0790461301803589, + "learning_rate": 8.908266428783825e-05, + "loss": 0.1472, + "step": 7550 + }, + { + "epoch": 1.12, + "grad_norm": 0.6116156578063965, + "learning_rate": 8.906779661016949e-05, + "loss": 0.1483, + "step": 7560 + }, + { + "epoch": 1.12, + "grad_norm": 0.32002952694892883, + "learning_rate": 8.905292893250075e-05, + "loss": 0.1455, + "step": 7570 + }, + { + "epoch": 1.12, + "grad_norm": 2.1142666339874268, + "learning_rate": 8.9038061254832e-05, + "loss": 0.1514, + "step": 7580 + }, + { + "epoch": 1.13, + "grad_norm": 0.7548339366912842, + "learning_rate": 8.902319357716326e-05, + "loss": 0.1503, + "step": 7590 + }, + { + "epoch": 1.13, + "grad_norm": 1.0314886569976807, + "learning_rate": 8.900832589949451e-05, + "loss": 0.1484, + "step": 7600 + }, + { + "epoch": 1.13, + "grad_norm": 1.0514923334121704, + "learning_rate": 8.899345822182575e-05, + "loss": 0.1597, + "step": 7610 + }, + { + "epoch": 1.13, + "grad_norm": 0.3399512767791748, + "learning_rate": 8.8978590544157e-05, + "loss": 0.1531, + "step": 7620 + }, + { + "epoch": 1.13, + "grad_norm": 1.079697608947754, + "learning_rate": 8.896372286648826e-05, + "loss": 0.1472, + "step": 7630 + }, + { + "epoch": 1.13, + "grad_norm": 0.370299369096756, + "learning_rate": 8.894885518881952e-05, + "loss": 0.1448, + "step": 7640 + }, + { + "epoch": 1.13, + "grad_norm": 2.146461248397827, + "learning_rate": 8.893398751115076e-05, + "loss": 0.1494, + "step": 7650 + }, + { + "epoch": 1.14, + "grad_norm": 1.532644271850586, + "learning_rate": 8.891911983348201e-05, + "loss": 0.1442, + "step": 7660 + }, + { + "epoch": 1.14, + "grad_norm": 1.0400909185409546, + "learning_rate": 8.890425215581327e-05, + "loss": 0.155, + "step": 7670 + }, + { + "epoch": 1.14, + "grad_norm": 0.5614340901374817, + "learning_rate": 8.888938447814452e-05, + "loss": 0.149, + "step": 7680 + }, + { + "epoch": 1.14, + "grad_norm": 1.8623751401901245, + "learning_rate": 8.887451680047578e-05, + "loss": 0.1646, + "step": 7690 + }, + { + "epoch": 1.14, + "grad_norm": 1.2477229833602905, + "learning_rate": 8.885964912280702e-05, + "loss": 0.1502, + "step": 7700 + }, + { + "epoch": 1.14, + "grad_norm": 1.0643510818481445, + "learning_rate": 8.884478144513827e-05, + "loss": 0.1558, + "step": 7710 + }, + { + "epoch": 1.14, + "grad_norm": 0.7074918746948242, + "learning_rate": 8.882991376746953e-05, + "loss": 0.1465, + "step": 7720 + }, + { + "epoch": 1.15, + "grad_norm": 1.3349978923797607, + "learning_rate": 8.881504608980078e-05, + "loss": 0.1538, + "step": 7730 + }, + { + "epoch": 1.15, + "grad_norm": 0.33089113235473633, + "learning_rate": 8.880017841213202e-05, + "loss": 0.1466, + "step": 7740 + }, + { + "epoch": 1.15, + "grad_norm": 1.4978218078613281, + "learning_rate": 8.878531073446328e-05, + "loss": 0.155, + "step": 7750 + }, + { + "epoch": 1.15, + "grad_norm": 0.7654755711555481, + "learning_rate": 8.877044305679453e-05, + "loss": 0.1467, + "step": 7760 + }, + { + "epoch": 1.15, + "grad_norm": 0.5333597660064697, + "learning_rate": 8.875557537912579e-05, + "loss": 0.1523, + "step": 7770 + }, + { + "epoch": 1.15, + "grad_norm": 0.6722001433372498, + "learning_rate": 8.874070770145704e-05, + "loss": 0.1503, + "step": 7780 + }, + { + "epoch": 1.15, + "grad_norm": 1.1389368772506714, + "learning_rate": 8.872584002378828e-05, + "loss": 0.1491, + "step": 7790 + }, + { + "epoch": 1.16, + "grad_norm": 0.3832188546657562, + "learning_rate": 8.871097234611954e-05, + "loss": 0.1483, + "step": 7800 + }, + { + "epoch": 1.16, + "grad_norm": 1.1785801649093628, + "learning_rate": 8.869610466845079e-05, + "loss": 0.1556, + "step": 7810 + }, + { + "epoch": 1.16, + "grad_norm": 0.5937321186065674, + "learning_rate": 8.868123699078205e-05, + "loss": 0.1515, + "step": 7820 + }, + { + "epoch": 1.16, + "grad_norm": 1.9288078546524048, + "learning_rate": 8.86663693131133e-05, + "loss": 0.1524, + "step": 7830 + }, + { + "epoch": 1.16, + "grad_norm": 0.9835410714149475, + "learning_rate": 8.865150163544454e-05, + "loss": 0.1453, + "step": 7840 + }, + { + "epoch": 1.16, + "grad_norm": 2.201693534851074, + "learning_rate": 8.86366339577758e-05, + "loss": 0.1516, + "step": 7850 + }, + { + "epoch": 1.17, + "grad_norm": 2.7419443130493164, + "learning_rate": 8.862176628010705e-05, + "loss": 0.153, + "step": 7860 + }, + { + "epoch": 1.17, + "grad_norm": 1.239664077758789, + "learning_rate": 8.860689860243831e-05, + "loss": 0.1514, + "step": 7870 + }, + { + "epoch": 1.17, + "grad_norm": 0.9454985857009888, + "learning_rate": 8.859203092476955e-05, + "loss": 0.1453, + "step": 7880 + }, + { + "epoch": 1.17, + "grad_norm": 0.5655785202980042, + "learning_rate": 8.85771632471008e-05, + "loss": 0.1442, + "step": 7890 + }, + { + "epoch": 1.17, + "grad_norm": 1.2254222631454468, + "learning_rate": 8.856229556943206e-05, + "loss": 0.1555, + "step": 7900 + }, + { + "epoch": 1.17, + "grad_norm": 1.7310748100280762, + "learning_rate": 8.854742789176331e-05, + "loss": 0.1514, + "step": 7910 + }, + { + "epoch": 1.17, + "grad_norm": 2.103505849838257, + "learning_rate": 8.853256021409457e-05, + "loss": 0.1504, + "step": 7920 + }, + { + "epoch": 1.18, + "grad_norm": 0.45951688289642334, + "learning_rate": 8.851769253642581e-05, + "loss": 0.1507, + "step": 7930 + }, + { + "epoch": 1.18, + "grad_norm": 0.5322825312614441, + "learning_rate": 8.850282485875708e-05, + "loss": 0.1455, + "step": 7940 + }, + { + "epoch": 1.18, + "grad_norm": 0.80711430311203, + "learning_rate": 8.848795718108832e-05, + "loss": 0.1489, + "step": 7950 + }, + { + "epoch": 1.18, + "grad_norm": 1.1508015394210815, + "learning_rate": 8.847308950341957e-05, + "loss": 0.1504, + "step": 7960 + }, + { + "epoch": 1.18, + "grad_norm": 0.5405661463737488, + "learning_rate": 8.845822182575081e-05, + "loss": 0.155, + "step": 7970 + }, + { + "epoch": 1.18, + "grad_norm": 0.4886877238750458, + "learning_rate": 8.844335414808207e-05, + "loss": 0.1471, + "step": 7980 + }, + { + "epoch": 1.18, + "grad_norm": 0.6233669519424438, + "learning_rate": 8.842848647041332e-05, + "loss": 0.1481, + "step": 7990 + }, + { + "epoch": 1.19, + "grad_norm": 0.8412140011787415, + "learning_rate": 8.841361879274458e-05, + "loss": 0.1476, + "step": 8000 + }, + { + "epoch": 1.19, + "grad_norm": 0.6617283821105957, + "learning_rate": 8.839875111507583e-05, + "loss": 0.1457, + "step": 8010 + }, + { + "epoch": 1.19, + "grad_norm": 0.756656289100647, + "learning_rate": 8.838388343740707e-05, + "loss": 0.1436, + "step": 8020 + }, + { + "epoch": 1.19, + "grad_norm": 1.0248194932937622, + "learning_rate": 8.836901575973834e-05, + "loss": 0.1483, + "step": 8030 + }, + { + "epoch": 1.19, + "grad_norm": 0.9655269980430603, + "learning_rate": 8.835414808206958e-05, + "loss": 0.1455, + "step": 8040 + }, + { + "epoch": 1.19, + "grad_norm": 0.9099483489990234, + "learning_rate": 8.833928040440084e-05, + "loss": 0.1449, + "step": 8050 + }, + { + "epoch": 1.19, + "grad_norm": 0.3274220824241638, + "learning_rate": 8.832441272673208e-05, + "loss": 0.1586, + "step": 8060 + }, + { + "epoch": 1.2, + "grad_norm": 0.5055317878723145, + "learning_rate": 8.830954504906334e-05, + "loss": 0.1482, + "step": 8070 + }, + { + "epoch": 1.2, + "grad_norm": 0.36009564995765686, + "learning_rate": 8.829467737139459e-05, + "loss": 0.1536, + "step": 8080 + }, + { + "epoch": 1.2, + "grad_norm": 0.616855263710022, + "learning_rate": 8.827980969372584e-05, + "loss": 0.1432, + "step": 8090 + }, + { + "epoch": 1.2, + "grad_norm": 2.257025957107544, + "learning_rate": 8.82649420160571e-05, + "loss": 0.1491, + "step": 8100 + }, + { + "epoch": 1.2, + "grad_norm": 1.2415412664413452, + "learning_rate": 8.825007433838834e-05, + "loss": 0.159, + "step": 8110 + }, + { + "epoch": 1.2, + "grad_norm": 0.49086448550224304, + "learning_rate": 8.823520666071961e-05, + "loss": 0.1477, + "step": 8120 + }, + { + "epoch": 1.21, + "grad_norm": 1.015421748161316, + "learning_rate": 8.822033898305085e-05, + "loss": 0.1456, + "step": 8130 + }, + { + "epoch": 1.21, + "grad_norm": 0.44600141048431396, + "learning_rate": 8.82054713053821e-05, + "loss": 0.1567, + "step": 8140 + }, + { + "epoch": 1.21, + "grad_norm": 1.0561884641647339, + "learning_rate": 8.819060362771336e-05, + "loss": 0.1583, + "step": 8150 + }, + { + "epoch": 1.21, + "grad_norm": 1.7750810384750366, + "learning_rate": 8.81757359500446e-05, + "loss": 0.1497, + "step": 8160 + }, + { + "epoch": 1.21, + "grad_norm": 0.3530331552028656, + "learning_rate": 8.816086827237587e-05, + "loss": 0.1529, + "step": 8170 + }, + { + "epoch": 1.21, + "grad_norm": 1.564988613128662, + "learning_rate": 8.814600059470711e-05, + "loss": 0.157, + "step": 8180 + }, + { + "epoch": 1.21, + "grad_norm": 0.599676251411438, + "learning_rate": 8.813113291703837e-05, + "loss": 0.15, + "step": 8190 + }, + { + "epoch": 1.22, + "grad_norm": 1.738433837890625, + "learning_rate": 8.81162652393696e-05, + "loss": 0.1481, + "step": 8200 + }, + { + "epoch": 1.22, + "grad_norm": 1.467214822769165, + "learning_rate": 8.810139756170087e-05, + "loss": 0.1456, + "step": 8210 + }, + { + "epoch": 1.22, + "grad_norm": 0.8870519399642944, + "learning_rate": 8.808652988403212e-05, + "loss": 0.1467, + "step": 8220 + }, + { + "epoch": 1.22, + "grad_norm": 0.8673567175865173, + "learning_rate": 8.807166220636337e-05, + "loss": 0.1511, + "step": 8230 + }, + { + "epoch": 1.22, + "grad_norm": 0.7647720575332642, + "learning_rate": 8.805679452869463e-05, + "loss": 0.1409, + "step": 8240 + }, + { + "epoch": 1.22, + "grad_norm": 1.1034643650054932, + "learning_rate": 8.804192685102587e-05, + "loss": 0.1377, + "step": 8250 + }, + { + "epoch": 1.22, + "grad_norm": 1.3493852615356445, + "learning_rate": 8.802705917335713e-05, + "loss": 0.1511, + "step": 8260 + }, + { + "epoch": 1.23, + "grad_norm": 1.0202926397323608, + "learning_rate": 8.801219149568838e-05, + "loss": 0.1443, + "step": 8270 + }, + { + "epoch": 1.23, + "grad_norm": 1.9459919929504395, + "learning_rate": 8.799732381801963e-05, + "loss": 0.1476, + "step": 8280 + }, + { + "epoch": 1.23, + "grad_norm": 1.475825309753418, + "learning_rate": 8.798245614035087e-05, + "loss": 0.1431, + "step": 8290 + }, + { + "epoch": 1.23, + "grad_norm": 0.3227538466453552, + "learning_rate": 8.796758846268214e-05, + "loss": 0.151, + "step": 8300 + }, + { + "epoch": 1.23, + "grad_norm": 2.327113151550293, + "learning_rate": 8.795272078501338e-05, + "loss": 0.1465, + "step": 8310 + }, + { + "epoch": 1.23, + "grad_norm": 0.28288453817367554, + "learning_rate": 8.793785310734464e-05, + "loss": 0.1451, + "step": 8320 + }, + { + "epoch": 1.23, + "grad_norm": 1.3570489883422852, + "learning_rate": 8.792298542967589e-05, + "loss": 0.1544, + "step": 8330 + }, + { + "epoch": 1.24, + "grad_norm": 1.3985998630523682, + "learning_rate": 8.790811775200713e-05, + "loss": 0.1453, + "step": 8340 + }, + { + "epoch": 1.24, + "grad_norm": 1.7871092557907104, + "learning_rate": 8.78932500743384e-05, + "loss": 0.1485, + "step": 8350 + }, + { + "epoch": 1.24, + "grad_norm": 0.5759921669960022, + "learning_rate": 8.787838239666964e-05, + "loss": 0.1578, + "step": 8360 + }, + { + "epoch": 1.24, + "grad_norm": 0.36994293332099915, + "learning_rate": 8.78635147190009e-05, + "loss": 0.1553, + "step": 8370 + }, + { + "epoch": 1.24, + "grad_norm": 0.2848123610019684, + "learning_rate": 8.784864704133214e-05, + "loss": 0.1488, + "step": 8380 + }, + { + "epoch": 1.24, + "grad_norm": 0.373757928609848, + "learning_rate": 8.78337793636634e-05, + "loss": 0.1434, + "step": 8390 + }, + { + "epoch": 1.25, + "grad_norm": 0.3947400450706482, + "learning_rate": 8.781891168599465e-05, + "loss": 0.1433, + "step": 8400 + }, + { + "epoch": 1.25, + "grad_norm": 0.7297700643539429, + "learning_rate": 8.78040440083259e-05, + "loss": 0.1489, + "step": 8410 + }, + { + "epoch": 1.25, + "grad_norm": 0.4975549280643463, + "learning_rate": 8.778917633065716e-05, + "loss": 0.1556, + "step": 8420 + }, + { + "epoch": 1.25, + "grad_norm": 0.2790517508983612, + "learning_rate": 8.777430865298841e-05, + "loss": 0.1478, + "step": 8430 + }, + { + "epoch": 1.25, + "grad_norm": 1.5845293998718262, + "learning_rate": 8.775944097531967e-05, + "loss": 0.1446, + "step": 8440 + }, + { + "epoch": 1.25, + "grad_norm": 1.2284702062606812, + "learning_rate": 8.774457329765091e-05, + "loss": 0.1506, + "step": 8450 + }, + { + "epoch": 1.25, + "grad_norm": 0.9352693557739258, + "learning_rate": 8.772970561998216e-05, + "loss": 0.1447, + "step": 8460 + }, + { + "epoch": 1.26, + "grad_norm": 0.5690743327140808, + "learning_rate": 8.771632471008029e-05, + "loss": 0.1677, + "step": 8470 + }, + { + "epoch": 1.26, + "grad_norm": 1.257373571395874, + "learning_rate": 8.770145703241154e-05, + "loss": 0.1507, + "step": 8480 + }, + { + "epoch": 1.26, + "grad_norm": 1.0186293125152588, + "learning_rate": 8.76865893547428e-05, + "loss": 0.1531, + "step": 8490 + }, + { + "epoch": 1.26, + "grad_norm": 1.0547682046890259, + "learning_rate": 8.767172167707405e-05, + "loss": 0.1578, + "step": 8500 + }, + { + "epoch": 1.26, + "grad_norm": 0.7997061014175415, + "learning_rate": 8.765685399940529e-05, + "loss": 0.1546, + "step": 8510 + }, + { + "epoch": 1.26, + "grad_norm": 0.921222448348999, + "learning_rate": 8.764198632173655e-05, + "loss": 0.149, + "step": 8520 + }, + { + "epoch": 1.26, + "grad_norm": 0.5580063462257385, + "learning_rate": 8.76271186440678e-05, + "loss": 0.153, + "step": 8530 + }, + { + "epoch": 1.27, + "grad_norm": 1.184913992881775, + "learning_rate": 8.761225096639906e-05, + "loss": 0.1519, + "step": 8540 + }, + { + "epoch": 1.27, + "grad_norm": 1.3542340993881226, + "learning_rate": 8.759738328873031e-05, + "loss": 0.1558, + "step": 8550 + }, + { + "epoch": 1.27, + "grad_norm": 0.39081811904907227, + "learning_rate": 8.758251561106155e-05, + "loss": 0.1499, + "step": 8560 + }, + { + "epoch": 1.27, + "grad_norm": 1.175268530845642, + "learning_rate": 8.756764793339281e-05, + "loss": 0.1463, + "step": 8570 + }, + { + "epoch": 1.27, + "grad_norm": 0.516953706741333, + "learning_rate": 8.755278025572406e-05, + "loss": 0.15, + "step": 8580 + }, + { + "epoch": 1.27, + "grad_norm": 0.45542412996292114, + "learning_rate": 8.753791257805532e-05, + "loss": 0.1443, + "step": 8590 + }, + { + "epoch": 1.27, + "grad_norm": 0.25929051637649536, + "learning_rate": 8.752304490038656e-05, + "loss": 0.145, + "step": 8600 + }, + { + "epoch": 1.28, + "grad_norm": 1.7502154111862183, + "learning_rate": 8.750817722271781e-05, + "loss": 0.1529, + "step": 8610 + }, + { + "epoch": 1.28, + "grad_norm": 0.36616241931915283, + "learning_rate": 8.749330954504907e-05, + "loss": 0.1484, + "step": 8620 + }, + { + "epoch": 1.28, + "grad_norm": 0.555590808391571, + "learning_rate": 8.747844186738032e-05, + "loss": 0.1502, + "step": 8630 + }, + { + "epoch": 1.28, + "grad_norm": 1.2765053510665894, + "learning_rate": 8.746357418971158e-05, + "loss": 0.1497, + "step": 8640 + }, + { + "epoch": 1.28, + "grad_norm": 1.160529613494873, + "learning_rate": 8.744870651204282e-05, + "loss": 0.1488, + "step": 8650 + }, + { + "epoch": 1.28, + "grad_norm": 0.5419323444366455, + "learning_rate": 8.743383883437407e-05, + "loss": 0.1531, + "step": 8660 + }, + { + "epoch": 1.29, + "grad_norm": 0.5298939943313599, + "learning_rate": 8.741897115670533e-05, + "loss": 0.1486, + "step": 8670 + }, + { + "epoch": 1.29, + "grad_norm": 1.2307096719741821, + "learning_rate": 8.740410347903658e-05, + "loss": 0.1525, + "step": 8680 + }, + { + "epoch": 1.29, + "grad_norm": 0.4540654122829437, + "learning_rate": 8.738923580136782e-05, + "loss": 0.1535, + "step": 8690 + }, + { + "epoch": 1.29, + "grad_norm": 1.8886024951934814, + "learning_rate": 8.737436812369908e-05, + "loss": 0.1541, + "step": 8700 + }, + { + "epoch": 1.29, + "grad_norm": 1.232670545578003, + "learning_rate": 8.735950044603033e-05, + "loss": 0.1596, + "step": 8710 + }, + { + "epoch": 1.29, + "grad_norm": 1.6752057075500488, + "learning_rate": 8.734463276836159e-05, + "loss": 0.1579, + "step": 8720 + }, + { + "epoch": 1.29, + "grad_norm": 0.6931501030921936, + "learning_rate": 8.732976509069284e-05, + "loss": 0.1492, + "step": 8730 + }, + { + "epoch": 1.3, + "grad_norm": 1.9172624349594116, + "learning_rate": 8.731489741302409e-05, + "loss": 0.1492, + "step": 8740 + }, + { + "epoch": 1.3, + "grad_norm": 0.35857778787612915, + "learning_rate": 8.730002973535534e-05, + "loss": 0.1436, + "step": 8750 + }, + { + "epoch": 1.3, + "grad_norm": 0.6047963500022888, + "learning_rate": 8.72851620576866e-05, + "loss": 0.1434, + "step": 8760 + }, + { + "epoch": 1.3, + "grad_norm": 1.0199538469314575, + "learning_rate": 8.727029438001785e-05, + "loss": 0.1467, + "step": 8770 + }, + { + "epoch": 1.3, + "grad_norm": 0.5496332049369812, + "learning_rate": 8.725542670234909e-05, + "loss": 0.146, + "step": 8780 + }, + { + "epoch": 1.3, + "grad_norm": 0.5620203614234924, + "learning_rate": 8.724055902468035e-05, + "loss": 0.1483, + "step": 8790 + }, + { + "epoch": 1.3, + "grad_norm": 0.36885154247283936, + "learning_rate": 8.72256913470116e-05, + "loss": 0.1366, + "step": 8800 + }, + { + "epoch": 1.31, + "grad_norm": 1.235111117362976, + "learning_rate": 8.721082366934285e-05, + "loss": 0.145, + "step": 8810 + }, + { + "epoch": 1.31, + "grad_norm": 0.3093607723712921, + "learning_rate": 8.719595599167411e-05, + "loss": 0.1625, + "step": 8820 + }, + { + "epoch": 1.31, + "grad_norm": 0.4602314233779907, + "learning_rate": 8.718108831400535e-05, + "loss": 0.1433, + "step": 8830 + }, + { + "epoch": 1.31, + "grad_norm": 0.46107497811317444, + "learning_rate": 8.71662206363366e-05, + "loss": 0.1451, + "step": 8840 + }, + { + "epoch": 1.31, + "grad_norm": 0.23074732720851898, + "learning_rate": 8.715135295866786e-05, + "loss": 0.1498, + "step": 8850 + }, + { + "epoch": 1.31, + "grad_norm": 1.0006585121154785, + "learning_rate": 8.713648528099912e-05, + "loss": 0.1585, + "step": 8860 + }, + { + "epoch": 1.31, + "grad_norm": 0.6358426213264465, + "learning_rate": 8.712161760333037e-05, + "loss": 0.1522, + "step": 8870 + }, + { + "epoch": 1.32, + "grad_norm": 0.39934954047203064, + "learning_rate": 8.710674992566161e-05, + "loss": 0.1449, + "step": 8880 + }, + { + "epoch": 1.32, + "grad_norm": 0.5131770372390747, + "learning_rate": 8.709188224799288e-05, + "loss": 0.1424, + "step": 8890 + }, + { + "epoch": 1.32, + "grad_norm": 3.1670219898223877, + "learning_rate": 8.707701457032412e-05, + "loss": 0.1516, + "step": 8900 + }, + { + "epoch": 1.32, + "grad_norm": 0.6028851270675659, + "learning_rate": 8.706214689265538e-05, + "loss": 0.1485, + "step": 8910 + }, + { + "epoch": 1.32, + "grad_norm": 0.5696167945861816, + "learning_rate": 8.704727921498662e-05, + "loss": 0.1451, + "step": 8920 + }, + { + "epoch": 1.32, + "grad_norm": 0.7680513262748718, + "learning_rate": 8.703241153731787e-05, + "loss": 0.1431, + "step": 8930 + }, + { + "epoch": 1.33, + "grad_norm": 1.506998896598816, + "learning_rate": 8.701754385964913e-05, + "loss": 0.1562, + "step": 8940 + }, + { + "epoch": 1.33, + "grad_norm": 0.24090486764907837, + "learning_rate": 8.700267618198038e-05, + "loss": 0.1519, + "step": 8950 + }, + { + "epoch": 1.33, + "grad_norm": 0.36027562618255615, + "learning_rate": 8.698780850431164e-05, + "loss": 0.1425, + "step": 8960 + }, + { + "epoch": 1.33, + "grad_norm": 1.2053183317184448, + "learning_rate": 8.697294082664288e-05, + "loss": 0.1426, + "step": 8970 + }, + { + "epoch": 1.33, + "grad_norm": 0.4818166494369507, + "learning_rate": 8.695807314897415e-05, + "loss": 0.1549, + "step": 8980 + }, + { + "epoch": 1.33, + "grad_norm": 1.4848952293395996, + "learning_rate": 8.694320547130539e-05, + "loss": 0.1505, + "step": 8990 + }, + { + "epoch": 1.33, + "grad_norm": 1.1278949975967407, + "learning_rate": 8.692833779363664e-05, + "loss": 0.1514, + "step": 9000 + }, + { + "epoch": 1.34, + "grad_norm": 0.55078125, + "learning_rate": 8.691347011596788e-05, + "loss": 0.1522, + "step": 9010 + }, + { + "epoch": 1.34, + "grad_norm": 0.6898670196533203, + "learning_rate": 8.689860243829914e-05, + "loss": 0.1439, + "step": 9020 + }, + { + "epoch": 1.34, + "grad_norm": 0.9387264251708984, + "learning_rate": 8.688373476063039e-05, + "loss": 0.1492, + "step": 9030 + }, + { + "epoch": 1.34, + "grad_norm": 0.6217661499977112, + "learning_rate": 8.686886708296165e-05, + "loss": 0.1448, + "step": 9040 + }, + { + "epoch": 1.34, + "grad_norm": 0.3894174098968506, + "learning_rate": 8.68539994052929e-05, + "loss": 0.1473, + "step": 9050 + }, + { + "epoch": 1.34, + "grad_norm": 0.6577085852622986, + "learning_rate": 8.683913172762414e-05, + "loss": 0.1522, + "step": 9060 + }, + { + "epoch": 1.34, + "grad_norm": 0.36721089482307434, + "learning_rate": 8.682426404995541e-05, + "loss": 0.1518, + "step": 9070 + }, + { + "epoch": 1.35, + "grad_norm": 1.7687819004058838, + "learning_rate": 8.680939637228665e-05, + "loss": 0.1527, + "step": 9080 + }, + { + "epoch": 1.35, + "grad_norm": 2.106863260269165, + "learning_rate": 8.679452869461791e-05, + "loss": 0.1526, + "step": 9090 + }, + { + "epoch": 1.35, + "grad_norm": 0.8379847407341003, + "learning_rate": 8.677966101694915e-05, + "loss": 0.1525, + "step": 9100 + }, + { + "epoch": 1.35, + "grad_norm": 0.9839054346084595, + "learning_rate": 8.67647933392804e-05, + "loss": 0.1476, + "step": 9110 + }, + { + "epoch": 1.35, + "grad_norm": 1.1237411499023438, + "learning_rate": 8.674992566161166e-05, + "loss": 0.1516, + "step": 9120 + }, + { + "epoch": 1.35, + "grad_norm": 1.2351957559585571, + "learning_rate": 8.673505798394291e-05, + "loss": 0.1553, + "step": 9130 + }, + { + "epoch": 1.35, + "grad_norm": 1.330193281173706, + "learning_rate": 8.672019030627417e-05, + "loss": 0.1507, + "step": 9140 + }, + { + "epoch": 1.36, + "grad_norm": 0.5106310844421387, + "learning_rate": 8.670532262860541e-05, + "loss": 0.1509, + "step": 9150 + }, + { + "epoch": 1.36, + "grad_norm": 0.2819235026836395, + "learning_rate": 8.669045495093668e-05, + "loss": 0.1496, + "step": 9160 + }, + { + "epoch": 1.36, + "grad_norm": 0.3204076290130615, + "learning_rate": 8.667558727326792e-05, + "loss": 0.1541, + "step": 9170 + }, + { + "epoch": 1.36, + "grad_norm": 0.3066985309123993, + "learning_rate": 8.666071959559917e-05, + "loss": 0.1432, + "step": 9180 + }, + { + "epoch": 1.36, + "grad_norm": 0.5264164805412292, + "learning_rate": 8.664585191793041e-05, + "loss": 0.1501, + "step": 9190 + }, + { + "epoch": 1.36, + "grad_norm": 1.0511165857315063, + "learning_rate": 8.663098424026167e-05, + "loss": 0.1512, + "step": 9200 + }, + { + "epoch": 1.37, + "grad_norm": 0.3407282531261444, + "learning_rate": 8.661611656259294e-05, + "loss": 0.1483, + "step": 9210 + }, + { + "epoch": 1.37, + "grad_norm": 0.6586543321609497, + "learning_rate": 8.660124888492418e-05, + "loss": 0.1528, + "step": 9220 + }, + { + "epoch": 1.37, + "grad_norm": 0.29302939772605896, + "learning_rate": 8.658638120725543e-05, + "loss": 0.146, + "step": 9230 + }, + { + "epoch": 1.37, + "grad_norm": 1.0969610214233398, + "learning_rate": 8.657151352958667e-05, + "loss": 0.1515, + "step": 9240 + }, + { + "epoch": 1.37, + "grad_norm": 0.3798691928386688, + "learning_rate": 8.655664585191794e-05, + "loss": 0.1502, + "step": 9250 + }, + { + "epoch": 1.37, + "grad_norm": 0.5284072756767273, + "learning_rate": 8.654177817424918e-05, + "loss": 0.1438, + "step": 9260 + }, + { + "epoch": 1.37, + "grad_norm": 0.5143082141876221, + "learning_rate": 8.652691049658044e-05, + "loss": 0.1548, + "step": 9270 + }, + { + "epoch": 1.38, + "grad_norm": 2.502887725830078, + "learning_rate": 8.65120428189117e-05, + "loss": 0.154, + "step": 9280 + }, + { + "epoch": 1.38, + "grad_norm": 0.34845390915870667, + "learning_rate": 8.649717514124293e-05, + "loss": 0.1461, + "step": 9290 + }, + { + "epoch": 1.38, + "grad_norm": 0.576566219329834, + "learning_rate": 8.64823074635742e-05, + "loss": 0.157, + "step": 9300 + }, + { + "epoch": 1.38, + "grad_norm": 0.34153446555137634, + "learning_rate": 8.646743978590544e-05, + "loss": 0.1529, + "step": 9310 + }, + { + "epoch": 1.38, + "grad_norm": 1.3493391275405884, + "learning_rate": 8.64525721082367e-05, + "loss": 0.1418, + "step": 9320 + }, + { + "epoch": 1.38, + "grad_norm": 0.4563460648059845, + "learning_rate": 8.643770443056794e-05, + "loss": 0.1525, + "step": 9330 + }, + { + "epoch": 1.38, + "grad_norm": 0.9894744157791138, + "learning_rate": 8.642283675289921e-05, + "loss": 0.1508, + "step": 9340 + }, + { + "epoch": 1.39, + "grad_norm": 0.21944831311702728, + "learning_rate": 8.640796907523045e-05, + "loss": 0.1478, + "step": 9350 + }, + { + "epoch": 1.39, + "grad_norm": 1.688185691833496, + "learning_rate": 8.63931013975617e-05, + "loss": 0.1545, + "step": 9360 + }, + { + "epoch": 1.39, + "grad_norm": 0.3348112106323242, + "learning_rate": 8.637823371989296e-05, + "loss": 0.1498, + "step": 9370 + }, + { + "epoch": 1.39, + "grad_norm": 0.32698962092399597, + "learning_rate": 8.636336604222421e-05, + "loss": 0.1485, + "step": 9380 + }, + { + "epoch": 1.39, + "grad_norm": 0.4172670543193817, + "learning_rate": 8.634849836455547e-05, + "loss": 0.157, + "step": 9390 + }, + { + "epoch": 1.39, + "grad_norm": 0.5992726683616638, + "learning_rate": 8.633363068688671e-05, + "loss": 0.1473, + "step": 9400 + }, + { + "epoch": 1.39, + "grad_norm": 0.4641229510307312, + "learning_rate": 8.631876300921796e-05, + "loss": 0.1454, + "step": 9410 + }, + { + "epoch": 1.4, + "grad_norm": 1.2482893466949463, + "learning_rate": 8.63038953315492e-05, + "loss": 0.1438, + "step": 9420 + }, + { + "epoch": 1.4, + "grad_norm": 0.3719273805618286, + "learning_rate": 8.628902765388047e-05, + "loss": 0.1467, + "step": 9430 + }, + { + "epoch": 1.4, + "grad_norm": 0.5880813002586365, + "learning_rate": 8.627415997621172e-05, + "loss": 0.1496, + "step": 9440 + }, + { + "epoch": 1.4, + "grad_norm": 0.5161025524139404, + "learning_rate": 8.625929229854297e-05, + "loss": 0.1449, + "step": 9450 + }, + { + "epoch": 1.4, + "grad_norm": 0.3996943533420563, + "learning_rate": 8.624442462087422e-05, + "loss": 0.1499, + "step": 9460 + }, + { + "epoch": 1.4, + "grad_norm": 0.5722046494483948, + "learning_rate": 8.622955694320548e-05, + "loss": 0.1474, + "step": 9470 + }, + { + "epoch": 1.41, + "grad_norm": 1.8009772300720215, + "learning_rate": 8.621617603330359e-05, + "loss": 0.1635, + "step": 9480 + }, + { + "epoch": 1.41, + "grad_norm": 0.959286630153656, + "learning_rate": 8.620130835563486e-05, + "loss": 0.153, + "step": 9490 + }, + { + "epoch": 1.41, + "grad_norm": 1.4346505403518677, + "learning_rate": 8.61864406779661e-05, + "loss": 0.1599, + "step": 9500 + }, + { + "epoch": 1.41, + "grad_norm": 0.3234134018421173, + "learning_rate": 8.617157300029736e-05, + "loss": 0.1445, + "step": 9510 + }, + { + "epoch": 1.41, + "grad_norm": 0.4339183270931244, + "learning_rate": 8.615670532262861e-05, + "loss": 0.151, + "step": 9520 + }, + { + "epoch": 1.41, + "grad_norm": 0.9209549427032471, + "learning_rate": 8.614183764495987e-05, + "loss": 0.1498, + "step": 9530 + }, + { + "epoch": 1.41, + "grad_norm": 0.8989786505699158, + "learning_rate": 8.612696996729112e-05, + "loss": 0.1423, + "step": 9540 + }, + { + "epoch": 1.42, + "grad_norm": 0.9294031858444214, + "learning_rate": 8.611210228962236e-05, + "loss": 0.1435, + "step": 9550 + }, + { + "epoch": 1.42, + "grad_norm": 0.4785991907119751, + "learning_rate": 8.609723461195362e-05, + "loss": 0.143, + "step": 9560 + }, + { + "epoch": 1.42, + "grad_norm": 0.7767314314842224, + "learning_rate": 8.608236693428486e-05, + "loss": 0.1424, + "step": 9570 + }, + { + "epoch": 1.42, + "grad_norm": 1.2666493654251099, + "learning_rate": 8.606749925661613e-05, + "loss": 0.1527, + "step": 9580 + }, + { + "epoch": 1.42, + "grad_norm": 0.4553685188293457, + "learning_rate": 8.605263157894738e-05, + "loss": 0.1511, + "step": 9590 + }, + { + "epoch": 1.42, + "grad_norm": 0.3553789556026459, + "learning_rate": 8.603776390127862e-05, + "loss": 0.1475, + "step": 9600 + }, + { + "epoch": 1.42, + "grad_norm": 0.8654747009277344, + "learning_rate": 8.602289622360988e-05, + "loss": 0.1389, + "step": 9610 + }, + { + "epoch": 1.43, + "grad_norm": 0.7433649301528931, + "learning_rate": 8.600802854594113e-05, + "loss": 0.1477, + "step": 9620 + }, + { + "epoch": 1.43, + "grad_norm": 0.25381016731262207, + "learning_rate": 8.599316086827239e-05, + "loss": 0.1494, + "step": 9630 + }, + { + "epoch": 1.43, + "grad_norm": 1.8853144645690918, + "learning_rate": 8.597829319060363e-05, + "loss": 0.1529, + "step": 9640 + }, + { + "epoch": 1.43, + "grad_norm": 0.3927299380302429, + "learning_rate": 8.596342551293488e-05, + "loss": 0.1487, + "step": 9650 + }, + { + "epoch": 1.43, + "grad_norm": 0.46614259481430054, + "learning_rate": 8.594855783526614e-05, + "loss": 0.1521, + "step": 9660 + }, + { + "epoch": 1.43, + "grad_norm": 1.0246176719665527, + "learning_rate": 8.593369015759739e-05, + "loss": 0.1439, + "step": 9670 + }, + { + "epoch": 1.43, + "grad_norm": 0.7353618144989014, + "learning_rate": 8.591882247992865e-05, + "loss": 0.1433, + "step": 9680 + }, + { + "epoch": 1.44, + "grad_norm": 0.6362217664718628, + "learning_rate": 8.590395480225989e-05, + "loss": 0.1451, + "step": 9690 + }, + { + "epoch": 1.44, + "grad_norm": 0.9315203428268433, + "learning_rate": 8.588908712459114e-05, + "loss": 0.1453, + "step": 9700 + }, + { + "epoch": 1.44, + "grad_norm": 1.2445534467697144, + "learning_rate": 8.58742194469224e-05, + "loss": 0.1356, + "step": 9710 + }, + { + "epoch": 1.44, + "grad_norm": 0.40973347425460815, + "learning_rate": 8.585935176925365e-05, + "loss": 0.1481, + "step": 9720 + }, + { + "epoch": 1.44, + "grad_norm": 0.397741436958313, + "learning_rate": 8.584448409158489e-05, + "loss": 0.1449, + "step": 9730 + }, + { + "epoch": 1.44, + "grad_norm": 3.0569045543670654, + "learning_rate": 8.582961641391615e-05, + "loss": 0.1561, + "step": 9740 + }, + { + "epoch": 1.45, + "grad_norm": 0.37079283595085144, + "learning_rate": 8.58147487362474e-05, + "loss": 0.1542, + "step": 9750 + }, + { + "epoch": 1.45, + "grad_norm": 0.637252688407898, + "learning_rate": 8.579988105857866e-05, + "loss": 0.1543, + "step": 9760 + }, + { + "epoch": 1.45, + "grad_norm": 0.6145210862159729, + "learning_rate": 8.578501338090991e-05, + "loss": 0.1461, + "step": 9770 + }, + { + "epoch": 1.45, + "grad_norm": 1.3545942306518555, + "learning_rate": 8.577014570324115e-05, + "loss": 0.1531, + "step": 9780 + }, + { + "epoch": 1.45, + "grad_norm": 0.720514714717865, + "learning_rate": 8.575527802557241e-05, + "loss": 0.1441, + "step": 9790 + }, + { + "epoch": 1.45, + "grad_norm": 0.3378904461860657, + "learning_rate": 8.574041034790366e-05, + "loss": 0.1464, + "step": 9800 + }, + { + "epoch": 1.45, + "grad_norm": 0.7078978419303894, + "learning_rate": 8.572554267023492e-05, + "loss": 0.1544, + "step": 9810 + }, + { + "epoch": 1.46, + "grad_norm": 1.368641972541809, + "learning_rate": 8.571067499256616e-05, + "loss": 0.1477, + "step": 9820 + }, + { + "epoch": 1.46, + "grad_norm": 0.42889463901519775, + "learning_rate": 8.569580731489741e-05, + "loss": 0.1487, + "step": 9830 + }, + { + "epoch": 1.46, + "grad_norm": 1.435215950012207, + "learning_rate": 8.568093963722867e-05, + "loss": 0.1474, + "step": 9840 + }, + { + "epoch": 1.46, + "grad_norm": 0.9903163313865662, + "learning_rate": 8.566607195955992e-05, + "loss": 0.1433, + "step": 9850 + }, + { + "epoch": 1.46, + "grad_norm": 1.3809950351715088, + "learning_rate": 8.565120428189118e-05, + "loss": 0.1447, + "step": 9860 + }, + { + "epoch": 1.46, + "grad_norm": 0.8493334650993347, + "learning_rate": 8.563633660422242e-05, + "loss": 0.148, + "step": 9870 + }, + { + "epoch": 1.46, + "grad_norm": 0.33634793758392334, + "learning_rate": 8.562146892655367e-05, + "loss": 0.1451, + "step": 9880 + }, + { + "epoch": 1.47, + "grad_norm": 0.403535395860672, + "learning_rate": 8.560660124888493e-05, + "loss": 0.1467, + "step": 9890 + }, + { + "epoch": 1.47, + "grad_norm": 0.7498649954795837, + "learning_rate": 8.559173357121618e-05, + "loss": 0.1442, + "step": 9900 + }, + { + "epoch": 1.47, + "grad_norm": 0.8162224888801575, + "learning_rate": 8.557686589354744e-05, + "loss": 0.1475, + "step": 9910 + }, + { + "epoch": 1.47, + "grad_norm": 0.3795585632324219, + "learning_rate": 8.556199821587868e-05, + "loss": 0.1471, + "step": 9920 + }, + { + "epoch": 1.47, + "grad_norm": 0.37970471382141113, + "learning_rate": 8.554713053820995e-05, + "loss": 0.1542, + "step": 9930 + }, + { + "epoch": 1.47, + "grad_norm": 1.756047248840332, + "learning_rate": 8.553226286054119e-05, + "loss": 0.1424, + "step": 9940 + }, + { + "epoch": 1.47, + "grad_norm": 0.36348363757133484, + "learning_rate": 8.551739518287244e-05, + "loss": 0.141, + "step": 9950 + }, + { + "epoch": 1.48, + "grad_norm": 0.6128315329551697, + "learning_rate": 8.550252750520368e-05, + "loss": 0.1555, + "step": 9960 + }, + { + "epoch": 1.48, + "grad_norm": 0.5416569113731384, + "learning_rate": 8.548765982753494e-05, + "loss": 0.1541, + "step": 9970 + }, + { + "epoch": 1.48, + "grad_norm": 0.9355424642562866, + "learning_rate": 8.54727921498662e-05, + "loss": 0.1475, + "step": 9980 + }, + { + "epoch": 1.48, + "grad_norm": 1.1831605434417725, + "learning_rate": 8.545792447219745e-05, + "loss": 0.1533, + "step": 9990 + }, + { + "epoch": 1.48, + "grad_norm": 1.0193519592285156, + "learning_rate": 8.54430567945287e-05, + "loss": 0.1401, + "step": 10000 + }, + { + "epoch": 1.48, + "grad_norm": 0.3186480700969696, + "learning_rate": 8.542818911685994e-05, + "loss": 0.1584, + "step": 10010 + }, + { + "epoch": 1.49, + "grad_norm": 0.3131497800350189, + "learning_rate": 8.541332143919121e-05, + "loss": 0.1461, + "step": 10020 + }, + { + "epoch": 1.49, + "grad_norm": 0.8620591759681702, + "learning_rate": 8.539845376152245e-05, + "loss": 0.1408, + "step": 10030 + }, + { + "epoch": 1.49, + "grad_norm": 1.469287633895874, + "learning_rate": 8.538358608385371e-05, + "loss": 0.1462, + "step": 10040 + }, + { + "epoch": 1.49, + "grad_norm": 0.40487322211265564, + "learning_rate": 8.536871840618495e-05, + "loss": 0.1419, + "step": 10050 + }, + { + "epoch": 1.49, + "grad_norm": 0.30698254704475403, + "learning_rate": 8.53538507285162e-05, + "loss": 0.1473, + "step": 10060 + }, + { + "epoch": 1.49, + "grad_norm": 1.4092752933502197, + "learning_rate": 8.533898305084746e-05, + "loss": 0.1605, + "step": 10070 + }, + { + "epoch": 1.49, + "grad_norm": 0.45507368445396423, + "learning_rate": 8.532411537317871e-05, + "loss": 0.1482, + "step": 10080 + }, + { + "epoch": 1.5, + "grad_norm": 0.43545153737068176, + "learning_rate": 8.530924769550997e-05, + "loss": 0.1506, + "step": 10090 + }, + { + "epoch": 1.5, + "grad_norm": 0.4408663809299469, + "learning_rate": 8.529438001784121e-05, + "loss": 0.1448, + "step": 10100 + }, + { + "epoch": 1.5, + "grad_norm": 0.2926168441772461, + "learning_rate": 8.527951234017248e-05, + "loss": 0.1538, + "step": 10110 + }, + { + "epoch": 1.5, + "grad_norm": 0.5207601189613342, + "learning_rate": 8.526464466250372e-05, + "loss": 0.144, + "step": 10120 + }, + { + "epoch": 1.5, + "grad_norm": 1.4128458499908447, + "learning_rate": 8.524977698483497e-05, + "loss": 0.1407, + "step": 10130 + }, + { + "epoch": 1.5, + "grad_norm": 0.6333006620407104, + "learning_rate": 8.523490930716622e-05, + "loss": 0.1476, + "step": 10140 + }, + { + "epoch": 1.5, + "grad_norm": 1.300640344619751, + "learning_rate": 8.522004162949747e-05, + "loss": 0.145, + "step": 10150 + }, + { + "epoch": 1.51, + "grad_norm": 1.1418519020080566, + "learning_rate": 8.520517395182873e-05, + "loss": 0.138, + "step": 10160 + }, + { + "epoch": 1.51, + "grad_norm": 0.2554391324520111, + "learning_rate": 8.519030627415998e-05, + "loss": 0.1514, + "step": 10170 + }, + { + "epoch": 1.51, + "grad_norm": 0.49666744470596313, + "learning_rate": 8.517543859649124e-05, + "loss": 0.1406, + "step": 10180 + }, + { + "epoch": 1.51, + "grad_norm": 1.7986241579055786, + "learning_rate": 8.516057091882248e-05, + "loss": 0.1476, + "step": 10190 + }, + { + "epoch": 1.51, + "grad_norm": 1.6229901313781738, + "learning_rate": 8.514570324115374e-05, + "loss": 0.1539, + "step": 10200 + }, + { + "epoch": 1.51, + "grad_norm": 1.0691964626312256, + "learning_rate": 8.513083556348499e-05, + "loss": 0.1462, + "step": 10210 + }, + { + "epoch": 1.51, + "grad_norm": 0.3225235342979431, + "learning_rate": 8.511596788581624e-05, + "loss": 0.1574, + "step": 10220 + }, + { + "epoch": 1.52, + "grad_norm": 0.46615543961524963, + "learning_rate": 8.510110020814748e-05, + "loss": 0.1395, + "step": 10230 + }, + { + "epoch": 1.52, + "grad_norm": 1.2598484754562378, + "learning_rate": 8.508623253047874e-05, + "loss": 0.1358, + "step": 10240 + }, + { + "epoch": 1.52, + "grad_norm": 0.3854273855686188, + "learning_rate": 8.507136485281e-05, + "loss": 0.1458, + "step": 10250 + }, + { + "epoch": 1.52, + "grad_norm": 0.6581094264984131, + "learning_rate": 8.505649717514125e-05, + "loss": 0.1506, + "step": 10260 + }, + { + "epoch": 1.52, + "grad_norm": 0.3710658550262451, + "learning_rate": 8.50416294974725e-05, + "loss": 0.1439, + "step": 10270 + }, + { + "epoch": 1.52, + "grad_norm": 0.9312505722045898, + "learning_rate": 8.502676181980374e-05, + "loss": 0.1486, + "step": 10280 + }, + { + "epoch": 1.53, + "grad_norm": 0.8688971996307373, + "learning_rate": 8.501189414213501e-05, + "loss": 0.1434, + "step": 10290 + }, + { + "epoch": 1.53, + "grad_norm": 0.39325767755508423, + "learning_rate": 8.499702646446625e-05, + "loss": 0.1432, + "step": 10300 + }, + { + "epoch": 1.53, + "grad_norm": 0.48646825551986694, + "learning_rate": 8.49821587867975e-05, + "loss": 0.1522, + "step": 10310 + }, + { + "epoch": 1.53, + "grad_norm": 0.8287198543548584, + "learning_rate": 8.496729110912876e-05, + "loss": 0.1463, + "step": 10320 + }, + { + "epoch": 1.53, + "grad_norm": 0.5924571752548218, + "learning_rate": 8.495242343146002e-05, + "loss": 0.1455, + "step": 10330 + }, + { + "epoch": 1.53, + "grad_norm": 0.2828935980796814, + "learning_rate": 8.493755575379127e-05, + "loss": 0.1436, + "step": 10340 + }, + { + "epoch": 1.53, + "grad_norm": 1.714647889137268, + "learning_rate": 8.492268807612251e-05, + "loss": 0.1476, + "step": 10350 + }, + { + "epoch": 1.54, + "grad_norm": 1.1875629425048828, + "learning_rate": 8.490782039845377e-05, + "loss": 0.1486, + "step": 10360 + }, + { + "epoch": 1.54, + "grad_norm": 1.7616733312606812, + "learning_rate": 8.489295272078501e-05, + "loss": 0.1464, + "step": 10370 + }, + { + "epoch": 1.54, + "grad_norm": 0.29138264060020447, + "learning_rate": 8.487808504311628e-05, + "loss": 0.1407, + "step": 10380 + }, + { + "epoch": 1.54, + "grad_norm": 0.49664175510406494, + "learning_rate": 8.486321736544752e-05, + "loss": 0.1515, + "step": 10390 + }, + { + "epoch": 1.54, + "grad_norm": 0.4667070806026459, + "learning_rate": 8.484834968777877e-05, + "loss": 0.15, + "step": 10400 + }, + { + "epoch": 1.54, + "grad_norm": 0.3020360469818115, + "learning_rate": 8.483348201011003e-05, + "loss": 0.145, + "step": 10410 + }, + { + "epoch": 1.54, + "grad_norm": 0.41862982511520386, + "learning_rate": 8.481861433244128e-05, + "loss": 0.1495, + "step": 10420 + }, + { + "epoch": 1.55, + "grad_norm": 0.3439256250858307, + "learning_rate": 8.480374665477254e-05, + "loss": 0.1509, + "step": 10430 + }, + { + "epoch": 1.55, + "grad_norm": 0.6848132610321045, + "learning_rate": 8.478887897710378e-05, + "loss": 0.1456, + "step": 10440 + }, + { + "epoch": 1.55, + "grad_norm": 0.34469151496887207, + "learning_rate": 8.477401129943503e-05, + "loss": 0.147, + "step": 10450 + }, + { + "epoch": 1.55, + "grad_norm": 2.268392324447632, + "learning_rate": 8.475914362176627e-05, + "loss": 0.1561, + "step": 10460 + }, + { + "epoch": 1.55, + "grad_norm": 1.23007071018219, + "learning_rate": 8.474427594409754e-05, + "loss": 0.159, + "step": 10470 + }, + { + "epoch": 1.55, + "grad_norm": 0.5129968523979187, + "learning_rate": 8.472940826642878e-05, + "loss": 0.1468, + "step": 10480 + }, + { + "epoch": 1.55, + "grad_norm": 0.5921646952629089, + "learning_rate": 8.471454058876004e-05, + "loss": 0.1428, + "step": 10490 + }, + { + "epoch": 1.56, + "grad_norm": 0.3901660740375519, + "learning_rate": 8.469967291109129e-05, + "loss": 0.1483, + "step": 10500 + }, + { + "epoch": 1.56, + "grad_norm": 0.8023537993431091, + "learning_rate": 8.468480523342255e-05, + "loss": 0.1503, + "step": 10510 + }, + { + "epoch": 1.56, + "grad_norm": 0.708949625492096, + "learning_rate": 8.46699375557538e-05, + "loss": 0.1437, + "step": 10520 + }, + { + "epoch": 1.56, + "grad_norm": 0.8118116855621338, + "learning_rate": 8.465506987808504e-05, + "loss": 0.1391, + "step": 10530 + }, + { + "epoch": 1.56, + "grad_norm": 0.32893306016921997, + "learning_rate": 8.46402022004163e-05, + "loss": 0.1416, + "step": 10540 + }, + { + "epoch": 1.56, + "grad_norm": 0.5859858393669128, + "learning_rate": 8.462533452274754e-05, + "loss": 0.1415, + "step": 10550 + }, + { + "epoch": 1.57, + "grad_norm": 0.7075644135475159, + "learning_rate": 8.461046684507881e-05, + "loss": 0.1485, + "step": 10560 + }, + { + "epoch": 1.57, + "grad_norm": 0.966854453086853, + "learning_rate": 8.459559916741005e-05, + "loss": 0.1433, + "step": 10570 + }, + { + "epoch": 1.57, + "grad_norm": 0.5331383347511292, + "learning_rate": 8.45807314897413e-05, + "loss": 0.1508, + "step": 10580 + }, + { + "epoch": 1.57, + "grad_norm": 1.2724835872650146, + "learning_rate": 8.456586381207256e-05, + "loss": 0.1507, + "step": 10590 + }, + { + "epoch": 1.57, + "grad_norm": 0.28108617663383484, + "learning_rate": 8.455099613440381e-05, + "loss": 0.1453, + "step": 10600 + }, + { + "epoch": 1.57, + "grad_norm": 0.3450464904308319, + "learning_rate": 8.453612845673507e-05, + "loss": 0.1478, + "step": 10610 + }, + { + "epoch": 1.57, + "grad_norm": 0.8239097595214844, + "learning_rate": 8.452126077906631e-05, + "loss": 0.1438, + "step": 10620 + }, + { + "epoch": 1.58, + "grad_norm": 0.9062354564666748, + "learning_rate": 8.450639310139756e-05, + "loss": 0.1467, + "step": 10630 + }, + { + "epoch": 1.58, + "grad_norm": 0.6238219141960144, + "learning_rate": 8.449152542372882e-05, + "loss": 0.1454, + "step": 10640 + }, + { + "epoch": 1.58, + "grad_norm": 2.089106798171997, + "learning_rate": 8.447665774606007e-05, + "loss": 0.1393, + "step": 10650 + }, + { + "epoch": 1.58, + "grad_norm": 0.6828879714012146, + "learning_rate": 8.446179006839133e-05, + "loss": 0.1471, + "step": 10660 + }, + { + "epoch": 1.58, + "grad_norm": 0.3843400478363037, + "learning_rate": 8.444692239072257e-05, + "loss": 0.1414, + "step": 10670 + }, + { + "epoch": 1.58, + "grad_norm": 1.483210802078247, + "learning_rate": 8.443205471305382e-05, + "loss": 0.146, + "step": 10680 + }, + { + "epoch": 1.58, + "grad_norm": 1.3872867822647095, + "learning_rate": 8.441718703538508e-05, + "loss": 0.1386, + "step": 10690 + }, + { + "epoch": 1.59, + "grad_norm": 0.7687630653381348, + "learning_rate": 8.440231935771633e-05, + "loss": 0.1524, + "step": 10700 + }, + { + "epoch": 1.59, + "grad_norm": 0.3632531762123108, + "learning_rate": 8.438745168004758e-05, + "loss": 0.1457, + "step": 10710 + }, + { + "epoch": 1.59, + "grad_norm": 0.7489669919013977, + "learning_rate": 8.437258400237883e-05, + "loss": 0.149, + "step": 10720 + }, + { + "epoch": 1.59, + "grad_norm": 0.3110584318637848, + "learning_rate": 8.435771632471008e-05, + "loss": 0.1464, + "step": 10730 + }, + { + "epoch": 1.59, + "grad_norm": 0.9831928014755249, + "learning_rate": 8.434284864704134e-05, + "loss": 0.1494, + "step": 10740 + }, + { + "epoch": 1.59, + "grad_norm": 0.27060386538505554, + "learning_rate": 8.43279809693726e-05, + "loss": 0.1492, + "step": 10750 + }, + { + "epoch": 1.59, + "grad_norm": 0.3173629939556122, + "learning_rate": 8.431311329170384e-05, + "loss": 0.1501, + "step": 10760 + }, + { + "epoch": 1.6, + "grad_norm": 1.2598687410354614, + "learning_rate": 8.429824561403509e-05, + "loss": 0.1528, + "step": 10770 + }, + { + "epoch": 1.6, + "grad_norm": 0.29677924513816833, + "learning_rate": 8.428337793636634e-05, + "loss": 0.1419, + "step": 10780 + }, + { + "epoch": 1.6, + "grad_norm": 0.3866405487060547, + "learning_rate": 8.42685102586976e-05, + "loss": 0.1359, + "step": 10790 + }, + { + "epoch": 1.6, + "grad_norm": 0.707709014415741, + "learning_rate": 8.425364258102884e-05, + "loss": 0.145, + "step": 10800 + }, + { + "epoch": 1.6, + "grad_norm": 0.9584312438964844, + "learning_rate": 8.42387749033601e-05, + "loss": 0.1472, + "step": 10810 + }, + { + "epoch": 1.6, + "grad_norm": 0.9559441208839417, + "learning_rate": 8.422390722569135e-05, + "loss": 0.1432, + "step": 10820 + }, + { + "epoch": 1.61, + "grad_norm": 1.803369402885437, + "learning_rate": 8.42090395480226e-05, + "loss": 0.1508, + "step": 10830 + }, + { + "epoch": 1.61, + "grad_norm": 0.5966110229492188, + "learning_rate": 8.419417187035386e-05, + "loss": 0.1502, + "step": 10840 + }, + { + "epoch": 1.61, + "grad_norm": 1.5593287944793701, + "learning_rate": 8.41793041926851e-05, + "loss": 0.145, + "step": 10850 + }, + { + "epoch": 1.61, + "grad_norm": 2.490039348602295, + "learning_rate": 8.416443651501636e-05, + "loss": 0.1561, + "step": 10860 + }, + { + "epoch": 1.61, + "grad_norm": 0.7378647327423096, + "learning_rate": 8.414956883734761e-05, + "loss": 0.1475, + "step": 10870 + }, + { + "epoch": 1.61, + "grad_norm": 1.7256468534469604, + "learning_rate": 8.413470115967887e-05, + "loss": 0.1471, + "step": 10880 + }, + { + "epoch": 1.61, + "grad_norm": 0.516965389251709, + "learning_rate": 8.41198334820101e-05, + "loss": 0.1427, + "step": 10890 + }, + { + "epoch": 1.62, + "grad_norm": 0.44696128368377686, + "learning_rate": 8.410496580434136e-05, + "loss": 0.1506, + "step": 10900 + }, + { + "epoch": 1.62, + "grad_norm": 0.663616418838501, + "learning_rate": 8.409009812667262e-05, + "loss": 0.1451, + "step": 10910 + }, + { + "epoch": 1.62, + "grad_norm": 0.6834636330604553, + "learning_rate": 8.407523044900387e-05, + "loss": 0.1464, + "step": 10920 + }, + { + "epoch": 1.62, + "grad_norm": 0.3726295232772827, + "learning_rate": 8.406036277133513e-05, + "loss": 0.1496, + "step": 10930 + }, + { + "epoch": 1.62, + "grad_norm": 0.9244528412818909, + "learning_rate": 8.404549509366637e-05, + "loss": 0.1445, + "step": 10940 + }, + { + "epoch": 1.62, + "grad_norm": 1.3497867584228516, + "learning_rate": 8.403062741599762e-05, + "loss": 0.1486, + "step": 10950 + }, + { + "epoch": 1.62, + "grad_norm": 0.5133809447288513, + "learning_rate": 8.401575973832888e-05, + "loss": 0.1521, + "step": 10960 + }, + { + "epoch": 1.63, + "grad_norm": 0.28419455885887146, + "learning_rate": 8.400089206066013e-05, + "loss": 0.1466, + "step": 10970 + }, + { + "epoch": 1.63, + "grad_norm": 0.7684698104858398, + "learning_rate": 8.398602438299139e-05, + "loss": 0.15, + "step": 10980 + }, + { + "epoch": 1.63, + "grad_norm": 1.1589276790618896, + "learning_rate": 8.397115670532263e-05, + "loss": 0.1472, + "step": 10990 + }, + { + "epoch": 1.63, + "grad_norm": 1.120748519897461, + "learning_rate": 8.39562890276539e-05, + "loss": 0.1433, + "step": 11000 + }, + { + "epoch": 1.63, + "grad_norm": 1.4150292873382568, + "learning_rate": 8.394142134998514e-05, + "loss": 0.1431, + "step": 11010 + }, + { + "epoch": 1.63, + "grad_norm": 1.4702545404434204, + "learning_rate": 8.392655367231639e-05, + "loss": 0.1456, + "step": 11020 + }, + { + "epoch": 1.63, + "grad_norm": 0.38210058212280273, + "learning_rate": 8.391168599464763e-05, + "loss": 0.1436, + "step": 11030 + }, + { + "epoch": 1.64, + "grad_norm": 0.4304143190383911, + "learning_rate": 8.389681831697889e-05, + "loss": 0.1487, + "step": 11040 + }, + { + "epoch": 1.64, + "grad_norm": 0.3502871096134186, + "learning_rate": 8.388195063931014e-05, + "loss": 0.1469, + "step": 11050 + }, + { + "epoch": 1.64, + "grad_norm": 0.36102864146232605, + "learning_rate": 8.38670829616414e-05, + "loss": 0.1552, + "step": 11060 + }, + { + "epoch": 1.64, + "grad_norm": 0.8794988989830017, + "learning_rate": 8.385221528397265e-05, + "loss": 0.1436, + "step": 11070 + }, + { + "epoch": 1.64, + "grad_norm": 0.30111417174339294, + "learning_rate": 8.383734760630389e-05, + "loss": 0.1411, + "step": 11080 + }, + { + "epoch": 1.64, + "grad_norm": 0.3722517788410187, + "learning_rate": 8.382247992863516e-05, + "loss": 0.1416, + "step": 11090 + }, + { + "epoch": 1.65, + "grad_norm": 1.2735258340835571, + "learning_rate": 8.38076122509664e-05, + "loss": 0.1492, + "step": 11100 + }, + { + "epoch": 1.65, + "grad_norm": 0.6874326467514038, + "learning_rate": 8.379274457329766e-05, + "loss": 0.1472, + "step": 11110 + }, + { + "epoch": 1.65, + "grad_norm": 0.6189324855804443, + "learning_rate": 8.37778768956289e-05, + "loss": 0.1492, + "step": 11120 + }, + { + "epoch": 1.65, + "grad_norm": 0.7874268293380737, + "learning_rate": 8.376300921796015e-05, + "loss": 0.1489, + "step": 11130 + }, + { + "epoch": 1.65, + "grad_norm": 1.9058316946029663, + "learning_rate": 8.374814154029141e-05, + "loss": 0.1466, + "step": 11140 + }, + { + "epoch": 1.65, + "grad_norm": 0.27435657382011414, + "learning_rate": 8.373327386262266e-05, + "loss": 0.1548, + "step": 11150 + }, + { + "epoch": 1.65, + "grad_norm": 0.48889249563217163, + "learning_rate": 8.371840618495392e-05, + "loss": 0.1531, + "step": 11160 + }, + { + "epoch": 1.66, + "grad_norm": 0.809974730014801, + "learning_rate": 8.370353850728516e-05, + "loss": 0.1492, + "step": 11170 + }, + { + "epoch": 1.66, + "grad_norm": 0.6859524846076965, + "learning_rate": 8.368867082961643e-05, + "loss": 0.1462, + "step": 11180 + }, + { + "epoch": 1.66, + "grad_norm": 2.093574047088623, + "learning_rate": 8.367380315194767e-05, + "loss": 0.1498, + "step": 11190 + }, + { + "epoch": 1.66, + "grad_norm": 1.384391188621521, + "learning_rate": 8.365893547427892e-05, + "loss": 0.1529, + "step": 11200 + }, + { + "epoch": 1.66, + "grad_norm": 1.227850317955017, + "learning_rate": 8.364406779661016e-05, + "loss": 0.1448, + "step": 11210 + }, + { + "epoch": 1.66, + "grad_norm": 1.0657614469528198, + "learning_rate": 8.362920011894142e-05, + "loss": 0.1444, + "step": 11220 + }, + { + "epoch": 1.66, + "grad_norm": 0.9369791746139526, + "learning_rate": 8.361433244127267e-05, + "loss": 0.1484, + "step": 11230 + }, + { + "epoch": 1.67, + "grad_norm": 0.27957215905189514, + "learning_rate": 8.359946476360393e-05, + "loss": 0.142, + "step": 11240 + }, + { + "epoch": 1.67, + "grad_norm": 0.39011701941490173, + "learning_rate": 8.358459708593518e-05, + "loss": 0.1485, + "step": 11250 + }, + { + "epoch": 1.67, + "grad_norm": 0.4581025242805481, + "learning_rate": 8.356972940826642e-05, + "loss": 0.1567, + "step": 11260 + }, + { + "epoch": 1.67, + "grad_norm": 0.7829630970954895, + "learning_rate": 8.355486173059769e-05, + "loss": 0.1439, + "step": 11270 + }, + { + "epoch": 1.67, + "grad_norm": 0.7222351431846619, + "learning_rate": 8.353999405292893e-05, + "loss": 0.1411, + "step": 11280 + }, + { + "epoch": 1.67, + "grad_norm": 0.4510668218135834, + "learning_rate": 8.352512637526019e-05, + "loss": 0.14, + "step": 11290 + }, + { + "epoch": 1.68, + "grad_norm": 0.3421786427497864, + "learning_rate": 8.351025869759144e-05, + "loss": 0.1495, + "step": 11300 + }, + { + "epoch": 1.68, + "grad_norm": 0.24841587245464325, + "learning_rate": 8.34953910199227e-05, + "loss": 0.1395, + "step": 11310 + }, + { + "epoch": 1.68, + "grad_norm": 0.25271904468536377, + "learning_rate": 8.348052334225395e-05, + "loss": 0.1427, + "step": 11320 + }, + { + "epoch": 1.68, + "grad_norm": 0.5229553580284119, + "learning_rate": 8.34656556645852e-05, + "loss": 0.1425, + "step": 11330 + }, + { + "epoch": 1.68, + "grad_norm": 0.3751966953277588, + "learning_rate": 8.345078798691645e-05, + "loss": 0.1435, + "step": 11340 + }, + { + "epoch": 1.68, + "grad_norm": 1.0860705375671387, + "learning_rate": 8.343592030924769e-05, + "loss": 0.1482, + "step": 11350 + }, + { + "epoch": 1.68, + "grad_norm": 1.6879124641418457, + "learning_rate": 8.342105263157896e-05, + "loss": 0.1495, + "step": 11360 + }, + { + "epoch": 1.69, + "grad_norm": 0.5659767985343933, + "learning_rate": 8.34061849539102e-05, + "loss": 0.1421, + "step": 11370 + }, + { + "epoch": 1.69, + "grad_norm": 0.4345433712005615, + "learning_rate": 8.339131727624145e-05, + "loss": 0.1499, + "step": 11380 + }, + { + "epoch": 1.69, + "grad_norm": 1.5097520351409912, + "learning_rate": 8.337644959857271e-05, + "loss": 0.1554, + "step": 11390 + }, + { + "epoch": 1.69, + "grad_norm": 0.5569794178009033, + "learning_rate": 8.336158192090396e-05, + "loss": 0.1438, + "step": 11400 + }, + { + "epoch": 1.69, + "grad_norm": 1.062078833580017, + "learning_rate": 8.334671424323522e-05, + "loss": 0.1532, + "step": 11410 + }, + { + "epoch": 1.69, + "grad_norm": 0.6521728038787842, + "learning_rate": 8.333184656556646e-05, + "loss": 0.146, + "step": 11420 + }, + { + "epoch": 1.69, + "grad_norm": 1.404455542564392, + "learning_rate": 8.331697888789771e-05, + "loss": 0.1506, + "step": 11430 + }, + { + "epoch": 1.7, + "grad_norm": 0.4847467243671417, + "learning_rate": 8.330211121022896e-05, + "loss": 0.1416, + "step": 11440 + }, + { + "epoch": 1.7, + "grad_norm": 0.5724780559539795, + "learning_rate": 8.328724353256022e-05, + "loss": 0.1499, + "step": 11450 + }, + { + "epoch": 1.7, + "grad_norm": 0.3448980152606964, + "learning_rate": 8.327237585489147e-05, + "loss": 0.1492, + "step": 11460 + }, + { + "epoch": 1.7, + "grad_norm": 0.49433252215385437, + "learning_rate": 8.325750817722272e-05, + "loss": 0.1497, + "step": 11470 + }, + { + "epoch": 1.7, + "grad_norm": 1.2433007955551147, + "learning_rate": 8.324264049955398e-05, + "loss": 0.1426, + "step": 11480 + }, + { + "epoch": 1.7, + "grad_norm": 0.5715564489364624, + "learning_rate": 8.322777282188523e-05, + "loss": 0.1462, + "step": 11490 + }, + { + "epoch": 1.7, + "grad_norm": 0.7039369940757751, + "learning_rate": 8.321290514421648e-05, + "loss": 0.1468, + "step": 11500 + }, + { + "epoch": 1.71, + "grad_norm": 0.6003488898277283, + "learning_rate": 8.319803746654773e-05, + "loss": 0.1423, + "step": 11510 + }, + { + "epoch": 1.71, + "grad_norm": 0.4361822009086609, + "learning_rate": 8.318316978887898e-05, + "loss": 0.1389, + "step": 11520 + }, + { + "epoch": 1.71, + "grad_norm": 0.3091180622577667, + "learning_rate": 8.316830211121022e-05, + "loss": 0.1503, + "step": 11530 + }, + { + "epoch": 1.71, + "grad_norm": 0.3191719651222229, + "learning_rate": 8.315343443354149e-05, + "loss": 0.1499, + "step": 11540 + }, + { + "epoch": 1.71, + "grad_norm": 0.5728769898414612, + "learning_rate": 8.313856675587273e-05, + "loss": 0.1539, + "step": 11550 + }, + { + "epoch": 1.71, + "grad_norm": 1.094186782836914, + "learning_rate": 8.312369907820399e-05, + "loss": 0.1457, + "step": 11560 + }, + { + "epoch": 1.72, + "grad_norm": 0.9020346999168396, + "learning_rate": 8.310883140053524e-05, + "loss": 0.1417, + "step": 11570 + }, + { + "epoch": 1.72, + "grad_norm": 0.3199603855609894, + "learning_rate": 8.30939637228665e-05, + "loss": 0.1461, + "step": 11580 + }, + { + "epoch": 1.72, + "grad_norm": 1.0080779790878296, + "learning_rate": 8.307909604519775e-05, + "loss": 0.1436, + "step": 11590 + }, + { + "epoch": 1.72, + "grad_norm": 0.37147411704063416, + "learning_rate": 8.306422836752899e-05, + "loss": 0.1354, + "step": 11600 + }, + { + "epoch": 1.72, + "grad_norm": 0.7766250371932983, + "learning_rate": 8.304936068986025e-05, + "loss": 0.1465, + "step": 11610 + }, + { + "epoch": 1.72, + "grad_norm": 0.32852333784103394, + "learning_rate": 8.303449301219149e-05, + "loss": 0.1422, + "step": 11620 + }, + { + "epoch": 1.72, + "grad_norm": 0.5026690363883972, + "learning_rate": 8.301962533452276e-05, + "loss": 0.1493, + "step": 11630 + }, + { + "epoch": 1.73, + "grad_norm": 0.4522212743759155, + "learning_rate": 8.300475765685401e-05, + "loss": 0.1381, + "step": 11640 + }, + { + "epoch": 1.73, + "grad_norm": 0.3424636125564575, + "learning_rate": 8.298988997918525e-05, + "loss": 0.1468, + "step": 11650 + }, + { + "epoch": 1.73, + "grad_norm": 0.5932589769363403, + "learning_rate": 8.29750223015165e-05, + "loss": 0.1533, + "step": 11660 + }, + { + "epoch": 1.73, + "grad_norm": 0.817040741443634, + "learning_rate": 8.296015462384776e-05, + "loss": 0.1396, + "step": 11670 + }, + { + "epoch": 1.73, + "grad_norm": 0.25373414158821106, + "learning_rate": 8.294528694617902e-05, + "loss": 0.147, + "step": 11680 + }, + { + "epoch": 1.73, + "grad_norm": 0.36040133237838745, + "learning_rate": 8.293041926851026e-05, + "loss": 0.1508, + "step": 11690 + }, + { + "epoch": 1.73, + "grad_norm": 1.032599925994873, + "learning_rate": 8.291555159084151e-05, + "loss": 0.1394, + "step": 11700 + }, + { + "epoch": 1.74, + "grad_norm": 0.42168989777565, + "learning_rate": 8.290068391317277e-05, + "loss": 0.1495, + "step": 11710 + }, + { + "epoch": 1.74, + "grad_norm": 0.49878957867622375, + "learning_rate": 8.288581623550402e-05, + "loss": 0.1383, + "step": 11720 + }, + { + "epoch": 1.74, + "grad_norm": 0.8637439012527466, + "learning_rate": 8.287094855783528e-05, + "loss": 0.1388, + "step": 11730 + }, + { + "epoch": 1.74, + "grad_norm": 0.34474968910217285, + "learning_rate": 8.285608088016652e-05, + "loss": 0.1506, + "step": 11740 + }, + { + "epoch": 1.74, + "grad_norm": 0.9870109558105469, + "learning_rate": 8.284121320249777e-05, + "loss": 0.1516, + "step": 11750 + }, + { + "epoch": 1.74, + "grad_norm": 0.502234160900116, + "learning_rate": 8.282634552482903e-05, + "loss": 0.149, + "step": 11760 + }, + { + "epoch": 1.74, + "grad_norm": 1.7274667024612427, + "learning_rate": 8.281147784716028e-05, + "loss": 0.1487, + "step": 11770 + }, + { + "epoch": 1.75, + "grad_norm": 0.27660071849823, + "learning_rate": 8.279661016949152e-05, + "loss": 0.1398, + "step": 11780 + }, + { + "epoch": 1.75, + "grad_norm": 0.35568374395370483, + "learning_rate": 8.278174249182278e-05, + "loss": 0.1505, + "step": 11790 + }, + { + "epoch": 1.75, + "grad_norm": 1.317064881324768, + "learning_rate": 8.276687481415403e-05, + "loss": 0.1522, + "step": 11800 + }, + { + "epoch": 1.75, + "grad_norm": 0.6406526565551758, + "learning_rate": 8.275200713648529e-05, + "loss": 0.1518, + "step": 11810 + }, + { + "epoch": 1.75, + "grad_norm": 0.24043872952461243, + "learning_rate": 8.273713945881654e-05, + "loss": 0.1514, + "step": 11820 + }, + { + "epoch": 1.75, + "grad_norm": 2.5549800395965576, + "learning_rate": 8.272227178114778e-05, + "loss": 0.1501, + "step": 11830 + }, + { + "epoch": 1.76, + "grad_norm": 0.8846415877342224, + "learning_rate": 8.270740410347904e-05, + "loss": 0.1491, + "step": 11840 + }, + { + "epoch": 1.76, + "grad_norm": 0.6220967173576355, + "learning_rate": 8.269253642581029e-05, + "loss": 0.1497, + "step": 11850 + }, + { + "epoch": 1.76, + "grad_norm": 0.24483416974544525, + "learning_rate": 8.267766874814155e-05, + "loss": 0.1385, + "step": 11860 + }, + { + "epoch": 1.76, + "grad_norm": 0.46550866961479187, + "learning_rate": 8.266280107047279e-05, + "loss": 0.1484, + "step": 11870 + }, + { + "epoch": 1.76, + "grad_norm": 0.3104701340198517, + "learning_rate": 8.264793339280404e-05, + "loss": 0.1435, + "step": 11880 + }, + { + "epoch": 1.76, + "grad_norm": 2.2419028282165527, + "learning_rate": 8.26330657151353e-05, + "loss": 0.1523, + "step": 11890 + }, + { + "epoch": 1.76, + "grad_norm": 0.3503584563732147, + "learning_rate": 8.261819803746655e-05, + "loss": 0.1461, + "step": 11900 + }, + { + "epoch": 1.77, + "grad_norm": 0.2698685824871063, + "learning_rate": 8.260333035979781e-05, + "loss": 0.148, + "step": 11910 + }, + { + "epoch": 1.77, + "grad_norm": 0.36493173241615295, + "learning_rate": 8.258846268212905e-05, + "loss": 0.1447, + "step": 11920 + }, + { + "epoch": 1.77, + "grad_norm": 0.378431111574173, + "learning_rate": 8.25735950044603e-05, + "loss": 0.1528, + "step": 11930 + }, + { + "epoch": 1.77, + "grad_norm": 0.6286464333534241, + "learning_rate": 8.255872732679156e-05, + "loss": 0.1419, + "step": 11940 + }, + { + "epoch": 1.77, + "grad_norm": 1.7695555686950684, + "learning_rate": 8.254385964912281e-05, + "loss": 0.1496, + "step": 11950 + }, + { + "epoch": 1.77, + "grad_norm": 0.46831145882606506, + "learning_rate": 8.252899197145407e-05, + "loss": 0.1418, + "step": 11960 + }, + { + "epoch": 1.77, + "grad_norm": 0.4352415204048157, + "learning_rate": 8.251412429378531e-05, + "loss": 0.1506, + "step": 11970 + }, + { + "epoch": 1.78, + "grad_norm": 0.2404990941286087, + "learning_rate": 8.249925661611658e-05, + "loss": 0.1527, + "step": 11980 + }, + { + "epoch": 1.78, + "grad_norm": 1.8183870315551758, + "learning_rate": 8.248438893844782e-05, + "loss": 0.1429, + "step": 11990 + }, + { + "epoch": 1.78, + "grad_norm": 0.2836654484272003, + "learning_rate": 8.246952126077907e-05, + "loss": 0.1403, + "step": 12000 + }, + { + "epoch": 1.78, + "grad_norm": 0.32020139694213867, + "learning_rate": 8.245465358311032e-05, + "loss": 0.1513, + "step": 12010 + }, + { + "epoch": 1.78, + "grad_norm": 1.3946151733398438, + "learning_rate": 8.243978590544157e-05, + "loss": 0.1385, + "step": 12020 + }, + { + "epoch": 1.78, + "grad_norm": 1.3150290250778198, + "learning_rate": 8.242491822777282e-05, + "loss": 0.1456, + "step": 12030 + }, + { + "epoch": 1.78, + "grad_norm": 0.21857360005378723, + "learning_rate": 8.241005055010408e-05, + "loss": 0.1452, + "step": 12040 + }, + { + "epoch": 1.79, + "grad_norm": 1.4940718412399292, + "learning_rate": 8.239518287243533e-05, + "loss": 0.1406, + "step": 12050 + }, + { + "epoch": 1.79, + "grad_norm": 1.5689001083374023, + "learning_rate": 8.238031519476658e-05, + "loss": 0.1566, + "step": 12060 + }, + { + "epoch": 1.79, + "grad_norm": 0.8649786710739136, + "learning_rate": 8.236544751709784e-05, + "loss": 0.1498, + "step": 12070 + }, + { + "epoch": 1.79, + "grad_norm": 1.0453568696975708, + "learning_rate": 8.235057983942908e-05, + "loss": 0.1461, + "step": 12080 + }, + { + "epoch": 1.79, + "grad_norm": 1.147850513458252, + "learning_rate": 8.233571216176034e-05, + "loss": 0.1438, + "step": 12090 + }, + { + "epoch": 1.79, + "grad_norm": 0.5856927037239075, + "learning_rate": 8.232084448409158e-05, + "loss": 0.1477, + "step": 12100 + }, + { + "epoch": 1.8, + "grad_norm": 0.24364539980888367, + "learning_rate": 8.230597680642284e-05, + "loss": 0.1489, + "step": 12110 + }, + { + "epoch": 1.8, + "grad_norm": 0.500666618347168, + "learning_rate": 8.229110912875409e-05, + "loss": 0.1473, + "step": 12120 + }, + { + "epoch": 1.8, + "grad_norm": 0.2935793697834015, + "learning_rate": 8.227624145108535e-05, + "loss": 0.1497, + "step": 12130 + }, + { + "epoch": 1.8, + "grad_norm": 0.3563259541988373, + "learning_rate": 8.22613737734166e-05, + "loss": 0.1418, + "step": 12140 + }, + { + "epoch": 1.8, + "grad_norm": 0.3523156940937042, + "learning_rate": 8.224650609574784e-05, + "loss": 0.1453, + "step": 12150 + }, + { + "epoch": 1.8, + "grad_norm": 1.0185507535934448, + "learning_rate": 8.223163841807911e-05, + "loss": 0.1399, + "step": 12160 + }, + { + "epoch": 1.8, + "grad_norm": 0.5404852628707886, + "learning_rate": 8.221677074041035e-05, + "loss": 0.1463, + "step": 12170 + }, + { + "epoch": 1.81, + "grad_norm": 1.4389984607696533, + "learning_rate": 8.22019030627416e-05, + "loss": 0.1524, + "step": 12180 + }, + { + "epoch": 1.81, + "grad_norm": 0.39658161997795105, + "learning_rate": 8.218703538507285e-05, + "loss": 0.1424, + "step": 12190 + }, + { + "epoch": 1.81, + "grad_norm": 0.3328675329685211, + "learning_rate": 8.217216770740411e-05, + "loss": 0.1478, + "step": 12200 + }, + { + "epoch": 1.81, + "grad_norm": 0.8993117213249207, + "learning_rate": 8.215730002973536e-05, + "loss": 0.1431, + "step": 12210 + }, + { + "epoch": 1.81, + "grad_norm": 0.31393373012542725, + "learning_rate": 8.214243235206661e-05, + "loss": 0.153, + "step": 12220 + }, + { + "epoch": 1.81, + "grad_norm": 0.40346676111221313, + "learning_rate": 8.212756467439787e-05, + "loss": 0.1552, + "step": 12230 + }, + { + "epoch": 1.81, + "grad_norm": 0.2991549074649811, + "learning_rate": 8.211269699672911e-05, + "loss": 0.1443, + "step": 12240 + }, + { + "epoch": 1.82, + "grad_norm": 0.729020357131958, + "learning_rate": 8.209782931906038e-05, + "loss": 0.1419, + "step": 12250 + }, + { + "epoch": 1.82, + "grad_norm": 1.1650629043579102, + "learning_rate": 8.208296164139162e-05, + "loss": 0.1547, + "step": 12260 + }, + { + "epoch": 1.82, + "grad_norm": 1.2502027750015259, + "learning_rate": 8.206809396372287e-05, + "loss": 0.1468, + "step": 12270 + }, + { + "epoch": 1.82, + "grad_norm": 0.664297342300415, + "learning_rate": 8.205322628605411e-05, + "loss": 0.1468, + "step": 12280 + }, + { + "epoch": 1.82, + "grad_norm": 0.310942143201828, + "learning_rate": 8.203835860838538e-05, + "loss": 0.1441, + "step": 12290 + }, + { + "epoch": 1.82, + "grad_norm": 0.28401151299476624, + "learning_rate": 8.202349093071664e-05, + "loss": 0.1436, + "step": 12300 + }, + { + "epoch": 1.82, + "grad_norm": 0.32295674085617065, + "learning_rate": 8.200862325304788e-05, + "loss": 0.1445, + "step": 12310 + }, + { + "epoch": 1.83, + "grad_norm": 0.49534422159194946, + "learning_rate": 8.199375557537913e-05, + "loss": 0.1505, + "step": 12320 + }, + { + "epoch": 1.83, + "grad_norm": 0.6425632834434509, + "learning_rate": 8.197888789771037e-05, + "loss": 0.1489, + "step": 12330 + }, + { + "epoch": 1.83, + "grad_norm": 0.6739435195922852, + "learning_rate": 8.196402022004164e-05, + "loss": 0.1453, + "step": 12340 + }, + { + "epoch": 1.83, + "grad_norm": 0.706912636756897, + "learning_rate": 8.194915254237288e-05, + "loss": 0.141, + "step": 12350 + }, + { + "epoch": 1.83, + "grad_norm": 0.9807479381561279, + "learning_rate": 8.193428486470414e-05, + "loss": 0.1542, + "step": 12360 + }, + { + "epoch": 1.83, + "grad_norm": 1.7933177947998047, + "learning_rate": 8.191941718703539e-05, + "loss": 0.1478, + "step": 12370 + }, + { + "epoch": 1.84, + "grad_norm": 0.555246889591217, + "learning_rate": 8.190454950936665e-05, + "loss": 0.1593, + "step": 12380 + }, + { + "epoch": 1.84, + "grad_norm": 0.279439777135849, + "learning_rate": 8.18896818316979e-05, + "loss": 0.1494, + "step": 12390 + }, + { + "epoch": 1.84, + "grad_norm": 0.6358618140220642, + "learning_rate": 8.187481415402914e-05, + "loss": 0.1567, + "step": 12400 + }, + { + "epoch": 1.84, + "grad_norm": 0.4418742060661316, + "learning_rate": 8.18599464763604e-05, + "loss": 0.1452, + "step": 12410 + }, + { + "epoch": 1.84, + "grad_norm": 0.34239551424980164, + "learning_rate": 8.184507879869164e-05, + "loss": 0.1405, + "step": 12420 + }, + { + "epoch": 1.84, + "grad_norm": 1.068682074546814, + "learning_rate": 8.183021112102291e-05, + "loss": 0.1454, + "step": 12430 + }, + { + "epoch": 1.84, + "grad_norm": 0.748917281627655, + "learning_rate": 8.181534344335415e-05, + "loss": 0.1468, + "step": 12440 + }, + { + "epoch": 1.85, + "grad_norm": 1.5174105167388916, + "learning_rate": 8.18004757656854e-05, + "loss": 0.137, + "step": 12450 + }, + { + "epoch": 1.85, + "grad_norm": 0.29713815450668335, + "learning_rate": 8.178560808801666e-05, + "loss": 0.1443, + "step": 12460 + }, + { + "epoch": 1.85, + "grad_norm": 0.3194617033004761, + "learning_rate": 8.177074041034791e-05, + "loss": 0.1463, + "step": 12470 + }, + { + "epoch": 1.85, + "grad_norm": 0.9451056718826294, + "learning_rate": 8.175587273267917e-05, + "loss": 0.1488, + "step": 12480 + }, + { + "epoch": 1.85, + "grad_norm": 0.5475955009460449, + "learning_rate": 8.174100505501041e-05, + "loss": 0.1487, + "step": 12490 + }, + { + "epoch": 1.85, + "grad_norm": 0.4979711174964905, + "learning_rate": 8.172613737734166e-05, + "loss": 0.1379, + "step": 12500 + }, + { + "epoch": 1.85, + "grad_norm": 0.6958137154579163, + "learning_rate": 8.17112696996729e-05, + "loss": 0.1395, + "step": 12510 + }, + { + "epoch": 1.86, + "grad_norm": 0.6111878156661987, + "learning_rate": 8.169640202200417e-05, + "loss": 0.1396, + "step": 12520 + }, + { + "epoch": 1.86, + "grad_norm": 1.024361252784729, + "learning_rate": 8.168153434433541e-05, + "loss": 0.1453, + "step": 12530 + }, + { + "epoch": 1.86, + "grad_norm": 0.42374831438064575, + "learning_rate": 8.166666666666667e-05, + "loss": 0.1441, + "step": 12540 + }, + { + "epoch": 1.86, + "grad_norm": 0.4444429278373718, + "learning_rate": 8.165179898899792e-05, + "loss": 0.1515, + "step": 12550 + }, + { + "epoch": 1.86, + "grad_norm": 0.30336788296699524, + "learning_rate": 8.163693131132918e-05, + "loss": 0.1548, + "step": 12560 + }, + { + "epoch": 1.86, + "grad_norm": 1.3255070447921753, + "learning_rate": 8.162206363366043e-05, + "loss": 0.1498, + "step": 12570 + }, + { + "epoch": 1.86, + "grad_norm": 0.9393750429153442, + "learning_rate": 8.160719595599167e-05, + "loss": 0.1439, + "step": 12580 + }, + { + "epoch": 1.87, + "grad_norm": 0.9588826298713684, + "learning_rate": 8.159232827832293e-05, + "loss": 0.1427, + "step": 12590 + }, + { + "epoch": 1.87, + "grad_norm": 1.0595064163208008, + "learning_rate": 8.157746060065417e-05, + "loss": 0.1481, + "step": 12600 + }, + { + "epoch": 1.87, + "grad_norm": 0.584743082523346, + "learning_rate": 8.156259292298544e-05, + "loss": 0.1322, + "step": 12610 + }, + { + "epoch": 1.87, + "grad_norm": 0.6917292475700378, + "learning_rate": 8.154772524531668e-05, + "loss": 0.1406, + "step": 12620 + }, + { + "epoch": 1.87, + "grad_norm": 0.39056283235549927, + "learning_rate": 8.153285756764793e-05, + "loss": 0.1464, + "step": 12630 + }, + { + "epoch": 1.87, + "grad_norm": 2.2323410511016846, + "learning_rate": 8.151798988997919e-05, + "loss": 0.1497, + "step": 12640 + }, + { + "epoch": 1.88, + "grad_norm": 0.2180223912000656, + "learning_rate": 8.150312221231044e-05, + "loss": 0.147, + "step": 12650 + }, + { + "epoch": 1.88, + "grad_norm": 1.1179107427597046, + "learning_rate": 8.14882545346417e-05, + "loss": 0.1433, + "step": 12660 + }, + { + "epoch": 1.88, + "grad_norm": 0.2884316146373749, + "learning_rate": 8.147338685697294e-05, + "loss": 0.1545, + "step": 12670 + }, + { + "epoch": 1.88, + "grad_norm": 0.5363054275512695, + "learning_rate": 8.14585191793042e-05, + "loss": 0.152, + "step": 12680 + }, + { + "epoch": 1.88, + "grad_norm": 0.2892308831214905, + "learning_rate": 8.144365150163545e-05, + "loss": 0.1472, + "step": 12690 + }, + { + "epoch": 1.88, + "grad_norm": 0.8672362565994263, + "learning_rate": 8.14287838239667e-05, + "loss": 0.1566, + "step": 12700 + }, + { + "epoch": 1.88, + "grad_norm": 0.8178413510322571, + "learning_rate": 8.141391614629796e-05, + "loss": 0.1459, + "step": 12710 + }, + { + "epoch": 1.89, + "grad_norm": 0.3414991497993469, + "learning_rate": 8.13990484686292e-05, + "loss": 0.1454, + "step": 12720 + }, + { + "epoch": 1.89, + "grad_norm": 1.8329755067825317, + "learning_rate": 8.138418079096045e-05, + "loss": 0.1429, + "step": 12730 + }, + { + "epoch": 1.89, + "grad_norm": 0.6989759802818298, + "learning_rate": 8.136931311329171e-05, + "loss": 0.1533, + "step": 12740 + }, + { + "epoch": 1.89, + "grad_norm": 0.6167986392974854, + "learning_rate": 8.135444543562296e-05, + "loss": 0.1415, + "step": 12750 + }, + { + "epoch": 1.89, + "grad_norm": 0.7545574903488159, + "learning_rate": 8.13395777579542e-05, + "loss": 0.1451, + "step": 12760 + }, + { + "epoch": 1.89, + "grad_norm": 0.7932333946228027, + "learning_rate": 8.132471008028546e-05, + "loss": 0.1473, + "step": 12770 + }, + { + "epoch": 1.89, + "grad_norm": 0.26402798295021057, + "learning_rate": 8.130984240261672e-05, + "loss": 0.1394, + "step": 12780 + }, + { + "epoch": 1.9, + "grad_norm": 0.5565439462661743, + "learning_rate": 8.129497472494797e-05, + "loss": 0.14, + "step": 12790 + }, + { + "epoch": 1.9, + "grad_norm": 0.40671250224113464, + "learning_rate": 8.128010704727922e-05, + "loss": 0.1454, + "step": 12800 + }, + { + "epoch": 1.9, + "grad_norm": 0.8522148728370667, + "learning_rate": 8.126523936961047e-05, + "loss": 0.1558, + "step": 12810 + }, + { + "epoch": 1.9, + "grad_norm": 0.5363799929618835, + "learning_rate": 8.125037169194172e-05, + "loss": 0.1404, + "step": 12820 + }, + { + "epoch": 1.9, + "grad_norm": 0.44085416197776794, + "learning_rate": 8.123550401427298e-05, + "loss": 0.1512, + "step": 12830 + }, + { + "epoch": 1.9, + "grad_norm": 0.3973836600780487, + "learning_rate": 8.122063633660423e-05, + "loss": 0.1456, + "step": 12840 + }, + { + "epoch": 1.9, + "grad_norm": 0.5485923886299133, + "learning_rate": 8.120576865893547e-05, + "loss": 0.1424, + "step": 12850 + }, + { + "epoch": 1.91, + "grad_norm": 0.4346306622028351, + "learning_rate": 8.119090098126673e-05, + "loss": 0.1508, + "step": 12860 + }, + { + "epoch": 1.91, + "grad_norm": 0.2852851450443268, + "learning_rate": 8.117603330359798e-05, + "loss": 0.1459, + "step": 12870 + }, + { + "epoch": 1.91, + "grad_norm": 1.7352854013442993, + "learning_rate": 8.116116562592924e-05, + "loss": 0.1475, + "step": 12880 + }, + { + "epoch": 1.91, + "grad_norm": 0.3881273567676544, + "learning_rate": 8.114629794826049e-05, + "loss": 0.1429, + "step": 12890 + }, + { + "epoch": 1.91, + "grad_norm": 1.1615941524505615, + "learning_rate": 8.113143027059173e-05, + "loss": 0.1473, + "step": 12900 + }, + { + "epoch": 1.91, + "grad_norm": 0.435965359210968, + "learning_rate": 8.111656259292299e-05, + "loss": 0.1461, + "step": 12910 + }, + { + "epoch": 1.92, + "grad_norm": 0.6154923439025879, + "learning_rate": 8.110169491525424e-05, + "loss": 0.1438, + "step": 12920 + }, + { + "epoch": 1.92, + "grad_norm": 0.5796480774879456, + "learning_rate": 8.10868272375855e-05, + "loss": 0.145, + "step": 12930 + }, + { + "epoch": 1.92, + "grad_norm": 1.1157699823379517, + "learning_rate": 8.107195955991674e-05, + "loss": 0.15, + "step": 12940 + }, + { + "epoch": 1.92, + "grad_norm": 0.9312446117401123, + "learning_rate": 8.105709188224799e-05, + "loss": 0.1419, + "step": 12950 + }, + { + "epoch": 1.92, + "grad_norm": 0.6173706650733948, + "learning_rate": 8.104222420457925e-05, + "loss": 0.1464, + "step": 12960 + }, + { + "epoch": 1.92, + "grad_norm": 0.7330856919288635, + "learning_rate": 8.10273565269105e-05, + "loss": 0.1507, + "step": 12970 + }, + { + "epoch": 1.92, + "grad_norm": 0.6786613464355469, + "learning_rate": 8.101248884924176e-05, + "loss": 0.1486, + "step": 12980 + }, + { + "epoch": 1.93, + "grad_norm": 1.6271902322769165, + "learning_rate": 8.0997621171573e-05, + "loss": 0.1497, + "step": 12990 + }, + { + "epoch": 1.93, + "grad_norm": 0.4697366952896118, + "learning_rate": 8.098275349390425e-05, + "loss": 0.1436, + "step": 13000 + }, + { + "epoch": 1.93, + "grad_norm": 1.4403237104415894, + "learning_rate": 8.096788581623551e-05, + "loss": 0.142, + "step": 13010 + }, + { + "epoch": 1.93, + "grad_norm": 0.9504378437995911, + "learning_rate": 8.095301813856676e-05, + "loss": 0.1441, + "step": 13020 + }, + { + "epoch": 1.93, + "grad_norm": 0.271784245967865, + "learning_rate": 8.093815046089802e-05, + "loss": 0.1419, + "step": 13030 + }, + { + "epoch": 1.93, + "grad_norm": 0.47144797444343567, + "learning_rate": 8.092328278322926e-05, + "loss": 0.1471, + "step": 13040 + }, + { + "epoch": 1.93, + "grad_norm": 0.2984504997730255, + "learning_rate": 8.090841510556053e-05, + "loss": 0.1453, + "step": 13050 + }, + { + "epoch": 1.94, + "grad_norm": 1.020990252494812, + "learning_rate": 8.089354742789177e-05, + "loss": 0.1454, + "step": 13060 + }, + { + "epoch": 1.94, + "grad_norm": 0.3087640702724457, + "learning_rate": 8.087867975022302e-05, + "loss": 0.1497, + "step": 13070 + }, + { + "epoch": 1.94, + "grad_norm": 0.4391845464706421, + "learning_rate": 8.086381207255426e-05, + "loss": 0.1506, + "step": 13080 + }, + { + "epoch": 1.94, + "grad_norm": 2.1297712326049805, + "learning_rate": 8.084894439488552e-05, + "loss": 0.1502, + "step": 13090 + }, + { + "epoch": 1.94, + "grad_norm": 0.29619669914245605, + "learning_rate": 8.083407671721677e-05, + "loss": 0.1547, + "step": 13100 + }, + { + "epoch": 1.94, + "grad_norm": 1.5125505924224854, + "learning_rate": 8.081920903954803e-05, + "loss": 0.145, + "step": 13110 + }, + { + "epoch": 1.94, + "grad_norm": 1.0014768838882446, + "learning_rate": 8.080434136187928e-05, + "loss": 0.1471, + "step": 13120 + }, + { + "epoch": 1.95, + "grad_norm": 0.2931712865829468, + "learning_rate": 8.078947368421052e-05, + "loss": 0.1463, + "step": 13130 + }, + { + "epoch": 1.95, + "grad_norm": 1.7871705293655396, + "learning_rate": 8.077460600654179e-05, + "loss": 0.1438, + "step": 13140 + }, + { + "epoch": 1.95, + "grad_norm": 1.4664849042892456, + "learning_rate": 8.075973832887303e-05, + "loss": 0.1432, + "step": 13150 + }, + { + "epoch": 1.95, + "grad_norm": 1.5097386837005615, + "learning_rate": 8.074487065120429e-05, + "loss": 0.1459, + "step": 13160 + }, + { + "epoch": 1.95, + "grad_norm": 0.2903710603713989, + "learning_rate": 8.073000297353553e-05, + "loss": 0.1455, + "step": 13170 + }, + { + "epoch": 1.95, + "grad_norm": 0.6520920991897583, + "learning_rate": 8.07151352958668e-05, + "loss": 0.1491, + "step": 13180 + }, + { + "epoch": 1.96, + "grad_norm": 0.3731312155723572, + "learning_rate": 8.070026761819804e-05, + "loss": 0.1372, + "step": 13190 + }, + { + "epoch": 1.96, + "grad_norm": 0.653975248336792, + "learning_rate": 8.06853999405293e-05, + "loss": 0.1439, + "step": 13200 + }, + { + "epoch": 1.96, + "grad_norm": 0.7446883320808411, + "learning_rate": 8.067053226286055e-05, + "loss": 0.148, + "step": 13210 + }, + { + "epoch": 1.96, + "grad_norm": 1.1628508567810059, + "learning_rate": 8.065566458519179e-05, + "loss": 0.1482, + "step": 13220 + }, + { + "epoch": 1.96, + "grad_norm": 1.871608018875122, + "learning_rate": 8.064079690752306e-05, + "loss": 0.1473, + "step": 13230 + }, + { + "epoch": 1.96, + "grad_norm": 0.35242322087287903, + "learning_rate": 8.06259292298543e-05, + "loss": 0.1481, + "step": 13240 + }, + { + "epoch": 1.96, + "grad_norm": 0.6800448894500732, + "learning_rate": 8.061106155218555e-05, + "loss": 0.1469, + "step": 13250 + }, + { + "epoch": 1.97, + "grad_norm": 0.6389768719673157, + "learning_rate": 8.05961938745168e-05, + "loss": 0.1433, + "step": 13260 + }, + { + "epoch": 1.97, + "grad_norm": 0.30566221475601196, + "learning_rate": 8.058132619684806e-05, + "loss": 0.1475, + "step": 13270 + }, + { + "epoch": 1.97, + "grad_norm": 0.6695141196250916, + "learning_rate": 8.05664585191793e-05, + "loss": 0.1494, + "step": 13280 + }, + { + "epoch": 1.97, + "grad_norm": 0.8827765583992004, + "learning_rate": 8.055159084151056e-05, + "loss": 0.1525, + "step": 13290 + }, + { + "epoch": 1.97, + "grad_norm": 0.7884755730628967, + "learning_rate": 8.053672316384181e-05, + "loss": 0.1347, + "step": 13300 + }, + { + "epoch": 1.97, + "grad_norm": 0.29021501541137695, + "learning_rate": 8.052185548617306e-05, + "loss": 0.1514, + "step": 13310 + }, + { + "epoch": 1.97, + "grad_norm": 3.1000945568084717, + "learning_rate": 8.050698780850432e-05, + "loss": 0.1576, + "step": 13320 + }, + { + "epoch": 1.98, + "grad_norm": 1.0783339738845825, + "learning_rate": 8.049212013083556e-05, + "loss": 0.1487, + "step": 13330 + }, + { + "epoch": 1.98, + "grad_norm": 0.3350972533226013, + "learning_rate": 8.047725245316682e-05, + "loss": 0.1393, + "step": 13340 + }, + { + "epoch": 1.98, + "grad_norm": 0.26007136702537537, + "learning_rate": 8.046238477549807e-05, + "loss": 0.138, + "step": 13350 + }, + { + "epoch": 1.98, + "grad_norm": 0.37395599484443665, + "learning_rate": 8.044751709782933e-05, + "loss": 0.1444, + "step": 13360 + }, + { + "epoch": 1.98, + "grad_norm": 0.48090776801109314, + "learning_rate": 8.043264942016058e-05, + "loss": 0.146, + "step": 13370 + }, + { + "epoch": 1.98, + "grad_norm": 0.6154760718345642, + "learning_rate": 8.041778174249182e-05, + "loss": 0.1423, + "step": 13380 + }, + { + "epoch": 1.98, + "grad_norm": 0.34295591711997986, + "learning_rate": 8.040291406482308e-05, + "loss": 0.1515, + "step": 13390 + }, + { + "epoch": 1.99, + "grad_norm": 0.2805001735687256, + "learning_rate": 8.038804638715432e-05, + "loss": 0.1539, + "step": 13400 + }, + { + "epoch": 1.99, + "grad_norm": 0.4166673719882965, + "learning_rate": 8.037317870948559e-05, + "loss": 0.1498, + "step": 13410 + }, + { + "epoch": 1.99, + "grad_norm": 0.5549523830413818, + "learning_rate": 8.035831103181683e-05, + "loss": 0.1401, + "step": 13420 + }, + { + "epoch": 1.99, + "grad_norm": 0.3639054298400879, + "learning_rate": 8.034344335414809e-05, + "loss": 0.1479, + "step": 13430 + }, + { + "epoch": 1.99, + "grad_norm": 0.5058225393295288, + "learning_rate": 8.032857567647934e-05, + "loss": 0.1437, + "step": 13440 + }, + { + "epoch": 1.99, + "grad_norm": 0.7204506397247314, + "learning_rate": 8.03137079988106e-05, + "loss": 0.147, + "step": 13450 + }, + { + "epoch": 2.0, + "grad_norm": 0.34979936480522156, + "learning_rate": 8.029884032114185e-05, + "loss": 0.1409, + "step": 13460 + }, + { + "epoch": 2.0, + "grad_norm": 0.71246337890625, + "learning_rate": 8.028397264347309e-05, + "loss": 0.1442, + "step": 13470 + }, + { + "epoch": 2.0, + "grad_norm": 0.47565925121307373, + "learning_rate": 8.026910496580435e-05, + "loss": 0.145, + "step": 13480 + }, + { + "epoch": 2.0, + "grad_norm": 0.7164893746376038, + "learning_rate": 8.025423728813559e-05, + "loss": 0.1368, + "step": 13490 + }, + { + "epoch": 2.0, + "eval_loss": 0.15097101032733917, + "eval_runtime": 2479.6392, + "eval_samples_per_second": 235.388, + "eval_steps_per_second": 3.678, + "step": 13492 + }, + { + "epoch": 2.0, + "grad_norm": 0.3602011501789093, + "learning_rate": 8.023936961046685e-05, + "loss": 0.1409, + "step": 13500 + }, + { + "epoch": 2.0, + "grad_norm": 1.1467143297195435, + "learning_rate": 8.02245019327981e-05, + "loss": 0.1445, + "step": 13510 + }, + { + "epoch": 2.0, + "grad_norm": 0.553377091884613, + "learning_rate": 8.020963425512935e-05, + "loss": 0.1326, + "step": 13520 + }, + { + "epoch": 2.01, + "grad_norm": 0.7122890949249268, + "learning_rate": 8.01947665774606e-05, + "loss": 0.1414, + "step": 13530 + }, + { + "epoch": 2.01, + "grad_norm": 0.6272186040878296, + "learning_rate": 8.017989889979186e-05, + "loss": 0.1428, + "step": 13540 + }, + { + "epoch": 2.01, + "grad_norm": 0.5186712741851807, + "learning_rate": 8.016503122212312e-05, + "loss": 0.1473, + "step": 13550 + }, + { + "epoch": 2.01, + "grad_norm": 0.7185566425323486, + "learning_rate": 8.015016354445436e-05, + "loss": 0.1427, + "step": 13560 + }, + { + "epoch": 2.01, + "grad_norm": 0.32846587896347046, + "learning_rate": 8.013529586678561e-05, + "loss": 0.143, + "step": 13570 + }, + { + "epoch": 2.01, + "grad_norm": 1.1040384769439697, + "learning_rate": 8.012042818911685e-05, + "loss": 0.1442, + "step": 13580 + }, + { + "epoch": 2.01, + "grad_norm": 0.3372102379798889, + "learning_rate": 8.010556051144812e-05, + "loss": 0.1373, + "step": 13590 + }, + { + "epoch": 2.02, + "grad_norm": 0.5751693248748779, + "learning_rate": 8.009069283377936e-05, + "loss": 0.1372, + "step": 13600 + }, + { + "epoch": 2.02, + "grad_norm": 0.8382410407066345, + "learning_rate": 8.007582515611062e-05, + "loss": 0.1393, + "step": 13610 + }, + { + "epoch": 2.02, + "grad_norm": 0.2772016227245331, + "learning_rate": 8.006095747844187e-05, + "loss": 0.136, + "step": 13620 + }, + { + "epoch": 2.02, + "grad_norm": 0.9653019309043884, + "learning_rate": 8.004608980077313e-05, + "loss": 0.1389, + "step": 13630 + }, + { + "epoch": 2.02, + "grad_norm": 0.6972894668579102, + "learning_rate": 8.003122212310438e-05, + "loss": 0.1497, + "step": 13640 + }, + { + "epoch": 2.02, + "grad_norm": 0.28461864590644836, + "learning_rate": 8.001635444543562e-05, + "loss": 0.1389, + "step": 13650 + }, + { + "epoch": 2.02, + "grad_norm": 1.647381067276001, + "learning_rate": 8.000148676776688e-05, + "loss": 0.1411, + "step": 13660 + }, + { + "epoch": 2.03, + "grad_norm": 0.6920177340507507, + "learning_rate": 7.998661909009813e-05, + "loss": 0.1464, + "step": 13670 + }, + { + "epoch": 2.03, + "grad_norm": 0.29943400621414185, + "learning_rate": 7.997175141242939e-05, + "loss": 0.141, + "step": 13680 + }, + { + "epoch": 2.03, + "grad_norm": 0.3627011179924011, + "learning_rate": 7.995688373476064e-05, + "loss": 0.1414, + "step": 13690 + }, + { + "epoch": 2.03, + "grad_norm": 0.7024916410446167, + "learning_rate": 7.994201605709188e-05, + "loss": 0.1403, + "step": 13700 + }, + { + "epoch": 2.03, + "grad_norm": 1.041019082069397, + "learning_rate": 7.992714837942314e-05, + "loss": 0.1461, + "step": 13710 + }, + { + "epoch": 2.03, + "grad_norm": 0.9740252494812012, + "learning_rate": 7.991228070175439e-05, + "loss": 0.1423, + "step": 13720 + }, + { + "epoch": 2.04, + "grad_norm": 0.3003685176372528, + "learning_rate": 7.989741302408565e-05, + "loss": 0.1615, + "step": 13730 + }, + { + "epoch": 2.04, + "grad_norm": 0.32233926653862, + "learning_rate": 7.988254534641689e-05, + "loss": 0.1328, + "step": 13740 + }, + { + "epoch": 2.04, + "grad_norm": 0.23946711421012878, + "learning_rate": 7.986767766874814e-05, + "loss": 0.1512, + "step": 13750 + }, + { + "epoch": 2.04, + "grad_norm": 0.9506574273109436, + "learning_rate": 7.98528099910794e-05, + "loss": 0.1485, + "step": 13760 + }, + { + "epoch": 2.04, + "grad_norm": 2.029719114303589, + "learning_rate": 7.983794231341065e-05, + "loss": 0.1527, + "step": 13770 + }, + { + "epoch": 2.04, + "grad_norm": 0.22863923013210297, + "learning_rate": 7.982307463574191e-05, + "loss": 0.1448, + "step": 13780 + }, + { + "epoch": 2.04, + "grad_norm": 0.300920307636261, + "learning_rate": 7.980820695807315e-05, + "loss": 0.1477, + "step": 13790 + }, + { + "epoch": 2.05, + "grad_norm": 0.3894307315349579, + "learning_rate": 7.97933392804044e-05, + "loss": 0.1428, + "step": 13800 + }, + { + "epoch": 2.05, + "grad_norm": 0.7516509294509888, + "learning_rate": 7.977847160273566e-05, + "loss": 0.1423, + "step": 13810 + }, + { + "epoch": 2.05, + "grad_norm": 0.6397115588188171, + "learning_rate": 7.976360392506691e-05, + "loss": 0.1446, + "step": 13820 + }, + { + "epoch": 2.05, + "grad_norm": 0.33174657821655273, + "learning_rate": 7.975022301516504e-05, + "loss": 0.1415, + "step": 13830 + }, + { + "epoch": 2.05, + "grad_norm": 0.393704891204834, + "learning_rate": 7.973535533749629e-05, + "loss": 0.1471, + "step": 13840 + }, + { + "epoch": 2.05, + "grad_norm": 1.0416827201843262, + "learning_rate": 7.972048765982753e-05, + "loss": 0.1369, + "step": 13850 + }, + { + "epoch": 2.05, + "grad_norm": 1.268273949623108, + "learning_rate": 7.970561998215879e-05, + "loss": 0.1461, + "step": 13860 + }, + { + "epoch": 2.06, + "grad_norm": 0.27585187554359436, + "learning_rate": 7.969075230449004e-05, + "loss": 0.1483, + "step": 13870 + }, + { + "epoch": 2.06, + "grad_norm": 0.27429184317588806, + "learning_rate": 7.96758846268213e-05, + "loss": 0.1434, + "step": 13880 + }, + { + "epoch": 2.06, + "grad_norm": 1.2318065166473389, + "learning_rate": 7.966101694915254e-05, + "loss": 0.1402, + "step": 13890 + }, + { + "epoch": 2.06, + "grad_norm": 0.34538692235946655, + "learning_rate": 7.96461492714838e-05, + "loss": 0.1368, + "step": 13900 + }, + { + "epoch": 2.06, + "grad_norm": 1.7623499631881714, + "learning_rate": 7.963128159381505e-05, + "loss": 0.1447, + "step": 13910 + }, + { + "epoch": 2.06, + "grad_norm": 1.2780420780181885, + "learning_rate": 7.96164139161463e-05, + "loss": 0.1466, + "step": 13920 + }, + { + "epoch": 2.06, + "grad_norm": 0.6700235605239868, + "learning_rate": 7.960154623847756e-05, + "loss": 0.1429, + "step": 13930 + }, + { + "epoch": 2.07, + "grad_norm": 0.46738964319229126, + "learning_rate": 7.95866785608088e-05, + "loss": 0.1411, + "step": 13940 + }, + { + "epoch": 2.07, + "grad_norm": 0.8376699090003967, + "learning_rate": 7.957181088314005e-05, + "loss": 0.1376, + "step": 13950 + }, + { + "epoch": 2.07, + "grad_norm": 0.43358296155929565, + "learning_rate": 7.955694320547131e-05, + "loss": 0.1482, + "step": 13960 + }, + { + "epoch": 2.07, + "grad_norm": 0.7221712470054626, + "learning_rate": 7.954207552780256e-05, + "loss": 0.139, + "step": 13970 + }, + { + "epoch": 2.07, + "grad_norm": 0.6004229187965393, + "learning_rate": 7.95272078501338e-05, + "loss": 0.1446, + "step": 13980 + }, + { + "epoch": 2.07, + "grad_norm": 0.7398319840431213, + "learning_rate": 7.951234017246506e-05, + "loss": 0.14, + "step": 13990 + }, + { + "epoch": 2.08, + "grad_norm": 0.39415350556373596, + "learning_rate": 7.949747249479631e-05, + "loss": 0.1425, + "step": 14000 + }, + { + "epoch": 2.08, + "grad_norm": 0.32933810353279114, + "learning_rate": 7.948260481712757e-05, + "loss": 0.141, + "step": 14010 + }, + { + "epoch": 2.08, + "grad_norm": 1.5092155933380127, + "learning_rate": 7.946773713945882e-05, + "loss": 0.1498, + "step": 14020 + }, + { + "epoch": 2.08, + "grad_norm": 1.36288583278656, + "learning_rate": 7.945286946179007e-05, + "loss": 0.1441, + "step": 14030 + }, + { + "epoch": 2.08, + "grad_norm": 0.4052281081676483, + "learning_rate": 7.943800178412132e-05, + "loss": 0.1387, + "step": 14040 + }, + { + "epoch": 2.08, + "grad_norm": 1.4487920999526978, + "learning_rate": 7.942313410645257e-05, + "loss": 0.1451, + "step": 14050 + }, + { + "epoch": 2.08, + "grad_norm": 0.6891205906867981, + "learning_rate": 7.940826642878383e-05, + "loss": 0.1397, + "step": 14060 + }, + { + "epoch": 2.09, + "grad_norm": 0.8474586606025696, + "learning_rate": 7.939339875111508e-05, + "loss": 0.1501, + "step": 14070 + }, + { + "epoch": 2.09, + "grad_norm": 0.4147012531757355, + "learning_rate": 7.937853107344633e-05, + "loss": 0.131, + "step": 14080 + }, + { + "epoch": 2.09, + "grad_norm": 1.2289682626724243, + "learning_rate": 7.93636633957776e-05, + "loss": 0.1447, + "step": 14090 + }, + { + "epoch": 2.09, + "grad_norm": 0.4058608114719391, + "learning_rate": 7.934879571810884e-05, + "loss": 0.144, + "step": 14100 + }, + { + "epoch": 2.09, + "grad_norm": 0.19358326494693756, + "learning_rate": 7.933392804044009e-05, + "loss": 0.1414, + "step": 14110 + }, + { + "epoch": 2.09, + "grad_norm": 0.3566335439682007, + "learning_rate": 7.931906036277133e-05, + "loss": 0.137, + "step": 14120 + }, + { + "epoch": 2.09, + "grad_norm": 0.7426595091819763, + "learning_rate": 7.93041926851026e-05, + "loss": 0.1302, + "step": 14130 + }, + { + "epoch": 2.1, + "grad_norm": 1.8978757858276367, + "learning_rate": 7.928932500743384e-05, + "loss": 0.146, + "step": 14140 + }, + { + "epoch": 2.1, + "grad_norm": 0.46663588285446167, + "learning_rate": 7.92744573297651e-05, + "loss": 0.144, + "step": 14150 + }, + { + "epoch": 2.1, + "grad_norm": 0.2703957259654999, + "learning_rate": 7.925958965209635e-05, + "loss": 0.1376, + "step": 14160 + }, + { + "epoch": 2.1, + "grad_norm": 0.3722328841686249, + "learning_rate": 7.924472197442759e-05, + "loss": 0.1486, + "step": 14170 + }, + { + "epoch": 2.1, + "grad_norm": 1.1712533235549927, + "learning_rate": 7.922985429675886e-05, + "loss": 0.1369, + "step": 14180 + }, + { + "epoch": 2.1, + "grad_norm": 0.2290862798690796, + "learning_rate": 7.92149866190901e-05, + "loss": 0.1448, + "step": 14190 + }, + { + "epoch": 2.1, + "grad_norm": 0.2642858028411865, + "learning_rate": 7.920011894142136e-05, + "loss": 0.1406, + "step": 14200 + }, + { + "epoch": 2.11, + "grad_norm": 0.7167410850524902, + "learning_rate": 7.91852512637526e-05, + "loss": 0.1376, + "step": 14210 + }, + { + "epoch": 2.11, + "grad_norm": 1.1799192428588867, + "learning_rate": 7.917038358608387e-05, + "loss": 0.1393, + "step": 14220 + }, + { + "epoch": 2.11, + "grad_norm": 0.2893933951854706, + "learning_rate": 7.91555159084151e-05, + "loss": 0.1457, + "step": 14230 + }, + { + "epoch": 2.11, + "grad_norm": 0.45935529470443726, + "learning_rate": 7.914064823074636e-05, + "loss": 0.1436, + "step": 14240 + }, + { + "epoch": 2.11, + "grad_norm": 0.2944597005844116, + "learning_rate": 7.912578055307762e-05, + "loss": 0.1412, + "step": 14250 + }, + { + "epoch": 2.11, + "grad_norm": 0.5963975191116333, + "learning_rate": 7.911091287540886e-05, + "loss": 0.1474, + "step": 14260 + }, + { + "epoch": 2.12, + "grad_norm": 0.6776736378669739, + "learning_rate": 7.909604519774013e-05, + "loss": 0.1374, + "step": 14270 + }, + { + "epoch": 2.12, + "grad_norm": 0.3224552869796753, + "learning_rate": 7.908117752007137e-05, + "loss": 0.1385, + "step": 14280 + }, + { + "epoch": 2.12, + "grad_norm": 0.4411897361278534, + "learning_rate": 7.906630984240262e-05, + "loss": 0.1445, + "step": 14290 + }, + { + "epoch": 2.12, + "grad_norm": 0.4700928330421448, + "learning_rate": 7.905144216473386e-05, + "loss": 0.1388, + "step": 14300 + }, + { + "epoch": 2.12, + "grad_norm": 0.5549743175506592, + "learning_rate": 7.903657448706513e-05, + "loss": 0.1418, + "step": 14310 + }, + { + "epoch": 2.12, + "grad_norm": 0.7805799841880798, + "learning_rate": 7.902170680939637e-05, + "loss": 0.1429, + "step": 14320 + }, + { + "epoch": 2.12, + "grad_norm": 0.6569302678108215, + "learning_rate": 7.900683913172763e-05, + "loss": 0.1413, + "step": 14330 + }, + { + "epoch": 2.13, + "grad_norm": 0.9454148411750793, + "learning_rate": 7.899197145405888e-05, + "loss": 0.1422, + "step": 14340 + }, + { + "epoch": 2.13, + "grad_norm": 0.41172319650650024, + "learning_rate": 7.897710377639012e-05, + "loss": 0.1411, + "step": 14350 + }, + { + "epoch": 2.13, + "grad_norm": 0.7529705166816711, + "learning_rate": 7.896223609872139e-05, + "loss": 0.1388, + "step": 14360 + }, + { + "epoch": 2.13, + "grad_norm": 0.5935275554656982, + "learning_rate": 7.894736842105263e-05, + "loss": 0.1448, + "step": 14370 + }, + { + "epoch": 2.13, + "grad_norm": 1.1465091705322266, + "learning_rate": 7.893250074338389e-05, + "loss": 0.132, + "step": 14380 + }, + { + "epoch": 2.13, + "grad_norm": 0.5180565714836121, + "learning_rate": 7.891763306571513e-05, + "loss": 0.1394, + "step": 14390 + }, + { + "epoch": 2.13, + "grad_norm": 0.3858262896537781, + "learning_rate": 7.89027653880464e-05, + "loss": 0.1426, + "step": 14400 + }, + { + "epoch": 2.14, + "grad_norm": 0.3078696131706238, + "learning_rate": 7.888789771037765e-05, + "loss": 0.1347, + "step": 14410 + }, + { + "epoch": 2.14, + "grad_norm": 0.2894585430622101, + "learning_rate": 7.887303003270889e-05, + "loss": 0.1412, + "step": 14420 + }, + { + "epoch": 2.14, + "grad_norm": 0.9229311943054199, + "learning_rate": 7.885816235504015e-05, + "loss": 0.145, + "step": 14430 + }, + { + "epoch": 2.14, + "grad_norm": 0.8589563369750977, + "learning_rate": 7.884329467737139e-05, + "loss": 0.1423, + "step": 14440 + }, + { + "epoch": 2.14, + "grad_norm": 0.33812132477760315, + "learning_rate": 7.882842699970266e-05, + "loss": 0.1312, + "step": 14450 + }, + { + "epoch": 2.14, + "grad_norm": 0.4707375466823578, + "learning_rate": 7.88135593220339e-05, + "loss": 0.1333, + "step": 14460 + }, + { + "epoch": 2.14, + "grad_norm": 0.35323792695999146, + "learning_rate": 7.879869164436515e-05, + "loss": 0.1503, + "step": 14470 + }, + { + "epoch": 2.15, + "grad_norm": 0.7402976751327515, + "learning_rate": 7.878382396669641e-05, + "loss": 0.1429, + "step": 14480 + }, + { + "epoch": 2.15, + "grad_norm": 0.49308282136917114, + "learning_rate": 7.876895628902766e-05, + "loss": 0.138, + "step": 14490 + }, + { + "epoch": 2.15, + "grad_norm": 0.824302613735199, + "learning_rate": 7.875408861135892e-05, + "loss": 0.1456, + "step": 14500 + }, + { + "epoch": 2.15, + "grad_norm": 1.5500679016113281, + "learning_rate": 7.873922093369016e-05, + "loss": 0.1466, + "step": 14510 + }, + { + "epoch": 2.15, + "grad_norm": 0.34378743171691895, + "learning_rate": 7.872435325602141e-05, + "loss": 0.1382, + "step": 14520 + }, + { + "epoch": 2.15, + "grad_norm": 0.48903822898864746, + "learning_rate": 7.870948557835265e-05, + "loss": 0.1398, + "step": 14530 + }, + { + "epoch": 2.16, + "grad_norm": 0.954559862613678, + "learning_rate": 7.869461790068392e-05, + "loss": 0.1406, + "step": 14540 + }, + { + "epoch": 2.16, + "grad_norm": 0.3512939214706421, + "learning_rate": 7.867975022301516e-05, + "loss": 0.1404, + "step": 14550 + }, + { + "epoch": 2.16, + "grad_norm": 1.6540653705596924, + "learning_rate": 7.866488254534642e-05, + "loss": 0.1399, + "step": 14560 + }, + { + "epoch": 2.16, + "grad_norm": 1.4381154775619507, + "learning_rate": 7.865001486767767e-05, + "loss": 0.1456, + "step": 14570 + }, + { + "epoch": 2.16, + "grad_norm": 0.2611878514289856, + "learning_rate": 7.863514719000893e-05, + "loss": 0.1436, + "step": 14580 + }, + { + "epoch": 2.16, + "grad_norm": 1.1198517084121704, + "learning_rate": 7.862027951234018e-05, + "loss": 0.1421, + "step": 14590 + }, + { + "epoch": 2.16, + "grad_norm": 1.4881504774093628, + "learning_rate": 7.860541183467142e-05, + "loss": 0.1431, + "step": 14600 + }, + { + "epoch": 2.17, + "grad_norm": 0.5614832043647766, + "learning_rate": 7.859054415700268e-05, + "loss": 0.1444, + "step": 14610 + }, + { + "epoch": 2.17, + "grad_norm": 1.2753747701644897, + "learning_rate": 7.857567647933393e-05, + "loss": 0.1355, + "step": 14620 + }, + { + "epoch": 2.17, + "grad_norm": 0.7569044232368469, + "learning_rate": 7.856080880166519e-05, + "loss": 0.1403, + "step": 14630 + }, + { + "epoch": 2.17, + "grad_norm": 0.8014097213745117, + "learning_rate": 7.854594112399643e-05, + "loss": 0.1444, + "step": 14640 + }, + { + "epoch": 2.17, + "grad_norm": 0.4055616855621338, + "learning_rate": 7.853107344632768e-05, + "loss": 0.1384, + "step": 14650 + }, + { + "epoch": 2.17, + "grad_norm": 0.8355204463005066, + "learning_rate": 7.851620576865894e-05, + "loss": 0.1374, + "step": 14660 + }, + { + "epoch": 2.17, + "grad_norm": 0.5635973215103149, + "learning_rate": 7.85013380909902e-05, + "loss": 0.1453, + "step": 14670 + }, + { + "epoch": 2.18, + "grad_norm": 0.3572874069213867, + "learning_rate": 7.848647041332145e-05, + "loss": 0.1474, + "step": 14680 + }, + { + "epoch": 2.18, + "grad_norm": 0.31111082434654236, + "learning_rate": 7.847160273565269e-05, + "loss": 0.1436, + "step": 14690 + }, + { + "epoch": 2.18, + "grad_norm": 0.38512876629829407, + "learning_rate": 7.845673505798394e-05, + "loss": 0.138, + "step": 14700 + }, + { + "epoch": 2.18, + "grad_norm": 0.26960206031799316, + "learning_rate": 7.84418673803152e-05, + "loss": 0.147, + "step": 14710 + }, + { + "epoch": 2.18, + "grad_norm": 0.29528844356536865, + "learning_rate": 7.842699970264645e-05, + "loss": 0.1416, + "step": 14720 + }, + { + "epoch": 2.18, + "grad_norm": 0.3126089572906494, + "learning_rate": 7.841213202497771e-05, + "loss": 0.1417, + "step": 14730 + }, + { + "epoch": 2.18, + "grad_norm": 1.1311891078948975, + "learning_rate": 7.839726434730895e-05, + "loss": 0.1365, + "step": 14740 + }, + { + "epoch": 2.19, + "grad_norm": 0.27010664343833923, + "learning_rate": 7.83823966696402e-05, + "loss": 0.1367, + "step": 14750 + }, + { + "epoch": 2.19, + "grad_norm": 1.2416595220565796, + "learning_rate": 7.836752899197146e-05, + "loss": 0.137, + "step": 14760 + }, + { + "epoch": 2.19, + "grad_norm": 0.2980870306491852, + "learning_rate": 7.835266131430271e-05, + "loss": 0.1486, + "step": 14770 + }, + { + "epoch": 2.19, + "grad_norm": 0.44527989625930786, + "learning_rate": 7.833779363663396e-05, + "loss": 0.1392, + "step": 14780 + }, + { + "epoch": 2.19, + "grad_norm": 0.36573994159698486, + "learning_rate": 7.832292595896521e-05, + "loss": 0.1469, + "step": 14790 + }, + { + "epoch": 2.19, + "grad_norm": 0.3939895033836365, + "learning_rate": 7.830805828129647e-05, + "loss": 0.1449, + "step": 14800 + }, + { + "epoch": 2.2, + "grad_norm": 0.5952978134155273, + "learning_rate": 7.829319060362772e-05, + "loss": 0.1399, + "step": 14810 + }, + { + "epoch": 2.2, + "grad_norm": 0.4172046184539795, + "learning_rate": 7.827832292595897e-05, + "loss": 0.1417, + "step": 14820 + }, + { + "epoch": 2.2, + "grad_norm": 1.6859674453735352, + "learning_rate": 7.826345524829022e-05, + "loss": 0.1503, + "step": 14830 + }, + { + "epoch": 2.2, + "grad_norm": 0.4280993640422821, + "learning_rate": 7.824858757062147e-05, + "loss": 0.1466, + "step": 14840 + }, + { + "epoch": 2.2, + "grad_norm": 0.28708526492118835, + "learning_rate": 7.823371989295273e-05, + "loss": 0.1401, + "step": 14850 + }, + { + "epoch": 2.2, + "grad_norm": 1.4044026136398315, + "learning_rate": 7.821885221528398e-05, + "loss": 0.142, + "step": 14860 + }, + { + "epoch": 2.2, + "grad_norm": 0.4096578061580658, + "learning_rate": 7.820398453761522e-05, + "loss": 0.1433, + "step": 14870 + }, + { + "epoch": 2.21, + "grad_norm": 0.49273645877838135, + "learning_rate": 7.818911685994648e-05, + "loss": 0.1411, + "step": 14880 + }, + { + "epoch": 2.21, + "grad_norm": 1.1558241844177246, + "learning_rate": 7.817424918227773e-05, + "loss": 0.1325, + "step": 14890 + }, + { + "epoch": 2.21, + "grad_norm": 0.2521809935569763, + "learning_rate": 7.815938150460899e-05, + "loss": 0.1436, + "step": 14900 + }, + { + "epoch": 2.21, + "grad_norm": 1.3384026288986206, + "learning_rate": 7.814451382694024e-05, + "loss": 0.1353, + "step": 14910 + }, + { + "epoch": 2.21, + "grad_norm": 0.9027255177497864, + "learning_rate": 7.812964614927148e-05, + "loss": 0.1396, + "step": 14920 + }, + { + "epoch": 2.21, + "grad_norm": 1.5526256561279297, + "learning_rate": 7.811477847160274e-05, + "loss": 0.1385, + "step": 14930 + }, + { + "epoch": 2.21, + "grad_norm": 1.069063425064087, + "learning_rate": 7.809991079393399e-05, + "loss": 0.1359, + "step": 14940 + }, + { + "epoch": 2.22, + "grad_norm": 0.2974967956542969, + "learning_rate": 7.808504311626525e-05, + "loss": 0.1342, + "step": 14950 + }, + { + "epoch": 2.22, + "grad_norm": 0.25086086988449097, + "learning_rate": 7.807017543859649e-05, + "loss": 0.1332, + "step": 14960 + }, + { + "epoch": 2.22, + "grad_norm": 0.6883974671363831, + "learning_rate": 7.805530776092774e-05, + "loss": 0.1399, + "step": 14970 + }, + { + "epoch": 2.22, + "grad_norm": 0.8510827422142029, + "learning_rate": 7.8040440083259e-05, + "loss": 0.1441, + "step": 14980 + }, + { + "epoch": 2.22, + "grad_norm": 1.4618726968765259, + "learning_rate": 7.802557240559025e-05, + "loss": 0.147, + "step": 14990 + }, + { + "epoch": 2.22, + "grad_norm": 0.8736234307289124, + "learning_rate": 7.80107047279215e-05, + "loss": 0.1422, + "step": 15000 + }, + { + "epoch": 2.22, + "grad_norm": 0.5815086364746094, + "learning_rate": 7.799583705025275e-05, + "loss": 0.1413, + "step": 15010 + }, + { + "epoch": 2.23, + "grad_norm": 0.29546529054641724, + "learning_rate": 7.7980969372584e-05, + "loss": 0.1404, + "step": 15020 + }, + { + "epoch": 2.23, + "grad_norm": 0.3832775056362152, + "learning_rate": 7.796610169491526e-05, + "loss": 0.1327, + "step": 15030 + }, + { + "epoch": 2.23, + "grad_norm": 0.2785964608192444, + "learning_rate": 7.795123401724651e-05, + "loss": 0.1329, + "step": 15040 + }, + { + "epoch": 2.23, + "grad_norm": 1.5065478086471558, + "learning_rate": 7.793636633957775e-05, + "loss": 0.1527, + "step": 15050 + }, + { + "epoch": 2.23, + "grad_norm": 1.5339558124542236, + "learning_rate": 7.792149866190901e-05, + "loss": 0.1431, + "step": 15060 + }, + { + "epoch": 2.23, + "grad_norm": 1.5240484476089478, + "learning_rate": 7.790663098424028e-05, + "loss": 0.1465, + "step": 15070 + }, + { + "epoch": 2.24, + "grad_norm": 0.3933528661727905, + "learning_rate": 7.789176330657152e-05, + "loss": 0.1408, + "step": 15080 + }, + { + "epoch": 2.24, + "grad_norm": 0.600188672542572, + "learning_rate": 7.787689562890277e-05, + "loss": 0.1413, + "step": 15090 + }, + { + "epoch": 2.24, + "grad_norm": 0.3889021575450897, + "learning_rate": 7.786202795123401e-05, + "loss": 0.1377, + "step": 15100 + }, + { + "epoch": 2.24, + "grad_norm": 0.4499037265777588, + "learning_rate": 7.784716027356528e-05, + "loss": 0.1401, + "step": 15110 + }, + { + "epoch": 2.24, + "grad_norm": 0.772075355052948, + "learning_rate": 7.783229259589652e-05, + "loss": 0.1359, + "step": 15120 + }, + { + "epoch": 2.24, + "grad_norm": 0.5900905132293701, + "learning_rate": 7.781742491822778e-05, + "loss": 0.1374, + "step": 15130 + }, + { + "epoch": 2.24, + "grad_norm": 1.9632419347763062, + "learning_rate": 7.780255724055903e-05, + "loss": 0.1485, + "step": 15140 + }, + { + "epoch": 2.25, + "grad_norm": 1.079269289970398, + "learning_rate": 7.778768956289027e-05, + "loss": 0.1428, + "step": 15150 + }, + { + "epoch": 2.25, + "grad_norm": 0.5753262042999268, + "learning_rate": 7.777282188522154e-05, + "loss": 0.1429, + "step": 15160 + }, + { + "epoch": 2.25, + "grad_norm": 0.28684714436531067, + "learning_rate": 7.775795420755278e-05, + "loss": 0.1363, + "step": 15170 + }, + { + "epoch": 2.25, + "grad_norm": 0.2784688174724579, + "learning_rate": 7.774308652988404e-05, + "loss": 0.1447, + "step": 15180 + }, + { + "epoch": 2.25, + "grad_norm": 0.4321437478065491, + "learning_rate": 7.772821885221528e-05, + "loss": 0.1437, + "step": 15190 + }, + { + "epoch": 2.25, + "grad_norm": 0.4593127369880676, + "learning_rate": 7.771335117454655e-05, + "loss": 0.1394, + "step": 15200 + }, + { + "epoch": 2.25, + "grad_norm": 0.6981044411659241, + "learning_rate": 7.769848349687779e-05, + "loss": 0.1469, + "step": 15210 + }, + { + "epoch": 2.26, + "grad_norm": 0.9199114441871643, + "learning_rate": 7.768361581920904e-05, + "loss": 0.1396, + "step": 15220 + }, + { + "epoch": 2.26, + "grad_norm": 0.5728020668029785, + "learning_rate": 7.76687481415403e-05, + "loss": 0.1433, + "step": 15230 + }, + { + "epoch": 2.26, + "grad_norm": 0.3551998734474182, + "learning_rate": 7.765388046387154e-05, + "loss": 0.1418, + "step": 15240 + }, + { + "epoch": 2.26, + "grad_norm": 0.25206756591796875, + "learning_rate": 7.763901278620281e-05, + "loss": 0.1405, + "step": 15250 + }, + { + "epoch": 2.26, + "grad_norm": 0.40802913904190063, + "learning_rate": 7.762414510853405e-05, + "loss": 0.1406, + "step": 15260 + }, + { + "epoch": 2.26, + "grad_norm": 0.4538644552230835, + "learning_rate": 7.76092774308653e-05, + "loss": 0.1414, + "step": 15270 + }, + { + "epoch": 2.26, + "grad_norm": 0.5877019166946411, + "learning_rate": 7.759440975319655e-05, + "loss": 0.1398, + "step": 15280 + }, + { + "epoch": 2.27, + "grad_norm": 1.7143536806106567, + "learning_rate": 7.757954207552781e-05, + "loss": 0.1444, + "step": 15290 + }, + { + "epoch": 2.27, + "grad_norm": 0.8385623693466187, + "learning_rate": 7.756467439785905e-05, + "loss": 0.1453, + "step": 15300 + }, + { + "epoch": 2.27, + "grad_norm": 0.5438138246536255, + "learning_rate": 7.754980672019031e-05, + "loss": 0.1476, + "step": 15310 + }, + { + "epoch": 2.27, + "grad_norm": 1.067368507385254, + "learning_rate": 7.753493904252156e-05, + "loss": 0.1493, + "step": 15320 + }, + { + "epoch": 2.27, + "grad_norm": 0.43237921595573425, + "learning_rate": 7.75200713648528e-05, + "loss": 0.1418, + "step": 15330 + }, + { + "epoch": 2.27, + "grad_norm": 0.7449706792831421, + "learning_rate": 7.750520368718407e-05, + "loss": 0.1372, + "step": 15340 + }, + { + "epoch": 2.28, + "grad_norm": 1.315883994102478, + "learning_rate": 7.749033600951531e-05, + "loss": 0.1482, + "step": 15350 + }, + { + "epoch": 2.28, + "grad_norm": 0.20625711977481842, + "learning_rate": 7.747546833184657e-05, + "loss": 0.1374, + "step": 15360 + }, + { + "epoch": 2.28, + "grad_norm": 0.29170262813568115, + "learning_rate": 7.746060065417781e-05, + "loss": 0.1434, + "step": 15370 + }, + { + "epoch": 2.28, + "grad_norm": 0.6575577855110168, + "learning_rate": 7.744573297650908e-05, + "loss": 0.1411, + "step": 15380 + }, + { + "epoch": 2.28, + "grad_norm": 0.36040765047073364, + "learning_rate": 7.743086529884032e-05, + "loss": 0.1387, + "step": 15390 + }, + { + "epoch": 2.28, + "grad_norm": 0.6309950947761536, + "learning_rate": 7.741599762117158e-05, + "loss": 0.1478, + "step": 15400 + }, + { + "epoch": 2.28, + "grad_norm": 0.2758764624595642, + "learning_rate": 7.740112994350283e-05, + "loss": 0.1437, + "step": 15410 + }, + { + "epoch": 2.29, + "grad_norm": 0.648673951625824, + "learning_rate": 7.738626226583407e-05, + "loss": 0.1365, + "step": 15420 + }, + { + "epoch": 2.29, + "grad_norm": 1.0597361326217651, + "learning_rate": 7.737139458816534e-05, + "loss": 0.1408, + "step": 15430 + }, + { + "epoch": 2.29, + "grad_norm": 0.3122524321079254, + "learning_rate": 7.735652691049658e-05, + "loss": 0.1481, + "step": 15440 + }, + { + "epoch": 2.29, + "grad_norm": 0.9538226127624512, + "learning_rate": 7.734165923282784e-05, + "loss": 0.149, + "step": 15450 + }, + { + "epoch": 2.29, + "grad_norm": 0.28215184807777405, + "learning_rate": 7.732679155515909e-05, + "loss": 0.1316, + "step": 15460 + }, + { + "epoch": 2.29, + "grad_norm": 0.4941287338733673, + "learning_rate": 7.731192387749034e-05, + "loss": 0.1409, + "step": 15470 + }, + { + "epoch": 2.29, + "grad_norm": 0.3369395136833191, + "learning_rate": 7.72970561998216e-05, + "loss": 0.145, + "step": 15480 + }, + { + "epoch": 2.3, + "grad_norm": 0.8342441320419312, + "learning_rate": 7.728218852215284e-05, + "loss": 0.1377, + "step": 15490 + }, + { + "epoch": 2.3, + "grad_norm": 2.002681255340576, + "learning_rate": 7.72673208444841e-05, + "loss": 0.1436, + "step": 15500 + }, + { + "epoch": 2.3, + "grad_norm": 1.1296112537384033, + "learning_rate": 7.725245316681534e-05, + "loss": 0.1372, + "step": 15510 + }, + { + "epoch": 2.3, + "grad_norm": 0.4747876822948456, + "learning_rate": 7.72375854891466e-05, + "loss": 0.1391, + "step": 15520 + }, + { + "epoch": 2.3, + "grad_norm": 0.37686342000961304, + "learning_rate": 7.722271781147785e-05, + "loss": 0.1417, + "step": 15530 + }, + { + "epoch": 2.3, + "grad_norm": 0.6419855952262878, + "learning_rate": 7.72078501338091e-05, + "loss": 0.1507, + "step": 15540 + }, + { + "epoch": 2.3, + "grad_norm": 0.5197919011116028, + "learning_rate": 7.719298245614036e-05, + "loss": 0.1406, + "step": 15550 + }, + { + "epoch": 2.31, + "grad_norm": 0.6364248991012573, + "learning_rate": 7.717811477847161e-05, + "loss": 0.139, + "step": 15560 + }, + { + "epoch": 2.31, + "grad_norm": 0.2479657530784607, + "learning_rate": 7.716324710080287e-05, + "loss": 0.1347, + "step": 15570 + }, + { + "epoch": 2.31, + "grad_norm": 0.46477001905441284, + "learning_rate": 7.71483794231341e-05, + "loss": 0.1431, + "step": 15580 + }, + { + "epoch": 2.31, + "grad_norm": 0.32222843170166016, + "learning_rate": 7.713351174546536e-05, + "loss": 0.1487, + "step": 15590 + }, + { + "epoch": 2.31, + "grad_norm": 0.847084641456604, + "learning_rate": 7.711864406779662e-05, + "loss": 0.1503, + "step": 15600 + }, + { + "epoch": 2.31, + "grad_norm": 0.4260410964488983, + "learning_rate": 7.710377639012787e-05, + "loss": 0.1367, + "step": 15610 + }, + { + "epoch": 2.32, + "grad_norm": 0.7543812394142151, + "learning_rate": 7.708890871245911e-05, + "loss": 0.1353, + "step": 15620 + }, + { + "epoch": 2.32, + "grad_norm": 1.0072932243347168, + "learning_rate": 7.707404103479037e-05, + "loss": 0.1471, + "step": 15630 + }, + { + "epoch": 2.32, + "grad_norm": 0.30913281440734863, + "learning_rate": 7.705917335712162e-05, + "loss": 0.145, + "step": 15640 + }, + { + "epoch": 2.32, + "grad_norm": 0.6405145525932312, + "learning_rate": 7.704430567945288e-05, + "loss": 0.1342, + "step": 15650 + }, + { + "epoch": 2.32, + "grad_norm": 1.67426598072052, + "learning_rate": 7.702943800178413e-05, + "loss": 0.1342, + "step": 15660 + }, + { + "epoch": 2.32, + "grad_norm": 0.5174002647399902, + "learning_rate": 7.701457032411537e-05, + "loss": 0.1419, + "step": 15670 + }, + { + "epoch": 2.32, + "grad_norm": 0.7070042490959167, + "learning_rate": 7.699970264644663e-05, + "loss": 0.1388, + "step": 15680 + }, + { + "epoch": 2.33, + "grad_norm": 0.992650032043457, + "learning_rate": 7.698483496877788e-05, + "loss": 0.1361, + "step": 15690 + }, + { + "epoch": 2.33, + "grad_norm": 0.519932210445404, + "learning_rate": 7.696996729110914e-05, + "loss": 0.1409, + "step": 15700 + }, + { + "epoch": 2.33, + "grad_norm": 0.5189968943595886, + "learning_rate": 7.695509961344038e-05, + "loss": 0.14, + "step": 15710 + }, + { + "epoch": 2.33, + "grad_norm": 1.0756185054779053, + "learning_rate": 7.694023193577163e-05, + "loss": 0.1407, + "step": 15720 + }, + { + "epoch": 2.33, + "grad_norm": 0.6444623470306396, + "learning_rate": 7.692536425810289e-05, + "loss": 0.1466, + "step": 15730 + }, + { + "epoch": 2.33, + "grad_norm": 0.3298320174217224, + "learning_rate": 7.691049658043414e-05, + "loss": 0.136, + "step": 15740 + }, + { + "epoch": 2.33, + "grad_norm": 0.20186583697795868, + "learning_rate": 7.68956289027654e-05, + "loss": 0.1393, + "step": 15750 + }, + { + "epoch": 2.34, + "grad_norm": 0.335825115442276, + "learning_rate": 7.688076122509664e-05, + "loss": 0.1377, + "step": 15760 + }, + { + "epoch": 2.34, + "grad_norm": 1.36127769947052, + "learning_rate": 7.686589354742789e-05, + "loss": 0.1457, + "step": 15770 + }, + { + "epoch": 2.34, + "grad_norm": 1.2447770833969116, + "learning_rate": 7.685102586975915e-05, + "loss": 0.144, + "step": 15780 + }, + { + "epoch": 2.34, + "grad_norm": 0.7446007132530212, + "learning_rate": 7.68361581920904e-05, + "loss": 0.1407, + "step": 15790 + }, + { + "epoch": 2.34, + "grad_norm": 1.47913658618927, + "learning_rate": 7.682129051442166e-05, + "loss": 0.1473, + "step": 15800 + }, + { + "epoch": 2.34, + "grad_norm": 0.7737151980400085, + "learning_rate": 7.68064228367529e-05, + "loss": 0.138, + "step": 15810 + }, + { + "epoch": 2.35, + "grad_norm": 0.3995475172996521, + "learning_rate": 7.679155515908415e-05, + "loss": 0.1473, + "step": 15820 + }, + { + "epoch": 2.35, + "grad_norm": 0.6491892337799072, + "learning_rate": 7.677668748141541e-05, + "loss": 0.1408, + "step": 15830 + }, + { + "epoch": 2.35, + "grad_norm": 0.3302996754646301, + "learning_rate": 7.676181980374666e-05, + "loss": 0.1456, + "step": 15840 + }, + { + "epoch": 2.35, + "grad_norm": 1.0726025104522705, + "learning_rate": 7.67469521260779e-05, + "loss": 0.1448, + "step": 15850 + }, + { + "epoch": 2.35, + "grad_norm": 0.29980069398880005, + "learning_rate": 7.673208444840916e-05, + "loss": 0.1394, + "step": 15860 + }, + { + "epoch": 2.35, + "grad_norm": 0.3691909611225128, + "learning_rate": 7.671721677074041e-05, + "loss": 0.139, + "step": 15870 + }, + { + "epoch": 2.35, + "grad_norm": 0.26005685329437256, + "learning_rate": 7.670234909307167e-05, + "loss": 0.1369, + "step": 15880 + }, + { + "epoch": 2.36, + "grad_norm": 0.5521954894065857, + "learning_rate": 7.668748141540292e-05, + "loss": 0.1394, + "step": 15890 + }, + { + "epoch": 2.36, + "grad_norm": 0.44873157143592834, + "learning_rate": 7.667261373773416e-05, + "loss": 0.1408, + "step": 15900 + }, + { + "epoch": 2.36, + "grad_norm": 0.8657715916633606, + "learning_rate": 7.665774606006542e-05, + "loss": 0.1368, + "step": 15910 + }, + { + "epoch": 2.36, + "grad_norm": 0.876879870891571, + "learning_rate": 7.664287838239667e-05, + "loss": 0.1395, + "step": 15920 + }, + { + "epoch": 2.36, + "grad_norm": 0.5348753929138184, + "learning_rate": 7.662801070472793e-05, + "loss": 0.147, + "step": 15930 + }, + { + "epoch": 2.36, + "grad_norm": 1.3844401836395264, + "learning_rate": 7.661314302705917e-05, + "loss": 0.1448, + "step": 15940 + }, + { + "epoch": 2.36, + "grad_norm": 0.532243013381958, + "learning_rate": 7.659827534939042e-05, + "loss": 0.1453, + "step": 15950 + }, + { + "epoch": 2.37, + "grad_norm": 0.49074360728263855, + "learning_rate": 7.658340767172168e-05, + "loss": 0.1393, + "step": 15960 + }, + { + "epoch": 2.37, + "grad_norm": 0.7487906217575073, + "learning_rate": 7.656853999405293e-05, + "loss": 0.1369, + "step": 15970 + }, + { + "epoch": 2.37, + "grad_norm": 0.24927030503749847, + "learning_rate": 7.655367231638419e-05, + "loss": 0.146, + "step": 15980 + }, + { + "epoch": 2.37, + "grad_norm": 0.3706786036491394, + "learning_rate": 7.653880463871543e-05, + "loss": 0.1432, + "step": 15990 + }, + { + "epoch": 2.37, + "grad_norm": 0.342334508895874, + "learning_rate": 7.652393696104668e-05, + "loss": 0.1466, + "step": 16000 + }, + { + "epoch": 2.37, + "grad_norm": 0.24996308982372284, + "learning_rate": 7.650906928337794e-05, + "loss": 0.1383, + "step": 16010 + }, + { + "epoch": 2.37, + "grad_norm": 0.39586570858955383, + "learning_rate": 7.64942016057092e-05, + "loss": 0.1392, + "step": 16020 + }, + { + "epoch": 2.38, + "grad_norm": 0.6906977891921997, + "learning_rate": 7.647933392804044e-05, + "loss": 0.1363, + "step": 16030 + }, + { + "epoch": 2.38, + "grad_norm": 1.0736937522888184, + "learning_rate": 7.646446625037169e-05, + "loss": 0.1384, + "step": 16040 + }, + { + "epoch": 2.38, + "grad_norm": 0.8316856622695923, + "learning_rate": 7.644959857270295e-05, + "loss": 0.1358, + "step": 16050 + }, + { + "epoch": 2.38, + "grad_norm": 0.44582003355026245, + "learning_rate": 7.64347308950342e-05, + "loss": 0.1439, + "step": 16060 + }, + { + "epoch": 2.38, + "grad_norm": 0.3568483591079712, + "learning_rate": 7.641986321736545e-05, + "loss": 0.1418, + "step": 16070 + }, + { + "epoch": 2.38, + "grad_norm": 0.5303836464881897, + "learning_rate": 7.64049955396967e-05, + "loss": 0.1381, + "step": 16080 + }, + { + "epoch": 2.39, + "grad_norm": 0.48559141159057617, + "learning_rate": 7.639012786202796e-05, + "loss": 0.1312, + "step": 16090 + }, + { + "epoch": 2.39, + "grad_norm": 0.39514291286468506, + "learning_rate": 7.63752601843592e-05, + "loss": 0.1452, + "step": 16100 + }, + { + "epoch": 2.39, + "grad_norm": 0.7182896137237549, + "learning_rate": 7.636039250669046e-05, + "loss": 0.133, + "step": 16110 + }, + { + "epoch": 2.39, + "grad_norm": 1.0193687677383423, + "learning_rate": 7.634552482902171e-05, + "loss": 0.1429, + "step": 16120 + }, + { + "epoch": 2.39, + "grad_norm": 0.29207757115364075, + "learning_rate": 7.633065715135296e-05, + "loss": 0.1457, + "step": 16130 + }, + { + "epoch": 2.39, + "grad_norm": 0.8084988594055176, + "learning_rate": 7.631578947368422e-05, + "loss": 0.1446, + "step": 16140 + }, + { + "epoch": 2.39, + "grad_norm": 0.3876637816429138, + "learning_rate": 7.630092179601547e-05, + "loss": 0.1339, + "step": 16150 + }, + { + "epoch": 2.4, + "grad_norm": 1.8244117498397827, + "learning_rate": 7.628605411834672e-05, + "loss": 0.1456, + "step": 16160 + }, + { + "epoch": 2.4, + "grad_norm": 0.2684149444103241, + "learning_rate": 7.627118644067796e-05, + "loss": 0.1347, + "step": 16170 + }, + { + "epoch": 2.4, + "grad_norm": 0.710655927658081, + "learning_rate": 7.625631876300923e-05, + "loss": 0.1438, + "step": 16180 + }, + { + "epoch": 2.4, + "grad_norm": 1.0059056282043457, + "learning_rate": 7.624145108534047e-05, + "loss": 0.1427, + "step": 16190 + }, + { + "epoch": 2.4, + "grad_norm": 0.8452532887458801, + "learning_rate": 7.622658340767173e-05, + "loss": 0.1484, + "step": 16200 + }, + { + "epoch": 2.4, + "grad_norm": 0.28337419033050537, + "learning_rate": 7.621171573000298e-05, + "loss": 0.1421, + "step": 16210 + }, + { + "epoch": 2.4, + "grad_norm": 0.3758450448513031, + "learning_rate": 7.619684805233422e-05, + "loss": 0.1385, + "step": 16220 + }, + { + "epoch": 2.41, + "grad_norm": 0.6121477484703064, + "learning_rate": 7.618198037466549e-05, + "loss": 0.1454, + "step": 16230 + }, + { + "epoch": 2.41, + "grad_norm": 0.5063532590866089, + "learning_rate": 7.616711269699673e-05, + "loss": 0.1422, + "step": 16240 + }, + { + "epoch": 2.41, + "grad_norm": 1.5977672338485718, + "learning_rate": 7.615224501932799e-05, + "loss": 0.1472, + "step": 16250 + }, + { + "epoch": 2.41, + "grad_norm": 0.38274484872817993, + "learning_rate": 7.613737734165923e-05, + "loss": 0.1479, + "step": 16260 + }, + { + "epoch": 2.41, + "grad_norm": 0.6593810319900513, + "learning_rate": 7.61225096639905e-05, + "loss": 0.1436, + "step": 16270 + }, + { + "epoch": 2.41, + "grad_norm": 0.3354584276676178, + "learning_rate": 7.610764198632174e-05, + "loss": 0.1341, + "step": 16280 + }, + { + "epoch": 2.41, + "grad_norm": 0.642591655254364, + "learning_rate": 7.609277430865299e-05, + "loss": 0.1391, + "step": 16290 + }, + { + "epoch": 2.42, + "grad_norm": 0.25391778349876404, + "learning_rate": 7.607790663098425e-05, + "loss": 0.1409, + "step": 16300 + }, + { + "epoch": 2.42, + "grad_norm": 0.478242427110672, + "learning_rate": 7.606303895331549e-05, + "loss": 0.1324, + "step": 16310 + }, + { + "epoch": 2.42, + "grad_norm": 1.1422983407974243, + "learning_rate": 7.604817127564676e-05, + "loss": 0.1378, + "step": 16320 + }, + { + "epoch": 2.42, + "grad_norm": 0.5172997713088989, + "learning_rate": 7.6033303597978e-05, + "loss": 0.142, + "step": 16330 + }, + { + "epoch": 2.42, + "grad_norm": 0.5715184807777405, + "learning_rate": 7.601843592030925e-05, + "loss": 0.1425, + "step": 16340 + }, + { + "epoch": 2.42, + "grad_norm": 0.6750907897949219, + "learning_rate": 7.60035682426405e-05, + "loss": 0.1496, + "step": 16350 + }, + { + "epoch": 2.43, + "grad_norm": 0.7555719017982483, + "learning_rate": 7.598870056497176e-05, + "loss": 0.1384, + "step": 16360 + }, + { + "epoch": 2.43, + "grad_norm": 0.9847421050071716, + "learning_rate": 7.5973832887303e-05, + "loss": 0.1445, + "step": 16370 + }, + { + "epoch": 2.43, + "grad_norm": 2.2487268447875977, + "learning_rate": 7.595896520963426e-05, + "loss": 0.1437, + "step": 16380 + }, + { + "epoch": 2.43, + "grad_norm": 0.2849732041358948, + "learning_rate": 7.594409753196551e-05, + "loss": 0.1373, + "step": 16390 + }, + { + "epoch": 2.43, + "grad_norm": 0.9714717268943787, + "learning_rate": 7.592922985429675e-05, + "loss": 0.1395, + "step": 16400 + }, + { + "epoch": 2.43, + "grad_norm": 0.6926866173744202, + "learning_rate": 7.591436217662802e-05, + "loss": 0.1417, + "step": 16410 + }, + { + "epoch": 2.43, + "grad_norm": 0.687318742275238, + "learning_rate": 7.589949449895926e-05, + "loss": 0.1339, + "step": 16420 + }, + { + "epoch": 2.44, + "grad_norm": 0.5382028222084045, + "learning_rate": 7.588462682129052e-05, + "loss": 0.1466, + "step": 16430 + }, + { + "epoch": 2.44, + "grad_norm": 0.7722700238227844, + "learning_rate": 7.586975914362177e-05, + "loss": 0.1394, + "step": 16440 + }, + { + "epoch": 2.44, + "grad_norm": 0.8510586619377136, + "learning_rate": 7.585489146595303e-05, + "loss": 0.1389, + "step": 16450 + }, + { + "epoch": 2.44, + "grad_norm": 0.35136356949806213, + "learning_rate": 7.584002378828428e-05, + "loss": 0.1414, + "step": 16460 + }, + { + "epoch": 2.44, + "grad_norm": 0.2880131006240845, + "learning_rate": 7.582515611061552e-05, + "loss": 0.1411, + "step": 16470 + }, + { + "epoch": 2.44, + "grad_norm": 0.2339642494916916, + "learning_rate": 7.581028843294678e-05, + "loss": 0.14, + "step": 16480 + }, + { + "epoch": 2.44, + "grad_norm": 0.9338073134422302, + "learning_rate": 7.579542075527802e-05, + "loss": 0.1414, + "step": 16490 + }, + { + "epoch": 2.45, + "grad_norm": 0.46103188395500183, + "learning_rate": 7.578055307760929e-05, + "loss": 0.1396, + "step": 16500 + }, + { + "epoch": 2.45, + "grad_norm": 0.4752241373062134, + "learning_rate": 7.576568539994053e-05, + "loss": 0.1455, + "step": 16510 + }, + { + "epoch": 2.45, + "grad_norm": 0.4363197684288025, + "learning_rate": 7.575081772227178e-05, + "loss": 0.1391, + "step": 16520 + }, + { + "epoch": 2.45, + "grad_norm": 0.7979761362075806, + "learning_rate": 7.573595004460304e-05, + "loss": 0.1379, + "step": 16530 + }, + { + "epoch": 2.45, + "grad_norm": 0.3751250207424164, + "learning_rate": 7.572108236693429e-05, + "loss": 0.1444, + "step": 16540 + }, + { + "epoch": 2.45, + "grad_norm": 0.4037092328071594, + "learning_rate": 7.570621468926555e-05, + "loss": 0.1438, + "step": 16550 + }, + { + "epoch": 2.45, + "grad_norm": 0.4799390137195587, + "learning_rate": 7.569134701159679e-05, + "loss": 0.1403, + "step": 16560 + }, + { + "epoch": 2.46, + "grad_norm": 0.8113133311271667, + "learning_rate": 7.567647933392804e-05, + "loss": 0.1372, + "step": 16570 + }, + { + "epoch": 2.46, + "grad_norm": 1.4397985935211182, + "learning_rate": 7.56616116562593e-05, + "loss": 0.1379, + "step": 16580 + }, + { + "epoch": 2.46, + "grad_norm": 0.3244461417198181, + "learning_rate": 7.564674397859055e-05, + "loss": 0.14, + "step": 16590 + }, + { + "epoch": 2.46, + "grad_norm": 0.3702487647533417, + "learning_rate": 7.56318763009218e-05, + "loss": 0.1388, + "step": 16600 + }, + { + "epoch": 2.46, + "grad_norm": 0.5789926648139954, + "learning_rate": 7.561700862325305e-05, + "loss": 0.1455, + "step": 16610 + }, + { + "epoch": 2.46, + "grad_norm": 0.3471226096153259, + "learning_rate": 7.56021409455843e-05, + "loss": 0.1339, + "step": 16620 + }, + { + "epoch": 2.47, + "grad_norm": 0.26994770765304565, + "learning_rate": 7.558727326791556e-05, + "loss": 0.1378, + "step": 16630 + }, + { + "epoch": 2.47, + "grad_norm": 0.5144693851470947, + "learning_rate": 7.557240559024681e-05, + "loss": 0.1468, + "step": 16640 + }, + { + "epoch": 2.47, + "grad_norm": 0.29384931921958923, + "learning_rate": 7.555753791257805e-05, + "loss": 0.1399, + "step": 16650 + }, + { + "epoch": 2.47, + "grad_norm": 0.8755843639373779, + "learning_rate": 7.554267023490931e-05, + "loss": 0.1365, + "step": 16660 + }, + { + "epoch": 2.47, + "grad_norm": 0.43195444345474243, + "learning_rate": 7.552780255724056e-05, + "loss": 0.1496, + "step": 16670 + }, + { + "epoch": 2.47, + "grad_norm": 0.4010591506958008, + "learning_rate": 7.551293487957182e-05, + "loss": 0.1387, + "step": 16680 + }, + { + "epoch": 2.47, + "grad_norm": 0.33434391021728516, + "learning_rate": 7.549806720190306e-05, + "loss": 0.1422, + "step": 16690 + }, + { + "epoch": 2.48, + "grad_norm": 1.1877391338348389, + "learning_rate": 7.548319952423432e-05, + "loss": 0.1342, + "step": 16700 + }, + { + "epoch": 2.48, + "grad_norm": 0.7518399357795715, + "learning_rate": 7.546833184656557e-05, + "loss": 0.1479, + "step": 16710 + }, + { + "epoch": 2.48, + "grad_norm": 0.5379367470741272, + "learning_rate": 7.545346416889682e-05, + "loss": 0.1384, + "step": 16720 + }, + { + "epoch": 2.48, + "grad_norm": 0.8337181210517883, + "learning_rate": 7.543859649122808e-05, + "loss": 0.1445, + "step": 16730 + }, + { + "epoch": 2.48, + "grad_norm": 0.9827070832252502, + "learning_rate": 7.542372881355932e-05, + "loss": 0.1377, + "step": 16740 + }, + { + "epoch": 2.48, + "grad_norm": 0.5948978066444397, + "learning_rate": 7.540886113589058e-05, + "loss": 0.1458, + "step": 16750 + }, + { + "epoch": 2.48, + "grad_norm": 1.3337939977645874, + "learning_rate": 7.539399345822183e-05, + "loss": 0.1422, + "step": 16760 + }, + { + "epoch": 2.49, + "grad_norm": 0.48446279764175415, + "learning_rate": 7.537912578055309e-05, + "loss": 0.1393, + "step": 16770 + }, + { + "epoch": 2.49, + "grad_norm": 0.37337374687194824, + "learning_rate": 7.536425810288434e-05, + "loss": 0.1383, + "step": 16780 + }, + { + "epoch": 2.49, + "grad_norm": 0.37720489501953125, + "learning_rate": 7.534939042521558e-05, + "loss": 0.1323, + "step": 16790 + }, + { + "epoch": 2.49, + "grad_norm": 0.3711811304092407, + "learning_rate": 7.533452274754684e-05, + "loss": 0.1362, + "step": 16800 + }, + { + "epoch": 2.49, + "grad_norm": 0.36137768626213074, + "learning_rate": 7.531965506987809e-05, + "loss": 0.1311, + "step": 16810 + }, + { + "epoch": 2.49, + "grad_norm": 0.5882296562194824, + "learning_rate": 7.530478739220935e-05, + "loss": 0.1407, + "step": 16820 + }, + { + "epoch": 2.49, + "grad_norm": 0.5880404114723206, + "learning_rate": 7.528991971454059e-05, + "loss": 0.1402, + "step": 16830 + }, + { + "epoch": 2.5, + "grad_norm": 0.5515339374542236, + "learning_rate": 7.527505203687184e-05, + "loss": 0.1407, + "step": 16840 + }, + { + "epoch": 2.5, + "grad_norm": 0.28839829564094543, + "learning_rate": 7.52601843592031e-05, + "loss": 0.1333, + "step": 16850 + }, + { + "epoch": 2.5, + "grad_norm": 0.4314468502998352, + "learning_rate": 7.524531668153435e-05, + "loss": 0.1424, + "step": 16860 + }, + { + "epoch": 2.5, + "grad_norm": 0.4594912827014923, + "learning_rate": 7.52304490038656e-05, + "loss": 0.1435, + "step": 16870 + }, + { + "epoch": 2.5, + "grad_norm": 0.3527372479438782, + "learning_rate": 7.521558132619685e-05, + "loss": 0.139, + "step": 16880 + }, + { + "epoch": 2.5, + "grad_norm": 0.6145148277282715, + "learning_rate": 7.52007136485281e-05, + "loss": 0.1386, + "step": 16890 + }, + { + "epoch": 2.51, + "grad_norm": 0.47387057542800903, + "learning_rate": 7.518584597085936e-05, + "loss": 0.1448, + "step": 16900 + }, + { + "epoch": 2.51, + "grad_norm": 0.3137947916984558, + "learning_rate": 7.517097829319061e-05, + "loss": 0.1415, + "step": 16910 + }, + { + "epoch": 2.51, + "grad_norm": 0.25534674525260925, + "learning_rate": 7.515611061552185e-05, + "loss": 0.1413, + "step": 16920 + }, + { + "epoch": 2.51, + "grad_norm": 0.8711992502212524, + "learning_rate": 7.514124293785311e-05, + "loss": 0.141, + "step": 16930 + }, + { + "epoch": 2.51, + "grad_norm": 0.5922618508338928, + "learning_rate": 7.512637526018436e-05, + "loss": 0.1372, + "step": 16940 + }, + { + "epoch": 2.51, + "grad_norm": 0.2968009114265442, + "learning_rate": 7.511150758251562e-05, + "loss": 0.1443, + "step": 16950 + }, + { + "epoch": 2.51, + "grad_norm": 0.7671728730201721, + "learning_rate": 7.509663990484687e-05, + "loss": 0.1426, + "step": 16960 + }, + { + "epoch": 2.52, + "grad_norm": 0.7778077721595764, + "learning_rate": 7.508177222717811e-05, + "loss": 0.1391, + "step": 16970 + }, + { + "epoch": 2.52, + "grad_norm": 0.5975720882415771, + "learning_rate": 7.506690454950937e-05, + "loss": 0.1399, + "step": 16980 + }, + { + "epoch": 2.52, + "grad_norm": 0.3030705153942108, + "learning_rate": 7.505203687184062e-05, + "loss": 0.1411, + "step": 16990 + }, + { + "epoch": 2.52, + "grad_norm": 0.6025968790054321, + "learning_rate": 7.503716919417188e-05, + "loss": 0.1429, + "step": 17000 + }, + { + "epoch": 2.52, + "grad_norm": 0.262866735458374, + "learning_rate": 7.502230151650312e-05, + "loss": 0.1359, + "step": 17010 + }, + { + "epoch": 2.52, + "grad_norm": 1.0509122610092163, + "learning_rate": 7.500743383883437e-05, + "loss": 0.1488, + "step": 17020 + }, + { + "epoch": 2.52, + "grad_norm": 0.8183063268661499, + "learning_rate": 7.499256616116563e-05, + "loss": 0.1436, + "step": 17030 + }, + { + "epoch": 2.53, + "grad_norm": 0.247868612408638, + "learning_rate": 7.497769848349688e-05, + "loss": 0.1381, + "step": 17040 + }, + { + "epoch": 2.53, + "grad_norm": 0.292298287153244, + "learning_rate": 7.496283080582814e-05, + "loss": 0.145, + "step": 17050 + }, + { + "epoch": 2.53, + "grad_norm": 0.6261994242668152, + "learning_rate": 7.494796312815938e-05, + "loss": 0.1494, + "step": 17060 + }, + { + "epoch": 2.53, + "grad_norm": 0.6475508213043213, + "learning_rate": 7.493309545049065e-05, + "loss": 0.1359, + "step": 17070 + }, + { + "epoch": 2.53, + "grad_norm": 0.24762730300426483, + "learning_rate": 7.491822777282189e-05, + "loss": 0.1477, + "step": 17080 + }, + { + "epoch": 2.53, + "grad_norm": 1.116014003753662, + "learning_rate": 7.490336009515314e-05, + "loss": 0.1436, + "step": 17090 + }, + { + "epoch": 2.53, + "grad_norm": 0.24743910133838654, + "learning_rate": 7.488849241748438e-05, + "loss": 0.1452, + "step": 17100 + }, + { + "epoch": 2.54, + "grad_norm": 0.4202406406402588, + "learning_rate": 7.487362473981564e-05, + "loss": 0.14, + "step": 17110 + }, + { + "epoch": 2.54, + "grad_norm": 0.25671133399009705, + "learning_rate": 7.485875706214691e-05, + "loss": 0.1421, + "step": 17120 + }, + { + "epoch": 2.54, + "grad_norm": 1.6126033067703247, + "learning_rate": 7.484388938447815e-05, + "loss": 0.1455, + "step": 17130 + }, + { + "epoch": 2.54, + "grad_norm": 0.7320016026496887, + "learning_rate": 7.48290217068094e-05, + "loss": 0.1455, + "step": 17140 + }, + { + "epoch": 2.54, + "grad_norm": 0.4042165279388428, + "learning_rate": 7.481415402914064e-05, + "loss": 0.138, + "step": 17150 + }, + { + "epoch": 2.54, + "grad_norm": 0.34952160716056824, + "learning_rate": 7.479928635147191e-05, + "loss": 0.1372, + "step": 17160 + }, + { + "epoch": 2.55, + "grad_norm": 0.43814823031425476, + "learning_rate": 7.478441867380315e-05, + "loss": 0.1426, + "step": 17170 + }, + { + "epoch": 2.55, + "grad_norm": 0.30086013674736023, + "learning_rate": 7.476955099613441e-05, + "loss": 0.1333, + "step": 17180 + }, + { + "epoch": 2.55, + "grad_norm": 0.6000199317932129, + "learning_rate": 7.475468331846566e-05, + "loss": 0.1436, + "step": 17190 + }, + { + "epoch": 2.55, + "grad_norm": 0.31554168462753296, + "learning_rate": 7.47398156407969e-05, + "loss": 0.1476, + "step": 17200 + }, + { + "epoch": 2.55, + "grad_norm": 0.5113075971603394, + "learning_rate": 7.472494796312817e-05, + "loss": 0.1442, + "step": 17210 + }, + { + "epoch": 2.55, + "grad_norm": 0.6701615452766418, + "learning_rate": 7.471008028545941e-05, + "loss": 0.1412, + "step": 17220 + }, + { + "epoch": 2.55, + "grad_norm": 0.9265919327735901, + "learning_rate": 7.469521260779067e-05, + "loss": 0.1443, + "step": 17230 + }, + { + "epoch": 2.56, + "grad_norm": 0.5859856605529785, + "learning_rate": 7.468034493012191e-05, + "loss": 0.1479, + "step": 17240 + }, + { + "epoch": 2.56, + "grad_norm": 0.39752840995788574, + "learning_rate": 7.466547725245318e-05, + "loss": 0.1487, + "step": 17250 + }, + { + "epoch": 2.56, + "grad_norm": 0.5720125436782837, + "learning_rate": 7.465060957478442e-05, + "loss": 0.1419, + "step": 17260 + }, + { + "epoch": 2.56, + "grad_norm": 0.754892110824585, + "learning_rate": 7.463574189711567e-05, + "loss": 0.1425, + "step": 17270 + }, + { + "epoch": 2.56, + "grad_norm": 0.7733323574066162, + "learning_rate": 7.462087421944693e-05, + "loss": 0.1453, + "step": 17280 + }, + { + "epoch": 2.56, + "grad_norm": 1.4931153059005737, + "learning_rate": 7.460600654177817e-05, + "loss": 0.135, + "step": 17290 + }, + { + "epoch": 2.56, + "grad_norm": 0.3707599639892578, + "learning_rate": 7.459113886410944e-05, + "loss": 0.1415, + "step": 17300 + }, + { + "epoch": 2.57, + "grad_norm": 0.3874984681606293, + "learning_rate": 7.457627118644068e-05, + "loss": 0.1476, + "step": 17310 + }, + { + "epoch": 2.57, + "grad_norm": 0.3033313751220703, + "learning_rate": 7.456140350877193e-05, + "loss": 0.1414, + "step": 17320 + }, + { + "epoch": 2.57, + "grad_norm": 0.6481125354766846, + "learning_rate": 7.454653583110318e-05, + "loss": 0.1432, + "step": 17330 + }, + { + "epoch": 2.57, + "grad_norm": 1.2512054443359375, + "learning_rate": 7.453166815343444e-05, + "loss": 0.1396, + "step": 17340 + }, + { + "epoch": 2.57, + "grad_norm": 0.6148348450660706, + "learning_rate": 7.451680047576569e-05, + "loss": 0.139, + "step": 17350 + }, + { + "epoch": 2.57, + "grad_norm": 0.3186105191707611, + "learning_rate": 7.450193279809694e-05, + "loss": 0.1282, + "step": 17360 + }, + { + "epoch": 2.57, + "grad_norm": 0.31010115146636963, + "learning_rate": 7.44870651204282e-05, + "loss": 0.1444, + "step": 17370 + }, + { + "epoch": 2.58, + "grad_norm": 0.8012278079986572, + "learning_rate": 7.447219744275944e-05, + "loss": 0.1356, + "step": 17380 + }, + { + "epoch": 2.58, + "grad_norm": 0.26003143191337585, + "learning_rate": 7.44573297650907e-05, + "loss": 0.1387, + "step": 17390 + }, + { + "epoch": 2.58, + "grad_norm": 0.284709095954895, + "learning_rate": 7.444246208742195e-05, + "loss": 0.1456, + "step": 17400 + }, + { + "epoch": 2.58, + "grad_norm": 1.3213711977005005, + "learning_rate": 7.44275944097532e-05, + "loss": 0.1347, + "step": 17410 + }, + { + "epoch": 2.58, + "grad_norm": 0.8473525047302246, + "learning_rate": 7.441272673208444e-05, + "loss": 0.1453, + "step": 17420 + }, + { + "epoch": 2.58, + "grad_norm": 0.36998623609542847, + "learning_rate": 7.439785905441571e-05, + "loss": 0.1308, + "step": 17430 + }, + { + "epoch": 2.59, + "grad_norm": 2.219050168991089, + "learning_rate": 7.438299137674695e-05, + "loss": 0.1497, + "step": 17440 + }, + { + "epoch": 2.59, + "grad_norm": 1.2892897129058838, + "learning_rate": 7.43681236990782e-05, + "loss": 0.1392, + "step": 17450 + }, + { + "epoch": 2.59, + "grad_norm": 0.3273633122444153, + "learning_rate": 7.435325602140946e-05, + "loss": 0.1459, + "step": 17460 + }, + { + "epoch": 2.59, + "grad_norm": 1.172643780708313, + "learning_rate": 7.43383883437407e-05, + "loss": 0.1396, + "step": 17470 + }, + { + "epoch": 2.59, + "grad_norm": 0.7880512475967407, + "learning_rate": 7.432352066607197e-05, + "loss": 0.1404, + "step": 17480 + }, + { + "epoch": 2.59, + "grad_norm": 0.3355169892311096, + "learning_rate": 7.430865298840321e-05, + "loss": 0.1513, + "step": 17490 + }, + { + "epoch": 2.59, + "grad_norm": 0.5652661919593811, + "learning_rate": 7.429378531073447e-05, + "loss": 0.1349, + "step": 17500 + }, + { + "epoch": 2.6, + "grad_norm": 0.34006038308143616, + "learning_rate": 7.427891763306572e-05, + "loss": 0.1406, + "step": 17510 + }, + { + "epoch": 2.6, + "grad_norm": 0.258184552192688, + "learning_rate": 7.426404995539698e-05, + "loss": 0.1422, + "step": 17520 + }, + { + "epoch": 2.6, + "grad_norm": 0.49831709265708923, + "learning_rate": 7.424918227772823e-05, + "loss": 0.1373, + "step": 17530 + }, + { + "epoch": 2.6, + "grad_norm": 1.0806423425674438, + "learning_rate": 7.423431460005947e-05, + "loss": 0.1468, + "step": 17540 + }, + { + "epoch": 2.6, + "grad_norm": 0.307546466588974, + "learning_rate": 7.421944692239073e-05, + "loss": 0.1376, + "step": 17550 + }, + { + "epoch": 2.6, + "grad_norm": 0.5520646572113037, + "learning_rate": 7.420457924472198e-05, + "loss": 0.1426, + "step": 17560 + }, + { + "epoch": 2.6, + "grad_norm": 0.2815137505531311, + "learning_rate": 7.418971156705324e-05, + "loss": 0.1359, + "step": 17570 + }, + { + "epoch": 2.61, + "grad_norm": 0.4063834846019745, + "learning_rate": 7.417484388938448e-05, + "loss": 0.1448, + "step": 17580 + }, + { + "epoch": 2.61, + "grad_norm": 0.4033089876174927, + "learning_rate": 7.415997621171573e-05, + "loss": 0.1406, + "step": 17590 + }, + { + "epoch": 2.61, + "grad_norm": 0.300946980714798, + "learning_rate": 7.414510853404699e-05, + "loss": 0.1339, + "step": 17600 + }, + { + "epoch": 2.61, + "grad_norm": 0.9063736200332642, + "learning_rate": 7.413024085637824e-05, + "loss": 0.146, + "step": 17610 + }, + { + "epoch": 2.61, + "grad_norm": 1.2720870971679688, + "learning_rate": 7.41153731787095e-05, + "loss": 0.1394, + "step": 17620 + }, + { + "epoch": 2.61, + "grad_norm": 0.36671578884124756, + "learning_rate": 7.410050550104074e-05, + "loss": 0.1451, + "step": 17630 + }, + { + "epoch": 2.61, + "grad_norm": 0.28279754519462585, + "learning_rate": 7.408563782337199e-05, + "loss": 0.1421, + "step": 17640 + }, + { + "epoch": 2.62, + "grad_norm": 1.5014578104019165, + "learning_rate": 7.407077014570325e-05, + "loss": 0.1362, + "step": 17650 + }, + { + "epoch": 2.62, + "grad_norm": 0.6834672689437866, + "learning_rate": 7.40559024680345e-05, + "loss": 0.1504, + "step": 17660 + }, + { + "epoch": 2.62, + "grad_norm": 0.5922780632972717, + "learning_rate": 7.404103479036574e-05, + "loss": 0.1375, + "step": 17670 + }, + { + "epoch": 2.62, + "grad_norm": 0.2400318682193756, + "learning_rate": 7.4026167112697e-05, + "loss": 0.1401, + "step": 17680 + }, + { + "epoch": 2.62, + "grad_norm": 0.8875470161437988, + "learning_rate": 7.401129943502825e-05, + "loss": 0.1465, + "step": 17690 + }, + { + "epoch": 2.62, + "grad_norm": 0.2918970584869385, + "learning_rate": 7.399643175735951e-05, + "loss": 0.141, + "step": 17700 + }, + { + "epoch": 2.63, + "grad_norm": 0.6647611260414124, + "learning_rate": 7.398156407969076e-05, + "loss": 0.1399, + "step": 17710 + }, + { + "epoch": 2.63, + "grad_norm": 0.23712264001369476, + "learning_rate": 7.3966696402022e-05, + "loss": 0.1331, + "step": 17720 + }, + { + "epoch": 2.63, + "grad_norm": 0.7306144833564758, + "learning_rate": 7.395182872435326e-05, + "loss": 0.1417, + "step": 17730 + }, + { + "epoch": 2.63, + "grad_norm": 0.40289196372032166, + "learning_rate": 7.393696104668451e-05, + "loss": 0.1437, + "step": 17740 + }, + { + "epoch": 2.63, + "grad_norm": 1.719495415687561, + "learning_rate": 7.392209336901577e-05, + "loss": 0.1472, + "step": 17750 + }, + { + "epoch": 2.63, + "grad_norm": 0.24560745060443878, + "learning_rate": 7.390722569134701e-05, + "loss": 0.1434, + "step": 17760 + }, + { + "epoch": 2.63, + "grad_norm": 1.0618599653244019, + "learning_rate": 7.389235801367826e-05, + "loss": 0.1481, + "step": 17770 + }, + { + "epoch": 2.64, + "grad_norm": 0.5430271029472351, + "learning_rate": 7.387749033600952e-05, + "loss": 0.139, + "step": 17780 + }, + { + "epoch": 2.64, + "grad_norm": 1.0089772939682007, + "learning_rate": 7.386262265834077e-05, + "loss": 0.1429, + "step": 17790 + }, + { + "epoch": 2.64, + "grad_norm": 0.2407023161649704, + "learning_rate": 7.384775498067203e-05, + "loss": 0.1383, + "step": 17800 + }, + { + "epoch": 2.64, + "grad_norm": 0.26621460914611816, + "learning_rate": 7.383288730300327e-05, + "loss": 0.139, + "step": 17810 + }, + { + "epoch": 2.64, + "grad_norm": 0.26956406235694885, + "learning_rate": 7.381801962533452e-05, + "loss": 0.1417, + "step": 17820 + }, + { + "epoch": 2.64, + "grad_norm": 0.2954663038253784, + "learning_rate": 7.380315194766578e-05, + "loss": 0.1337, + "step": 17830 + }, + { + "epoch": 2.64, + "grad_norm": 1.1632752418518066, + "learning_rate": 7.378828426999703e-05, + "loss": 0.1466, + "step": 17840 + }, + { + "epoch": 2.65, + "grad_norm": 1.1635485887527466, + "learning_rate": 7.377341659232829e-05, + "loss": 0.1374, + "step": 17850 + }, + { + "epoch": 2.65, + "grad_norm": 0.622186541557312, + "learning_rate": 7.375854891465953e-05, + "loss": 0.1385, + "step": 17860 + }, + { + "epoch": 2.65, + "grad_norm": 0.2939455211162567, + "learning_rate": 7.374368123699078e-05, + "loss": 0.137, + "step": 17870 + }, + { + "epoch": 2.65, + "grad_norm": 0.38761767745018005, + "learning_rate": 7.372881355932204e-05, + "loss": 0.1361, + "step": 17880 + }, + { + "epoch": 2.65, + "grad_norm": 0.9036255478858948, + "learning_rate": 7.37139458816533e-05, + "loss": 0.1374, + "step": 17890 + }, + { + "epoch": 2.65, + "grad_norm": 0.34090399742126465, + "learning_rate": 7.369907820398453e-05, + "loss": 0.1325, + "step": 17900 + }, + { + "epoch": 2.65, + "grad_norm": 0.29713836312294006, + "learning_rate": 7.368421052631579e-05, + "loss": 0.1408, + "step": 17910 + }, + { + "epoch": 2.66, + "grad_norm": 0.826836884021759, + "learning_rate": 7.366934284864704e-05, + "loss": 0.1498, + "step": 17920 + }, + { + "epoch": 2.66, + "grad_norm": 0.8794457316398621, + "learning_rate": 7.36544751709783e-05, + "loss": 0.1441, + "step": 17930 + }, + { + "epoch": 2.66, + "grad_norm": 0.3596440851688385, + "learning_rate": 7.363960749330955e-05, + "loss": 0.1452, + "step": 17940 + }, + { + "epoch": 2.66, + "grad_norm": 0.4660389721393585, + "learning_rate": 7.36247398156408e-05, + "loss": 0.1397, + "step": 17950 + }, + { + "epoch": 2.66, + "grad_norm": 1.148728847503662, + "learning_rate": 7.360987213797205e-05, + "loss": 0.1429, + "step": 17960 + }, + { + "epoch": 2.66, + "grad_norm": 0.5068449378013611, + "learning_rate": 7.35950044603033e-05, + "loss": 0.1367, + "step": 17970 + }, + { + "epoch": 2.67, + "grad_norm": 0.3436690866947174, + "learning_rate": 7.358013678263456e-05, + "loss": 0.1434, + "step": 17980 + }, + { + "epoch": 2.67, + "grad_norm": 0.28847482800483704, + "learning_rate": 7.35652691049658e-05, + "loss": 0.1452, + "step": 17990 + }, + { + "epoch": 2.67, + "grad_norm": 0.3127191960811615, + "learning_rate": 7.355040142729706e-05, + "loss": 0.1468, + "step": 18000 + }, + { + "epoch": 2.67, + "grad_norm": 0.3851916491985321, + "learning_rate": 7.353553374962831e-05, + "loss": 0.1416, + "step": 18010 + }, + { + "epoch": 2.67, + "grad_norm": 1.0425105094909668, + "learning_rate": 7.352066607195956e-05, + "loss": 0.1336, + "step": 18020 + }, + { + "epoch": 2.67, + "grad_norm": 0.5313891172409058, + "learning_rate": 7.350579839429082e-05, + "loss": 0.1449, + "step": 18030 + }, + { + "epoch": 2.67, + "grad_norm": 0.3654390871524811, + "learning_rate": 7.349093071662206e-05, + "loss": 0.1304, + "step": 18040 + }, + { + "epoch": 2.68, + "grad_norm": 0.8208723664283752, + "learning_rate": 7.347606303895333e-05, + "loss": 0.1395, + "step": 18050 + }, + { + "epoch": 2.68, + "grad_norm": 0.265028178691864, + "learning_rate": 7.346119536128457e-05, + "loss": 0.1423, + "step": 18060 + }, + { + "epoch": 2.68, + "grad_norm": 0.5879223346710205, + "learning_rate": 7.344632768361583e-05, + "loss": 0.1346, + "step": 18070 + }, + { + "epoch": 2.68, + "grad_norm": 1.9254904985427856, + "learning_rate": 7.343146000594707e-05, + "loss": 0.1455, + "step": 18080 + }, + { + "epoch": 2.68, + "grad_norm": 0.7956934571266174, + "learning_rate": 7.341659232827832e-05, + "loss": 0.1395, + "step": 18090 + }, + { + "epoch": 2.68, + "grad_norm": 0.3552607595920563, + "learning_rate": 7.340172465060958e-05, + "loss": 0.1381, + "step": 18100 + }, + { + "epoch": 2.68, + "grad_norm": 0.8107473254203796, + "learning_rate": 7.338685697294083e-05, + "loss": 0.1417, + "step": 18110 + }, + { + "epoch": 2.69, + "grad_norm": 0.28250497579574585, + "learning_rate": 7.337198929527209e-05, + "loss": 0.144, + "step": 18120 + }, + { + "epoch": 2.69, + "grad_norm": 0.26782554388046265, + "learning_rate": 7.335712161760333e-05, + "loss": 0.1386, + "step": 18130 + }, + { + "epoch": 2.69, + "grad_norm": 0.5947808623313904, + "learning_rate": 7.33422539399346e-05, + "loss": 0.1424, + "step": 18140 + }, + { + "epoch": 2.69, + "grad_norm": 0.2512226998806, + "learning_rate": 7.332738626226584e-05, + "loss": 0.1352, + "step": 18150 + }, + { + "epoch": 2.69, + "grad_norm": 0.35504305362701416, + "learning_rate": 7.331251858459709e-05, + "loss": 0.1416, + "step": 18160 + }, + { + "epoch": 2.69, + "grad_norm": 1.3061926364898682, + "learning_rate": 7.329765090692835e-05, + "loss": 0.1444, + "step": 18170 + }, + { + "epoch": 2.69, + "grad_norm": 0.6368234157562256, + "learning_rate": 7.328278322925959e-05, + "loss": 0.1403, + "step": 18180 + }, + { + "epoch": 2.7, + "grad_norm": 0.42611798644065857, + "learning_rate": 7.326791555159086e-05, + "loss": 0.1386, + "step": 18190 + }, + { + "epoch": 2.7, + "grad_norm": 0.6984022855758667, + "learning_rate": 7.32530478739221e-05, + "loss": 0.1451, + "step": 18200 + }, + { + "epoch": 2.7, + "grad_norm": 0.4926019310951233, + "learning_rate": 7.323818019625335e-05, + "loss": 0.1399, + "step": 18210 + }, + { + "epoch": 2.7, + "grad_norm": 0.8579056262969971, + "learning_rate": 7.322331251858459e-05, + "loss": 0.143, + "step": 18220 + }, + { + "epoch": 2.7, + "grad_norm": 0.9211030602455139, + "learning_rate": 7.320844484091586e-05, + "loss": 0.1469, + "step": 18230 + }, + { + "epoch": 2.7, + "grad_norm": 0.8623634576797485, + "learning_rate": 7.31935771632471e-05, + "loss": 0.1424, + "step": 18240 + }, + { + "epoch": 2.71, + "grad_norm": 0.2992829382419586, + "learning_rate": 7.317870948557836e-05, + "loss": 0.1417, + "step": 18250 + }, + { + "epoch": 2.71, + "grad_norm": 0.6315504908561707, + "learning_rate": 7.316384180790961e-05, + "loss": 0.1417, + "step": 18260 + }, + { + "epoch": 2.71, + "grad_norm": 0.9249059557914734, + "learning_rate": 7.314897413024085e-05, + "loss": 0.1468, + "step": 18270 + }, + { + "epoch": 2.71, + "grad_norm": 0.35125473141670227, + "learning_rate": 7.313410645257212e-05, + "loss": 0.145, + "step": 18280 + }, + { + "epoch": 2.71, + "grad_norm": 0.4258287250995636, + "learning_rate": 7.311923877490336e-05, + "loss": 0.1401, + "step": 18290 + }, + { + "epoch": 2.71, + "grad_norm": 0.26481908559799194, + "learning_rate": 7.310437109723462e-05, + "loss": 0.1431, + "step": 18300 + }, + { + "epoch": 2.71, + "grad_norm": 1.5498566627502441, + "learning_rate": 7.308950341956586e-05, + "loss": 0.1455, + "step": 18310 + }, + { + "epoch": 2.72, + "grad_norm": 1.0560815334320068, + "learning_rate": 7.307463574189713e-05, + "loss": 0.1475, + "step": 18320 + }, + { + "epoch": 2.72, + "grad_norm": 0.7546606659889221, + "learning_rate": 7.305976806422837e-05, + "loss": 0.1465, + "step": 18330 + }, + { + "epoch": 2.72, + "grad_norm": 1.151910424232483, + "learning_rate": 7.304490038655962e-05, + "loss": 0.1435, + "step": 18340 + }, + { + "epoch": 2.72, + "grad_norm": 0.38248512148857117, + "learning_rate": 7.303003270889088e-05, + "loss": 0.1462, + "step": 18350 + }, + { + "epoch": 2.72, + "grad_norm": 0.24999620020389557, + "learning_rate": 7.301516503122212e-05, + "loss": 0.1423, + "step": 18360 + }, + { + "epoch": 2.72, + "grad_norm": 1.2096368074417114, + "learning_rate": 7.300029735355339e-05, + "loss": 0.1402, + "step": 18370 + }, + { + "epoch": 2.72, + "grad_norm": 0.6669314503669739, + "learning_rate": 7.298542967588463e-05, + "loss": 0.1419, + "step": 18380 + }, + { + "epoch": 2.73, + "grad_norm": 0.7453274726867676, + "learning_rate": 7.297056199821588e-05, + "loss": 0.1507, + "step": 18390 + }, + { + "epoch": 2.73, + "grad_norm": 0.3134549558162689, + "learning_rate": 7.295569432054712e-05, + "loss": 0.1401, + "step": 18400 + }, + { + "epoch": 2.73, + "grad_norm": 0.509635329246521, + "learning_rate": 7.294082664287839e-05, + "loss": 0.1427, + "step": 18410 + }, + { + "epoch": 2.73, + "grad_norm": 1.5291324853897095, + "learning_rate": 7.292595896520963e-05, + "loss": 0.1382, + "step": 18420 + }, + { + "epoch": 2.73, + "grad_norm": 0.8732107877731323, + "learning_rate": 7.291109128754089e-05, + "loss": 0.1425, + "step": 18430 + }, + { + "epoch": 2.73, + "grad_norm": 0.951556384563446, + "learning_rate": 7.289622360987214e-05, + "loss": 0.1417, + "step": 18440 + }, + { + "epoch": 2.73, + "grad_norm": 0.22064682841300964, + "learning_rate": 7.288135593220338e-05, + "loss": 0.1389, + "step": 18450 + }, + { + "epoch": 2.74, + "grad_norm": 0.36942338943481445, + "learning_rate": 7.286648825453465e-05, + "loss": 0.1356, + "step": 18460 + }, + { + "epoch": 2.74, + "grad_norm": 0.9014129638671875, + "learning_rate": 7.28516205768659e-05, + "loss": 0.1439, + "step": 18470 + }, + { + "epoch": 2.74, + "grad_norm": 0.31892070174217224, + "learning_rate": 7.283675289919715e-05, + "loss": 0.1439, + "step": 18480 + }, + { + "epoch": 2.74, + "grad_norm": 0.3054444491863251, + "learning_rate": 7.28218852215284e-05, + "loss": 0.1385, + "step": 18490 + }, + { + "epoch": 2.74, + "grad_norm": 0.23387959599494934, + "learning_rate": 7.280701754385966e-05, + "loss": 0.1406, + "step": 18500 + }, + { + "epoch": 2.74, + "grad_norm": 0.26444515585899353, + "learning_rate": 7.279214986619091e-05, + "loss": 0.1412, + "step": 18510 + }, + { + "epoch": 2.75, + "grad_norm": 0.8463549613952637, + "learning_rate": 7.277728218852215e-05, + "loss": 0.1437, + "step": 18520 + }, + { + "epoch": 2.75, + "grad_norm": 0.9371518492698669, + "learning_rate": 7.276241451085341e-05, + "loss": 0.1401, + "step": 18530 + }, + { + "epoch": 2.75, + "grad_norm": 0.3569345772266388, + "learning_rate": 7.274754683318466e-05, + "loss": 0.132, + "step": 18540 + }, + { + "epoch": 2.75, + "grad_norm": 0.25233766436576843, + "learning_rate": 7.273267915551592e-05, + "loss": 0.1409, + "step": 18550 + }, + { + "epoch": 2.75, + "grad_norm": 0.46643221378326416, + "learning_rate": 7.271781147784716e-05, + "loss": 0.1378, + "step": 18560 + }, + { + "epoch": 2.75, + "grad_norm": 1.249118447303772, + "learning_rate": 7.270294380017841e-05, + "loss": 0.1477, + "step": 18570 + }, + { + "epoch": 2.75, + "grad_norm": 0.29594314098358154, + "learning_rate": 7.268807612250967e-05, + "loss": 0.15, + "step": 18580 + }, + { + "epoch": 2.76, + "grad_norm": 0.2833545207977295, + "learning_rate": 7.267320844484092e-05, + "loss": 0.1405, + "step": 18590 + }, + { + "epoch": 2.76, + "grad_norm": 0.5924676656723022, + "learning_rate": 7.265834076717218e-05, + "loss": 0.1372, + "step": 18600 + }, + { + "epoch": 2.76, + "grad_norm": 0.3902270495891571, + "learning_rate": 7.264347308950342e-05, + "loss": 0.1413, + "step": 18610 + }, + { + "epoch": 2.76, + "grad_norm": 0.447142630815506, + "learning_rate": 7.262860541183467e-05, + "loss": 0.1426, + "step": 18620 + }, + { + "epoch": 2.76, + "grad_norm": 0.4772433340549469, + "learning_rate": 7.261373773416593e-05, + "loss": 0.1394, + "step": 18630 + }, + { + "epoch": 2.76, + "grad_norm": 0.45227327942848206, + "learning_rate": 7.259887005649718e-05, + "loss": 0.1315, + "step": 18640 + }, + { + "epoch": 2.76, + "grad_norm": 1.1519386768341064, + "learning_rate": 7.258400237882843e-05, + "loss": 0.1408, + "step": 18650 + }, + { + "epoch": 2.77, + "grad_norm": 0.8935173749923706, + "learning_rate": 7.256913470115968e-05, + "loss": 0.1429, + "step": 18660 + }, + { + "epoch": 2.77, + "grad_norm": 0.24923725426197052, + "learning_rate": 7.255426702349093e-05, + "loss": 0.132, + "step": 18670 + }, + { + "epoch": 2.77, + "grad_norm": 0.4108467102050781, + "learning_rate": 7.253939934582219e-05, + "loss": 0.1406, + "step": 18680 + }, + { + "epoch": 2.77, + "grad_norm": 0.7500537037849426, + "learning_rate": 7.252453166815344e-05, + "loss": 0.1435, + "step": 18690 + }, + { + "epoch": 2.77, + "grad_norm": 0.3367678225040436, + "learning_rate": 7.250966399048469e-05, + "loss": 0.1423, + "step": 18700 + }, + { + "epoch": 2.77, + "grad_norm": 0.30210843682289124, + "learning_rate": 7.249479631281594e-05, + "loss": 0.1513, + "step": 18710 + }, + { + "epoch": 2.77, + "grad_norm": 0.7015200257301331, + "learning_rate": 7.24799286351472e-05, + "loss": 0.14, + "step": 18720 + }, + { + "epoch": 2.78, + "grad_norm": 0.45465758442878723, + "learning_rate": 7.246506095747845e-05, + "loss": 0.1443, + "step": 18730 + }, + { + "epoch": 2.78, + "grad_norm": 0.9578850269317627, + "learning_rate": 7.245019327980969e-05, + "loss": 0.1461, + "step": 18740 + }, + { + "epoch": 2.78, + "grad_norm": 0.20722438395023346, + "learning_rate": 7.243532560214095e-05, + "loss": 0.1311, + "step": 18750 + }, + { + "epoch": 2.78, + "grad_norm": 0.9666074514389038, + "learning_rate": 7.24204579244722e-05, + "loss": 0.1319, + "step": 18760 + }, + { + "epoch": 2.78, + "grad_norm": 0.286914199590683, + "learning_rate": 7.240559024680346e-05, + "loss": 0.1404, + "step": 18770 + }, + { + "epoch": 2.78, + "grad_norm": 0.635274350643158, + "learning_rate": 7.239072256913471e-05, + "loss": 0.1369, + "step": 18780 + }, + { + "epoch": 2.79, + "grad_norm": 0.46256542205810547, + "learning_rate": 7.237585489146595e-05, + "loss": 0.1416, + "step": 18790 + }, + { + "epoch": 2.79, + "grad_norm": 0.5526597499847412, + "learning_rate": 7.23609872137972e-05, + "loss": 0.14, + "step": 18800 + }, + { + "epoch": 2.79, + "grad_norm": 1.3282074928283691, + "learning_rate": 7.234611953612846e-05, + "loss": 0.1523, + "step": 18810 + }, + { + "epoch": 2.79, + "grad_norm": 0.2414388209581375, + "learning_rate": 7.233125185845972e-05, + "loss": 0.133, + "step": 18820 + }, + { + "epoch": 2.79, + "grad_norm": 0.8080794215202332, + "learning_rate": 7.231638418079097e-05, + "loss": 0.143, + "step": 18830 + }, + { + "epoch": 2.79, + "grad_norm": 0.4373549818992615, + "learning_rate": 7.230151650312221e-05, + "loss": 0.1352, + "step": 18840 + }, + { + "epoch": 2.79, + "grad_norm": 0.6084916591644287, + "learning_rate": 7.228664882545347e-05, + "loss": 0.1433, + "step": 18850 + }, + { + "epoch": 2.8, + "grad_norm": 0.41089242696762085, + "learning_rate": 7.227178114778472e-05, + "loss": 0.145, + "step": 18860 + }, + { + "epoch": 2.8, + "grad_norm": 0.2655513882637024, + "learning_rate": 7.225691347011598e-05, + "loss": 0.1423, + "step": 18870 + }, + { + "epoch": 2.8, + "grad_norm": 0.4778943359851837, + "learning_rate": 7.224204579244722e-05, + "loss": 0.1348, + "step": 18880 + }, + { + "epoch": 2.8, + "grad_norm": 0.4908297061920166, + "learning_rate": 7.222717811477847e-05, + "loss": 0.1434, + "step": 18890 + }, + { + "epoch": 2.8, + "grad_norm": 0.3996427059173584, + "learning_rate": 7.221231043710973e-05, + "loss": 0.1404, + "step": 18900 + }, + { + "epoch": 2.8, + "grad_norm": 0.6099945306777954, + "learning_rate": 7.219744275944098e-05, + "loss": 0.1454, + "step": 18910 + }, + { + "epoch": 2.8, + "grad_norm": 0.45081639289855957, + "learning_rate": 7.218257508177224e-05, + "loss": 0.1354, + "step": 18920 + }, + { + "epoch": 2.81, + "grad_norm": 1.0426857471466064, + "learning_rate": 7.216770740410348e-05, + "loss": 0.1491, + "step": 18930 + }, + { + "epoch": 2.81, + "grad_norm": 0.772966206073761, + "learning_rate": 7.215283972643473e-05, + "loss": 0.1365, + "step": 18940 + }, + { + "epoch": 2.81, + "grad_norm": 0.3265818655490875, + "learning_rate": 7.213797204876599e-05, + "loss": 0.1498, + "step": 18950 + }, + { + "epoch": 2.81, + "grad_norm": 0.2635791003704071, + "learning_rate": 7.212310437109724e-05, + "loss": 0.1363, + "step": 18960 + }, + { + "epoch": 2.81, + "grad_norm": 0.9569715857505798, + "learning_rate": 7.210823669342848e-05, + "loss": 0.1327, + "step": 18970 + }, + { + "epoch": 2.81, + "grad_norm": 0.20809006690979004, + "learning_rate": 7.209336901575974e-05, + "loss": 0.1377, + "step": 18980 + }, + { + "epoch": 2.81, + "grad_norm": 1.0042877197265625, + "learning_rate": 7.207850133809099e-05, + "loss": 0.1371, + "step": 18990 + }, + { + "epoch": 2.82, + "grad_norm": 0.5755969882011414, + "learning_rate": 7.206363366042225e-05, + "loss": 0.1355, + "step": 19000 + }, + { + "epoch": 2.82, + "grad_norm": 0.8315523862838745, + "learning_rate": 7.20487659827535e-05, + "loss": 0.1435, + "step": 19010 + }, + { + "epoch": 2.82, + "grad_norm": 0.5126630663871765, + "learning_rate": 7.203389830508474e-05, + "loss": 0.1358, + "step": 19020 + }, + { + "epoch": 2.82, + "grad_norm": 0.5610932111740112, + "learning_rate": 7.201903062741601e-05, + "loss": 0.1303, + "step": 19030 + }, + { + "epoch": 2.82, + "grad_norm": 0.49236372113227844, + "learning_rate": 7.200416294974725e-05, + "loss": 0.1428, + "step": 19040 + }, + { + "epoch": 2.82, + "grad_norm": 0.26628008484840393, + "learning_rate": 7.198929527207851e-05, + "loss": 0.1407, + "step": 19050 + }, + { + "epoch": 2.83, + "grad_norm": 1.6078991889953613, + "learning_rate": 7.197442759440975e-05, + "loss": 0.135, + "step": 19060 + }, + { + "epoch": 2.83, + "grad_norm": 0.3959672749042511, + "learning_rate": 7.1959559916741e-05, + "loss": 0.1418, + "step": 19070 + }, + { + "epoch": 2.83, + "grad_norm": 0.35960277915000916, + "learning_rate": 7.194469223907226e-05, + "loss": 0.1386, + "step": 19080 + }, + { + "epoch": 2.83, + "grad_norm": 0.46168479323387146, + "learning_rate": 7.192982456140351e-05, + "loss": 0.1465, + "step": 19090 + }, + { + "epoch": 2.83, + "grad_norm": 0.7458998560905457, + "learning_rate": 7.191495688373477e-05, + "loss": 0.1347, + "step": 19100 + }, + { + "epoch": 2.83, + "grad_norm": 0.629227876663208, + "learning_rate": 7.190008920606601e-05, + "loss": 0.1382, + "step": 19110 + }, + { + "epoch": 2.83, + "grad_norm": 0.48990973830223083, + "learning_rate": 7.188522152839728e-05, + "loss": 0.1346, + "step": 19120 + }, + { + "epoch": 2.84, + "grad_norm": 1.1364151239395142, + "learning_rate": 7.187035385072852e-05, + "loss": 0.1346, + "step": 19130 + }, + { + "epoch": 2.84, + "grad_norm": 0.39966854453086853, + "learning_rate": 7.185548617305977e-05, + "loss": 0.1365, + "step": 19140 + }, + { + "epoch": 2.84, + "grad_norm": 0.7130772471427917, + "learning_rate": 7.184061849539101e-05, + "loss": 0.1474, + "step": 19150 + }, + { + "epoch": 2.84, + "grad_norm": 0.47073498368263245, + "learning_rate": 7.182575081772227e-05, + "loss": 0.136, + "step": 19160 + }, + { + "epoch": 2.84, + "grad_norm": 0.7420998811721802, + "learning_rate": 7.181088314005354e-05, + "loss": 0.1423, + "step": 19170 + }, + { + "epoch": 2.84, + "grad_norm": 0.5848856568336487, + "learning_rate": 7.179601546238478e-05, + "loss": 0.1435, + "step": 19180 + }, + { + "epoch": 2.84, + "grad_norm": 0.41992321610450745, + "learning_rate": 7.178114778471603e-05, + "loss": 0.1385, + "step": 19190 + }, + { + "epoch": 2.85, + "grad_norm": 0.2730390131473541, + "learning_rate": 7.176628010704727e-05, + "loss": 0.135, + "step": 19200 + }, + { + "epoch": 2.85, + "grad_norm": 0.503334105014801, + "learning_rate": 7.175141242937854e-05, + "loss": 0.141, + "step": 19210 + }, + { + "epoch": 2.85, + "grad_norm": 0.7644150257110596, + "learning_rate": 7.173654475170978e-05, + "loss": 0.1387, + "step": 19220 + }, + { + "epoch": 2.85, + "grad_norm": 0.33800143003463745, + "learning_rate": 7.172167707404104e-05, + "loss": 0.1381, + "step": 19230 + }, + { + "epoch": 2.85, + "grad_norm": 0.5975232124328613, + "learning_rate": 7.17068093963723e-05, + "loss": 0.1403, + "step": 19240 + }, + { + "epoch": 2.85, + "grad_norm": 0.3202337920665741, + "learning_rate": 7.169194171870353e-05, + "loss": 0.1415, + "step": 19250 + }, + { + "epoch": 2.85, + "grad_norm": 0.2667614221572876, + "learning_rate": 7.16770740410348e-05, + "loss": 0.1404, + "step": 19260 + }, + { + "epoch": 2.86, + "grad_norm": 0.28970199823379517, + "learning_rate": 7.166220636336604e-05, + "loss": 0.144, + "step": 19270 + }, + { + "epoch": 2.86, + "grad_norm": 0.9261873364448547, + "learning_rate": 7.16473386856973e-05, + "loss": 0.1355, + "step": 19280 + }, + { + "epoch": 2.86, + "grad_norm": 0.30007144808769226, + "learning_rate": 7.163247100802854e-05, + "loss": 0.1357, + "step": 19290 + }, + { + "epoch": 2.86, + "grad_norm": 1.1617704629898071, + "learning_rate": 7.161760333035981e-05, + "loss": 0.1433, + "step": 19300 + }, + { + "epoch": 2.86, + "grad_norm": 0.5835827589035034, + "learning_rate": 7.160273565269105e-05, + "loss": 0.1412, + "step": 19310 + }, + { + "epoch": 2.86, + "grad_norm": 0.3388794958591461, + "learning_rate": 7.15878679750223e-05, + "loss": 0.1467, + "step": 19320 + }, + { + "epoch": 2.87, + "grad_norm": 0.5224015712738037, + "learning_rate": 7.157300029735356e-05, + "loss": 0.1409, + "step": 19330 + }, + { + "epoch": 2.87, + "grad_norm": 0.2810278534889221, + "learning_rate": 7.15581326196848e-05, + "loss": 0.14, + "step": 19340 + }, + { + "epoch": 2.87, + "grad_norm": 0.6086370348930359, + "learning_rate": 7.154326494201607e-05, + "loss": 0.1415, + "step": 19350 + }, + { + "epoch": 2.87, + "grad_norm": 0.28138285875320435, + "learning_rate": 7.152839726434731e-05, + "loss": 0.1351, + "step": 19360 + }, + { + "epoch": 2.87, + "grad_norm": 0.2224389910697937, + "learning_rate": 7.151352958667857e-05, + "loss": 0.1389, + "step": 19370 + }, + { + "epoch": 2.87, + "grad_norm": 0.6104931235313416, + "learning_rate": 7.14986619090098e-05, + "loss": 0.144, + "step": 19380 + }, + { + "epoch": 2.87, + "grad_norm": 0.29406848549842834, + "learning_rate": 7.148379423134107e-05, + "loss": 0.1367, + "step": 19390 + }, + { + "epoch": 2.88, + "grad_norm": 0.9205821752548218, + "learning_rate": 7.146892655367232e-05, + "loss": 0.1391, + "step": 19400 + }, + { + "epoch": 2.88, + "grad_norm": 0.89859539270401, + "learning_rate": 7.145405887600357e-05, + "loss": 0.1401, + "step": 19410 + }, + { + "epoch": 2.88, + "grad_norm": 0.37001898884773254, + "learning_rate": 7.143919119833483e-05, + "loss": 0.1421, + "step": 19420 + }, + { + "epoch": 2.88, + "grad_norm": 0.483979731798172, + "learning_rate": 7.142432352066607e-05, + "loss": 0.1388, + "step": 19430 + }, + { + "epoch": 2.88, + "grad_norm": 0.3105422556400299, + "learning_rate": 7.140945584299733e-05, + "loss": 0.1451, + "step": 19440 + }, + { + "epoch": 2.88, + "grad_norm": 0.598884105682373, + "learning_rate": 7.139458816532858e-05, + "loss": 0.1378, + "step": 19450 + }, + { + "epoch": 2.88, + "grad_norm": 0.6454994082450867, + "learning_rate": 7.137972048765983e-05, + "loss": 0.1382, + "step": 19460 + }, + { + "epoch": 2.89, + "grad_norm": 0.18754199147224426, + "learning_rate": 7.136485280999107e-05, + "loss": 0.1276, + "step": 19470 + }, + { + "epoch": 2.89, + "grad_norm": 0.31574535369873047, + "learning_rate": 7.134998513232234e-05, + "loss": 0.1422, + "step": 19480 + }, + { + "epoch": 2.89, + "grad_norm": 0.33168497681617737, + "learning_rate": 7.133511745465358e-05, + "loss": 0.1333, + "step": 19490 + }, + { + "epoch": 2.89, + "grad_norm": 0.3626178503036499, + "learning_rate": 7.132024977698484e-05, + "loss": 0.1411, + "step": 19500 + }, + { + "epoch": 2.89, + "grad_norm": 0.19792704284191132, + "learning_rate": 7.130538209931609e-05, + "loss": 0.1357, + "step": 19510 + }, + { + "epoch": 2.89, + "grad_norm": 0.45540741086006165, + "learning_rate": 7.129051442164735e-05, + "loss": 0.1373, + "step": 19520 + }, + { + "epoch": 2.89, + "grad_norm": 1.1599476337432861, + "learning_rate": 7.12756467439786e-05, + "loss": 0.1394, + "step": 19530 + }, + { + "epoch": 2.9, + "grad_norm": 0.3340741991996765, + "learning_rate": 7.126077906630984e-05, + "loss": 0.1394, + "step": 19540 + }, + { + "epoch": 2.9, + "grad_norm": 0.3207394480705261, + "learning_rate": 7.12459113886411e-05, + "loss": 0.1456, + "step": 19550 + }, + { + "epoch": 2.9, + "grad_norm": 0.8679114580154419, + "learning_rate": 7.123104371097235e-05, + "loss": 0.1392, + "step": 19560 + }, + { + "epoch": 2.9, + "grad_norm": 1.6166648864746094, + "learning_rate": 7.12161760333036e-05, + "loss": 0.1494, + "step": 19570 + }, + { + "epoch": 2.9, + "grad_norm": 0.4138358533382416, + "learning_rate": 7.120130835563486e-05, + "loss": 0.1461, + "step": 19580 + }, + { + "epoch": 2.9, + "grad_norm": 0.23959451913833618, + "learning_rate": 7.11864406779661e-05, + "loss": 0.1339, + "step": 19590 + }, + { + "epoch": 2.91, + "grad_norm": 0.32449159026145935, + "learning_rate": 7.117157300029736e-05, + "loss": 0.1282, + "step": 19600 + }, + { + "epoch": 2.91, + "grad_norm": 2.3437106609344482, + "learning_rate": 7.115670532262861e-05, + "loss": 0.1494, + "step": 19610 + }, + { + "epoch": 2.91, + "grad_norm": 0.4861600995063782, + "learning_rate": 7.114183764495987e-05, + "loss": 0.1428, + "step": 19620 + }, + { + "epoch": 2.91, + "grad_norm": 0.6348639726638794, + "learning_rate": 7.112696996729111e-05, + "loss": 0.1397, + "step": 19630 + }, + { + "epoch": 2.91, + "grad_norm": 0.27723273634910583, + "learning_rate": 7.111210228962236e-05, + "loss": 0.1427, + "step": 19640 + }, + { + "epoch": 2.91, + "grad_norm": 0.36210235953330994, + "learning_rate": 7.109723461195362e-05, + "loss": 0.1429, + "step": 19650 + }, + { + "epoch": 2.91, + "grad_norm": 0.48523059487342834, + "learning_rate": 7.108236693428487e-05, + "loss": 0.1331, + "step": 19660 + }, + { + "epoch": 2.92, + "grad_norm": 0.6052648425102234, + "learning_rate": 7.106749925661613e-05, + "loss": 0.1447, + "step": 19670 + }, + { + "epoch": 2.92, + "grad_norm": 0.2304733544588089, + "learning_rate": 7.105263157894737e-05, + "loss": 0.1311, + "step": 19680 + }, + { + "epoch": 2.92, + "grad_norm": 0.2759769558906555, + "learning_rate": 7.103776390127862e-05, + "loss": 0.1388, + "step": 19690 + }, + { + "epoch": 2.92, + "grad_norm": 0.9866233468055725, + "learning_rate": 7.102289622360988e-05, + "loss": 0.1448, + "step": 19700 + }, + { + "epoch": 2.92, + "grad_norm": 0.8615758419036865, + "learning_rate": 7.100802854594113e-05, + "loss": 0.1501, + "step": 19710 + }, + { + "epoch": 2.92, + "grad_norm": 0.49602025747299194, + "learning_rate": 7.099316086827237e-05, + "loss": 0.1455, + "step": 19720 + }, + { + "epoch": 2.92, + "grad_norm": 0.47930097579956055, + "learning_rate": 7.097829319060363e-05, + "loss": 0.1398, + "step": 19730 + }, + { + "epoch": 2.93, + "grad_norm": 0.4859764575958252, + "learning_rate": 7.096342551293488e-05, + "loss": 0.138, + "step": 19740 + }, + { + "epoch": 2.93, + "grad_norm": 0.7272999882698059, + "learning_rate": 7.094855783526614e-05, + "loss": 0.1323, + "step": 19750 + }, + { + "epoch": 2.93, + "grad_norm": 0.3457830250263214, + "learning_rate": 7.093369015759739e-05, + "loss": 0.1425, + "step": 19760 + }, + { + "epoch": 2.93, + "grad_norm": 0.23593512177467346, + "learning_rate": 7.091882247992863e-05, + "loss": 0.1293, + "step": 19770 + }, + { + "epoch": 2.93, + "grad_norm": 0.8734978437423706, + "learning_rate": 7.090395480225989e-05, + "loss": 0.136, + "step": 19780 + }, + { + "epoch": 2.93, + "grad_norm": 0.3335154950618744, + "learning_rate": 7.088908712459114e-05, + "loss": 0.1376, + "step": 19790 + }, + { + "epoch": 2.93, + "grad_norm": 0.34812235832214355, + "learning_rate": 7.08742194469224e-05, + "loss": 0.1357, + "step": 19800 + }, + { + "epoch": 2.94, + "grad_norm": 0.2229343056678772, + "learning_rate": 7.085935176925364e-05, + "loss": 0.1397, + "step": 19810 + }, + { + "epoch": 2.94, + "grad_norm": 0.29468947649002075, + "learning_rate": 7.08444840915849e-05, + "loss": 0.1386, + "step": 19820 + }, + { + "epoch": 2.94, + "grad_norm": 0.8740628361701965, + "learning_rate": 7.082961641391615e-05, + "loss": 0.1403, + "step": 19830 + }, + { + "epoch": 2.94, + "grad_norm": 0.5060070753097534, + "learning_rate": 7.08147487362474e-05, + "loss": 0.1439, + "step": 19840 + }, + { + "epoch": 2.94, + "grad_norm": 0.2761290967464447, + "learning_rate": 7.079988105857866e-05, + "loss": 0.1454, + "step": 19850 + }, + { + "epoch": 2.94, + "grad_norm": 0.2517363429069519, + "learning_rate": 7.07850133809099e-05, + "loss": 0.1381, + "step": 19860 + }, + { + "epoch": 2.95, + "grad_norm": 0.6545493006706238, + "learning_rate": 7.077014570324115e-05, + "loss": 0.1349, + "step": 19870 + }, + { + "epoch": 2.95, + "grad_norm": 0.6688421964645386, + "learning_rate": 7.075527802557241e-05, + "loss": 0.1384, + "step": 19880 + }, + { + "epoch": 2.95, + "grad_norm": 0.26555657386779785, + "learning_rate": 7.074041034790366e-05, + "loss": 0.1385, + "step": 19890 + }, + { + "epoch": 2.95, + "grad_norm": 0.32324767112731934, + "learning_rate": 7.072554267023492e-05, + "loss": 0.1362, + "step": 19900 + }, + { + "epoch": 2.95, + "grad_norm": 0.7072975039482117, + "learning_rate": 7.071067499256616e-05, + "loss": 0.1416, + "step": 19910 + }, + { + "epoch": 2.95, + "grad_norm": 0.9408333897590637, + "learning_rate": 7.069580731489741e-05, + "loss": 0.137, + "step": 19920 + }, + { + "epoch": 2.95, + "grad_norm": 0.7711098194122314, + "learning_rate": 7.068093963722867e-05, + "loss": 0.1379, + "step": 19930 + }, + { + "epoch": 2.96, + "grad_norm": 1.1616792678833008, + "learning_rate": 7.066607195955992e-05, + "loss": 0.1445, + "step": 19940 + }, + { + "epoch": 2.96, + "grad_norm": 0.3702917993068695, + "learning_rate": 7.065120428189117e-05, + "loss": 0.1516, + "step": 19950 + }, + { + "epoch": 2.96, + "grad_norm": 1.209560513496399, + "learning_rate": 7.063633660422242e-05, + "loss": 0.1455, + "step": 19960 + }, + { + "epoch": 2.96, + "grad_norm": 0.42448684573173523, + "learning_rate": 7.062146892655367e-05, + "loss": 0.1464, + "step": 19970 + }, + { + "epoch": 2.96, + "grad_norm": 0.7959136366844177, + "learning_rate": 7.060660124888493e-05, + "loss": 0.1467, + "step": 19980 + }, + { + "epoch": 2.96, + "grad_norm": 0.3942021131515503, + "learning_rate": 7.059173357121618e-05, + "loss": 0.14, + "step": 19990 + }, + { + "epoch": 2.96, + "grad_norm": 0.2557319104671478, + "learning_rate": 7.057686589354743e-05, + "loss": 0.1373, + "step": 20000 + }, + { + "epoch": 2.97, + "grad_norm": 0.9788160920143127, + "learning_rate": 7.05619982158787e-05, + "loss": 0.1346, + "step": 20010 + }, + { + "epoch": 2.97, + "grad_norm": 0.39195263385772705, + "learning_rate": 7.054713053820994e-05, + "loss": 0.1364, + "step": 20020 + }, + { + "epoch": 2.97, + "grad_norm": 0.37623974680900574, + "learning_rate": 7.053226286054119e-05, + "loss": 0.142, + "step": 20030 + }, + { + "epoch": 2.97, + "grad_norm": 0.6487099528312683, + "learning_rate": 7.051739518287243e-05, + "loss": 0.1424, + "step": 20040 + }, + { + "epoch": 2.97, + "grad_norm": 0.6612546443939209, + "learning_rate": 7.050252750520369e-05, + "loss": 0.1381, + "step": 20050 + }, + { + "epoch": 2.97, + "grad_norm": 0.30832764506340027, + "learning_rate": 7.048765982753494e-05, + "loss": 0.139, + "step": 20060 + }, + { + "epoch": 2.97, + "grad_norm": 0.5305371880531311, + "learning_rate": 7.04727921498662e-05, + "loss": 0.1415, + "step": 20070 + }, + { + "epoch": 2.98, + "grad_norm": 0.4207991361618042, + "learning_rate": 7.045792447219745e-05, + "loss": 0.1385, + "step": 20080 + }, + { + "epoch": 2.98, + "grad_norm": 0.2595292627811432, + "learning_rate": 7.044305679452869e-05, + "loss": 0.1286, + "step": 20090 + }, + { + "epoch": 2.98, + "grad_norm": 0.864605724811554, + "learning_rate": 7.042818911685996e-05, + "loss": 0.1325, + "step": 20100 + }, + { + "epoch": 2.98, + "grad_norm": 1.4537006616592407, + "learning_rate": 7.04133214391912e-05, + "loss": 0.135, + "step": 20110 + }, + { + "epoch": 2.98, + "grad_norm": 0.3340538740158081, + "learning_rate": 7.039845376152246e-05, + "loss": 0.1375, + "step": 20120 + }, + { + "epoch": 2.98, + "grad_norm": 1.0329761505126953, + "learning_rate": 7.03835860838537e-05, + "loss": 0.137, + "step": 20130 + }, + { + "epoch": 2.99, + "grad_norm": 0.6245138645172119, + "learning_rate": 7.036871840618495e-05, + "loss": 0.1372, + "step": 20140 + }, + { + "epoch": 2.99, + "grad_norm": 0.35035985708236694, + "learning_rate": 7.03538507285162e-05, + "loss": 0.1395, + "step": 20150 + }, + { + "epoch": 2.99, + "grad_norm": 0.351352721452713, + "learning_rate": 7.033898305084746e-05, + "loss": 0.1433, + "step": 20160 + }, + { + "epoch": 2.99, + "grad_norm": 0.4944305717945099, + "learning_rate": 7.032411537317872e-05, + "loss": 0.1388, + "step": 20170 + }, + { + "epoch": 2.99, + "grad_norm": 0.8752815127372742, + "learning_rate": 7.030924769550996e-05, + "loss": 0.1341, + "step": 20180 + }, + { + "epoch": 2.99, + "grad_norm": 0.30437782406806946, + "learning_rate": 7.029438001784123e-05, + "loss": 0.1306, + "step": 20190 + }, + { + "epoch": 2.99, + "grad_norm": 0.797670841217041, + "learning_rate": 7.027951234017247e-05, + "loss": 0.1469, + "step": 20200 + }, + { + "epoch": 3.0, + "grad_norm": 1.6191693544387817, + "learning_rate": 7.026464466250372e-05, + "loss": 0.1346, + "step": 20210 + }, + { + "epoch": 3.0, + "grad_norm": 0.9991375803947449, + "learning_rate": 7.024977698483498e-05, + "loss": 0.1402, + "step": 20220 + }, + { + "epoch": 3.0, + "grad_norm": 1.0731664896011353, + "learning_rate": 7.023490930716622e-05, + "loss": 0.1451, + "step": 20230 + }, + { + "epoch": 3.0, + "eval_loss": 0.15701617300510406, + "eval_runtime": 2481.1974, + "eval_samples_per_second": 235.24, + "eval_steps_per_second": 3.676, + "step": 20238 + } + ], + "logging_steps": 10, + "max_steps": 67460, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 8.63158939557036e+18, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}