{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.04182968498155174, "eval_steps": 500, "global_step": 23000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.818681955719641e-05, "grad_norm": 2.1063554286956787, "learning_rate": 0.0002, "loss": 1.9357, "step": 10 }, { "epoch": 3.637363911439282e-05, "grad_norm": 0.9359453320503235, "learning_rate": 0.0002, "loss": 0.2208, "step": 20 }, { "epoch": 5.4560458671589234e-05, "grad_norm": 0.5420117378234863, "learning_rate": 0.0002, "loss": 0.1459, "step": 30 }, { "epoch": 7.274727822878565e-05, "grad_norm": 0.05442357063293457, "learning_rate": 0.0002, "loss": 0.0823, "step": 40 }, { "epoch": 9.093409778598205e-05, "grad_norm": 0.0005907653248868883, "learning_rate": 0.0002, "loss": 0.0005, "step": 50 }, { "epoch": 0.00010912091734317847, "grad_norm": 0.26516178250312805, "learning_rate": 0.0002, "loss": 1.0686, "step": 60 }, { "epoch": 0.00012730773690037487, "grad_norm": 0.44067099690437317, "learning_rate": 0.0002, "loss": 0.2613, "step": 70 }, { "epoch": 0.0001454945564575713, "grad_norm": 0.09356075525283813, "learning_rate": 0.0002, "loss": 0.1415, "step": 80 }, { "epoch": 0.0001636813760147677, "grad_norm": 0.017799921333789825, "learning_rate": 0.0002, "loss": 0.1013, "step": 90 }, { "epoch": 0.0001818681955719641, "grad_norm": 0.0018534553237259388, "learning_rate": 0.0002, "loss": 0.0001, "step": 100 }, { "epoch": 0.00020005501512916052, "grad_norm": 0.35472020506858826, "learning_rate": 0.0002, "loss": 0.73, "step": 110 }, { "epoch": 0.00021824183468635694, "grad_norm": 0.3880878686904907, "learning_rate": 0.0002, "loss": 0.1424, "step": 120 }, { "epoch": 0.00023642865424355333, "grad_norm": 0.19027432799339294, "learning_rate": 0.0002, "loss": 0.1173, "step": 130 }, { "epoch": 0.00025461547380074975, "grad_norm": 0.019047321751713753, "learning_rate": 0.0002, "loss": 0.0977, "step": 140 }, { "epoch": 0.00027280229335794617, "grad_norm": 0.0003795044613070786, "learning_rate": 0.0002, "loss": 0.0007, "step": 150 }, { "epoch": 0.0002909891129151426, "grad_norm": 0.08740618824958801, "learning_rate": 0.0002, "loss": 0.801, "step": 160 }, { "epoch": 0.000309175932472339, "grad_norm": 0.2661634087562561, "learning_rate": 0.0002, "loss": 0.1274, "step": 170 }, { "epoch": 0.0003273627520295354, "grad_norm": 0.05828547850251198, "learning_rate": 0.0002, "loss": 0.1184, "step": 180 }, { "epoch": 0.0003455495715867318, "grad_norm": 0.02175055630505085, "learning_rate": 0.0002, "loss": 0.0752, "step": 190 }, { "epoch": 0.0003637363911439282, "grad_norm": 0.0009504792396910489, "learning_rate": 0.0002, "loss": 0.0005, "step": 200 }, { "epoch": 0.0003819232107011246, "grad_norm": 0.25059741735458374, "learning_rate": 0.0002, "loss": 0.5125, "step": 210 }, { "epoch": 0.00040011003025832104, "grad_norm": 0.13256193697452545, "learning_rate": 0.0002, "loss": 0.1014, "step": 220 }, { "epoch": 0.00041829684981551746, "grad_norm": 0.09446375072002411, "learning_rate": 0.0002, "loss": 0.0896, "step": 230 }, { "epoch": 0.0004364836693727139, "grad_norm": 0.019389621913433075, "learning_rate": 0.0002, "loss": 0.0726, "step": 240 }, { "epoch": 0.0004546704889299103, "grad_norm": 0.0032304900232702494, "learning_rate": 0.0002, "loss": 0.0023, "step": 250 }, { "epoch": 0.00047285730848710666, "grad_norm": 2.5549609661102295, "learning_rate": 0.0002, "loss": 0.3884, "step": 260 }, { "epoch": 0.0004910441280443031, "grad_norm": 0.44937047362327576, "learning_rate": 0.0002, "loss": 0.1071, "step": 270 }, { "epoch": 0.0005092309476014995, "grad_norm": 0.1509999781847, "learning_rate": 0.0002, "loss": 0.0979, "step": 280 }, { "epoch": 0.0005274177671586959, "grad_norm": 0.006468054372817278, "learning_rate": 0.0002, "loss": 0.0611, "step": 290 }, { "epoch": 0.0005456045867158923, "grad_norm": 0.0002916739322245121, "learning_rate": 0.0002, "loss": 0.001, "step": 300 }, { "epoch": 0.0005637914062730887, "grad_norm": 0.23081810772418976, "learning_rate": 0.0002, "loss": 0.5894, "step": 310 }, { "epoch": 0.0005819782258302852, "grad_norm": 0.22755394876003265, "learning_rate": 0.0002, "loss": 0.114, "step": 320 }, { "epoch": 0.0006001650453874816, "grad_norm": 0.49973106384277344, "learning_rate": 0.0002, "loss": 0.093, "step": 330 }, { "epoch": 0.000618351864944678, "grad_norm": 0.08789435774087906, "learning_rate": 0.0002, "loss": 0.0745, "step": 340 }, { "epoch": 0.0006365386845018744, "grad_norm": 0.0058497479185462, "learning_rate": 0.0002, "loss": 0.0007, "step": 350 }, { "epoch": 0.0006547255040590708, "grad_norm": 0.30569636821746826, "learning_rate": 0.0002, "loss": 0.5169, "step": 360 }, { "epoch": 0.0006729123236162671, "grad_norm": 0.2783024311065674, "learning_rate": 0.0002, "loss": 0.13, "step": 370 }, { "epoch": 0.0006910991431734636, "grad_norm": 0.13052967190742493, "learning_rate": 0.0002, "loss": 0.0907, "step": 380 }, { "epoch": 0.00070928596273066, "grad_norm": 0.15066476166248322, "learning_rate": 0.0002, "loss": 0.0996, "step": 390 }, { "epoch": 0.0007274727822878564, "grad_norm": 0.0005865198327228427, "learning_rate": 0.0002, "loss": 0.0021, "step": 400 }, { "epoch": 0.0007456596018450528, "grad_norm": 0.31872233748435974, "learning_rate": 0.0002, "loss": 0.4507, "step": 410 }, { "epoch": 0.0007638464214022492, "grad_norm": 0.08874880522489548, "learning_rate": 0.0002, "loss": 0.136, "step": 420 }, { "epoch": 0.0007820332409594457, "grad_norm": 0.10985178500413895, "learning_rate": 0.0002, "loss": 0.0992, "step": 430 }, { "epoch": 0.0008002200605166421, "grad_norm": 0.10776215046644211, "learning_rate": 0.0002, "loss": 0.0661, "step": 440 }, { "epoch": 0.0008184068800738385, "grad_norm": 0.006612936966121197, "learning_rate": 0.0002, "loss": 0.0009, "step": 450 }, { "epoch": 0.0008365936996310349, "grad_norm": 0.2757071256637573, "learning_rate": 0.0002, "loss": 0.6376, "step": 460 }, { "epoch": 0.0008547805191882313, "grad_norm": 0.24748466908931732, "learning_rate": 0.0002, "loss": 0.1241, "step": 470 }, { "epoch": 0.0008729673387454278, "grad_norm": 0.1035066694021225, "learning_rate": 0.0002, "loss": 0.1008, "step": 480 }, { "epoch": 0.0008911541583026242, "grad_norm": 0.06515783071517944, "learning_rate": 0.0002, "loss": 0.0711, "step": 490 }, { "epoch": 0.0009093409778598206, "grad_norm": 0.011224807240068913, "learning_rate": 0.0002, "loss": 0.0004, "step": 500 }, { "epoch": 0.000927527797417017, "grad_norm": 0.2669332027435303, "learning_rate": 0.0002, "loss": 0.5618, "step": 510 }, { "epoch": 0.0009457146169742133, "grad_norm": 0.26048392057418823, "learning_rate": 0.0002, "loss": 0.1259, "step": 520 }, { "epoch": 0.0009639014365314097, "grad_norm": 0.22928836941719055, "learning_rate": 0.0002, "loss": 0.0956, "step": 530 }, { "epoch": 0.0009820882560886062, "grad_norm": 0.084063321352005, "learning_rate": 0.0002, "loss": 0.0708, "step": 540 }, { "epoch": 0.0010002750756458027, "grad_norm": 0.004612344317138195, "learning_rate": 0.0002, "loss": 0.0007, "step": 550 }, { "epoch": 0.001018461895202999, "grad_norm": 0.3866584599018097, "learning_rate": 0.0002, "loss": 0.5406, "step": 560 }, { "epoch": 0.0010366487147601955, "grad_norm": 0.32303065061569214, "learning_rate": 0.0002, "loss": 0.1001, "step": 570 }, { "epoch": 0.0010548355343173918, "grad_norm": 0.09439560770988464, "learning_rate": 0.0002, "loss": 0.1051, "step": 580 }, { "epoch": 0.0010730223538745881, "grad_norm": 0.028145521879196167, "learning_rate": 0.0002, "loss": 0.0638, "step": 590 }, { "epoch": 0.0010912091734317847, "grad_norm": 0.00048497263924218714, "learning_rate": 0.0002, "loss": 0.002, "step": 600 }, { "epoch": 0.001109395992988981, "grad_norm": 0.32391539216041565, "learning_rate": 0.0002, "loss": 0.5483, "step": 610 }, { "epoch": 0.0011275828125461775, "grad_norm": 0.02977031283080578, "learning_rate": 0.0002, "loss": 0.1264, "step": 620 }, { "epoch": 0.0011457696321033738, "grad_norm": 0.07332426309585571, "learning_rate": 0.0002, "loss": 0.1018, "step": 630 }, { "epoch": 0.0011639564516605703, "grad_norm": 0.05653443560004234, "learning_rate": 0.0002, "loss": 0.0666, "step": 640 }, { "epoch": 0.0011821432712177666, "grad_norm": 0.0010635281214490533, "learning_rate": 0.0002, "loss": 0.0009, "step": 650 }, { "epoch": 0.0012003300907749632, "grad_norm": 0.04933600872755051, "learning_rate": 0.0002, "loss": 0.3902, "step": 660 }, { "epoch": 0.0012185169103321595, "grad_norm": 0.14713574945926666, "learning_rate": 0.0002, "loss": 0.0905, "step": 670 }, { "epoch": 0.001236703729889356, "grad_norm": 0.05463952198624611, "learning_rate": 0.0002, "loss": 0.0909, "step": 680 }, { "epoch": 0.0012548905494465523, "grad_norm": 0.10299955308437347, "learning_rate": 0.0002, "loss": 0.07, "step": 690 }, { "epoch": 0.0012730773690037488, "grad_norm": 0.022791124880313873, "learning_rate": 0.0002, "loss": 0.0027, "step": 700 }, { "epoch": 0.0012912641885609452, "grad_norm": 0.27977490425109863, "learning_rate": 0.0002, "loss": 0.4421, "step": 710 }, { "epoch": 0.0013094510081181417, "grad_norm": 0.2346329241991043, "learning_rate": 0.0002, "loss": 0.1263, "step": 720 }, { "epoch": 0.001327637827675338, "grad_norm": 0.09294597059488297, "learning_rate": 0.0002, "loss": 0.096, "step": 730 }, { "epoch": 0.0013458246472325343, "grad_norm": 0.10317150503396988, "learning_rate": 0.0002, "loss": 0.0727, "step": 740 }, { "epoch": 0.0013640114667897308, "grad_norm": 0.001372635131701827, "learning_rate": 0.0002, "loss": 0.001, "step": 750 }, { "epoch": 0.0013821982863469271, "grad_norm": 0.10563486814498901, "learning_rate": 0.0002, "loss": 0.596, "step": 760 }, { "epoch": 0.0014003851059041237, "grad_norm": 0.14429838955402374, "learning_rate": 0.0002, "loss": 0.1178, "step": 770 }, { "epoch": 0.00141857192546132, "grad_norm": 0.0848163515329361, "learning_rate": 0.0002, "loss": 0.1008, "step": 780 }, { "epoch": 0.0014367587450185165, "grad_norm": 0.07259710133075714, "learning_rate": 0.0002, "loss": 0.069, "step": 790 }, { "epoch": 0.0014549455645757128, "grad_norm": 0.0019098519114777446, "learning_rate": 0.0002, "loss": 0.0023, "step": 800 }, { "epoch": 0.0014731323841329093, "grad_norm": 0.2433256059885025, "learning_rate": 0.0002, "loss": 0.2937, "step": 810 }, { "epoch": 0.0014913192036901056, "grad_norm": 0.04093409329652786, "learning_rate": 0.0002, "loss": 0.1133, "step": 820 }, { "epoch": 0.0015095060232473022, "grad_norm": 0.0480966717004776, "learning_rate": 0.0002, "loss": 0.0969, "step": 830 }, { "epoch": 0.0015276928428044985, "grad_norm": 0.14327965676784515, "learning_rate": 0.0002, "loss": 0.0866, "step": 840 }, { "epoch": 0.001545879662361695, "grad_norm": 0.001585015095770359, "learning_rate": 0.0002, "loss": 0.0042, "step": 850 }, { "epoch": 0.0015640664819188913, "grad_norm": 0.1842886209487915, "learning_rate": 0.0002, "loss": 0.3273, "step": 860 }, { "epoch": 0.0015822533014760878, "grad_norm": 0.09671049565076828, "learning_rate": 0.0002, "loss": 0.1079, "step": 870 }, { "epoch": 0.0016004401210332842, "grad_norm": 0.2730088233947754, "learning_rate": 0.0002, "loss": 0.1018, "step": 880 }, { "epoch": 0.0016186269405904805, "grad_norm": 0.11702803522348404, "learning_rate": 0.0002, "loss": 0.0758, "step": 890 }, { "epoch": 0.001636813760147677, "grad_norm": 0.004438066389411688, "learning_rate": 0.0002, "loss": 0.0033, "step": 900 }, { "epoch": 0.0016550005797048733, "grad_norm": 0.18424616754055023, "learning_rate": 0.0002, "loss": 0.4028, "step": 910 }, { "epoch": 0.0016731873992620698, "grad_norm": 0.12502820789813995, "learning_rate": 0.0002, "loss": 0.0979, "step": 920 }, { "epoch": 0.0016913742188192661, "grad_norm": 0.05109328031539917, "learning_rate": 0.0002, "loss": 0.0889, "step": 930 }, { "epoch": 0.0017095610383764627, "grad_norm": 0.18566183745861053, "learning_rate": 0.0002, "loss": 0.0833, "step": 940 }, { "epoch": 0.001727747857933659, "grad_norm": 0.0012954511912539601, "learning_rate": 0.0002, "loss": 0.0029, "step": 950 }, { "epoch": 0.0017459346774908555, "grad_norm": 0.06683014333248138, "learning_rate": 0.0002, "loss": 0.4614, "step": 960 }, { "epoch": 0.0017641214970480518, "grad_norm": 0.27773013710975647, "learning_rate": 0.0002, "loss": 0.1131, "step": 970 }, { "epoch": 0.0017823083166052483, "grad_norm": 0.1999790072441101, "learning_rate": 0.0002, "loss": 0.089, "step": 980 }, { "epoch": 0.0018004951361624446, "grad_norm": 0.09625103324651718, "learning_rate": 0.0002, "loss": 0.0739, "step": 990 }, { "epoch": 0.0018186819557196412, "grad_norm": 0.005470380187034607, "learning_rate": 0.0002, "loss": 0.0012, "step": 1000 }, { "epoch": 0.0018368687752768375, "grad_norm": 0.038832616060972214, "learning_rate": 0.0002, "loss": 0.5521, "step": 1010 }, { "epoch": 0.001855055594834034, "grad_norm": 0.1903093159198761, "learning_rate": 0.0002, "loss": 0.1237, "step": 1020 }, { "epoch": 0.0018732424143912303, "grad_norm": 0.031102774664759636, "learning_rate": 0.0002, "loss": 0.0866, "step": 1030 }, { "epoch": 0.0018914292339484266, "grad_norm": 0.043983202427625656, "learning_rate": 0.0002, "loss": 0.0611, "step": 1040 }, { "epoch": 0.0019096160535056232, "grad_norm": 0.0002974902163259685, "learning_rate": 0.0002, "loss": 0.0035, "step": 1050 }, { "epoch": 0.0019278028730628195, "grad_norm": 0.1936149299144745, "learning_rate": 0.0002, "loss": 0.3019, "step": 1060 }, { "epoch": 0.001945989692620016, "grad_norm": 0.15767355263233185, "learning_rate": 0.0002, "loss": 0.108, "step": 1070 }, { "epoch": 0.0019641765121772123, "grad_norm": 0.08244495838880539, "learning_rate": 0.0002, "loss": 0.091, "step": 1080 }, { "epoch": 0.0019823633317344086, "grad_norm": 0.15848897397518158, "learning_rate": 0.0002, "loss": 0.0655, "step": 1090 }, { "epoch": 0.0020005501512916054, "grad_norm": 0.0011951205087825656, "learning_rate": 0.0002, "loss": 0.0052, "step": 1100 }, { "epoch": 0.0020187369708488017, "grad_norm": 0.13027112185955048, "learning_rate": 0.0002, "loss": 0.2943, "step": 1110 }, { "epoch": 0.002036923790405998, "grad_norm": 0.19413979351520538, "learning_rate": 0.0002, "loss": 0.1329, "step": 1120 }, { "epoch": 0.0020551106099631943, "grad_norm": 0.08515465259552002, "learning_rate": 0.0002, "loss": 0.0921, "step": 1130 }, { "epoch": 0.002073297429520391, "grad_norm": 0.1244177296757698, "learning_rate": 0.0002, "loss": 0.0678, "step": 1140 }, { "epoch": 0.0020914842490775873, "grad_norm": 0.0016714326338842511, "learning_rate": 0.0002, "loss": 0.0035, "step": 1150 }, { "epoch": 0.0021096710686347836, "grad_norm": 0.24979737401008606, "learning_rate": 0.0002, "loss": 0.2643, "step": 1160 }, { "epoch": 0.00212785788819198, "grad_norm": 0.14143353700637817, "learning_rate": 0.0002, "loss": 0.1037, "step": 1170 }, { "epoch": 0.0021460447077491763, "grad_norm": 0.033794257789850235, "learning_rate": 0.0002, "loss": 0.087, "step": 1180 }, { "epoch": 0.002164231527306373, "grad_norm": 0.11503162235021591, "learning_rate": 0.0002, "loss": 0.0659, "step": 1190 }, { "epoch": 0.0021824183468635693, "grad_norm": 0.0014654065016657114, "learning_rate": 0.0002, "loss": 0.0056, "step": 1200 }, { "epoch": 0.0022006051664207656, "grad_norm": 0.13292767107486725, "learning_rate": 0.0002, "loss": 0.2956, "step": 1210 }, { "epoch": 0.002218791985977962, "grad_norm": 0.15238040685653687, "learning_rate": 0.0002, "loss": 0.1122, "step": 1220 }, { "epoch": 0.0022369788055351587, "grad_norm": 0.045078523457050323, "learning_rate": 0.0002, "loss": 0.091, "step": 1230 }, { "epoch": 0.002255165625092355, "grad_norm": 0.11438468098640442, "learning_rate": 0.0002, "loss": 0.0754, "step": 1240 }, { "epoch": 0.0022733524446495513, "grad_norm": 0.001236733514815569, "learning_rate": 0.0002, "loss": 0.004, "step": 1250 }, { "epoch": 0.0022915392642067476, "grad_norm": 0.23386552929878235, "learning_rate": 0.0002, "loss": 0.351, "step": 1260 }, { "epoch": 0.0023097260837639444, "grad_norm": 0.030786139890551567, "learning_rate": 0.0002, "loss": 0.1074, "step": 1270 }, { "epoch": 0.0023279129033211407, "grad_norm": 0.150347501039505, "learning_rate": 0.0002, "loss": 0.1064, "step": 1280 }, { "epoch": 0.002346099722878337, "grad_norm": 0.1402382105588913, "learning_rate": 0.0002, "loss": 0.0675, "step": 1290 }, { "epoch": 0.0023642865424355333, "grad_norm": 0.0006117303855717182, "learning_rate": 0.0002, "loss": 0.0031, "step": 1300 }, { "epoch": 0.00238247336199273, "grad_norm": 0.16031372547149658, "learning_rate": 0.0002, "loss": 0.4344, "step": 1310 }, { "epoch": 0.0024006601815499263, "grad_norm": 0.11017303168773651, "learning_rate": 0.0002, "loss": 0.1147, "step": 1320 }, { "epoch": 0.0024188470011071227, "grad_norm": 0.055746905505657196, "learning_rate": 0.0002, "loss": 0.093, "step": 1330 }, { "epoch": 0.002437033820664319, "grad_norm": 0.09806664288043976, "learning_rate": 0.0002, "loss": 0.0682, "step": 1340 }, { "epoch": 0.0024552206402215153, "grad_norm": 0.000555588339921087, "learning_rate": 0.0002, "loss": 0.0045, "step": 1350 }, { "epoch": 0.002473407459778712, "grad_norm": 0.04899182915687561, "learning_rate": 0.0002, "loss": 0.3454, "step": 1360 }, { "epoch": 0.0024915942793359083, "grad_norm": 0.02870030514895916, "learning_rate": 0.0002, "loss": 0.1036, "step": 1370 }, { "epoch": 0.0025097810988931046, "grad_norm": 0.08591730147600174, "learning_rate": 0.0002, "loss": 0.0962, "step": 1380 }, { "epoch": 0.002527967918450301, "grad_norm": 0.1169242337346077, "learning_rate": 0.0002, "loss": 0.0627, "step": 1390 }, { "epoch": 0.0025461547380074977, "grad_norm": 0.0008637752034701407, "learning_rate": 0.0002, "loss": 0.0025, "step": 1400 }, { "epoch": 0.002564341557564694, "grad_norm": 0.11741841584444046, "learning_rate": 0.0002, "loss": 0.3703, "step": 1410 }, { "epoch": 0.0025825283771218903, "grad_norm": 0.05232485383749008, "learning_rate": 0.0002, "loss": 0.1072, "step": 1420 }, { "epoch": 0.0026007151966790866, "grad_norm": 0.025201110169291496, "learning_rate": 0.0002, "loss": 0.0893, "step": 1430 }, { "epoch": 0.0026189020162362834, "grad_norm": 0.11462239921092987, "learning_rate": 0.0002, "loss": 0.0826, "step": 1440 }, { "epoch": 0.0026370888357934797, "grad_norm": 0.002194227883592248, "learning_rate": 0.0002, "loss": 0.0049, "step": 1450 }, { "epoch": 0.002655275655350676, "grad_norm": 0.05786404758691788, "learning_rate": 0.0002, "loss": 0.3187, "step": 1460 }, { "epoch": 0.0026734624749078723, "grad_norm": 0.03776915743947029, "learning_rate": 0.0002, "loss": 0.1002, "step": 1470 }, { "epoch": 0.0026916492944650686, "grad_norm": 0.08628734946250916, "learning_rate": 0.0002, "loss": 0.0933, "step": 1480 }, { "epoch": 0.0027098361140222653, "grad_norm": 0.0933455228805542, "learning_rate": 0.0002, "loss": 0.0712, "step": 1490 }, { "epoch": 0.0027280229335794617, "grad_norm": 0.0007446192903444171, "learning_rate": 0.0002, "loss": 0.003, "step": 1500 }, { "epoch": 0.002746209753136658, "grad_norm": 0.04412281885743141, "learning_rate": 0.0002, "loss": 0.3738, "step": 1510 }, { "epoch": 0.0027643965726938543, "grad_norm": 0.04729326814413071, "learning_rate": 0.0002, "loss": 0.1015, "step": 1520 }, { "epoch": 0.002782583392251051, "grad_norm": 0.04822024703025818, "learning_rate": 0.0002, "loss": 0.0913, "step": 1530 }, { "epoch": 0.0028007702118082473, "grad_norm": 0.15468090772628784, "learning_rate": 0.0002, "loss": 0.0794, "step": 1540 }, { "epoch": 0.0028189570313654436, "grad_norm": 0.0011828596470877528, "learning_rate": 0.0002, "loss": 0.0089, "step": 1550 }, { "epoch": 0.00283714385092264, "grad_norm": 0.030639037489891052, "learning_rate": 0.0002, "loss": 0.3382, "step": 1560 }, { "epoch": 0.0028553306704798367, "grad_norm": 0.08429472148418427, "learning_rate": 0.0002, "loss": 0.1075, "step": 1570 }, { "epoch": 0.002873517490037033, "grad_norm": 0.056431323289871216, "learning_rate": 0.0002, "loss": 0.0946, "step": 1580 }, { "epoch": 0.0028917043095942293, "grad_norm": 0.1799512803554535, "learning_rate": 0.0002, "loss": 0.0795, "step": 1590 }, { "epoch": 0.0029098911291514256, "grad_norm": 0.0018818675307556987, "learning_rate": 0.0002, "loss": 0.0082, "step": 1600 }, { "epoch": 0.002928077948708622, "grad_norm": 0.061398155987262726, "learning_rate": 0.0002, "loss": 0.3414, "step": 1610 }, { "epoch": 0.0029462647682658187, "grad_norm": 0.0657019093632698, "learning_rate": 0.0002, "loss": 0.1082, "step": 1620 }, { "epoch": 0.002964451587823015, "grad_norm": 0.04701487720012665, "learning_rate": 0.0002, "loss": 0.0918, "step": 1630 }, { "epoch": 0.0029826384073802113, "grad_norm": 0.1834430694580078, "learning_rate": 0.0002, "loss": 0.081, "step": 1640 }, { "epoch": 0.0030008252269374076, "grad_norm": 0.004841644782572985, "learning_rate": 0.0002, "loss": 0.0138, "step": 1650 }, { "epoch": 0.0030190120464946043, "grad_norm": 0.05793444439768791, "learning_rate": 0.0002, "loss": 0.2981, "step": 1660 }, { "epoch": 0.0030371988660518007, "grad_norm": 0.049123138189315796, "learning_rate": 0.0002, "loss": 0.1072, "step": 1670 }, { "epoch": 0.003055385685608997, "grad_norm": 0.033852141350507736, "learning_rate": 0.0002, "loss": 0.093, "step": 1680 }, { "epoch": 0.0030735725051661933, "grad_norm": 0.16161279380321503, "learning_rate": 0.0002, "loss": 0.084, "step": 1690 }, { "epoch": 0.00309175932472339, "grad_norm": 0.0011225020280107856, "learning_rate": 0.0002, "loss": 0.0059, "step": 1700 }, { "epoch": 0.0031099461442805863, "grad_norm": 0.05849582701921463, "learning_rate": 0.0002, "loss": 0.3878, "step": 1710 }, { "epoch": 0.0031281329638377826, "grad_norm": 0.033466637134552, "learning_rate": 0.0002, "loss": 0.1096, "step": 1720 }, { "epoch": 0.003146319783394979, "grad_norm": 0.03488466143608093, "learning_rate": 0.0002, "loss": 0.0895, "step": 1730 }, { "epoch": 0.0031645066029521757, "grad_norm": 0.15636079013347626, "learning_rate": 0.0002, "loss": 0.0716, "step": 1740 }, { "epoch": 0.003182693422509372, "grad_norm": 0.001519509358331561, "learning_rate": 0.0002, "loss": 0.0062, "step": 1750 }, { "epoch": 0.0032008802420665683, "grad_norm": 0.04979783296585083, "learning_rate": 0.0002, "loss": 0.3409, "step": 1760 }, { "epoch": 0.0032190670616237646, "grad_norm": 0.09706272929906845, "learning_rate": 0.0002, "loss": 0.1052, "step": 1770 }, { "epoch": 0.003237253881180961, "grad_norm": 0.08768483251333237, "learning_rate": 0.0002, "loss": 0.0938, "step": 1780 }, { "epoch": 0.0032554407007381577, "grad_norm": 0.20421457290649414, "learning_rate": 0.0002, "loss": 0.085, "step": 1790 }, { "epoch": 0.003273627520295354, "grad_norm": 0.0024727964773774147, "learning_rate": 0.0002, "loss": 0.0147, "step": 1800 }, { "epoch": 0.0032918143398525503, "grad_norm": 0.04270516335964203, "learning_rate": 0.0002, "loss": 0.2872, "step": 1810 }, { "epoch": 0.0033100011594097466, "grad_norm": 0.08055799454450607, "learning_rate": 0.0002, "loss": 0.0992, "step": 1820 }, { "epoch": 0.0033281879789669433, "grad_norm": 0.02607434056699276, "learning_rate": 0.0002, "loss": 0.0803, "step": 1830 }, { "epoch": 0.0033463747985241397, "grad_norm": 0.16260816156864166, "learning_rate": 0.0002, "loss": 0.0753, "step": 1840 }, { "epoch": 0.003364561618081336, "grad_norm": 0.004690333269536495, "learning_rate": 0.0002, "loss": 0.012, "step": 1850 }, { "epoch": 0.0033827484376385323, "grad_norm": 0.041513338685035706, "learning_rate": 0.0002, "loss": 0.2491, "step": 1860 }, { "epoch": 0.003400935257195729, "grad_norm": 0.08935420960187912, "learning_rate": 0.0002, "loss": 0.1001, "step": 1870 }, { "epoch": 0.0034191220767529253, "grad_norm": 0.03826737776398659, "learning_rate": 0.0002, "loss": 0.0877, "step": 1880 }, { "epoch": 0.0034373088963101216, "grad_norm": 0.19423778355121613, "learning_rate": 0.0002, "loss": 0.0797, "step": 1890 }, { "epoch": 0.003455495715867318, "grad_norm": 0.003520288970321417, "learning_rate": 0.0002, "loss": 0.013, "step": 1900 }, { "epoch": 0.0034736825354245143, "grad_norm": 0.14648132026195526, "learning_rate": 0.0002, "loss": 0.3209, "step": 1910 }, { "epoch": 0.003491869354981711, "grad_norm": 0.03780071437358856, "learning_rate": 0.0002, "loss": 0.0934, "step": 1920 }, { "epoch": 0.0035100561745389073, "grad_norm": 0.05014612153172493, "learning_rate": 0.0002, "loss": 0.082, "step": 1930 }, { "epoch": 0.0035282429940961036, "grad_norm": 0.12917590141296387, "learning_rate": 0.0002, "loss": 0.0733, "step": 1940 }, { "epoch": 0.0035464298136533, "grad_norm": 0.0030132795218378305, "learning_rate": 0.0002, "loss": 0.0111, "step": 1950 }, { "epoch": 0.0035646166332104967, "grad_norm": 0.03008626028895378, "learning_rate": 0.0002, "loss": 0.2126, "step": 1960 }, { "epoch": 0.003582803452767693, "grad_norm": 0.0915503203868866, "learning_rate": 0.0002, "loss": 0.1097, "step": 1970 }, { "epoch": 0.0036009902723248893, "grad_norm": 0.06607015430927277, "learning_rate": 0.0002, "loss": 0.0932, "step": 1980 }, { "epoch": 0.0036191770918820856, "grad_norm": 0.18796613812446594, "learning_rate": 0.0002, "loss": 0.083, "step": 1990 }, { "epoch": 0.0036373639114392823, "grad_norm": 0.0022257096134126186, "learning_rate": 0.0002, "loss": 0.0147, "step": 2000 }, { "epoch": 0.0036555507309964787, "grad_norm": 0.0687415823340416, "learning_rate": 0.0002, "loss": 0.2604, "step": 2010 }, { "epoch": 0.003673737550553675, "grad_norm": 0.025175679475069046, "learning_rate": 0.0002, "loss": 0.0998, "step": 2020 }, { "epoch": 0.0036919243701108713, "grad_norm": 0.04275168478488922, "learning_rate": 0.0002, "loss": 0.0898, "step": 2030 }, { "epoch": 0.003710111189668068, "grad_norm": 0.17306455969810486, "learning_rate": 0.0002, "loss": 0.0793, "step": 2040 }, { "epoch": 0.0037282980092252643, "grad_norm": 0.007826454006135464, "learning_rate": 0.0002, "loss": 0.011, "step": 2050 }, { "epoch": 0.0037464848287824606, "grad_norm": 0.06461178511381149, "learning_rate": 0.0002, "loss": 0.2597, "step": 2060 }, { "epoch": 0.003764671648339657, "grad_norm": 0.061357177793979645, "learning_rate": 0.0002, "loss": 0.1001, "step": 2070 }, { "epoch": 0.0037828584678968533, "grad_norm": 0.029154235497117043, "learning_rate": 0.0002, "loss": 0.0859, "step": 2080 }, { "epoch": 0.00380104528745405, "grad_norm": 0.1350340098142624, "learning_rate": 0.0002, "loss": 0.0756, "step": 2090 }, { "epoch": 0.0038192321070112463, "grad_norm": 0.0017614173702895641, "learning_rate": 0.0002, "loss": 0.0058, "step": 2100 }, { "epoch": 0.0038374189265684426, "grad_norm": 0.024254316464066505, "learning_rate": 0.0002, "loss": 0.3349, "step": 2110 }, { "epoch": 0.003855605746125639, "grad_norm": 0.07142530381679535, "learning_rate": 0.0002, "loss": 0.0953, "step": 2120 }, { "epoch": 0.0038737925656828357, "grad_norm": 0.05570175498723984, "learning_rate": 0.0002, "loss": 0.0796, "step": 2130 }, { "epoch": 0.003891979385240032, "grad_norm": 0.16996875405311584, "learning_rate": 0.0002, "loss": 0.0782, "step": 2140 }, { "epoch": 0.003910166204797228, "grad_norm": 0.0058751595206558704, "learning_rate": 0.0002, "loss": 0.0206, "step": 2150 }, { "epoch": 0.003928353024354425, "grad_norm": 0.029807811602950096, "learning_rate": 0.0002, "loss": 0.1926, "step": 2160 }, { "epoch": 0.003946539843911621, "grad_norm": 0.11123469471931458, "learning_rate": 0.0002, "loss": 0.1082, "step": 2170 }, { "epoch": 0.003964726663468817, "grad_norm": 0.074626125395298, "learning_rate": 0.0002, "loss": 0.081, "step": 2180 }, { "epoch": 0.003982913483026014, "grad_norm": 0.17397737503051758, "learning_rate": 0.0002, "loss": 0.0729, "step": 2190 }, { "epoch": 0.004001100302583211, "grad_norm": 0.007995887659490108, "learning_rate": 0.0002, "loss": 0.022, "step": 2200 }, { "epoch": 0.004019287122140407, "grad_norm": 0.039921898394823074, "learning_rate": 0.0002, "loss": 0.1883, "step": 2210 }, { "epoch": 0.004037473941697603, "grad_norm": 0.07736324518918991, "learning_rate": 0.0002, "loss": 0.0941, "step": 2220 }, { "epoch": 0.0040556607612548, "grad_norm": 0.0867881178855896, "learning_rate": 0.0002, "loss": 0.0873, "step": 2230 }, { "epoch": 0.004073847580811996, "grad_norm": 0.1497400403022766, "learning_rate": 0.0002, "loss": 0.0829, "step": 2240 }, { "epoch": 0.004092034400369192, "grad_norm": 0.007458314299583435, "learning_rate": 0.0002, "loss": 0.02, "step": 2250 }, { "epoch": 0.004110221219926389, "grad_norm": 0.04168029874563217, "learning_rate": 0.0002, "loss": 0.2176, "step": 2260 }, { "epoch": 0.004128408039483585, "grad_norm": 0.10017130523920059, "learning_rate": 0.0002, "loss": 0.0958, "step": 2270 }, { "epoch": 0.004146594859040782, "grad_norm": 0.02727416157722473, "learning_rate": 0.0002, "loss": 0.088, "step": 2280 }, { "epoch": 0.004164781678597978, "grad_norm": 0.15034393966197968, "learning_rate": 0.0002, "loss": 0.0826, "step": 2290 }, { "epoch": 0.004182968498155175, "grad_norm": 0.0023451410233974457, "learning_rate": 0.0002, "loss": 0.0102, "step": 2300 }, { "epoch": 0.004201155317712371, "grad_norm": 0.03462455794215202, "learning_rate": 0.0002, "loss": 0.3404, "step": 2310 }, { "epoch": 0.004219342137269567, "grad_norm": 0.02866148017346859, "learning_rate": 0.0002, "loss": 0.0932, "step": 2320 }, { "epoch": 0.004237528956826764, "grad_norm": 0.0685456171631813, "learning_rate": 0.0002, "loss": 0.0806, "step": 2330 }, { "epoch": 0.00425571577638396, "grad_norm": 0.17208056151866913, "learning_rate": 0.0002, "loss": 0.0826, "step": 2340 }, { "epoch": 0.004273902595941156, "grad_norm": 0.008708455599844456, "learning_rate": 0.0002, "loss": 0.0171, "step": 2350 }, { "epoch": 0.0042920894154983525, "grad_norm": 0.044025715440511703, "learning_rate": 0.0002, "loss": 0.212, "step": 2360 }, { "epoch": 0.00431027623505555, "grad_norm": 0.050246164202690125, "learning_rate": 0.0002, "loss": 0.107, "step": 2370 }, { "epoch": 0.004328463054612746, "grad_norm": 0.05257886275649071, "learning_rate": 0.0002, "loss": 0.0868, "step": 2380 }, { "epoch": 0.004346649874169942, "grad_norm": 0.16567641496658325, "learning_rate": 0.0002, "loss": 0.0819, "step": 2390 }, { "epoch": 0.004364836693727139, "grad_norm": 0.0062621901743113995, "learning_rate": 0.0002, "loss": 0.0171, "step": 2400 }, { "epoch": 0.004383023513284335, "grad_norm": 0.03025338612496853, "learning_rate": 0.0002, "loss": 0.2141, "step": 2410 }, { "epoch": 0.004401210332841531, "grad_norm": 0.06401577591896057, "learning_rate": 0.0002, "loss": 0.0982, "step": 2420 }, { "epoch": 0.004419397152398728, "grad_norm": 0.12474781274795532, "learning_rate": 0.0002, "loss": 0.0834, "step": 2430 }, { "epoch": 0.004437583971955924, "grad_norm": 0.18607665598392487, "learning_rate": 0.0002, "loss": 0.0801, "step": 2440 }, { "epoch": 0.004455770791513121, "grad_norm": 0.0017643098253756762, "learning_rate": 0.0002, "loss": 0.0129, "step": 2450 }, { "epoch": 0.004473957611070317, "grad_norm": 0.03936386480927467, "learning_rate": 0.0002, "loss": 0.2541, "step": 2460 }, { "epoch": 0.004492144430627514, "grad_norm": 0.08961635082960129, "learning_rate": 0.0002, "loss": 0.0961, "step": 2470 }, { "epoch": 0.00451033125018471, "grad_norm": 0.07525113970041275, "learning_rate": 0.0002, "loss": 0.0844, "step": 2480 }, { "epoch": 0.004528518069741906, "grad_norm": 0.16746751964092255, "learning_rate": 0.0002, "loss": 0.071, "step": 2490 }, { "epoch": 0.004546704889299103, "grad_norm": 0.0027625334914773703, "learning_rate": 0.0002, "loss": 0.0151, "step": 2500 }, { "epoch": 0.004564891708856299, "grad_norm": 0.049662694334983826, "learning_rate": 0.0002, "loss": 0.253, "step": 2510 }, { "epoch": 0.004583078528413495, "grad_norm": 0.08312079310417175, "learning_rate": 0.0002, "loss": 0.0922, "step": 2520 }, { "epoch": 0.0046012653479706915, "grad_norm": 0.0646345317363739, "learning_rate": 0.0002, "loss": 0.0889, "step": 2530 }, { "epoch": 0.004619452167527889, "grad_norm": 0.20036271214485168, "learning_rate": 0.0002, "loss": 0.081, "step": 2540 }, { "epoch": 0.004637638987085085, "grad_norm": 0.010091719217598438, "learning_rate": 0.0002, "loss": 0.024, "step": 2550 }, { "epoch": 0.004655825806642281, "grad_norm": 0.048885516822338104, "learning_rate": 0.0002, "loss": 0.184, "step": 2560 }, { "epoch": 0.004674012626199478, "grad_norm": 0.09142889827489853, "learning_rate": 0.0002, "loss": 0.0935, "step": 2570 }, { "epoch": 0.004692199445756674, "grad_norm": 0.049207963049411774, "learning_rate": 0.0002, "loss": 0.0816, "step": 2580 }, { "epoch": 0.00471038626531387, "grad_norm": 0.1498396098613739, "learning_rate": 0.0002, "loss": 0.0698, "step": 2590 }, { "epoch": 0.004728573084871067, "grad_norm": 0.00522881094366312, "learning_rate": 0.0002, "loss": 0.0189, "step": 2600 }, { "epoch": 0.004746759904428263, "grad_norm": 0.07461311668157578, "learning_rate": 0.0002, "loss": 0.1944, "step": 2610 }, { "epoch": 0.00476494672398546, "grad_norm": 0.048005711287260056, "learning_rate": 0.0002, "loss": 0.0883, "step": 2620 }, { "epoch": 0.004783133543542656, "grad_norm": 0.10151612013578415, "learning_rate": 0.0002, "loss": 0.0827, "step": 2630 }, { "epoch": 0.004801320363099853, "grad_norm": 0.1504422426223755, "learning_rate": 0.0002, "loss": 0.078, "step": 2640 }, { "epoch": 0.004819507182657049, "grad_norm": 0.004988422151654959, "learning_rate": 0.0002, "loss": 0.0229, "step": 2650 }, { "epoch": 0.004837694002214245, "grad_norm": 0.025008924305438995, "learning_rate": 0.0002, "loss": 0.1818, "step": 2660 }, { "epoch": 0.004855880821771442, "grad_norm": 0.027460169047117233, "learning_rate": 0.0002, "loss": 0.0966, "step": 2670 }, { "epoch": 0.004874067641328638, "grad_norm": 0.09704197943210602, "learning_rate": 0.0002, "loss": 0.0824, "step": 2680 }, { "epoch": 0.004892254460885834, "grad_norm": 0.138654425740242, "learning_rate": 0.0002, "loss": 0.0746, "step": 2690 }, { "epoch": 0.0049104412804430305, "grad_norm": 0.00859556533396244, "learning_rate": 0.0002, "loss": 0.0187, "step": 2700 }, { "epoch": 0.004928628100000228, "grad_norm": 0.05207522585988045, "learning_rate": 0.0002, "loss": 0.1985, "step": 2710 }, { "epoch": 0.004946814919557424, "grad_norm": 0.07787417620420456, "learning_rate": 0.0002, "loss": 0.101, "step": 2720 }, { "epoch": 0.00496500173911462, "grad_norm": 0.02819981426000595, "learning_rate": 0.0002, "loss": 0.0845, "step": 2730 }, { "epoch": 0.004983188558671817, "grad_norm": 0.13569314777851105, "learning_rate": 0.0002, "loss": 0.0756, "step": 2740 }, { "epoch": 0.005001375378229013, "grad_norm": 0.05175986513495445, "learning_rate": 0.0002, "loss": 0.024, "step": 2750 }, { "epoch": 0.005019562197786209, "grad_norm": 0.037230249494314194, "learning_rate": 0.0002, "loss": 0.2056, "step": 2760 }, { "epoch": 0.005037749017343406, "grad_norm": 0.05532974749803543, "learning_rate": 0.0002, "loss": 0.0939, "step": 2770 }, { "epoch": 0.005055935836900602, "grad_norm": 0.06930708140134811, "learning_rate": 0.0002, "loss": 0.0853, "step": 2780 }, { "epoch": 0.005074122656457798, "grad_norm": 0.16405801475048065, "learning_rate": 0.0002, "loss": 0.0766, "step": 2790 }, { "epoch": 0.005092309476014995, "grad_norm": 0.006398684345185757, "learning_rate": 0.0002, "loss": 0.0124, "step": 2800 }, { "epoch": 0.005110496295572192, "grad_norm": 0.06269315630197525, "learning_rate": 0.0002, "loss": 0.2703, "step": 2810 }, { "epoch": 0.005128683115129388, "grad_norm": 0.049293261021375656, "learning_rate": 0.0002, "loss": 0.0943, "step": 2820 }, { "epoch": 0.005146869934686584, "grad_norm": 0.08814405649900436, "learning_rate": 0.0002, "loss": 0.0855, "step": 2830 }, { "epoch": 0.005165056754243781, "grad_norm": 0.17452259361743927, "learning_rate": 0.0002, "loss": 0.0822, "step": 2840 }, { "epoch": 0.005183243573800977, "grad_norm": 0.005008229520171881, "learning_rate": 0.0002, "loss": 0.0136, "step": 2850 }, { "epoch": 0.005201430393358173, "grad_norm": 0.04459540545940399, "learning_rate": 0.0002, "loss": 0.2623, "step": 2860 }, { "epoch": 0.0052196172129153695, "grad_norm": 0.042845603078603745, "learning_rate": 0.0002, "loss": 0.0929, "step": 2870 }, { "epoch": 0.005237804032472567, "grad_norm": 0.03079635463654995, "learning_rate": 0.0002, "loss": 0.0844, "step": 2880 }, { "epoch": 0.005255990852029763, "grad_norm": 0.14457851648330688, "learning_rate": 0.0002, "loss": 0.0753, "step": 2890 }, { "epoch": 0.005274177671586959, "grad_norm": 0.0009016963304020464, "learning_rate": 0.0002, "loss": 0.0037, "step": 2900 }, { "epoch": 0.005292364491144156, "grad_norm": 0.0983906164765358, "learning_rate": 0.0002, "loss": 0.3661, "step": 2910 }, { "epoch": 0.005310551310701352, "grad_norm": 0.08794154971837997, "learning_rate": 0.0002, "loss": 0.0894, "step": 2920 }, { "epoch": 0.005328738130258548, "grad_norm": 0.026981573551893234, "learning_rate": 0.0002, "loss": 0.0779, "step": 2930 }, { "epoch": 0.005346924949815745, "grad_norm": 0.15572553873062134, "learning_rate": 0.0002, "loss": 0.077, "step": 2940 }, { "epoch": 0.005365111769372941, "grad_norm": 0.005491070915013552, "learning_rate": 0.0002, "loss": 0.0092, "step": 2950 }, { "epoch": 0.005383298588930137, "grad_norm": 0.07383686304092407, "learning_rate": 0.0002, "loss": 0.2574, "step": 2960 }, { "epoch": 0.005401485408487334, "grad_norm": 0.05919960141181946, "learning_rate": 0.0002, "loss": 0.1045, "step": 2970 }, { "epoch": 0.005419672228044531, "grad_norm": 0.06027739867568016, "learning_rate": 0.0002, "loss": 0.0822, "step": 2980 }, { "epoch": 0.005437859047601727, "grad_norm": 0.1288602501153946, "learning_rate": 0.0002, "loss": 0.0688, "step": 2990 }, { "epoch": 0.005456045867158923, "grad_norm": 0.007565880194306374, "learning_rate": 0.0002, "loss": 0.0192, "step": 3000 }, { "epoch": 0.00547423268671612, "grad_norm": 0.024412864819169044, "learning_rate": 0.0002, "loss": 0.1782, "step": 3010 }, { "epoch": 0.005492419506273316, "grad_norm": 0.05559355765581131, "learning_rate": 0.0002, "loss": 0.1072, "step": 3020 }, { "epoch": 0.005510606325830512, "grad_norm": 0.07073906064033508, "learning_rate": 0.0002, "loss": 0.0863, "step": 3030 }, { "epoch": 0.0055287931453877085, "grad_norm": 0.14979414641857147, "learning_rate": 0.0002, "loss": 0.0788, "step": 3040 }, { "epoch": 0.005546979964944906, "grad_norm": 0.0057297456078231335, "learning_rate": 0.0002, "loss": 0.0192, "step": 3050 }, { "epoch": 0.005565166784502102, "grad_norm": 0.03195042535662651, "learning_rate": 0.0002, "loss": 0.1879, "step": 3060 }, { "epoch": 0.005583353604059298, "grad_norm": 0.05925082787871361, "learning_rate": 0.0002, "loss": 0.0992, "step": 3070 }, { "epoch": 0.005601540423616495, "grad_norm": 0.052063606679439545, "learning_rate": 0.0002, "loss": 0.0826, "step": 3080 }, { "epoch": 0.005619727243173691, "grad_norm": 0.16005952656269073, "learning_rate": 0.0002, "loss": 0.0743, "step": 3090 }, { "epoch": 0.005637914062730887, "grad_norm": 0.005742133595049381, "learning_rate": 0.0002, "loss": 0.0137, "step": 3100 }, { "epoch": 0.005656100882288084, "grad_norm": 0.07523638010025024, "learning_rate": 0.0002, "loss": 0.2072, "step": 3110 }, { "epoch": 0.00567428770184528, "grad_norm": 0.23799611628055573, "learning_rate": 0.0002, "loss": 0.0906, "step": 3120 }, { "epoch": 0.005692474521402476, "grad_norm": 0.06176261603832245, "learning_rate": 0.0002, "loss": 0.088, "step": 3130 }, { "epoch": 0.005710661340959673, "grad_norm": 0.13692723214626312, "learning_rate": 0.0002, "loss": 0.0733, "step": 3140 }, { "epoch": 0.00572884816051687, "grad_norm": 0.007059803698211908, "learning_rate": 0.0002, "loss": 0.0194, "step": 3150 }, { "epoch": 0.005747034980074066, "grad_norm": 0.08868405222892761, "learning_rate": 0.0002, "loss": 0.1745, "step": 3160 }, { "epoch": 0.005765221799631262, "grad_norm": 0.05126733332872391, "learning_rate": 0.0002, "loss": 0.1024, "step": 3170 }, { "epoch": 0.005783408619188459, "grad_norm": 0.06377821415662766, "learning_rate": 0.0002, "loss": 0.0846, "step": 3180 }, { "epoch": 0.005801595438745655, "grad_norm": 0.10748566687107086, "learning_rate": 0.0002, "loss": 0.0769, "step": 3190 }, { "epoch": 0.005819782258302851, "grad_norm": 0.004992443602532148, "learning_rate": 0.0002, "loss": 0.0114, "step": 3200 }, { "epoch": 0.0058379690778600475, "grad_norm": 0.0420277863740921, "learning_rate": 0.0002, "loss": 0.2159, "step": 3210 }, { "epoch": 0.005856155897417244, "grad_norm": 0.02828531712293625, "learning_rate": 0.0002, "loss": 0.0923, "step": 3220 }, { "epoch": 0.005874342716974441, "grad_norm": 0.028216248378157616, "learning_rate": 0.0002, "loss": 0.0789, "step": 3230 }, { "epoch": 0.005892529536531637, "grad_norm": 0.11420746147632599, "learning_rate": 0.0002, "loss": 0.0696, "step": 3240 }, { "epoch": 0.005910716356088834, "grad_norm": 0.0019631448667496443, "learning_rate": 0.0002, "loss": 0.0128, "step": 3250 }, { "epoch": 0.00592890317564603, "grad_norm": 0.05514012649655342, "learning_rate": 0.0002, "loss": 0.2609, "step": 3260 }, { "epoch": 0.005947089995203226, "grad_norm": 0.0917636826634407, "learning_rate": 0.0002, "loss": 0.0996, "step": 3270 }, { "epoch": 0.005965276814760423, "grad_norm": 0.03648284077644348, "learning_rate": 0.0002, "loss": 0.084, "step": 3280 }, { "epoch": 0.005983463634317619, "grad_norm": 0.13859149813652039, "learning_rate": 0.0002, "loss": 0.0807, "step": 3290 }, { "epoch": 0.006001650453874815, "grad_norm": 0.013779910281300545, "learning_rate": 0.0002, "loss": 0.0181, "step": 3300 }, { "epoch": 0.006019837273432012, "grad_norm": 0.02654041163623333, "learning_rate": 0.0002, "loss": 0.1636, "step": 3310 }, { "epoch": 0.006038024092989209, "grad_norm": 0.062298137694597244, "learning_rate": 0.0002, "loss": 0.0872, "step": 3320 }, { "epoch": 0.006056210912546405, "grad_norm": 0.0351388119161129, "learning_rate": 0.0002, "loss": 0.0802, "step": 3330 }, { "epoch": 0.006074397732103601, "grad_norm": 0.16063807904720306, "learning_rate": 0.0002, "loss": 0.0753, "step": 3340 }, { "epoch": 0.006092584551660798, "grad_norm": 0.009991235099732876, "learning_rate": 0.0002, "loss": 0.016, "step": 3350 }, { "epoch": 0.006110771371217994, "grad_norm": 0.052919622510671616, "learning_rate": 0.0002, "loss": 0.2027, "step": 3360 }, { "epoch": 0.00612895819077519, "grad_norm": 0.03228602185845375, "learning_rate": 0.0002, "loss": 0.0985, "step": 3370 }, { "epoch": 0.0061471450103323865, "grad_norm": 0.11311203986406326, "learning_rate": 0.0002, "loss": 0.0797, "step": 3380 }, { "epoch": 0.006165331829889583, "grad_norm": 0.1674620360136032, "learning_rate": 0.0002, "loss": 0.072, "step": 3390 }, { "epoch": 0.00618351864944678, "grad_norm": 0.015154430642724037, "learning_rate": 0.0002, "loss": 0.0186, "step": 3400 }, { "epoch": 0.006201705469003976, "grad_norm": 0.043151434510946274, "learning_rate": 0.0002, "loss": 0.1892, "step": 3410 }, { "epoch": 0.006219892288561173, "grad_norm": 0.12342707067728043, "learning_rate": 0.0002, "loss": 0.0907, "step": 3420 }, { "epoch": 0.006238079108118369, "grad_norm": 0.08350827544927597, "learning_rate": 0.0002, "loss": 0.0783, "step": 3430 }, { "epoch": 0.006256265927675565, "grad_norm": 0.11938697844743729, "learning_rate": 0.0002, "loss": 0.0666, "step": 3440 }, { "epoch": 0.006274452747232762, "grad_norm": 0.015424132347106934, "learning_rate": 0.0002, "loss": 0.0173, "step": 3450 }, { "epoch": 0.006292639566789958, "grad_norm": 0.04220043867826462, "learning_rate": 0.0002, "loss": 0.1805, "step": 3460 }, { "epoch": 0.006310826386347154, "grad_norm": 0.08813903480768204, "learning_rate": 0.0002, "loss": 0.096, "step": 3470 }, { "epoch": 0.006329013205904351, "grad_norm": 0.07647278904914856, "learning_rate": 0.0002, "loss": 0.0821, "step": 3480 }, { "epoch": 0.006347200025461548, "grad_norm": 0.14242641627788544, "learning_rate": 0.0002, "loss": 0.0769, "step": 3490 }, { "epoch": 0.006365386845018744, "grad_norm": 0.011115231551229954, "learning_rate": 0.0002, "loss": 0.0221, "step": 3500 }, { "epoch": 0.00638357366457594, "grad_norm": 0.036351826041936874, "learning_rate": 0.0002, "loss": 0.1557, "step": 3510 }, { "epoch": 0.006401760484133137, "grad_norm": 0.08549819141626358, "learning_rate": 0.0002, "loss": 0.0864, "step": 3520 }, { "epoch": 0.006419947303690333, "grad_norm": 0.047141823917627335, "learning_rate": 0.0002, "loss": 0.079, "step": 3530 }, { "epoch": 0.006438134123247529, "grad_norm": 0.13143447041511536, "learning_rate": 0.0002, "loss": 0.0717, "step": 3540 }, { "epoch": 0.0064563209428047256, "grad_norm": 0.013524871319532394, "learning_rate": 0.0002, "loss": 0.0149, "step": 3550 }, { "epoch": 0.006474507762361922, "grad_norm": 0.03367459774017334, "learning_rate": 0.0002, "loss": 0.1715, "step": 3560 }, { "epoch": 0.006492694581919119, "grad_norm": 0.045889757573604584, "learning_rate": 0.0002, "loss": 0.0949, "step": 3570 }, { "epoch": 0.006510881401476315, "grad_norm": 0.04099202901124954, "learning_rate": 0.0002, "loss": 0.0813, "step": 3580 }, { "epoch": 0.006529068221033512, "grad_norm": 0.133371040225029, "learning_rate": 0.0002, "loss": 0.0767, "step": 3590 }, { "epoch": 0.006547255040590708, "grad_norm": 0.00645647756755352, "learning_rate": 0.0002, "loss": 0.0186, "step": 3600 }, { "epoch": 0.006565441860147904, "grad_norm": 0.050674330443143845, "learning_rate": 0.0002, "loss": 0.2179, "step": 3610 }, { "epoch": 0.006583628679705101, "grad_norm": 0.07087302207946777, "learning_rate": 0.0002, "loss": 0.0882, "step": 3620 }, { "epoch": 0.006601815499262297, "grad_norm": 0.02759486250579357, "learning_rate": 0.0002, "loss": 0.0789, "step": 3630 }, { "epoch": 0.006620002318819493, "grad_norm": 0.12163479626178741, "learning_rate": 0.0002, "loss": 0.0689, "step": 3640 }, { "epoch": 0.00663818913837669, "grad_norm": 0.00969718024134636, "learning_rate": 0.0002, "loss": 0.0112, "step": 3650 }, { "epoch": 0.006656375957933887, "grad_norm": 0.07106204330921173, "learning_rate": 0.0002, "loss": 0.199, "step": 3660 }, { "epoch": 0.006674562777491083, "grad_norm": 0.08954132348299026, "learning_rate": 0.0002, "loss": 0.0985, "step": 3670 }, { "epoch": 0.006692749597048279, "grad_norm": 0.09899396449327469, "learning_rate": 0.0002, "loss": 0.0811, "step": 3680 }, { "epoch": 0.006710936416605476, "grad_norm": 0.12119311839342117, "learning_rate": 0.0002, "loss": 0.0698, "step": 3690 }, { "epoch": 0.006729123236162672, "grad_norm": 0.013957214541733265, "learning_rate": 0.0002, "loss": 0.018, "step": 3700 }, { "epoch": 0.006747310055719868, "grad_norm": 0.03089285083115101, "learning_rate": 0.0002, "loss": 0.1434, "step": 3710 }, { "epoch": 0.0067654968752770646, "grad_norm": 0.025650829076766968, "learning_rate": 0.0002, "loss": 0.0886, "step": 3720 }, { "epoch": 0.006783683694834261, "grad_norm": 0.044103365391492844, "learning_rate": 0.0002, "loss": 0.0788, "step": 3730 }, { "epoch": 0.006801870514391458, "grad_norm": 0.09726370871067047, "learning_rate": 0.0002, "loss": 0.0674, "step": 3740 }, { "epoch": 0.006820057333948654, "grad_norm": 0.018105274066329002, "learning_rate": 0.0002, "loss": 0.0232, "step": 3750 }, { "epoch": 0.006838244153505851, "grad_norm": 0.021543240174651146, "learning_rate": 0.0002, "loss": 0.1406, "step": 3760 }, { "epoch": 0.006856430973063047, "grad_norm": 0.09367050975561142, "learning_rate": 0.0002, "loss": 0.0973, "step": 3770 }, { "epoch": 0.006874617792620243, "grad_norm": 0.06836032122373581, "learning_rate": 0.0002, "loss": 0.0848, "step": 3780 }, { "epoch": 0.00689280461217744, "grad_norm": 0.11758081614971161, "learning_rate": 0.0002, "loss": 0.0693, "step": 3790 }, { "epoch": 0.006910991431734636, "grad_norm": 0.008669364266097546, "learning_rate": 0.0002, "loss": 0.0223, "step": 3800 }, { "epoch": 0.006929178251291832, "grad_norm": 0.03903719782829285, "learning_rate": 0.0002, "loss": 0.1519, "step": 3810 }, { "epoch": 0.0069473650708490285, "grad_norm": 0.030682874843478203, "learning_rate": 0.0002, "loss": 0.0931, "step": 3820 }, { "epoch": 0.006965551890406226, "grad_norm": 0.02693006955087185, "learning_rate": 0.0002, "loss": 0.0784, "step": 3830 }, { "epoch": 0.006983738709963422, "grad_norm": 0.09535166621208191, "learning_rate": 0.0002, "loss": 0.0696, "step": 3840 }, { "epoch": 0.007001925529520618, "grad_norm": 0.014680403284728527, "learning_rate": 0.0002, "loss": 0.0176, "step": 3850 }, { "epoch": 0.007020112349077815, "grad_norm": 0.031090212985873222, "learning_rate": 0.0002, "loss": 0.1544, "step": 3860 }, { "epoch": 0.007038299168635011, "grad_norm": 0.05870644003152847, "learning_rate": 0.0002, "loss": 0.0898, "step": 3870 }, { "epoch": 0.007056485988192207, "grad_norm": 0.03480982780456543, "learning_rate": 0.0002, "loss": 0.0812, "step": 3880 }, { "epoch": 0.0070746728077494036, "grad_norm": 0.09751418977975845, "learning_rate": 0.0002, "loss": 0.0724, "step": 3890 }, { "epoch": 0.0070928596273066, "grad_norm": 0.022084850817918777, "learning_rate": 0.0002, "loss": 0.019, "step": 3900 }, { "epoch": 0.007111046446863797, "grad_norm": 0.06994971632957458, "learning_rate": 0.0002, "loss": 0.1478, "step": 3910 }, { "epoch": 0.007129233266420993, "grad_norm": 0.05761263892054558, "learning_rate": 0.0002, "loss": 0.0932, "step": 3920 }, { "epoch": 0.00714742008597819, "grad_norm": 0.029772033914923668, "learning_rate": 0.0002, "loss": 0.0855, "step": 3930 }, { "epoch": 0.007165606905535386, "grad_norm": 0.11868726462125778, "learning_rate": 0.0002, "loss": 0.0727, "step": 3940 }, { "epoch": 0.007183793725092582, "grad_norm": 0.0065403408370912075, "learning_rate": 0.0002, "loss": 0.0174, "step": 3950 }, { "epoch": 0.007201980544649779, "grad_norm": 0.031544361263513565, "learning_rate": 0.0002, "loss": 0.1827, "step": 3960 }, { "epoch": 0.007220167364206975, "grad_norm": 0.031641531735658646, "learning_rate": 0.0002, "loss": 0.0867, "step": 3970 }, { "epoch": 0.007238354183764171, "grad_norm": 0.028574040159583092, "learning_rate": 0.0002, "loss": 0.0798, "step": 3980 }, { "epoch": 0.0072565410033213675, "grad_norm": 0.12866555154323578, "learning_rate": 0.0002, "loss": 0.0708, "step": 3990 }, { "epoch": 0.007274727822878565, "grad_norm": 0.00843430683016777, "learning_rate": 0.0002, "loss": 0.0127, "step": 4000 }, { "epoch": 0.007292914642435761, "grad_norm": 0.03737691789865494, "learning_rate": 0.0002, "loss": 0.2201, "step": 4010 }, { "epoch": 0.007311101461992957, "grad_norm": 0.05326579511165619, "learning_rate": 0.0002, "loss": 0.0838, "step": 4020 }, { "epoch": 0.007329288281550154, "grad_norm": 0.031934209167957306, "learning_rate": 0.0002, "loss": 0.0778, "step": 4030 }, { "epoch": 0.00734747510110735, "grad_norm": 0.17401957511901855, "learning_rate": 0.0002, "loss": 0.073, "step": 4040 }, { "epoch": 0.007365661920664546, "grad_norm": 0.005256639327853918, "learning_rate": 0.0002, "loss": 0.0122, "step": 4050 }, { "epoch": 0.0073838487402217426, "grad_norm": 0.05043623968958855, "learning_rate": 0.0002, "loss": 0.2524, "step": 4060 }, { "epoch": 0.007402035559778939, "grad_norm": 0.06662425398826599, "learning_rate": 0.0002, "loss": 0.0976, "step": 4070 }, { "epoch": 0.007420222379336136, "grad_norm": 0.13419686257839203, "learning_rate": 0.0002, "loss": 0.0833, "step": 4080 }, { "epoch": 0.007438409198893332, "grad_norm": 0.176285520195961, "learning_rate": 0.0002, "loss": 0.0761, "step": 4090 }, { "epoch": 0.007456596018450529, "grad_norm": 0.008489354513585567, "learning_rate": 0.0002, "loss": 0.0182, "step": 4100 }, { "epoch": 0.007474782838007725, "grad_norm": 0.06247509643435478, "learning_rate": 0.0002, "loss": 0.2232, "step": 4110 }, { "epoch": 0.007492969657564921, "grad_norm": 0.05744702368974686, "learning_rate": 0.0002, "loss": 0.0875, "step": 4120 }, { "epoch": 0.007511156477122118, "grad_norm": 0.053026407957077026, "learning_rate": 0.0002, "loss": 0.0807, "step": 4130 }, { "epoch": 0.007529343296679314, "grad_norm": 0.11734003573656082, "learning_rate": 0.0002, "loss": 0.0724, "step": 4140 }, { "epoch": 0.00754753011623651, "grad_norm": 0.005216363817453384, "learning_rate": 0.0002, "loss": 0.0129, "step": 4150 }, { "epoch": 0.0075657169357937065, "grad_norm": 0.08154789358377457, "learning_rate": 0.0002, "loss": 0.2221, "step": 4160 }, { "epoch": 0.007583903755350904, "grad_norm": 0.03619784861803055, "learning_rate": 0.0002, "loss": 0.0993, "step": 4170 }, { "epoch": 0.0076020905749081, "grad_norm": 0.08239256590604782, "learning_rate": 0.0002, "loss": 0.0811, "step": 4180 }, { "epoch": 0.007620277394465296, "grad_norm": 0.11934535950422287, "learning_rate": 0.0002, "loss": 0.0726, "step": 4190 }, { "epoch": 0.007638464214022493, "grad_norm": 0.006965799257159233, "learning_rate": 0.0002, "loss": 0.0181, "step": 4200 }, { "epoch": 0.007656651033579689, "grad_norm": 0.04328077286481857, "learning_rate": 0.0002, "loss": 0.1983, "step": 4210 }, { "epoch": 0.007674837853136885, "grad_norm": 0.08253510296344757, "learning_rate": 0.0002, "loss": 0.0954, "step": 4220 }, { "epoch": 0.0076930246726940816, "grad_norm": 0.06146657094359398, "learning_rate": 0.0002, "loss": 0.0843, "step": 4230 }, { "epoch": 0.007711211492251278, "grad_norm": 0.13579218089580536, "learning_rate": 0.0002, "loss": 0.0672, "step": 4240 }, { "epoch": 0.007729398311808474, "grad_norm": 0.0038396338932216167, "learning_rate": 0.0002, "loss": 0.0131, "step": 4250 }, { "epoch": 0.007747585131365671, "grad_norm": 0.03109130822122097, "learning_rate": 0.0002, "loss": 0.2102, "step": 4260 }, { "epoch": 0.007765771950922868, "grad_norm": 0.04971664398908615, "learning_rate": 0.0002, "loss": 0.0903, "step": 4270 }, { "epoch": 0.007783958770480064, "grad_norm": 0.06476306915283203, "learning_rate": 0.0002, "loss": 0.0859, "step": 4280 }, { "epoch": 0.00780214559003726, "grad_norm": 0.15377041697502136, "learning_rate": 0.0002, "loss": 0.0828, "step": 4290 }, { "epoch": 0.007820332409594457, "grad_norm": 0.005592274013906717, "learning_rate": 0.0002, "loss": 0.014, "step": 4300 }, { "epoch": 0.007838519229151653, "grad_norm": 0.04387212172150612, "learning_rate": 0.0002, "loss": 0.1907, "step": 4310 }, { "epoch": 0.00785670604870885, "grad_norm": 0.06001356989145279, "learning_rate": 0.0002, "loss": 0.0864, "step": 4320 }, { "epoch": 0.007874892868266046, "grad_norm": 0.030866140499711037, "learning_rate": 0.0002, "loss": 0.0748, "step": 4330 }, { "epoch": 0.007893079687823242, "grad_norm": 0.13280808925628662, "learning_rate": 0.0002, "loss": 0.0686, "step": 4340 }, { "epoch": 0.007911266507380438, "grad_norm": 0.015559020452201366, "learning_rate": 0.0002, "loss": 0.016, "step": 4350 }, { "epoch": 0.007929453326937634, "grad_norm": 0.0669974684715271, "learning_rate": 0.0002, "loss": 0.1916, "step": 4360 }, { "epoch": 0.00794764014649483, "grad_norm": 0.0759076252579689, "learning_rate": 0.0002, "loss": 0.0925, "step": 4370 }, { "epoch": 0.007965826966052029, "grad_norm": 0.029388410970568657, "learning_rate": 0.0002, "loss": 0.086, "step": 4380 }, { "epoch": 0.007984013785609225, "grad_norm": 0.17637981474399567, "learning_rate": 0.0002, "loss": 0.0697, "step": 4390 }, { "epoch": 0.008002200605166421, "grad_norm": 0.008022189140319824, "learning_rate": 0.0002, "loss": 0.0196, "step": 4400 }, { "epoch": 0.008020387424723618, "grad_norm": 0.04126167669892311, "learning_rate": 0.0002, "loss": 0.192, "step": 4410 }, { "epoch": 0.008038574244280814, "grad_norm": 0.08132971078157425, "learning_rate": 0.0002, "loss": 0.093, "step": 4420 }, { "epoch": 0.00805676106383801, "grad_norm": 0.07568484544754028, "learning_rate": 0.0002, "loss": 0.0823, "step": 4430 }, { "epoch": 0.008074947883395207, "grad_norm": 0.1259222775697708, "learning_rate": 0.0002, "loss": 0.0696, "step": 4440 }, { "epoch": 0.008093134702952403, "grad_norm": 0.009711826220154762, "learning_rate": 0.0002, "loss": 0.0232, "step": 4450 }, { "epoch": 0.0081113215225096, "grad_norm": 0.029734279960393906, "learning_rate": 0.0002, "loss": 0.1595, "step": 4460 }, { "epoch": 0.008129508342066796, "grad_norm": 0.04886960610747337, "learning_rate": 0.0002, "loss": 0.0919, "step": 4470 }, { "epoch": 0.008147695161623992, "grad_norm": 0.07031470537185669, "learning_rate": 0.0002, "loss": 0.0813, "step": 4480 }, { "epoch": 0.008165881981181188, "grad_norm": 0.12099859863519669, "learning_rate": 0.0002, "loss": 0.0731, "step": 4490 }, { "epoch": 0.008184068800738385, "grad_norm": 0.02181529812514782, "learning_rate": 0.0002, "loss": 0.021, "step": 4500 }, { "epoch": 0.00820225562029558, "grad_norm": 0.035477787256240845, "learning_rate": 0.0002, "loss": 0.1429, "step": 4510 }, { "epoch": 0.008220442439852777, "grad_norm": 0.07788772135972977, "learning_rate": 0.0002, "loss": 0.0842, "step": 4520 }, { "epoch": 0.008238629259409973, "grad_norm": 0.045833125710487366, "learning_rate": 0.0002, "loss": 0.0829, "step": 4530 }, { "epoch": 0.00825681607896717, "grad_norm": 0.12271951884031296, "learning_rate": 0.0002, "loss": 0.0707, "step": 4540 }, { "epoch": 0.008275002898524366, "grad_norm": 0.01919553242623806, "learning_rate": 0.0002, "loss": 0.0213, "step": 4550 }, { "epoch": 0.008293189718081564, "grad_norm": 0.032527096569538116, "learning_rate": 0.0002, "loss": 0.1397, "step": 4560 }, { "epoch": 0.00831137653763876, "grad_norm": 0.045243579894304276, "learning_rate": 0.0002, "loss": 0.0854, "step": 4570 }, { "epoch": 0.008329563357195957, "grad_norm": 0.04226524010300636, "learning_rate": 0.0002, "loss": 0.0728, "step": 4580 }, { "epoch": 0.008347750176753153, "grad_norm": 0.09887039661407471, "learning_rate": 0.0002, "loss": 0.0661, "step": 4590 }, { "epoch": 0.00836593699631035, "grad_norm": 0.01822318509221077, "learning_rate": 0.0002, "loss": 0.0169, "step": 4600 }, { "epoch": 0.008384123815867546, "grad_norm": 0.05729951336979866, "learning_rate": 0.0002, "loss": 0.137, "step": 4610 }, { "epoch": 0.008402310635424742, "grad_norm": 0.041520439088344574, "learning_rate": 0.0002, "loss": 0.0825, "step": 4620 }, { "epoch": 0.008420497454981938, "grad_norm": 0.051164623349905014, "learning_rate": 0.0002, "loss": 0.0818, "step": 4630 }, { "epoch": 0.008438684274539135, "grad_norm": 0.1289409101009369, "learning_rate": 0.0002, "loss": 0.0664, "step": 4640 }, { "epoch": 0.008456871094096331, "grad_norm": 0.0085114361718297, "learning_rate": 0.0002, "loss": 0.0229, "step": 4650 }, { "epoch": 0.008475057913653527, "grad_norm": 0.03594676032662392, "learning_rate": 0.0002, "loss": 0.1401, "step": 4660 }, { "epoch": 0.008493244733210724, "grad_norm": 0.0316978394985199, "learning_rate": 0.0002, "loss": 0.0877, "step": 4670 }, { "epoch": 0.00851143155276792, "grad_norm": 0.023302162066102028, "learning_rate": 0.0002, "loss": 0.0764, "step": 4680 }, { "epoch": 0.008529618372325116, "grad_norm": 0.1329929083585739, "learning_rate": 0.0002, "loss": 0.0788, "step": 4690 }, { "epoch": 0.008547805191882312, "grad_norm": 0.01048013661056757, "learning_rate": 0.0002, "loss": 0.0234, "step": 4700 }, { "epoch": 0.008565992011439509, "grad_norm": 0.03505022078752518, "learning_rate": 0.0002, "loss": 0.1509, "step": 4710 }, { "epoch": 0.008584178830996705, "grad_norm": 0.03877585008740425, "learning_rate": 0.0002, "loss": 0.0802, "step": 4720 }, { "epoch": 0.008602365650553903, "grad_norm": 0.041193027049303055, "learning_rate": 0.0002, "loss": 0.0695, "step": 4730 }, { "epoch": 0.0086205524701111, "grad_norm": 0.17310455441474915, "learning_rate": 0.0002, "loss": 0.0794, "step": 4740 }, { "epoch": 0.008638739289668296, "grad_norm": 0.0061012376099824905, "learning_rate": 0.0002, "loss": 0.0158, "step": 4750 }, { "epoch": 0.008656926109225492, "grad_norm": 0.04843207076191902, "learning_rate": 0.0002, "loss": 0.2103, "step": 4760 }, { "epoch": 0.008675112928782688, "grad_norm": 0.04483436048030853, "learning_rate": 0.0002, "loss": 0.0878, "step": 4770 }, { "epoch": 0.008693299748339885, "grad_norm": 0.056655965745449066, "learning_rate": 0.0002, "loss": 0.0752, "step": 4780 }, { "epoch": 0.008711486567897081, "grad_norm": 0.11626063287258148, "learning_rate": 0.0002, "loss": 0.0685, "step": 4790 }, { "epoch": 0.008729673387454277, "grad_norm": 0.013872025534510612, "learning_rate": 0.0002, "loss": 0.0198, "step": 4800 }, { "epoch": 0.008747860207011474, "grad_norm": 0.06217370182275772, "learning_rate": 0.0002, "loss": 0.1371, "step": 4810 }, { "epoch": 0.00876604702656867, "grad_norm": 0.027149083092808723, "learning_rate": 0.0002, "loss": 0.0849, "step": 4820 }, { "epoch": 0.008784233846125866, "grad_norm": 0.043290987610816956, "learning_rate": 0.0002, "loss": 0.0739, "step": 4830 }, { "epoch": 0.008802420665683063, "grad_norm": 0.10664638131856918, "learning_rate": 0.0002, "loss": 0.0722, "step": 4840 }, { "epoch": 0.008820607485240259, "grad_norm": 0.033459801226854324, "learning_rate": 0.0002, "loss": 0.0234, "step": 4850 }, { "epoch": 0.008838794304797455, "grad_norm": 0.049193184822797775, "learning_rate": 0.0002, "loss": 0.1173, "step": 4860 }, { "epoch": 0.008856981124354651, "grad_norm": 0.05060647428035736, "learning_rate": 0.0002, "loss": 0.0883, "step": 4870 }, { "epoch": 0.008875167943911848, "grad_norm": 0.028496885672211647, "learning_rate": 0.0002, "loss": 0.0747, "step": 4880 }, { "epoch": 0.008893354763469044, "grad_norm": 0.10652820765972137, "learning_rate": 0.0002, "loss": 0.0707, "step": 4890 }, { "epoch": 0.008911541583026242, "grad_norm": 0.007879966869950294, "learning_rate": 0.0002, "loss": 0.0178, "step": 4900 }, { "epoch": 0.008929728402583438, "grad_norm": 0.05227983742952347, "learning_rate": 0.0002, "loss": 0.1379, "step": 4910 }, { "epoch": 0.008947915222140635, "grad_norm": 0.06054231896996498, "learning_rate": 0.0002, "loss": 0.0934, "step": 4920 }, { "epoch": 0.008966102041697831, "grad_norm": 0.029085835441946983, "learning_rate": 0.0002, "loss": 0.0816, "step": 4930 }, { "epoch": 0.008984288861255027, "grad_norm": 0.09829402714967728, "learning_rate": 0.0002, "loss": 0.0672, "step": 4940 }, { "epoch": 0.009002475680812224, "grad_norm": 0.005579107441008091, "learning_rate": 0.0002, "loss": 0.0172, "step": 4950 }, { "epoch": 0.00902066250036942, "grad_norm": 0.027280857786536217, "learning_rate": 0.0002, "loss": 0.1659, "step": 4960 }, { "epoch": 0.009038849319926616, "grad_norm": 0.10321583598852158, "learning_rate": 0.0002, "loss": 0.0947, "step": 4970 }, { "epoch": 0.009057036139483813, "grad_norm": 0.03381946310400963, "learning_rate": 0.0002, "loss": 0.0837, "step": 4980 }, { "epoch": 0.009075222959041009, "grad_norm": 0.14493779838085175, "learning_rate": 0.0002, "loss": 0.0736, "step": 4990 }, { "epoch": 0.009093409778598205, "grad_norm": 0.009917684830725193, "learning_rate": 0.0002, "loss": 0.0188, "step": 5000 }, { "epoch": 0.009111596598155402, "grad_norm": 1.003450632095337, "learning_rate": 0.0002, "loss": 0.218, "step": 5010 }, { "epoch": 0.009129783417712598, "grad_norm": 0.09081514924764633, "learning_rate": 0.0002, "loss": 0.1714, "step": 5020 }, { "epoch": 0.009147970237269794, "grad_norm": 0.042343392968177795, "learning_rate": 0.0002, "loss": 0.0823, "step": 5030 }, { "epoch": 0.00916615705682699, "grad_norm": 0.09944835305213928, "learning_rate": 0.0002, "loss": 0.0667, "step": 5040 }, { "epoch": 0.009184343876384187, "grad_norm": 0.008264658972620964, "learning_rate": 0.0002, "loss": 0.0122, "step": 5050 }, { "epoch": 0.009202530695941383, "grad_norm": 0.08990125358104706, "learning_rate": 0.0002, "loss": 0.1685, "step": 5060 }, { "epoch": 0.009220717515498581, "grad_norm": 0.0331488698720932, "learning_rate": 0.0002, "loss": 0.0885, "step": 5070 }, { "epoch": 0.009238904335055777, "grad_norm": 0.029458707198500633, "learning_rate": 0.0002, "loss": 0.078, "step": 5080 }, { "epoch": 0.009257091154612974, "grad_norm": 0.10468839108943939, "learning_rate": 0.0002, "loss": 0.0683, "step": 5090 }, { "epoch": 0.00927527797417017, "grad_norm": 0.002719841431826353, "learning_rate": 0.0002, "loss": 0.0117, "step": 5100 }, { "epoch": 0.009293464793727366, "grad_norm": 0.0411439947783947, "learning_rate": 0.0002, "loss": 0.2025, "step": 5110 }, { "epoch": 0.009311651613284563, "grad_norm": 0.03695548698306084, "learning_rate": 0.0002, "loss": 0.0831, "step": 5120 }, { "epoch": 0.009329838432841759, "grad_norm": 0.06067590415477753, "learning_rate": 0.0002, "loss": 0.0779, "step": 5130 }, { "epoch": 0.009348025252398955, "grad_norm": 0.11754634976387024, "learning_rate": 0.0002, "loss": 0.0667, "step": 5140 }, { "epoch": 0.009366212071956152, "grad_norm": 0.004248317331075668, "learning_rate": 0.0002, "loss": 0.0113, "step": 5150 }, { "epoch": 0.009384398891513348, "grad_norm": 0.03073648177087307, "learning_rate": 0.0002, "loss": 0.2289, "step": 5160 }, { "epoch": 0.009402585711070544, "grad_norm": 0.10287592560052872, "learning_rate": 0.0002, "loss": 0.0977, "step": 5170 }, { "epoch": 0.00942077253062774, "grad_norm": 0.06832946836948395, "learning_rate": 0.0002, "loss": 0.0764, "step": 5180 }, { "epoch": 0.009438959350184937, "grad_norm": 0.1760883778333664, "learning_rate": 0.0002, "loss": 0.0795, "step": 5190 }, { "epoch": 0.009457146169742133, "grad_norm": 0.02968805655837059, "learning_rate": 0.0002, "loss": 0.0253, "step": 5200 }, { "epoch": 0.00947533298929933, "grad_norm": 0.046602651476860046, "learning_rate": 0.0002, "loss": 0.1432, "step": 5210 }, { "epoch": 0.009493519808856526, "grad_norm": 0.051989324390888214, "learning_rate": 0.0002, "loss": 0.0807, "step": 5220 }, { "epoch": 0.009511706628413722, "grad_norm": 0.04583961144089699, "learning_rate": 0.0002, "loss": 0.0782, "step": 5230 }, { "epoch": 0.00952989344797092, "grad_norm": 0.13195525109767914, "learning_rate": 0.0002, "loss": 0.0688, "step": 5240 }, { "epoch": 0.009548080267528116, "grad_norm": 0.011369351297616959, "learning_rate": 0.0002, "loss": 0.0232, "step": 5250 }, { "epoch": 0.009566267087085313, "grad_norm": 0.05092083290219307, "learning_rate": 0.0002, "loss": 0.145, "step": 5260 }, { "epoch": 0.009584453906642509, "grad_norm": 0.05051489174365997, "learning_rate": 0.0002, "loss": 0.0803, "step": 5270 }, { "epoch": 0.009602640726199705, "grad_norm": 0.05730990320444107, "learning_rate": 0.0002, "loss": 0.0717, "step": 5280 }, { "epoch": 0.009620827545756902, "grad_norm": 0.11170202493667603, "learning_rate": 0.0002, "loss": 0.0711, "step": 5290 }, { "epoch": 0.009639014365314098, "grad_norm": 0.011571788229048252, "learning_rate": 0.0002, "loss": 0.0204, "step": 5300 }, { "epoch": 0.009657201184871294, "grad_norm": 0.04396244138479233, "learning_rate": 0.0002, "loss": 0.1764, "step": 5310 }, { "epoch": 0.00967538800442849, "grad_norm": 0.047808658331632614, "learning_rate": 0.0002, "loss": 0.0855, "step": 5320 }, { "epoch": 0.009693574823985687, "grad_norm": 0.09201673418283463, "learning_rate": 0.0002, "loss": 0.0737, "step": 5330 }, { "epoch": 0.009711761643542883, "grad_norm": 0.12273146212100983, "learning_rate": 0.0002, "loss": 0.0658, "step": 5340 }, { "epoch": 0.00972994846310008, "grad_norm": 0.014599839225411415, "learning_rate": 0.0002, "loss": 0.0254, "step": 5350 }, { "epoch": 0.009748135282657276, "grad_norm": 0.049732692539691925, "learning_rate": 0.0002, "loss": 0.1432, "step": 5360 }, { "epoch": 0.009766322102214472, "grad_norm": 0.07791377604007721, "learning_rate": 0.0002, "loss": 0.0865, "step": 5370 }, { "epoch": 0.009784508921771668, "grad_norm": 0.06298892199993134, "learning_rate": 0.0002, "loss": 0.0816, "step": 5380 }, { "epoch": 0.009802695741328865, "grad_norm": 0.08924435079097748, "learning_rate": 0.0002, "loss": 0.0709, "step": 5390 }, { "epoch": 0.009820882560886061, "grad_norm": 0.02383723482489586, "learning_rate": 0.0002, "loss": 0.0208, "step": 5400 }, { "epoch": 0.009839069380443257, "grad_norm": 0.042910825461149216, "learning_rate": 0.0002, "loss": 0.1383, "step": 5410 }, { "epoch": 0.009857256200000455, "grad_norm": 0.05560186505317688, "learning_rate": 0.0002, "loss": 0.0827, "step": 5420 }, { "epoch": 0.009875443019557652, "grad_norm": 0.08179624378681183, "learning_rate": 0.0002, "loss": 0.0758, "step": 5430 }, { "epoch": 0.009893629839114848, "grad_norm": 0.17111806571483612, "learning_rate": 0.0002, "loss": 0.0688, "step": 5440 }, { "epoch": 0.009911816658672044, "grad_norm": 0.008684845641255379, "learning_rate": 0.0002, "loss": 0.0177, "step": 5450 }, { "epoch": 0.00993000347822924, "grad_norm": 0.044370412826538086, "learning_rate": 0.0002, "loss": 0.2036, "step": 5460 }, { "epoch": 0.009948190297786437, "grad_norm": 0.08403154462575912, "learning_rate": 0.0002, "loss": 0.0878, "step": 5470 }, { "epoch": 0.009966377117343633, "grad_norm": 0.10712645202875137, "learning_rate": 0.0002, "loss": 0.0787, "step": 5480 }, { "epoch": 0.00998456393690083, "grad_norm": 0.12575705349445343, "learning_rate": 0.0002, "loss": 0.0637, "step": 5490 }, { "epoch": 0.010002750756458026, "grad_norm": 0.018583891913294792, "learning_rate": 0.0002, "loss": 0.0179, "step": 5500 }, { "epoch": 0.010020937576015222, "grad_norm": 0.040852561593055725, "learning_rate": 0.0002, "loss": 0.1545, "step": 5510 }, { "epoch": 0.010039124395572419, "grad_norm": 0.09006325900554657, "learning_rate": 0.0002, "loss": 0.0888, "step": 5520 }, { "epoch": 0.010057311215129615, "grad_norm": 0.06323093175888062, "learning_rate": 0.0002, "loss": 0.0778, "step": 5530 }, { "epoch": 0.010075498034686811, "grad_norm": 0.10159824043512344, "learning_rate": 0.0002, "loss": 0.0662, "step": 5540 }, { "epoch": 0.010093684854244007, "grad_norm": 0.012086872011423111, "learning_rate": 0.0002, "loss": 0.0237, "step": 5550 }, { "epoch": 0.010111871673801204, "grad_norm": 0.02518664114177227, "learning_rate": 0.0002, "loss": 0.1246, "step": 5560 }, { "epoch": 0.0101300584933584, "grad_norm": 0.056161828339099884, "learning_rate": 0.0002, "loss": 0.086, "step": 5570 }, { "epoch": 0.010148245312915596, "grad_norm": 0.03376586362719536, "learning_rate": 0.0002, "loss": 0.0842, "step": 5580 }, { "epoch": 0.010166432132472794, "grad_norm": 0.09921032190322876, "learning_rate": 0.0002, "loss": 0.0667, "step": 5590 }, { "epoch": 0.01018461895202999, "grad_norm": 0.009120604954659939, "learning_rate": 0.0002, "loss": 0.0209, "step": 5600 }, { "epoch": 0.010202805771587187, "grad_norm": 0.037767425179481506, "learning_rate": 0.0002, "loss": 0.1248, "step": 5610 }, { "epoch": 0.010220992591144383, "grad_norm": 0.05255524069070816, "learning_rate": 0.0002, "loss": 0.0794, "step": 5620 }, { "epoch": 0.01023917941070158, "grad_norm": 0.038734354078769684, "learning_rate": 0.0002, "loss": 0.0791, "step": 5630 }, { "epoch": 0.010257366230258776, "grad_norm": 0.09293238073587418, "learning_rate": 0.0002, "loss": 0.064, "step": 5640 }, { "epoch": 0.010275553049815972, "grad_norm": 0.013020232319831848, "learning_rate": 0.0002, "loss": 0.0174, "step": 5650 }, { "epoch": 0.010293739869373169, "grad_norm": 0.030535893514752388, "learning_rate": 0.0002, "loss": 0.1615, "step": 5660 }, { "epoch": 0.010311926688930365, "grad_norm": 0.08644227683544159, "learning_rate": 0.0002, "loss": 0.0856, "step": 5670 }, { "epoch": 0.010330113508487561, "grad_norm": 0.04769067466259003, "learning_rate": 0.0002, "loss": 0.0777, "step": 5680 }, { "epoch": 0.010348300328044758, "grad_norm": 0.1528550088405609, "learning_rate": 0.0002, "loss": 0.0757, "step": 5690 }, { "epoch": 0.010366487147601954, "grad_norm": 0.012257793918251991, "learning_rate": 0.0002, "loss": 0.0163, "step": 5700 }, { "epoch": 0.01038467396715915, "grad_norm": 0.5761304497718811, "learning_rate": 0.0002, "loss": 0.1787, "step": 5710 }, { "epoch": 0.010402860786716346, "grad_norm": 0.07034485787153244, "learning_rate": 0.0002, "loss": 0.0964, "step": 5720 }, { "epoch": 0.010421047606273543, "grad_norm": 0.04541708156466484, "learning_rate": 0.0002, "loss": 0.0793, "step": 5730 }, { "epoch": 0.010439234425830739, "grad_norm": 0.12013612687587738, "learning_rate": 0.0002, "loss": 0.07, "step": 5740 }, { "epoch": 0.010457421245387935, "grad_norm": 0.014152747578918934, "learning_rate": 0.0002, "loss": 0.0208, "step": 5750 }, { "epoch": 0.010475608064945133, "grad_norm": 0.029470542445778847, "learning_rate": 0.0002, "loss": 0.1352, "step": 5760 }, { "epoch": 0.01049379488450233, "grad_norm": 0.04889104515314102, "learning_rate": 0.0002, "loss": 0.0748, "step": 5770 }, { "epoch": 0.010511981704059526, "grad_norm": 0.0311355609446764, "learning_rate": 0.0002, "loss": 0.0764, "step": 5780 }, { "epoch": 0.010530168523616722, "grad_norm": 0.16830098628997803, "learning_rate": 0.0002, "loss": 0.0734, "step": 5790 }, { "epoch": 0.010548355343173919, "grad_norm": 0.013224232010543346, "learning_rate": 0.0002, "loss": 0.0218, "step": 5800 }, { "epoch": 0.010566542162731115, "grad_norm": 0.03710555657744408, "learning_rate": 0.0002, "loss": 0.1403, "step": 5810 }, { "epoch": 0.010584728982288311, "grad_norm": 0.05788695067167282, "learning_rate": 0.0002, "loss": 0.0863, "step": 5820 }, { "epoch": 0.010602915801845508, "grad_norm": 0.03398163616657257, "learning_rate": 0.0002, "loss": 0.0751, "step": 5830 }, { "epoch": 0.010621102621402704, "grad_norm": 0.13862720131874084, "learning_rate": 0.0002, "loss": 0.07, "step": 5840 }, { "epoch": 0.0106392894409599, "grad_norm": 0.016240287572145462, "learning_rate": 0.0002, "loss": 0.0209, "step": 5850 }, { "epoch": 0.010657476260517097, "grad_norm": 0.030351752415299416, "learning_rate": 0.0002, "loss": 0.157, "step": 5860 }, { "epoch": 0.010675663080074293, "grad_norm": 0.038465555757284164, "learning_rate": 0.0002, "loss": 0.072, "step": 5870 }, { "epoch": 0.01069384989963149, "grad_norm": 0.07298482209444046, "learning_rate": 0.0002, "loss": 0.0796, "step": 5880 }, { "epoch": 0.010712036719188685, "grad_norm": 0.13822157680988312, "learning_rate": 0.0002, "loss": 0.0687, "step": 5890 }, { "epoch": 0.010730223538745882, "grad_norm": 0.014381729066371918, "learning_rate": 0.0002, "loss": 0.0192, "step": 5900 }, { "epoch": 0.010748410358303078, "grad_norm": 0.040448348969221115, "learning_rate": 0.0002, "loss": 0.1714, "step": 5910 }, { "epoch": 0.010766597177860274, "grad_norm": 0.06950225681066513, "learning_rate": 0.0002, "loss": 0.098, "step": 5920 }, { "epoch": 0.010784783997417472, "grad_norm": 0.04581855982542038, "learning_rate": 0.0002, "loss": 0.0752, "step": 5930 }, { "epoch": 0.010802970816974669, "grad_norm": 0.10498905926942825, "learning_rate": 0.0002, "loss": 0.0627, "step": 5940 }, { "epoch": 0.010821157636531865, "grad_norm": 0.009345698170363903, "learning_rate": 0.0002, "loss": 0.0183, "step": 5950 }, { "epoch": 0.010839344456089061, "grad_norm": 0.02440352365374565, "learning_rate": 0.0002, "loss": 0.1289, "step": 5960 }, { "epoch": 0.010857531275646258, "grad_norm": 0.051523737609386444, "learning_rate": 0.0002, "loss": 0.0813, "step": 5970 }, { "epoch": 0.010875718095203454, "grad_norm": 0.031664300709962845, "learning_rate": 0.0002, "loss": 0.0736, "step": 5980 }, { "epoch": 0.01089390491476065, "grad_norm": 0.10166060924530029, "learning_rate": 0.0002, "loss": 0.0631, "step": 5990 }, { "epoch": 0.010912091734317847, "grad_norm": 0.01642071269452572, "learning_rate": 0.0002, "loss": 0.0198, "step": 6000 }, { "epoch": 0.010930278553875043, "grad_norm": 0.04028782621026039, "learning_rate": 0.0002, "loss": 0.1355, "step": 6010 }, { "epoch": 0.01094846537343224, "grad_norm": 0.04289260134100914, "learning_rate": 0.0002, "loss": 0.0794, "step": 6020 }, { "epoch": 0.010966652192989436, "grad_norm": 0.03854202851653099, "learning_rate": 0.0002, "loss": 0.0765, "step": 6030 }, { "epoch": 0.010984839012546632, "grad_norm": 0.07910823822021484, "learning_rate": 0.0002, "loss": 0.0618, "step": 6040 }, { "epoch": 0.011003025832103828, "grad_norm": 0.009719946421682835, "learning_rate": 0.0002, "loss": 0.0145, "step": 6050 }, { "epoch": 0.011021212651661024, "grad_norm": 0.06853003799915314, "learning_rate": 0.0002, "loss": 0.1563, "step": 6060 }, { "epoch": 0.01103939947121822, "grad_norm": 0.02887076325714588, "learning_rate": 0.0002, "loss": 0.0803, "step": 6070 }, { "epoch": 0.011057586290775417, "grad_norm": 0.060147739946842194, "learning_rate": 0.0002, "loss": 0.0758, "step": 6080 }, { "epoch": 0.011075773110332613, "grad_norm": 0.10197418928146362, "learning_rate": 0.0002, "loss": 0.0627, "step": 6090 }, { "epoch": 0.011093959929889811, "grad_norm": 0.015125100500881672, "learning_rate": 0.0002, "loss": 0.0164, "step": 6100 }, { "epoch": 0.011112146749447008, "grad_norm": 0.029526161029934883, "learning_rate": 0.0002, "loss": 0.1526, "step": 6110 }, { "epoch": 0.011130333569004204, "grad_norm": 0.05942453444004059, "learning_rate": 0.0002, "loss": 0.0891, "step": 6120 }, { "epoch": 0.0111485203885614, "grad_norm": 0.07344426214694977, "learning_rate": 0.0002, "loss": 0.0753, "step": 6130 }, { "epoch": 0.011166707208118597, "grad_norm": 0.1394059658050537, "learning_rate": 0.0002, "loss": 0.0776, "step": 6140 }, { "epoch": 0.011184894027675793, "grad_norm": 0.00965851079672575, "learning_rate": 0.0002, "loss": 0.019, "step": 6150 }, { "epoch": 0.01120308084723299, "grad_norm": 0.041846372187137604, "learning_rate": 0.0002, "loss": 0.1776, "step": 6160 }, { "epoch": 0.011221267666790186, "grad_norm": 0.04657486826181412, "learning_rate": 0.0002, "loss": 0.0878, "step": 6170 }, { "epoch": 0.011239454486347382, "grad_norm": 0.026520246639847755, "learning_rate": 0.0002, "loss": 0.0768, "step": 6180 }, { "epoch": 0.011257641305904578, "grad_norm": 0.10318096727132797, "learning_rate": 0.0002, "loss": 0.0617, "step": 6190 }, { "epoch": 0.011275828125461775, "grad_norm": 0.019912905991077423, "learning_rate": 0.0002, "loss": 0.0202, "step": 6200 }, { "epoch": 0.01129401494501897, "grad_norm": 0.05316480994224548, "learning_rate": 0.0002, "loss": 0.1412, "step": 6210 }, { "epoch": 0.011312201764576167, "grad_norm": 0.02944323979318142, "learning_rate": 0.0002, "loss": 0.0829, "step": 6220 }, { "epoch": 0.011330388584133363, "grad_norm": 0.0285831056535244, "learning_rate": 0.0002, "loss": 0.074, "step": 6230 }, { "epoch": 0.01134857540369056, "grad_norm": 0.0975700169801712, "learning_rate": 0.0002, "loss": 0.0681, "step": 6240 }, { "epoch": 0.011366762223247756, "grad_norm": 0.025717545300722122, "learning_rate": 0.0002, "loss": 0.0221, "step": 6250 }, { "epoch": 0.011384949042804952, "grad_norm": 0.02859714813530445, "learning_rate": 0.0002, "loss": 0.1142, "step": 6260 }, { "epoch": 0.01140313586236215, "grad_norm": 0.04395005479454994, "learning_rate": 0.0002, "loss": 0.0777, "step": 6270 }, { "epoch": 0.011421322681919347, "grad_norm": 0.05116860568523407, "learning_rate": 0.0002, "loss": 0.0763, "step": 6280 }, { "epoch": 0.011439509501476543, "grad_norm": 0.06850302964448929, "learning_rate": 0.0002, "loss": 0.0632, "step": 6290 }, { "epoch": 0.01145769632103374, "grad_norm": 0.016113542020320892, "learning_rate": 0.0002, "loss": 0.0178, "step": 6300 }, { "epoch": 0.011475883140590936, "grad_norm": 0.032306116074323654, "learning_rate": 0.0002, "loss": 0.1306, "step": 6310 }, { "epoch": 0.011494069960148132, "grad_norm": 0.055701326578855515, "learning_rate": 0.0002, "loss": 0.0834, "step": 6320 }, { "epoch": 0.011512256779705328, "grad_norm": 0.022934190928936005, "learning_rate": 0.0002, "loss": 0.0737, "step": 6330 }, { "epoch": 0.011530443599262525, "grad_norm": 0.08375566452741623, "learning_rate": 0.0002, "loss": 0.0661, "step": 6340 }, { "epoch": 0.011548630418819721, "grad_norm": 0.013614729046821594, "learning_rate": 0.0002, "loss": 0.0187, "step": 6350 }, { "epoch": 0.011566817238376917, "grad_norm": 0.028269700706005096, "learning_rate": 0.0002, "loss": 0.1245, "step": 6360 }, { "epoch": 0.011585004057934114, "grad_norm": 0.03646335378289223, "learning_rate": 0.0002, "loss": 0.0866, "step": 6370 }, { "epoch": 0.01160319087749131, "grad_norm": 0.0371277742087841, "learning_rate": 0.0002, "loss": 0.0737, "step": 6380 }, { "epoch": 0.011621377697048506, "grad_norm": 0.13698458671569824, "learning_rate": 0.0002, "loss": 0.0679, "step": 6390 }, { "epoch": 0.011639564516605702, "grad_norm": 0.009350700303912163, "learning_rate": 0.0002, "loss": 0.024, "step": 6400 }, { "epoch": 0.011657751336162899, "grad_norm": 0.03187236189842224, "learning_rate": 0.0002, "loss": 0.1555, "step": 6410 }, { "epoch": 0.011675938155720095, "grad_norm": 0.06672242283821106, "learning_rate": 0.0002, "loss": 0.0835, "step": 6420 }, { "epoch": 0.011694124975277291, "grad_norm": 0.07821471244096756, "learning_rate": 0.0002, "loss": 0.0746, "step": 6430 }, { "epoch": 0.011712311794834488, "grad_norm": 0.14781107008457184, "learning_rate": 0.0002, "loss": 0.0662, "step": 6440 }, { "epoch": 0.011730498614391686, "grad_norm": 0.0057207453064620495, "learning_rate": 0.0002, "loss": 0.0169, "step": 6450 }, { "epoch": 0.011748685433948882, "grad_norm": 0.04252105578780174, "learning_rate": 0.0002, "loss": 0.1868, "step": 6460 }, { "epoch": 0.011766872253506078, "grad_norm": 0.05041474476456642, "learning_rate": 0.0002, "loss": 0.0842, "step": 6470 }, { "epoch": 0.011785059073063275, "grad_norm": 0.06584125757217407, "learning_rate": 0.0002, "loss": 0.0779, "step": 6480 }, { "epoch": 0.011803245892620471, "grad_norm": 0.14610575139522552, "learning_rate": 0.0002, "loss": 0.063, "step": 6490 }, { "epoch": 0.011821432712177667, "grad_norm": 0.01419675163924694, "learning_rate": 0.0002, "loss": 0.0152, "step": 6500 }, { "epoch": 0.011839619531734864, "grad_norm": 0.03371060639619827, "learning_rate": 0.0002, "loss": 0.1725, "step": 6510 }, { "epoch": 0.01185780635129206, "grad_norm": 0.028900766745209694, "learning_rate": 0.0002, "loss": 0.0815, "step": 6520 }, { "epoch": 0.011875993170849256, "grad_norm": 0.059519629925489426, "learning_rate": 0.0002, "loss": 0.0785, "step": 6530 }, { "epoch": 0.011894179990406453, "grad_norm": 0.12085167318582535, "learning_rate": 0.0002, "loss": 0.0615, "step": 6540 }, { "epoch": 0.011912366809963649, "grad_norm": 0.028604619204998016, "learning_rate": 0.0002, "loss": 0.0196, "step": 6550 }, { "epoch": 0.011930553629520845, "grad_norm": 0.03659407049417496, "learning_rate": 0.0002, "loss": 0.1403, "step": 6560 }, { "epoch": 0.011948740449078041, "grad_norm": 0.034444138407707214, "learning_rate": 0.0002, "loss": 0.0812, "step": 6570 }, { "epoch": 0.011966927268635238, "grad_norm": 0.029788263142108917, "learning_rate": 0.0002, "loss": 0.0713, "step": 6580 }, { "epoch": 0.011985114088192434, "grad_norm": 0.1271272599697113, "learning_rate": 0.0002, "loss": 0.0672, "step": 6590 }, { "epoch": 0.01200330090774963, "grad_norm": 0.018705012276768684, "learning_rate": 0.0002, "loss": 0.0212, "step": 6600 }, { "epoch": 0.012021487727306827, "grad_norm": 0.02982541173696518, "learning_rate": 0.0002, "loss": 0.1152, "step": 6610 }, { "epoch": 0.012039674546864025, "grad_norm": 0.06942040473222733, "learning_rate": 0.0002, "loss": 0.0963, "step": 6620 }, { "epoch": 0.012057861366421221, "grad_norm": 0.06102292984724045, "learning_rate": 0.0002, "loss": 0.0775, "step": 6630 }, { "epoch": 0.012076048185978417, "grad_norm": 0.10115987807512283, "learning_rate": 0.0002, "loss": 0.0729, "step": 6640 }, { "epoch": 0.012094235005535614, "grad_norm": 0.011439867317676544, "learning_rate": 0.0002, "loss": 0.0253, "step": 6650 }, { "epoch": 0.01211242182509281, "grad_norm": 0.062434904277324677, "learning_rate": 0.0002, "loss": 0.1166, "step": 6660 }, { "epoch": 0.012130608644650006, "grad_norm": 0.055352553725242615, "learning_rate": 0.0002, "loss": 0.0802, "step": 6670 }, { "epoch": 0.012148795464207203, "grad_norm": 0.031538888812065125, "learning_rate": 0.0002, "loss": 0.0786, "step": 6680 }, { "epoch": 0.012166982283764399, "grad_norm": 0.10964162647724152, "learning_rate": 0.0002, "loss": 0.0626, "step": 6690 }, { "epoch": 0.012185169103321595, "grad_norm": 0.011173764243721962, "learning_rate": 0.0002, "loss": 0.0205, "step": 6700 }, { "epoch": 0.012203355922878792, "grad_norm": 0.035984206944704056, "learning_rate": 0.0002, "loss": 0.1412, "step": 6710 }, { "epoch": 0.012221542742435988, "grad_norm": 0.07189827412366867, "learning_rate": 0.0002, "loss": 0.0818, "step": 6720 }, { "epoch": 0.012239729561993184, "grad_norm": 0.0400136299431324, "learning_rate": 0.0002, "loss": 0.0676, "step": 6730 }, { "epoch": 0.01225791638155038, "grad_norm": 0.14700625836849213, "learning_rate": 0.0002, "loss": 0.0663, "step": 6740 }, { "epoch": 0.012276103201107577, "grad_norm": 0.007156179752200842, "learning_rate": 0.0002, "loss": 0.0171, "step": 6750 }, { "epoch": 0.012294290020664773, "grad_norm": 0.04911777004599571, "learning_rate": 0.0002, "loss": 0.1657, "step": 6760 }, { "epoch": 0.01231247684022197, "grad_norm": 0.03729144483804703, "learning_rate": 0.0002, "loss": 0.0806, "step": 6770 }, { "epoch": 0.012330663659779166, "grad_norm": 0.037231944501399994, "learning_rate": 0.0002, "loss": 0.0736, "step": 6780 }, { "epoch": 0.012348850479336364, "grad_norm": 0.09694401919841766, "learning_rate": 0.0002, "loss": 0.0642, "step": 6790 }, { "epoch": 0.01236703729889356, "grad_norm": 0.025534989312291145, "learning_rate": 0.0002, "loss": 0.0208, "step": 6800 }, { "epoch": 0.012385224118450756, "grad_norm": 0.033654361963272095, "learning_rate": 0.0002, "loss": 0.1295, "step": 6810 }, { "epoch": 0.012403410938007953, "grad_norm": 0.04499521851539612, "learning_rate": 0.0002, "loss": 0.0902, "step": 6820 }, { "epoch": 0.012421597757565149, "grad_norm": 0.0335836224257946, "learning_rate": 0.0002, "loss": 0.0781, "step": 6830 }, { "epoch": 0.012439784577122345, "grad_norm": 0.1040850430727005, "learning_rate": 0.0002, "loss": 0.0679, "step": 6840 }, { "epoch": 0.012457971396679542, "grad_norm": 0.015963764861226082, "learning_rate": 0.0002, "loss": 0.0226, "step": 6850 }, { "epoch": 0.012476158216236738, "grad_norm": 0.05578307807445526, "learning_rate": 0.0002, "loss": 0.1119, "step": 6860 }, { "epoch": 0.012494345035793934, "grad_norm": 0.0364505760371685, "learning_rate": 0.0002, "loss": 0.0805, "step": 6870 }, { "epoch": 0.01251253185535113, "grad_norm": 0.027990469709038734, "learning_rate": 0.0002, "loss": 0.0826, "step": 6880 }, { "epoch": 0.012530718674908327, "grad_norm": 0.08282670378684998, "learning_rate": 0.0002, "loss": 0.0685, "step": 6890 }, { "epoch": 0.012548905494465523, "grad_norm": 0.02172144502401352, "learning_rate": 0.0002, "loss": 0.0259, "step": 6900 }, { "epoch": 0.01256709231402272, "grad_norm": 0.04074740409851074, "learning_rate": 0.0002, "loss": 0.1211, "step": 6910 }, { "epoch": 0.012585279133579916, "grad_norm": 0.05433020740747452, "learning_rate": 0.0002, "loss": 0.0776, "step": 6920 }, { "epoch": 0.012603465953137112, "grad_norm": 0.05479983240365982, "learning_rate": 0.0002, "loss": 0.0778, "step": 6930 }, { "epoch": 0.012621652772694308, "grad_norm": 1.6031180620193481, "learning_rate": 0.0002, "loss": 0.2265, "step": 6940 }, { "epoch": 0.012639839592251505, "grad_norm": 1.0940366983413696, "learning_rate": 0.0002, "loss": 0.4586, "step": 6950 }, { "epoch": 0.012658026411808703, "grad_norm": 0.0412282720208168, "learning_rate": 0.0002, "loss": 0.1072, "step": 6960 }, { "epoch": 0.012676213231365899, "grad_norm": 0.03705910965800285, "learning_rate": 0.0002, "loss": 0.1014, "step": 6970 }, { "epoch": 0.012694400050923095, "grad_norm": 0.07444313168525696, "learning_rate": 0.0002, "loss": 0.0881, "step": 6980 }, { "epoch": 0.012712586870480292, "grad_norm": 0.08558017760515213, "learning_rate": 0.0002, "loss": 0.0619, "step": 6990 }, { "epoch": 0.012730773690037488, "grad_norm": 0.0004157133516855538, "learning_rate": 0.0002, "loss": 0.0045, "step": 7000 }, { "epoch": 0.012748960509594684, "grad_norm": 0.07950109243392944, "learning_rate": 0.0002, "loss": 0.1801, "step": 7010 }, { "epoch": 0.01276714732915188, "grad_norm": 0.08424151688814163, "learning_rate": 0.0002, "loss": 0.0753, "step": 7020 }, { "epoch": 0.012785334148709077, "grad_norm": 0.47635558247566223, "learning_rate": 0.0002, "loss": 0.0753, "step": 7030 }, { "epoch": 0.012803520968266273, "grad_norm": 0.0452958345413208, "learning_rate": 0.0002, "loss": 0.0731, "step": 7040 }, { "epoch": 0.01282170778782347, "grad_norm": 0.007719043176621199, "learning_rate": 0.0002, "loss": 0.0193, "step": 7050 }, { "epoch": 0.012839894607380666, "grad_norm": 0.2408572882413864, "learning_rate": 0.0002, "loss": 0.4117, "step": 7060 }, { "epoch": 0.012858081426937862, "grad_norm": 0.7272363305091858, "learning_rate": 0.0002, "loss": 0.0852, "step": 7070 }, { "epoch": 0.012876268246495058, "grad_norm": 0.5539261698722839, "learning_rate": 0.0002, "loss": 0.075, "step": 7080 }, { "epoch": 0.012894455066052255, "grad_norm": 4.608922481536865, "learning_rate": 0.0002, "loss": 0.2301, "step": 7090 }, { "epoch": 0.012912641885609451, "grad_norm": 0.0012216357281431556, "learning_rate": 0.0002, "loss": 0.0034, "step": 7100 }, { "epoch": 0.012930828705166647, "grad_norm": 0.15025563538074493, "learning_rate": 0.0002, "loss": 0.2717, "step": 7110 }, { "epoch": 0.012949015524723844, "grad_norm": 0.06209970638155937, "learning_rate": 0.0002, "loss": 0.0852, "step": 7120 }, { "epoch": 0.012967202344281042, "grad_norm": 0.6127016544342041, "learning_rate": 0.0002, "loss": 0.1271, "step": 7130 }, { "epoch": 0.012985389163838238, "grad_norm": 0.047152891755104065, "learning_rate": 0.0002, "loss": 0.0626, "step": 7140 }, { "epoch": 0.013003575983395434, "grad_norm": 0.0005132685182616115, "learning_rate": 0.0002, "loss": 0.0029, "step": 7150 }, { "epoch": 0.01302176280295263, "grad_norm": 0.08946029096841812, "learning_rate": 0.0002, "loss": 0.309, "step": 7160 }, { "epoch": 0.013039949622509827, "grad_norm": 0.18610751628875732, "learning_rate": 0.0002, "loss": 0.0867, "step": 7170 }, { "epoch": 0.013058136442067023, "grad_norm": 0.07280854880809784, "learning_rate": 0.0002, "loss": 0.0832, "step": 7180 }, { "epoch": 0.01307632326162422, "grad_norm": 0.11997990310192108, "learning_rate": 0.0002, "loss": 0.0746, "step": 7190 }, { "epoch": 0.013094510081181416, "grad_norm": 0.00019475500448606908, "learning_rate": 0.0002, "loss": 0.01, "step": 7200 }, { "epoch": 0.013112696900738612, "grad_norm": 0.07719916105270386, "learning_rate": 0.0002, "loss": 0.3035, "step": 7210 }, { "epoch": 0.013130883720295809, "grad_norm": 0.0990060344338417, "learning_rate": 0.0002, "loss": 0.0902, "step": 7220 }, { "epoch": 0.013149070539853005, "grad_norm": 0.22215688228607178, "learning_rate": 0.0002, "loss": 0.0797, "step": 7230 }, { "epoch": 0.013167257359410201, "grad_norm": 0.08412040770053864, "learning_rate": 0.0002, "loss": 0.0646, "step": 7240 }, { "epoch": 0.013185444178967397, "grad_norm": 0.0017518314998596907, "learning_rate": 0.0002, "loss": 0.007, "step": 7250 }, { "epoch": 0.013203630998524594, "grad_norm": 0.1554754078388214, "learning_rate": 0.0002, "loss": 0.2319, "step": 7260 }, { "epoch": 0.01322181781808179, "grad_norm": 0.052371326833963394, "learning_rate": 0.0002, "loss": 0.0832, "step": 7270 }, { "epoch": 0.013240004637638986, "grad_norm": 0.9168817400932312, "learning_rate": 0.0002, "loss": 0.0791, "step": 7280 }, { "epoch": 0.013258191457196183, "grad_norm": 0.07169363647699356, "learning_rate": 0.0002, "loss": 0.0602, "step": 7290 }, { "epoch": 0.01327637827675338, "grad_norm": 0.0009911650558933616, "learning_rate": 0.0002, "loss": 0.0041, "step": 7300 }, { "epoch": 0.013294565096310577, "grad_norm": 0.2644541263580322, "learning_rate": 0.0002, "loss": 0.2193, "step": 7310 }, { "epoch": 0.013312751915867773, "grad_norm": 0.12140689790248871, "learning_rate": 0.0002, "loss": 0.0944, "step": 7320 }, { "epoch": 0.01333093873542497, "grad_norm": 0.03627191483974457, "learning_rate": 0.0002, "loss": 0.0812, "step": 7330 }, { "epoch": 0.013349125554982166, "grad_norm": 0.06252894550561905, "learning_rate": 0.0002, "loss": 0.0596, "step": 7340 }, { "epoch": 0.013367312374539362, "grad_norm": 0.20318441092967987, "learning_rate": 0.0002, "loss": 0.0064, "step": 7350 }, { "epoch": 0.013385499194096559, "grad_norm": 0.4231732189655304, "learning_rate": 0.0002, "loss": 0.4329, "step": 7360 }, { "epoch": 0.013403686013653755, "grad_norm": 0.07567082345485687, "learning_rate": 0.0002, "loss": 0.089, "step": 7370 }, { "epoch": 0.013421872833210951, "grad_norm": 0.23021474480628967, "learning_rate": 0.0002, "loss": 0.0833, "step": 7380 }, { "epoch": 0.013440059652768148, "grad_norm": 0.09458985179662704, "learning_rate": 0.0002, "loss": 0.1391, "step": 7390 }, { "epoch": 0.013458246472325344, "grad_norm": 0.010052111931145191, "learning_rate": 0.0002, "loss": 0.0073, "step": 7400 }, { "epoch": 0.01347643329188254, "grad_norm": 0.2159787267446518, "learning_rate": 0.0002, "loss": 0.2249, "step": 7410 }, { "epoch": 0.013494620111439736, "grad_norm": 0.11222853511571884, "learning_rate": 0.0002, "loss": 0.093, "step": 7420 }, { "epoch": 0.013512806930996933, "grad_norm": 0.08586139976978302, "learning_rate": 0.0002, "loss": 0.0821, "step": 7430 }, { "epoch": 0.013530993750554129, "grad_norm": 0.12232748419046402, "learning_rate": 0.0002, "loss": 0.0736, "step": 7440 }, { "epoch": 0.013549180570111325, "grad_norm": 0.006977527402341366, "learning_rate": 0.0002, "loss": 0.0115, "step": 7450 }, { "epoch": 0.013567367389668522, "grad_norm": 0.051690369844436646, "learning_rate": 0.0002, "loss": 0.2247, "step": 7460 }, { "epoch": 0.013585554209225718, "grad_norm": 0.06542158871889114, "learning_rate": 0.0002, "loss": 0.1056, "step": 7470 }, { "epoch": 0.013603741028782916, "grad_norm": 0.18546995520591736, "learning_rate": 0.0002, "loss": 0.1102, "step": 7480 }, { "epoch": 0.013621927848340112, "grad_norm": 13.399182319641113, "learning_rate": 0.0002, "loss": 2.0806, "step": 7490 }, { "epoch": 0.013640114667897309, "grad_norm": 0.0982588455080986, "learning_rate": 0.0002, "loss": 0.2158, "step": 7500 }, { "epoch": 0.013658301487454505, "grad_norm": 0.07860754430294037, "learning_rate": 0.0002, "loss": 0.125, "step": 7510 }, { "epoch": 0.013676488307011701, "grad_norm": 0.1165497750043869, "learning_rate": 0.0002, "loss": 0.0899, "step": 7520 }, { "epoch": 0.013694675126568898, "grad_norm": 0.2813965380191803, "learning_rate": 0.0002, "loss": 0.0758, "step": 7530 }, { "epoch": 0.013712861946126094, "grad_norm": 0.33458462357521057, "learning_rate": 0.0002, "loss": 0.0683, "step": 7540 }, { "epoch": 0.01373104876568329, "grad_norm": 0.012062279507517815, "learning_rate": 0.0002, "loss": 0.0135, "step": 7550 }, { "epoch": 0.013749235585240487, "grad_norm": 0.1787721961736679, "learning_rate": 0.0002, "loss": 0.1763, "step": 7560 }, { "epoch": 0.013767422404797683, "grad_norm": 0.05922751501202583, "learning_rate": 0.0002, "loss": 0.1223, "step": 7570 }, { "epoch": 0.01378560922435488, "grad_norm": 0.11594684422016144, "learning_rate": 0.0002, "loss": 0.0934, "step": 7580 }, { "epoch": 0.013803796043912075, "grad_norm": 0.2290794998407364, "learning_rate": 0.0002, "loss": 0.0799, "step": 7590 }, { "epoch": 0.013821982863469272, "grad_norm": 0.04903063178062439, "learning_rate": 0.0002, "loss": 0.0195, "step": 7600 }, { "epoch": 0.013840169683026468, "grad_norm": 123.61300659179688, "learning_rate": 0.0002, "loss": 0.8359, "step": 7610 }, { "epoch": 0.013858356502583664, "grad_norm": 0.25403347611427307, "learning_rate": 0.0002, "loss": 1.0464, "step": 7620 }, { "epoch": 0.01387654332214086, "grad_norm": 0.08144152164459229, "learning_rate": 0.0002, "loss": 0.0794, "step": 7630 }, { "epoch": 0.013894730141698057, "grad_norm": 0.11679713428020477, "learning_rate": 0.0002, "loss": 0.0706, "step": 7640 }, { "epoch": 0.013912916961255255, "grad_norm": 0.00391317019239068, "learning_rate": 0.0002, "loss": 0.0075, "step": 7650 }, { "epoch": 0.013931103780812451, "grad_norm": 0.13209663331508636, "learning_rate": 0.0002, "loss": 0.2228, "step": 7660 }, { "epoch": 0.013949290600369648, "grad_norm": 0.06067880615592003, "learning_rate": 0.0002, "loss": 0.089, "step": 7670 }, { "epoch": 0.013967477419926844, "grad_norm": 0.04806550592184067, "learning_rate": 0.0002, "loss": 0.0769, "step": 7680 }, { "epoch": 0.01398566423948404, "grad_norm": 0.09506970643997192, "learning_rate": 0.0002, "loss": 0.0689, "step": 7690 }, { "epoch": 0.014003851059041237, "grad_norm": 0.002536884741857648, "learning_rate": 0.0002, "loss": 0.0172, "step": 7700 }, { "epoch": 0.014022037878598433, "grad_norm": 0.13837113976478577, "learning_rate": 0.0002, "loss": 0.2243, "step": 7710 }, { "epoch": 0.01404022469815563, "grad_norm": 0.08101535588502884, "learning_rate": 0.0002, "loss": 0.0878, "step": 7720 }, { "epoch": 0.014058411517712826, "grad_norm": 0.04018868878483772, "learning_rate": 0.0002, "loss": 0.0776, "step": 7730 }, { "epoch": 0.014076598337270022, "grad_norm": 0.1377197653055191, "learning_rate": 0.0002, "loss": 0.0681, "step": 7740 }, { "epoch": 0.014094785156827218, "grad_norm": 0.0006735012284480035, "learning_rate": 0.0002, "loss": 0.0041, "step": 7750 }, { "epoch": 0.014112971976384414, "grad_norm": 0.17503094673156738, "learning_rate": 0.0002, "loss": 0.3114, "step": 7760 }, { "epoch": 0.01413115879594161, "grad_norm": 0.07190551608800888, "learning_rate": 0.0002, "loss": 0.1018, "step": 7770 }, { "epoch": 0.014149345615498807, "grad_norm": 0.036945659667253494, "learning_rate": 0.0002, "loss": 0.0785, "step": 7780 }, { "epoch": 0.014167532435056003, "grad_norm": 0.13999724388122559, "learning_rate": 0.0002, "loss": 0.0725, "step": 7790 }, { "epoch": 0.0141857192546132, "grad_norm": 0.0031171294394880533, "learning_rate": 0.0002, "loss": 0.0144, "step": 7800 }, { "epoch": 0.014203906074170396, "grad_norm": 0.059554051607847214, "learning_rate": 0.0002, "loss": 0.2442, "step": 7810 }, { "epoch": 0.014222092893727594, "grad_norm": 0.06873622536659241, "learning_rate": 0.0002, "loss": 0.0904, "step": 7820 }, { "epoch": 0.01424027971328479, "grad_norm": 0.11261582374572754, "learning_rate": 0.0002, "loss": 0.0758, "step": 7830 }, { "epoch": 0.014258466532841987, "grad_norm": 1.497631311416626, "learning_rate": 0.0002, "loss": 0.0689, "step": 7840 }, { "epoch": 0.014276653352399183, "grad_norm": 0.004822546616196632, "learning_rate": 0.0002, "loss": 0.0156, "step": 7850 }, { "epoch": 0.01429484017195638, "grad_norm": 0.0575052835047245, "learning_rate": 0.0002, "loss": 0.1895, "step": 7860 }, { "epoch": 0.014313026991513576, "grad_norm": 0.10657750070095062, "learning_rate": 0.0002, "loss": 0.0855, "step": 7870 }, { "epoch": 0.014331213811070772, "grad_norm": 0.07080844044685364, "learning_rate": 0.0002, "loss": 0.0716, "step": 7880 }, { "epoch": 0.014349400630627968, "grad_norm": 0.1628514677286148, "learning_rate": 0.0002, "loss": 0.071, "step": 7890 }, { "epoch": 0.014367587450185165, "grad_norm": 0.013860347680747509, "learning_rate": 0.0002, "loss": 0.0227, "step": 7900 }, { "epoch": 0.014385774269742361, "grad_norm": 0.5240967869758606, "learning_rate": 0.0002, "loss": 0.1854, "step": 7910 }, { "epoch": 0.014403961089299557, "grad_norm": 1.0027457475662231, "learning_rate": 0.0002, "loss": 0.0942, "step": 7920 }, { "epoch": 0.014422147908856753, "grad_norm": 0.05730056390166283, "learning_rate": 0.0002, "loss": 0.0778, "step": 7930 }, { "epoch": 0.01444033472841395, "grad_norm": 0.1485404521226883, "learning_rate": 0.0002, "loss": 0.0719, "step": 7940 }, { "epoch": 0.014458521547971146, "grad_norm": 0.009702637791633606, "learning_rate": 0.0002, "loss": 0.0136, "step": 7950 }, { "epoch": 0.014476708367528342, "grad_norm": 0.046543315052986145, "learning_rate": 0.0002, "loss": 0.1697, "step": 7960 }, { "epoch": 0.014494895187085539, "grad_norm": 0.05248842388391495, "learning_rate": 0.0002, "loss": 0.0888, "step": 7970 }, { "epoch": 0.014513082006642735, "grad_norm": 0.047813788056373596, "learning_rate": 0.0002, "loss": 0.0806, "step": 7980 }, { "epoch": 0.014531268826199933, "grad_norm": 0.19744129478931427, "learning_rate": 0.0002, "loss": 0.0788, "step": 7990 }, { "epoch": 0.01454945564575713, "grad_norm": 0.005265017040073872, "learning_rate": 0.0002, "loss": 0.014, "step": 8000 }, { "epoch": 0.014567642465314326, "grad_norm": 0.0564056858420372, "learning_rate": 0.0002, "loss": 0.2681, "step": 8010 }, { "epoch": 0.014585829284871522, "grad_norm": 0.0958496481180191, "learning_rate": 0.0002, "loss": 0.0823, "step": 8020 }, { "epoch": 0.014604016104428718, "grad_norm": 0.12000919133424759, "learning_rate": 0.0002, "loss": 0.073, "step": 8030 }, { "epoch": 0.014622202923985915, "grad_norm": 0.15912771224975586, "learning_rate": 0.0002, "loss": 0.075, "step": 8040 }, { "epoch": 0.014640389743543111, "grad_norm": 0.004025776404887438, "learning_rate": 0.0002, "loss": 0.012, "step": 8050 }, { "epoch": 0.014658576563100307, "grad_norm": 0.1682930886745453, "learning_rate": 0.0002, "loss": 0.2926, "step": 8060 }, { "epoch": 0.014676763382657504, "grad_norm": 0.057362254709005356, "learning_rate": 0.0002, "loss": 0.0869, "step": 8070 }, { "epoch": 0.0146949502022147, "grad_norm": 0.0814078077673912, "learning_rate": 0.0002, "loss": 0.0825, "step": 8080 }, { "epoch": 0.014713137021771896, "grad_norm": 0.18205074965953827, "learning_rate": 0.0002, "loss": 0.0699, "step": 8090 }, { "epoch": 0.014731323841329092, "grad_norm": 0.013200881890952587, "learning_rate": 0.0002, "loss": 0.0228, "step": 8100 }, { "epoch": 0.014749510660886289, "grad_norm": 0.21043474972248077, "learning_rate": 0.0002, "loss": 0.2138, "step": 8110 }, { "epoch": 0.014767697480443485, "grad_norm": 0.1000015065073967, "learning_rate": 0.0002, "loss": 0.0906, "step": 8120 }, { "epoch": 0.014785884300000681, "grad_norm": 0.045657768845558167, "learning_rate": 0.0002, "loss": 0.0826, "step": 8130 }, { "epoch": 0.014804071119557878, "grad_norm": 0.13545630872249603, "learning_rate": 0.0002, "loss": 0.066, "step": 8140 }, { "epoch": 0.014822257939115074, "grad_norm": 0.01422254927456379, "learning_rate": 0.0002, "loss": 0.0179, "step": 8150 }, { "epoch": 0.014840444758672272, "grad_norm": 0.12108676135540009, "learning_rate": 0.0002, "loss": 0.1717, "step": 8160 }, { "epoch": 0.014858631578229468, "grad_norm": 0.10441934317350388, "learning_rate": 0.0002, "loss": 0.106, "step": 8170 }, { "epoch": 0.014876818397786665, "grad_norm": 0.08105968683958054, "learning_rate": 0.0002, "loss": 0.0829, "step": 8180 }, { "epoch": 0.014895005217343861, "grad_norm": 0.12230301648378372, "learning_rate": 0.0002, "loss": 0.0731, "step": 8190 }, { "epoch": 0.014913192036901057, "grad_norm": 0.033857300877571106, "learning_rate": 0.0002, "loss": 0.029, "step": 8200 }, { "epoch": 0.014931378856458254, "grad_norm": 0.04827893525362015, "learning_rate": 0.0002, "loss": 0.1369, "step": 8210 }, { "epoch": 0.01494956567601545, "grad_norm": 0.056212421506643295, "learning_rate": 0.0002, "loss": 0.0879, "step": 8220 }, { "epoch": 0.014967752495572646, "grad_norm": 0.03163846209645271, "learning_rate": 0.0002, "loss": 0.0698, "step": 8230 }, { "epoch": 0.014985939315129843, "grad_norm": 0.09394920617341995, "learning_rate": 0.0002, "loss": 0.0688, "step": 8240 }, { "epoch": 0.015004126134687039, "grad_norm": 0.024936649948358536, "learning_rate": 0.0002, "loss": 0.0211, "step": 8250 }, { "epoch": 0.015022312954244235, "grad_norm": 4.499615669250488, "learning_rate": 0.0002, "loss": 2.7596, "step": 8260 }, { "epoch": 0.015040499773801431, "grad_norm": 9.221298217773438, "learning_rate": 0.0002, "loss": 0.9135, "step": 8270 }, { "epoch": 0.015058686593358628, "grad_norm": 0.5199778079986572, "learning_rate": 0.0002, "loss": 0.1441, "step": 8280 }, { "epoch": 0.015076873412915824, "grad_norm": 0.07028087228536606, "learning_rate": 0.0002, "loss": 0.0793, "step": 8290 }, { "epoch": 0.01509506023247302, "grad_norm": 0.0003307730657979846, "learning_rate": 0.0002, "loss": 0.0006, "step": 8300 }, { "epoch": 0.015113247052030217, "grad_norm": 0.7940683960914612, "learning_rate": 0.0002, "loss": 0.7233, "step": 8310 }, { "epoch": 0.015131433871587413, "grad_norm": 0.09774448722600937, "learning_rate": 0.0002, "loss": 0.1451, "step": 8320 }, { "epoch": 0.015149620691144611, "grad_norm": 0.3088306188583374, "learning_rate": 0.0002, "loss": 0.0986, "step": 8330 }, { "epoch": 0.015167807510701807, "grad_norm": 0.08629265427589417, "learning_rate": 0.0002, "loss": 0.0581, "step": 8340 }, { "epoch": 0.015185994330259004, "grad_norm": 0.0011582528240978718, "learning_rate": 0.0002, "loss": 0.0008, "step": 8350 }, { "epoch": 0.0152041811498162, "grad_norm": 0.48978063464164734, "learning_rate": 0.0002, "loss": 0.5111, "step": 8360 }, { "epoch": 0.015222367969373396, "grad_norm": 0.2633112668991089, "learning_rate": 0.0002, "loss": 0.1354, "step": 8370 }, { "epoch": 0.015240554788930593, "grad_norm": 0.058184925466775894, "learning_rate": 0.0002, "loss": 0.0963, "step": 8380 }, { "epoch": 0.015258741608487789, "grad_norm": 0.397290974855423, "learning_rate": 0.0002, "loss": 0.0915, "step": 8390 }, { "epoch": 0.015276928428044985, "grad_norm": 0.0013334077084437013, "learning_rate": 0.0002, "loss": 0.0117, "step": 8400 }, { "epoch": 0.015295115247602182, "grad_norm": 3.2027626037597656, "learning_rate": 0.0002, "loss": 0.3642, "step": 8410 }, { "epoch": 0.015313302067159378, "grad_norm": 0.4110456705093384, "learning_rate": 0.0002, "loss": 0.1347, "step": 8420 }, { "epoch": 0.015331488886716574, "grad_norm": 0.19789688289165497, "learning_rate": 0.0002, "loss": 0.0946, "step": 8430 }, { "epoch": 0.01534967570627377, "grad_norm": 0.15914630889892578, "learning_rate": 0.0002, "loss": 0.0619, "step": 8440 }, { "epoch": 0.015367862525830967, "grad_norm": 0.004021051339805126, "learning_rate": 0.0002, "loss": 0.0081, "step": 8450 }, { "epoch": 0.015386049345388163, "grad_norm": 0.25250542163848877, "learning_rate": 0.0002, "loss": 0.2409, "step": 8460 }, { "epoch": 0.01540423616494536, "grad_norm": 0.16660314798355103, "learning_rate": 0.0002, "loss": 0.1041, "step": 8470 }, { "epoch": 0.015422422984502556, "grad_norm": 0.09435573220252991, "learning_rate": 0.0002, "loss": 0.0838, "step": 8480 }, { "epoch": 0.015440609804059752, "grad_norm": 0.1622086614370346, "learning_rate": 0.0002, "loss": 0.0648, "step": 8490 }, { "epoch": 0.015458796623616948, "grad_norm": 0.002267120871692896, "learning_rate": 0.0002, "loss": 0.0057, "step": 8500 }, { "epoch": 0.015476983443174146, "grad_norm": 0.11559420824050903, "learning_rate": 0.0002, "loss": 0.2994, "step": 8510 }, { "epoch": 0.015495170262731343, "grad_norm": 0.18291179835796356, "learning_rate": 0.0002, "loss": 0.0908, "step": 8520 }, { "epoch": 0.015513357082288539, "grad_norm": 0.14989323914051056, "learning_rate": 0.0002, "loss": 0.0912, "step": 8530 }, { "epoch": 0.015531543901845735, "grad_norm": 0.09752708673477173, "learning_rate": 0.0002, "loss": 0.0586, "step": 8540 }, { "epoch": 0.015549730721402932, "grad_norm": 0.0005314307054504752, "learning_rate": 0.0002, "loss": 0.0026, "step": 8550 }, { "epoch": 0.015567917540960128, "grad_norm": 0.18309178948402405, "learning_rate": 0.0002, "loss": 0.3059, "step": 8560 }, { "epoch": 0.015586104360517324, "grad_norm": 0.8144251108169556, "learning_rate": 0.0002, "loss": 0.1103, "step": 8570 }, { "epoch": 0.01560429118007452, "grad_norm": 0.0331404022872448, "learning_rate": 0.0002, "loss": 0.0898, "step": 8580 }, { "epoch": 0.015622477999631717, "grad_norm": 0.1460132598876953, "learning_rate": 0.0002, "loss": 0.0656, "step": 8590 }, { "epoch": 0.015640664819188913, "grad_norm": 0.013606027700006962, "learning_rate": 0.0002, "loss": 0.0076, "step": 8600 }, { "epoch": 0.01565885163874611, "grad_norm": 0.22224061191082, "learning_rate": 0.0002, "loss": 0.2609, "step": 8610 }, { "epoch": 0.015677038458303306, "grad_norm": 0.22729800641536713, "learning_rate": 0.0002, "loss": 0.1028, "step": 8620 }, { "epoch": 0.015695225277860502, "grad_norm": 0.0848810002207756, "learning_rate": 0.0002, "loss": 0.0871, "step": 8630 }, { "epoch": 0.0157134120974177, "grad_norm": 0.17896370589733124, "learning_rate": 0.0002, "loss": 0.0636, "step": 8640 }, { "epoch": 0.015731598916974895, "grad_norm": 0.006263076793402433, "learning_rate": 0.0002, "loss": 0.0068, "step": 8650 }, { "epoch": 0.01574978573653209, "grad_norm": 0.29927679896354675, "learning_rate": 0.0002, "loss": 0.2761, "step": 8660 }, { "epoch": 0.015767972556089287, "grad_norm": 0.05662700906395912, "learning_rate": 0.0002, "loss": 0.1029, "step": 8670 }, { "epoch": 0.015786159375646484, "grad_norm": 0.09140895307064056, "learning_rate": 0.0002, "loss": 0.0854, "step": 8680 }, { "epoch": 0.01580434619520368, "grad_norm": 0.21034927666187286, "learning_rate": 0.0002, "loss": 0.0691, "step": 8690 }, { "epoch": 0.015822533014760876, "grad_norm": 0.0010229075560346246, "learning_rate": 0.0002, "loss": 0.0033, "step": 8700 }, { "epoch": 0.015840719834318073, "grad_norm": 0.0626237690448761, "learning_rate": 0.0002, "loss": 0.3583, "step": 8710 }, { "epoch": 0.01585890665387527, "grad_norm": 0.10027278959751129, "learning_rate": 0.0002, "loss": 0.0959, "step": 8720 }, { "epoch": 0.015877093473432465, "grad_norm": 0.0870286151766777, "learning_rate": 0.0002, "loss": 0.0767, "step": 8730 }, { "epoch": 0.01589528029298966, "grad_norm": 0.16106969118118286, "learning_rate": 0.0002, "loss": 0.0763, "step": 8740 }, { "epoch": 0.015913467112546858, "grad_norm": 0.0022529088892042637, "learning_rate": 0.0002, "loss": 0.0108, "step": 8750 }, { "epoch": 0.015931653932104058, "grad_norm": 0.06070050224661827, "learning_rate": 0.0002, "loss": 0.2606, "step": 8760 }, { "epoch": 0.015949840751661254, "grad_norm": 0.09406338632106781, "learning_rate": 0.0002, "loss": 0.1062, "step": 8770 }, { "epoch": 0.01596802757121845, "grad_norm": 0.1367248147726059, "learning_rate": 0.0002, "loss": 0.0757, "step": 8780 }, { "epoch": 0.015986214390775647, "grad_norm": 0.26938319206237793, "learning_rate": 0.0002, "loss": 0.0733, "step": 8790 }, { "epoch": 0.016004401210332843, "grad_norm": 0.011559409089386463, "learning_rate": 0.0002, "loss": 0.0176, "step": 8800 }, { "epoch": 0.01602258802989004, "grad_norm": 0.12351766228675842, "learning_rate": 0.0002, "loss": 0.24, "step": 8810 }, { "epoch": 0.016040774849447235, "grad_norm": 0.08965809643268585, "learning_rate": 0.0002, "loss": 0.0947, "step": 8820 }, { "epoch": 0.016058961669004432, "grad_norm": 0.027005961164832115, "learning_rate": 0.0002, "loss": 0.0738, "step": 8830 }, { "epoch": 0.016077148488561628, "grad_norm": 0.18656685948371887, "learning_rate": 0.0002, "loss": 0.0667, "step": 8840 }, { "epoch": 0.016095335308118824, "grad_norm": 0.003148626768961549, "learning_rate": 0.0002, "loss": 0.0119, "step": 8850 }, { "epoch": 0.01611352212767602, "grad_norm": 0.07959452271461487, "learning_rate": 0.0002, "loss": 0.275, "step": 8860 }, { "epoch": 0.016131708947233217, "grad_norm": 0.19433775544166565, "learning_rate": 0.0002, "loss": 0.0872, "step": 8870 }, { "epoch": 0.016149895766790413, "grad_norm": 0.1376393735408783, "learning_rate": 0.0002, "loss": 0.0779, "step": 8880 }, { "epoch": 0.01616808258634761, "grad_norm": 0.18282419443130493, "learning_rate": 0.0002, "loss": 0.068, "step": 8890 }, { "epoch": 0.016186269405904806, "grad_norm": 0.0112565653398633, "learning_rate": 0.0002, "loss": 0.0137, "step": 8900 }, { "epoch": 0.016204456225462002, "grad_norm": 0.08975637704133987, "learning_rate": 0.0002, "loss": 0.23, "step": 8910 }, { "epoch": 0.0162226430450192, "grad_norm": 0.19316238164901733, "learning_rate": 0.0002, "loss": 0.0975, "step": 8920 }, { "epoch": 0.016240829864576395, "grad_norm": 0.1870724856853485, "learning_rate": 0.0002, "loss": 0.0862, "step": 8930 }, { "epoch": 0.01625901668413359, "grad_norm": 0.19031721353530884, "learning_rate": 0.0002, "loss": 0.0735, "step": 8940 }, { "epoch": 0.016277203503690788, "grad_norm": 0.015979783609509468, "learning_rate": 0.0002, "loss": 0.0249, "step": 8950 }, { "epoch": 0.016295390323247984, "grad_norm": 0.09105712175369263, "learning_rate": 0.0002, "loss": 0.1573, "step": 8960 }, { "epoch": 0.01631357714280518, "grad_norm": 0.13035650551319122, "learning_rate": 0.0002, "loss": 0.0958, "step": 8970 }, { "epoch": 0.016331763962362376, "grad_norm": 0.18613573908805847, "learning_rate": 0.0002, "loss": 0.088, "step": 8980 }, { "epoch": 0.016349950781919573, "grad_norm": 0.2518664300441742, "learning_rate": 0.0002, "loss": 0.0725, "step": 8990 }, { "epoch": 0.01636813760147677, "grad_norm": 0.03324449062347412, "learning_rate": 0.0002, "loss": 0.0256, "step": 9000 }, { "epoch": 0.016386324421033965, "grad_norm": 0.08766523003578186, "learning_rate": 0.0002, "loss": 0.1531, "step": 9010 }, { "epoch": 0.01640451124059116, "grad_norm": 0.14177583158016205, "learning_rate": 0.0002, "loss": 0.0861, "step": 9020 }, { "epoch": 0.016422698060148358, "grad_norm": 0.1354762315750122, "learning_rate": 0.0002, "loss": 0.0782, "step": 9030 }, { "epoch": 0.016440884879705554, "grad_norm": 0.15894347429275513, "learning_rate": 0.0002, "loss": 0.073, "step": 9040 }, { "epoch": 0.01645907169926275, "grad_norm": 0.02154761180281639, "learning_rate": 0.0002, "loss": 0.0156, "step": 9050 }, { "epoch": 0.016477258518819947, "grad_norm": 0.06432317197322845, "learning_rate": 0.0002, "loss": 0.1384, "step": 9060 }, { "epoch": 0.016495445338377143, "grad_norm": 0.12112505733966827, "learning_rate": 0.0002, "loss": 0.093, "step": 9070 }, { "epoch": 0.01651363215793434, "grad_norm": 0.10628003627061844, "learning_rate": 0.0002, "loss": 0.0738, "step": 9080 }, { "epoch": 0.016531818977491536, "grad_norm": 0.1930958330631256, "learning_rate": 0.0002, "loss": 0.0678, "step": 9090 }, { "epoch": 0.016550005797048732, "grad_norm": 0.03878525644540787, "learning_rate": 0.0002, "loss": 0.0235, "step": 9100 }, { "epoch": 0.016568192616605932, "grad_norm": 0.0920896977186203, "learning_rate": 0.0002, "loss": 0.1661, "step": 9110 }, { "epoch": 0.016586379436163128, "grad_norm": 0.11687818914651871, "learning_rate": 0.0002, "loss": 0.0847, "step": 9120 }, { "epoch": 0.016604566255720325, "grad_norm": 0.10511167347431183, "learning_rate": 0.0002, "loss": 0.0832, "step": 9130 }, { "epoch": 0.01662275307527752, "grad_norm": 0.26365017890930176, "learning_rate": 0.0002, "loss": 0.0717, "step": 9140 }, { "epoch": 0.016640939894834717, "grad_norm": 0.02445841394364834, "learning_rate": 0.0002, "loss": 0.0233, "step": 9150 }, { "epoch": 0.016659126714391913, "grad_norm": 0.08213133364915848, "learning_rate": 0.0002, "loss": 0.1439, "step": 9160 }, { "epoch": 0.01667731353394911, "grad_norm": 0.17025598883628845, "learning_rate": 0.0002, "loss": 0.0852, "step": 9170 }, { "epoch": 0.016695500353506306, "grad_norm": 0.098059743642807, "learning_rate": 0.0002, "loss": 0.0761, "step": 9180 }, { "epoch": 0.016713687173063502, "grad_norm": 0.18436011672019958, "learning_rate": 0.0002, "loss": 0.0674, "step": 9190 }, { "epoch": 0.0167318739926207, "grad_norm": 0.011012010276317596, "learning_rate": 0.0002, "loss": 0.0221, "step": 9200 }, { "epoch": 0.016750060812177895, "grad_norm": 0.07544030994176865, "learning_rate": 0.0002, "loss": 0.161, "step": 9210 }, { "epoch": 0.01676824763173509, "grad_norm": 0.16041946411132812, "learning_rate": 0.0002, "loss": 0.0824, "step": 9220 }, { "epoch": 0.016786434451292288, "grad_norm": 0.17295844852924347, "learning_rate": 0.0002, "loss": 0.0797, "step": 9230 }, { "epoch": 0.016804621270849484, "grad_norm": 0.1818791776895523, "learning_rate": 0.0002, "loss": 0.0683, "step": 9240 }, { "epoch": 0.01682280809040668, "grad_norm": 0.019515013322234154, "learning_rate": 0.0002, "loss": 0.0188, "step": 9250 }, { "epoch": 0.016840994909963877, "grad_norm": 0.15059705078601837, "learning_rate": 0.0002, "loss": 0.1743, "step": 9260 }, { "epoch": 0.016859181729521073, "grad_norm": 0.1481601595878601, "learning_rate": 0.0002, "loss": 0.0906, "step": 9270 }, { "epoch": 0.01687736854907827, "grad_norm": 0.07433108985424042, "learning_rate": 0.0002, "loss": 0.08, "step": 9280 }, { "epoch": 0.016895555368635466, "grad_norm": 0.1752692312002182, "learning_rate": 0.0002, "loss": 0.06, "step": 9290 }, { "epoch": 0.016913742188192662, "grad_norm": 0.027612384408712387, "learning_rate": 0.0002, "loss": 0.0157, "step": 9300 }, { "epoch": 0.016931929007749858, "grad_norm": 0.08575212955474854, "learning_rate": 0.0002, "loss": 0.1679, "step": 9310 }, { "epoch": 0.016950115827307054, "grad_norm": 0.11127147823572159, "learning_rate": 0.0002, "loss": 0.0848, "step": 9320 }, { "epoch": 0.01696830264686425, "grad_norm": 0.08989393711090088, "learning_rate": 0.0002, "loss": 0.0823, "step": 9330 }, { "epoch": 0.016986489466421447, "grad_norm": 0.18898548185825348, "learning_rate": 0.0002, "loss": 0.0687, "step": 9340 }, { "epoch": 0.017004676285978643, "grad_norm": 0.023646721616387367, "learning_rate": 0.0002, "loss": 0.0244, "step": 9350 }, { "epoch": 0.01702286310553584, "grad_norm": 0.11511775106191635, "learning_rate": 0.0002, "loss": 0.1642, "step": 9360 }, { "epoch": 0.017041049925093036, "grad_norm": 0.1458021104335785, "learning_rate": 0.0002, "loss": 0.084, "step": 9370 }, { "epoch": 0.017059236744650232, "grad_norm": 0.060528095811605453, "learning_rate": 0.0002, "loss": 0.0809, "step": 9380 }, { "epoch": 0.01707742356420743, "grad_norm": 0.16314280033111572, "learning_rate": 0.0002, "loss": 0.0661, "step": 9390 }, { "epoch": 0.017095610383764625, "grad_norm": 0.03078557923436165, "learning_rate": 0.0002, "loss": 0.015, "step": 9400 }, { "epoch": 0.01711379720332182, "grad_norm": 0.11488370597362518, "learning_rate": 0.0002, "loss": 0.1712, "step": 9410 }, { "epoch": 0.017131984022879018, "grad_norm": 0.0972781702876091, "learning_rate": 0.0002, "loss": 0.0856, "step": 9420 }, { "epoch": 0.017150170842436214, "grad_norm": 0.08523645251989365, "learning_rate": 0.0002, "loss": 0.0744, "step": 9430 }, { "epoch": 0.01716835766199341, "grad_norm": 0.18629521131515503, "learning_rate": 0.0002, "loss": 0.0659, "step": 9440 }, { "epoch": 0.01718654448155061, "grad_norm": 0.00908618327230215, "learning_rate": 0.0002, "loss": 0.0219, "step": 9450 }, { "epoch": 0.017204731301107806, "grad_norm": 0.05552325397729874, "learning_rate": 0.0002, "loss": 0.1377, "step": 9460 }, { "epoch": 0.017222918120665003, "grad_norm": 0.16133128106594086, "learning_rate": 0.0002, "loss": 0.0885, "step": 9470 }, { "epoch": 0.0172411049402222, "grad_norm": 0.0965205654501915, "learning_rate": 0.0002, "loss": 0.0713, "step": 9480 }, { "epoch": 0.017259291759779395, "grad_norm": 0.21675604581832886, "learning_rate": 0.0002, "loss": 0.0658, "step": 9490 }, { "epoch": 0.01727747857933659, "grad_norm": 0.043898243457078934, "learning_rate": 0.0002, "loss": 0.0213, "step": 9500 }, { "epoch": 0.017295665398893788, "grad_norm": 0.0968618243932724, "learning_rate": 0.0002, "loss": 0.1391, "step": 9510 }, { "epoch": 0.017313852218450984, "grad_norm": 0.15061378479003906, "learning_rate": 0.0002, "loss": 0.0879, "step": 9520 }, { "epoch": 0.01733203903800818, "grad_norm": 0.08481590449810028, "learning_rate": 0.0002, "loss": 0.0771, "step": 9530 }, { "epoch": 0.017350225857565377, "grad_norm": 0.20935995876789093, "learning_rate": 0.0002, "loss": 0.0705, "step": 9540 }, { "epoch": 0.017368412677122573, "grad_norm": 0.04010302573442459, "learning_rate": 0.0002, "loss": 0.0257, "step": 9550 }, { "epoch": 0.01738659949667977, "grad_norm": 0.10532956570386887, "learning_rate": 0.0002, "loss": 0.1528, "step": 9560 }, { "epoch": 0.017404786316236966, "grad_norm": 0.1484638750553131, "learning_rate": 0.0002, "loss": 0.0847, "step": 9570 }, { "epoch": 0.017422973135794162, "grad_norm": 0.05873465910553932, "learning_rate": 0.0002, "loss": 0.0765, "step": 9580 }, { "epoch": 0.01744115995535136, "grad_norm": 0.1689092516899109, "learning_rate": 0.0002, "loss": 0.0673, "step": 9590 }, { "epoch": 0.017459346774908555, "grad_norm": 0.014237391762435436, "learning_rate": 0.0002, "loss": 0.0165, "step": 9600 }, { "epoch": 0.01747753359446575, "grad_norm": 0.06250491738319397, "learning_rate": 0.0002, "loss": 0.1635, "step": 9610 }, { "epoch": 0.017495720414022947, "grad_norm": 0.08895017951726913, "learning_rate": 0.0002, "loss": 0.0765, "step": 9620 }, { "epoch": 0.017513907233580144, "grad_norm": 0.08614445477724075, "learning_rate": 0.0002, "loss": 0.0852, "step": 9630 }, { "epoch": 0.01753209405313734, "grad_norm": 0.25440698862075806, "learning_rate": 0.0002, "loss": 0.0735, "step": 9640 }, { "epoch": 0.017550280872694536, "grad_norm": 0.015447271056473255, "learning_rate": 0.0002, "loss": 0.0199, "step": 9650 }, { "epoch": 0.017568467692251732, "grad_norm": 0.08685171604156494, "learning_rate": 0.0002, "loss": 0.1721, "step": 9660 }, { "epoch": 0.01758665451180893, "grad_norm": 0.1007658839225769, "learning_rate": 0.0002, "loss": 0.0858, "step": 9670 }, { "epoch": 0.017604841331366125, "grad_norm": 0.1291055977344513, "learning_rate": 0.0002, "loss": 0.0817, "step": 9680 }, { "epoch": 0.01762302815092332, "grad_norm": 0.21103522181510925, "learning_rate": 0.0002, "loss": 0.0707, "step": 9690 }, { "epoch": 0.017641214970480518, "grad_norm": 0.027955593541264534, "learning_rate": 0.0002, "loss": 0.0199, "step": 9700 }, { "epoch": 0.017659401790037714, "grad_norm": 0.06710019707679749, "learning_rate": 0.0002, "loss": 0.1623, "step": 9710 }, { "epoch": 0.01767758860959491, "grad_norm": 0.09083720296621323, "learning_rate": 0.0002, "loss": 0.0845, "step": 9720 }, { "epoch": 0.017695775429152107, "grad_norm": 0.07230041921138763, "learning_rate": 0.0002, "loss": 0.0767, "step": 9730 }, { "epoch": 0.017713962248709303, "grad_norm": 0.19016912579536438, "learning_rate": 0.0002, "loss": 0.0648, "step": 9740 }, { "epoch": 0.0177321490682665, "grad_norm": 0.03999534249305725, "learning_rate": 0.0002, "loss": 0.0216, "step": 9750 }, { "epoch": 0.017750335887823696, "grad_norm": 0.08057496696710587, "learning_rate": 0.0002, "loss": 0.1251, "step": 9760 }, { "epoch": 0.017768522707380892, "grad_norm": 0.16494789719581604, "learning_rate": 0.0002, "loss": 0.0896, "step": 9770 }, { "epoch": 0.017786709526938088, "grad_norm": 0.07119818776845932, "learning_rate": 0.0002, "loss": 0.0749, "step": 9780 }, { "epoch": 0.017804896346495288, "grad_norm": 0.1790028065443039, "learning_rate": 0.0002, "loss": 0.0718, "step": 9790 }, { "epoch": 0.017823083166052484, "grad_norm": 0.055643875151872635, "learning_rate": 0.0002, "loss": 0.0294, "step": 9800 }, { "epoch": 0.01784126998560968, "grad_norm": 0.15530900657176971, "learning_rate": 0.0002, "loss": 0.1343, "step": 9810 }, { "epoch": 0.017859456805166877, "grad_norm": 0.08989892899990082, "learning_rate": 0.0002, "loss": 0.0778, "step": 9820 }, { "epoch": 0.017877643624724073, "grad_norm": 0.038054581731557846, "learning_rate": 0.0002, "loss": 0.0842, "step": 9830 }, { "epoch": 0.01789583044428127, "grad_norm": 0.12264154851436615, "learning_rate": 0.0002, "loss": 0.065, "step": 9840 }, { "epoch": 0.017914017263838466, "grad_norm": 0.03432893753051758, "learning_rate": 0.0002, "loss": 0.02, "step": 9850 }, { "epoch": 0.017932204083395662, "grad_norm": 0.0516468770802021, "learning_rate": 0.0002, "loss": 0.1339, "step": 9860 }, { "epoch": 0.01795039090295286, "grad_norm": 0.11306226998567581, "learning_rate": 0.0002, "loss": 0.0842, "step": 9870 }, { "epoch": 0.017968577722510055, "grad_norm": 0.051579318940639496, "learning_rate": 0.0002, "loss": 0.0795, "step": 9880 }, { "epoch": 0.01798676454206725, "grad_norm": 0.19050930440425873, "learning_rate": 0.0002, "loss": 0.0673, "step": 9890 }, { "epoch": 0.018004951361624447, "grad_norm": 0.015286738984286785, "learning_rate": 0.0002, "loss": 0.0169, "step": 9900 }, { "epoch": 0.018023138181181644, "grad_norm": 0.16055025160312653, "learning_rate": 0.0002, "loss": 0.1655, "step": 9910 }, { "epoch": 0.01804132500073884, "grad_norm": 0.05445674806833267, "learning_rate": 0.0002, "loss": 0.0786, "step": 9920 }, { "epoch": 0.018059511820296036, "grad_norm": 0.07221481204032898, "learning_rate": 0.0002, "loss": 0.0726, "step": 9930 }, { "epoch": 0.018077698639853233, "grad_norm": 0.15800146758556366, "learning_rate": 0.0002, "loss": 0.0607, "step": 9940 }, { "epoch": 0.01809588545941043, "grad_norm": 0.007713336031883955, "learning_rate": 0.0002, "loss": 0.0148, "step": 9950 }, { "epoch": 0.018114072278967625, "grad_norm": 0.04677269607782364, "learning_rate": 0.0002, "loss": 0.1718, "step": 9960 }, { "epoch": 0.01813225909852482, "grad_norm": 0.1699189841747284, "learning_rate": 0.0002, "loss": 0.0865, "step": 9970 }, { "epoch": 0.018150445918082018, "grad_norm": 0.04046279937028885, "learning_rate": 0.0002, "loss": 0.0781, "step": 9980 }, { "epoch": 0.018168632737639214, "grad_norm": 0.164504736661911, "learning_rate": 0.0002, "loss": 0.0645, "step": 9990 }, { "epoch": 0.01818681955719641, "grad_norm": 0.014479747042059898, "learning_rate": 0.0002, "loss": 0.0186, "step": 10000 }, { "epoch": 0.018205006376753607, "grad_norm": 0.051388438791036606, "learning_rate": 0.0002, "loss": 0.1414, "step": 10010 }, { "epoch": 0.018223193196310803, "grad_norm": 0.11734543740749359, "learning_rate": 0.0002, "loss": 0.0894, "step": 10020 }, { "epoch": 0.018241380015868, "grad_norm": 0.022312749177217484, "learning_rate": 0.0002, "loss": 0.0775, "step": 10030 }, { "epoch": 0.018259566835425196, "grad_norm": 0.1579144448041916, "learning_rate": 0.0002, "loss": 0.0668, "step": 10040 }, { "epoch": 0.018277753654982392, "grad_norm": 0.02757895737886429, "learning_rate": 0.0002, "loss": 0.0197, "step": 10050 }, { "epoch": 0.01829594047453959, "grad_norm": 0.07557844370603561, "learning_rate": 0.0002, "loss": 0.1526, "step": 10060 }, { "epoch": 0.018314127294096785, "grad_norm": 0.1216227188706398, "learning_rate": 0.0002, "loss": 0.0871, "step": 10070 }, { "epoch": 0.01833231411365398, "grad_norm": 0.04201141744852066, "learning_rate": 0.0002, "loss": 0.0723, "step": 10080 }, { "epoch": 0.018350500933211177, "grad_norm": 0.151902437210083, "learning_rate": 0.0002, "loss": 0.063, "step": 10090 }, { "epoch": 0.018368687752768374, "grad_norm": 0.028730260208249092, "learning_rate": 0.0002, "loss": 0.0154, "step": 10100 }, { "epoch": 0.01838687457232557, "grad_norm": 0.0815989300608635, "learning_rate": 0.0002, "loss": 0.1439, "step": 10110 }, { "epoch": 0.018405061391882766, "grad_norm": 0.16359028220176697, "learning_rate": 0.0002, "loss": 0.0901, "step": 10120 }, { "epoch": 0.018423248211439962, "grad_norm": 0.055030226707458496, "learning_rate": 0.0002, "loss": 0.0812, "step": 10130 }, { "epoch": 0.018441435030997162, "grad_norm": 0.17064853012561798, "learning_rate": 0.0002, "loss": 0.0713, "step": 10140 }, { "epoch": 0.01845962185055436, "grad_norm": 0.024902408942580223, "learning_rate": 0.0002, "loss": 0.0201, "step": 10150 }, { "epoch": 0.018477808670111555, "grad_norm": 0.037377748638391495, "learning_rate": 0.0002, "loss": 0.1394, "step": 10160 }, { "epoch": 0.01849599548966875, "grad_norm": 0.14072410762310028, "learning_rate": 0.0002, "loss": 0.088, "step": 10170 }, { "epoch": 0.018514182309225947, "grad_norm": 0.07339414954185486, "learning_rate": 0.0002, "loss": 0.0739, "step": 10180 }, { "epoch": 0.018532369128783144, "grad_norm": 0.166766956448555, "learning_rate": 0.0002, "loss": 0.0648, "step": 10190 }, { "epoch": 0.01855055594834034, "grad_norm": 0.009605699218809605, "learning_rate": 0.0002, "loss": 0.0148, "step": 10200 }, { "epoch": 0.018568742767897536, "grad_norm": 0.045747216790914536, "learning_rate": 0.0002, "loss": 0.1426, "step": 10210 }, { "epoch": 0.018586929587454733, "grad_norm": 0.09927495568990707, "learning_rate": 0.0002, "loss": 0.0757, "step": 10220 }, { "epoch": 0.01860511640701193, "grad_norm": 0.032050736248493195, "learning_rate": 0.0002, "loss": 0.0732, "step": 10230 }, { "epoch": 0.018623303226569125, "grad_norm": 0.14915086328983307, "learning_rate": 0.0002, "loss": 0.0619, "step": 10240 }, { "epoch": 0.01864149004612632, "grad_norm": 0.019674960523843765, "learning_rate": 0.0002, "loss": 0.0176, "step": 10250 }, { "epoch": 0.018659676865683518, "grad_norm": 0.0990150198340416, "learning_rate": 0.0002, "loss": 0.156, "step": 10260 }, { "epoch": 0.018677863685240714, "grad_norm": 0.1409665048122406, "learning_rate": 0.0002, "loss": 0.0843, "step": 10270 }, { "epoch": 0.01869605050479791, "grad_norm": 0.0232121329754591, "learning_rate": 0.0002, "loss": 0.0712, "step": 10280 }, { "epoch": 0.018714237324355107, "grad_norm": 0.14811532199382782, "learning_rate": 0.0002, "loss": 0.0649, "step": 10290 }, { "epoch": 0.018732424143912303, "grad_norm": 0.025812385603785515, "learning_rate": 0.0002, "loss": 0.0191, "step": 10300 }, { "epoch": 0.0187506109634695, "grad_norm": 0.03710811957716942, "learning_rate": 0.0002, "loss": 0.1323, "step": 10310 }, { "epoch": 0.018768797783026696, "grad_norm": 0.16586032509803772, "learning_rate": 0.0002, "loss": 0.0868, "step": 10320 }, { "epoch": 0.018786984602583892, "grad_norm": 0.09154761582612991, "learning_rate": 0.0002, "loss": 0.0762, "step": 10330 }, { "epoch": 0.01880517142214109, "grad_norm": 0.20400644838809967, "learning_rate": 0.0002, "loss": 0.072, "step": 10340 }, { "epoch": 0.018823358241698285, "grad_norm": 0.04426256939768791, "learning_rate": 0.0002, "loss": 0.0281, "step": 10350 }, { "epoch": 0.01884154506125548, "grad_norm": 0.10118848830461502, "learning_rate": 0.0002, "loss": 0.116, "step": 10360 }, { "epoch": 0.018859731880812677, "grad_norm": 0.11934473365545273, "learning_rate": 0.0002, "loss": 0.0779, "step": 10370 }, { "epoch": 0.018877918700369874, "grad_norm": 0.04116957262158394, "learning_rate": 0.0002, "loss": 0.0811, "step": 10380 }, { "epoch": 0.01889610551992707, "grad_norm": 0.16668827831745148, "learning_rate": 0.0002, "loss": 0.064, "step": 10390 }, { "epoch": 0.018914292339484266, "grad_norm": 0.04703928530216217, "learning_rate": 0.0002, "loss": 0.0189, "step": 10400 }, { "epoch": 0.018932479159041463, "grad_norm": 0.10670439153909683, "learning_rate": 0.0002, "loss": 0.1329, "step": 10410 }, { "epoch": 0.01895066597859866, "grad_norm": 0.033486492931842804, "learning_rate": 0.0002, "loss": 0.0812, "step": 10420 }, { "epoch": 0.018968852798155855, "grad_norm": 0.03778929263353348, "learning_rate": 0.0002, "loss": 0.0739, "step": 10430 }, { "epoch": 0.01898703961771305, "grad_norm": 0.1499231606721878, "learning_rate": 0.0002, "loss": 0.0691, "step": 10440 }, { "epoch": 0.019005226437270248, "grad_norm": 0.020496509969234467, "learning_rate": 0.0002, "loss": 0.0166, "step": 10450 }, { "epoch": 0.019023413256827444, "grad_norm": 0.07973606884479523, "learning_rate": 0.0002, "loss": 0.1647, "step": 10460 }, { "epoch": 0.01904160007638464, "grad_norm": 0.2187214344739914, "learning_rate": 0.0002, "loss": 0.0851, "step": 10470 }, { "epoch": 0.01905978689594184, "grad_norm": 0.05374719575047493, "learning_rate": 0.0002, "loss": 0.0763, "step": 10480 }, { "epoch": 0.019077973715499037, "grad_norm": 0.20388802886009216, "learning_rate": 0.0002, "loss": 0.0661, "step": 10490 }, { "epoch": 0.019096160535056233, "grad_norm": 0.023114027455449104, "learning_rate": 0.0002, "loss": 0.0196, "step": 10500 }, { "epoch": 0.01911434735461343, "grad_norm": 0.07263924926519394, "learning_rate": 0.0002, "loss": 0.1397, "step": 10510 }, { "epoch": 0.019132534174170625, "grad_norm": 0.13590484857559204, "learning_rate": 0.0002, "loss": 0.0866, "step": 10520 }, { "epoch": 0.019150720993727822, "grad_norm": 0.03279007971286774, "learning_rate": 0.0002, "loss": 0.0845, "step": 10530 }, { "epoch": 0.019168907813285018, "grad_norm": 0.16929341852664948, "learning_rate": 0.0002, "loss": 0.0638, "step": 10540 }, { "epoch": 0.019187094632842214, "grad_norm": 0.043504901230335236, "learning_rate": 0.0002, "loss": 0.0217, "step": 10550 }, { "epoch": 0.01920528145239941, "grad_norm": 0.05582214519381523, "learning_rate": 0.0002, "loss": 0.1454, "step": 10560 }, { "epoch": 0.019223468271956607, "grad_norm": 0.12112174928188324, "learning_rate": 0.0002, "loss": 0.0773, "step": 10570 }, { "epoch": 0.019241655091513803, "grad_norm": 0.028584860265254974, "learning_rate": 0.0002, "loss": 0.0766, "step": 10580 }, { "epoch": 0.019259841911071, "grad_norm": 0.14817841351032257, "learning_rate": 0.0002, "loss": 0.07, "step": 10590 }, { "epoch": 0.019278028730628196, "grad_norm": 0.0354049950838089, "learning_rate": 0.0002, "loss": 0.0205, "step": 10600 }, { "epoch": 0.019296215550185392, "grad_norm": 0.0580359622836113, "learning_rate": 0.0002, "loss": 0.126, "step": 10610 }, { "epoch": 0.01931440236974259, "grad_norm": 0.1495518982410431, "learning_rate": 0.0002, "loss": 0.0759, "step": 10620 }, { "epoch": 0.019332589189299785, "grad_norm": 0.029057197272777557, "learning_rate": 0.0002, "loss": 0.0751, "step": 10630 }, { "epoch": 0.01935077600885698, "grad_norm": 0.17057828605175018, "learning_rate": 0.0002, "loss": 0.0675, "step": 10640 }, { "epoch": 0.019368962828414178, "grad_norm": 0.029123524203896523, "learning_rate": 0.0002, "loss": 0.0207, "step": 10650 }, { "epoch": 0.019387149647971374, "grad_norm": 0.06929099559783936, "learning_rate": 0.0002, "loss": 0.1272, "step": 10660 }, { "epoch": 0.01940533646752857, "grad_norm": 0.0806749165058136, "learning_rate": 0.0002, "loss": 0.0825, "step": 10670 }, { "epoch": 0.019423523287085766, "grad_norm": 0.025454839691519737, "learning_rate": 0.0002, "loss": 0.0786, "step": 10680 }, { "epoch": 0.019441710106642963, "grad_norm": 0.1879327893257141, "learning_rate": 0.0002, "loss": 0.0664, "step": 10690 }, { "epoch": 0.01945989692620016, "grad_norm": 0.03334587439894676, "learning_rate": 0.0002, "loss": 0.0217, "step": 10700 }, { "epoch": 0.019478083745757355, "grad_norm": 0.05760979652404785, "learning_rate": 0.0002, "loss": 0.141, "step": 10710 }, { "epoch": 0.01949627056531455, "grad_norm": 0.03565089777112007, "learning_rate": 0.0002, "loss": 0.0849, "step": 10720 }, { "epoch": 0.019514457384871748, "grad_norm": 0.1484966278076172, "learning_rate": 0.0002, "loss": 0.0839, "step": 10730 }, { "epoch": 0.019532644204428944, "grad_norm": 0.22200991213321686, "learning_rate": 0.0002, "loss": 0.0673, "step": 10740 }, { "epoch": 0.01955083102398614, "grad_norm": 0.017915472388267517, "learning_rate": 0.0002, "loss": 0.0203, "step": 10750 }, { "epoch": 0.019569017843543337, "grad_norm": 0.11213338375091553, "learning_rate": 0.0002, "loss": 0.126, "step": 10760 }, { "epoch": 0.019587204663100533, "grad_norm": 0.1563912183046341, "learning_rate": 0.0002, "loss": 0.0803, "step": 10770 }, { "epoch": 0.01960539148265773, "grad_norm": 0.02315036952495575, "learning_rate": 0.0002, "loss": 0.0801, "step": 10780 }, { "epoch": 0.019623578302214926, "grad_norm": 0.14482071995735168, "learning_rate": 0.0002, "loss": 0.0701, "step": 10790 }, { "epoch": 0.019641765121772122, "grad_norm": 0.0369495190680027, "learning_rate": 0.0002, "loss": 0.022, "step": 10800 }, { "epoch": 0.01965995194132932, "grad_norm": 0.0659516304731369, "learning_rate": 0.0002, "loss": 0.1282, "step": 10810 }, { "epoch": 0.019678138760886515, "grad_norm": 0.09046377241611481, "learning_rate": 0.0002, "loss": 0.0748, "step": 10820 }, { "epoch": 0.019696325580443715, "grad_norm": 0.05669049918651581, "learning_rate": 0.0002, "loss": 0.0808, "step": 10830 }, { "epoch": 0.01971451240000091, "grad_norm": 0.16696439683437347, "learning_rate": 0.0002, "loss": 0.0696, "step": 10840 }, { "epoch": 0.019732699219558107, "grad_norm": 0.02596648782491684, "learning_rate": 0.0002, "loss": 0.0189, "step": 10850 }, { "epoch": 0.019750886039115303, "grad_norm": 0.030568787828087807, "learning_rate": 0.0002, "loss": 0.1431, "step": 10860 }, { "epoch": 0.0197690728586725, "grad_norm": 0.11519906669855118, "learning_rate": 0.0002, "loss": 0.0832, "step": 10870 }, { "epoch": 0.019787259678229696, "grad_norm": 0.12018325924873352, "learning_rate": 0.0002, "loss": 0.0769, "step": 10880 }, { "epoch": 0.019805446497786892, "grad_norm": 0.15875691175460815, "learning_rate": 0.0002, "loss": 0.0679, "step": 10890 }, { "epoch": 0.01982363331734409, "grad_norm": 0.02812560275197029, "learning_rate": 0.0002, "loss": 0.0236, "step": 10900 }, { "epoch": 0.019841820136901285, "grad_norm": 0.039342913776636124, "learning_rate": 0.0002, "loss": 0.1433, "step": 10910 }, { "epoch": 0.01986000695645848, "grad_norm": 0.1218978762626648, "learning_rate": 0.0002, "loss": 0.0785, "step": 10920 }, { "epoch": 0.019878193776015678, "grad_norm": 0.02437124028801918, "learning_rate": 0.0002, "loss": 0.081, "step": 10930 }, { "epoch": 0.019896380595572874, "grad_norm": 0.16295987367630005, "learning_rate": 0.0002, "loss": 0.0615, "step": 10940 }, { "epoch": 0.01991456741513007, "grad_norm": 0.03147517144680023, "learning_rate": 0.0002, "loss": 0.0167, "step": 10950 }, { "epoch": 0.019932754234687267, "grad_norm": 0.051139310002326965, "learning_rate": 0.0002, "loss": 0.1486, "step": 10960 }, { "epoch": 0.019950941054244463, "grad_norm": 0.10385333746671677, "learning_rate": 0.0002, "loss": 0.0835, "step": 10970 }, { "epoch": 0.01996912787380166, "grad_norm": 0.029570510610938072, "learning_rate": 0.0002, "loss": 0.0808, "step": 10980 }, { "epoch": 0.019987314693358856, "grad_norm": 0.1457994282245636, "learning_rate": 0.0002, "loss": 0.062, "step": 10990 }, { "epoch": 0.020005501512916052, "grad_norm": 0.013582763262093067, "learning_rate": 0.0002, "loss": 0.0149, "step": 11000 }, { "epoch": 0.020023688332473248, "grad_norm": 0.13736847043037415, "learning_rate": 0.0002, "loss": 0.164, "step": 11010 }, { "epoch": 0.020041875152030444, "grad_norm": 0.146778866648674, "learning_rate": 0.0002, "loss": 0.0865, "step": 11020 }, { "epoch": 0.02006006197158764, "grad_norm": 0.09848106652498245, "learning_rate": 0.0002, "loss": 0.0785, "step": 11030 }, { "epoch": 0.020078248791144837, "grad_norm": 0.19981160759925842, "learning_rate": 0.0002, "loss": 0.0738, "step": 11040 }, { "epoch": 0.020096435610702033, "grad_norm": 0.0248726736754179, "learning_rate": 0.0002, "loss": 0.0232, "step": 11050 }, { "epoch": 0.02011462243025923, "grad_norm": 0.09688897430896759, "learning_rate": 0.0002, "loss": 0.1361, "step": 11060 }, { "epoch": 0.020132809249816426, "grad_norm": 0.09953918308019638, "learning_rate": 0.0002, "loss": 0.0815, "step": 11070 }, { "epoch": 0.020150996069373622, "grad_norm": 0.05801590532064438, "learning_rate": 0.0002, "loss": 0.0794, "step": 11080 }, { "epoch": 0.02016918288893082, "grad_norm": 0.2029600441455841, "learning_rate": 0.0002, "loss": 0.0661, "step": 11090 }, { "epoch": 0.020187369708488015, "grad_norm": 0.026677627116441727, "learning_rate": 0.0002, "loss": 0.022, "step": 11100 }, { "epoch": 0.02020555652804521, "grad_norm": 0.054907217621803284, "learning_rate": 0.0002, "loss": 0.1356, "step": 11110 }, { "epoch": 0.020223743347602408, "grad_norm": 0.16302120685577393, "learning_rate": 0.0002, "loss": 0.0721, "step": 11120 }, { "epoch": 0.020241930167159604, "grad_norm": 0.03393812105059624, "learning_rate": 0.0002, "loss": 0.0733, "step": 11130 }, { "epoch": 0.0202601169867168, "grad_norm": 0.16455304622650146, "learning_rate": 0.0002, "loss": 0.062, "step": 11140 }, { "epoch": 0.020278303806273996, "grad_norm": 0.026239484548568726, "learning_rate": 0.0002, "loss": 0.0148, "step": 11150 }, { "epoch": 0.020296490625831193, "grad_norm": 0.10048040002584457, "learning_rate": 0.0002, "loss": 0.1398, "step": 11160 }, { "epoch": 0.020314677445388393, "grad_norm": 0.14221400022506714, "learning_rate": 0.0002, "loss": 0.0782, "step": 11170 }, { "epoch": 0.02033286426494559, "grad_norm": 0.08432412147521973, "learning_rate": 0.0002, "loss": 0.0807, "step": 11180 }, { "epoch": 0.020351051084502785, "grad_norm": 0.172295480966568, "learning_rate": 0.0002, "loss": 0.0655, "step": 11190 }, { "epoch": 0.02036923790405998, "grad_norm": 0.023976756259799004, "learning_rate": 0.0002, "loss": 0.0218, "step": 11200 }, { "epoch": 0.020387424723617178, "grad_norm": 0.03286349028348923, "learning_rate": 0.0002, "loss": 0.1441, "step": 11210 }, { "epoch": 0.020405611543174374, "grad_norm": 0.04403531551361084, "learning_rate": 0.0002, "loss": 0.0825, "step": 11220 }, { "epoch": 0.02042379836273157, "grad_norm": 0.0398452989757061, "learning_rate": 0.0002, "loss": 0.0755, "step": 11230 }, { "epoch": 0.020441985182288767, "grad_norm": 0.15185104310512543, "learning_rate": 0.0002, "loss": 0.0591, "step": 11240 }, { "epoch": 0.020460172001845963, "grad_norm": 0.005839187186211348, "learning_rate": 0.0002, "loss": 0.0172, "step": 11250 }, { "epoch": 0.02047835882140316, "grad_norm": 0.031195368617773056, "learning_rate": 0.0002, "loss": 0.1594, "step": 11260 }, { "epoch": 0.020496545640960356, "grad_norm": 0.1997426599264145, "learning_rate": 0.0002, "loss": 0.0797, "step": 11270 }, { "epoch": 0.020514732460517552, "grad_norm": 0.03075752593576908, "learning_rate": 0.0002, "loss": 0.0798, "step": 11280 }, { "epoch": 0.02053291928007475, "grad_norm": 0.17717675864696503, "learning_rate": 0.0002, "loss": 0.0654, "step": 11290 }, { "epoch": 0.020551106099631945, "grad_norm": 0.036260057240724564, "learning_rate": 0.0002, "loss": 0.0196, "step": 11300 }, { "epoch": 0.02056929291918914, "grad_norm": 0.11961262673139572, "learning_rate": 0.0002, "loss": 0.1313, "step": 11310 }, { "epoch": 0.020587479738746337, "grad_norm": 0.12344212830066681, "learning_rate": 0.0002, "loss": 0.0798, "step": 11320 }, { "epoch": 0.020605666558303534, "grad_norm": 0.12796273827552795, "learning_rate": 0.0002, "loss": 0.0693, "step": 11330 }, { "epoch": 0.02062385337786073, "grad_norm": 0.12038332223892212, "learning_rate": 0.0002, "loss": 0.0637, "step": 11340 }, { "epoch": 0.020642040197417926, "grad_norm": 0.013724497519433498, "learning_rate": 0.0002, "loss": 0.0134, "step": 11350 }, { "epoch": 0.020660227016975122, "grad_norm": 0.030014917254447937, "learning_rate": 0.0002, "loss": 0.1355, "step": 11360 }, { "epoch": 0.02067841383653232, "grad_norm": 0.05455614998936653, "learning_rate": 0.0002, "loss": 0.0824, "step": 11370 }, { "epoch": 0.020696600656089515, "grad_norm": 0.09036605060100555, "learning_rate": 0.0002, "loss": 0.0712, "step": 11380 }, { "epoch": 0.02071478747564671, "grad_norm": 0.15607796609401703, "learning_rate": 0.0002, "loss": 0.0613, "step": 11390 }, { "epoch": 0.020732974295203908, "grad_norm": 0.029900453984737396, "learning_rate": 0.0002, "loss": 0.0216, "step": 11400 }, { "epoch": 0.020751161114761104, "grad_norm": 0.06108042970299721, "learning_rate": 0.0002, "loss": 0.1223, "step": 11410 }, { "epoch": 0.0207693479343183, "grad_norm": 0.052377600222826004, "learning_rate": 0.0002, "loss": 0.0795, "step": 11420 }, { "epoch": 0.020787534753875497, "grad_norm": 0.063735231757164, "learning_rate": 0.0002, "loss": 0.0746, "step": 11430 }, { "epoch": 0.020805721573432693, "grad_norm": 0.16977328062057495, "learning_rate": 0.0002, "loss": 0.0634, "step": 11440 }, { "epoch": 0.02082390839298989, "grad_norm": 0.04451785981655121, "learning_rate": 0.0002, "loss": 0.0298, "step": 11450 }, { "epoch": 0.020842095212547086, "grad_norm": 1.1584863662719727, "learning_rate": 0.0002, "loss": 0.1133, "step": 11460 }, { "epoch": 0.020860282032104282, "grad_norm": 0.09867832064628601, "learning_rate": 0.0002, "loss": 0.0774, "step": 11470 }, { "epoch": 0.020878468851661478, "grad_norm": 0.05493566766381264, "learning_rate": 0.0002, "loss": 0.0752, "step": 11480 }, { "epoch": 0.020896655671218674, "grad_norm": 0.2149093896150589, "learning_rate": 0.0002, "loss": 0.0682, "step": 11490 }, { "epoch": 0.02091484249077587, "grad_norm": 0.02243107184767723, "learning_rate": 0.0002, "loss": 0.0191, "step": 11500 }, { "epoch": 0.02093302931033307, "grad_norm": 0.27817150950431824, "learning_rate": 0.0002, "loss": 0.1658, "step": 11510 }, { "epoch": 0.020951216129890267, "grad_norm": 0.14467410743236542, "learning_rate": 0.0002, "loss": 0.083, "step": 11520 }, { "epoch": 0.020969402949447463, "grad_norm": 0.1027064323425293, "learning_rate": 0.0002, "loss": 0.0825, "step": 11530 }, { "epoch": 0.02098758976900466, "grad_norm": 0.2156657725572586, "learning_rate": 0.0002, "loss": 0.0694, "step": 11540 }, { "epoch": 0.021005776588561856, "grad_norm": 0.023746902123093605, "learning_rate": 0.0002, "loss": 0.024, "step": 11550 }, { "epoch": 0.021023963408119052, "grad_norm": 0.19738778471946716, "learning_rate": 0.0002, "loss": 0.1473, "step": 11560 }, { "epoch": 0.02104215022767625, "grad_norm": 0.19759760797023773, "learning_rate": 0.0002, "loss": 0.078, "step": 11570 }, { "epoch": 0.021060337047233445, "grad_norm": 9.88092041015625, "learning_rate": 0.0002, "loss": 0.0866, "step": 11580 }, { "epoch": 0.02107852386679064, "grad_norm": 0.22301238775253296, "learning_rate": 0.0002, "loss": 0.0685, "step": 11590 }, { "epoch": 0.021096710686347837, "grad_norm": 0.023191403597593307, "learning_rate": 0.0002, "loss": 0.0468, "step": 11600 }, { "epoch": 0.021114897505905034, "grad_norm": 0.10442623496055603, "learning_rate": 0.0002, "loss": 0.2046, "step": 11610 }, { "epoch": 0.02113308432546223, "grad_norm": 0.18771864473819733, "learning_rate": 0.0002, "loss": 0.0805, "step": 11620 }, { "epoch": 0.021151271145019426, "grad_norm": 0.05516243353486061, "learning_rate": 0.0002, "loss": 0.0795, "step": 11630 }, { "epoch": 0.021169457964576623, "grad_norm": 0.21308554708957672, "learning_rate": 0.0002, "loss": 0.0725, "step": 11640 }, { "epoch": 0.02118764478413382, "grad_norm": 0.010607315227389336, "learning_rate": 0.0002, "loss": 0.0241, "step": 11650 }, { "epoch": 0.021205831603691015, "grad_norm": 0.0542677640914917, "learning_rate": 0.0002, "loss": 0.1648, "step": 11660 }, { "epoch": 0.02122401842324821, "grad_norm": 0.11239166557788849, "learning_rate": 0.0002, "loss": 0.0825, "step": 11670 }, { "epoch": 0.021242205242805408, "grad_norm": 0.032700493931770325, "learning_rate": 0.0002, "loss": 0.0727, "step": 11680 }, { "epoch": 0.021260392062362604, "grad_norm": 0.2005159705877304, "learning_rate": 0.0002, "loss": 0.0708, "step": 11690 }, { "epoch": 0.0212785788819198, "grad_norm": 0.01741277053952217, "learning_rate": 0.0002, "loss": 0.0232, "step": 11700 }, { "epoch": 0.021296765701476997, "grad_norm": 0.04048267379403114, "learning_rate": 0.0002, "loss": 0.1403, "step": 11710 }, { "epoch": 0.021314952521034193, "grad_norm": 0.18796616792678833, "learning_rate": 0.0002, "loss": 0.0886, "step": 11720 }, { "epoch": 0.02133313934059139, "grad_norm": 0.06360754370689392, "learning_rate": 0.0002, "loss": 0.0731, "step": 11730 }, { "epoch": 0.021351326160148586, "grad_norm": 0.14168913662433624, "learning_rate": 0.0002, "loss": 0.0622, "step": 11740 }, { "epoch": 0.021369512979705782, "grad_norm": 0.012988853268325329, "learning_rate": 0.0002, "loss": 0.0144, "step": 11750 }, { "epoch": 0.02138769979926298, "grad_norm": 0.09176674485206604, "learning_rate": 0.0002, "loss": 0.1574, "step": 11760 }, { "epoch": 0.021405886618820175, "grad_norm": 0.11934395134449005, "learning_rate": 0.0002, "loss": 0.079, "step": 11770 }, { "epoch": 0.02142407343837737, "grad_norm": 0.11853605508804321, "learning_rate": 0.0002, "loss": 0.076, "step": 11780 }, { "epoch": 0.021442260257934567, "grad_norm": 0.1625816971063614, "learning_rate": 0.0002, "loss": 0.0649, "step": 11790 }, { "epoch": 0.021460447077491764, "grad_norm": 0.023221928626298904, "learning_rate": 0.0002, "loss": 0.0228, "step": 11800 }, { "epoch": 0.02147863389704896, "grad_norm": 0.0494253933429718, "learning_rate": 0.0002, "loss": 0.1418, "step": 11810 }, { "epoch": 0.021496820716606156, "grad_norm": 0.18250688910484314, "learning_rate": 0.0002, "loss": 0.0827, "step": 11820 }, { "epoch": 0.021515007536163352, "grad_norm": 0.13340160250663757, "learning_rate": 0.0002, "loss": 0.0794, "step": 11830 }, { "epoch": 0.02153319435572055, "grad_norm": 0.15497778356075287, "learning_rate": 0.0002, "loss": 0.0613, "step": 11840 }, { "epoch": 0.021551381175277745, "grad_norm": 0.03259354829788208, "learning_rate": 0.0002, "loss": 0.023, "step": 11850 }, { "epoch": 0.021569567994834945, "grad_norm": 0.09126435220241547, "learning_rate": 0.0002, "loss": 0.1235, "step": 11860 }, { "epoch": 0.02158775481439214, "grad_norm": 0.13455496728420258, "learning_rate": 0.0002, "loss": 0.0806, "step": 11870 }, { "epoch": 0.021605941633949338, "grad_norm": 0.10817539691925049, "learning_rate": 0.0002, "loss": 0.0829, "step": 11880 }, { "epoch": 0.021624128453506534, "grad_norm": 0.1913878321647644, "learning_rate": 0.0002, "loss": 0.0636, "step": 11890 }, { "epoch": 0.02164231527306373, "grad_norm": 0.025634530931711197, "learning_rate": 0.0002, "loss": 0.0216, "step": 11900 }, { "epoch": 0.021660502092620926, "grad_norm": 0.10507725924253464, "learning_rate": 0.0002, "loss": 0.1326, "step": 11910 }, { "epoch": 0.021678688912178123, "grad_norm": 0.09721452742815018, "learning_rate": 0.0002, "loss": 0.0857, "step": 11920 }, { "epoch": 0.02169687573173532, "grad_norm": 0.028759269043803215, "learning_rate": 0.0002, "loss": 0.0751, "step": 11930 }, { "epoch": 0.021715062551292515, "grad_norm": 0.17618104815483093, "learning_rate": 0.0002, "loss": 0.062, "step": 11940 }, { "epoch": 0.02173324937084971, "grad_norm": 0.02503124624490738, "learning_rate": 0.0002, "loss": 0.0182, "step": 11950 }, { "epoch": 0.021751436190406908, "grad_norm": 0.10976126044988632, "learning_rate": 0.0002, "loss": 0.1564, "step": 11960 }, { "epoch": 0.021769623009964104, "grad_norm": 0.0833989605307579, "learning_rate": 0.0002, "loss": 0.0784, "step": 11970 }, { "epoch": 0.0217878098295213, "grad_norm": 0.06359647959470749, "learning_rate": 0.0002, "loss": 0.0795, "step": 11980 }, { "epoch": 0.021805996649078497, "grad_norm": 0.1677824705839157, "learning_rate": 0.0002, "loss": 0.0699, "step": 11990 }, { "epoch": 0.021824183468635693, "grad_norm": 0.018009621649980545, "learning_rate": 0.0002, "loss": 0.0185, "step": 12000 }, { "epoch": 0.02184237028819289, "grad_norm": 0.12256644666194916, "learning_rate": 0.0002, "loss": 0.1839, "step": 12010 }, { "epoch": 0.021860557107750086, "grad_norm": 0.11677028983831406, "learning_rate": 0.0002, "loss": 0.0829, "step": 12020 }, { "epoch": 0.021878743927307282, "grad_norm": 0.12885046005249023, "learning_rate": 0.0002, "loss": 0.0812, "step": 12030 }, { "epoch": 0.02189693074686448, "grad_norm": 0.1394425481557846, "learning_rate": 0.0002, "loss": 0.0668, "step": 12040 }, { "epoch": 0.021915117566421675, "grad_norm": 0.024974076077342033, "learning_rate": 0.0002, "loss": 0.0192, "step": 12050 }, { "epoch": 0.02193330438597887, "grad_norm": 0.11284986138343811, "learning_rate": 0.0002, "loss": 0.1492, "step": 12060 }, { "epoch": 0.021951491205536067, "grad_norm": 0.0605492927134037, "learning_rate": 0.0002, "loss": 0.0787, "step": 12070 }, { "epoch": 0.021969678025093264, "grad_norm": 0.040298718959093094, "learning_rate": 0.0002, "loss": 0.0778, "step": 12080 }, { "epoch": 0.02198786484465046, "grad_norm": 0.1555332988500595, "learning_rate": 0.0002, "loss": 0.0683, "step": 12090 }, { "epoch": 0.022006051664207656, "grad_norm": 0.022474724799394608, "learning_rate": 0.0002, "loss": 0.0139, "step": 12100 }, { "epoch": 0.022024238483764853, "grad_norm": 0.08212363719940186, "learning_rate": 0.0002, "loss": 0.1513, "step": 12110 }, { "epoch": 0.02204242530332205, "grad_norm": 0.16297335922718048, "learning_rate": 0.0002, "loss": 0.087, "step": 12120 }, { "epoch": 0.022060612122879245, "grad_norm": 0.026817265897989273, "learning_rate": 0.0002, "loss": 0.0763, "step": 12130 }, { "epoch": 0.02207879894243644, "grad_norm": 0.15199647843837738, "learning_rate": 0.0002, "loss": 0.0632, "step": 12140 }, { "epoch": 0.022096985761993638, "grad_norm": 0.021619049832224846, "learning_rate": 0.0002, "loss": 0.0221, "step": 12150 }, { "epoch": 0.022115172581550834, "grad_norm": 0.071327805519104, "learning_rate": 0.0002, "loss": 0.138, "step": 12160 }, { "epoch": 0.02213335940110803, "grad_norm": 0.07506705075502396, "learning_rate": 0.0002, "loss": 0.0802, "step": 12170 }, { "epoch": 0.022151546220665227, "grad_norm": 0.05193526670336723, "learning_rate": 0.0002, "loss": 0.0726, "step": 12180 }, { "epoch": 0.022169733040222423, "grad_norm": 0.125730961561203, "learning_rate": 0.0002, "loss": 0.0658, "step": 12190 }, { "epoch": 0.022187919859779623, "grad_norm": 0.01939002424478531, "learning_rate": 0.0002, "loss": 0.0174, "step": 12200 }, { "epoch": 0.02220610667933682, "grad_norm": 0.05645585432648659, "learning_rate": 0.0002, "loss": 0.1447, "step": 12210 }, { "epoch": 0.022224293498894016, "grad_norm": 0.12416274845600128, "learning_rate": 0.0002, "loss": 0.0727, "step": 12220 }, { "epoch": 0.022242480318451212, "grad_norm": 0.05618545040488243, "learning_rate": 0.0002, "loss": 0.0801, "step": 12230 }, { "epoch": 0.022260667138008408, "grad_norm": 0.12334968894720078, "learning_rate": 0.0002, "loss": 0.0598, "step": 12240 }, { "epoch": 0.022278853957565604, "grad_norm": 0.024331970140337944, "learning_rate": 0.0002, "loss": 0.0179, "step": 12250 }, { "epoch": 0.0222970407771228, "grad_norm": 0.05856281518936157, "learning_rate": 0.0002, "loss": 0.126, "step": 12260 }, { "epoch": 0.022315227596679997, "grad_norm": 0.07432300597429276, "learning_rate": 0.0002, "loss": 0.0839, "step": 12270 }, { "epoch": 0.022333414416237193, "grad_norm": 0.07249715179204941, "learning_rate": 0.0002, "loss": 0.0815, "step": 12280 }, { "epoch": 0.02235160123579439, "grad_norm": 0.14335612952709198, "learning_rate": 0.0002, "loss": 0.0605, "step": 12290 }, { "epoch": 0.022369788055351586, "grad_norm": 0.03603110462427139, "learning_rate": 0.0002, "loss": 0.0185, "step": 12300 }, { "epoch": 0.022387974874908782, "grad_norm": 0.08532091230154037, "learning_rate": 0.0002, "loss": 0.1339, "step": 12310 }, { "epoch": 0.02240616169446598, "grad_norm": 0.13663236796855927, "learning_rate": 0.0002, "loss": 0.0761, "step": 12320 }, { "epoch": 0.022424348514023175, "grad_norm": 0.10088011622428894, "learning_rate": 0.0002, "loss": 0.0737, "step": 12330 }, { "epoch": 0.02244253533358037, "grad_norm": 0.17186152935028076, "learning_rate": 0.0002, "loss": 0.0661, "step": 12340 }, { "epoch": 0.022460722153137568, "grad_norm": 0.01941334828734398, "learning_rate": 0.0002, "loss": 0.0135, "step": 12350 }, { "epoch": 0.022478908972694764, "grad_norm": 0.12438862770795822, "learning_rate": 0.0002, "loss": 0.1474, "step": 12360 }, { "epoch": 0.02249709579225196, "grad_norm": 0.08050791174173355, "learning_rate": 0.0002, "loss": 0.0792, "step": 12370 }, { "epoch": 0.022515282611809156, "grad_norm": 0.04660952091217041, "learning_rate": 0.0002, "loss": 0.0826, "step": 12380 }, { "epoch": 0.022533469431366353, "grad_norm": 0.16433311998844147, "learning_rate": 0.0002, "loss": 0.0691, "step": 12390 }, { "epoch": 0.02255165625092355, "grad_norm": 0.04376552626490593, "learning_rate": 0.0002, "loss": 0.0219, "step": 12400 }, { "epoch": 0.022569843070480745, "grad_norm": 0.06648654490709305, "learning_rate": 0.0002, "loss": 0.1346, "step": 12410 }, { "epoch": 0.02258802989003794, "grad_norm": 0.11318199336528778, "learning_rate": 0.0002, "loss": 0.0736, "step": 12420 }, { "epoch": 0.022606216709595138, "grad_norm": 0.0922408252954483, "learning_rate": 0.0002, "loss": 0.0819, "step": 12430 }, { "epoch": 0.022624403529152334, "grad_norm": 0.1696896106004715, "learning_rate": 0.0002, "loss": 0.0642, "step": 12440 }, { "epoch": 0.02264259034870953, "grad_norm": 0.03212421387434006, "learning_rate": 0.0002, "loss": 0.0247, "step": 12450 }, { "epoch": 0.022660777168266727, "grad_norm": 0.12295889109373093, "learning_rate": 0.0002, "loss": 0.1504, "step": 12460 }, { "epoch": 0.022678963987823923, "grad_norm": 0.10351194441318512, "learning_rate": 0.0002, "loss": 0.0757, "step": 12470 }, { "epoch": 0.02269715080738112, "grad_norm": 0.022580118849873543, "learning_rate": 0.0002, "loss": 0.0756, "step": 12480 }, { "epoch": 0.022715337626938316, "grad_norm": 0.16330066323280334, "learning_rate": 0.0002, "loss": 0.0645, "step": 12490 }, { "epoch": 0.022733524446495512, "grad_norm": 0.021431026980280876, "learning_rate": 0.0002, "loss": 0.0224, "step": 12500 }, { "epoch": 0.02275171126605271, "grad_norm": 0.053853604942560196, "learning_rate": 0.0002, "loss": 0.1304, "step": 12510 }, { "epoch": 0.022769898085609905, "grad_norm": 0.129705548286438, "learning_rate": 0.0002, "loss": 0.0799, "step": 12520 }, { "epoch": 0.0227880849051671, "grad_norm": 0.027473529800772667, "learning_rate": 0.0002, "loss": 0.0771, "step": 12530 }, { "epoch": 0.0228062717247243, "grad_norm": 0.2045305222272873, "learning_rate": 0.0002, "loss": 0.0615, "step": 12540 }, { "epoch": 0.022824458544281497, "grad_norm": 0.041042860597372055, "learning_rate": 0.0002, "loss": 0.026, "step": 12550 }, { "epoch": 0.022842645363838694, "grad_norm": 0.05624527484178543, "learning_rate": 0.0002, "loss": 0.1327, "step": 12560 }, { "epoch": 0.02286083218339589, "grad_norm": 0.09647081047296524, "learning_rate": 0.0002, "loss": 0.0758, "step": 12570 }, { "epoch": 0.022879019002953086, "grad_norm": 0.03362264856696129, "learning_rate": 0.0002, "loss": 0.076, "step": 12580 }, { "epoch": 0.022897205822510282, "grad_norm": 0.1459503322839737, "learning_rate": 0.0002, "loss": 0.0603, "step": 12590 }, { "epoch": 0.02291539264206748, "grad_norm": 0.025729481130838394, "learning_rate": 0.0002, "loss": 0.0196, "step": 12600 }, { "epoch": 0.022933579461624675, "grad_norm": 0.19940927624702454, "learning_rate": 0.0002, "loss": 0.1298, "step": 12610 }, { "epoch": 0.02295176628118187, "grad_norm": 0.13796600699424744, "learning_rate": 0.0002, "loss": 0.086, "step": 12620 }, { "epoch": 0.022969953100739068, "grad_norm": 0.08884158730506897, "learning_rate": 0.0002, "loss": 0.0808, "step": 12630 }, { "epoch": 0.022988139920296264, "grad_norm": 0.15814751386642456, "learning_rate": 0.0002, "loss": 0.0658, "step": 12640 }, { "epoch": 0.02300632673985346, "grad_norm": 0.03503837063908577, "learning_rate": 0.0002, "loss": 0.0232, "step": 12650 }, { "epoch": 0.023024513559410657, "grad_norm": 0.09701854735612869, "learning_rate": 0.0002, "loss": 0.136, "step": 12660 }, { "epoch": 0.023042700378967853, "grad_norm": 0.13909977674484253, "learning_rate": 0.0002, "loss": 0.0839, "step": 12670 }, { "epoch": 0.02306088719852505, "grad_norm": 0.03152406960725784, "learning_rate": 0.0002, "loss": 0.0733, "step": 12680 }, { "epoch": 0.023079074018082246, "grad_norm": 0.13872750103473663, "learning_rate": 0.0002, "loss": 0.0604, "step": 12690 }, { "epoch": 0.023097260837639442, "grad_norm": 0.03626656159758568, "learning_rate": 0.0002, "loss": 0.0234, "step": 12700 }, { "epoch": 0.023115447657196638, "grad_norm": 0.10111619532108307, "learning_rate": 0.0002, "loss": 0.1507, "step": 12710 }, { "epoch": 0.023133634476753834, "grad_norm": 0.09038366377353668, "learning_rate": 0.0002, "loss": 0.0839, "step": 12720 }, { "epoch": 0.02315182129631103, "grad_norm": 0.026116544380784035, "learning_rate": 0.0002, "loss": 0.0777, "step": 12730 }, { "epoch": 0.023170008115868227, "grad_norm": 0.2067679613828659, "learning_rate": 0.0002, "loss": 0.0661, "step": 12740 }, { "epoch": 0.023188194935425423, "grad_norm": 0.02005072310566902, "learning_rate": 0.0002, "loss": 0.0165, "step": 12750 }, { "epoch": 0.02320638175498262, "grad_norm": 0.03261101245880127, "learning_rate": 0.0002, "loss": 0.159, "step": 12760 }, { "epoch": 0.023224568574539816, "grad_norm": 0.1416555494070053, "learning_rate": 0.0002, "loss": 0.0856, "step": 12770 }, { "epoch": 0.023242755394097012, "grad_norm": 0.09400717914104462, "learning_rate": 0.0002, "loss": 0.0745, "step": 12780 }, { "epoch": 0.02326094221365421, "grad_norm": 0.17093195021152496, "learning_rate": 0.0002, "loss": 0.0691, "step": 12790 }, { "epoch": 0.023279129033211405, "grad_norm": 0.0209200382232666, "learning_rate": 0.0002, "loss": 0.0168, "step": 12800 }, { "epoch": 0.0232973158527686, "grad_norm": 0.10523302853107452, "learning_rate": 0.0002, "loss": 0.1628, "step": 12810 }, { "epoch": 0.023315502672325798, "grad_norm": 0.06932856142520905, "learning_rate": 0.0002, "loss": 0.079, "step": 12820 }, { "epoch": 0.023333689491882994, "grad_norm": 0.03244032710790634, "learning_rate": 0.0002, "loss": 0.0699, "step": 12830 }, { "epoch": 0.02335187631144019, "grad_norm": 0.13403338193893433, "learning_rate": 0.0002, "loss": 0.0619, "step": 12840 }, { "epoch": 0.023370063130997386, "grad_norm": 0.034033093601465225, "learning_rate": 0.0002, "loss": 0.0166, "step": 12850 }, { "epoch": 0.023388249950554583, "grad_norm": 0.07277385890483856, "learning_rate": 0.0002, "loss": 0.1377, "step": 12860 }, { "epoch": 0.02340643677011178, "grad_norm": 0.10873163491487503, "learning_rate": 0.0002, "loss": 0.0895, "step": 12870 }, { "epoch": 0.023424623589668975, "grad_norm": 0.06244732066988945, "learning_rate": 0.0002, "loss": 0.0745, "step": 12880 }, { "epoch": 0.023442810409226175, "grad_norm": 0.1937248259782791, "learning_rate": 0.0002, "loss": 0.0633, "step": 12890 }, { "epoch": 0.02346099722878337, "grad_norm": 0.03432930260896683, "learning_rate": 0.0002, "loss": 0.0246, "step": 12900 }, { "epoch": 0.023479184048340568, "grad_norm": 0.33358234167099, "learning_rate": 0.0002, "loss": 0.1249, "step": 12910 }, { "epoch": 0.023497370867897764, "grad_norm": 0.12039615213871002, "learning_rate": 0.0002, "loss": 0.0734, "step": 12920 }, { "epoch": 0.02351555768745496, "grad_norm": 0.02666555717587471, "learning_rate": 0.0002, "loss": 0.0849, "step": 12930 }, { "epoch": 0.023533744507012157, "grad_norm": 0.128091961145401, "learning_rate": 0.0002, "loss": 0.0647, "step": 12940 }, { "epoch": 0.023551931326569353, "grad_norm": 0.030916422605514526, "learning_rate": 0.0002, "loss": 0.0217, "step": 12950 }, { "epoch": 0.02357011814612655, "grad_norm": 0.09280567616224289, "learning_rate": 0.0002, "loss": 0.1281, "step": 12960 }, { "epoch": 0.023588304965683746, "grad_norm": 0.09032955765724182, "learning_rate": 0.0002, "loss": 0.0834, "step": 12970 }, { "epoch": 0.023606491785240942, "grad_norm": 0.3660918176174164, "learning_rate": 0.0002, "loss": 0.0776, "step": 12980 }, { "epoch": 0.02362467860479814, "grad_norm": 0.15715408325195312, "learning_rate": 0.0002, "loss": 0.0611, "step": 12990 }, { "epoch": 0.023642865424355335, "grad_norm": 0.03867153823375702, "learning_rate": 0.0002, "loss": 0.0214, "step": 13000 }, { "epoch": 0.02366105224391253, "grad_norm": 0.37568527460098267, "learning_rate": 0.0002, "loss": 0.2529, "step": 13010 }, { "epoch": 0.023679239063469727, "grad_norm": 0.14888867735862732, "learning_rate": 0.0002, "loss": 0.0801, "step": 13020 }, { "epoch": 0.023697425883026924, "grad_norm": 0.04271422699093819, "learning_rate": 0.0002, "loss": 0.0863, "step": 13030 }, { "epoch": 0.02371561270258412, "grad_norm": 0.190608948469162, "learning_rate": 0.0002, "loss": 0.0773, "step": 13040 }, { "epoch": 0.023733799522141316, "grad_norm": 0.020333535969257355, "learning_rate": 0.0002, "loss": 0.0201, "step": 13050 }, { "epoch": 0.023751986341698512, "grad_norm": 0.143577441573143, "learning_rate": 0.0002, "loss": 0.1709, "step": 13060 }, { "epoch": 0.02377017316125571, "grad_norm": 0.09225071966648102, "learning_rate": 0.0002, "loss": 0.0854, "step": 13070 }, { "epoch": 0.023788359980812905, "grad_norm": 0.08655473589897156, "learning_rate": 0.0002, "loss": 0.0727, "step": 13080 }, { "epoch": 0.0238065468003701, "grad_norm": 0.14465250074863434, "learning_rate": 0.0002, "loss": 0.0632, "step": 13090 }, { "epoch": 0.023824733619927298, "grad_norm": 0.019399341195821762, "learning_rate": 0.0002, "loss": 0.0204, "step": 13100 }, { "epoch": 0.023842920439484494, "grad_norm": 0.09221036732196808, "learning_rate": 0.0002, "loss": 0.1646, "step": 13110 }, { "epoch": 0.02386110725904169, "grad_norm": 0.1308157742023468, "learning_rate": 0.0002, "loss": 0.089, "step": 13120 }, { "epoch": 0.023879294078598887, "grad_norm": 0.04212506487965584, "learning_rate": 0.0002, "loss": 0.0776, "step": 13130 }, { "epoch": 0.023897480898156083, "grad_norm": 0.13541243970394135, "learning_rate": 0.0002, "loss": 0.0694, "step": 13140 }, { "epoch": 0.02391566771771328, "grad_norm": 0.016859933733940125, "learning_rate": 0.0002, "loss": 0.0191, "step": 13150 }, { "epoch": 0.023933854537270476, "grad_norm": 0.1553143709897995, "learning_rate": 0.0002, "loss": 0.1653, "step": 13160 }, { "epoch": 0.023952041356827672, "grad_norm": 0.07960142940282822, "learning_rate": 0.0002, "loss": 0.0938, "step": 13170 }, { "epoch": 0.023970228176384868, "grad_norm": 0.0719163790345192, "learning_rate": 0.0002, "loss": 0.0767, "step": 13180 }, { "epoch": 0.023988414995942065, "grad_norm": 0.14845407009124756, "learning_rate": 0.0002, "loss": 0.0642, "step": 13190 }, { "epoch": 0.02400660181549926, "grad_norm": 0.01817360520362854, "learning_rate": 0.0002, "loss": 0.0229, "step": 13200 }, { "epoch": 0.024024788635056457, "grad_norm": 0.03876543045043945, "learning_rate": 0.0002, "loss": 0.1377, "step": 13210 }, { "epoch": 0.024042975454613653, "grad_norm": 0.05972164496779442, "learning_rate": 0.0002, "loss": 0.0802, "step": 13220 }, { "epoch": 0.024061162274170853, "grad_norm": 0.09239703416824341, "learning_rate": 0.0002, "loss": 0.0816, "step": 13230 }, { "epoch": 0.02407934909372805, "grad_norm": 0.15912885963916779, "learning_rate": 0.0002, "loss": 0.0598, "step": 13240 }, { "epoch": 0.024097535913285246, "grad_norm": 0.024279551580548286, "learning_rate": 0.0002, "loss": 0.0235, "step": 13250 }, { "epoch": 0.024115722732842442, "grad_norm": 0.06568270921707153, "learning_rate": 0.0002, "loss": 0.1255, "step": 13260 }, { "epoch": 0.02413390955239964, "grad_norm": 0.04041383042931557, "learning_rate": 0.0002, "loss": 0.0718, "step": 13270 }, { "epoch": 0.024152096371956835, "grad_norm": 0.046768829226493835, "learning_rate": 0.0002, "loss": 0.0741, "step": 13280 }, { "epoch": 0.02417028319151403, "grad_norm": 0.21418194472789764, "learning_rate": 0.0002, "loss": 0.0683, "step": 13290 }, { "epoch": 0.024188470011071227, "grad_norm": 0.04398053511977196, "learning_rate": 0.0002, "loss": 0.0262, "step": 13300 }, { "epoch": 0.024206656830628424, "grad_norm": 0.1672079861164093, "learning_rate": 0.0002, "loss": 0.1408, "step": 13310 }, { "epoch": 0.02422484365018562, "grad_norm": 0.05705881491303444, "learning_rate": 0.0002, "loss": 0.0773, "step": 13320 }, { "epoch": 0.024243030469742816, "grad_norm": 0.0667627677321434, "learning_rate": 0.0002, "loss": 0.0823, "step": 13330 }, { "epoch": 0.024261217289300013, "grad_norm": 0.16610710322856903, "learning_rate": 0.0002, "loss": 0.0682, "step": 13340 }, { "epoch": 0.02427940410885721, "grad_norm": 0.028300171718001366, "learning_rate": 0.0002, "loss": 0.0185, "step": 13350 }, { "epoch": 0.024297590928414405, "grad_norm": 0.10226302593946457, "learning_rate": 0.0002, "loss": 0.1406, "step": 13360 }, { "epoch": 0.0243157777479716, "grad_norm": 0.0939667820930481, "learning_rate": 0.0002, "loss": 0.0755, "step": 13370 }, { "epoch": 0.024333964567528798, "grad_norm": 0.029998745769262314, "learning_rate": 0.0002, "loss": 0.0748, "step": 13380 }, { "epoch": 0.024352151387085994, "grad_norm": 0.1240144744515419, "learning_rate": 0.0002, "loss": 0.0639, "step": 13390 }, { "epoch": 0.02437033820664319, "grad_norm": 0.017499787732958794, "learning_rate": 0.0002, "loss": 0.0156, "step": 13400 }, { "epoch": 0.024388525026200387, "grad_norm": 0.11781036853790283, "learning_rate": 0.0002, "loss": 0.1385, "step": 13410 }, { "epoch": 0.024406711845757583, "grad_norm": 0.09330960363149643, "learning_rate": 0.0002, "loss": 0.0789, "step": 13420 }, { "epoch": 0.02442489866531478, "grad_norm": 0.03347505256533623, "learning_rate": 0.0002, "loss": 0.0742, "step": 13430 }, { "epoch": 0.024443085484871976, "grad_norm": 0.18877847492694855, "learning_rate": 0.0002, "loss": 0.0701, "step": 13440 }, { "epoch": 0.024461272304429172, "grad_norm": 0.03831986337900162, "learning_rate": 0.0002, "loss": 0.0243, "step": 13450 }, { "epoch": 0.02447945912398637, "grad_norm": 0.07360157370567322, "learning_rate": 0.0002, "loss": 0.1237, "step": 13460 }, { "epoch": 0.024497645943543565, "grad_norm": 0.0442088283598423, "learning_rate": 0.0002, "loss": 0.0742, "step": 13470 }, { "epoch": 0.02451583276310076, "grad_norm": 0.07053640484809875, "learning_rate": 0.0002, "loss": 0.0793, "step": 13480 }, { "epoch": 0.024534019582657957, "grad_norm": 0.20134539902210236, "learning_rate": 0.0002, "loss": 0.0621, "step": 13490 }, { "epoch": 0.024552206402215154, "grad_norm": 0.016353536397218704, "learning_rate": 0.0002, "loss": 0.0204, "step": 13500 }, { "epoch": 0.02457039322177235, "grad_norm": 0.15373657643795013, "learning_rate": 0.0002, "loss": 0.1446, "step": 13510 }, { "epoch": 0.024588580041329546, "grad_norm": 2.457998037338257, "learning_rate": 0.0002, "loss": 0.0959, "step": 13520 }, { "epoch": 0.024606766860886743, "grad_norm": 0.11631426215171814, "learning_rate": 0.0002, "loss": 0.0718, "step": 13530 }, { "epoch": 0.02462495368044394, "grad_norm": 0.15928395092487335, "learning_rate": 0.0002, "loss": 0.0638, "step": 13540 }, { "epoch": 0.024643140500001135, "grad_norm": 0.01724998839199543, "learning_rate": 0.0002, "loss": 0.0127, "step": 13550 }, { "epoch": 0.02466132731955833, "grad_norm": 0.10434440523386002, "learning_rate": 0.0002, "loss": 0.1676, "step": 13560 }, { "epoch": 0.02467951413911553, "grad_norm": 0.09029936045408249, "learning_rate": 0.0002, "loss": 0.0792, "step": 13570 }, { "epoch": 0.024697700958672728, "grad_norm": 0.07413540780544281, "learning_rate": 0.0002, "loss": 0.074, "step": 13580 }, { "epoch": 0.024715887778229924, "grad_norm": 0.15171368420124054, "learning_rate": 0.0002, "loss": 0.0646, "step": 13590 }, { "epoch": 0.02473407459778712, "grad_norm": 0.03615165874361992, "learning_rate": 0.0002, "loss": 0.0253, "step": 13600 }, { "epoch": 0.024752261417344316, "grad_norm": 0.08074207603931427, "learning_rate": 0.0002, "loss": 0.1251, "step": 13610 }, { "epoch": 0.024770448236901513, "grad_norm": 0.12725302577018738, "learning_rate": 0.0002, "loss": 0.0868, "step": 13620 }, { "epoch": 0.02478863505645871, "grad_norm": 0.02872832864522934, "learning_rate": 0.0002, "loss": 0.072, "step": 13630 }, { "epoch": 0.024806821876015905, "grad_norm": 0.14573116600513458, "learning_rate": 0.0002, "loss": 0.06, "step": 13640 }, { "epoch": 0.0248250086955731, "grad_norm": 0.039421938359737396, "learning_rate": 0.0002, "loss": 0.0259, "step": 13650 }, { "epoch": 0.024843195515130298, "grad_norm": 0.08786037564277649, "learning_rate": 0.0002, "loss": 0.1255, "step": 13660 }, { "epoch": 0.024861382334687494, "grad_norm": 0.7118334174156189, "learning_rate": 0.0002, "loss": 0.1096, "step": 13670 }, { "epoch": 0.02487956915424469, "grad_norm": 0.05718977376818657, "learning_rate": 0.0002, "loss": 0.1057, "step": 13680 }, { "epoch": 0.024897755973801887, "grad_norm": 0.19388055801391602, "learning_rate": 0.0002, "loss": 0.0668, "step": 13690 }, { "epoch": 0.024915942793359083, "grad_norm": 0.02519839070737362, "learning_rate": 0.0002, "loss": 0.0182, "step": 13700 }, { "epoch": 0.02493412961291628, "grad_norm": 0.15939857065677643, "learning_rate": 0.0002, "loss": 0.1685, "step": 13710 }, { "epoch": 0.024952316432473476, "grad_norm": 0.07893367856740952, "learning_rate": 0.0002, "loss": 0.0781, "step": 13720 }, { "epoch": 0.024970503252030672, "grad_norm": 0.0573757067322731, "learning_rate": 0.0002, "loss": 0.0819, "step": 13730 }, { "epoch": 0.02498869007158787, "grad_norm": 0.1089317575097084, "learning_rate": 0.0002, "loss": 0.0645, "step": 13740 }, { "epoch": 0.025006876891145065, "grad_norm": 0.03239568695425987, "learning_rate": 0.0002, "loss": 0.0199, "step": 13750 }, { "epoch": 0.02502506371070226, "grad_norm": 0.04015114903450012, "learning_rate": 0.0002, "loss": 0.146, "step": 13760 }, { "epoch": 0.025043250530259457, "grad_norm": 0.15218386054039001, "learning_rate": 0.0002, "loss": 0.0854, "step": 13770 }, { "epoch": 0.025061437349816654, "grad_norm": 0.04461386427283287, "learning_rate": 0.0002, "loss": 0.0734, "step": 13780 }, { "epoch": 0.02507962416937385, "grad_norm": 0.17443357408046722, "learning_rate": 0.0002, "loss": 0.0677, "step": 13790 }, { "epoch": 0.025097810988931046, "grad_norm": 1.0899302959442139, "learning_rate": 0.0002, "loss": 0.0312, "step": 13800 }, { "epoch": 0.025115997808488243, "grad_norm": 0.04115718603134155, "learning_rate": 0.0002, "loss": 0.1392, "step": 13810 }, { "epoch": 0.02513418462804544, "grad_norm": 0.06605038046836853, "learning_rate": 0.0002, "loss": 0.0826, "step": 13820 }, { "epoch": 0.025152371447602635, "grad_norm": 0.115416020154953, "learning_rate": 0.0002, "loss": 0.0709, "step": 13830 }, { "epoch": 0.02517055826715983, "grad_norm": 0.1582881212234497, "learning_rate": 0.0002, "loss": 0.066, "step": 13840 }, { "epoch": 0.025188745086717028, "grad_norm": 0.037643156945705414, "learning_rate": 0.0002, "loss": 0.0226, "step": 13850 }, { "epoch": 0.025206931906274224, "grad_norm": 0.08343279361724854, "learning_rate": 0.0002, "loss": 0.1197, "step": 13860 }, { "epoch": 0.02522511872583142, "grad_norm": 0.13482169806957245, "learning_rate": 0.0002, "loss": 0.0799, "step": 13870 }, { "epoch": 0.025243305545388617, "grad_norm": 0.10373103618621826, "learning_rate": 0.0002, "loss": 0.075, "step": 13880 }, { "epoch": 0.025261492364945813, "grad_norm": 0.1348303109407425, "learning_rate": 0.0002, "loss": 0.0603, "step": 13890 }, { "epoch": 0.02527967918450301, "grad_norm": 0.058479245752096176, "learning_rate": 0.0002, "loss": 0.0252, "step": 13900 }, { "epoch": 0.025297866004060206, "grad_norm": 0.19177350401878357, "learning_rate": 0.0002, "loss": 0.122, "step": 13910 }, { "epoch": 0.025316052823617406, "grad_norm": 0.11044300347566605, "learning_rate": 0.0002, "loss": 0.0724, "step": 13920 }, { "epoch": 0.025334239643174602, "grad_norm": 0.05279375612735748, "learning_rate": 0.0002, "loss": 0.0836, "step": 13930 }, { "epoch": 0.025352426462731798, "grad_norm": 0.12162257730960846, "learning_rate": 0.0002, "loss": 0.0615, "step": 13940 }, { "epoch": 0.025370613282288994, "grad_norm": 0.026728983968496323, "learning_rate": 0.0002, "loss": 0.0207, "step": 13950 }, { "epoch": 0.02538880010184619, "grad_norm": 0.08440329879522324, "learning_rate": 0.0002, "loss": 0.1171, "step": 13960 }, { "epoch": 0.025406986921403387, "grad_norm": 0.10090481489896774, "learning_rate": 0.0002, "loss": 0.0851, "step": 13970 }, { "epoch": 0.025425173740960583, "grad_norm": 0.03063822351396084, "learning_rate": 0.0002, "loss": 0.0783, "step": 13980 }, { "epoch": 0.02544336056051778, "grad_norm": 0.14754973351955414, "learning_rate": 0.0002, "loss": 0.0662, "step": 13990 }, { "epoch": 0.025461547380074976, "grad_norm": 0.04844941198825836, "learning_rate": 0.0002, "loss": 0.0204, "step": 14000 }, { "epoch": 0.025479734199632172, "grad_norm": 0.08291894942522049, "learning_rate": 0.0002, "loss": 0.13, "step": 14010 }, { "epoch": 0.02549792101918937, "grad_norm": 0.05875542387366295, "learning_rate": 0.0002, "loss": 0.0732, "step": 14020 }, { "epoch": 0.025516107838746565, "grad_norm": 0.04103298857808113, "learning_rate": 0.0002, "loss": 0.0796, "step": 14030 }, { "epoch": 0.02553429465830376, "grad_norm": 0.20349934697151184, "learning_rate": 0.0002, "loss": 0.0672, "step": 14040 }, { "epoch": 0.025552481477860958, "grad_norm": 0.05419473722577095, "learning_rate": 0.0002, "loss": 0.0231, "step": 14050 }, { "epoch": 0.025570668297418154, "grad_norm": 0.05501960590481758, "learning_rate": 0.0002, "loss": 0.1281, "step": 14060 }, { "epoch": 0.02558885511697535, "grad_norm": 0.07140739262104034, "learning_rate": 0.0002, "loss": 0.0746, "step": 14070 }, { "epoch": 0.025607041936532546, "grad_norm": 0.04564960300922394, "learning_rate": 0.0002, "loss": 0.0746, "step": 14080 }, { "epoch": 0.025625228756089743, "grad_norm": 0.16987308859825134, "learning_rate": 0.0002, "loss": 0.0642, "step": 14090 }, { "epoch": 0.02564341557564694, "grad_norm": 0.017460890114307404, "learning_rate": 0.0002, "loss": 0.0218, "step": 14100 }, { "epoch": 0.025661602395204135, "grad_norm": 0.15666340291500092, "learning_rate": 0.0002, "loss": 0.1572, "step": 14110 }, { "epoch": 0.02567978921476133, "grad_norm": 0.06847309321165085, "learning_rate": 0.0002, "loss": 0.0744, "step": 14120 }, { "epoch": 0.025697976034318528, "grad_norm": 0.03678276389837265, "learning_rate": 0.0002, "loss": 0.0718, "step": 14130 }, { "epoch": 0.025716162853875724, "grad_norm": 0.1861123889684677, "learning_rate": 0.0002, "loss": 0.06, "step": 14140 }, { "epoch": 0.02573434967343292, "grad_norm": 0.010294788517057896, "learning_rate": 0.0002, "loss": 0.0183, "step": 14150 }, { "epoch": 0.025752536492990117, "grad_norm": 0.0643458440899849, "learning_rate": 0.0002, "loss": 0.1594, "step": 14160 }, { "epoch": 0.025770723312547313, "grad_norm": 0.10639938712120056, "learning_rate": 0.0002, "loss": 0.0761, "step": 14170 }, { "epoch": 0.02578891013210451, "grad_norm": 0.056529924273490906, "learning_rate": 0.0002, "loss": 0.082, "step": 14180 }, { "epoch": 0.025807096951661706, "grad_norm": 0.18884658813476562, "learning_rate": 0.0002, "loss": 0.0683, "step": 14190 }, { "epoch": 0.025825283771218902, "grad_norm": 0.035667784512043, "learning_rate": 0.0002, "loss": 0.0263, "step": 14200 }, { "epoch": 0.0258434705907761, "grad_norm": 0.14650103449821472, "learning_rate": 0.0002, "loss": 0.1314, "step": 14210 }, { "epoch": 0.025861657410333295, "grad_norm": 0.12219654768705368, "learning_rate": 0.0002, "loss": 0.0755, "step": 14220 }, { "epoch": 0.02587984422989049, "grad_norm": 0.05271647870540619, "learning_rate": 0.0002, "loss": 0.0789, "step": 14230 }, { "epoch": 0.025898031049447687, "grad_norm": 0.1669916957616806, "learning_rate": 0.0002, "loss": 0.0641, "step": 14240 }, { "epoch": 0.025916217869004884, "grad_norm": 0.035175371915102005, "learning_rate": 0.0002, "loss": 0.0222, "step": 14250 }, { "epoch": 0.025934404688562084, "grad_norm": 0.14658409357070923, "learning_rate": 0.0002, "loss": 0.1382, "step": 14260 }, { "epoch": 0.02595259150811928, "grad_norm": 0.07525639981031418, "learning_rate": 0.0002, "loss": 0.0815, "step": 14270 }, { "epoch": 0.025970778327676476, "grad_norm": 0.02428872510790825, "learning_rate": 0.0002, "loss": 0.076, "step": 14280 }, { "epoch": 0.025988965147233672, "grad_norm": 0.1825665533542633, "learning_rate": 0.0002, "loss": 0.0652, "step": 14290 }, { "epoch": 0.02600715196679087, "grad_norm": 0.033867619931697845, "learning_rate": 0.0002, "loss": 0.0206, "step": 14300 }, { "epoch": 0.026025338786348065, "grad_norm": 0.051891107112169266, "learning_rate": 0.0002, "loss": 0.1576, "step": 14310 }, { "epoch": 0.02604352560590526, "grad_norm": 0.1111353188753128, "learning_rate": 0.0002, "loss": 0.0889, "step": 14320 }, { "epoch": 0.026061712425462458, "grad_norm": 0.04253942146897316, "learning_rate": 0.0002, "loss": 0.079, "step": 14330 }, { "epoch": 0.026079899245019654, "grad_norm": 0.17151106894016266, "learning_rate": 0.0002, "loss": 0.0678, "step": 14340 }, { "epoch": 0.02609808606457685, "grad_norm": 0.03877005726099014, "learning_rate": 0.0002, "loss": 0.0206, "step": 14350 }, { "epoch": 0.026116272884134047, "grad_norm": 0.03517235442996025, "learning_rate": 0.0002, "loss": 0.1343, "step": 14360 }, { "epoch": 0.026134459703691243, "grad_norm": 0.08157488703727722, "learning_rate": 0.0002, "loss": 0.0764, "step": 14370 }, { "epoch": 0.02615264652324844, "grad_norm": 0.03245632350444794, "learning_rate": 0.0002, "loss": 0.0758, "step": 14380 }, { "epoch": 0.026170833342805636, "grad_norm": 0.20079655945301056, "learning_rate": 0.0002, "loss": 0.0691, "step": 14390 }, { "epoch": 0.026189020162362832, "grad_norm": 0.03477077558636665, "learning_rate": 0.0002, "loss": 0.0232, "step": 14400 }, { "epoch": 0.026207206981920028, "grad_norm": 0.14853888750076294, "learning_rate": 0.0002, "loss": 0.1436, "step": 14410 }, { "epoch": 0.026225393801477224, "grad_norm": 0.12416905164718628, "learning_rate": 0.0002, "loss": 0.0755, "step": 14420 }, { "epoch": 0.02624358062103442, "grad_norm": 0.03126871958374977, "learning_rate": 0.0002, "loss": 0.0762, "step": 14430 }, { "epoch": 0.026261767440591617, "grad_norm": 0.20726743340492249, "learning_rate": 0.0002, "loss": 0.0614, "step": 14440 }, { "epoch": 0.026279954260148813, "grad_norm": 0.039617493748664856, "learning_rate": 0.0002, "loss": 0.0181, "step": 14450 }, { "epoch": 0.02629814107970601, "grad_norm": 0.08146277070045471, "learning_rate": 0.0002, "loss": 0.132, "step": 14460 }, { "epoch": 0.026316327899263206, "grad_norm": 0.07181694358587265, "learning_rate": 0.0002, "loss": 0.0706, "step": 14470 }, { "epoch": 0.026334514718820402, "grad_norm": 0.04080040752887726, "learning_rate": 0.0002, "loss": 0.0792, "step": 14480 }, { "epoch": 0.0263527015383776, "grad_norm": 0.1903056502342224, "learning_rate": 0.0002, "loss": 0.0647, "step": 14490 }, { "epoch": 0.026370888357934795, "grad_norm": 0.027256207540631294, "learning_rate": 0.0002, "loss": 0.0202, "step": 14500 }, { "epoch": 0.02638907517749199, "grad_norm": 0.1434287130832672, "learning_rate": 0.0002, "loss": 0.1262, "step": 14510 }, { "epoch": 0.026407261997049188, "grad_norm": 0.06977452337741852, "learning_rate": 0.0002, "loss": 0.0722, "step": 14520 }, { "epoch": 0.026425448816606384, "grad_norm": 0.03453589975833893, "learning_rate": 0.0002, "loss": 0.0778, "step": 14530 }, { "epoch": 0.02644363563616358, "grad_norm": 0.1455768346786499, "learning_rate": 0.0002, "loss": 0.0678, "step": 14540 }, { "epoch": 0.026461822455720777, "grad_norm": 0.02977900207042694, "learning_rate": 0.0002, "loss": 0.0227, "step": 14550 }, { "epoch": 0.026480009275277973, "grad_norm": 0.06667467951774597, "learning_rate": 0.0002, "loss": 0.1345, "step": 14560 }, { "epoch": 0.02649819609483517, "grad_norm": 0.05125528201460838, "learning_rate": 0.0002, "loss": 0.078, "step": 14570 }, { "epoch": 0.026516382914392365, "grad_norm": 0.02796974405646324, "learning_rate": 0.0002, "loss": 0.0782, "step": 14580 }, { "epoch": 0.026534569733949562, "grad_norm": 0.18518763780593872, "learning_rate": 0.0002, "loss": 0.0722, "step": 14590 }, { "epoch": 0.02655275655350676, "grad_norm": 0.01827179454267025, "learning_rate": 0.0002, "loss": 0.0193, "step": 14600 }, { "epoch": 0.026570943373063958, "grad_norm": 0.1146678775548935, "learning_rate": 0.0002, "loss": 0.1651, "step": 14610 }, { "epoch": 0.026589130192621154, "grad_norm": 3.385193109512329, "learning_rate": 0.0002, "loss": 0.2165, "step": 14620 }, { "epoch": 0.02660731701217835, "grad_norm": 0.3052279055118561, "learning_rate": 0.0002, "loss": 0.1489, "step": 14630 }, { "epoch": 0.026625503831735547, "grad_norm": 0.12762853503227234, "learning_rate": 0.0002, "loss": 0.0693, "step": 14640 }, { "epoch": 0.026643690651292743, "grad_norm": 0.003925936296582222, "learning_rate": 0.0002, "loss": 0.0078, "step": 14650 }, { "epoch": 0.02666187747084994, "grad_norm": 0.28632932901382446, "learning_rate": 0.0002, "loss": 0.2533, "step": 14660 }, { "epoch": 0.026680064290407136, "grad_norm": 0.037552788853645325, "learning_rate": 0.0002, "loss": 0.0852, "step": 14670 }, { "epoch": 0.026698251109964332, "grad_norm": 0.0911126434803009, "learning_rate": 0.0002, "loss": 0.0751, "step": 14680 }, { "epoch": 0.02671643792952153, "grad_norm": 0.18434865772724152, "learning_rate": 0.0002, "loss": 0.084, "step": 14690 }, { "epoch": 0.026734624749078725, "grad_norm": 0.03813793510198593, "learning_rate": 0.0002, "loss": 0.0165, "step": 14700 }, { "epoch": 0.02675281156863592, "grad_norm": 0.04764392226934433, "learning_rate": 0.0002, "loss": 0.1642, "step": 14710 }, { "epoch": 0.026770998388193117, "grad_norm": 0.04611713066697121, "learning_rate": 0.0002, "loss": 0.088, "step": 14720 }, { "epoch": 0.026789185207750314, "grad_norm": 0.07171179354190826, "learning_rate": 0.0002, "loss": 0.1417, "step": 14730 }, { "epoch": 0.02680737202730751, "grad_norm": 0.14135649800300598, "learning_rate": 0.0002, "loss": 0.0692, "step": 14740 }, { "epoch": 0.026825558846864706, "grad_norm": 0.004508219193667173, "learning_rate": 0.0002, "loss": 0.016, "step": 14750 }, { "epoch": 0.026843745666421902, "grad_norm": 0.09732682257890701, "learning_rate": 0.0002, "loss": 0.2089, "step": 14760 }, { "epoch": 0.0268619324859791, "grad_norm": 0.12676575779914856, "learning_rate": 0.0002, "loss": 0.0849, "step": 14770 }, { "epoch": 0.026880119305536295, "grad_norm": 0.0696650817990303, "learning_rate": 0.0002, "loss": 0.08, "step": 14780 }, { "epoch": 0.02689830612509349, "grad_norm": 0.17883484065532684, "learning_rate": 0.0002, "loss": 0.0682, "step": 14790 }, { "epoch": 0.026916492944650688, "grad_norm": 0.0567975677549839, "learning_rate": 0.0002, "loss": 0.0149, "step": 14800 }, { "epoch": 0.026934679764207884, "grad_norm": 0.4884565472602844, "learning_rate": 0.0002, "loss": 0.6381, "step": 14810 }, { "epoch": 0.02695286658376508, "grad_norm": 0.0742981806397438, "learning_rate": 0.0002, "loss": 0.0845, "step": 14820 }, { "epoch": 0.026971053403322277, "grad_norm": 0.030466781929135323, "learning_rate": 0.0002, "loss": 0.0818, "step": 14830 }, { "epoch": 0.026989240222879473, "grad_norm": 0.13108357787132263, "learning_rate": 0.0002, "loss": 0.065, "step": 14840 }, { "epoch": 0.02700742704243667, "grad_norm": 0.019065184518694878, "learning_rate": 0.0002, "loss": 0.0168, "step": 14850 }, { "epoch": 0.027025613861993866, "grad_norm": 0.21891777217388153, "learning_rate": 0.0002, "loss": 0.1456, "step": 14860 }, { "epoch": 0.027043800681551062, "grad_norm": 0.0836934968829155, "learning_rate": 0.0002, "loss": 0.0781, "step": 14870 }, { "epoch": 0.027061987501108258, "grad_norm": 0.0643845945596695, "learning_rate": 0.0002, "loss": 0.0795, "step": 14880 }, { "epoch": 0.027080174320665455, "grad_norm": 0.27108556032180786, "learning_rate": 0.0002, "loss": 0.0722, "step": 14890 }, { "epoch": 0.02709836114022265, "grad_norm": 0.008289041928946972, "learning_rate": 0.0002, "loss": 0.0201, "step": 14900 }, { "epoch": 0.027116547959779847, "grad_norm": 0.03284185752272606, "learning_rate": 0.0002, "loss": 0.1509, "step": 14910 }, { "epoch": 0.027134734779337043, "grad_norm": 0.051129039376974106, "learning_rate": 0.0002, "loss": 0.0831, "step": 14920 }, { "epoch": 0.02715292159889424, "grad_norm": 0.046401191502809525, "learning_rate": 0.0002, "loss": 0.0694, "step": 14930 }, { "epoch": 0.027171108418451436, "grad_norm": 0.19945313036441803, "learning_rate": 0.0002, "loss": 0.0734, "step": 14940 }, { "epoch": 0.027189295238008636, "grad_norm": 0.03877973556518555, "learning_rate": 0.0002, "loss": 0.026, "step": 14950 }, { "epoch": 0.027207482057565832, "grad_norm": 0.19090695679187775, "learning_rate": 0.0002, "loss": 0.136, "step": 14960 }, { "epoch": 0.02722566887712303, "grad_norm": 0.11352288722991943, "learning_rate": 0.0002, "loss": 0.0797, "step": 14970 }, { "epoch": 0.027243855696680225, "grad_norm": 0.055218834429979324, "learning_rate": 0.0002, "loss": 0.0763, "step": 14980 }, { "epoch": 0.02726204251623742, "grad_norm": 0.1060803234577179, "learning_rate": 0.0002, "loss": 0.059, "step": 14990 }, { "epoch": 0.027280229335794617, "grad_norm": 0.03370797634124756, "learning_rate": 0.0002, "loss": 0.0172, "step": 15000 }, { "epoch": 0.027298416155351814, "grad_norm": 0.19884982705116272, "learning_rate": 0.0002, "loss": 0.1408, "step": 15010 }, { "epoch": 0.02731660297490901, "grad_norm": 0.1186273992061615, "learning_rate": 0.0002, "loss": 0.0769, "step": 15020 }, { "epoch": 0.027334789794466206, "grad_norm": 0.0494297556579113, "learning_rate": 0.0002, "loss": 0.0818, "step": 15030 }, { "epoch": 0.027352976614023403, "grad_norm": 0.17990480363368988, "learning_rate": 0.0002, "loss": 0.06, "step": 15040 }, { "epoch": 0.0273711634335806, "grad_norm": 0.015269913710653782, "learning_rate": 0.0002, "loss": 0.0143, "step": 15050 }, { "epoch": 0.027389350253137795, "grad_norm": 0.1387794464826584, "learning_rate": 0.0002, "loss": 0.171, "step": 15060 }, { "epoch": 0.02740753707269499, "grad_norm": 0.11648393422365189, "learning_rate": 0.0002, "loss": 0.0769, "step": 15070 }, { "epoch": 0.027425723892252188, "grad_norm": 0.04039733111858368, "learning_rate": 0.0002, "loss": 0.0707, "step": 15080 }, { "epoch": 0.027443910711809384, "grad_norm": 0.19274230301380157, "learning_rate": 0.0002, "loss": 0.0657, "step": 15090 }, { "epoch": 0.02746209753136658, "grad_norm": 0.03266929090023041, "learning_rate": 0.0002, "loss": 0.0155, "step": 15100 }, { "epoch": 0.027480284350923777, "grad_norm": 0.44524702429771423, "learning_rate": 0.0002, "loss": 0.3075, "step": 15110 }, { "epoch": 0.027498471170480973, "grad_norm": 0.15604422986507416, "learning_rate": 0.0002, "loss": 0.0874, "step": 15120 }, { "epoch": 0.02751665799003817, "grad_norm": 0.043061114847660065, "learning_rate": 0.0002, "loss": 0.0814, "step": 15130 }, { "epoch": 0.027534844809595366, "grad_norm": 0.2331482172012329, "learning_rate": 0.0002, "loss": 0.0638, "step": 15140 }, { "epoch": 0.027553031629152562, "grad_norm": 0.011037157848477364, "learning_rate": 0.0002, "loss": 0.0197, "step": 15150 }, { "epoch": 0.02757121844870976, "grad_norm": 0.0758776143193245, "learning_rate": 0.0002, "loss": 0.1481, "step": 15160 }, { "epoch": 0.027589405268266955, "grad_norm": 0.18878699839115143, "learning_rate": 0.0002, "loss": 0.083, "step": 15170 }, { "epoch": 0.02760759208782415, "grad_norm": 0.042469121515750885, "learning_rate": 0.0002, "loss": 0.0799, "step": 15180 }, { "epoch": 0.027625778907381347, "grad_norm": 0.1603335440158844, "learning_rate": 0.0002, "loss": 0.0579, "step": 15190 }, { "epoch": 0.027643965726938544, "grad_norm": 0.03533349186182022, "learning_rate": 0.0002, "loss": 0.0195, "step": 15200 }, { "epoch": 0.02766215254649574, "grad_norm": 0.2014724314212799, "learning_rate": 0.0002, "loss": 0.1443, "step": 15210 }, { "epoch": 0.027680339366052936, "grad_norm": 0.04604899883270264, "learning_rate": 0.0002, "loss": 0.0701, "step": 15220 }, { "epoch": 0.027698526185610133, "grad_norm": 0.04726789519190788, "learning_rate": 0.0002, "loss": 0.078, "step": 15230 }, { "epoch": 0.02771671300516733, "grad_norm": 0.16189764440059662, "learning_rate": 0.0002, "loss": 0.0686, "step": 15240 }, { "epoch": 0.027734899824724525, "grad_norm": 0.018077973276376724, "learning_rate": 0.0002, "loss": 0.0155, "step": 15250 }, { "epoch": 0.02775308664428172, "grad_norm": 0.09486963599920273, "learning_rate": 0.0002, "loss": 0.1695, "step": 15260 }, { "epoch": 0.027771273463838918, "grad_norm": 0.19950449466705322, "learning_rate": 0.0002, "loss": 0.0784, "step": 15270 }, { "epoch": 0.027789460283396114, "grad_norm": 0.03350493311882019, "learning_rate": 0.0002, "loss": 0.0797, "step": 15280 }, { "epoch": 0.027807647102953314, "grad_norm": 0.14408868551254272, "learning_rate": 0.0002, "loss": 0.0624, "step": 15290 }, { "epoch": 0.02782583392251051, "grad_norm": 0.03824521601200104, "learning_rate": 0.0002, "loss": 0.0182, "step": 15300 }, { "epoch": 0.027844020742067706, "grad_norm": 0.051167964935302734, "learning_rate": 0.0002, "loss": 0.1342, "step": 15310 }, { "epoch": 0.027862207561624903, "grad_norm": 0.08440420031547546, "learning_rate": 0.0002, "loss": 0.0775, "step": 15320 }, { "epoch": 0.0278803943811821, "grad_norm": 0.05162487551569939, "learning_rate": 0.0002, "loss": 0.0824, "step": 15330 }, { "epoch": 0.027898581200739295, "grad_norm": 0.1576220989227295, "learning_rate": 0.0002, "loss": 0.0607, "step": 15340 }, { "epoch": 0.02791676802029649, "grad_norm": 0.03840797394514084, "learning_rate": 0.0002, "loss": 0.0197, "step": 15350 }, { "epoch": 0.027934954839853688, "grad_norm": 0.1418246179819107, "learning_rate": 0.0002, "loss": 0.151, "step": 15360 }, { "epoch": 0.027953141659410884, "grad_norm": 0.07326096296310425, "learning_rate": 0.0002, "loss": 0.0764, "step": 15370 }, { "epoch": 0.02797132847896808, "grad_norm": 0.0582844614982605, "learning_rate": 0.0002, "loss": 0.0745, "step": 15380 }, { "epoch": 0.027989515298525277, "grad_norm": 0.2234935164451599, "learning_rate": 0.0002, "loss": 0.0687, "step": 15390 }, { "epoch": 0.028007702118082473, "grad_norm": 0.04384669288992882, "learning_rate": 0.0002, "loss": 0.023, "step": 15400 }, { "epoch": 0.02802588893763967, "grad_norm": 0.14306089282035828, "learning_rate": 0.0002, "loss": 0.1477, "step": 15410 }, { "epoch": 0.028044075757196866, "grad_norm": 0.1326105296611786, "learning_rate": 0.0002, "loss": 0.0784, "step": 15420 }, { "epoch": 0.028062262576754062, "grad_norm": 0.05531894043087959, "learning_rate": 0.0002, "loss": 0.0813, "step": 15430 }, { "epoch": 0.02808044939631126, "grad_norm": 0.14875297248363495, "learning_rate": 0.0002, "loss": 0.0622, "step": 15440 }, { "epoch": 0.028098636215868455, "grad_norm": 0.03749268501996994, "learning_rate": 0.0002, "loss": 0.0181, "step": 15450 }, { "epoch": 0.02811682303542565, "grad_norm": 0.05747106671333313, "learning_rate": 0.0002, "loss": 0.1157, "step": 15460 }, { "epoch": 0.028135009854982847, "grad_norm": 0.06197863444685936, "learning_rate": 0.0002, "loss": 0.0779, "step": 15470 }, { "epoch": 0.028153196674540044, "grad_norm": 0.09997677057981491, "learning_rate": 0.0002, "loss": 0.0784, "step": 15480 }, { "epoch": 0.02817138349409724, "grad_norm": 0.18067684769630432, "learning_rate": 0.0002, "loss": 0.0728, "step": 15490 }, { "epoch": 0.028189570313654436, "grad_norm": 0.03378088399767876, "learning_rate": 0.0002, "loss": 0.0252, "step": 15500 }, { "epoch": 0.028207757133211633, "grad_norm": 0.14048723876476288, "learning_rate": 0.0002, "loss": 0.1392, "step": 15510 }, { "epoch": 0.02822594395276883, "grad_norm": 0.09573493152856827, "learning_rate": 0.0002, "loss": 0.0751, "step": 15520 }, { "epoch": 0.028244130772326025, "grad_norm": 0.11000777781009674, "learning_rate": 0.0002, "loss": 0.08, "step": 15530 }, { "epoch": 0.02826231759188322, "grad_norm": 0.17712855339050293, "learning_rate": 0.0002, "loss": 0.0658, "step": 15540 }, { "epoch": 0.028280504411440418, "grad_norm": 0.0183733981102705, "learning_rate": 0.0002, "loss": 0.0188, "step": 15550 }, { "epoch": 0.028298691230997614, "grad_norm": 0.15027762949466705, "learning_rate": 0.0002, "loss": 0.1235, "step": 15560 }, { "epoch": 0.02831687805055481, "grad_norm": 0.10586661100387573, "learning_rate": 0.0002, "loss": 0.0791, "step": 15570 }, { "epoch": 0.028335064870112007, "grad_norm": 0.031083540990948677, "learning_rate": 0.0002, "loss": 0.0765, "step": 15580 }, { "epoch": 0.028353251689669203, "grad_norm": 0.12294827401638031, "learning_rate": 0.0002, "loss": 0.0615, "step": 15590 }, { "epoch": 0.0283714385092264, "grad_norm": 0.03652534633874893, "learning_rate": 0.0002, "loss": 0.0203, "step": 15600 }, { "epoch": 0.028389625328783596, "grad_norm": 0.046638645231723785, "learning_rate": 0.0002, "loss": 0.1327, "step": 15610 }, { "epoch": 0.028407812148340792, "grad_norm": 0.07200415432453156, "learning_rate": 0.0002, "loss": 0.0765, "step": 15620 }, { "epoch": 0.028425998967897992, "grad_norm": 0.040679559111595154, "learning_rate": 0.0002, "loss": 0.0812, "step": 15630 }, { "epoch": 0.028444185787455188, "grad_norm": 0.1572960615158081, "learning_rate": 0.0002, "loss": 0.0637, "step": 15640 }, { "epoch": 0.028462372607012384, "grad_norm": 0.036091506481170654, "learning_rate": 0.0002, "loss": 0.0266, "step": 15650 }, { "epoch": 0.02848055942656958, "grad_norm": 0.10555437207221985, "learning_rate": 0.0002, "loss": 0.1093, "step": 15660 }, { "epoch": 0.028498746246126777, "grad_norm": 0.08854329586029053, "learning_rate": 0.0002, "loss": 0.0741, "step": 15670 }, { "epoch": 0.028516933065683973, "grad_norm": 0.02908560261130333, "learning_rate": 0.0002, "loss": 0.0732, "step": 15680 }, { "epoch": 0.02853511988524117, "grad_norm": 0.1568380743265152, "learning_rate": 0.0002, "loss": 0.0586, "step": 15690 }, { "epoch": 0.028553306704798366, "grad_norm": 0.04985487833619118, "learning_rate": 0.0002, "loss": 0.0247, "step": 15700 }, { "epoch": 0.028571493524355562, "grad_norm": 0.07582605630159378, "learning_rate": 0.0002, "loss": 0.1196, "step": 15710 }, { "epoch": 0.02858968034391276, "grad_norm": 0.02401849813759327, "learning_rate": 0.0002, "loss": 0.075, "step": 15720 }, { "epoch": 0.028607867163469955, "grad_norm": 0.032545965164899826, "learning_rate": 0.0002, "loss": 0.0761, "step": 15730 }, { "epoch": 0.02862605398302715, "grad_norm": 0.1098649650812149, "learning_rate": 0.0002, "loss": 0.0599, "step": 15740 }, { "epoch": 0.028644240802584348, "grad_norm": 0.021166007965803146, "learning_rate": 0.0002, "loss": 0.0169, "step": 15750 }, { "epoch": 0.028662427622141544, "grad_norm": 0.0823541134595871, "learning_rate": 0.0002, "loss": 0.1337, "step": 15760 }, { "epoch": 0.02868061444169874, "grad_norm": 0.1009572371840477, "learning_rate": 0.0002, "loss": 0.0779, "step": 15770 }, { "epoch": 0.028698801261255937, "grad_norm": 0.09160738438367844, "learning_rate": 0.0002, "loss": 0.0766, "step": 15780 }, { "epoch": 0.028716988080813133, "grad_norm": 0.14419673383235931, "learning_rate": 0.0002, "loss": 0.0594, "step": 15790 }, { "epoch": 0.02873517490037033, "grad_norm": 0.01628550887107849, "learning_rate": 0.0002, "loss": 0.0218, "step": 15800 }, { "epoch": 0.028753361719927525, "grad_norm": 0.15207678079605103, "learning_rate": 0.0002, "loss": 0.1262, "step": 15810 }, { "epoch": 0.028771548539484722, "grad_norm": 0.14951761066913605, "learning_rate": 0.0002, "loss": 0.0812, "step": 15820 }, { "epoch": 0.028789735359041918, "grad_norm": 0.028078215196728706, "learning_rate": 0.0002, "loss": 0.0783, "step": 15830 }, { "epoch": 0.028807922178599114, "grad_norm": 0.16079741716384888, "learning_rate": 0.0002, "loss": 0.0633, "step": 15840 }, { "epoch": 0.02882610899815631, "grad_norm": 0.04218870773911476, "learning_rate": 0.0002, "loss": 0.0217, "step": 15850 }, { "epoch": 0.028844295817713507, "grad_norm": 0.13758492469787598, "learning_rate": 0.0002, "loss": 0.1358, "step": 15860 }, { "epoch": 0.028862482637270703, "grad_norm": 0.10366559028625488, "learning_rate": 0.0002, "loss": 0.0787, "step": 15870 }, { "epoch": 0.0288806694568279, "grad_norm": 0.04433147609233856, "learning_rate": 0.0002, "loss": 0.0781, "step": 15880 }, { "epoch": 0.028898856276385096, "grad_norm": 0.16709402203559875, "learning_rate": 0.0002, "loss": 0.0684, "step": 15890 }, { "epoch": 0.028917043095942292, "grad_norm": 0.03370310738682747, "learning_rate": 0.0002, "loss": 0.0191, "step": 15900 }, { "epoch": 0.02893522991549949, "grad_norm": 0.15469267964363098, "learning_rate": 0.0002, "loss": 0.1487, "step": 15910 }, { "epoch": 0.028953416735056685, "grad_norm": 0.19974654912948608, "learning_rate": 0.0002, "loss": 0.0769, "step": 15920 }, { "epoch": 0.02897160355461388, "grad_norm": 0.04307623952627182, "learning_rate": 0.0002, "loss": 0.075, "step": 15930 }, { "epoch": 0.028989790374171077, "grad_norm": 0.21828149259090424, "learning_rate": 0.0002, "loss": 0.0691, "step": 15940 }, { "epoch": 0.029007977193728274, "grad_norm": 0.0268656387925148, "learning_rate": 0.0002, "loss": 0.022, "step": 15950 }, { "epoch": 0.02902616401328547, "grad_norm": 0.11213699728250504, "learning_rate": 0.0002, "loss": 0.1326, "step": 15960 }, { "epoch": 0.029044350832842666, "grad_norm": 0.2018963098526001, "learning_rate": 0.0002, "loss": 0.0772, "step": 15970 }, { "epoch": 0.029062537652399866, "grad_norm": 0.06034110113978386, "learning_rate": 0.0002, "loss": 0.0712, "step": 15980 }, { "epoch": 0.029080724471957062, "grad_norm": 0.1817707121372223, "learning_rate": 0.0002, "loss": 0.0692, "step": 15990 }, { "epoch": 0.02909891129151426, "grad_norm": 0.03466440737247467, "learning_rate": 0.0002, "loss": 0.0205, "step": 16000 }, { "epoch": 0.029117098111071455, "grad_norm": 0.1375580132007599, "learning_rate": 0.0002, "loss": 0.1499, "step": 16010 }, { "epoch": 0.02913528493062865, "grad_norm": 0.14308910071849823, "learning_rate": 0.0002, "loss": 0.083, "step": 16020 }, { "epoch": 0.029153471750185848, "grad_norm": 0.041022926568984985, "learning_rate": 0.0002, "loss": 0.0785, "step": 16030 }, { "epoch": 0.029171658569743044, "grad_norm": 0.1701498180627823, "learning_rate": 0.0002, "loss": 0.0656, "step": 16040 }, { "epoch": 0.02918984538930024, "grad_norm": 0.023075805976986885, "learning_rate": 0.0002, "loss": 0.0225, "step": 16050 }, { "epoch": 0.029208032208857437, "grad_norm": 0.05303549766540527, "learning_rate": 0.0002, "loss": 0.1369, "step": 16060 }, { "epoch": 0.029226219028414633, "grad_norm": 0.044178470969200134, "learning_rate": 0.0002, "loss": 0.0754, "step": 16070 }, { "epoch": 0.02924440584797183, "grad_norm": 0.03951259329915047, "learning_rate": 0.0002, "loss": 0.0759, "step": 16080 }, { "epoch": 0.029262592667529026, "grad_norm": 0.13762067258358002, "learning_rate": 0.0002, "loss": 0.0605, "step": 16090 }, { "epoch": 0.029280779487086222, "grad_norm": 0.021227868273854256, "learning_rate": 0.0002, "loss": 0.0173, "step": 16100 }, { "epoch": 0.029298966306643418, "grad_norm": 0.19493195414543152, "learning_rate": 0.0002, "loss": 0.1307, "step": 16110 }, { "epoch": 0.029317153126200615, "grad_norm": 0.09980791062116623, "learning_rate": 0.0002, "loss": 0.0724, "step": 16120 }, { "epoch": 0.02933533994575781, "grad_norm": 0.08762095868587494, "learning_rate": 0.0002, "loss": 0.0734, "step": 16130 }, { "epoch": 0.029353526765315007, "grad_norm": 0.14261308312416077, "learning_rate": 0.0002, "loss": 0.071, "step": 16140 }, { "epoch": 0.029371713584872203, "grad_norm": 0.033154651522636414, "learning_rate": 0.0002, "loss": 0.0238, "step": 16150 }, { "epoch": 0.0293899004044294, "grad_norm": 0.1422877162694931, "learning_rate": 0.0002, "loss": 0.1285, "step": 16160 }, { "epoch": 0.029408087223986596, "grad_norm": 0.1342266947031021, "learning_rate": 0.0002, "loss": 0.0765, "step": 16170 }, { "epoch": 0.029426274043543792, "grad_norm": 0.031525906175374985, "learning_rate": 0.0002, "loss": 0.0772, "step": 16180 }, { "epoch": 0.02944446086310099, "grad_norm": 0.14790122210979462, "learning_rate": 0.0002, "loss": 0.0627, "step": 16190 }, { "epoch": 0.029462647682658185, "grad_norm": 0.025354932993650436, "learning_rate": 0.0002, "loss": 0.0212, "step": 16200 }, { "epoch": 0.02948083450221538, "grad_norm": 0.1287624090909958, "learning_rate": 0.0002, "loss": 0.1457, "step": 16210 }, { "epoch": 0.029499021321772578, "grad_norm": 0.1079782247543335, "learning_rate": 0.0002, "loss": 0.0819, "step": 16220 }, { "epoch": 0.029517208141329774, "grad_norm": 0.04884497448801994, "learning_rate": 0.0002, "loss": 0.0843, "step": 16230 }, { "epoch": 0.02953539496088697, "grad_norm": 0.14452646672725677, "learning_rate": 0.0002, "loss": 0.0664, "step": 16240 }, { "epoch": 0.029553581780444167, "grad_norm": 0.029236188158392906, "learning_rate": 0.0002, "loss": 0.0182, "step": 16250 }, { "epoch": 0.029571768600001363, "grad_norm": 0.18048252165317535, "learning_rate": 0.0002, "loss": 0.1382, "step": 16260 }, { "epoch": 0.02958995541955856, "grad_norm": 0.08402508497238159, "learning_rate": 0.0002, "loss": 0.078, "step": 16270 }, { "epoch": 0.029608142239115755, "grad_norm": 0.07740433514118195, "learning_rate": 0.0002, "loss": 0.0776, "step": 16280 }, { "epoch": 0.029626329058672952, "grad_norm": 0.1414123773574829, "learning_rate": 0.0002, "loss": 0.0611, "step": 16290 }, { "epoch": 0.029644515878230148, "grad_norm": 0.03296574577689171, "learning_rate": 0.0002, "loss": 0.0228, "step": 16300 }, { "epoch": 0.029662702697787344, "grad_norm": 0.09312735497951508, "learning_rate": 0.0002, "loss": 0.1213, "step": 16310 }, { "epoch": 0.029680889517344544, "grad_norm": 0.07857484370470047, "learning_rate": 0.0002, "loss": 0.0812, "step": 16320 }, { "epoch": 0.02969907633690174, "grad_norm": 0.0680379793047905, "learning_rate": 0.0002, "loss": 0.0774, "step": 16330 }, { "epoch": 0.029717263156458937, "grad_norm": 0.18506748974323273, "learning_rate": 0.0002, "loss": 0.0675, "step": 16340 }, { "epoch": 0.029735449976016133, "grad_norm": 0.029233543202280998, "learning_rate": 0.0002, "loss": 0.0187, "step": 16350 }, { "epoch": 0.02975363679557333, "grad_norm": 0.1133171021938324, "learning_rate": 0.0002, "loss": 0.1217, "step": 16360 }, { "epoch": 0.029771823615130526, "grad_norm": 0.06985988467931747, "learning_rate": 0.0002, "loss": 0.0761, "step": 16370 }, { "epoch": 0.029790010434687722, "grad_norm": 0.13158757984638214, "learning_rate": 0.0002, "loss": 0.0764, "step": 16380 }, { "epoch": 0.02980819725424492, "grad_norm": 0.19751304388046265, "learning_rate": 0.0002, "loss": 0.0652, "step": 16390 }, { "epoch": 0.029826384073802115, "grad_norm": 0.019567493349313736, "learning_rate": 0.0002, "loss": 0.0166, "step": 16400 }, { "epoch": 0.02984457089335931, "grad_norm": 0.1859702467918396, "learning_rate": 0.0002, "loss": 0.1482, "step": 16410 }, { "epoch": 0.029862757712916507, "grad_norm": 0.03211350366473198, "learning_rate": 0.0002, "loss": 0.073, "step": 16420 }, { "epoch": 0.029880944532473704, "grad_norm": 0.10664219409227371, "learning_rate": 0.0002, "loss": 0.075, "step": 16430 }, { "epoch": 0.0298991313520309, "grad_norm": 0.18254978954792023, "learning_rate": 0.0002, "loss": 0.0666, "step": 16440 }, { "epoch": 0.029917318171588096, "grad_norm": 0.03076091594994068, "learning_rate": 0.0002, "loss": 0.0217, "step": 16450 }, { "epoch": 0.029935504991145293, "grad_norm": 0.11172248423099518, "learning_rate": 0.0002, "loss": 0.1115, "step": 16460 }, { "epoch": 0.02995369181070249, "grad_norm": 0.1121174767613411, "learning_rate": 0.0002, "loss": 0.0838, "step": 16470 }, { "epoch": 0.029971878630259685, "grad_norm": 0.05544061213731766, "learning_rate": 0.0002, "loss": 0.0773, "step": 16480 }, { "epoch": 0.02999006544981688, "grad_norm": 0.13899610936641693, "learning_rate": 0.0002, "loss": 0.0648, "step": 16490 }, { "epoch": 0.030008252269374078, "grad_norm": 0.031017031520605087, "learning_rate": 0.0002, "loss": 0.0205, "step": 16500 }, { "epoch": 0.030026439088931274, "grad_norm": 0.5919166803359985, "learning_rate": 0.0002, "loss": 0.1454, "step": 16510 }, { "epoch": 0.03004462590848847, "grad_norm": 2.5127646923065186, "learning_rate": 0.0002, "loss": 0.0925, "step": 16520 }, { "epoch": 0.030062812728045667, "grad_norm": 0.12587642669677734, "learning_rate": 0.0002, "loss": 0.0896, "step": 16530 }, { "epoch": 0.030080999547602863, "grad_norm": 0.29352524876594543, "learning_rate": 0.0002, "loss": 0.0692, "step": 16540 }, { "epoch": 0.03009918636716006, "grad_norm": 0.012585405260324478, "learning_rate": 0.0002, "loss": 0.021, "step": 16550 }, { "epoch": 0.030117373186717256, "grad_norm": 2.432018756866455, "learning_rate": 0.0002, "loss": 0.239, "step": 16560 }, { "epoch": 0.030135560006274452, "grad_norm": 0.09337054193019867, "learning_rate": 0.0002, "loss": 0.0859, "step": 16570 }, { "epoch": 0.030153746825831648, "grad_norm": 0.05135548114776611, "learning_rate": 0.0002, "loss": 0.0794, "step": 16580 }, { "epoch": 0.030171933645388845, "grad_norm": 0.15056684613227844, "learning_rate": 0.0002, "loss": 0.0697, "step": 16590 }, { "epoch": 0.03019012046494604, "grad_norm": 5.883757694391534e-05, "learning_rate": 0.0002, "loss": 0.0085, "step": 16600 }, { "epoch": 0.030208307284503237, "grad_norm": 1.0368543863296509, "learning_rate": 0.0002, "loss": 0.1861, "step": 16610 }, { "epoch": 0.030226494104060433, "grad_norm": 0.07987317442893982, "learning_rate": 0.0002, "loss": 0.0938, "step": 16620 }, { "epoch": 0.03024468092361763, "grad_norm": 0.02812887355685234, "learning_rate": 0.0002, "loss": 0.0753, "step": 16630 }, { "epoch": 0.030262867743174826, "grad_norm": 0.24061231315135956, "learning_rate": 0.0002, "loss": 0.0653, "step": 16640 }, { "epoch": 0.030281054562732022, "grad_norm": 0.0402507558465004, "learning_rate": 0.0002, "loss": 0.0266, "step": 16650 }, { "epoch": 0.030299241382289222, "grad_norm": 0.13552093505859375, "learning_rate": 0.0002, "loss": 0.1709, "step": 16660 }, { "epoch": 0.03031742820184642, "grad_norm": 0.6093604564666748, "learning_rate": 0.0002, "loss": 0.0857, "step": 16670 }, { "epoch": 0.030335615021403615, "grad_norm": 0.11608528345823288, "learning_rate": 0.0002, "loss": 0.0874, "step": 16680 }, { "epoch": 0.03035380184096081, "grad_norm": 0.23376339673995972, "learning_rate": 0.0002, "loss": 0.0688, "step": 16690 }, { "epoch": 0.030371988660518007, "grad_norm": 0.03484225273132324, "learning_rate": 0.0002, "loss": 0.0172, "step": 16700 }, { "epoch": 0.030390175480075204, "grad_norm": 0.30532532930374146, "learning_rate": 0.0002, "loss": 0.1686, "step": 16710 }, { "epoch": 0.0304083622996324, "grad_norm": 0.05142231658101082, "learning_rate": 0.0002, "loss": 0.0766, "step": 16720 }, { "epoch": 0.030426549119189596, "grad_norm": 0.08218207955360413, "learning_rate": 0.0002, "loss": 0.0839, "step": 16730 }, { "epoch": 0.030444735938746793, "grad_norm": 0.15296520292758942, "learning_rate": 0.0002, "loss": 0.0717, "step": 16740 }, { "epoch": 0.03046292275830399, "grad_norm": 0.009951476007699966, "learning_rate": 0.0002, "loss": 0.0103, "step": 16750 }, { "epoch": 0.030481109577861185, "grad_norm": 0.18752850592136383, "learning_rate": 0.0002, "loss": 0.2382, "step": 16760 }, { "epoch": 0.03049929639741838, "grad_norm": 0.1473335325717926, "learning_rate": 0.0002, "loss": 0.0975, "step": 16770 }, { "epoch": 0.030517483216975578, "grad_norm": 0.04578230902552605, "learning_rate": 0.0002, "loss": 0.0812, "step": 16780 }, { "epoch": 0.030535670036532774, "grad_norm": 0.2557182312011719, "learning_rate": 0.0002, "loss": 0.0691, "step": 16790 }, { "epoch": 0.03055385685608997, "grad_norm": 1.473021388053894, "learning_rate": 0.0002, "loss": 0.2088, "step": 16800 }, { "epoch": 0.030572043675647167, "grad_norm": 1.0227181911468506, "learning_rate": 0.0002, "loss": 0.7207, "step": 16810 }, { "epoch": 0.030590230495204363, "grad_norm": 0.11395780742168427, "learning_rate": 0.0002, "loss": 0.0943, "step": 16820 }, { "epoch": 0.03060841731476156, "grad_norm": 6.501937389373779, "learning_rate": 0.0002, "loss": 0.0871, "step": 16830 }, { "epoch": 0.030626604134318756, "grad_norm": 0.17187578976154327, "learning_rate": 0.0002, "loss": 0.0672, "step": 16840 }, { "epoch": 0.030644790953875952, "grad_norm": 0.03396519273519516, "learning_rate": 0.0002, "loss": 0.0224, "step": 16850 }, { "epoch": 0.03066297777343315, "grad_norm": 3.397012948989868, "learning_rate": 0.0002, "loss": 0.1641, "step": 16860 }, { "epoch": 0.030681164592990345, "grad_norm": 0.44838130474090576, "learning_rate": 0.0002, "loss": 0.0868, "step": 16870 }, { "epoch": 0.03069935141254754, "grad_norm": 0.08598771691322327, "learning_rate": 0.0002, "loss": 0.0766, "step": 16880 }, { "epoch": 0.030717538232104737, "grad_norm": 0.15339739620685577, "learning_rate": 0.0002, "loss": 0.0609, "step": 16890 }, { "epoch": 0.030735725051661934, "grad_norm": 0.04086040332913399, "learning_rate": 0.0002, "loss": 0.0218, "step": 16900 }, { "epoch": 0.03075391187121913, "grad_norm": 0.40313076972961426, "learning_rate": 0.0002, "loss": 0.2017, "step": 16910 }, { "epoch": 0.030772098690776326, "grad_norm": 0.2068721503019333, "learning_rate": 0.0002, "loss": 0.0906, "step": 16920 }, { "epoch": 0.030790285510333523, "grad_norm": 0.12770770490169525, "learning_rate": 0.0002, "loss": 0.0801, "step": 16930 }, { "epoch": 0.03080847232989072, "grad_norm": 17.294641494750977, "learning_rate": 0.0002, "loss": 0.0701, "step": 16940 }, { "epoch": 0.030826659149447915, "grad_norm": 0.04612286388874054, "learning_rate": 0.0002, "loss": 0.0287, "step": 16950 }, { "epoch": 0.03084484596900511, "grad_norm": 0.10311487317085266, "learning_rate": 0.0002, "loss": 0.136, "step": 16960 }, { "epoch": 0.030863032788562308, "grad_norm": 0.20878446102142334, "learning_rate": 0.0002, "loss": 0.0886, "step": 16970 }, { "epoch": 0.030881219608119504, "grad_norm": 1.412353515625, "learning_rate": 0.0002, "loss": 0.0843, "step": 16980 }, { "epoch": 0.0308994064276767, "grad_norm": 0.27046918869018555, "learning_rate": 0.0002, "loss": 0.0755, "step": 16990 }, { "epoch": 0.030917593247233897, "grad_norm": 0.5227788090705872, "learning_rate": 0.0002, "loss": 0.0234, "step": 17000 }, { "epoch": 0.030935780066791096, "grad_norm": 0.16006655991077423, "learning_rate": 0.0002, "loss": 0.183, "step": 17010 }, { "epoch": 0.030953966886348293, "grad_norm": 0.1297607421875, "learning_rate": 0.0002, "loss": 0.0868, "step": 17020 }, { "epoch": 0.03097215370590549, "grad_norm": 11.198999404907227, "learning_rate": 0.0002, "loss": 0.0998, "step": 17030 }, { "epoch": 0.030990340525462685, "grad_norm": 0.39887136220932007, "learning_rate": 0.0002, "loss": 0.0898, "step": 17040 }, { "epoch": 0.03100852734501988, "grad_norm": 0.009262642823159695, "learning_rate": 0.0002, "loss": 0.0215, "step": 17050 }, { "epoch": 0.031026714164577078, "grad_norm": 0.15820527076721191, "learning_rate": 0.0002, "loss": 0.2017, "step": 17060 }, { "epoch": 0.031044900984134274, "grad_norm": 0.11645558476448059, "learning_rate": 0.0002, "loss": 0.085, "step": 17070 }, { "epoch": 0.03106308780369147, "grad_norm": 0.03981775790452957, "learning_rate": 0.0002, "loss": 0.0803, "step": 17080 }, { "epoch": 0.031081274623248667, "grad_norm": 0.1584177166223526, "learning_rate": 0.0002, "loss": 0.0635, "step": 17090 }, { "epoch": 0.031099461442805863, "grad_norm": 0.0005907397717237473, "learning_rate": 0.0002, "loss": 0.006, "step": 17100 }, { "epoch": 0.03111764826236306, "grad_norm": 0.05344061553478241, "learning_rate": 0.0002, "loss": 0.3098, "step": 17110 }, { "epoch": 0.031135835081920256, "grad_norm": 0.05249408632516861, "learning_rate": 0.0002, "loss": 0.1002, "step": 17120 }, { "epoch": 0.031154021901477452, "grad_norm": 0.04177263006567955, "learning_rate": 0.0002, "loss": 0.0969, "step": 17130 }, { "epoch": 0.03117220872103465, "grad_norm": 0.18396486341953278, "learning_rate": 0.0002, "loss": 0.0727, "step": 17140 }, { "epoch": 0.031190395540591845, "grad_norm": 0.0019848416559398174, "learning_rate": 0.0002, "loss": 0.0092, "step": 17150 }, { "epoch": 0.03120858236014904, "grad_norm": 0.23747271299362183, "learning_rate": 0.0002, "loss": 0.3243, "step": 17160 }, { "epoch": 0.031226769179706237, "grad_norm": 0.2365376353263855, "learning_rate": 0.0002, "loss": 0.094, "step": 17170 }, { "epoch": 0.031244955999263434, "grad_norm": 0.21784919500350952, "learning_rate": 0.0002, "loss": 0.0795, "step": 17180 }, { "epoch": 0.03126314281882063, "grad_norm": 0.27253153920173645, "learning_rate": 0.0002, "loss": 0.0748, "step": 17190 }, { "epoch": 0.031281329638377826, "grad_norm": 0.004298684187233448, "learning_rate": 0.0002, "loss": 0.014, "step": 17200 }, { "epoch": 0.03129951645793502, "grad_norm": 0.267871230840683, "learning_rate": 0.0002, "loss": 0.2938, "step": 17210 }, { "epoch": 0.03131770327749222, "grad_norm": 0.1428530067205429, "learning_rate": 0.0002, "loss": 0.0901, "step": 17220 }, { "epoch": 0.031335890097049415, "grad_norm": 0.10623782873153687, "learning_rate": 0.0002, "loss": 0.0752, "step": 17230 }, { "epoch": 0.03135407691660661, "grad_norm": 0.2869247496128082, "learning_rate": 0.0002, "loss": 0.0707, "step": 17240 }, { "epoch": 0.03137226373616381, "grad_norm": 0.011321209371089935, "learning_rate": 0.0002, "loss": 0.0168, "step": 17250 }, { "epoch": 0.031390450555721004, "grad_norm": 0.09432020783424377, "learning_rate": 0.0002, "loss": 0.2046, "step": 17260 }, { "epoch": 0.0314086373752782, "grad_norm": 0.190867081284523, "learning_rate": 0.0002, "loss": 0.0866, "step": 17270 }, { "epoch": 0.0314268241948354, "grad_norm": 0.14274829626083374, "learning_rate": 0.0002, "loss": 0.0796, "step": 17280 }, { "epoch": 0.03144501101439259, "grad_norm": 0.29910504817962646, "learning_rate": 0.0002, "loss": 0.0711, "step": 17290 }, { "epoch": 0.03146319783394979, "grad_norm": 0.031730011105537415, "learning_rate": 0.0002, "loss": 0.0217, "step": 17300 }, { "epoch": 0.031481384653506986, "grad_norm": 0.23042625188827515, "learning_rate": 0.0002, "loss": 0.1491, "step": 17310 }, { "epoch": 0.03149957147306418, "grad_norm": 0.15560220181941986, "learning_rate": 0.0002, "loss": 0.0761, "step": 17320 }, { "epoch": 0.03151775829262138, "grad_norm": 0.051929160952568054, "learning_rate": 0.0002, "loss": 0.0893, "step": 17330 }, { "epoch": 0.031535945112178575, "grad_norm": 0.16162756085395813, "learning_rate": 0.0002, "loss": 0.0623, "step": 17340 }, { "epoch": 0.03155413193173577, "grad_norm": 0.019480068236589432, "learning_rate": 0.0002, "loss": 0.0137, "step": 17350 }, { "epoch": 0.03157231875129297, "grad_norm": 0.24700693786144257, "learning_rate": 0.0002, "loss": 0.1481, "step": 17360 }, { "epoch": 0.031590505570850164, "grad_norm": 0.17574873566627502, "learning_rate": 0.0002, "loss": 0.079, "step": 17370 }, { "epoch": 0.03160869239040736, "grad_norm": 0.10368580371141434, "learning_rate": 0.0002, "loss": 0.0811, "step": 17380 }, { "epoch": 0.031626879209964556, "grad_norm": 0.23330622911453247, "learning_rate": 0.0002, "loss": 0.0669, "step": 17390 }, { "epoch": 0.03164506602952175, "grad_norm": 0.031393859535455704, "learning_rate": 0.0002, "loss": 0.0183, "step": 17400 }, { "epoch": 0.03166325284907895, "grad_norm": 0.22080129384994507, "learning_rate": 0.0002, "loss": 0.1567, "step": 17410 }, { "epoch": 0.031681439668636145, "grad_norm": 0.177025705575943, "learning_rate": 0.0002, "loss": 0.0798, "step": 17420 }, { "epoch": 0.03169962648819334, "grad_norm": 0.054285600781440735, "learning_rate": 0.0002, "loss": 0.0709, "step": 17430 }, { "epoch": 0.03171781330775054, "grad_norm": 0.20625421404838562, "learning_rate": 0.0002, "loss": 0.0592, "step": 17440 }, { "epoch": 0.031736000127307734, "grad_norm": 0.042640089988708496, "learning_rate": 0.0002, "loss": 0.0199, "step": 17450 }, { "epoch": 0.03175418694686493, "grad_norm": 0.2505437731742859, "learning_rate": 0.0002, "loss": 0.131, "step": 17460 }, { "epoch": 0.03177237376642213, "grad_norm": 0.24848629534244537, "learning_rate": 0.0002, "loss": 0.0826, "step": 17470 }, { "epoch": 0.03179056058597932, "grad_norm": 0.056854844093322754, "learning_rate": 0.0002, "loss": 0.0779, "step": 17480 }, { "epoch": 0.03180874740553652, "grad_norm": 0.23022660613059998, "learning_rate": 0.0002, "loss": 0.0703, "step": 17490 }, { "epoch": 0.031826934225093716, "grad_norm": 0.033501993864774704, "learning_rate": 0.0002, "loss": 0.0229, "step": 17500 }, { "epoch": 0.03184512104465091, "grad_norm": 0.25061148405075073, "learning_rate": 0.0002, "loss": 0.1588, "step": 17510 }, { "epoch": 0.031863307864208115, "grad_norm": 0.21534167230129242, "learning_rate": 0.0002, "loss": 0.079, "step": 17520 }, { "epoch": 0.03188149468376531, "grad_norm": 0.04823959991335869, "learning_rate": 0.0002, "loss": 0.0826, "step": 17530 }, { "epoch": 0.03189968150332251, "grad_norm": 0.23680952191352844, "learning_rate": 0.0002, "loss": 0.0617, "step": 17540 }, { "epoch": 0.031917868322879704, "grad_norm": 0.016636351123452187, "learning_rate": 0.0002, "loss": 0.0143, "step": 17550 }, { "epoch": 0.0319360551424369, "grad_norm": 0.3684225082397461, "learning_rate": 0.0002, "loss": 0.2011, "step": 17560 }, { "epoch": 0.0319542419619941, "grad_norm": 0.07126643508672714, "learning_rate": 0.0002, "loss": 0.0792, "step": 17570 }, { "epoch": 0.03197242878155129, "grad_norm": 0.05354290455579758, "learning_rate": 0.0002, "loss": 0.0831, "step": 17580 }, { "epoch": 0.03199061560110849, "grad_norm": 0.20318995416164398, "learning_rate": 0.0002, "loss": 0.0617, "step": 17590 }, { "epoch": 0.032008802420665686, "grad_norm": 0.021502351388335228, "learning_rate": 0.0002, "loss": 0.0137, "step": 17600 }, { "epoch": 0.03202698924022288, "grad_norm": 0.3471545875072479, "learning_rate": 0.0002, "loss": 0.1823, "step": 17610 }, { "epoch": 0.03204517605978008, "grad_norm": 0.23191972076892853, "learning_rate": 0.0002, "loss": 0.0837, "step": 17620 }, { "epoch": 0.032063362879337275, "grad_norm": 0.0479818731546402, "learning_rate": 0.0002, "loss": 0.0845, "step": 17630 }, { "epoch": 0.03208154969889447, "grad_norm": 0.2193339467048645, "learning_rate": 0.0002, "loss": 0.068, "step": 17640 }, { "epoch": 0.03209973651845167, "grad_norm": 0.03661821037530899, "learning_rate": 0.0002, "loss": 0.0234, "step": 17650 }, { "epoch": 0.032117923338008864, "grad_norm": 0.10396943986415863, "learning_rate": 0.0002, "loss": 0.1295, "step": 17660 }, { "epoch": 0.03213611015756606, "grad_norm": 0.16999179124832153, "learning_rate": 0.0002, "loss": 0.0823, "step": 17670 }, { "epoch": 0.032154296977123256, "grad_norm": 0.09069819748401642, "learning_rate": 0.0002, "loss": 0.0748, "step": 17680 }, { "epoch": 0.03217248379668045, "grad_norm": 0.24210433661937714, "learning_rate": 0.0002, "loss": 0.0611, "step": 17690 }, { "epoch": 0.03219067061623765, "grad_norm": 0.028281020000576973, "learning_rate": 0.0002, "loss": 0.018, "step": 17700 }, { "epoch": 0.032208857435794845, "grad_norm": 0.4133516252040863, "learning_rate": 0.0002, "loss": 0.1704, "step": 17710 }, { "epoch": 0.03222704425535204, "grad_norm": 0.20207400619983673, "learning_rate": 0.0002, "loss": 0.0804, "step": 17720 }, { "epoch": 0.03224523107490924, "grad_norm": 0.043604232370853424, "learning_rate": 0.0002, "loss": 0.0929, "step": 17730 }, { "epoch": 0.032263417894466434, "grad_norm": 0.1995580494403839, "learning_rate": 0.0002, "loss": 0.062, "step": 17740 }, { "epoch": 0.03228160471402363, "grad_norm": 0.03241848200559616, "learning_rate": 0.0002, "loss": 0.0137, "step": 17750 }, { "epoch": 0.03229979153358083, "grad_norm": 0.28819000720977783, "learning_rate": 0.0002, "loss": 0.1696, "step": 17760 }, { "epoch": 0.03231797835313802, "grad_norm": 0.2625056803226471, "learning_rate": 0.0002, "loss": 0.0704, "step": 17770 }, { "epoch": 0.03233616517269522, "grad_norm": 0.03986202925443649, "learning_rate": 0.0002, "loss": 0.0848, "step": 17780 }, { "epoch": 0.032354351992252416, "grad_norm": 0.24770867824554443, "learning_rate": 0.0002, "loss": 0.0608, "step": 17790 }, { "epoch": 0.03237253881180961, "grad_norm": 0.031353630125522614, "learning_rate": 0.0002, "loss": 0.0145, "step": 17800 }, { "epoch": 0.03239072563136681, "grad_norm": 0.2273588478565216, "learning_rate": 0.0002, "loss": 0.1765, "step": 17810 }, { "epoch": 0.032408912450924005, "grad_norm": 0.19741755723953247, "learning_rate": 0.0002, "loss": 0.0818, "step": 17820 }, { "epoch": 0.0324270992704812, "grad_norm": 0.03193483129143715, "learning_rate": 0.0002, "loss": 0.0737, "step": 17830 }, { "epoch": 0.0324452860900384, "grad_norm": 0.13962946832180023, "learning_rate": 0.0002, "loss": 0.0575, "step": 17840 }, { "epoch": 0.03246347290959559, "grad_norm": 0.01755092851817608, "learning_rate": 0.0002, "loss": 0.0159, "step": 17850 }, { "epoch": 0.03248165972915279, "grad_norm": 0.21713244915008545, "learning_rate": 0.0002, "loss": 0.1476, "step": 17860 }, { "epoch": 0.032499846548709986, "grad_norm": 0.15362155437469482, "learning_rate": 0.0002, "loss": 0.0747, "step": 17870 }, { "epoch": 0.03251803336826718, "grad_norm": 0.02643916755914688, "learning_rate": 0.0002, "loss": 0.0793, "step": 17880 }, { "epoch": 0.03253622018782438, "grad_norm": 0.2702760100364685, "learning_rate": 0.0002, "loss": 0.0641, "step": 17890 }, { "epoch": 0.032554407007381575, "grad_norm": 0.05910428613424301, "learning_rate": 0.0002, "loss": 0.022, "step": 17900 }, { "epoch": 0.03257259382693877, "grad_norm": 0.17692551016807556, "learning_rate": 0.0002, "loss": 0.1407, "step": 17910 }, { "epoch": 0.03259078064649597, "grad_norm": 0.19877870380878448, "learning_rate": 0.0002, "loss": 0.0798, "step": 17920 }, { "epoch": 0.032608967466053164, "grad_norm": 0.06731924414634705, "learning_rate": 0.0002, "loss": 0.0798, "step": 17930 }, { "epoch": 0.03262715428561036, "grad_norm": 0.20342952013015747, "learning_rate": 0.0002, "loss": 0.0571, "step": 17940 }, { "epoch": 0.03264534110516756, "grad_norm": 0.06299301236867905, "learning_rate": 0.0002, "loss": 0.0154, "step": 17950 }, { "epoch": 0.03266352792472475, "grad_norm": 0.30317986011505127, "learning_rate": 0.0002, "loss": 0.1496, "step": 17960 }, { "epoch": 0.03268171474428195, "grad_norm": 0.2737327218055725, "learning_rate": 0.0002, "loss": 0.0777, "step": 17970 }, { "epoch": 0.032699901563839145, "grad_norm": 0.03226702660322189, "learning_rate": 0.0002, "loss": 0.0799, "step": 17980 }, { "epoch": 0.03271808838339634, "grad_norm": 0.20195341110229492, "learning_rate": 0.0002, "loss": 0.0654, "step": 17990 }, { "epoch": 0.03273627520295354, "grad_norm": 0.03351292014122009, "learning_rate": 0.0002, "loss": 0.0194, "step": 18000 }, { "epoch": 0.032754462022510734, "grad_norm": 0.2281372845172882, "learning_rate": 0.0002, "loss": 0.154, "step": 18010 }, { "epoch": 0.03277264884206793, "grad_norm": 0.19263891875743866, "learning_rate": 0.0002, "loss": 0.0803, "step": 18020 }, { "epoch": 0.03279083566162513, "grad_norm": 0.04183288663625717, "learning_rate": 0.0002, "loss": 0.0842, "step": 18030 }, { "epoch": 0.03280902248118232, "grad_norm": 0.284759521484375, "learning_rate": 0.0002, "loss": 0.067, "step": 18040 }, { "epoch": 0.03282720930073952, "grad_norm": 0.02972390688955784, "learning_rate": 0.0002, "loss": 0.016, "step": 18050 }, { "epoch": 0.032845396120296716, "grad_norm": 0.28630614280700684, "learning_rate": 0.0002, "loss": 0.1866, "step": 18060 }, { "epoch": 0.03286358293985391, "grad_norm": 0.16426514089107513, "learning_rate": 0.0002, "loss": 0.0812, "step": 18070 }, { "epoch": 0.03288176975941111, "grad_norm": 0.05643441155552864, "learning_rate": 0.0002, "loss": 0.0773, "step": 18080 }, { "epoch": 0.032899956578968305, "grad_norm": 0.19082742929458618, "learning_rate": 0.0002, "loss": 0.0582, "step": 18090 }, { "epoch": 0.0329181433985255, "grad_norm": 0.017512233927845955, "learning_rate": 0.0002, "loss": 0.0174, "step": 18100 }, { "epoch": 0.0329363302180827, "grad_norm": 0.22619640827178955, "learning_rate": 0.0002, "loss": 0.166, "step": 18110 }, { "epoch": 0.032954517037639894, "grad_norm": 0.10430974513292313, "learning_rate": 0.0002, "loss": 0.0716, "step": 18120 }, { "epoch": 0.03297270385719709, "grad_norm": 0.07371710985898972, "learning_rate": 0.0002, "loss": 0.0733, "step": 18130 }, { "epoch": 0.032990890676754286, "grad_norm": 0.19163483381271362, "learning_rate": 0.0002, "loss": 0.0609, "step": 18140 }, { "epoch": 0.03300907749631148, "grad_norm": 0.03743975609540939, "learning_rate": 0.0002, "loss": 0.017, "step": 18150 }, { "epoch": 0.03302726431586868, "grad_norm": 0.19496546685695648, "learning_rate": 0.0002, "loss": 0.1622, "step": 18160 }, { "epoch": 0.033045451135425875, "grad_norm": 0.13054883480072021, "learning_rate": 0.0002, "loss": 0.0728, "step": 18170 }, { "epoch": 0.03306363795498307, "grad_norm": 0.10058756172657013, "learning_rate": 0.0002, "loss": 0.0738, "step": 18180 }, { "epoch": 0.03308182477454027, "grad_norm": 0.220932736992836, "learning_rate": 0.0002, "loss": 0.063, "step": 18190 }, { "epoch": 0.033100011594097464, "grad_norm": 0.04396356642246246, "learning_rate": 0.0002, "loss": 0.0207, "step": 18200 }, { "epoch": 0.03311819841365467, "grad_norm": 0.23554326593875885, "learning_rate": 0.0002, "loss": 0.1484, "step": 18210 }, { "epoch": 0.033136385233211864, "grad_norm": 0.11277181655168533, "learning_rate": 0.0002, "loss": 0.0763, "step": 18220 }, { "epoch": 0.03315457205276906, "grad_norm": 0.05176365375518799, "learning_rate": 0.0002, "loss": 0.076, "step": 18230 }, { "epoch": 0.033172758872326256, "grad_norm": 0.1521395444869995, "learning_rate": 0.0002, "loss": 0.0605, "step": 18240 }, { "epoch": 0.03319094569188345, "grad_norm": 0.04682580381631851, "learning_rate": 0.0002, "loss": 0.0149, "step": 18250 }, { "epoch": 0.03320913251144065, "grad_norm": 0.16890883445739746, "learning_rate": 0.0002, "loss": 0.1402, "step": 18260 }, { "epoch": 0.033227319330997845, "grad_norm": 0.17221559584140778, "learning_rate": 0.0002, "loss": 0.0819, "step": 18270 }, { "epoch": 0.03324550615055504, "grad_norm": 0.07434559613466263, "learning_rate": 0.0002, "loss": 0.0784, "step": 18280 }, { "epoch": 0.03326369297011224, "grad_norm": 0.1912834346294403, "learning_rate": 0.0002, "loss": 0.0614, "step": 18290 }, { "epoch": 0.033281879789669434, "grad_norm": 0.04286884889006615, "learning_rate": 0.0002, "loss": 0.0185, "step": 18300 }, { "epoch": 0.03330006660922663, "grad_norm": 0.29059842228889465, "learning_rate": 0.0002, "loss": 0.1357, "step": 18310 }, { "epoch": 0.03331825342878383, "grad_norm": 0.2289486825466156, "learning_rate": 0.0002, "loss": 0.0865, "step": 18320 }, { "epoch": 0.03333644024834102, "grad_norm": 0.027094636112451553, "learning_rate": 0.0002, "loss": 0.0841, "step": 18330 }, { "epoch": 0.03335462706789822, "grad_norm": 0.21263600885868073, "learning_rate": 0.0002, "loss": 0.0628, "step": 18340 }, { "epoch": 0.033372813887455416, "grad_norm": 0.03497980535030365, "learning_rate": 0.0002, "loss": 0.0158, "step": 18350 }, { "epoch": 0.03339100070701261, "grad_norm": 0.20155973732471466, "learning_rate": 0.0002, "loss": 0.1523, "step": 18360 }, { "epoch": 0.03340918752656981, "grad_norm": 0.03746286779642105, "learning_rate": 0.0002, "loss": 0.0781, "step": 18370 }, { "epoch": 0.033427374346127005, "grad_norm": 0.06747066229581833, "learning_rate": 0.0002, "loss": 0.0792, "step": 18380 }, { "epoch": 0.0334455611656842, "grad_norm": 0.23699060082435608, "learning_rate": 0.0002, "loss": 0.0651, "step": 18390 }, { "epoch": 0.0334637479852414, "grad_norm": 0.047832150012254715, "learning_rate": 0.0002, "loss": 0.0181, "step": 18400 }, { "epoch": 0.033481934804798594, "grad_norm": 0.3178698420524597, "learning_rate": 0.0002, "loss": 0.1537, "step": 18410 }, { "epoch": 0.03350012162435579, "grad_norm": 0.16258081793785095, "learning_rate": 0.0002, "loss": 0.0722, "step": 18420 }, { "epoch": 0.033518308443912986, "grad_norm": 0.02807716652750969, "learning_rate": 0.0002, "loss": 0.0844, "step": 18430 }, { "epoch": 0.03353649526347018, "grad_norm": 0.16596710681915283, "learning_rate": 0.0002, "loss": 0.0607, "step": 18440 }, { "epoch": 0.03355468208302738, "grad_norm": 0.04448723793029785, "learning_rate": 0.0002, "loss": 0.0183, "step": 18450 }, { "epoch": 0.033572868902584575, "grad_norm": 0.39318934082984924, "learning_rate": 0.0002, "loss": 0.1497, "step": 18460 }, { "epoch": 0.03359105572214177, "grad_norm": 0.17387263476848602, "learning_rate": 0.0002, "loss": 0.0787, "step": 18470 }, { "epoch": 0.03360924254169897, "grad_norm": 0.14859163761138916, "learning_rate": 0.0002, "loss": 0.0837, "step": 18480 }, { "epoch": 0.033627429361256164, "grad_norm": 0.24148601293563843, "learning_rate": 0.0002, "loss": 0.0655, "step": 18490 }, { "epoch": 0.03364561618081336, "grad_norm": 0.04743284359574318, "learning_rate": 0.0002, "loss": 0.0174, "step": 18500 }, { "epoch": 0.03366380300037056, "grad_norm": 0.25396591424942017, "learning_rate": 0.0002, "loss": 0.1438, "step": 18510 }, { "epoch": 0.03368198981992775, "grad_norm": 0.1759178638458252, "learning_rate": 0.0002, "loss": 0.0758, "step": 18520 }, { "epoch": 0.03370017663948495, "grad_norm": 0.06611669808626175, "learning_rate": 0.0002, "loss": 0.0787, "step": 18530 }, { "epoch": 0.033718363459042146, "grad_norm": 0.22699445486068726, "learning_rate": 0.0002, "loss": 0.0697, "step": 18540 }, { "epoch": 0.03373655027859934, "grad_norm": 0.02634899877011776, "learning_rate": 0.0002, "loss": 0.0189, "step": 18550 }, { "epoch": 0.03375473709815654, "grad_norm": 0.3238360285758972, "learning_rate": 0.0002, "loss": 0.1496, "step": 18560 }, { "epoch": 0.033772923917713735, "grad_norm": 0.16044601798057556, "learning_rate": 0.0002, "loss": 0.076, "step": 18570 }, { "epoch": 0.03379111073727093, "grad_norm": 0.029841836541891098, "learning_rate": 0.0002, "loss": 0.0718, "step": 18580 }, { "epoch": 0.03380929755682813, "grad_norm": 0.21851007640361786, "learning_rate": 0.0002, "loss": 0.0656, "step": 18590 }, { "epoch": 0.033827484376385324, "grad_norm": 0.02096417360007763, "learning_rate": 0.0002, "loss": 0.0173, "step": 18600 }, { "epoch": 0.03384567119594252, "grad_norm": 0.29625844955444336, "learning_rate": 0.0002, "loss": 0.1716, "step": 18610 }, { "epoch": 0.033863858015499716, "grad_norm": 0.1510130614042282, "learning_rate": 0.0002, "loss": 0.0792, "step": 18620 }, { "epoch": 0.03388204483505691, "grad_norm": 0.04192917421460152, "learning_rate": 0.0002, "loss": 0.0717, "step": 18630 }, { "epoch": 0.03390023165461411, "grad_norm": 0.23139427602291107, "learning_rate": 0.0002, "loss": 0.0609, "step": 18640 }, { "epoch": 0.033918418474171305, "grad_norm": 0.03887970373034477, "learning_rate": 0.0002, "loss": 0.0127, "step": 18650 }, { "epoch": 0.0339366052937285, "grad_norm": 0.1315147578716278, "learning_rate": 0.0002, "loss": 0.1434, "step": 18660 }, { "epoch": 0.0339547921132857, "grad_norm": 0.13328243792057037, "learning_rate": 0.0002, "loss": 0.0673, "step": 18670 }, { "epoch": 0.033972978932842894, "grad_norm": 0.07161080092191696, "learning_rate": 0.0002, "loss": 0.0692, "step": 18680 }, { "epoch": 0.03399116575240009, "grad_norm": 0.16019296646118164, "learning_rate": 0.0002, "loss": 0.0641, "step": 18690 }, { "epoch": 0.03400935257195729, "grad_norm": 0.042882539331912994, "learning_rate": 0.0002, "loss": 0.0196, "step": 18700 }, { "epoch": 0.03402753939151448, "grad_norm": 0.15019817650318146, "learning_rate": 0.0002, "loss": 0.1239, "step": 18710 }, { "epoch": 0.03404572621107168, "grad_norm": 0.140267476439476, "learning_rate": 0.0002, "loss": 0.0715, "step": 18720 }, { "epoch": 0.034063913030628876, "grad_norm": 0.060760073363780975, "learning_rate": 0.0002, "loss": 0.079, "step": 18730 }, { "epoch": 0.03408209985018607, "grad_norm": 0.1783122718334198, "learning_rate": 0.0002, "loss": 0.0616, "step": 18740 }, { "epoch": 0.03410028666974327, "grad_norm": 0.023139121010899544, "learning_rate": 0.0002, "loss": 0.0171, "step": 18750 }, { "epoch": 0.034118473489300465, "grad_norm": 0.2645978331565857, "learning_rate": 0.0002, "loss": 0.1355, "step": 18760 }, { "epoch": 0.03413666030885766, "grad_norm": 0.21009914577007294, "learning_rate": 0.0002, "loss": 0.0757, "step": 18770 }, { "epoch": 0.03415484712841486, "grad_norm": 0.13494494557380676, "learning_rate": 0.0002, "loss": 0.0774, "step": 18780 }, { "epoch": 0.034173033947972054, "grad_norm": 0.19806784391403198, "learning_rate": 0.0002, "loss": 0.0636, "step": 18790 }, { "epoch": 0.03419122076752925, "grad_norm": 0.020482519641518593, "learning_rate": 0.0002, "loss": 0.0194, "step": 18800 }, { "epoch": 0.034209407587086446, "grad_norm": 0.34826937317848206, "learning_rate": 0.0002, "loss": 0.1521, "step": 18810 }, { "epoch": 0.03422759440664364, "grad_norm": 0.1293957680463791, "learning_rate": 0.0002, "loss": 0.0742, "step": 18820 }, { "epoch": 0.03424578122620084, "grad_norm": 0.06574539095163345, "learning_rate": 0.0002, "loss": 0.0792, "step": 18830 }, { "epoch": 0.034263968045758035, "grad_norm": 0.2005399614572525, "learning_rate": 0.0002, "loss": 0.0618, "step": 18840 }, { "epoch": 0.03428215486531523, "grad_norm": 0.04699913039803505, "learning_rate": 0.0002, "loss": 0.0176, "step": 18850 }, { "epoch": 0.03430034168487243, "grad_norm": 0.2593109905719757, "learning_rate": 0.0002, "loss": 0.1709, "step": 18860 }, { "epoch": 0.034318528504429624, "grad_norm": 0.587365448474884, "learning_rate": 0.0002, "loss": 0.0794, "step": 18870 }, { "epoch": 0.03433671532398682, "grad_norm": 0.0371614433825016, "learning_rate": 0.0002, "loss": 0.076, "step": 18880 }, { "epoch": 0.03435490214354402, "grad_norm": 0.2164178341627121, "learning_rate": 0.0002, "loss": 0.0577, "step": 18890 }, { "epoch": 0.03437308896310122, "grad_norm": 0.028071587905287743, "learning_rate": 0.0002, "loss": 0.0184, "step": 18900 }, { "epoch": 0.034391275782658416, "grad_norm": 0.25464126467704773, "learning_rate": 0.0002, "loss": 0.1616, "step": 18910 }, { "epoch": 0.03440946260221561, "grad_norm": 0.2830415368080139, "learning_rate": 0.0002, "loss": 0.0795, "step": 18920 }, { "epoch": 0.03442764942177281, "grad_norm": 0.07880273461341858, "learning_rate": 0.0002, "loss": 0.0717, "step": 18930 }, { "epoch": 0.034445836241330005, "grad_norm": 0.19671671092510223, "learning_rate": 0.0002, "loss": 0.0625, "step": 18940 }, { "epoch": 0.0344640230608872, "grad_norm": 0.038350027054548264, "learning_rate": 0.0002, "loss": 0.0172, "step": 18950 }, { "epoch": 0.0344822098804444, "grad_norm": 0.196768656373024, "learning_rate": 0.0002, "loss": 0.1586, "step": 18960 }, { "epoch": 0.034500396700001594, "grad_norm": 0.1861678808927536, "learning_rate": 0.0002, "loss": 0.0871, "step": 18970 }, { "epoch": 0.03451858351955879, "grad_norm": 0.1074979305267334, "learning_rate": 0.0002, "loss": 0.0697, "step": 18980 }, { "epoch": 0.03453677033911599, "grad_norm": 0.18214645981788635, "learning_rate": 0.0002, "loss": 0.0594, "step": 18990 }, { "epoch": 0.03455495715867318, "grad_norm": 0.035948049277067184, "learning_rate": 0.0002, "loss": 0.0177, "step": 19000 }, { "epoch": 0.03457314397823038, "grad_norm": 0.2434094399213791, "learning_rate": 0.0002, "loss": 0.1402, "step": 19010 }, { "epoch": 0.034591330797787576, "grad_norm": 0.06897670775651932, "learning_rate": 0.0002, "loss": 0.0758, "step": 19020 }, { "epoch": 0.03460951761734477, "grad_norm": 0.13107649981975555, "learning_rate": 0.0002, "loss": 0.0826, "step": 19030 }, { "epoch": 0.03462770443690197, "grad_norm": 0.1787865310907364, "learning_rate": 0.0002, "loss": 0.0619, "step": 19040 }, { "epoch": 0.034645891256459165, "grad_norm": 0.0460963137447834, "learning_rate": 0.0002, "loss": 0.0203, "step": 19050 }, { "epoch": 0.03466407807601636, "grad_norm": 0.20582084357738495, "learning_rate": 0.0002, "loss": 0.1325, "step": 19060 }, { "epoch": 0.03468226489557356, "grad_norm": 0.16120313107967377, "learning_rate": 0.0002, "loss": 0.08, "step": 19070 }, { "epoch": 0.03470045171513075, "grad_norm": 0.04322347044944763, "learning_rate": 0.0002, "loss": 0.0753, "step": 19080 }, { "epoch": 0.03471863853468795, "grad_norm": 0.1764109879732132, "learning_rate": 0.0002, "loss": 0.0618, "step": 19090 }, { "epoch": 0.034736825354245146, "grad_norm": 0.04453815147280693, "learning_rate": 0.0002, "loss": 0.0172, "step": 19100 }, { "epoch": 0.03475501217380234, "grad_norm": 0.32023972272872925, "learning_rate": 0.0002, "loss": 0.1394, "step": 19110 }, { "epoch": 0.03477319899335954, "grad_norm": 0.09920009225606918, "learning_rate": 0.0002, "loss": 0.0788, "step": 19120 }, { "epoch": 0.034791385812916735, "grad_norm": 0.047868456691503525, "learning_rate": 0.0002, "loss": 0.0745, "step": 19130 }, { "epoch": 0.03480957263247393, "grad_norm": 0.219430074095726, "learning_rate": 0.0002, "loss": 0.063, "step": 19140 }, { "epoch": 0.03482775945203113, "grad_norm": 0.04879681020975113, "learning_rate": 0.0002, "loss": 0.0161, "step": 19150 }, { "epoch": 0.034845946271588324, "grad_norm": 0.21360138058662415, "learning_rate": 0.0002, "loss": 0.1602, "step": 19160 }, { "epoch": 0.03486413309114552, "grad_norm": 0.1391269713640213, "learning_rate": 0.0002, "loss": 0.0798, "step": 19170 }, { "epoch": 0.03488231991070272, "grad_norm": 0.06293737888336182, "learning_rate": 0.0002, "loss": 0.0717, "step": 19180 }, { "epoch": 0.03490050673025991, "grad_norm": 0.20241963863372803, "learning_rate": 0.0002, "loss": 0.0612, "step": 19190 }, { "epoch": 0.03491869354981711, "grad_norm": 0.06246611103415489, "learning_rate": 0.0002, "loss": 0.0148, "step": 19200 }, { "epoch": 0.034936880369374305, "grad_norm": 0.16479995846748352, "learning_rate": 0.0002, "loss": 0.1611, "step": 19210 }, { "epoch": 0.0349550671889315, "grad_norm": 0.12036983668804169, "learning_rate": 0.0002, "loss": 0.0724, "step": 19220 }, { "epoch": 0.0349732540084887, "grad_norm": 0.03939517214894295, "learning_rate": 0.0002, "loss": 0.0758, "step": 19230 }, { "epoch": 0.034991440828045894, "grad_norm": 0.17047277092933655, "learning_rate": 0.0002, "loss": 0.066, "step": 19240 }, { "epoch": 0.03500962764760309, "grad_norm": 0.031782686710357666, "learning_rate": 0.0002, "loss": 0.0203, "step": 19250 }, { "epoch": 0.03502781446716029, "grad_norm": 0.2545730471611023, "learning_rate": 0.0002, "loss": 0.1716, "step": 19260 }, { "epoch": 0.03504600128671748, "grad_norm": 0.11225811392068863, "learning_rate": 0.0002, "loss": 0.0791, "step": 19270 }, { "epoch": 0.03506418810627468, "grad_norm": 0.049140989780426025, "learning_rate": 0.0002, "loss": 0.0784, "step": 19280 }, { "epoch": 0.035082374925831876, "grad_norm": 0.16942913830280304, "learning_rate": 0.0002, "loss": 0.0638, "step": 19290 }, { "epoch": 0.03510056174538907, "grad_norm": 0.03836115077137947, "learning_rate": 0.0002, "loss": 0.0193, "step": 19300 }, { "epoch": 0.03511874856494627, "grad_norm": 0.13004787266254425, "learning_rate": 0.0002, "loss": 0.1477, "step": 19310 }, { "epoch": 0.035136935384503465, "grad_norm": 0.2054329216480255, "learning_rate": 0.0002, "loss": 0.0792, "step": 19320 }, { "epoch": 0.03515512220406066, "grad_norm": 0.06592074781656265, "learning_rate": 0.0002, "loss": 0.0784, "step": 19330 }, { "epoch": 0.03517330902361786, "grad_norm": 0.19228027760982513, "learning_rate": 0.0002, "loss": 0.067, "step": 19340 }, { "epoch": 0.035191495843175054, "grad_norm": 0.04050719738006592, "learning_rate": 0.0002, "loss": 0.017, "step": 19350 }, { "epoch": 0.03520968266273225, "grad_norm": 0.28715401887893677, "learning_rate": 0.0002, "loss": 0.1499, "step": 19360 }, { "epoch": 0.035227869482289446, "grad_norm": 0.13954712450504303, "learning_rate": 0.0002, "loss": 0.0787, "step": 19370 }, { "epoch": 0.03524605630184664, "grad_norm": 0.08851815015077591, "learning_rate": 0.0002, "loss": 0.0739, "step": 19380 }, { "epoch": 0.03526424312140384, "grad_norm": 0.1788545697927475, "learning_rate": 0.0002, "loss": 0.0576, "step": 19390 }, { "epoch": 0.035282429940961035, "grad_norm": 0.03644658252596855, "learning_rate": 0.0002, "loss": 0.0143, "step": 19400 }, { "epoch": 0.03530061676051823, "grad_norm": 0.3140568137168884, "learning_rate": 0.0002, "loss": 0.1498, "step": 19410 }, { "epoch": 0.03531880358007543, "grad_norm": 0.14550529420375824, "learning_rate": 0.0002, "loss": 0.0748, "step": 19420 }, { "epoch": 0.035336990399632624, "grad_norm": 0.10995481163263321, "learning_rate": 0.0002, "loss": 0.0801, "step": 19430 }, { "epoch": 0.03535517721918982, "grad_norm": 0.17238560318946838, "learning_rate": 0.0002, "loss": 0.0608, "step": 19440 }, { "epoch": 0.03537336403874702, "grad_norm": 0.031363293528556824, "learning_rate": 0.0002, "loss": 0.0154, "step": 19450 }, { "epoch": 0.03539155085830421, "grad_norm": 0.14145390689373016, "learning_rate": 0.0002, "loss": 0.1511, "step": 19460 }, { "epoch": 0.03540973767786141, "grad_norm": 0.19073855876922607, "learning_rate": 0.0002, "loss": 0.0725, "step": 19470 }, { "epoch": 0.035427924497418606, "grad_norm": 0.15639430284500122, "learning_rate": 0.0002, "loss": 0.0836, "step": 19480 }, { "epoch": 0.0354461113169758, "grad_norm": 0.2566238045692444, "learning_rate": 0.0002, "loss": 0.0617, "step": 19490 }, { "epoch": 0.035464298136533, "grad_norm": 0.055755820125341415, "learning_rate": 0.0002, "loss": 0.0178, "step": 19500 }, { "epoch": 0.035482484956090195, "grad_norm": 0.2835562527179718, "learning_rate": 0.0002, "loss": 0.1306, "step": 19510 }, { "epoch": 0.03550067177564739, "grad_norm": 0.2310812920331955, "learning_rate": 0.0002, "loss": 0.0766, "step": 19520 }, { "epoch": 0.03551885859520459, "grad_norm": 0.1287071257829666, "learning_rate": 0.0002, "loss": 0.0791, "step": 19530 }, { "epoch": 0.035537045414761784, "grad_norm": 0.21308869123458862, "learning_rate": 0.0002, "loss": 0.0584, "step": 19540 }, { "epoch": 0.03555523223431898, "grad_norm": 0.0662735179066658, "learning_rate": 0.0002, "loss": 0.0207, "step": 19550 }, { "epoch": 0.035573419053876176, "grad_norm": 0.21706523001194, "learning_rate": 0.0002, "loss": 0.1308, "step": 19560 }, { "epoch": 0.03559160587343337, "grad_norm": 0.09376335144042969, "learning_rate": 0.0002, "loss": 0.0677, "step": 19570 }, { "epoch": 0.035609792692990576, "grad_norm": 0.1093437597155571, "learning_rate": 0.0002, "loss": 0.0741, "step": 19580 }, { "epoch": 0.03562797951254777, "grad_norm": 0.21057911217212677, "learning_rate": 0.0002, "loss": 0.0637, "step": 19590 }, { "epoch": 0.03564616633210497, "grad_norm": 0.04383830726146698, "learning_rate": 0.0002, "loss": 0.019, "step": 19600 }, { "epoch": 0.035664353151662165, "grad_norm": 0.3657427132129669, "learning_rate": 0.0002, "loss": 0.1421, "step": 19610 }, { "epoch": 0.03568253997121936, "grad_norm": 0.17154265940189362, "learning_rate": 0.0002, "loss": 0.0779, "step": 19620 }, { "epoch": 0.03570072679077656, "grad_norm": 0.041993435472249985, "learning_rate": 0.0002, "loss": 0.0768, "step": 19630 }, { "epoch": 0.035718913610333754, "grad_norm": 0.1658252775669098, "learning_rate": 0.0002, "loss": 0.0602, "step": 19640 }, { "epoch": 0.03573710042989095, "grad_norm": 0.028523078188300133, "learning_rate": 0.0002, "loss": 0.0151, "step": 19650 }, { "epoch": 0.035755287249448146, "grad_norm": 0.2624453902244568, "learning_rate": 0.0002, "loss": 0.1355, "step": 19660 }, { "epoch": 0.03577347406900534, "grad_norm": 0.12055794149637222, "learning_rate": 0.0002, "loss": 0.079, "step": 19670 }, { "epoch": 0.03579166088856254, "grad_norm": 0.043441224843263626, "learning_rate": 0.0002, "loss": 0.0722, "step": 19680 }, { "epoch": 0.035809847708119735, "grad_norm": 0.2464340627193451, "learning_rate": 0.0002, "loss": 0.0673, "step": 19690 }, { "epoch": 0.03582803452767693, "grad_norm": 0.04004153981804848, "learning_rate": 0.0002, "loss": 0.0212, "step": 19700 }, { "epoch": 0.03584622134723413, "grad_norm": 0.3159453570842743, "learning_rate": 0.0002, "loss": 0.1806, "step": 19710 }, { "epoch": 0.035864408166791324, "grad_norm": 0.11327318102121353, "learning_rate": 0.0002, "loss": 0.0748, "step": 19720 }, { "epoch": 0.03588259498634852, "grad_norm": 0.0980909988284111, "learning_rate": 0.0002, "loss": 0.0807, "step": 19730 }, { "epoch": 0.03590078180590572, "grad_norm": 0.15508098900318146, "learning_rate": 0.0002, "loss": 0.0576, "step": 19740 }, { "epoch": 0.03591896862546291, "grad_norm": 0.019624806940555573, "learning_rate": 0.0002, "loss": 0.0135, "step": 19750 }, { "epoch": 0.03593715544502011, "grad_norm": 0.20336109399795532, "learning_rate": 0.0002, "loss": 0.1702, "step": 19760 }, { "epoch": 0.035955342264577306, "grad_norm": 0.12767620384693146, "learning_rate": 0.0002, "loss": 0.0776, "step": 19770 }, { "epoch": 0.0359735290841345, "grad_norm": 0.19050805270671844, "learning_rate": 0.0002, "loss": 0.0838, "step": 19780 }, { "epoch": 0.0359917159036917, "grad_norm": 0.17471866309642792, "learning_rate": 0.0002, "loss": 0.0561, "step": 19790 }, { "epoch": 0.036009902723248895, "grad_norm": 0.044348277151584625, "learning_rate": 0.0002, "loss": 0.0159, "step": 19800 }, { "epoch": 0.03602808954280609, "grad_norm": 0.30847081542015076, "learning_rate": 0.0002, "loss": 0.1686, "step": 19810 }, { "epoch": 0.03604627636236329, "grad_norm": 0.08963622897863388, "learning_rate": 0.0002, "loss": 0.078, "step": 19820 }, { "epoch": 0.036064463181920484, "grad_norm": 0.0580587275326252, "learning_rate": 0.0002, "loss": 0.0741, "step": 19830 }, { "epoch": 0.03608265000147768, "grad_norm": 0.1698184460401535, "learning_rate": 0.0002, "loss": 0.0631, "step": 19840 }, { "epoch": 0.036100836821034876, "grad_norm": 0.025531867519021034, "learning_rate": 0.0002, "loss": 0.0166, "step": 19850 }, { "epoch": 0.03611902364059207, "grad_norm": 0.3544731140136719, "learning_rate": 0.0002, "loss": 0.1886, "step": 19860 }, { "epoch": 0.03613721046014927, "grad_norm": 0.2552841901779175, "learning_rate": 0.0002, "loss": 0.0859, "step": 19870 }, { "epoch": 0.036155397279706465, "grad_norm": 0.07771942019462585, "learning_rate": 0.0002, "loss": 0.0859, "step": 19880 }, { "epoch": 0.03617358409926366, "grad_norm": 0.15945585072040558, "learning_rate": 0.0002, "loss": 0.0609, "step": 19890 }, { "epoch": 0.03619177091882086, "grad_norm": 0.04583865404129028, "learning_rate": 0.0002, "loss": 0.0196, "step": 19900 }, { "epoch": 0.036209957738378054, "grad_norm": 0.2110920548439026, "learning_rate": 0.0002, "loss": 0.1305, "step": 19910 }, { "epoch": 0.03622814455793525, "grad_norm": 0.22165755927562714, "learning_rate": 0.0002, "loss": 0.0767, "step": 19920 }, { "epoch": 0.03624633137749245, "grad_norm": 0.0866742879152298, "learning_rate": 0.0002, "loss": 0.0785, "step": 19930 }, { "epoch": 0.03626451819704964, "grad_norm": 0.19838224351406097, "learning_rate": 0.0002, "loss": 0.0663, "step": 19940 }, { "epoch": 0.03628270501660684, "grad_norm": 0.05543521046638489, "learning_rate": 0.0002, "loss": 0.023, "step": 19950 }, { "epoch": 0.036300891836164036, "grad_norm": 0.20800183713436127, "learning_rate": 0.0002, "loss": 0.1468, "step": 19960 }, { "epoch": 0.03631907865572123, "grad_norm": 0.14951092004776, "learning_rate": 0.0002, "loss": 0.0698, "step": 19970 }, { "epoch": 0.03633726547527843, "grad_norm": 0.10162603855133057, "learning_rate": 0.0002, "loss": 0.0841, "step": 19980 }, { "epoch": 0.036355452294835625, "grad_norm": 0.24774019420146942, "learning_rate": 0.0002, "loss": 0.0658, "step": 19990 }, { "epoch": 0.03637363911439282, "grad_norm": 0.02705777995288372, "learning_rate": 0.0002, "loss": 0.02, "step": 20000 }, { "epoch": 0.03639182593395002, "grad_norm": 0.2509992718696594, "learning_rate": 0.0002, "loss": 0.1529, "step": 20010 }, { "epoch": 0.036410012753507214, "grad_norm": 0.2126697599887848, "learning_rate": 0.0002, "loss": 0.0716, "step": 20020 }, { "epoch": 0.03642819957306441, "grad_norm": 0.1463591754436493, "learning_rate": 0.0002, "loss": 0.076, "step": 20030 }, { "epoch": 0.036446386392621606, "grad_norm": 0.21879518032073975, "learning_rate": 0.0002, "loss": 0.0677, "step": 20040 }, { "epoch": 0.0364645732121788, "grad_norm": 0.028337355703115463, "learning_rate": 0.0002, "loss": 0.0131, "step": 20050 }, { "epoch": 0.036482760031736, "grad_norm": 0.335788756608963, "learning_rate": 0.0002, "loss": 0.1693, "step": 20060 }, { "epoch": 0.036500946851293195, "grad_norm": 0.17615728080272675, "learning_rate": 0.0002, "loss": 0.0791, "step": 20070 }, { "epoch": 0.03651913367085039, "grad_norm": 0.034229181706905365, "learning_rate": 0.0002, "loss": 0.0774, "step": 20080 }, { "epoch": 0.03653732049040759, "grad_norm": 0.20637790858745575, "learning_rate": 0.0002, "loss": 0.0544, "step": 20090 }, { "epoch": 0.036555507309964784, "grad_norm": 0.033659741282463074, "learning_rate": 0.0002, "loss": 0.0128, "step": 20100 }, { "epoch": 0.03657369412952198, "grad_norm": 0.18249601125717163, "learning_rate": 0.0002, "loss": 0.1939, "step": 20110 }, { "epoch": 0.03659188094907918, "grad_norm": 0.18065877258777618, "learning_rate": 0.0002, "loss": 0.0816, "step": 20120 }, { "epoch": 0.03661006776863637, "grad_norm": 0.4361811876296997, "learning_rate": 0.0002, "loss": 0.0978, "step": 20130 }, { "epoch": 0.03662825458819357, "grad_norm": 0.24488002061843872, "learning_rate": 0.0002, "loss": 0.0742, "step": 20140 }, { "epoch": 0.036646441407750766, "grad_norm": 0.023062752559781075, "learning_rate": 0.0002, "loss": 0.0196, "step": 20150 }, { "epoch": 0.03666462822730796, "grad_norm": 0.22796255350112915, "learning_rate": 0.0002, "loss": 0.1457, "step": 20160 }, { "epoch": 0.03668281504686516, "grad_norm": 0.16665758192539215, "learning_rate": 0.0002, "loss": 0.138, "step": 20170 }, { "epoch": 0.036701001866422354, "grad_norm": 0.0503946952521801, "learning_rate": 0.0002, "loss": 0.079, "step": 20180 }, { "epoch": 0.03671918868597955, "grad_norm": 0.1672963798046112, "learning_rate": 0.0002, "loss": 0.0621, "step": 20190 }, { "epoch": 0.03673737550553675, "grad_norm": 0.06765859574079514, "learning_rate": 0.0002, "loss": 0.0171, "step": 20200 }, { "epoch": 0.03675556232509394, "grad_norm": 0.6076682806015015, "learning_rate": 0.0002, "loss": 0.6804, "step": 20210 }, { "epoch": 0.03677374914465114, "grad_norm": 0.04764563590288162, "learning_rate": 0.0002, "loss": 0.0965, "step": 20220 }, { "epoch": 0.036791935964208336, "grad_norm": 0.6847806572914124, "learning_rate": 0.0002, "loss": 0.0784, "step": 20230 }, { "epoch": 0.03681012278376553, "grad_norm": 0.2678837478160858, "learning_rate": 0.0002, "loss": 0.069, "step": 20240 }, { "epoch": 0.03682830960332273, "grad_norm": 0.039824239909648895, "learning_rate": 0.0002, "loss": 0.0206, "step": 20250 }, { "epoch": 0.036846496422879925, "grad_norm": 0.19583609700202942, "learning_rate": 0.0002, "loss": 0.1588, "step": 20260 }, { "epoch": 0.03686468324243713, "grad_norm": 0.08613055944442749, "learning_rate": 0.0002, "loss": 0.0777, "step": 20270 }, { "epoch": 0.036882870061994324, "grad_norm": 0.028818165883421898, "learning_rate": 0.0002, "loss": 0.0704, "step": 20280 }, { "epoch": 0.03690105688155152, "grad_norm": 0.19514115154743195, "learning_rate": 0.0002, "loss": 0.0654, "step": 20290 }, { "epoch": 0.03691924370110872, "grad_norm": 0.043222617357969284, "learning_rate": 0.0002, "loss": 0.0216, "step": 20300 }, { "epoch": 0.03693743052066591, "grad_norm": 0.2490546703338623, "learning_rate": 0.0002, "loss": 0.1472, "step": 20310 }, { "epoch": 0.03695561734022311, "grad_norm": 0.16989269852638245, "learning_rate": 0.0002, "loss": 0.081, "step": 20320 }, { "epoch": 0.036973804159780306, "grad_norm": 0.09191739559173584, "learning_rate": 0.0002, "loss": 0.0733, "step": 20330 }, { "epoch": 0.0369919909793375, "grad_norm": 0.18435023725032806, "learning_rate": 0.0002, "loss": 0.0654, "step": 20340 }, { "epoch": 0.0370101777988947, "grad_norm": 0.031144114211201668, "learning_rate": 0.0002, "loss": 0.0226, "step": 20350 }, { "epoch": 0.037028364618451895, "grad_norm": 0.3244694769382477, "learning_rate": 0.0002, "loss": 0.1304, "step": 20360 }, { "epoch": 0.03704655143800909, "grad_norm": 0.13787488639354706, "learning_rate": 0.0002, "loss": 0.0811, "step": 20370 }, { "epoch": 0.03706473825756629, "grad_norm": 0.058523450046777725, "learning_rate": 0.0002, "loss": 0.0806, "step": 20380 }, { "epoch": 0.037082925077123484, "grad_norm": 0.3001325726509094, "learning_rate": 0.0002, "loss": 0.0694, "step": 20390 }, { "epoch": 0.03710111189668068, "grad_norm": 0.04447292909026146, "learning_rate": 0.0002, "loss": 0.0218, "step": 20400 }, { "epoch": 0.037119298716237877, "grad_norm": 0.25786396861076355, "learning_rate": 0.0002, "loss": 0.1499, "step": 20410 }, { "epoch": 0.03713748553579507, "grad_norm": 0.11381134390830994, "learning_rate": 0.0002, "loss": 0.0822, "step": 20420 }, { "epoch": 0.03715567235535227, "grad_norm": 0.022713568061590195, "learning_rate": 0.0002, "loss": 0.0785, "step": 20430 }, { "epoch": 0.037173859174909465, "grad_norm": 0.15770909190177917, "learning_rate": 0.0002, "loss": 0.0625, "step": 20440 }, { "epoch": 0.03719204599446666, "grad_norm": 0.021412041038274765, "learning_rate": 0.0002, "loss": 0.0126, "step": 20450 }, { "epoch": 0.03721023281402386, "grad_norm": 0.24260753393173218, "learning_rate": 0.0002, "loss": 0.1777, "step": 20460 }, { "epoch": 0.037228419633581054, "grad_norm": 0.10953031480312347, "learning_rate": 0.0002, "loss": 0.073, "step": 20470 }, { "epoch": 0.03724660645313825, "grad_norm": 0.03975062072277069, "learning_rate": 0.0002, "loss": 0.0907, "step": 20480 }, { "epoch": 0.03726479327269545, "grad_norm": 0.2025018036365509, "learning_rate": 0.0002, "loss": 0.0631, "step": 20490 }, { "epoch": 0.03728298009225264, "grad_norm": 0.031849734485149384, "learning_rate": 0.0002, "loss": 0.0156, "step": 20500 }, { "epoch": 0.03730116691180984, "grad_norm": 0.2650098502635956, "learning_rate": 0.0002, "loss": 0.1569, "step": 20510 }, { "epoch": 0.037319353731367036, "grad_norm": 0.14113937318325043, "learning_rate": 0.0002, "loss": 0.0824, "step": 20520 }, { "epoch": 0.03733754055092423, "grad_norm": 0.10276420414447784, "learning_rate": 0.0002, "loss": 0.0797, "step": 20530 }, { "epoch": 0.03735572737048143, "grad_norm": 0.2258286476135254, "learning_rate": 0.0002, "loss": 0.0671, "step": 20540 }, { "epoch": 0.037373914190038625, "grad_norm": 0.10343242436647415, "learning_rate": 0.0002, "loss": 0.0178, "step": 20550 }, { "epoch": 0.03739210100959582, "grad_norm": 0.19423982501029968, "learning_rate": 0.0002, "loss": 0.1423, "step": 20560 }, { "epoch": 0.03741028782915302, "grad_norm": 0.12046124786138535, "learning_rate": 0.0002, "loss": 0.0827, "step": 20570 }, { "epoch": 0.037428474648710214, "grad_norm": 0.026751041412353516, "learning_rate": 0.0002, "loss": 0.0743, "step": 20580 }, { "epoch": 0.03744666146826741, "grad_norm": 0.23576834797859192, "learning_rate": 0.0002, "loss": 0.0629, "step": 20590 }, { "epoch": 0.037464848287824606, "grad_norm": 0.05146399885416031, "learning_rate": 0.0002, "loss": 0.0205, "step": 20600 }, { "epoch": 0.0374830351073818, "grad_norm": 0.21750135719776154, "learning_rate": 0.0002, "loss": 0.1397, "step": 20610 }, { "epoch": 0.037501221926939, "grad_norm": 0.08351115882396698, "learning_rate": 0.0002, "loss": 0.0801, "step": 20620 }, { "epoch": 0.037519408746496195, "grad_norm": 0.07272092998027802, "learning_rate": 0.0002, "loss": 0.0881, "step": 20630 }, { "epoch": 0.03753759556605339, "grad_norm": 0.23707769811153412, "learning_rate": 0.0002, "loss": 0.0706, "step": 20640 }, { "epoch": 0.03755578238561059, "grad_norm": 0.05208323150873184, "learning_rate": 0.0002, "loss": 0.024, "step": 20650 }, { "epoch": 0.037573969205167784, "grad_norm": 0.4163022041320801, "learning_rate": 0.0002, "loss": 0.159, "step": 20660 }, { "epoch": 0.03759215602472498, "grad_norm": 0.1036575511097908, "learning_rate": 0.0002, "loss": 0.0814, "step": 20670 }, { "epoch": 0.03761034284428218, "grad_norm": 0.09861626476049423, "learning_rate": 0.0002, "loss": 0.0828, "step": 20680 }, { "epoch": 0.03762852966383937, "grad_norm": 0.1685744971036911, "learning_rate": 0.0002, "loss": 0.0597, "step": 20690 }, { "epoch": 0.03764671648339657, "grad_norm": 0.02716050110757351, "learning_rate": 0.0002, "loss": 0.0164, "step": 20700 }, { "epoch": 0.037664903302953766, "grad_norm": 0.46858713030815125, "learning_rate": 0.0002, "loss": 0.1596, "step": 20710 }, { "epoch": 0.03768309012251096, "grad_norm": 0.15260715782642365, "learning_rate": 0.0002, "loss": 0.0835, "step": 20720 }, { "epoch": 0.03770127694206816, "grad_norm": 0.2063397914171219, "learning_rate": 0.0002, "loss": 0.0845, "step": 20730 }, { "epoch": 0.037719463761625355, "grad_norm": 0.16447599232196808, "learning_rate": 0.0002, "loss": 0.0595, "step": 20740 }, { "epoch": 0.03773765058118255, "grad_norm": 0.020755184814333916, "learning_rate": 0.0002, "loss": 0.0164, "step": 20750 }, { "epoch": 0.03775583740073975, "grad_norm": 0.23675021529197693, "learning_rate": 0.0002, "loss": 0.1634, "step": 20760 }, { "epoch": 0.037774024220296944, "grad_norm": 0.08625516295433044, "learning_rate": 0.0002, "loss": 0.0685, "step": 20770 }, { "epoch": 0.03779221103985414, "grad_norm": 0.043796882033348083, "learning_rate": 0.0002, "loss": 0.0812, "step": 20780 }, { "epoch": 0.037810397859411336, "grad_norm": 0.20600435137748718, "learning_rate": 0.0002, "loss": 0.0651, "step": 20790 }, { "epoch": 0.03782858467896853, "grad_norm": 0.04963940382003784, "learning_rate": 0.0002, "loss": 0.0202, "step": 20800 }, { "epoch": 0.03784677149852573, "grad_norm": 0.34920167922973633, "learning_rate": 0.0002, "loss": 0.1494, "step": 20810 }, { "epoch": 0.037864958318082925, "grad_norm": 0.18662041425704956, "learning_rate": 0.0002, "loss": 0.0823, "step": 20820 }, { "epoch": 0.03788314513764012, "grad_norm": 0.12615887820720673, "learning_rate": 0.0002, "loss": 0.0856, "step": 20830 }, { "epoch": 0.03790133195719732, "grad_norm": 0.1857282668352127, "learning_rate": 0.0002, "loss": 0.0676, "step": 20840 }, { "epoch": 0.037919518776754514, "grad_norm": 0.05569197237491608, "learning_rate": 0.0002, "loss": 0.0181, "step": 20850 }, { "epoch": 0.03793770559631171, "grad_norm": 0.29011765122413635, "learning_rate": 0.0002, "loss": 0.1418, "step": 20860 }, { "epoch": 0.03795589241586891, "grad_norm": 0.14119744300842285, "learning_rate": 0.0002, "loss": 0.0812, "step": 20870 }, { "epoch": 0.0379740792354261, "grad_norm": 0.039884984493255615, "learning_rate": 0.0002, "loss": 0.0781, "step": 20880 }, { "epoch": 0.0379922660549833, "grad_norm": 0.23705685138702393, "learning_rate": 0.0002, "loss": 0.0621, "step": 20890 }, { "epoch": 0.038010452874540496, "grad_norm": 0.07462739199399948, "learning_rate": 0.0002, "loss": 0.022, "step": 20900 }, { "epoch": 0.03802863969409769, "grad_norm": 0.2610052824020386, "learning_rate": 0.0002, "loss": 0.1517, "step": 20910 }, { "epoch": 0.03804682651365489, "grad_norm": 0.12775090336799622, "learning_rate": 0.0002, "loss": 0.0758, "step": 20920 }, { "epoch": 0.038065013333212085, "grad_norm": 0.03661905974149704, "learning_rate": 0.0002, "loss": 0.0738, "step": 20930 }, { "epoch": 0.03808320015276928, "grad_norm": 0.20907218754291534, "learning_rate": 0.0002, "loss": 0.0627, "step": 20940 }, { "epoch": 0.03810138697232648, "grad_norm": 0.022804679349064827, "learning_rate": 0.0002, "loss": 0.0205, "step": 20950 }, { "epoch": 0.03811957379188368, "grad_norm": 0.258284330368042, "learning_rate": 0.0002, "loss": 0.1428, "step": 20960 }, { "epoch": 0.03813776061144088, "grad_norm": 0.1477317065000534, "learning_rate": 0.0002, "loss": 0.0789, "step": 20970 }, { "epoch": 0.03815594743099807, "grad_norm": 0.0610325001180172, "learning_rate": 0.0002, "loss": 0.0836, "step": 20980 }, { "epoch": 0.03817413425055527, "grad_norm": 0.18825507164001465, "learning_rate": 0.0002, "loss": 0.0621, "step": 20990 }, { "epoch": 0.038192321070112466, "grad_norm": 0.03943372145295143, "learning_rate": 0.0002, "loss": 0.0185, "step": 21000 }, { "epoch": 0.03821050788966966, "grad_norm": 0.34519344568252563, "learning_rate": 0.0002, "loss": 0.1345, "step": 21010 }, { "epoch": 0.03822869470922686, "grad_norm": 0.09635084867477417, "learning_rate": 0.0002, "loss": 0.0753, "step": 21020 }, { "epoch": 0.038246881528784055, "grad_norm": 0.032520972192287445, "learning_rate": 0.0002, "loss": 0.082, "step": 21030 }, { "epoch": 0.03826506834834125, "grad_norm": 0.18068930506706238, "learning_rate": 0.0002, "loss": 0.0609, "step": 21040 }, { "epoch": 0.03828325516789845, "grad_norm": 0.05550973862409592, "learning_rate": 0.0002, "loss": 0.0241, "step": 21050 }, { "epoch": 0.038301441987455644, "grad_norm": 0.19561107456684113, "learning_rate": 0.0002, "loss": 0.1337, "step": 21060 }, { "epoch": 0.03831962880701284, "grad_norm": 0.1852179914712906, "learning_rate": 0.0002, "loss": 0.0724, "step": 21070 }, { "epoch": 0.038337815626570036, "grad_norm": 0.11915116757154465, "learning_rate": 0.0002, "loss": 0.0836, "step": 21080 }, { "epoch": 0.03835600244612723, "grad_norm": 0.21116836369037628, "learning_rate": 0.0002, "loss": 0.0628, "step": 21090 }, { "epoch": 0.03837418926568443, "grad_norm": 0.042745884507894516, "learning_rate": 0.0002, "loss": 0.0214, "step": 21100 }, { "epoch": 0.038392376085241625, "grad_norm": 0.43089792132377625, "learning_rate": 0.0002, "loss": 0.1351, "step": 21110 }, { "epoch": 0.03841056290479882, "grad_norm": 0.09607810527086258, "learning_rate": 0.0002, "loss": 0.0778, "step": 21120 }, { "epoch": 0.03842874972435602, "grad_norm": 0.13603460788726807, "learning_rate": 0.0002, "loss": 0.0787, "step": 21130 }, { "epoch": 0.038446936543913214, "grad_norm": 0.20110103487968445, "learning_rate": 0.0002, "loss": 0.067, "step": 21140 }, { "epoch": 0.03846512336347041, "grad_norm": 0.042503997683525085, "learning_rate": 0.0002, "loss": 0.0194, "step": 21150 }, { "epoch": 0.03848331018302761, "grad_norm": 0.2605084478855133, "learning_rate": 0.0002, "loss": 0.1374, "step": 21160 }, { "epoch": 0.0385014970025848, "grad_norm": 0.09476794302463531, "learning_rate": 0.0002, "loss": 0.078, "step": 21170 }, { "epoch": 0.038519683822142, "grad_norm": 0.03458428382873535, "learning_rate": 0.0002, "loss": 0.08, "step": 21180 }, { "epoch": 0.038537870641699196, "grad_norm": 0.31196194887161255, "learning_rate": 0.0002, "loss": 0.0664, "step": 21190 }, { "epoch": 0.03855605746125639, "grad_norm": 0.037113118916749954, "learning_rate": 0.0002, "loss": 0.0221, "step": 21200 }, { "epoch": 0.03857424428081359, "grad_norm": 0.3699415922164917, "learning_rate": 0.0002, "loss": 0.1534, "step": 21210 }, { "epoch": 0.038592431100370785, "grad_norm": 0.06454256922006607, "learning_rate": 0.0002, "loss": 0.0762, "step": 21220 }, { "epoch": 0.03861061791992798, "grad_norm": 0.09858033806085587, "learning_rate": 0.0002, "loss": 0.0785, "step": 21230 }, { "epoch": 0.03862880473948518, "grad_norm": 0.1482791304588318, "learning_rate": 0.0002, "loss": 0.062, "step": 21240 }, { "epoch": 0.038646991559042373, "grad_norm": 0.031473588198423386, "learning_rate": 0.0002, "loss": 0.0163, "step": 21250 }, { "epoch": 0.03866517837859957, "grad_norm": 0.09360513091087341, "learning_rate": 0.0002, "loss": 0.1397, "step": 21260 }, { "epoch": 0.038683365198156766, "grad_norm": 0.10830901563167572, "learning_rate": 0.0002, "loss": 0.0789, "step": 21270 }, { "epoch": 0.03870155201771396, "grad_norm": 0.08910014480352402, "learning_rate": 0.0002, "loss": 0.0758, "step": 21280 }, { "epoch": 0.03871973883727116, "grad_norm": 0.21524523198604584, "learning_rate": 0.0002, "loss": 0.0628, "step": 21290 }, { "epoch": 0.038737925656828355, "grad_norm": 0.03794678673148155, "learning_rate": 0.0002, "loss": 0.0229, "step": 21300 }, { "epoch": 0.03875611247638555, "grad_norm": 0.46754345297813416, "learning_rate": 0.0002, "loss": 0.1291, "step": 21310 }, { "epoch": 0.03877429929594275, "grad_norm": 0.07472983002662659, "learning_rate": 0.0002, "loss": 0.076, "step": 21320 }, { "epoch": 0.038792486115499944, "grad_norm": 0.11820811778306961, "learning_rate": 0.0002, "loss": 0.0772, "step": 21330 }, { "epoch": 0.03881067293505714, "grad_norm": 0.21140390634536743, "learning_rate": 0.0002, "loss": 0.0539, "step": 21340 }, { "epoch": 0.03882885975461434, "grad_norm": 0.044819217175245285, "learning_rate": 0.0002, "loss": 0.0228, "step": 21350 }, { "epoch": 0.03884704657417153, "grad_norm": 0.2267816811800003, "learning_rate": 0.0002, "loss": 0.1462, "step": 21360 }, { "epoch": 0.03886523339372873, "grad_norm": 0.10087496787309647, "learning_rate": 0.0002, "loss": 0.0766, "step": 21370 }, { "epoch": 0.038883420213285926, "grad_norm": 0.09982341527938843, "learning_rate": 0.0002, "loss": 0.0798, "step": 21380 }, { "epoch": 0.03890160703284312, "grad_norm": 0.21729151904582977, "learning_rate": 0.0002, "loss": 0.0586, "step": 21390 }, { "epoch": 0.03891979385240032, "grad_norm": 0.020691821351647377, "learning_rate": 0.0002, "loss": 0.0175, "step": 21400 }, { "epoch": 0.038937980671957514, "grad_norm": 0.33531665802001953, "learning_rate": 0.0002, "loss": 0.149, "step": 21410 }, { "epoch": 0.03895616749151471, "grad_norm": 0.11777795851230621, "learning_rate": 0.0002, "loss": 0.0736, "step": 21420 }, { "epoch": 0.03897435431107191, "grad_norm": 0.07860718667507172, "learning_rate": 0.0002, "loss": 0.0788, "step": 21430 }, { "epoch": 0.0389925411306291, "grad_norm": 0.16030597686767578, "learning_rate": 0.0002, "loss": 0.0581, "step": 21440 }, { "epoch": 0.0390107279501863, "grad_norm": 0.01747356541454792, "learning_rate": 0.0002, "loss": 0.0185, "step": 21450 }, { "epoch": 0.039028914769743496, "grad_norm": 0.2313859909772873, "learning_rate": 0.0002, "loss": 0.1383, "step": 21460 }, { "epoch": 0.03904710158930069, "grad_norm": 0.14510080218315125, "learning_rate": 0.0002, "loss": 0.0805, "step": 21470 }, { "epoch": 0.03906528840885789, "grad_norm": 0.04511871561408043, "learning_rate": 0.0002, "loss": 0.0793, "step": 21480 }, { "epoch": 0.039083475228415085, "grad_norm": 0.24205265939235687, "learning_rate": 0.0002, "loss": 0.0624, "step": 21490 }, { "epoch": 0.03910166204797228, "grad_norm": 0.08096791058778763, "learning_rate": 0.0002, "loss": 0.0208, "step": 21500 }, { "epoch": 0.03911984886752948, "grad_norm": 0.14405490458011627, "learning_rate": 0.0002, "loss": 0.1189, "step": 21510 }, { "epoch": 0.039138035687086674, "grad_norm": 0.06753374636173248, "learning_rate": 0.0002, "loss": 0.0772, "step": 21520 }, { "epoch": 0.03915622250664387, "grad_norm": 0.029025042429566383, "learning_rate": 0.0002, "loss": 0.0761, "step": 21530 }, { "epoch": 0.039174409326201066, "grad_norm": 0.2987070381641388, "learning_rate": 0.0002, "loss": 0.0656, "step": 21540 }, { "epoch": 0.03919259614575826, "grad_norm": 0.04445091262459755, "learning_rate": 0.0002, "loss": 0.0241, "step": 21550 }, { "epoch": 0.03921078296531546, "grad_norm": 0.34976306557655334, "learning_rate": 0.0002, "loss": 0.138, "step": 21560 }, { "epoch": 0.039228969784872655, "grad_norm": 0.07521916925907135, "learning_rate": 0.0002, "loss": 0.0774, "step": 21570 }, { "epoch": 0.03924715660442985, "grad_norm": 0.1445412039756775, "learning_rate": 0.0002, "loss": 0.087, "step": 21580 }, { "epoch": 0.03926534342398705, "grad_norm": 0.2688128352165222, "learning_rate": 0.0002, "loss": 0.0712, "step": 21590 }, { "epoch": 0.039283530243544244, "grad_norm": 0.05321233719587326, "learning_rate": 0.0002, "loss": 0.0245, "step": 21600 }, { "epoch": 0.03930171706310144, "grad_norm": 0.44459134340286255, "learning_rate": 0.0002, "loss": 0.1524, "step": 21610 }, { "epoch": 0.03931990388265864, "grad_norm": 0.13169553875923157, "learning_rate": 0.0002, "loss": 0.0726, "step": 21620 }, { "epoch": 0.03933809070221583, "grad_norm": 0.0908237174153328, "learning_rate": 0.0002, "loss": 0.0785, "step": 21630 }, { "epoch": 0.03935627752177303, "grad_norm": 0.18110623955726624, "learning_rate": 0.0002, "loss": 0.0606, "step": 21640 }, { "epoch": 0.03937446434133023, "grad_norm": 0.021362677216529846, "learning_rate": 0.0002, "loss": 0.0175, "step": 21650 }, { "epoch": 0.03939265116088743, "grad_norm": 0.27973899245262146, "learning_rate": 0.0002, "loss": 0.1641, "step": 21660 }, { "epoch": 0.039410837980444625, "grad_norm": 0.09090718626976013, "learning_rate": 0.0002, "loss": 0.073, "step": 21670 }, { "epoch": 0.03942902480000182, "grad_norm": 0.13408254086971283, "learning_rate": 0.0002, "loss": 0.0769, "step": 21680 }, { "epoch": 0.03944721161955902, "grad_norm": 0.2530055046081543, "learning_rate": 0.0002, "loss": 0.0729, "step": 21690 }, { "epoch": 0.039465398439116214, "grad_norm": 0.027523871511220932, "learning_rate": 0.0002, "loss": 0.017, "step": 21700 }, { "epoch": 0.03948358525867341, "grad_norm": 0.2520642578601837, "learning_rate": 0.0002, "loss": 0.1804, "step": 21710 }, { "epoch": 0.03950177207823061, "grad_norm": 0.11017465591430664, "learning_rate": 0.0002, "loss": 0.0767, "step": 21720 }, { "epoch": 0.0395199588977878, "grad_norm": 0.05129052326083183, "learning_rate": 0.0002, "loss": 0.0723, "step": 21730 }, { "epoch": 0.039538145717345, "grad_norm": 0.1846659779548645, "learning_rate": 0.0002, "loss": 0.0619, "step": 21740 }, { "epoch": 0.039556332536902196, "grad_norm": 0.014305013231933117, "learning_rate": 0.0002, "loss": 0.0171, "step": 21750 }, { "epoch": 0.03957451935645939, "grad_norm": 0.21667814254760742, "learning_rate": 0.0002, "loss": 0.157, "step": 21760 }, { "epoch": 0.03959270617601659, "grad_norm": 0.21456903219223022, "learning_rate": 0.0002, "loss": 0.0803, "step": 21770 }, { "epoch": 0.039610892995573785, "grad_norm": 0.03621416166424751, "learning_rate": 0.0002, "loss": 0.0796, "step": 21780 }, { "epoch": 0.03962907981513098, "grad_norm": 0.20819205045700073, "learning_rate": 0.0002, "loss": 0.0633, "step": 21790 }, { "epoch": 0.03964726663468818, "grad_norm": 0.06860963255167007, "learning_rate": 0.0002, "loss": 0.0172, "step": 21800 }, { "epoch": 0.039665453454245374, "grad_norm": 0.2568039894104004, "learning_rate": 0.0002, "loss": 0.134, "step": 21810 }, { "epoch": 0.03968364027380257, "grad_norm": 0.08747372031211853, "learning_rate": 0.0002, "loss": 0.0753, "step": 21820 }, { "epoch": 0.039701827093359766, "grad_norm": 0.13403570652008057, "learning_rate": 0.0002, "loss": 0.0807, "step": 21830 }, { "epoch": 0.03972001391291696, "grad_norm": 0.20756667852401733, "learning_rate": 0.0002, "loss": 0.0625, "step": 21840 }, { "epoch": 0.03973820073247416, "grad_norm": 0.03678170591592789, "learning_rate": 0.0002, "loss": 0.019, "step": 21850 }, { "epoch": 0.039756387552031355, "grad_norm": 0.1847693920135498, "learning_rate": 0.0002, "loss": 0.1385, "step": 21860 }, { "epoch": 0.03977457437158855, "grad_norm": 0.1627635508775711, "learning_rate": 0.0002, "loss": 0.0765, "step": 21870 }, { "epoch": 0.03979276119114575, "grad_norm": 0.0535571426153183, "learning_rate": 0.0002, "loss": 0.0741, "step": 21880 }, { "epoch": 0.039810948010702944, "grad_norm": 0.3128276765346527, "learning_rate": 0.0002, "loss": 0.0598, "step": 21890 }, { "epoch": 0.03982913483026014, "grad_norm": 0.03369860351085663, "learning_rate": 0.0002, "loss": 0.0217, "step": 21900 }, { "epoch": 0.03984732164981734, "grad_norm": 0.1962599903345108, "learning_rate": 0.0002, "loss": 0.1319, "step": 21910 }, { "epoch": 0.03986550846937453, "grad_norm": 0.1397421509027481, "learning_rate": 0.0002, "loss": 0.068, "step": 21920 }, { "epoch": 0.03988369528893173, "grad_norm": 0.10252605378627777, "learning_rate": 0.0002, "loss": 0.0736, "step": 21930 }, { "epoch": 0.039901882108488926, "grad_norm": 0.22179432213306427, "learning_rate": 0.0002, "loss": 0.0625, "step": 21940 }, { "epoch": 0.03992006892804612, "grad_norm": 0.06068069487810135, "learning_rate": 0.0002, "loss": 0.0242, "step": 21950 }, { "epoch": 0.03993825574760332, "grad_norm": 0.20243950188159943, "learning_rate": 0.0002, "loss": 0.143, "step": 21960 }, { "epoch": 0.039956442567160515, "grad_norm": 0.11786511540412903, "learning_rate": 0.0002, "loss": 0.0779, "step": 21970 }, { "epoch": 0.03997462938671771, "grad_norm": 0.08299421519041061, "learning_rate": 0.0002, "loss": 0.0774, "step": 21980 }, { "epoch": 0.03999281620627491, "grad_norm": 0.2844075858592987, "learning_rate": 0.0002, "loss": 0.0711, "step": 21990 }, { "epoch": 0.040011003025832104, "grad_norm": 0.034433312714099884, "learning_rate": 0.0002, "loss": 0.0217, "step": 22000 }, { "epoch": 0.0400291898453893, "grad_norm": 0.3878481388092041, "learning_rate": 0.0002, "loss": 0.1525, "step": 22010 }, { "epoch": 0.040047376664946496, "grad_norm": 0.16157971322536469, "learning_rate": 0.0002, "loss": 0.0788, "step": 22020 }, { "epoch": 0.04006556348450369, "grad_norm": 0.10347063094377518, "learning_rate": 0.0002, "loss": 0.0809, "step": 22030 }, { "epoch": 0.04008375030406089, "grad_norm": 0.20982638001441956, "learning_rate": 0.0002, "loss": 0.0662, "step": 22040 }, { "epoch": 0.040101937123618085, "grad_norm": 5.856126308441162, "learning_rate": 0.0002, "loss": 0.0578, "step": 22050 }, { "epoch": 0.04012012394317528, "grad_norm": 0.21289357542991638, "learning_rate": 0.0002, "loss": 0.1257, "step": 22060 }, { "epoch": 0.04013831076273248, "grad_norm": 0.040848907083272934, "learning_rate": 0.0002, "loss": 0.0783, "step": 22070 }, { "epoch": 0.040156497582289674, "grad_norm": 0.056517478078603745, "learning_rate": 0.0002, "loss": 0.0693, "step": 22080 }, { "epoch": 0.04017468440184687, "grad_norm": 0.274312287569046, "learning_rate": 0.0002, "loss": 0.0685, "step": 22090 }, { "epoch": 0.04019287122140407, "grad_norm": 0.06353340297937393, "learning_rate": 0.0002, "loss": 0.0263, "step": 22100 }, { "epoch": 0.04021105804096126, "grad_norm": 0.287201464176178, "learning_rate": 0.0002, "loss": 0.1425, "step": 22110 }, { "epoch": 0.04022924486051846, "grad_norm": 0.0990116223692894, "learning_rate": 0.0002, "loss": 0.0732, "step": 22120 }, { "epoch": 0.040247431680075656, "grad_norm": 0.03471527248620987, "learning_rate": 0.0002, "loss": 0.0806, "step": 22130 }, { "epoch": 0.04026561849963285, "grad_norm": 0.16411902010440826, "learning_rate": 0.0002, "loss": 0.0646, "step": 22140 }, { "epoch": 0.04028380531919005, "grad_norm": 0.032927367836236954, "learning_rate": 0.0002, "loss": 0.0225, "step": 22150 }, { "epoch": 0.040301992138747245, "grad_norm": 0.31128716468811035, "learning_rate": 0.0002, "loss": 0.1227, "step": 22160 }, { "epoch": 0.04032017895830444, "grad_norm": 0.14056596159934998, "learning_rate": 0.0002, "loss": 0.0866, "step": 22170 }, { "epoch": 0.04033836577786164, "grad_norm": 0.10555677115917206, "learning_rate": 0.0002, "loss": 0.0785, "step": 22180 }, { "epoch": 0.040356552597418834, "grad_norm": 0.25597816705703735, "learning_rate": 0.0002, "loss": 0.0667, "step": 22190 }, { "epoch": 0.04037473941697603, "grad_norm": 0.04694845899939537, "learning_rate": 0.0002, "loss": 0.021, "step": 22200 }, { "epoch": 0.040392926236533226, "grad_norm": 0.2536766529083252, "learning_rate": 0.0002, "loss": 0.1485, "step": 22210 }, { "epoch": 0.04041111305609042, "grad_norm": 0.0536673367023468, "learning_rate": 0.0002, "loss": 0.0737, "step": 22220 }, { "epoch": 0.04042929987564762, "grad_norm": 0.13121111690998077, "learning_rate": 0.0002, "loss": 0.0793, "step": 22230 }, { "epoch": 0.040447486695204815, "grad_norm": 0.23850645124912262, "learning_rate": 0.0002, "loss": 0.0698, "step": 22240 }, { "epoch": 0.04046567351476201, "grad_norm": 0.04178560525178909, "learning_rate": 0.0002, "loss": 0.0212, "step": 22250 }, { "epoch": 0.04048386033431921, "grad_norm": 0.42834579944610596, "learning_rate": 0.0002, "loss": 0.1352, "step": 22260 }, { "epoch": 0.040502047153876404, "grad_norm": 0.050178542733192444, "learning_rate": 0.0002, "loss": 0.0853, "step": 22270 }, { "epoch": 0.0405202339734336, "grad_norm": 0.042758237570524216, "learning_rate": 0.0002, "loss": 0.0709, "step": 22280 }, { "epoch": 0.0405384207929908, "grad_norm": 0.2604416012763977, "learning_rate": 0.0002, "loss": 0.0643, "step": 22290 }, { "epoch": 0.04055660761254799, "grad_norm": 0.06166388466954231, "learning_rate": 0.0002, "loss": 0.0236, "step": 22300 }, { "epoch": 0.04057479443210519, "grad_norm": 0.2337518334388733, "learning_rate": 0.0002, "loss": 0.132, "step": 22310 }, { "epoch": 0.040592981251662386, "grad_norm": 0.15794694423675537, "learning_rate": 0.0002, "loss": 0.0739, "step": 22320 }, { "epoch": 0.04061116807121959, "grad_norm": 0.12059915065765381, "learning_rate": 0.0002, "loss": 0.0743, "step": 22330 }, { "epoch": 0.040629354890776785, "grad_norm": 0.25351977348327637, "learning_rate": 0.0002, "loss": 0.065, "step": 22340 }, { "epoch": 0.04064754171033398, "grad_norm": 0.03265364468097687, "learning_rate": 0.0002, "loss": 0.02, "step": 22350 }, { "epoch": 0.04066572852989118, "grad_norm": 0.22959749400615692, "learning_rate": 0.0002, "loss": 0.1278, "step": 22360 }, { "epoch": 0.040683915349448374, "grad_norm": 0.11381889134645462, "learning_rate": 0.0002, "loss": 0.0823, "step": 22370 }, { "epoch": 0.04070210216900557, "grad_norm": 0.03541165217757225, "learning_rate": 0.0002, "loss": 0.0809, "step": 22380 }, { "epoch": 0.04072028898856277, "grad_norm": 0.20604047179222107, "learning_rate": 0.0002, "loss": 0.0693, "step": 22390 }, { "epoch": 0.04073847580811996, "grad_norm": 0.051576532423496246, "learning_rate": 0.0002, "loss": 0.0213, "step": 22400 }, { "epoch": 0.04075666262767716, "grad_norm": 0.208265483379364, "learning_rate": 0.0002, "loss": 0.1203, "step": 22410 }, { "epoch": 0.040774849447234356, "grad_norm": 0.14376410841941833, "learning_rate": 0.0002, "loss": 0.0832, "step": 22420 }, { "epoch": 0.04079303626679155, "grad_norm": 0.0634629875421524, "learning_rate": 0.0002, "loss": 0.0797, "step": 22430 }, { "epoch": 0.04081122308634875, "grad_norm": 0.22782418131828308, "learning_rate": 0.0002, "loss": 0.0594, "step": 22440 }, { "epoch": 0.040829409905905945, "grad_norm": 0.034153662621974945, "learning_rate": 0.0002, "loss": 0.0197, "step": 22450 }, { "epoch": 0.04084759672546314, "grad_norm": 0.22994177043437958, "learning_rate": 0.0002, "loss": 0.1276, "step": 22460 }, { "epoch": 0.04086578354502034, "grad_norm": 0.37397289276123047, "learning_rate": 0.0002, "loss": 0.0794, "step": 22470 }, { "epoch": 0.040883970364577533, "grad_norm": 0.03585643321275711, "learning_rate": 0.0002, "loss": 0.0765, "step": 22480 }, { "epoch": 0.04090215718413473, "grad_norm": 0.2266087681055069, "learning_rate": 0.0002, "loss": 0.0661, "step": 22490 }, { "epoch": 0.040920344003691926, "grad_norm": 0.03867397829890251, "learning_rate": 0.0002, "loss": 0.0241, "step": 22500 }, { "epoch": 0.04093853082324912, "grad_norm": 0.23483702540397644, "learning_rate": 0.0002, "loss": 0.1442, "step": 22510 }, { "epoch": 0.04095671764280632, "grad_norm": 0.11447428911924362, "learning_rate": 0.0002, "loss": 0.0759, "step": 22520 }, { "epoch": 0.040974904462363515, "grad_norm": 0.1060417965054512, "learning_rate": 0.0002, "loss": 0.0792, "step": 22530 }, { "epoch": 0.04099309128192071, "grad_norm": 0.1915966123342514, "learning_rate": 0.0002, "loss": 0.0679, "step": 22540 }, { "epoch": 0.04101127810147791, "grad_norm": 0.05328527092933655, "learning_rate": 0.0002, "loss": 0.0211, "step": 22550 }, { "epoch": 0.041029464921035104, "grad_norm": 0.31612515449523926, "learning_rate": 0.0002, "loss": 0.1395, "step": 22560 }, { "epoch": 0.0410476517405923, "grad_norm": 0.1860841065645218, "learning_rate": 0.0002, "loss": 0.0704, "step": 22570 }, { "epoch": 0.0410658385601495, "grad_norm": 0.11183702945709229, "learning_rate": 0.0002, "loss": 0.073, "step": 22580 }, { "epoch": 0.04108402537970669, "grad_norm": 0.2028307020664215, "learning_rate": 0.0002, "loss": 0.0592, "step": 22590 }, { "epoch": 0.04110221219926389, "grad_norm": 0.032915905117988586, "learning_rate": 0.0002, "loss": 0.0211, "step": 22600 }, { "epoch": 0.041120399018821086, "grad_norm": 0.2932131588459015, "learning_rate": 0.0002, "loss": 0.1542, "step": 22610 }, { "epoch": 0.04113858583837828, "grad_norm": 0.08883325010538101, "learning_rate": 0.0002, "loss": 0.079, "step": 22620 }, { "epoch": 0.04115677265793548, "grad_norm": 0.07874555885791779, "learning_rate": 0.0002, "loss": 0.0801, "step": 22630 }, { "epoch": 0.041174959477492674, "grad_norm": 0.13785040378570557, "learning_rate": 0.0002, "loss": 0.0636, "step": 22640 }, { "epoch": 0.04119314629704987, "grad_norm": 0.0321812778711319, "learning_rate": 0.0002, "loss": 0.0208, "step": 22650 }, { "epoch": 0.04121133311660707, "grad_norm": 0.142785906791687, "learning_rate": 0.0002, "loss": 0.1292, "step": 22660 }, { "epoch": 0.04122951993616426, "grad_norm": 0.15572668612003326, "learning_rate": 0.0002, "loss": 0.0774, "step": 22670 }, { "epoch": 0.04124770675572146, "grad_norm": 0.033191781491041183, "learning_rate": 0.0002, "loss": 0.0805, "step": 22680 }, { "epoch": 0.041265893575278656, "grad_norm": 0.23840776085853577, "learning_rate": 0.0002, "loss": 0.06, "step": 22690 }, { "epoch": 0.04128408039483585, "grad_norm": 0.05943412706255913, "learning_rate": 0.0002, "loss": 0.0215, "step": 22700 }, { "epoch": 0.04130226721439305, "grad_norm": 0.05142183229327202, "learning_rate": 0.0002, "loss": 0.1181, "step": 22710 }, { "epoch": 0.041320454033950245, "grad_norm": 0.1583058387041092, "learning_rate": 0.0002, "loss": 0.0822, "step": 22720 }, { "epoch": 0.04133864085350744, "grad_norm": 0.035809941589832306, "learning_rate": 0.0002, "loss": 0.0723, "step": 22730 }, { "epoch": 0.04135682767306464, "grad_norm": 0.24066607654094696, "learning_rate": 0.0002, "loss": 0.0621, "step": 22740 }, { "epoch": 0.041375014492621834, "grad_norm": 0.0327225998044014, "learning_rate": 0.0002, "loss": 0.0264, "step": 22750 }, { "epoch": 0.04139320131217903, "grad_norm": 0.16599033772945404, "learning_rate": 0.0002, "loss": 0.1082, "step": 22760 }, { "epoch": 0.041411388131736226, "grad_norm": 0.18834830820560455, "learning_rate": 0.0002, "loss": 0.0767, "step": 22770 }, { "epoch": 0.04142957495129342, "grad_norm": 0.04162973538041115, "learning_rate": 0.0002, "loss": 0.0821, "step": 22780 }, { "epoch": 0.04144776177085062, "grad_norm": 0.21065399050712585, "learning_rate": 0.0002, "loss": 0.0591, "step": 22790 }, { "epoch": 0.041465948590407815, "grad_norm": 0.03744394704699516, "learning_rate": 0.0002, "loss": 0.0199, "step": 22800 }, { "epoch": 0.04148413540996501, "grad_norm": 0.30440911650657654, "learning_rate": 0.0002, "loss": 0.1321, "step": 22810 }, { "epoch": 0.04150232222952221, "grad_norm": 0.07215052098035812, "learning_rate": 0.0002, "loss": 0.077, "step": 22820 }, { "epoch": 0.041520509049079404, "grad_norm": 0.0822744220495224, "learning_rate": 0.0002, "loss": 0.0695, "step": 22830 }, { "epoch": 0.0415386958686366, "grad_norm": 0.20610104501247406, "learning_rate": 0.0002, "loss": 0.0668, "step": 22840 }, { "epoch": 0.0415568826881938, "grad_norm": 0.05089128017425537, "learning_rate": 0.0002, "loss": 0.0275, "step": 22850 }, { "epoch": 0.04157506950775099, "grad_norm": 0.23365797102451324, "learning_rate": 0.0002, "loss": 0.1308, "step": 22860 }, { "epoch": 0.04159325632730819, "grad_norm": 0.03983612358570099, "learning_rate": 0.0002, "loss": 0.0738, "step": 22870 }, { "epoch": 0.041611443146865386, "grad_norm": 0.12472117692232132, "learning_rate": 0.0002, "loss": 0.082, "step": 22880 }, { "epoch": 0.04162962996642258, "grad_norm": 0.19599118828773499, "learning_rate": 0.0002, "loss": 0.0614, "step": 22890 }, { "epoch": 0.04164781678597978, "grad_norm": 0.04077763110399246, "learning_rate": 0.0002, "loss": 0.0298, "step": 22900 }, { "epoch": 0.041666003605536975, "grad_norm": 0.3027828633785248, "learning_rate": 0.0002, "loss": 0.1294, "step": 22910 }, { "epoch": 0.04168419042509417, "grad_norm": 0.1551598757505417, "learning_rate": 0.0002, "loss": 0.0716, "step": 22920 }, { "epoch": 0.04170237724465137, "grad_norm": 0.06512947380542755, "learning_rate": 0.0002, "loss": 0.071, "step": 22930 }, { "epoch": 0.041720564064208564, "grad_norm": 0.2486017346382141, "learning_rate": 0.0002, "loss": 0.0726, "step": 22940 }, { "epoch": 0.04173875088376576, "grad_norm": 0.0658118799328804, "learning_rate": 0.0002, "loss": 0.0211, "step": 22950 }, { "epoch": 0.041756937703322956, "grad_norm": 0.18327641487121582, "learning_rate": 0.0002, "loss": 0.1307, "step": 22960 }, { "epoch": 0.04177512452288015, "grad_norm": 0.06218123063445091, "learning_rate": 0.0002, "loss": 0.0839, "step": 22970 }, { "epoch": 0.04179331134243735, "grad_norm": 0.07085203379392624, "learning_rate": 0.0002, "loss": 0.0786, "step": 22980 }, { "epoch": 0.041811498161994545, "grad_norm": 0.19552426040172577, "learning_rate": 0.0002, "loss": 0.0646, "step": 22990 }, { "epoch": 0.04182968498155174, "grad_norm": 0.06710335612297058, "learning_rate": 0.0002, "loss": 0.0252, "step": 23000 } ], "logging_steps": 10, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0398767809662812e+19, "train_batch_size": 24, "trial_name": null, "trial_params": null }