diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16133 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.04182968498155174, + "eval_steps": 500, + "global_step": 23000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.818681955719641e-05, + "grad_norm": 2.1063554286956787, + "learning_rate": 0.0002, + "loss": 1.9357, + "step": 10 + }, + { + "epoch": 3.637363911439282e-05, + "grad_norm": 0.9359453320503235, + "learning_rate": 0.0002, + "loss": 0.2208, + "step": 20 + }, + { + "epoch": 5.4560458671589234e-05, + "grad_norm": 0.5420117378234863, + "learning_rate": 0.0002, + "loss": 0.1459, + "step": 30 + }, + { + "epoch": 7.274727822878565e-05, + "grad_norm": 0.05442357063293457, + "learning_rate": 0.0002, + "loss": 0.0823, + "step": 40 + }, + { + "epoch": 9.093409778598205e-05, + "grad_norm": 0.0005907653248868883, + "learning_rate": 0.0002, + "loss": 0.0005, + "step": 50 + }, + { + "epoch": 0.00010912091734317847, + "grad_norm": 0.26516178250312805, + "learning_rate": 0.0002, + "loss": 1.0686, + "step": 60 + }, + { + "epoch": 0.00012730773690037487, + "grad_norm": 0.44067099690437317, + "learning_rate": 0.0002, + "loss": 0.2613, + "step": 70 + }, + { + "epoch": 0.0001454945564575713, + "grad_norm": 0.09356075525283813, + "learning_rate": 0.0002, + "loss": 0.1415, + "step": 80 + }, + { + "epoch": 0.0001636813760147677, + "grad_norm": 0.017799921333789825, + "learning_rate": 0.0002, + "loss": 0.1013, + "step": 90 + }, + { + "epoch": 0.0001818681955719641, + "grad_norm": 0.0018534553237259388, + "learning_rate": 0.0002, + "loss": 0.0001, + "step": 100 + }, + { + "epoch": 0.00020005501512916052, + "grad_norm": 0.35472020506858826, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 110 + }, + { + "epoch": 0.00021824183468635694, + "grad_norm": 0.3880878686904907, + "learning_rate": 0.0002, + "loss": 0.1424, + "step": 120 + }, + { + "epoch": 0.00023642865424355333, + "grad_norm": 0.19027432799339294, + "learning_rate": 0.0002, + "loss": 0.1173, + "step": 130 + }, + { + "epoch": 0.00025461547380074975, + "grad_norm": 0.019047321751713753, + "learning_rate": 0.0002, + "loss": 0.0977, + "step": 140 + }, + { + "epoch": 0.00027280229335794617, + "grad_norm": 0.0003795044613070786, + "learning_rate": 0.0002, + "loss": 0.0007, + "step": 150 + }, + { + "epoch": 0.0002909891129151426, + "grad_norm": 0.08740618824958801, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 160 + }, + { + "epoch": 0.000309175932472339, + "grad_norm": 0.2661634087562561, + "learning_rate": 0.0002, + "loss": 0.1274, + "step": 170 + }, + { + "epoch": 0.0003273627520295354, + "grad_norm": 0.05828547850251198, + "learning_rate": 0.0002, + "loss": 0.1184, + "step": 180 + }, + { + "epoch": 0.0003455495715867318, + "grad_norm": 0.02175055630505085, + "learning_rate": 0.0002, + "loss": 0.0752, + "step": 190 + }, + { + "epoch": 0.0003637363911439282, + "grad_norm": 0.0009504792396910489, + "learning_rate": 0.0002, + "loss": 0.0005, + "step": 200 + }, + { + "epoch": 0.0003819232107011246, + "grad_norm": 0.25059741735458374, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 210 + }, + { + "epoch": 0.00040011003025832104, + "grad_norm": 0.13256193697452545, + "learning_rate": 0.0002, + "loss": 0.1014, + "step": 220 + }, + { + "epoch": 0.00041829684981551746, + "grad_norm": 0.09446375072002411, + "learning_rate": 0.0002, + "loss": 0.0896, + "step": 230 + }, + { + "epoch": 0.0004364836693727139, + "grad_norm": 0.019389621913433075, + "learning_rate": 0.0002, + "loss": 0.0726, + "step": 240 + }, + { + "epoch": 0.0004546704889299103, + "grad_norm": 0.0032304900232702494, + "learning_rate": 0.0002, + "loss": 0.0023, + "step": 250 + }, + { + "epoch": 0.00047285730848710666, + "grad_norm": 2.5549609661102295, + "learning_rate": 0.0002, + "loss": 0.3884, + "step": 260 + }, + { + "epoch": 0.0004910441280443031, + "grad_norm": 0.44937047362327576, + "learning_rate": 0.0002, + "loss": 0.1071, + "step": 270 + }, + { + "epoch": 0.0005092309476014995, + "grad_norm": 0.1509999781847, + "learning_rate": 0.0002, + "loss": 0.0979, + "step": 280 + }, + { + "epoch": 0.0005274177671586959, + "grad_norm": 0.006468054372817278, + "learning_rate": 0.0002, + "loss": 0.0611, + "step": 290 + }, + { + "epoch": 0.0005456045867158923, + "grad_norm": 0.0002916739322245121, + "learning_rate": 0.0002, + "loss": 0.001, + "step": 300 + }, + { + "epoch": 0.0005637914062730887, + "grad_norm": 0.23081810772418976, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 310 + }, + { + "epoch": 0.0005819782258302852, + "grad_norm": 0.22755394876003265, + "learning_rate": 0.0002, + "loss": 0.114, + "step": 320 + }, + { + "epoch": 0.0006001650453874816, + "grad_norm": 0.49973106384277344, + "learning_rate": 0.0002, + "loss": 0.093, + "step": 330 + }, + { + "epoch": 0.000618351864944678, + "grad_norm": 0.08789435774087906, + "learning_rate": 0.0002, + "loss": 0.0745, + "step": 340 + }, + { + "epoch": 0.0006365386845018744, + "grad_norm": 0.0058497479185462, + "learning_rate": 0.0002, + "loss": 0.0007, + "step": 350 + }, + { + "epoch": 0.0006547255040590708, + "grad_norm": 0.30569636821746826, + "learning_rate": 0.0002, + "loss": 0.5169, + "step": 360 + }, + { + "epoch": 0.0006729123236162671, + "grad_norm": 0.2783024311065674, + "learning_rate": 0.0002, + "loss": 0.13, + "step": 370 + }, + { + "epoch": 0.0006910991431734636, + "grad_norm": 0.13052967190742493, + "learning_rate": 0.0002, + "loss": 0.0907, + "step": 380 + }, + { + "epoch": 0.00070928596273066, + "grad_norm": 0.15066476166248322, + "learning_rate": 0.0002, + "loss": 0.0996, + "step": 390 + }, + { + "epoch": 0.0007274727822878564, + "grad_norm": 0.0005865198327228427, + "learning_rate": 0.0002, + "loss": 0.0021, + "step": 400 + }, + { + "epoch": 0.0007456596018450528, + "grad_norm": 0.31872233748435974, + "learning_rate": 0.0002, + "loss": 0.4507, + "step": 410 + }, + { + "epoch": 0.0007638464214022492, + "grad_norm": 0.08874880522489548, + "learning_rate": 0.0002, + "loss": 0.136, + "step": 420 + }, + { + "epoch": 0.0007820332409594457, + "grad_norm": 0.10985178500413895, + "learning_rate": 0.0002, + "loss": 0.0992, + "step": 430 + }, + { + "epoch": 0.0008002200605166421, + "grad_norm": 0.10776215046644211, + "learning_rate": 0.0002, + "loss": 0.0661, + "step": 440 + }, + { + "epoch": 0.0008184068800738385, + "grad_norm": 0.006612936966121197, + "learning_rate": 0.0002, + "loss": 0.0009, + "step": 450 + }, + { + "epoch": 0.0008365936996310349, + "grad_norm": 0.2757071256637573, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 460 + }, + { + "epoch": 0.0008547805191882313, + "grad_norm": 0.24748466908931732, + "learning_rate": 0.0002, + "loss": 0.1241, + "step": 470 + }, + { + "epoch": 0.0008729673387454278, + "grad_norm": 0.1035066694021225, + "learning_rate": 0.0002, + "loss": 0.1008, + "step": 480 + }, + { + "epoch": 0.0008911541583026242, + "grad_norm": 0.06515783071517944, + "learning_rate": 0.0002, + "loss": 0.0711, + "step": 490 + }, + { + "epoch": 0.0009093409778598206, + "grad_norm": 0.011224807240068913, + "learning_rate": 0.0002, + "loss": 0.0004, + "step": 500 + }, + { + "epoch": 0.000927527797417017, + "grad_norm": 0.2669332027435303, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 510 + }, + { + "epoch": 0.0009457146169742133, + "grad_norm": 0.26048392057418823, + "learning_rate": 0.0002, + "loss": 0.1259, + "step": 520 + }, + { + "epoch": 0.0009639014365314097, + "grad_norm": 0.22928836941719055, + "learning_rate": 0.0002, + "loss": 0.0956, + "step": 530 + }, + { + "epoch": 0.0009820882560886062, + "grad_norm": 0.084063321352005, + "learning_rate": 0.0002, + "loss": 0.0708, + "step": 540 + }, + { + "epoch": 0.0010002750756458027, + "grad_norm": 0.004612344317138195, + "learning_rate": 0.0002, + "loss": 0.0007, + "step": 550 + }, + { + "epoch": 0.001018461895202999, + "grad_norm": 0.3866584599018097, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 560 + }, + { + "epoch": 0.0010366487147601955, + "grad_norm": 0.32303065061569214, + "learning_rate": 0.0002, + "loss": 0.1001, + "step": 570 + }, + { + "epoch": 0.0010548355343173918, + "grad_norm": 0.09439560770988464, + "learning_rate": 0.0002, + "loss": 0.1051, + "step": 580 + }, + { + "epoch": 0.0010730223538745881, + "grad_norm": 0.028145521879196167, + "learning_rate": 0.0002, + "loss": 0.0638, + "step": 590 + }, + { + "epoch": 0.0010912091734317847, + "grad_norm": 0.00048497263924218714, + "learning_rate": 0.0002, + "loss": 0.002, + "step": 600 + }, + { + "epoch": 0.001109395992988981, + "grad_norm": 0.32391539216041565, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 610 + }, + { + "epoch": 0.0011275828125461775, + "grad_norm": 0.02977031283080578, + "learning_rate": 0.0002, + "loss": 0.1264, + "step": 620 + }, + { + "epoch": 0.0011457696321033738, + "grad_norm": 0.07332426309585571, + "learning_rate": 0.0002, + "loss": 0.1018, + "step": 630 + }, + { + "epoch": 0.0011639564516605703, + "grad_norm": 0.05653443560004234, + "learning_rate": 0.0002, + "loss": 0.0666, + "step": 640 + }, + { + "epoch": 0.0011821432712177666, + "grad_norm": 0.0010635281214490533, + "learning_rate": 0.0002, + "loss": 0.0009, + "step": 650 + }, + { + "epoch": 0.0012003300907749632, + "grad_norm": 0.04933600872755051, + "learning_rate": 0.0002, + "loss": 0.3902, + "step": 660 + }, + { + "epoch": 0.0012185169103321595, + "grad_norm": 0.14713574945926666, + "learning_rate": 0.0002, + "loss": 0.0905, + "step": 670 + }, + { + "epoch": 0.001236703729889356, + "grad_norm": 0.05463952198624611, + "learning_rate": 0.0002, + "loss": 0.0909, + "step": 680 + }, + { + "epoch": 0.0012548905494465523, + "grad_norm": 0.10299955308437347, + "learning_rate": 0.0002, + "loss": 0.07, + "step": 690 + }, + { + "epoch": 0.0012730773690037488, + "grad_norm": 0.022791124880313873, + "learning_rate": 0.0002, + "loss": 0.0027, + "step": 700 + }, + { + "epoch": 0.0012912641885609452, + "grad_norm": 0.27977490425109863, + "learning_rate": 0.0002, + "loss": 0.4421, + "step": 710 + }, + { + "epoch": 0.0013094510081181417, + "grad_norm": 0.2346329241991043, + "learning_rate": 0.0002, + "loss": 0.1263, + "step": 720 + }, + { + "epoch": 0.001327637827675338, + "grad_norm": 0.09294597059488297, + "learning_rate": 0.0002, + "loss": 0.096, + "step": 730 + }, + { + "epoch": 0.0013458246472325343, + "grad_norm": 0.10317150503396988, + "learning_rate": 0.0002, + "loss": 0.0727, + "step": 740 + }, + { + "epoch": 0.0013640114667897308, + "grad_norm": 0.001372635131701827, + "learning_rate": 0.0002, + "loss": 0.001, + "step": 750 + }, + { + "epoch": 0.0013821982863469271, + "grad_norm": 0.10563486814498901, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 760 + }, + { + "epoch": 0.0014003851059041237, + "grad_norm": 0.14429838955402374, + "learning_rate": 0.0002, + "loss": 0.1178, + "step": 770 + }, + { + "epoch": 0.00141857192546132, + "grad_norm": 0.0848163515329361, + "learning_rate": 0.0002, + "loss": 0.1008, + "step": 780 + }, + { + "epoch": 0.0014367587450185165, + "grad_norm": 0.07259710133075714, + "learning_rate": 0.0002, + "loss": 0.069, + "step": 790 + }, + { + "epoch": 0.0014549455645757128, + "grad_norm": 0.0019098519114777446, + "learning_rate": 0.0002, + "loss": 0.0023, + "step": 800 + }, + { + "epoch": 0.0014731323841329093, + "grad_norm": 0.2433256059885025, + "learning_rate": 0.0002, + "loss": 0.2937, + "step": 810 + }, + { + "epoch": 0.0014913192036901056, + "grad_norm": 0.04093409329652786, + "learning_rate": 0.0002, + "loss": 0.1133, + "step": 820 + }, + { + "epoch": 0.0015095060232473022, + "grad_norm": 0.0480966717004776, + "learning_rate": 0.0002, + "loss": 0.0969, + "step": 830 + }, + { + "epoch": 0.0015276928428044985, + "grad_norm": 0.14327965676784515, + "learning_rate": 0.0002, + "loss": 0.0866, + "step": 840 + }, + { + "epoch": 0.001545879662361695, + "grad_norm": 0.001585015095770359, + "learning_rate": 0.0002, + "loss": 0.0042, + "step": 850 + }, + { + "epoch": 0.0015640664819188913, + "grad_norm": 0.1842886209487915, + "learning_rate": 0.0002, + "loss": 0.3273, + "step": 860 + }, + { + "epoch": 0.0015822533014760878, + "grad_norm": 0.09671049565076828, + "learning_rate": 0.0002, + "loss": 0.1079, + "step": 870 + }, + { + "epoch": 0.0016004401210332842, + "grad_norm": 0.2730088233947754, + "learning_rate": 0.0002, + "loss": 0.1018, + "step": 880 + }, + { + "epoch": 0.0016186269405904805, + "grad_norm": 0.11702803522348404, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 890 + }, + { + "epoch": 0.001636813760147677, + "grad_norm": 0.004438066389411688, + "learning_rate": 0.0002, + "loss": 0.0033, + "step": 900 + }, + { + "epoch": 0.0016550005797048733, + "grad_norm": 0.18424616754055023, + "learning_rate": 0.0002, + "loss": 0.4028, + "step": 910 + }, + { + "epoch": 0.0016731873992620698, + "grad_norm": 0.12502820789813995, + "learning_rate": 0.0002, + "loss": 0.0979, + "step": 920 + }, + { + "epoch": 0.0016913742188192661, + "grad_norm": 0.05109328031539917, + "learning_rate": 0.0002, + "loss": 0.0889, + "step": 930 + }, + { + "epoch": 0.0017095610383764627, + "grad_norm": 0.18566183745861053, + "learning_rate": 0.0002, + "loss": 0.0833, + "step": 940 + }, + { + "epoch": 0.001727747857933659, + "grad_norm": 0.0012954511912539601, + "learning_rate": 0.0002, + "loss": 0.0029, + "step": 950 + }, + { + "epoch": 0.0017459346774908555, + "grad_norm": 0.06683014333248138, + "learning_rate": 0.0002, + "loss": 0.4614, + "step": 960 + }, + { + "epoch": 0.0017641214970480518, + "grad_norm": 0.27773013710975647, + "learning_rate": 0.0002, + "loss": 0.1131, + "step": 970 + }, + { + "epoch": 0.0017823083166052483, + "grad_norm": 0.1999790072441101, + "learning_rate": 0.0002, + "loss": 0.089, + "step": 980 + }, + { + "epoch": 0.0018004951361624446, + "grad_norm": 0.09625103324651718, + "learning_rate": 0.0002, + "loss": 0.0739, + "step": 990 + }, + { + "epoch": 0.0018186819557196412, + "grad_norm": 0.005470380187034607, + "learning_rate": 0.0002, + "loss": 0.0012, + "step": 1000 + }, + { + "epoch": 0.0018368687752768375, + "grad_norm": 0.038832616060972214, + "learning_rate": 0.0002, + "loss": 0.5521, + "step": 1010 + }, + { + "epoch": 0.001855055594834034, + "grad_norm": 0.1903093159198761, + "learning_rate": 0.0002, + "loss": 0.1237, + "step": 1020 + }, + { + "epoch": 0.0018732424143912303, + "grad_norm": 0.031102774664759636, + "learning_rate": 0.0002, + "loss": 0.0866, + "step": 1030 + }, + { + "epoch": 0.0018914292339484266, + "grad_norm": 0.043983202427625656, + "learning_rate": 0.0002, + "loss": 0.0611, + "step": 1040 + }, + { + "epoch": 0.0019096160535056232, + "grad_norm": 0.0002974902163259685, + "learning_rate": 0.0002, + "loss": 0.0035, + "step": 1050 + }, + { + "epoch": 0.0019278028730628195, + "grad_norm": 0.1936149299144745, + "learning_rate": 0.0002, + "loss": 0.3019, + "step": 1060 + }, + { + "epoch": 0.001945989692620016, + "grad_norm": 0.15767355263233185, + "learning_rate": 0.0002, + "loss": 0.108, + "step": 1070 + }, + { + "epoch": 0.0019641765121772123, + "grad_norm": 0.08244495838880539, + "learning_rate": 0.0002, + "loss": 0.091, + "step": 1080 + }, + { + "epoch": 0.0019823633317344086, + "grad_norm": 0.15848897397518158, + "learning_rate": 0.0002, + "loss": 0.0655, + "step": 1090 + }, + { + "epoch": 0.0020005501512916054, + "grad_norm": 0.0011951205087825656, + "learning_rate": 0.0002, + "loss": 0.0052, + "step": 1100 + }, + { + "epoch": 0.0020187369708488017, + "grad_norm": 0.13027112185955048, + "learning_rate": 0.0002, + "loss": 0.2943, + "step": 1110 + }, + { + "epoch": 0.002036923790405998, + "grad_norm": 0.19413979351520538, + "learning_rate": 0.0002, + "loss": 0.1329, + "step": 1120 + }, + { + "epoch": 0.0020551106099631943, + "grad_norm": 0.08515465259552002, + "learning_rate": 0.0002, + "loss": 0.0921, + "step": 1130 + }, + { + "epoch": 0.002073297429520391, + "grad_norm": 0.1244177296757698, + "learning_rate": 0.0002, + "loss": 0.0678, + "step": 1140 + }, + { + "epoch": 0.0020914842490775873, + "grad_norm": 0.0016714326338842511, + "learning_rate": 0.0002, + "loss": 0.0035, + "step": 1150 + }, + { + "epoch": 0.0021096710686347836, + "grad_norm": 0.24979737401008606, + "learning_rate": 0.0002, + "loss": 0.2643, + "step": 1160 + }, + { + "epoch": 0.00212785788819198, + "grad_norm": 0.14143353700637817, + "learning_rate": 0.0002, + "loss": 0.1037, + "step": 1170 + }, + { + "epoch": 0.0021460447077491763, + "grad_norm": 0.033794257789850235, + "learning_rate": 0.0002, + "loss": 0.087, + "step": 1180 + }, + { + "epoch": 0.002164231527306373, + "grad_norm": 0.11503162235021591, + "learning_rate": 0.0002, + "loss": 0.0659, + "step": 1190 + }, + { + "epoch": 0.0021824183468635693, + "grad_norm": 0.0014654065016657114, + "learning_rate": 0.0002, + "loss": 0.0056, + "step": 1200 + }, + { + "epoch": 0.0022006051664207656, + "grad_norm": 0.13292767107486725, + "learning_rate": 0.0002, + "loss": 0.2956, + "step": 1210 + }, + { + "epoch": 0.002218791985977962, + "grad_norm": 0.15238040685653687, + "learning_rate": 0.0002, + "loss": 0.1122, + "step": 1220 + }, + { + "epoch": 0.0022369788055351587, + "grad_norm": 0.045078523457050323, + "learning_rate": 0.0002, + "loss": 0.091, + "step": 1230 + }, + { + "epoch": 0.002255165625092355, + "grad_norm": 0.11438468098640442, + "learning_rate": 0.0002, + "loss": 0.0754, + "step": 1240 + }, + { + "epoch": 0.0022733524446495513, + "grad_norm": 0.001236733514815569, + "learning_rate": 0.0002, + "loss": 0.004, + "step": 1250 + }, + { + "epoch": 0.0022915392642067476, + "grad_norm": 0.23386552929878235, + "learning_rate": 0.0002, + "loss": 0.351, + "step": 1260 + }, + { + "epoch": 0.0023097260837639444, + "grad_norm": 0.030786139890551567, + "learning_rate": 0.0002, + "loss": 0.1074, + "step": 1270 + }, + { + "epoch": 0.0023279129033211407, + "grad_norm": 0.150347501039505, + "learning_rate": 0.0002, + "loss": 0.1064, + "step": 1280 + }, + { + "epoch": 0.002346099722878337, + "grad_norm": 0.1402382105588913, + "learning_rate": 0.0002, + "loss": 0.0675, + "step": 1290 + }, + { + "epoch": 0.0023642865424355333, + "grad_norm": 0.0006117303855717182, + "learning_rate": 0.0002, + "loss": 0.0031, + "step": 1300 + }, + { + "epoch": 0.00238247336199273, + "grad_norm": 0.16031372547149658, + "learning_rate": 0.0002, + "loss": 0.4344, + "step": 1310 + }, + { + "epoch": 0.0024006601815499263, + "grad_norm": 0.11017303168773651, + "learning_rate": 0.0002, + "loss": 0.1147, + "step": 1320 + }, + { + "epoch": 0.0024188470011071227, + "grad_norm": 0.055746905505657196, + "learning_rate": 0.0002, + "loss": 0.093, + "step": 1330 + }, + { + "epoch": 0.002437033820664319, + "grad_norm": 0.09806664288043976, + "learning_rate": 0.0002, + "loss": 0.0682, + "step": 1340 + }, + { + "epoch": 0.0024552206402215153, + "grad_norm": 0.000555588339921087, + "learning_rate": 0.0002, + "loss": 0.0045, + "step": 1350 + }, + { + "epoch": 0.002473407459778712, + "grad_norm": 0.04899182915687561, + "learning_rate": 0.0002, + "loss": 0.3454, + "step": 1360 + }, + { + "epoch": 0.0024915942793359083, + "grad_norm": 0.02870030514895916, + "learning_rate": 0.0002, + "loss": 0.1036, + "step": 1370 + }, + { + "epoch": 0.0025097810988931046, + "grad_norm": 0.08591730147600174, + "learning_rate": 0.0002, + "loss": 0.0962, + "step": 1380 + }, + { + "epoch": 0.002527967918450301, + "grad_norm": 0.1169242337346077, + "learning_rate": 0.0002, + "loss": 0.0627, + "step": 1390 + }, + { + "epoch": 0.0025461547380074977, + "grad_norm": 0.0008637752034701407, + "learning_rate": 0.0002, + "loss": 0.0025, + "step": 1400 + }, + { + "epoch": 0.002564341557564694, + "grad_norm": 0.11741841584444046, + "learning_rate": 0.0002, + "loss": 0.3703, + "step": 1410 + }, + { + "epoch": 0.0025825283771218903, + "grad_norm": 0.05232485383749008, + "learning_rate": 0.0002, + "loss": 0.1072, + "step": 1420 + }, + { + "epoch": 0.0026007151966790866, + "grad_norm": 0.025201110169291496, + "learning_rate": 0.0002, + "loss": 0.0893, + "step": 1430 + }, + { + "epoch": 0.0026189020162362834, + "grad_norm": 0.11462239921092987, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 1440 + }, + { + "epoch": 0.0026370888357934797, + "grad_norm": 0.002194227883592248, + "learning_rate": 0.0002, + "loss": 0.0049, + "step": 1450 + }, + { + "epoch": 0.002655275655350676, + "grad_norm": 0.05786404758691788, + "learning_rate": 0.0002, + "loss": 0.3187, + "step": 1460 + }, + { + "epoch": 0.0026734624749078723, + "grad_norm": 0.03776915743947029, + "learning_rate": 0.0002, + "loss": 0.1002, + "step": 1470 + }, + { + "epoch": 0.0026916492944650686, + "grad_norm": 0.08628734946250916, + "learning_rate": 0.0002, + "loss": 0.0933, + "step": 1480 + }, + { + "epoch": 0.0027098361140222653, + "grad_norm": 0.0933455228805542, + "learning_rate": 0.0002, + "loss": 0.0712, + "step": 1490 + }, + { + "epoch": 0.0027280229335794617, + "grad_norm": 0.0007446192903444171, + "learning_rate": 0.0002, + "loss": 0.003, + "step": 1500 + }, + { + "epoch": 0.002746209753136658, + "grad_norm": 0.04412281885743141, + "learning_rate": 0.0002, + "loss": 0.3738, + "step": 1510 + }, + { + "epoch": 0.0027643965726938543, + "grad_norm": 0.04729326814413071, + "learning_rate": 0.0002, + "loss": 0.1015, + "step": 1520 + }, + { + "epoch": 0.002782583392251051, + "grad_norm": 0.04822024703025818, + "learning_rate": 0.0002, + "loss": 0.0913, + "step": 1530 + }, + { + "epoch": 0.0028007702118082473, + "grad_norm": 0.15468090772628784, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 1540 + }, + { + "epoch": 0.0028189570313654436, + "grad_norm": 0.0011828596470877528, + "learning_rate": 0.0002, + "loss": 0.0089, + "step": 1550 + }, + { + "epoch": 0.00283714385092264, + "grad_norm": 0.030639037489891052, + "learning_rate": 0.0002, + "loss": 0.3382, + "step": 1560 + }, + { + "epoch": 0.0028553306704798367, + "grad_norm": 0.08429472148418427, + "learning_rate": 0.0002, + "loss": 0.1075, + "step": 1570 + }, + { + "epoch": 0.002873517490037033, + "grad_norm": 0.056431323289871216, + "learning_rate": 0.0002, + "loss": 0.0946, + "step": 1580 + }, + { + "epoch": 0.0028917043095942293, + "grad_norm": 0.1799512803554535, + "learning_rate": 0.0002, + "loss": 0.0795, + "step": 1590 + }, + { + "epoch": 0.0029098911291514256, + "grad_norm": 0.0018818675307556987, + "learning_rate": 0.0002, + "loss": 0.0082, + "step": 1600 + }, + { + "epoch": 0.002928077948708622, + "grad_norm": 0.061398155987262726, + "learning_rate": 0.0002, + "loss": 0.3414, + "step": 1610 + }, + { + "epoch": 0.0029462647682658187, + "grad_norm": 0.0657019093632698, + "learning_rate": 0.0002, + "loss": 0.1082, + "step": 1620 + }, + { + "epoch": 0.002964451587823015, + "grad_norm": 0.04701487720012665, + "learning_rate": 0.0002, + "loss": 0.0918, + "step": 1630 + }, + { + "epoch": 0.0029826384073802113, + "grad_norm": 0.1834430694580078, + "learning_rate": 0.0002, + "loss": 0.081, + "step": 1640 + }, + { + "epoch": 0.0030008252269374076, + "grad_norm": 0.004841644782572985, + "learning_rate": 0.0002, + "loss": 0.0138, + "step": 1650 + }, + { + "epoch": 0.0030190120464946043, + "grad_norm": 0.05793444439768791, + "learning_rate": 0.0002, + "loss": 0.2981, + "step": 1660 + }, + { + "epoch": 0.0030371988660518007, + "grad_norm": 0.049123138189315796, + "learning_rate": 0.0002, + "loss": 0.1072, + "step": 1670 + }, + { + "epoch": 0.003055385685608997, + "grad_norm": 0.033852141350507736, + "learning_rate": 0.0002, + "loss": 0.093, + "step": 1680 + }, + { + "epoch": 0.0030735725051661933, + "grad_norm": 0.16161279380321503, + "learning_rate": 0.0002, + "loss": 0.084, + "step": 1690 + }, + { + "epoch": 0.00309175932472339, + "grad_norm": 0.0011225020280107856, + "learning_rate": 0.0002, + "loss": 0.0059, + "step": 1700 + }, + { + "epoch": 0.0031099461442805863, + "grad_norm": 0.05849582701921463, + "learning_rate": 0.0002, + "loss": 0.3878, + "step": 1710 + }, + { + "epoch": 0.0031281329638377826, + "grad_norm": 0.033466637134552, + "learning_rate": 0.0002, + "loss": 0.1096, + "step": 1720 + }, + { + "epoch": 0.003146319783394979, + "grad_norm": 0.03488466143608093, + "learning_rate": 0.0002, + "loss": 0.0895, + "step": 1730 + }, + { + "epoch": 0.0031645066029521757, + "grad_norm": 0.15636079013347626, + "learning_rate": 0.0002, + "loss": 0.0716, + "step": 1740 + }, + { + "epoch": 0.003182693422509372, + "grad_norm": 0.001519509358331561, + "learning_rate": 0.0002, + "loss": 0.0062, + "step": 1750 + }, + { + "epoch": 0.0032008802420665683, + "grad_norm": 0.04979783296585083, + "learning_rate": 0.0002, + "loss": 0.3409, + "step": 1760 + }, + { + "epoch": 0.0032190670616237646, + "grad_norm": 0.09706272929906845, + "learning_rate": 0.0002, + "loss": 0.1052, + "step": 1770 + }, + { + "epoch": 0.003237253881180961, + "grad_norm": 0.08768483251333237, + "learning_rate": 0.0002, + "loss": 0.0938, + "step": 1780 + }, + { + "epoch": 0.0032554407007381577, + "grad_norm": 0.20421457290649414, + "learning_rate": 0.0002, + "loss": 0.085, + "step": 1790 + }, + { + "epoch": 0.003273627520295354, + "grad_norm": 0.0024727964773774147, + "learning_rate": 0.0002, + "loss": 0.0147, + "step": 1800 + }, + { + "epoch": 0.0032918143398525503, + "grad_norm": 0.04270516335964203, + "learning_rate": 0.0002, + "loss": 0.2872, + "step": 1810 + }, + { + "epoch": 0.0033100011594097466, + "grad_norm": 0.08055799454450607, + "learning_rate": 0.0002, + "loss": 0.0992, + "step": 1820 + }, + { + "epoch": 0.0033281879789669433, + "grad_norm": 0.02607434056699276, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 1830 + }, + { + "epoch": 0.0033463747985241397, + "grad_norm": 0.16260816156864166, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 1840 + }, + { + "epoch": 0.003364561618081336, + "grad_norm": 0.004690333269536495, + "learning_rate": 0.0002, + "loss": 0.012, + "step": 1850 + }, + { + "epoch": 0.0033827484376385323, + "grad_norm": 0.041513338685035706, + "learning_rate": 0.0002, + "loss": 0.2491, + "step": 1860 + }, + { + "epoch": 0.003400935257195729, + "grad_norm": 0.08935420960187912, + "learning_rate": 0.0002, + "loss": 0.1001, + "step": 1870 + }, + { + "epoch": 0.0034191220767529253, + "grad_norm": 0.03826737776398659, + "learning_rate": 0.0002, + "loss": 0.0877, + "step": 1880 + }, + { + "epoch": 0.0034373088963101216, + "grad_norm": 0.19423778355121613, + "learning_rate": 0.0002, + "loss": 0.0797, + "step": 1890 + }, + { + "epoch": 0.003455495715867318, + "grad_norm": 0.003520288970321417, + "learning_rate": 0.0002, + "loss": 0.013, + "step": 1900 + }, + { + "epoch": 0.0034736825354245143, + "grad_norm": 0.14648132026195526, + "learning_rate": 0.0002, + "loss": 0.3209, + "step": 1910 + }, + { + "epoch": 0.003491869354981711, + "grad_norm": 0.03780071437358856, + "learning_rate": 0.0002, + "loss": 0.0934, + "step": 1920 + }, + { + "epoch": 0.0035100561745389073, + "grad_norm": 0.05014612153172493, + "learning_rate": 0.0002, + "loss": 0.082, + "step": 1930 + }, + { + "epoch": 0.0035282429940961036, + "grad_norm": 0.12917590141296387, + "learning_rate": 0.0002, + "loss": 0.0733, + "step": 1940 + }, + { + "epoch": 0.0035464298136533, + "grad_norm": 0.0030132795218378305, + "learning_rate": 0.0002, + "loss": 0.0111, + "step": 1950 + }, + { + "epoch": 0.0035646166332104967, + "grad_norm": 0.03008626028895378, + "learning_rate": 0.0002, + "loss": 0.2126, + "step": 1960 + }, + { + "epoch": 0.003582803452767693, + "grad_norm": 0.0915503203868866, + "learning_rate": 0.0002, + "loss": 0.1097, + "step": 1970 + }, + { + "epoch": 0.0036009902723248893, + "grad_norm": 0.06607015430927277, + "learning_rate": 0.0002, + "loss": 0.0932, + "step": 1980 + }, + { + "epoch": 0.0036191770918820856, + "grad_norm": 0.18796613812446594, + "learning_rate": 0.0002, + "loss": 0.083, + "step": 1990 + }, + { + "epoch": 0.0036373639114392823, + "grad_norm": 0.0022257096134126186, + "learning_rate": 0.0002, + "loss": 0.0147, + "step": 2000 + }, + { + "epoch": 0.0036555507309964787, + "grad_norm": 0.0687415823340416, + "learning_rate": 0.0002, + "loss": 0.2604, + "step": 2010 + }, + { + "epoch": 0.003673737550553675, + "grad_norm": 0.025175679475069046, + "learning_rate": 0.0002, + "loss": 0.0998, + "step": 2020 + }, + { + "epoch": 0.0036919243701108713, + "grad_norm": 0.04275168478488922, + "learning_rate": 0.0002, + "loss": 0.0898, + "step": 2030 + }, + { + "epoch": 0.003710111189668068, + "grad_norm": 0.17306455969810486, + "learning_rate": 0.0002, + "loss": 0.0793, + "step": 2040 + }, + { + "epoch": 0.0037282980092252643, + "grad_norm": 0.007826454006135464, + "learning_rate": 0.0002, + "loss": 0.011, + "step": 2050 + }, + { + "epoch": 0.0037464848287824606, + "grad_norm": 0.06461178511381149, + "learning_rate": 0.0002, + "loss": 0.2597, + "step": 2060 + }, + { + "epoch": 0.003764671648339657, + "grad_norm": 0.061357177793979645, + "learning_rate": 0.0002, + "loss": 0.1001, + "step": 2070 + }, + { + "epoch": 0.0037828584678968533, + "grad_norm": 0.029154235497117043, + "learning_rate": 0.0002, + "loss": 0.0859, + "step": 2080 + }, + { + "epoch": 0.00380104528745405, + "grad_norm": 0.1350340098142624, + "learning_rate": 0.0002, + "loss": 0.0756, + "step": 2090 + }, + { + "epoch": 0.0038192321070112463, + "grad_norm": 0.0017614173702895641, + "learning_rate": 0.0002, + "loss": 0.0058, + "step": 2100 + }, + { + "epoch": 0.0038374189265684426, + "grad_norm": 0.024254316464066505, + "learning_rate": 0.0002, + "loss": 0.3349, + "step": 2110 + }, + { + "epoch": 0.003855605746125639, + "grad_norm": 0.07142530381679535, + "learning_rate": 0.0002, + "loss": 0.0953, + "step": 2120 + }, + { + "epoch": 0.0038737925656828357, + "grad_norm": 0.05570175498723984, + "learning_rate": 0.0002, + "loss": 0.0796, + "step": 2130 + }, + { + "epoch": 0.003891979385240032, + "grad_norm": 0.16996875405311584, + "learning_rate": 0.0002, + "loss": 0.0782, + "step": 2140 + }, + { + "epoch": 0.003910166204797228, + "grad_norm": 0.0058751595206558704, + "learning_rate": 0.0002, + "loss": 0.0206, + "step": 2150 + }, + { + "epoch": 0.003928353024354425, + "grad_norm": 0.029807811602950096, + "learning_rate": 0.0002, + "loss": 0.1926, + "step": 2160 + }, + { + "epoch": 0.003946539843911621, + "grad_norm": 0.11123469471931458, + "learning_rate": 0.0002, + "loss": 0.1082, + "step": 2170 + }, + { + "epoch": 0.003964726663468817, + "grad_norm": 0.074626125395298, + "learning_rate": 0.0002, + "loss": 0.081, + "step": 2180 + }, + { + "epoch": 0.003982913483026014, + "grad_norm": 0.17397737503051758, + "learning_rate": 0.0002, + "loss": 0.0729, + "step": 2190 + }, + { + "epoch": 0.004001100302583211, + "grad_norm": 0.007995887659490108, + "learning_rate": 0.0002, + "loss": 0.022, + "step": 2200 + }, + { + "epoch": 0.004019287122140407, + "grad_norm": 0.039921898394823074, + "learning_rate": 0.0002, + "loss": 0.1883, + "step": 2210 + }, + { + "epoch": 0.004037473941697603, + "grad_norm": 0.07736324518918991, + "learning_rate": 0.0002, + "loss": 0.0941, + "step": 2220 + }, + { + "epoch": 0.0040556607612548, + "grad_norm": 0.0867881178855896, + "learning_rate": 0.0002, + "loss": 0.0873, + "step": 2230 + }, + { + "epoch": 0.004073847580811996, + "grad_norm": 0.1497400403022766, + "learning_rate": 0.0002, + "loss": 0.0829, + "step": 2240 + }, + { + "epoch": 0.004092034400369192, + "grad_norm": 0.007458314299583435, + "learning_rate": 0.0002, + "loss": 0.02, + "step": 2250 + }, + { + "epoch": 0.004110221219926389, + "grad_norm": 0.04168029874563217, + "learning_rate": 0.0002, + "loss": 0.2176, + "step": 2260 + }, + { + "epoch": 0.004128408039483585, + "grad_norm": 0.10017130523920059, + "learning_rate": 0.0002, + "loss": 0.0958, + "step": 2270 + }, + { + "epoch": 0.004146594859040782, + "grad_norm": 0.02727416157722473, + "learning_rate": 0.0002, + "loss": 0.088, + "step": 2280 + }, + { + "epoch": 0.004164781678597978, + "grad_norm": 0.15034393966197968, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 2290 + }, + { + "epoch": 0.004182968498155175, + "grad_norm": 0.0023451410233974457, + "learning_rate": 0.0002, + "loss": 0.0102, + "step": 2300 + }, + { + "epoch": 0.004201155317712371, + "grad_norm": 0.03462455794215202, + "learning_rate": 0.0002, + "loss": 0.3404, + "step": 2310 + }, + { + "epoch": 0.004219342137269567, + "grad_norm": 0.02866148017346859, + "learning_rate": 0.0002, + "loss": 0.0932, + "step": 2320 + }, + { + "epoch": 0.004237528956826764, + "grad_norm": 0.0685456171631813, + "learning_rate": 0.0002, + "loss": 0.0806, + "step": 2330 + }, + { + "epoch": 0.00425571577638396, + "grad_norm": 0.17208056151866913, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 2340 + }, + { + "epoch": 0.004273902595941156, + "grad_norm": 0.008708455599844456, + "learning_rate": 0.0002, + "loss": 0.0171, + "step": 2350 + }, + { + "epoch": 0.0042920894154983525, + "grad_norm": 0.044025715440511703, + "learning_rate": 0.0002, + "loss": 0.212, + "step": 2360 + }, + { + "epoch": 0.00431027623505555, + "grad_norm": 0.050246164202690125, + "learning_rate": 0.0002, + "loss": 0.107, + "step": 2370 + }, + { + "epoch": 0.004328463054612746, + "grad_norm": 0.05257886275649071, + "learning_rate": 0.0002, + "loss": 0.0868, + "step": 2380 + }, + { + "epoch": 0.004346649874169942, + "grad_norm": 0.16567641496658325, + "learning_rate": 0.0002, + "loss": 0.0819, + "step": 2390 + }, + { + "epoch": 0.004364836693727139, + "grad_norm": 0.0062621901743113995, + "learning_rate": 0.0002, + "loss": 0.0171, + "step": 2400 + }, + { + "epoch": 0.004383023513284335, + "grad_norm": 0.03025338612496853, + "learning_rate": 0.0002, + "loss": 0.2141, + "step": 2410 + }, + { + "epoch": 0.004401210332841531, + "grad_norm": 0.06401577591896057, + "learning_rate": 0.0002, + "loss": 0.0982, + "step": 2420 + }, + { + "epoch": 0.004419397152398728, + "grad_norm": 0.12474781274795532, + "learning_rate": 0.0002, + "loss": 0.0834, + "step": 2430 + }, + { + "epoch": 0.004437583971955924, + "grad_norm": 0.18607665598392487, + "learning_rate": 0.0002, + "loss": 0.0801, + "step": 2440 + }, + { + "epoch": 0.004455770791513121, + "grad_norm": 0.0017643098253756762, + "learning_rate": 0.0002, + "loss": 0.0129, + "step": 2450 + }, + { + "epoch": 0.004473957611070317, + "grad_norm": 0.03936386480927467, + "learning_rate": 0.0002, + "loss": 0.2541, + "step": 2460 + }, + { + "epoch": 0.004492144430627514, + "grad_norm": 0.08961635082960129, + "learning_rate": 0.0002, + "loss": 0.0961, + "step": 2470 + }, + { + "epoch": 0.00451033125018471, + "grad_norm": 0.07525113970041275, + "learning_rate": 0.0002, + "loss": 0.0844, + "step": 2480 + }, + { + "epoch": 0.004528518069741906, + "grad_norm": 0.16746751964092255, + "learning_rate": 0.0002, + "loss": 0.071, + "step": 2490 + }, + { + "epoch": 0.004546704889299103, + "grad_norm": 0.0027625334914773703, + "learning_rate": 0.0002, + "loss": 0.0151, + "step": 2500 + }, + { + "epoch": 0.004564891708856299, + "grad_norm": 0.049662694334983826, + "learning_rate": 0.0002, + "loss": 0.253, + "step": 2510 + }, + { + "epoch": 0.004583078528413495, + "grad_norm": 0.08312079310417175, + "learning_rate": 0.0002, + "loss": 0.0922, + "step": 2520 + }, + { + "epoch": 0.0046012653479706915, + "grad_norm": 0.0646345317363739, + "learning_rate": 0.0002, + "loss": 0.0889, + "step": 2530 + }, + { + "epoch": 0.004619452167527889, + "grad_norm": 0.20036271214485168, + "learning_rate": 0.0002, + "loss": 0.081, + "step": 2540 + }, + { + "epoch": 0.004637638987085085, + "grad_norm": 0.010091719217598438, + "learning_rate": 0.0002, + "loss": 0.024, + "step": 2550 + }, + { + "epoch": 0.004655825806642281, + "grad_norm": 0.048885516822338104, + "learning_rate": 0.0002, + "loss": 0.184, + "step": 2560 + }, + { + "epoch": 0.004674012626199478, + "grad_norm": 0.09142889827489853, + "learning_rate": 0.0002, + "loss": 0.0935, + "step": 2570 + }, + { + "epoch": 0.004692199445756674, + "grad_norm": 0.049207963049411774, + "learning_rate": 0.0002, + "loss": 0.0816, + "step": 2580 + }, + { + "epoch": 0.00471038626531387, + "grad_norm": 0.1498396098613739, + "learning_rate": 0.0002, + "loss": 0.0698, + "step": 2590 + }, + { + "epoch": 0.004728573084871067, + "grad_norm": 0.00522881094366312, + "learning_rate": 0.0002, + "loss": 0.0189, + "step": 2600 + }, + { + "epoch": 0.004746759904428263, + "grad_norm": 0.07461311668157578, + "learning_rate": 0.0002, + "loss": 0.1944, + "step": 2610 + }, + { + "epoch": 0.00476494672398546, + "grad_norm": 0.048005711287260056, + "learning_rate": 0.0002, + "loss": 0.0883, + "step": 2620 + }, + { + "epoch": 0.004783133543542656, + "grad_norm": 0.10151612013578415, + "learning_rate": 0.0002, + "loss": 0.0827, + "step": 2630 + }, + { + "epoch": 0.004801320363099853, + "grad_norm": 0.1504422426223755, + "learning_rate": 0.0002, + "loss": 0.078, + "step": 2640 + }, + { + "epoch": 0.004819507182657049, + "grad_norm": 0.004988422151654959, + "learning_rate": 0.0002, + "loss": 0.0229, + "step": 2650 + }, + { + "epoch": 0.004837694002214245, + "grad_norm": 0.025008924305438995, + "learning_rate": 0.0002, + "loss": 0.1818, + "step": 2660 + }, + { + "epoch": 0.004855880821771442, + "grad_norm": 0.027460169047117233, + "learning_rate": 0.0002, + "loss": 0.0966, + "step": 2670 + }, + { + "epoch": 0.004874067641328638, + "grad_norm": 0.09704197943210602, + "learning_rate": 0.0002, + "loss": 0.0824, + "step": 2680 + }, + { + "epoch": 0.004892254460885834, + "grad_norm": 0.138654425740242, + "learning_rate": 0.0002, + "loss": 0.0746, + "step": 2690 + }, + { + "epoch": 0.0049104412804430305, + "grad_norm": 0.00859556533396244, + "learning_rate": 0.0002, + "loss": 0.0187, + "step": 2700 + }, + { + "epoch": 0.004928628100000228, + "grad_norm": 0.05207522585988045, + "learning_rate": 0.0002, + "loss": 0.1985, + "step": 2710 + }, + { + "epoch": 0.004946814919557424, + "grad_norm": 0.07787417620420456, + "learning_rate": 0.0002, + "loss": 0.101, + "step": 2720 + }, + { + "epoch": 0.00496500173911462, + "grad_norm": 0.02819981426000595, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 2730 + }, + { + "epoch": 0.004983188558671817, + "grad_norm": 0.13569314777851105, + "learning_rate": 0.0002, + "loss": 0.0756, + "step": 2740 + }, + { + "epoch": 0.005001375378229013, + "grad_norm": 0.05175986513495445, + "learning_rate": 0.0002, + "loss": 0.024, + "step": 2750 + }, + { + "epoch": 0.005019562197786209, + "grad_norm": 0.037230249494314194, + "learning_rate": 0.0002, + "loss": 0.2056, + "step": 2760 + }, + { + "epoch": 0.005037749017343406, + "grad_norm": 0.05532974749803543, + "learning_rate": 0.0002, + "loss": 0.0939, + "step": 2770 + }, + { + "epoch": 0.005055935836900602, + "grad_norm": 0.06930708140134811, + "learning_rate": 0.0002, + "loss": 0.0853, + "step": 2780 + }, + { + "epoch": 0.005074122656457798, + "grad_norm": 0.16405801475048065, + "learning_rate": 0.0002, + "loss": 0.0766, + "step": 2790 + }, + { + "epoch": 0.005092309476014995, + "grad_norm": 0.006398684345185757, + "learning_rate": 0.0002, + "loss": 0.0124, + "step": 2800 + }, + { + "epoch": 0.005110496295572192, + "grad_norm": 0.06269315630197525, + "learning_rate": 0.0002, + "loss": 0.2703, + "step": 2810 + }, + { + "epoch": 0.005128683115129388, + "grad_norm": 0.049293261021375656, + "learning_rate": 0.0002, + "loss": 0.0943, + "step": 2820 + }, + { + "epoch": 0.005146869934686584, + "grad_norm": 0.08814405649900436, + "learning_rate": 0.0002, + "loss": 0.0855, + "step": 2830 + }, + { + "epoch": 0.005165056754243781, + "grad_norm": 0.17452259361743927, + "learning_rate": 0.0002, + "loss": 0.0822, + "step": 2840 + }, + { + "epoch": 0.005183243573800977, + "grad_norm": 0.005008229520171881, + "learning_rate": 0.0002, + "loss": 0.0136, + "step": 2850 + }, + { + "epoch": 0.005201430393358173, + "grad_norm": 0.04459540545940399, + "learning_rate": 0.0002, + "loss": 0.2623, + "step": 2860 + }, + { + "epoch": 0.0052196172129153695, + "grad_norm": 0.042845603078603745, + "learning_rate": 0.0002, + "loss": 0.0929, + "step": 2870 + }, + { + "epoch": 0.005237804032472567, + "grad_norm": 0.03079635463654995, + "learning_rate": 0.0002, + "loss": 0.0844, + "step": 2880 + }, + { + "epoch": 0.005255990852029763, + "grad_norm": 0.14457851648330688, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 2890 + }, + { + "epoch": 0.005274177671586959, + "grad_norm": 0.0009016963304020464, + "learning_rate": 0.0002, + "loss": 0.0037, + "step": 2900 + }, + { + "epoch": 0.005292364491144156, + "grad_norm": 0.0983906164765358, + "learning_rate": 0.0002, + "loss": 0.3661, + "step": 2910 + }, + { + "epoch": 0.005310551310701352, + "grad_norm": 0.08794154971837997, + "learning_rate": 0.0002, + "loss": 0.0894, + "step": 2920 + }, + { + "epoch": 0.005328738130258548, + "grad_norm": 0.026981573551893234, + "learning_rate": 0.0002, + "loss": 0.0779, + "step": 2930 + }, + { + "epoch": 0.005346924949815745, + "grad_norm": 0.15572553873062134, + "learning_rate": 0.0002, + "loss": 0.077, + "step": 2940 + }, + { + "epoch": 0.005365111769372941, + "grad_norm": 0.005491070915013552, + "learning_rate": 0.0002, + "loss": 0.0092, + "step": 2950 + }, + { + "epoch": 0.005383298588930137, + "grad_norm": 0.07383686304092407, + "learning_rate": 0.0002, + "loss": 0.2574, + "step": 2960 + }, + { + "epoch": 0.005401485408487334, + "grad_norm": 0.05919960141181946, + "learning_rate": 0.0002, + "loss": 0.1045, + "step": 2970 + }, + { + "epoch": 0.005419672228044531, + "grad_norm": 0.06027739867568016, + "learning_rate": 0.0002, + "loss": 0.0822, + "step": 2980 + }, + { + "epoch": 0.005437859047601727, + "grad_norm": 0.1288602501153946, + "learning_rate": 0.0002, + "loss": 0.0688, + "step": 2990 + }, + { + "epoch": 0.005456045867158923, + "grad_norm": 0.007565880194306374, + "learning_rate": 0.0002, + "loss": 0.0192, + "step": 3000 + }, + { + "epoch": 0.00547423268671612, + "grad_norm": 0.024412864819169044, + "learning_rate": 0.0002, + "loss": 0.1782, + "step": 3010 + }, + { + "epoch": 0.005492419506273316, + "grad_norm": 0.05559355765581131, + "learning_rate": 0.0002, + "loss": 0.1072, + "step": 3020 + }, + { + "epoch": 0.005510606325830512, + "grad_norm": 0.07073906064033508, + "learning_rate": 0.0002, + "loss": 0.0863, + "step": 3030 + }, + { + "epoch": 0.0055287931453877085, + "grad_norm": 0.14979414641857147, + "learning_rate": 0.0002, + "loss": 0.0788, + "step": 3040 + }, + { + "epoch": 0.005546979964944906, + "grad_norm": 0.0057297456078231335, + "learning_rate": 0.0002, + "loss": 0.0192, + "step": 3050 + }, + { + "epoch": 0.005565166784502102, + "grad_norm": 0.03195042535662651, + "learning_rate": 0.0002, + "loss": 0.1879, + "step": 3060 + }, + { + "epoch": 0.005583353604059298, + "grad_norm": 0.05925082787871361, + "learning_rate": 0.0002, + "loss": 0.0992, + "step": 3070 + }, + { + "epoch": 0.005601540423616495, + "grad_norm": 0.052063606679439545, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 3080 + }, + { + "epoch": 0.005619727243173691, + "grad_norm": 0.16005952656269073, + "learning_rate": 0.0002, + "loss": 0.0743, + "step": 3090 + }, + { + "epoch": 0.005637914062730887, + "grad_norm": 0.005742133595049381, + "learning_rate": 0.0002, + "loss": 0.0137, + "step": 3100 + }, + { + "epoch": 0.005656100882288084, + "grad_norm": 0.07523638010025024, + "learning_rate": 0.0002, + "loss": 0.2072, + "step": 3110 + }, + { + "epoch": 0.00567428770184528, + "grad_norm": 0.23799611628055573, + "learning_rate": 0.0002, + "loss": 0.0906, + "step": 3120 + }, + { + "epoch": 0.005692474521402476, + "grad_norm": 0.06176261603832245, + "learning_rate": 0.0002, + "loss": 0.088, + "step": 3130 + }, + { + "epoch": 0.005710661340959673, + "grad_norm": 0.13692723214626312, + "learning_rate": 0.0002, + "loss": 0.0733, + "step": 3140 + }, + { + "epoch": 0.00572884816051687, + "grad_norm": 0.007059803698211908, + "learning_rate": 0.0002, + "loss": 0.0194, + "step": 3150 + }, + { + "epoch": 0.005747034980074066, + "grad_norm": 0.08868405222892761, + "learning_rate": 0.0002, + "loss": 0.1745, + "step": 3160 + }, + { + "epoch": 0.005765221799631262, + "grad_norm": 0.05126733332872391, + "learning_rate": 0.0002, + "loss": 0.1024, + "step": 3170 + }, + { + "epoch": 0.005783408619188459, + "grad_norm": 0.06377821415662766, + "learning_rate": 0.0002, + "loss": 0.0846, + "step": 3180 + }, + { + "epoch": 0.005801595438745655, + "grad_norm": 0.10748566687107086, + "learning_rate": 0.0002, + "loss": 0.0769, + "step": 3190 + }, + { + "epoch": 0.005819782258302851, + "grad_norm": 0.004992443602532148, + "learning_rate": 0.0002, + "loss": 0.0114, + "step": 3200 + }, + { + "epoch": 0.0058379690778600475, + "grad_norm": 0.0420277863740921, + "learning_rate": 0.0002, + "loss": 0.2159, + "step": 3210 + }, + { + "epoch": 0.005856155897417244, + "grad_norm": 0.02828531712293625, + "learning_rate": 0.0002, + "loss": 0.0923, + "step": 3220 + }, + { + "epoch": 0.005874342716974441, + "grad_norm": 0.028216248378157616, + "learning_rate": 0.0002, + "loss": 0.0789, + "step": 3230 + }, + { + "epoch": 0.005892529536531637, + "grad_norm": 0.11420746147632599, + "learning_rate": 0.0002, + "loss": 0.0696, + "step": 3240 + }, + { + "epoch": 0.005910716356088834, + "grad_norm": 0.0019631448667496443, + "learning_rate": 0.0002, + "loss": 0.0128, + "step": 3250 + }, + { + "epoch": 0.00592890317564603, + "grad_norm": 0.05514012649655342, + "learning_rate": 0.0002, + "loss": 0.2609, + "step": 3260 + }, + { + "epoch": 0.005947089995203226, + "grad_norm": 0.0917636826634407, + "learning_rate": 0.0002, + "loss": 0.0996, + "step": 3270 + }, + { + "epoch": 0.005965276814760423, + "grad_norm": 0.03648284077644348, + "learning_rate": 0.0002, + "loss": 0.084, + "step": 3280 + }, + { + "epoch": 0.005983463634317619, + "grad_norm": 0.13859149813652039, + "learning_rate": 0.0002, + "loss": 0.0807, + "step": 3290 + }, + { + "epoch": 0.006001650453874815, + "grad_norm": 0.013779910281300545, + "learning_rate": 0.0002, + "loss": 0.0181, + "step": 3300 + }, + { + "epoch": 0.006019837273432012, + "grad_norm": 0.02654041163623333, + "learning_rate": 0.0002, + "loss": 0.1636, + "step": 3310 + }, + { + "epoch": 0.006038024092989209, + "grad_norm": 0.062298137694597244, + "learning_rate": 0.0002, + "loss": 0.0872, + "step": 3320 + }, + { + "epoch": 0.006056210912546405, + "grad_norm": 0.0351388119161129, + "learning_rate": 0.0002, + "loss": 0.0802, + "step": 3330 + }, + { + "epoch": 0.006074397732103601, + "grad_norm": 0.16063807904720306, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 3340 + }, + { + "epoch": 0.006092584551660798, + "grad_norm": 0.009991235099732876, + "learning_rate": 0.0002, + "loss": 0.016, + "step": 3350 + }, + { + "epoch": 0.006110771371217994, + "grad_norm": 0.052919622510671616, + "learning_rate": 0.0002, + "loss": 0.2027, + "step": 3360 + }, + { + "epoch": 0.00612895819077519, + "grad_norm": 0.03228602185845375, + "learning_rate": 0.0002, + "loss": 0.0985, + "step": 3370 + }, + { + "epoch": 0.0061471450103323865, + "grad_norm": 0.11311203986406326, + "learning_rate": 0.0002, + "loss": 0.0797, + "step": 3380 + }, + { + "epoch": 0.006165331829889583, + "grad_norm": 0.1674620360136032, + "learning_rate": 0.0002, + "loss": 0.072, + "step": 3390 + }, + { + "epoch": 0.00618351864944678, + "grad_norm": 0.015154430642724037, + "learning_rate": 0.0002, + "loss": 0.0186, + "step": 3400 + }, + { + "epoch": 0.006201705469003976, + "grad_norm": 0.043151434510946274, + "learning_rate": 0.0002, + "loss": 0.1892, + "step": 3410 + }, + { + "epoch": 0.006219892288561173, + "grad_norm": 0.12342707067728043, + "learning_rate": 0.0002, + "loss": 0.0907, + "step": 3420 + }, + { + "epoch": 0.006238079108118369, + "grad_norm": 0.08350827544927597, + "learning_rate": 0.0002, + "loss": 0.0783, + "step": 3430 + }, + { + "epoch": 0.006256265927675565, + "grad_norm": 0.11938697844743729, + "learning_rate": 0.0002, + "loss": 0.0666, + "step": 3440 + }, + { + "epoch": 0.006274452747232762, + "grad_norm": 0.015424132347106934, + "learning_rate": 0.0002, + "loss": 0.0173, + "step": 3450 + }, + { + "epoch": 0.006292639566789958, + "grad_norm": 0.04220043867826462, + "learning_rate": 0.0002, + "loss": 0.1805, + "step": 3460 + }, + { + "epoch": 0.006310826386347154, + "grad_norm": 0.08813903480768204, + "learning_rate": 0.0002, + "loss": 0.096, + "step": 3470 + }, + { + "epoch": 0.006329013205904351, + "grad_norm": 0.07647278904914856, + "learning_rate": 0.0002, + "loss": 0.0821, + "step": 3480 + }, + { + "epoch": 0.006347200025461548, + "grad_norm": 0.14242641627788544, + "learning_rate": 0.0002, + "loss": 0.0769, + "step": 3490 + }, + { + "epoch": 0.006365386845018744, + "grad_norm": 0.011115231551229954, + "learning_rate": 0.0002, + "loss": 0.0221, + "step": 3500 + }, + { + "epoch": 0.00638357366457594, + "grad_norm": 0.036351826041936874, + "learning_rate": 0.0002, + "loss": 0.1557, + "step": 3510 + }, + { + "epoch": 0.006401760484133137, + "grad_norm": 0.08549819141626358, + "learning_rate": 0.0002, + "loss": 0.0864, + "step": 3520 + }, + { + "epoch": 0.006419947303690333, + "grad_norm": 0.047141823917627335, + "learning_rate": 0.0002, + "loss": 0.079, + "step": 3530 + }, + { + "epoch": 0.006438134123247529, + "grad_norm": 0.13143447041511536, + "learning_rate": 0.0002, + "loss": 0.0717, + "step": 3540 + }, + { + "epoch": 0.0064563209428047256, + "grad_norm": 0.013524871319532394, + "learning_rate": 0.0002, + "loss": 0.0149, + "step": 3550 + }, + { + "epoch": 0.006474507762361922, + "grad_norm": 0.03367459774017334, + "learning_rate": 0.0002, + "loss": 0.1715, + "step": 3560 + }, + { + "epoch": 0.006492694581919119, + "grad_norm": 0.045889757573604584, + "learning_rate": 0.0002, + "loss": 0.0949, + "step": 3570 + }, + { + "epoch": 0.006510881401476315, + "grad_norm": 0.04099202901124954, + "learning_rate": 0.0002, + "loss": 0.0813, + "step": 3580 + }, + { + "epoch": 0.006529068221033512, + "grad_norm": 0.133371040225029, + "learning_rate": 0.0002, + "loss": 0.0767, + "step": 3590 + }, + { + "epoch": 0.006547255040590708, + "grad_norm": 0.00645647756755352, + "learning_rate": 0.0002, + "loss": 0.0186, + "step": 3600 + }, + { + "epoch": 0.006565441860147904, + "grad_norm": 0.050674330443143845, + "learning_rate": 0.0002, + "loss": 0.2179, + "step": 3610 + }, + { + "epoch": 0.006583628679705101, + "grad_norm": 0.07087302207946777, + "learning_rate": 0.0002, + "loss": 0.0882, + "step": 3620 + }, + { + "epoch": 0.006601815499262297, + "grad_norm": 0.02759486250579357, + "learning_rate": 0.0002, + "loss": 0.0789, + "step": 3630 + }, + { + "epoch": 0.006620002318819493, + "grad_norm": 0.12163479626178741, + "learning_rate": 0.0002, + "loss": 0.0689, + "step": 3640 + }, + { + "epoch": 0.00663818913837669, + "grad_norm": 0.00969718024134636, + "learning_rate": 0.0002, + "loss": 0.0112, + "step": 3650 + }, + { + "epoch": 0.006656375957933887, + "grad_norm": 0.07106204330921173, + "learning_rate": 0.0002, + "loss": 0.199, + "step": 3660 + }, + { + "epoch": 0.006674562777491083, + "grad_norm": 0.08954132348299026, + "learning_rate": 0.0002, + "loss": 0.0985, + "step": 3670 + }, + { + "epoch": 0.006692749597048279, + "grad_norm": 0.09899396449327469, + "learning_rate": 0.0002, + "loss": 0.0811, + "step": 3680 + }, + { + "epoch": 0.006710936416605476, + "grad_norm": 0.12119311839342117, + "learning_rate": 0.0002, + "loss": 0.0698, + "step": 3690 + }, + { + "epoch": 0.006729123236162672, + "grad_norm": 0.013957214541733265, + "learning_rate": 0.0002, + "loss": 0.018, + "step": 3700 + }, + { + "epoch": 0.006747310055719868, + "grad_norm": 0.03089285083115101, + "learning_rate": 0.0002, + "loss": 0.1434, + "step": 3710 + }, + { + "epoch": 0.0067654968752770646, + "grad_norm": 0.025650829076766968, + "learning_rate": 0.0002, + "loss": 0.0886, + "step": 3720 + }, + { + "epoch": 0.006783683694834261, + "grad_norm": 0.044103365391492844, + "learning_rate": 0.0002, + "loss": 0.0788, + "step": 3730 + }, + { + "epoch": 0.006801870514391458, + "grad_norm": 0.09726370871067047, + "learning_rate": 0.0002, + "loss": 0.0674, + "step": 3740 + }, + { + "epoch": 0.006820057333948654, + "grad_norm": 0.018105274066329002, + "learning_rate": 0.0002, + "loss": 0.0232, + "step": 3750 + }, + { + "epoch": 0.006838244153505851, + "grad_norm": 0.021543240174651146, + "learning_rate": 0.0002, + "loss": 0.1406, + "step": 3760 + }, + { + "epoch": 0.006856430973063047, + "grad_norm": 0.09367050975561142, + "learning_rate": 0.0002, + "loss": 0.0973, + "step": 3770 + }, + { + "epoch": 0.006874617792620243, + "grad_norm": 0.06836032122373581, + "learning_rate": 0.0002, + "loss": 0.0848, + "step": 3780 + }, + { + "epoch": 0.00689280461217744, + "grad_norm": 0.11758081614971161, + "learning_rate": 0.0002, + "loss": 0.0693, + "step": 3790 + }, + { + "epoch": 0.006910991431734636, + "grad_norm": 0.008669364266097546, + "learning_rate": 0.0002, + "loss": 0.0223, + "step": 3800 + }, + { + "epoch": 0.006929178251291832, + "grad_norm": 0.03903719782829285, + "learning_rate": 0.0002, + "loss": 0.1519, + "step": 3810 + }, + { + "epoch": 0.0069473650708490285, + "grad_norm": 0.030682874843478203, + "learning_rate": 0.0002, + "loss": 0.0931, + "step": 3820 + }, + { + "epoch": 0.006965551890406226, + "grad_norm": 0.02693006955087185, + "learning_rate": 0.0002, + "loss": 0.0784, + "step": 3830 + }, + { + "epoch": 0.006983738709963422, + "grad_norm": 0.09535166621208191, + "learning_rate": 0.0002, + "loss": 0.0696, + "step": 3840 + }, + { + "epoch": 0.007001925529520618, + "grad_norm": 0.014680403284728527, + "learning_rate": 0.0002, + "loss": 0.0176, + "step": 3850 + }, + { + "epoch": 0.007020112349077815, + "grad_norm": 0.031090212985873222, + "learning_rate": 0.0002, + "loss": 0.1544, + "step": 3860 + }, + { + "epoch": 0.007038299168635011, + "grad_norm": 0.05870644003152847, + "learning_rate": 0.0002, + "loss": 0.0898, + "step": 3870 + }, + { + "epoch": 0.007056485988192207, + "grad_norm": 0.03480982780456543, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 3880 + }, + { + "epoch": 0.0070746728077494036, + "grad_norm": 0.09751418977975845, + "learning_rate": 0.0002, + "loss": 0.0724, + "step": 3890 + }, + { + "epoch": 0.0070928596273066, + "grad_norm": 0.022084850817918777, + "learning_rate": 0.0002, + "loss": 0.019, + "step": 3900 + }, + { + "epoch": 0.007111046446863797, + "grad_norm": 0.06994971632957458, + "learning_rate": 0.0002, + "loss": 0.1478, + "step": 3910 + }, + { + "epoch": 0.007129233266420993, + "grad_norm": 0.05761263892054558, + "learning_rate": 0.0002, + "loss": 0.0932, + "step": 3920 + }, + { + "epoch": 0.00714742008597819, + "grad_norm": 0.029772033914923668, + "learning_rate": 0.0002, + "loss": 0.0855, + "step": 3930 + }, + { + "epoch": 0.007165606905535386, + "grad_norm": 0.11868726462125778, + "learning_rate": 0.0002, + "loss": 0.0727, + "step": 3940 + }, + { + "epoch": 0.007183793725092582, + "grad_norm": 0.0065403408370912075, + "learning_rate": 0.0002, + "loss": 0.0174, + "step": 3950 + }, + { + "epoch": 0.007201980544649779, + "grad_norm": 0.031544361263513565, + "learning_rate": 0.0002, + "loss": 0.1827, + "step": 3960 + }, + { + "epoch": 0.007220167364206975, + "grad_norm": 0.031641531735658646, + "learning_rate": 0.0002, + "loss": 0.0867, + "step": 3970 + }, + { + "epoch": 0.007238354183764171, + "grad_norm": 0.028574040159583092, + "learning_rate": 0.0002, + "loss": 0.0798, + "step": 3980 + }, + { + "epoch": 0.0072565410033213675, + "grad_norm": 0.12866555154323578, + "learning_rate": 0.0002, + "loss": 0.0708, + "step": 3990 + }, + { + "epoch": 0.007274727822878565, + "grad_norm": 0.00843430683016777, + "learning_rate": 0.0002, + "loss": 0.0127, + "step": 4000 + }, + { + "epoch": 0.007292914642435761, + "grad_norm": 0.03737691789865494, + "learning_rate": 0.0002, + "loss": 0.2201, + "step": 4010 + }, + { + "epoch": 0.007311101461992957, + "grad_norm": 0.05326579511165619, + "learning_rate": 0.0002, + "loss": 0.0838, + "step": 4020 + }, + { + "epoch": 0.007329288281550154, + "grad_norm": 0.031934209167957306, + "learning_rate": 0.0002, + "loss": 0.0778, + "step": 4030 + }, + { + "epoch": 0.00734747510110735, + "grad_norm": 0.17401957511901855, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 4040 + }, + { + "epoch": 0.007365661920664546, + "grad_norm": 0.005256639327853918, + "learning_rate": 0.0002, + "loss": 0.0122, + "step": 4050 + }, + { + "epoch": 0.0073838487402217426, + "grad_norm": 0.05043623968958855, + "learning_rate": 0.0002, + "loss": 0.2524, + "step": 4060 + }, + { + "epoch": 0.007402035559778939, + "grad_norm": 0.06662425398826599, + "learning_rate": 0.0002, + "loss": 0.0976, + "step": 4070 + }, + { + "epoch": 0.007420222379336136, + "grad_norm": 0.13419686257839203, + "learning_rate": 0.0002, + "loss": 0.0833, + "step": 4080 + }, + { + "epoch": 0.007438409198893332, + "grad_norm": 0.176285520195961, + "learning_rate": 0.0002, + "loss": 0.0761, + "step": 4090 + }, + { + "epoch": 0.007456596018450529, + "grad_norm": 0.008489354513585567, + "learning_rate": 0.0002, + "loss": 0.0182, + "step": 4100 + }, + { + "epoch": 0.007474782838007725, + "grad_norm": 0.06247509643435478, + "learning_rate": 0.0002, + "loss": 0.2232, + "step": 4110 + }, + { + "epoch": 0.007492969657564921, + "grad_norm": 0.05744702368974686, + "learning_rate": 0.0002, + "loss": 0.0875, + "step": 4120 + }, + { + "epoch": 0.007511156477122118, + "grad_norm": 0.053026407957077026, + "learning_rate": 0.0002, + "loss": 0.0807, + "step": 4130 + }, + { + "epoch": 0.007529343296679314, + "grad_norm": 0.11734003573656082, + "learning_rate": 0.0002, + "loss": 0.0724, + "step": 4140 + }, + { + "epoch": 0.00754753011623651, + "grad_norm": 0.005216363817453384, + "learning_rate": 0.0002, + "loss": 0.0129, + "step": 4150 + }, + { + "epoch": 0.0075657169357937065, + "grad_norm": 0.08154789358377457, + "learning_rate": 0.0002, + "loss": 0.2221, + "step": 4160 + }, + { + "epoch": 0.007583903755350904, + "grad_norm": 0.03619784861803055, + "learning_rate": 0.0002, + "loss": 0.0993, + "step": 4170 + }, + { + "epoch": 0.0076020905749081, + "grad_norm": 0.08239256590604782, + "learning_rate": 0.0002, + "loss": 0.0811, + "step": 4180 + }, + { + "epoch": 0.007620277394465296, + "grad_norm": 0.11934535950422287, + "learning_rate": 0.0002, + "loss": 0.0726, + "step": 4190 + }, + { + "epoch": 0.007638464214022493, + "grad_norm": 0.006965799257159233, + "learning_rate": 0.0002, + "loss": 0.0181, + "step": 4200 + }, + { + "epoch": 0.007656651033579689, + "grad_norm": 0.04328077286481857, + "learning_rate": 0.0002, + "loss": 0.1983, + "step": 4210 + }, + { + "epoch": 0.007674837853136885, + "grad_norm": 0.08253510296344757, + "learning_rate": 0.0002, + "loss": 0.0954, + "step": 4220 + }, + { + "epoch": 0.0076930246726940816, + "grad_norm": 0.06146657094359398, + "learning_rate": 0.0002, + "loss": 0.0843, + "step": 4230 + }, + { + "epoch": 0.007711211492251278, + "grad_norm": 0.13579218089580536, + "learning_rate": 0.0002, + "loss": 0.0672, + "step": 4240 + }, + { + "epoch": 0.007729398311808474, + "grad_norm": 0.0038396338932216167, + "learning_rate": 0.0002, + "loss": 0.0131, + "step": 4250 + }, + { + "epoch": 0.007747585131365671, + "grad_norm": 0.03109130822122097, + "learning_rate": 0.0002, + "loss": 0.2102, + "step": 4260 + }, + { + "epoch": 0.007765771950922868, + "grad_norm": 0.04971664398908615, + "learning_rate": 0.0002, + "loss": 0.0903, + "step": 4270 + }, + { + "epoch": 0.007783958770480064, + "grad_norm": 0.06476306915283203, + "learning_rate": 0.0002, + "loss": 0.0859, + "step": 4280 + }, + { + "epoch": 0.00780214559003726, + "grad_norm": 0.15377041697502136, + "learning_rate": 0.0002, + "loss": 0.0828, + "step": 4290 + }, + { + "epoch": 0.007820332409594457, + "grad_norm": 0.005592274013906717, + "learning_rate": 0.0002, + "loss": 0.014, + "step": 4300 + }, + { + "epoch": 0.007838519229151653, + "grad_norm": 0.04387212172150612, + "learning_rate": 0.0002, + "loss": 0.1907, + "step": 4310 + }, + { + "epoch": 0.00785670604870885, + "grad_norm": 0.06001356989145279, + "learning_rate": 0.0002, + "loss": 0.0864, + "step": 4320 + }, + { + "epoch": 0.007874892868266046, + "grad_norm": 0.030866140499711037, + "learning_rate": 0.0002, + "loss": 0.0748, + "step": 4330 + }, + { + "epoch": 0.007893079687823242, + "grad_norm": 0.13280808925628662, + "learning_rate": 0.0002, + "loss": 0.0686, + "step": 4340 + }, + { + "epoch": 0.007911266507380438, + "grad_norm": 0.015559020452201366, + "learning_rate": 0.0002, + "loss": 0.016, + "step": 4350 + }, + { + "epoch": 0.007929453326937634, + "grad_norm": 0.0669974684715271, + "learning_rate": 0.0002, + "loss": 0.1916, + "step": 4360 + }, + { + "epoch": 0.00794764014649483, + "grad_norm": 0.0759076252579689, + "learning_rate": 0.0002, + "loss": 0.0925, + "step": 4370 + }, + { + "epoch": 0.007965826966052029, + "grad_norm": 0.029388410970568657, + "learning_rate": 0.0002, + "loss": 0.086, + "step": 4380 + }, + { + "epoch": 0.007984013785609225, + "grad_norm": 0.17637981474399567, + "learning_rate": 0.0002, + "loss": 0.0697, + "step": 4390 + }, + { + "epoch": 0.008002200605166421, + "grad_norm": 0.008022189140319824, + "learning_rate": 0.0002, + "loss": 0.0196, + "step": 4400 + }, + { + "epoch": 0.008020387424723618, + "grad_norm": 0.04126167669892311, + "learning_rate": 0.0002, + "loss": 0.192, + "step": 4410 + }, + { + "epoch": 0.008038574244280814, + "grad_norm": 0.08132971078157425, + "learning_rate": 0.0002, + "loss": 0.093, + "step": 4420 + }, + { + "epoch": 0.00805676106383801, + "grad_norm": 0.07568484544754028, + "learning_rate": 0.0002, + "loss": 0.0823, + "step": 4430 + }, + { + "epoch": 0.008074947883395207, + "grad_norm": 0.1259222775697708, + "learning_rate": 0.0002, + "loss": 0.0696, + "step": 4440 + }, + { + "epoch": 0.008093134702952403, + "grad_norm": 0.009711826220154762, + "learning_rate": 0.0002, + "loss": 0.0232, + "step": 4450 + }, + { + "epoch": 0.0081113215225096, + "grad_norm": 0.029734279960393906, + "learning_rate": 0.0002, + "loss": 0.1595, + "step": 4460 + }, + { + "epoch": 0.008129508342066796, + "grad_norm": 0.04886960610747337, + "learning_rate": 0.0002, + "loss": 0.0919, + "step": 4470 + }, + { + "epoch": 0.008147695161623992, + "grad_norm": 0.07031470537185669, + "learning_rate": 0.0002, + "loss": 0.0813, + "step": 4480 + }, + { + "epoch": 0.008165881981181188, + "grad_norm": 0.12099859863519669, + "learning_rate": 0.0002, + "loss": 0.0731, + "step": 4490 + }, + { + "epoch": 0.008184068800738385, + "grad_norm": 0.02181529812514782, + "learning_rate": 0.0002, + "loss": 0.021, + "step": 4500 + }, + { + "epoch": 0.00820225562029558, + "grad_norm": 0.035477787256240845, + "learning_rate": 0.0002, + "loss": 0.1429, + "step": 4510 + }, + { + "epoch": 0.008220442439852777, + "grad_norm": 0.07788772135972977, + "learning_rate": 0.0002, + "loss": 0.0842, + "step": 4520 + }, + { + "epoch": 0.008238629259409973, + "grad_norm": 0.045833125710487366, + "learning_rate": 0.0002, + "loss": 0.0829, + "step": 4530 + }, + { + "epoch": 0.00825681607896717, + "grad_norm": 0.12271951884031296, + "learning_rate": 0.0002, + "loss": 0.0707, + "step": 4540 + }, + { + "epoch": 0.008275002898524366, + "grad_norm": 0.01919553242623806, + "learning_rate": 0.0002, + "loss": 0.0213, + "step": 4550 + }, + { + "epoch": 0.008293189718081564, + "grad_norm": 0.032527096569538116, + "learning_rate": 0.0002, + "loss": 0.1397, + "step": 4560 + }, + { + "epoch": 0.00831137653763876, + "grad_norm": 0.045243579894304276, + "learning_rate": 0.0002, + "loss": 0.0854, + "step": 4570 + }, + { + "epoch": 0.008329563357195957, + "grad_norm": 0.04226524010300636, + "learning_rate": 0.0002, + "loss": 0.0728, + "step": 4580 + }, + { + "epoch": 0.008347750176753153, + "grad_norm": 0.09887039661407471, + "learning_rate": 0.0002, + "loss": 0.0661, + "step": 4590 + }, + { + "epoch": 0.00836593699631035, + "grad_norm": 0.01822318509221077, + "learning_rate": 0.0002, + "loss": 0.0169, + "step": 4600 + }, + { + "epoch": 0.008384123815867546, + "grad_norm": 0.05729951336979866, + "learning_rate": 0.0002, + "loss": 0.137, + "step": 4610 + }, + { + "epoch": 0.008402310635424742, + "grad_norm": 0.041520439088344574, + "learning_rate": 0.0002, + "loss": 0.0825, + "step": 4620 + }, + { + "epoch": 0.008420497454981938, + "grad_norm": 0.051164623349905014, + "learning_rate": 0.0002, + "loss": 0.0818, + "step": 4630 + }, + { + "epoch": 0.008438684274539135, + "grad_norm": 0.1289409101009369, + "learning_rate": 0.0002, + "loss": 0.0664, + "step": 4640 + }, + { + "epoch": 0.008456871094096331, + "grad_norm": 0.0085114361718297, + "learning_rate": 0.0002, + "loss": 0.0229, + "step": 4650 + }, + { + "epoch": 0.008475057913653527, + "grad_norm": 0.03594676032662392, + "learning_rate": 0.0002, + "loss": 0.1401, + "step": 4660 + }, + { + "epoch": 0.008493244733210724, + "grad_norm": 0.0316978394985199, + "learning_rate": 0.0002, + "loss": 0.0877, + "step": 4670 + }, + { + "epoch": 0.00851143155276792, + "grad_norm": 0.023302162066102028, + "learning_rate": 0.0002, + "loss": 0.0764, + "step": 4680 + }, + { + "epoch": 0.008529618372325116, + "grad_norm": 0.1329929083585739, + "learning_rate": 0.0002, + "loss": 0.0788, + "step": 4690 + }, + { + "epoch": 0.008547805191882312, + "grad_norm": 0.01048013661056757, + "learning_rate": 0.0002, + "loss": 0.0234, + "step": 4700 + }, + { + "epoch": 0.008565992011439509, + "grad_norm": 0.03505022078752518, + "learning_rate": 0.0002, + "loss": 0.1509, + "step": 4710 + }, + { + "epoch": 0.008584178830996705, + "grad_norm": 0.03877585008740425, + "learning_rate": 0.0002, + "loss": 0.0802, + "step": 4720 + }, + { + "epoch": 0.008602365650553903, + "grad_norm": 0.041193027049303055, + "learning_rate": 0.0002, + "loss": 0.0695, + "step": 4730 + }, + { + "epoch": 0.0086205524701111, + "grad_norm": 0.17310455441474915, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 4740 + }, + { + "epoch": 0.008638739289668296, + "grad_norm": 0.0061012376099824905, + "learning_rate": 0.0002, + "loss": 0.0158, + "step": 4750 + }, + { + "epoch": 0.008656926109225492, + "grad_norm": 0.04843207076191902, + "learning_rate": 0.0002, + "loss": 0.2103, + "step": 4760 + }, + { + "epoch": 0.008675112928782688, + "grad_norm": 0.04483436048030853, + "learning_rate": 0.0002, + "loss": 0.0878, + "step": 4770 + }, + { + "epoch": 0.008693299748339885, + "grad_norm": 0.056655965745449066, + "learning_rate": 0.0002, + "loss": 0.0752, + "step": 4780 + }, + { + "epoch": 0.008711486567897081, + "grad_norm": 0.11626063287258148, + "learning_rate": 0.0002, + "loss": 0.0685, + "step": 4790 + }, + { + "epoch": 0.008729673387454277, + "grad_norm": 0.013872025534510612, + "learning_rate": 0.0002, + "loss": 0.0198, + "step": 4800 + }, + { + "epoch": 0.008747860207011474, + "grad_norm": 0.06217370182275772, + "learning_rate": 0.0002, + "loss": 0.1371, + "step": 4810 + }, + { + "epoch": 0.00876604702656867, + "grad_norm": 0.027149083092808723, + "learning_rate": 0.0002, + "loss": 0.0849, + "step": 4820 + }, + { + "epoch": 0.008784233846125866, + "grad_norm": 0.043290987610816956, + "learning_rate": 0.0002, + "loss": 0.0739, + "step": 4830 + }, + { + "epoch": 0.008802420665683063, + "grad_norm": 0.10664638131856918, + "learning_rate": 0.0002, + "loss": 0.0722, + "step": 4840 + }, + { + "epoch": 0.008820607485240259, + "grad_norm": 0.033459801226854324, + "learning_rate": 0.0002, + "loss": 0.0234, + "step": 4850 + }, + { + "epoch": 0.008838794304797455, + "grad_norm": 0.049193184822797775, + "learning_rate": 0.0002, + "loss": 0.1173, + "step": 4860 + }, + { + "epoch": 0.008856981124354651, + "grad_norm": 0.05060647428035736, + "learning_rate": 0.0002, + "loss": 0.0883, + "step": 4870 + }, + { + "epoch": 0.008875167943911848, + "grad_norm": 0.028496885672211647, + "learning_rate": 0.0002, + "loss": 0.0747, + "step": 4880 + }, + { + "epoch": 0.008893354763469044, + "grad_norm": 0.10652820765972137, + "learning_rate": 0.0002, + "loss": 0.0707, + "step": 4890 + }, + { + "epoch": 0.008911541583026242, + "grad_norm": 0.007879966869950294, + "learning_rate": 0.0002, + "loss": 0.0178, + "step": 4900 + }, + { + "epoch": 0.008929728402583438, + "grad_norm": 0.05227983742952347, + "learning_rate": 0.0002, + "loss": 0.1379, + "step": 4910 + }, + { + "epoch": 0.008947915222140635, + "grad_norm": 0.06054231896996498, + "learning_rate": 0.0002, + "loss": 0.0934, + "step": 4920 + }, + { + "epoch": 0.008966102041697831, + "grad_norm": 0.029085835441946983, + "learning_rate": 0.0002, + "loss": 0.0816, + "step": 4930 + }, + { + "epoch": 0.008984288861255027, + "grad_norm": 0.09829402714967728, + "learning_rate": 0.0002, + "loss": 0.0672, + "step": 4940 + }, + { + "epoch": 0.009002475680812224, + "grad_norm": 0.005579107441008091, + "learning_rate": 0.0002, + "loss": 0.0172, + "step": 4950 + }, + { + "epoch": 0.00902066250036942, + "grad_norm": 0.027280857786536217, + "learning_rate": 0.0002, + "loss": 0.1659, + "step": 4960 + }, + { + "epoch": 0.009038849319926616, + "grad_norm": 0.10321583598852158, + "learning_rate": 0.0002, + "loss": 0.0947, + "step": 4970 + }, + { + "epoch": 0.009057036139483813, + "grad_norm": 0.03381946310400963, + "learning_rate": 0.0002, + "loss": 0.0837, + "step": 4980 + }, + { + "epoch": 0.009075222959041009, + "grad_norm": 0.14493779838085175, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 4990 + }, + { + "epoch": 0.009093409778598205, + "grad_norm": 0.009917684830725193, + "learning_rate": 0.0002, + "loss": 0.0188, + "step": 5000 + }, + { + "epoch": 0.009111596598155402, + "grad_norm": 1.003450632095337, + "learning_rate": 0.0002, + "loss": 0.218, + "step": 5010 + }, + { + "epoch": 0.009129783417712598, + "grad_norm": 0.09081514924764633, + "learning_rate": 0.0002, + "loss": 0.1714, + "step": 5020 + }, + { + "epoch": 0.009147970237269794, + "grad_norm": 0.042343392968177795, + "learning_rate": 0.0002, + "loss": 0.0823, + "step": 5030 + }, + { + "epoch": 0.00916615705682699, + "grad_norm": 0.09944835305213928, + "learning_rate": 0.0002, + "loss": 0.0667, + "step": 5040 + }, + { + "epoch": 0.009184343876384187, + "grad_norm": 0.008264658972620964, + "learning_rate": 0.0002, + "loss": 0.0122, + "step": 5050 + }, + { + "epoch": 0.009202530695941383, + "grad_norm": 0.08990125358104706, + "learning_rate": 0.0002, + "loss": 0.1685, + "step": 5060 + }, + { + "epoch": 0.009220717515498581, + "grad_norm": 0.0331488698720932, + "learning_rate": 0.0002, + "loss": 0.0885, + "step": 5070 + }, + { + "epoch": 0.009238904335055777, + "grad_norm": 0.029458707198500633, + "learning_rate": 0.0002, + "loss": 0.078, + "step": 5080 + }, + { + "epoch": 0.009257091154612974, + "grad_norm": 0.10468839108943939, + "learning_rate": 0.0002, + "loss": 0.0683, + "step": 5090 + }, + { + "epoch": 0.00927527797417017, + "grad_norm": 0.002719841431826353, + "learning_rate": 0.0002, + "loss": 0.0117, + "step": 5100 + }, + { + "epoch": 0.009293464793727366, + "grad_norm": 0.0411439947783947, + "learning_rate": 0.0002, + "loss": 0.2025, + "step": 5110 + }, + { + "epoch": 0.009311651613284563, + "grad_norm": 0.03695548698306084, + "learning_rate": 0.0002, + "loss": 0.0831, + "step": 5120 + }, + { + "epoch": 0.009329838432841759, + "grad_norm": 0.06067590415477753, + "learning_rate": 0.0002, + "loss": 0.0779, + "step": 5130 + }, + { + "epoch": 0.009348025252398955, + "grad_norm": 0.11754634976387024, + "learning_rate": 0.0002, + "loss": 0.0667, + "step": 5140 + }, + { + "epoch": 0.009366212071956152, + "grad_norm": 0.004248317331075668, + "learning_rate": 0.0002, + "loss": 0.0113, + "step": 5150 + }, + { + "epoch": 0.009384398891513348, + "grad_norm": 0.03073648177087307, + "learning_rate": 0.0002, + "loss": 0.2289, + "step": 5160 + }, + { + "epoch": 0.009402585711070544, + "grad_norm": 0.10287592560052872, + "learning_rate": 0.0002, + "loss": 0.0977, + "step": 5170 + }, + { + "epoch": 0.00942077253062774, + "grad_norm": 0.06832946836948395, + "learning_rate": 0.0002, + "loss": 0.0764, + "step": 5180 + }, + { + "epoch": 0.009438959350184937, + "grad_norm": 0.1760883778333664, + "learning_rate": 0.0002, + "loss": 0.0795, + "step": 5190 + }, + { + "epoch": 0.009457146169742133, + "grad_norm": 0.02968805655837059, + "learning_rate": 0.0002, + "loss": 0.0253, + "step": 5200 + }, + { + "epoch": 0.00947533298929933, + "grad_norm": 0.046602651476860046, + "learning_rate": 0.0002, + "loss": 0.1432, + "step": 5210 + }, + { + "epoch": 0.009493519808856526, + "grad_norm": 0.051989324390888214, + "learning_rate": 0.0002, + "loss": 0.0807, + "step": 5220 + }, + { + "epoch": 0.009511706628413722, + "grad_norm": 0.04583961144089699, + "learning_rate": 0.0002, + "loss": 0.0782, + "step": 5230 + }, + { + "epoch": 0.00952989344797092, + "grad_norm": 0.13195525109767914, + "learning_rate": 0.0002, + "loss": 0.0688, + "step": 5240 + }, + { + "epoch": 0.009548080267528116, + "grad_norm": 0.011369351297616959, + "learning_rate": 0.0002, + "loss": 0.0232, + "step": 5250 + }, + { + "epoch": 0.009566267087085313, + "grad_norm": 0.05092083290219307, + "learning_rate": 0.0002, + "loss": 0.145, + "step": 5260 + }, + { + "epoch": 0.009584453906642509, + "grad_norm": 0.05051489174365997, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 5270 + }, + { + "epoch": 0.009602640726199705, + "grad_norm": 0.05730990320444107, + "learning_rate": 0.0002, + "loss": 0.0717, + "step": 5280 + }, + { + "epoch": 0.009620827545756902, + "grad_norm": 0.11170202493667603, + "learning_rate": 0.0002, + "loss": 0.0711, + "step": 5290 + }, + { + "epoch": 0.009639014365314098, + "grad_norm": 0.011571788229048252, + "learning_rate": 0.0002, + "loss": 0.0204, + "step": 5300 + }, + { + "epoch": 0.009657201184871294, + "grad_norm": 0.04396244138479233, + "learning_rate": 0.0002, + "loss": 0.1764, + "step": 5310 + }, + { + "epoch": 0.00967538800442849, + "grad_norm": 0.047808658331632614, + "learning_rate": 0.0002, + "loss": 0.0855, + "step": 5320 + }, + { + "epoch": 0.009693574823985687, + "grad_norm": 0.09201673418283463, + "learning_rate": 0.0002, + "loss": 0.0737, + "step": 5330 + }, + { + "epoch": 0.009711761643542883, + "grad_norm": 0.12273146212100983, + "learning_rate": 0.0002, + "loss": 0.0658, + "step": 5340 + }, + { + "epoch": 0.00972994846310008, + "grad_norm": 0.014599839225411415, + "learning_rate": 0.0002, + "loss": 0.0254, + "step": 5350 + }, + { + "epoch": 0.009748135282657276, + "grad_norm": 0.049732692539691925, + "learning_rate": 0.0002, + "loss": 0.1432, + "step": 5360 + }, + { + "epoch": 0.009766322102214472, + "grad_norm": 0.07791377604007721, + "learning_rate": 0.0002, + "loss": 0.0865, + "step": 5370 + }, + { + "epoch": 0.009784508921771668, + "grad_norm": 0.06298892199993134, + "learning_rate": 0.0002, + "loss": 0.0816, + "step": 5380 + }, + { + "epoch": 0.009802695741328865, + "grad_norm": 0.08924435079097748, + "learning_rate": 0.0002, + "loss": 0.0709, + "step": 5390 + }, + { + "epoch": 0.009820882560886061, + "grad_norm": 0.02383723482489586, + "learning_rate": 0.0002, + "loss": 0.0208, + "step": 5400 + }, + { + "epoch": 0.009839069380443257, + "grad_norm": 0.042910825461149216, + "learning_rate": 0.0002, + "loss": 0.1383, + "step": 5410 + }, + { + "epoch": 0.009857256200000455, + "grad_norm": 0.05560186505317688, + "learning_rate": 0.0002, + "loss": 0.0827, + "step": 5420 + }, + { + "epoch": 0.009875443019557652, + "grad_norm": 0.08179624378681183, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 5430 + }, + { + "epoch": 0.009893629839114848, + "grad_norm": 0.17111806571483612, + "learning_rate": 0.0002, + "loss": 0.0688, + "step": 5440 + }, + { + "epoch": 0.009911816658672044, + "grad_norm": 0.008684845641255379, + "learning_rate": 0.0002, + "loss": 0.0177, + "step": 5450 + }, + { + "epoch": 0.00993000347822924, + "grad_norm": 0.044370412826538086, + "learning_rate": 0.0002, + "loss": 0.2036, + "step": 5460 + }, + { + "epoch": 0.009948190297786437, + "grad_norm": 0.08403154462575912, + "learning_rate": 0.0002, + "loss": 0.0878, + "step": 5470 + }, + { + "epoch": 0.009966377117343633, + "grad_norm": 0.10712645202875137, + "learning_rate": 0.0002, + "loss": 0.0787, + "step": 5480 + }, + { + "epoch": 0.00998456393690083, + "grad_norm": 0.12575705349445343, + "learning_rate": 0.0002, + "loss": 0.0637, + "step": 5490 + }, + { + "epoch": 0.010002750756458026, + "grad_norm": 0.018583891913294792, + "learning_rate": 0.0002, + "loss": 0.0179, + "step": 5500 + }, + { + "epoch": 0.010020937576015222, + "grad_norm": 0.040852561593055725, + "learning_rate": 0.0002, + "loss": 0.1545, + "step": 5510 + }, + { + "epoch": 0.010039124395572419, + "grad_norm": 0.09006325900554657, + "learning_rate": 0.0002, + "loss": 0.0888, + "step": 5520 + }, + { + "epoch": 0.010057311215129615, + "grad_norm": 0.06323093175888062, + "learning_rate": 0.0002, + "loss": 0.0778, + "step": 5530 + }, + { + "epoch": 0.010075498034686811, + "grad_norm": 0.10159824043512344, + "learning_rate": 0.0002, + "loss": 0.0662, + "step": 5540 + }, + { + "epoch": 0.010093684854244007, + "grad_norm": 0.012086872011423111, + "learning_rate": 0.0002, + "loss": 0.0237, + "step": 5550 + }, + { + "epoch": 0.010111871673801204, + "grad_norm": 0.02518664114177227, + "learning_rate": 0.0002, + "loss": 0.1246, + "step": 5560 + }, + { + "epoch": 0.0101300584933584, + "grad_norm": 0.056161828339099884, + "learning_rate": 0.0002, + "loss": 0.086, + "step": 5570 + }, + { + "epoch": 0.010148245312915596, + "grad_norm": 0.03376586362719536, + "learning_rate": 0.0002, + "loss": 0.0842, + "step": 5580 + }, + { + "epoch": 0.010166432132472794, + "grad_norm": 0.09921032190322876, + "learning_rate": 0.0002, + "loss": 0.0667, + "step": 5590 + }, + { + "epoch": 0.01018461895202999, + "grad_norm": 0.009120604954659939, + "learning_rate": 0.0002, + "loss": 0.0209, + "step": 5600 + }, + { + "epoch": 0.010202805771587187, + "grad_norm": 0.037767425179481506, + "learning_rate": 0.0002, + "loss": 0.1248, + "step": 5610 + }, + { + "epoch": 0.010220992591144383, + "grad_norm": 0.05255524069070816, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 5620 + }, + { + "epoch": 0.01023917941070158, + "grad_norm": 0.038734354078769684, + "learning_rate": 0.0002, + "loss": 0.0791, + "step": 5630 + }, + { + "epoch": 0.010257366230258776, + "grad_norm": 0.09293238073587418, + "learning_rate": 0.0002, + "loss": 0.064, + "step": 5640 + }, + { + "epoch": 0.010275553049815972, + "grad_norm": 0.013020232319831848, + "learning_rate": 0.0002, + "loss": 0.0174, + "step": 5650 + }, + { + "epoch": 0.010293739869373169, + "grad_norm": 0.030535893514752388, + "learning_rate": 0.0002, + "loss": 0.1615, + "step": 5660 + }, + { + "epoch": 0.010311926688930365, + "grad_norm": 0.08644227683544159, + "learning_rate": 0.0002, + "loss": 0.0856, + "step": 5670 + }, + { + "epoch": 0.010330113508487561, + "grad_norm": 0.04769067466259003, + "learning_rate": 0.0002, + "loss": 0.0777, + "step": 5680 + }, + { + "epoch": 0.010348300328044758, + "grad_norm": 0.1528550088405609, + "learning_rate": 0.0002, + "loss": 0.0757, + "step": 5690 + }, + { + "epoch": 0.010366487147601954, + "grad_norm": 0.012257793918251991, + "learning_rate": 0.0002, + "loss": 0.0163, + "step": 5700 + }, + { + "epoch": 0.01038467396715915, + "grad_norm": 0.5761304497718811, + "learning_rate": 0.0002, + "loss": 0.1787, + "step": 5710 + }, + { + "epoch": 0.010402860786716346, + "grad_norm": 0.07034485787153244, + "learning_rate": 0.0002, + "loss": 0.0964, + "step": 5720 + }, + { + "epoch": 0.010421047606273543, + "grad_norm": 0.04541708156466484, + "learning_rate": 0.0002, + "loss": 0.0793, + "step": 5730 + }, + { + "epoch": 0.010439234425830739, + "grad_norm": 0.12013612687587738, + "learning_rate": 0.0002, + "loss": 0.07, + "step": 5740 + }, + { + "epoch": 0.010457421245387935, + "grad_norm": 0.014152747578918934, + "learning_rate": 0.0002, + "loss": 0.0208, + "step": 5750 + }, + { + "epoch": 0.010475608064945133, + "grad_norm": 0.029470542445778847, + "learning_rate": 0.0002, + "loss": 0.1352, + "step": 5760 + }, + { + "epoch": 0.01049379488450233, + "grad_norm": 0.04889104515314102, + "learning_rate": 0.0002, + "loss": 0.0748, + "step": 5770 + }, + { + "epoch": 0.010511981704059526, + "grad_norm": 0.0311355609446764, + "learning_rate": 0.0002, + "loss": 0.0764, + "step": 5780 + }, + { + "epoch": 0.010530168523616722, + "grad_norm": 0.16830098628997803, + "learning_rate": 0.0002, + "loss": 0.0734, + "step": 5790 + }, + { + "epoch": 0.010548355343173919, + "grad_norm": 0.013224232010543346, + "learning_rate": 0.0002, + "loss": 0.0218, + "step": 5800 + }, + { + "epoch": 0.010566542162731115, + "grad_norm": 0.03710555657744408, + "learning_rate": 0.0002, + "loss": 0.1403, + "step": 5810 + }, + { + "epoch": 0.010584728982288311, + "grad_norm": 0.05788695067167282, + "learning_rate": 0.0002, + "loss": 0.0863, + "step": 5820 + }, + { + "epoch": 0.010602915801845508, + "grad_norm": 0.03398163616657257, + "learning_rate": 0.0002, + "loss": 0.0751, + "step": 5830 + }, + { + "epoch": 0.010621102621402704, + "grad_norm": 0.13862720131874084, + "learning_rate": 0.0002, + "loss": 0.07, + "step": 5840 + }, + { + "epoch": 0.0106392894409599, + "grad_norm": 0.016240287572145462, + "learning_rate": 0.0002, + "loss": 0.0209, + "step": 5850 + }, + { + "epoch": 0.010657476260517097, + "grad_norm": 0.030351752415299416, + "learning_rate": 0.0002, + "loss": 0.157, + "step": 5860 + }, + { + "epoch": 0.010675663080074293, + "grad_norm": 0.038465555757284164, + "learning_rate": 0.0002, + "loss": 0.072, + "step": 5870 + }, + { + "epoch": 0.01069384989963149, + "grad_norm": 0.07298482209444046, + "learning_rate": 0.0002, + "loss": 0.0796, + "step": 5880 + }, + { + "epoch": 0.010712036719188685, + "grad_norm": 0.13822157680988312, + "learning_rate": 0.0002, + "loss": 0.0687, + "step": 5890 + }, + { + "epoch": 0.010730223538745882, + "grad_norm": 0.014381729066371918, + "learning_rate": 0.0002, + "loss": 0.0192, + "step": 5900 + }, + { + "epoch": 0.010748410358303078, + "grad_norm": 0.040448348969221115, + "learning_rate": 0.0002, + "loss": 0.1714, + "step": 5910 + }, + { + "epoch": 0.010766597177860274, + "grad_norm": 0.06950225681066513, + "learning_rate": 0.0002, + "loss": 0.098, + "step": 5920 + }, + { + "epoch": 0.010784783997417472, + "grad_norm": 0.04581855982542038, + "learning_rate": 0.0002, + "loss": 0.0752, + "step": 5930 + }, + { + "epoch": 0.010802970816974669, + "grad_norm": 0.10498905926942825, + "learning_rate": 0.0002, + "loss": 0.0627, + "step": 5940 + }, + { + "epoch": 0.010821157636531865, + "grad_norm": 0.009345698170363903, + "learning_rate": 0.0002, + "loss": 0.0183, + "step": 5950 + }, + { + "epoch": 0.010839344456089061, + "grad_norm": 0.02440352365374565, + "learning_rate": 0.0002, + "loss": 0.1289, + "step": 5960 + }, + { + "epoch": 0.010857531275646258, + "grad_norm": 0.051523737609386444, + "learning_rate": 0.0002, + "loss": 0.0813, + "step": 5970 + }, + { + "epoch": 0.010875718095203454, + "grad_norm": 0.031664300709962845, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 5980 + }, + { + "epoch": 0.01089390491476065, + "grad_norm": 0.10166060924530029, + "learning_rate": 0.0002, + "loss": 0.0631, + "step": 5990 + }, + { + "epoch": 0.010912091734317847, + "grad_norm": 0.01642071269452572, + "learning_rate": 0.0002, + "loss": 0.0198, + "step": 6000 + }, + { + "epoch": 0.010930278553875043, + "grad_norm": 0.04028782621026039, + "learning_rate": 0.0002, + "loss": 0.1355, + "step": 6010 + }, + { + "epoch": 0.01094846537343224, + "grad_norm": 0.04289260134100914, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 6020 + }, + { + "epoch": 0.010966652192989436, + "grad_norm": 0.03854202851653099, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 6030 + }, + { + "epoch": 0.010984839012546632, + "grad_norm": 0.07910823822021484, + "learning_rate": 0.0002, + "loss": 0.0618, + "step": 6040 + }, + { + "epoch": 0.011003025832103828, + "grad_norm": 0.009719946421682835, + "learning_rate": 0.0002, + "loss": 0.0145, + "step": 6050 + }, + { + "epoch": 0.011021212651661024, + "grad_norm": 0.06853003799915314, + "learning_rate": 0.0002, + "loss": 0.1563, + "step": 6060 + }, + { + "epoch": 0.01103939947121822, + "grad_norm": 0.02887076325714588, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 6070 + }, + { + "epoch": 0.011057586290775417, + "grad_norm": 0.060147739946842194, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 6080 + }, + { + "epoch": 0.011075773110332613, + "grad_norm": 0.10197418928146362, + "learning_rate": 0.0002, + "loss": 0.0627, + "step": 6090 + }, + { + "epoch": 0.011093959929889811, + "grad_norm": 0.015125100500881672, + "learning_rate": 0.0002, + "loss": 0.0164, + "step": 6100 + }, + { + "epoch": 0.011112146749447008, + "grad_norm": 0.029526161029934883, + "learning_rate": 0.0002, + "loss": 0.1526, + "step": 6110 + }, + { + "epoch": 0.011130333569004204, + "grad_norm": 0.05942453444004059, + "learning_rate": 0.0002, + "loss": 0.0891, + "step": 6120 + }, + { + "epoch": 0.0111485203885614, + "grad_norm": 0.07344426214694977, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 6130 + }, + { + "epoch": 0.011166707208118597, + "grad_norm": 0.1394059658050537, + "learning_rate": 0.0002, + "loss": 0.0776, + "step": 6140 + }, + { + "epoch": 0.011184894027675793, + "grad_norm": 0.00965851079672575, + "learning_rate": 0.0002, + "loss": 0.019, + "step": 6150 + }, + { + "epoch": 0.01120308084723299, + "grad_norm": 0.041846372187137604, + "learning_rate": 0.0002, + "loss": 0.1776, + "step": 6160 + }, + { + "epoch": 0.011221267666790186, + "grad_norm": 0.04657486826181412, + "learning_rate": 0.0002, + "loss": 0.0878, + "step": 6170 + }, + { + "epoch": 0.011239454486347382, + "grad_norm": 0.026520246639847755, + "learning_rate": 0.0002, + "loss": 0.0768, + "step": 6180 + }, + { + "epoch": 0.011257641305904578, + "grad_norm": 0.10318096727132797, + "learning_rate": 0.0002, + "loss": 0.0617, + "step": 6190 + }, + { + "epoch": 0.011275828125461775, + "grad_norm": 0.019912905991077423, + "learning_rate": 0.0002, + "loss": 0.0202, + "step": 6200 + }, + { + "epoch": 0.01129401494501897, + "grad_norm": 0.05316480994224548, + "learning_rate": 0.0002, + "loss": 0.1412, + "step": 6210 + }, + { + "epoch": 0.011312201764576167, + "grad_norm": 0.02944323979318142, + "learning_rate": 0.0002, + "loss": 0.0829, + "step": 6220 + }, + { + "epoch": 0.011330388584133363, + "grad_norm": 0.0285831056535244, + "learning_rate": 0.0002, + "loss": 0.074, + "step": 6230 + }, + { + "epoch": 0.01134857540369056, + "grad_norm": 0.0975700169801712, + "learning_rate": 0.0002, + "loss": 0.0681, + "step": 6240 + }, + { + "epoch": 0.011366762223247756, + "grad_norm": 0.025717545300722122, + "learning_rate": 0.0002, + "loss": 0.0221, + "step": 6250 + }, + { + "epoch": 0.011384949042804952, + "grad_norm": 0.02859714813530445, + "learning_rate": 0.0002, + "loss": 0.1142, + "step": 6260 + }, + { + "epoch": 0.01140313586236215, + "grad_norm": 0.04395005479454994, + "learning_rate": 0.0002, + "loss": 0.0777, + "step": 6270 + }, + { + "epoch": 0.011421322681919347, + "grad_norm": 0.05116860568523407, + "learning_rate": 0.0002, + "loss": 0.0763, + "step": 6280 + }, + { + "epoch": 0.011439509501476543, + "grad_norm": 0.06850302964448929, + "learning_rate": 0.0002, + "loss": 0.0632, + "step": 6290 + }, + { + "epoch": 0.01145769632103374, + "grad_norm": 0.016113542020320892, + "learning_rate": 0.0002, + "loss": 0.0178, + "step": 6300 + }, + { + "epoch": 0.011475883140590936, + "grad_norm": 0.032306116074323654, + "learning_rate": 0.0002, + "loss": 0.1306, + "step": 6310 + }, + { + "epoch": 0.011494069960148132, + "grad_norm": 0.055701326578855515, + "learning_rate": 0.0002, + "loss": 0.0834, + "step": 6320 + }, + { + "epoch": 0.011512256779705328, + "grad_norm": 0.022934190928936005, + "learning_rate": 0.0002, + "loss": 0.0737, + "step": 6330 + }, + { + "epoch": 0.011530443599262525, + "grad_norm": 0.08375566452741623, + "learning_rate": 0.0002, + "loss": 0.0661, + "step": 6340 + }, + { + "epoch": 0.011548630418819721, + "grad_norm": 0.013614729046821594, + "learning_rate": 0.0002, + "loss": 0.0187, + "step": 6350 + }, + { + "epoch": 0.011566817238376917, + "grad_norm": 0.028269700706005096, + "learning_rate": 0.0002, + "loss": 0.1245, + "step": 6360 + }, + { + "epoch": 0.011585004057934114, + "grad_norm": 0.03646335378289223, + "learning_rate": 0.0002, + "loss": 0.0866, + "step": 6370 + }, + { + "epoch": 0.01160319087749131, + "grad_norm": 0.0371277742087841, + "learning_rate": 0.0002, + "loss": 0.0737, + "step": 6380 + }, + { + "epoch": 0.011621377697048506, + "grad_norm": 0.13698458671569824, + "learning_rate": 0.0002, + "loss": 0.0679, + "step": 6390 + }, + { + "epoch": 0.011639564516605702, + "grad_norm": 0.009350700303912163, + "learning_rate": 0.0002, + "loss": 0.024, + "step": 6400 + }, + { + "epoch": 0.011657751336162899, + "grad_norm": 0.03187236189842224, + "learning_rate": 0.0002, + "loss": 0.1555, + "step": 6410 + }, + { + "epoch": 0.011675938155720095, + "grad_norm": 0.06672242283821106, + "learning_rate": 0.0002, + "loss": 0.0835, + "step": 6420 + }, + { + "epoch": 0.011694124975277291, + "grad_norm": 0.07821471244096756, + "learning_rate": 0.0002, + "loss": 0.0746, + "step": 6430 + }, + { + "epoch": 0.011712311794834488, + "grad_norm": 0.14781107008457184, + "learning_rate": 0.0002, + "loss": 0.0662, + "step": 6440 + }, + { + "epoch": 0.011730498614391686, + "grad_norm": 0.0057207453064620495, + "learning_rate": 0.0002, + "loss": 0.0169, + "step": 6450 + }, + { + "epoch": 0.011748685433948882, + "grad_norm": 0.04252105578780174, + "learning_rate": 0.0002, + "loss": 0.1868, + "step": 6460 + }, + { + "epoch": 0.011766872253506078, + "grad_norm": 0.05041474476456642, + "learning_rate": 0.0002, + "loss": 0.0842, + "step": 6470 + }, + { + "epoch": 0.011785059073063275, + "grad_norm": 0.06584125757217407, + "learning_rate": 0.0002, + "loss": 0.0779, + "step": 6480 + }, + { + "epoch": 0.011803245892620471, + "grad_norm": 0.14610575139522552, + "learning_rate": 0.0002, + "loss": 0.063, + "step": 6490 + }, + { + "epoch": 0.011821432712177667, + "grad_norm": 0.01419675163924694, + "learning_rate": 0.0002, + "loss": 0.0152, + "step": 6500 + }, + { + "epoch": 0.011839619531734864, + "grad_norm": 0.03371060639619827, + "learning_rate": 0.0002, + "loss": 0.1725, + "step": 6510 + }, + { + "epoch": 0.01185780635129206, + "grad_norm": 0.028900766745209694, + "learning_rate": 0.0002, + "loss": 0.0815, + "step": 6520 + }, + { + "epoch": 0.011875993170849256, + "grad_norm": 0.059519629925489426, + "learning_rate": 0.0002, + "loss": 0.0785, + "step": 6530 + }, + { + "epoch": 0.011894179990406453, + "grad_norm": 0.12085167318582535, + "learning_rate": 0.0002, + "loss": 0.0615, + "step": 6540 + }, + { + "epoch": 0.011912366809963649, + "grad_norm": 0.028604619204998016, + "learning_rate": 0.0002, + "loss": 0.0196, + "step": 6550 + }, + { + "epoch": 0.011930553629520845, + "grad_norm": 0.03659407049417496, + "learning_rate": 0.0002, + "loss": 0.1403, + "step": 6560 + }, + { + "epoch": 0.011948740449078041, + "grad_norm": 0.034444138407707214, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 6570 + }, + { + "epoch": 0.011966927268635238, + "grad_norm": 0.029788263142108917, + "learning_rate": 0.0002, + "loss": 0.0713, + "step": 6580 + }, + { + "epoch": 0.011985114088192434, + "grad_norm": 0.1271272599697113, + "learning_rate": 0.0002, + "loss": 0.0672, + "step": 6590 + }, + { + "epoch": 0.01200330090774963, + "grad_norm": 0.018705012276768684, + "learning_rate": 0.0002, + "loss": 0.0212, + "step": 6600 + }, + { + "epoch": 0.012021487727306827, + "grad_norm": 0.02982541173696518, + "learning_rate": 0.0002, + "loss": 0.1152, + "step": 6610 + }, + { + "epoch": 0.012039674546864025, + "grad_norm": 0.06942040473222733, + "learning_rate": 0.0002, + "loss": 0.0963, + "step": 6620 + }, + { + "epoch": 0.012057861366421221, + "grad_norm": 0.06102292984724045, + "learning_rate": 0.0002, + "loss": 0.0775, + "step": 6630 + }, + { + "epoch": 0.012076048185978417, + "grad_norm": 0.10115987807512283, + "learning_rate": 0.0002, + "loss": 0.0729, + "step": 6640 + }, + { + "epoch": 0.012094235005535614, + "grad_norm": 0.011439867317676544, + "learning_rate": 0.0002, + "loss": 0.0253, + "step": 6650 + }, + { + "epoch": 0.01211242182509281, + "grad_norm": 0.062434904277324677, + "learning_rate": 0.0002, + "loss": 0.1166, + "step": 6660 + }, + { + "epoch": 0.012130608644650006, + "grad_norm": 0.055352553725242615, + "learning_rate": 0.0002, + "loss": 0.0802, + "step": 6670 + }, + { + "epoch": 0.012148795464207203, + "grad_norm": 0.031538888812065125, + "learning_rate": 0.0002, + "loss": 0.0786, + "step": 6680 + }, + { + "epoch": 0.012166982283764399, + "grad_norm": 0.10964162647724152, + "learning_rate": 0.0002, + "loss": 0.0626, + "step": 6690 + }, + { + "epoch": 0.012185169103321595, + "grad_norm": 0.011173764243721962, + "learning_rate": 0.0002, + "loss": 0.0205, + "step": 6700 + }, + { + "epoch": 0.012203355922878792, + "grad_norm": 0.035984206944704056, + "learning_rate": 0.0002, + "loss": 0.1412, + "step": 6710 + }, + { + "epoch": 0.012221542742435988, + "grad_norm": 0.07189827412366867, + "learning_rate": 0.0002, + "loss": 0.0818, + "step": 6720 + }, + { + "epoch": 0.012239729561993184, + "grad_norm": 0.0400136299431324, + "learning_rate": 0.0002, + "loss": 0.0676, + "step": 6730 + }, + { + "epoch": 0.01225791638155038, + "grad_norm": 0.14700625836849213, + "learning_rate": 0.0002, + "loss": 0.0663, + "step": 6740 + }, + { + "epoch": 0.012276103201107577, + "grad_norm": 0.007156179752200842, + "learning_rate": 0.0002, + "loss": 0.0171, + "step": 6750 + }, + { + "epoch": 0.012294290020664773, + "grad_norm": 0.04911777004599571, + "learning_rate": 0.0002, + "loss": 0.1657, + "step": 6760 + }, + { + "epoch": 0.01231247684022197, + "grad_norm": 0.03729144483804703, + "learning_rate": 0.0002, + "loss": 0.0806, + "step": 6770 + }, + { + "epoch": 0.012330663659779166, + "grad_norm": 0.037231944501399994, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 6780 + }, + { + "epoch": 0.012348850479336364, + "grad_norm": 0.09694401919841766, + "learning_rate": 0.0002, + "loss": 0.0642, + "step": 6790 + }, + { + "epoch": 0.01236703729889356, + "grad_norm": 0.025534989312291145, + "learning_rate": 0.0002, + "loss": 0.0208, + "step": 6800 + }, + { + "epoch": 0.012385224118450756, + "grad_norm": 0.033654361963272095, + "learning_rate": 0.0002, + "loss": 0.1295, + "step": 6810 + }, + { + "epoch": 0.012403410938007953, + "grad_norm": 0.04499521851539612, + "learning_rate": 0.0002, + "loss": 0.0902, + "step": 6820 + }, + { + "epoch": 0.012421597757565149, + "grad_norm": 0.0335836224257946, + "learning_rate": 0.0002, + "loss": 0.0781, + "step": 6830 + }, + { + "epoch": 0.012439784577122345, + "grad_norm": 0.1040850430727005, + "learning_rate": 0.0002, + "loss": 0.0679, + "step": 6840 + }, + { + "epoch": 0.012457971396679542, + "grad_norm": 0.015963764861226082, + "learning_rate": 0.0002, + "loss": 0.0226, + "step": 6850 + }, + { + "epoch": 0.012476158216236738, + "grad_norm": 0.05578307807445526, + "learning_rate": 0.0002, + "loss": 0.1119, + "step": 6860 + }, + { + "epoch": 0.012494345035793934, + "grad_norm": 0.0364505760371685, + "learning_rate": 0.0002, + "loss": 0.0805, + "step": 6870 + }, + { + "epoch": 0.01251253185535113, + "grad_norm": 0.027990469709038734, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 6880 + }, + { + "epoch": 0.012530718674908327, + "grad_norm": 0.08282670378684998, + "learning_rate": 0.0002, + "loss": 0.0685, + "step": 6890 + }, + { + "epoch": 0.012548905494465523, + "grad_norm": 0.02172144502401352, + "learning_rate": 0.0002, + "loss": 0.0259, + "step": 6900 + }, + { + "epoch": 0.01256709231402272, + "grad_norm": 0.04074740409851074, + "learning_rate": 0.0002, + "loss": 0.1211, + "step": 6910 + }, + { + "epoch": 0.012585279133579916, + "grad_norm": 0.05433020740747452, + "learning_rate": 0.0002, + "loss": 0.0776, + "step": 6920 + }, + { + "epoch": 0.012603465953137112, + "grad_norm": 0.05479983240365982, + "learning_rate": 0.0002, + "loss": 0.0778, + "step": 6930 + }, + { + "epoch": 0.012621652772694308, + "grad_norm": 1.6031180620193481, + "learning_rate": 0.0002, + "loss": 0.2265, + "step": 6940 + }, + { + "epoch": 0.012639839592251505, + "grad_norm": 1.0940366983413696, + "learning_rate": 0.0002, + "loss": 0.4586, + "step": 6950 + }, + { + "epoch": 0.012658026411808703, + "grad_norm": 0.0412282720208168, + "learning_rate": 0.0002, + "loss": 0.1072, + "step": 6960 + }, + { + "epoch": 0.012676213231365899, + "grad_norm": 0.03705910965800285, + "learning_rate": 0.0002, + "loss": 0.1014, + "step": 6970 + }, + { + "epoch": 0.012694400050923095, + "grad_norm": 0.07444313168525696, + "learning_rate": 0.0002, + "loss": 0.0881, + "step": 6980 + }, + { + "epoch": 0.012712586870480292, + "grad_norm": 0.08558017760515213, + "learning_rate": 0.0002, + "loss": 0.0619, + "step": 6990 + }, + { + "epoch": 0.012730773690037488, + "grad_norm": 0.0004157133516855538, + "learning_rate": 0.0002, + "loss": 0.0045, + "step": 7000 + }, + { + "epoch": 0.012748960509594684, + "grad_norm": 0.07950109243392944, + "learning_rate": 0.0002, + "loss": 0.1801, + "step": 7010 + }, + { + "epoch": 0.01276714732915188, + "grad_norm": 0.08424151688814163, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 7020 + }, + { + "epoch": 0.012785334148709077, + "grad_norm": 0.47635558247566223, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 7030 + }, + { + "epoch": 0.012803520968266273, + "grad_norm": 0.0452958345413208, + "learning_rate": 0.0002, + "loss": 0.0731, + "step": 7040 + }, + { + "epoch": 0.01282170778782347, + "grad_norm": 0.007719043176621199, + "learning_rate": 0.0002, + "loss": 0.0193, + "step": 7050 + }, + { + "epoch": 0.012839894607380666, + "grad_norm": 0.2408572882413864, + "learning_rate": 0.0002, + "loss": 0.4117, + "step": 7060 + }, + { + "epoch": 0.012858081426937862, + "grad_norm": 0.7272363305091858, + "learning_rate": 0.0002, + "loss": 0.0852, + "step": 7070 + }, + { + "epoch": 0.012876268246495058, + "grad_norm": 0.5539261698722839, + "learning_rate": 0.0002, + "loss": 0.075, + "step": 7080 + }, + { + "epoch": 0.012894455066052255, + "grad_norm": 4.608922481536865, + "learning_rate": 0.0002, + "loss": 0.2301, + "step": 7090 + }, + { + "epoch": 0.012912641885609451, + "grad_norm": 0.0012216357281431556, + "learning_rate": 0.0002, + "loss": 0.0034, + "step": 7100 + }, + { + "epoch": 0.012930828705166647, + "grad_norm": 0.15025563538074493, + "learning_rate": 0.0002, + "loss": 0.2717, + "step": 7110 + }, + { + "epoch": 0.012949015524723844, + "grad_norm": 0.06209970638155937, + "learning_rate": 0.0002, + "loss": 0.0852, + "step": 7120 + }, + { + "epoch": 0.012967202344281042, + "grad_norm": 0.6127016544342041, + "learning_rate": 0.0002, + "loss": 0.1271, + "step": 7130 + }, + { + "epoch": 0.012985389163838238, + "grad_norm": 0.047152891755104065, + "learning_rate": 0.0002, + "loss": 0.0626, + "step": 7140 + }, + { + "epoch": 0.013003575983395434, + "grad_norm": 0.0005132685182616115, + "learning_rate": 0.0002, + "loss": 0.0029, + "step": 7150 + }, + { + "epoch": 0.01302176280295263, + "grad_norm": 0.08946029096841812, + "learning_rate": 0.0002, + "loss": 0.309, + "step": 7160 + }, + { + "epoch": 0.013039949622509827, + "grad_norm": 0.18610751628875732, + "learning_rate": 0.0002, + "loss": 0.0867, + "step": 7170 + }, + { + "epoch": 0.013058136442067023, + "grad_norm": 0.07280854880809784, + "learning_rate": 0.0002, + "loss": 0.0832, + "step": 7180 + }, + { + "epoch": 0.01307632326162422, + "grad_norm": 0.11997990310192108, + "learning_rate": 0.0002, + "loss": 0.0746, + "step": 7190 + }, + { + "epoch": 0.013094510081181416, + "grad_norm": 0.00019475500448606908, + "learning_rate": 0.0002, + "loss": 0.01, + "step": 7200 + }, + { + "epoch": 0.013112696900738612, + "grad_norm": 0.07719916105270386, + "learning_rate": 0.0002, + "loss": 0.3035, + "step": 7210 + }, + { + "epoch": 0.013130883720295809, + "grad_norm": 0.0990060344338417, + "learning_rate": 0.0002, + "loss": 0.0902, + "step": 7220 + }, + { + "epoch": 0.013149070539853005, + "grad_norm": 0.22215688228607178, + "learning_rate": 0.0002, + "loss": 0.0797, + "step": 7230 + }, + { + "epoch": 0.013167257359410201, + "grad_norm": 0.08412040770053864, + "learning_rate": 0.0002, + "loss": 0.0646, + "step": 7240 + }, + { + "epoch": 0.013185444178967397, + "grad_norm": 0.0017518314998596907, + "learning_rate": 0.0002, + "loss": 0.007, + "step": 7250 + }, + { + "epoch": 0.013203630998524594, + "grad_norm": 0.1554754078388214, + "learning_rate": 0.0002, + "loss": 0.2319, + "step": 7260 + }, + { + "epoch": 0.01322181781808179, + "grad_norm": 0.052371326833963394, + "learning_rate": 0.0002, + "loss": 0.0832, + "step": 7270 + }, + { + "epoch": 0.013240004637638986, + "grad_norm": 0.9168817400932312, + "learning_rate": 0.0002, + "loss": 0.0791, + "step": 7280 + }, + { + "epoch": 0.013258191457196183, + "grad_norm": 0.07169363647699356, + "learning_rate": 0.0002, + "loss": 0.0602, + "step": 7290 + }, + { + "epoch": 0.01327637827675338, + "grad_norm": 0.0009911650558933616, + "learning_rate": 0.0002, + "loss": 0.0041, + "step": 7300 + }, + { + "epoch": 0.013294565096310577, + "grad_norm": 0.2644541263580322, + "learning_rate": 0.0002, + "loss": 0.2193, + "step": 7310 + }, + { + "epoch": 0.013312751915867773, + "grad_norm": 0.12140689790248871, + "learning_rate": 0.0002, + "loss": 0.0944, + "step": 7320 + }, + { + "epoch": 0.01333093873542497, + "grad_norm": 0.03627191483974457, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 7330 + }, + { + "epoch": 0.013349125554982166, + "grad_norm": 0.06252894550561905, + "learning_rate": 0.0002, + "loss": 0.0596, + "step": 7340 + }, + { + "epoch": 0.013367312374539362, + "grad_norm": 0.20318441092967987, + "learning_rate": 0.0002, + "loss": 0.0064, + "step": 7350 + }, + { + "epoch": 0.013385499194096559, + "grad_norm": 0.4231732189655304, + "learning_rate": 0.0002, + "loss": 0.4329, + "step": 7360 + }, + { + "epoch": 0.013403686013653755, + "grad_norm": 0.07567082345485687, + "learning_rate": 0.0002, + "loss": 0.089, + "step": 7370 + }, + { + "epoch": 0.013421872833210951, + "grad_norm": 0.23021474480628967, + "learning_rate": 0.0002, + "loss": 0.0833, + "step": 7380 + }, + { + "epoch": 0.013440059652768148, + "grad_norm": 0.09458985179662704, + "learning_rate": 0.0002, + "loss": 0.1391, + "step": 7390 + }, + { + "epoch": 0.013458246472325344, + "grad_norm": 0.010052111931145191, + "learning_rate": 0.0002, + "loss": 0.0073, + "step": 7400 + }, + { + "epoch": 0.01347643329188254, + "grad_norm": 0.2159787267446518, + "learning_rate": 0.0002, + "loss": 0.2249, + "step": 7410 + }, + { + "epoch": 0.013494620111439736, + "grad_norm": 0.11222853511571884, + "learning_rate": 0.0002, + "loss": 0.093, + "step": 7420 + }, + { + "epoch": 0.013512806930996933, + "grad_norm": 0.08586139976978302, + "learning_rate": 0.0002, + "loss": 0.0821, + "step": 7430 + }, + { + "epoch": 0.013530993750554129, + "grad_norm": 0.12232748419046402, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 7440 + }, + { + "epoch": 0.013549180570111325, + "grad_norm": 0.006977527402341366, + "learning_rate": 0.0002, + "loss": 0.0115, + "step": 7450 + }, + { + "epoch": 0.013567367389668522, + "grad_norm": 0.051690369844436646, + "learning_rate": 0.0002, + "loss": 0.2247, + "step": 7460 + }, + { + "epoch": 0.013585554209225718, + "grad_norm": 0.06542158871889114, + "learning_rate": 0.0002, + "loss": 0.1056, + "step": 7470 + }, + { + "epoch": 0.013603741028782916, + "grad_norm": 0.18546995520591736, + "learning_rate": 0.0002, + "loss": 0.1102, + "step": 7480 + }, + { + "epoch": 0.013621927848340112, + "grad_norm": 13.399182319641113, + "learning_rate": 0.0002, + "loss": 2.0806, + "step": 7490 + }, + { + "epoch": 0.013640114667897309, + "grad_norm": 0.0982588455080986, + "learning_rate": 0.0002, + "loss": 0.2158, + "step": 7500 + }, + { + "epoch": 0.013658301487454505, + "grad_norm": 0.07860754430294037, + "learning_rate": 0.0002, + "loss": 0.125, + "step": 7510 + }, + { + "epoch": 0.013676488307011701, + "grad_norm": 0.1165497750043869, + "learning_rate": 0.0002, + "loss": 0.0899, + "step": 7520 + }, + { + "epoch": 0.013694675126568898, + "grad_norm": 0.2813965380191803, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 7530 + }, + { + "epoch": 0.013712861946126094, + "grad_norm": 0.33458462357521057, + "learning_rate": 0.0002, + "loss": 0.0683, + "step": 7540 + }, + { + "epoch": 0.01373104876568329, + "grad_norm": 0.012062279507517815, + "learning_rate": 0.0002, + "loss": 0.0135, + "step": 7550 + }, + { + "epoch": 0.013749235585240487, + "grad_norm": 0.1787721961736679, + "learning_rate": 0.0002, + "loss": 0.1763, + "step": 7560 + }, + { + "epoch": 0.013767422404797683, + "grad_norm": 0.05922751501202583, + "learning_rate": 0.0002, + "loss": 0.1223, + "step": 7570 + }, + { + "epoch": 0.01378560922435488, + "grad_norm": 0.11594684422016144, + "learning_rate": 0.0002, + "loss": 0.0934, + "step": 7580 + }, + { + "epoch": 0.013803796043912075, + "grad_norm": 0.2290794998407364, + "learning_rate": 0.0002, + "loss": 0.0799, + "step": 7590 + }, + { + "epoch": 0.013821982863469272, + "grad_norm": 0.04903063178062439, + "learning_rate": 0.0002, + "loss": 0.0195, + "step": 7600 + }, + { + "epoch": 0.013840169683026468, + "grad_norm": 123.61300659179688, + "learning_rate": 0.0002, + "loss": 0.8359, + "step": 7610 + }, + { + "epoch": 0.013858356502583664, + "grad_norm": 0.25403347611427307, + "learning_rate": 0.0002, + "loss": 1.0464, + "step": 7620 + }, + { + "epoch": 0.01387654332214086, + "grad_norm": 0.08144152164459229, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 7630 + }, + { + "epoch": 0.013894730141698057, + "grad_norm": 0.11679713428020477, + "learning_rate": 0.0002, + "loss": 0.0706, + "step": 7640 + }, + { + "epoch": 0.013912916961255255, + "grad_norm": 0.00391317019239068, + "learning_rate": 0.0002, + "loss": 0.0075, + "step": 7650 + }, + { + "epoch": 0.013931103780812451, + "grad_norm": 0.13209663331508636, + "learning_rate": 0.0002, + "loss": 0.2228, + "step": 7660 + }, + { + "epoch": 0.013949290600369648, + "grad_norm": 0.06067880615592003, + "learning_rate": 0.0002, + "loss": 0.089, + "step": 7670 + }, + { + "epoch": 0.013967477419926844, + "grad_norm": 0.04806550592184067, + "learning_rate": 0.0002, + "loss": 0.0769, + "step": 7680 + }, + { + "epoch": 0.01398566423948404, + "grad_norm": 0.09506970643997192, + "learning_rate": 0.0002, + "loss": 0.0689, + "step": 7690 + }, + { + "epoch": 0.014003851059041237, + "grad_norm": 0.002536884741857648, + "learning_rate": 0.0002, + "loss": 0.0172, + "step": 7700 + }, + { + "epoch": 0.014022037878598433, + "grad_norm": 0.13837113976478577, + "learning_rate": 0.0002, + "loss": 0.2243, + "step": 7710 + }, + { + "epoch": 0.01404022469815563, + "grad_norm": 0.08101535588502884, + "learning_rate": 0.0002, + "loss": 0.0878, + "step": 7720 + }, + { + "epoch": 0.014058411517712826, + "grad_norm": 0.04018868878483772, + "learning_rate": 0.0002, + "loss": 0.0776, + "step": 7730 + }, + { + "epoch": 0.014076598337270022, + "grad_norm": 0.1377197653055191, + "learning_rate": 0.0002, + "loss": 0.0681, + "step": 7740 + }, + { + "epoch": 0.014094785156827218, + "grad_norm": 0.0006735012284480035, + "learning_rate": 0.0002, + "loss": 0.0041, + "step": 7750 + }, + { + "epoch": 0.014112971976384414, + "grad_norm": 0.17503094673156738, + "learning_rate": 0.0002, + "loss": 0.3114, + "step": 7760 + }, + { + "epoch": 0.01413115879594161, + "grad_norm": 0.07190551608800888, + "learning_rate": 0.0002, + "loss": 0.1018, + "step": 7770 + }, + { + "epoch": 0.014149345615498807, + "grad_norm": 0.036945659667253494, + "learning_rate": 0.0002, + "loss": 0.0785, + "step": 7780 + }, + { + "epoch": 0.014167532435056003, + "grad_norm": 0.13999724388122559, + "learning_rate": 0.0002, + "loss": 0.0725, + "step": 7790 + }, + { + "epoch": 0.0141857192546132, + "grad_norm": 0.0031171294394880533, + "learning_rate": 0.0002, + "loss": 0.0144, + "step": 7800 + }, + { + "epoch": 0.014203906074170396, + "grad_norm": 0.059554051607847214, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 7810 + }, + { + "epoch": 0.014222092893727594, + "grad_norm": 0.06873622536659241, + "learning_rate": 0.0002, + "loss": 0.0904, + "step": 7820 + }, + { + "epoch": 0.01424027971328479, + "grad_norm": 0.11261582374572754, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 7830 + }, + { + "epoch": 0.014258466532841987, + "grad_norm": 1.497631311416626, + "learning_rate": 0.0002, + "loss": 0.0689, + "step": 7840 + }, + { + "epoch": 0.014276653352399183, + "grad_norm": 0.004822546616196632, + "learning_rate": 0.0002, + "loss": 0.0156, + "step": 7850 + }, + { + "epoch": 0.01429484017195638, + "grad_norm": 0.0575052835047245, + "learning_rate": 0.0002, + "loss": 0.1895, + "step": 7860 + }, + { + "epoch": 0.014313026991513576, + "grad_norm": 0.10657750070095062, + "learning_rate": 0.0002, + "loss": 0.0855, + "step": 7870 + }, + { + "epoch": 0.014331213811070772, + "grad_norm": 0.07080844044685364, + "learning_rate": 0.0002, + "loss": 0.0716, + "step": 7880 + }, + { + "epoch": 0.014349400630627968, + "grad_norm": 0.1628514677286148, + "learning_rate": 0.0002, + "loss": 0.071, + "step": 7890 + }, + { + "epoch": 0.014367587450185165, + "grad_norm": 0.013860347680747509, + "learning_rate": 0.0002, + "loss": 0.0227, + "step": 7900 + }, + { + "epoch": 0.014385774269742361, + "grad_norm": 0.5240967869758606, + "learning_rate": 0.0002, + "loss": 0.1854, + "step": 7910 + }, + { + "epoch": 0.014403961089299557, + "grad_norm": 1.0027457475662231, + "learning_rate": 0.0002, + "loss": 0.0942, + "step": 7920 + }, + { + "epoch": 0.014422147908856753, + "grad_norm": 0.05730056390166283, + "learning_rate": 0.0002, + "loss": 0.0778, + "step": 7930 + }, + { + "epoch": 0.01444033472841395, + "grad_norm": 0.1485404521226883, + "learning_rate": 0.0002, + "loss": 0.0719, + "step": 7940 + }, + { + "epoch": 0.014458521547971146, + "grad_norm": 0.009702637791633606, + "learning_rate": 0.0002, + "loss": 0.0136, + "step": 7950 + }, + { + "epoch": 0.014476708367528342, + "grad_norm": 0.046543315052986145, + "learning_rate": 0.0002, + "loss": 0.1697, + "step": 7960 + }, + { + "epoch": 0.014494895187085539, + "grad_norm": 0.05248842388391495, + "learning_rate": 0.0002, + "loss": 0.0888, + "step": 7970 + }, + { + "epoch": 0.014513082006642735, + "grad_norm": 0.047813788056373596, + "learning_rate": 0.0002, + "loss": 0.0806, + "step": 7980 + }, + { + "epoch": 0.014531268826199933, + "grad_norm": 0.19744129478931427, + "learning_rate": 0.0002, + "loss": 0.0788, + "step": 7990 + }, + { + "epoch": 0.01454945564575713, + "grad_norm": 0.005265017040073872, + "learning_rate": 0.0002, + "loss": 0.014, + "step": 8000 + }, + { + "epoch": 0.014567642465314326, + "grad_norm": 0.0564056858420372, + "learning_rate": 0.0002, + "loss": 0.2681, + "step": 8010 + }, + { + "epoch": 0.014585829284871522, + "grad_norm": 0.0958496481180191, + "learning_rate": 0.0002, + "loss": 0.0823, + "step": 8020 + }, + { + "epoch": 0.014604016104428718, + "grad_norm": 0.12000919133424759, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 8030 + }, + { + "epoch": 0.014622202923985915, + "grad_norm": 0.15912771224975586, + "learning_rate": 0.0002, + "loss": 0.075, + "step": 8040 + }, + { + "epoch": 0.014640389743543111, + "grad_norm": 0.004025776404887438, + "learning_rate": 0.0002, + "loss": 0.012, + "step": 8050 + }, + { + "epoch": 0.014658576563100307, + "grad_norm": 0.1682930886745453, + "learning_rate": 0.0002, + "loss": 0.2926, + "step": 8060 + }, + { + "epoch": 0.014676763382657504, + "grad_norm": 0.057362254709005356, + "learning_rate": 0.0002, + "loss": 0.0869, + "step": 8070 + }, + { + "epoch": 0.0146949502022147, + "grad_norm": 0.0814078077673912, + "learning_rate": 0.0002, + "loss": 0.0825, + "step": 8080 + }, + { + "epoch": 0.014713137021771896, + "grad_norm": 0.18205074965953827, + "learning_rate": 0.0002, + "loss": 0.0699, + "step": 8090 + }, + { + "epoch": 0.014731323841329092, + "grad_norm": 0.013200881890952587, + "learning_rate": 0.0002, + "loss": 0.0228, + "step": 8100 + }, + { + "epoch": 0.014749510660886289, + "grad_norm": 0.21043474972248077, + "learning_rate": 0.0002, + "loss": 0.2138, + "step": 8110 + }, + { + "epoch": 0.014767697480443485, + "grad_norm": 0.1000015065073967, + "learning_rate": 0.0002, + "loss": 0.0906, + "step": 8120 + }, + { + "epoch": 0.014785884300000681, + "grad_norm": 0.045657768845558167, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 8130 + }, + { + "epoch": 0.014804071119557878, + "grad_norm": 0.13545630872249603, + "learning_rate": 0.0002, + "loss": 0.066, + "step": 8140 + }, + { + "epoch": 0.014822257939115074, + "grad_norm": 0.01422254927456379, + "learning_rate": 0.0002, + "loss": 0.0179, + "step": 8150 + }, + { + "epoch": 0.014840444758672272, + "grad_norm": 0.12108676135540009, + "learning_rate": 0.0002, + "loss": 0.1717, + "step": 8160 + }, + { + "epoch": 0.014858631578229468, + "grad_norm": 0.10441934317350388, + "learning_rate": 0.0002, + "loss": 0.106, + "step": 8170 + }, + { + "epoch": 0.014876818397786665, + "grad_norm": 0.08105968683958054, + "learning_rate": 0.0002, + "loss": 0.0829, + "step": 8180 + }, + { + "epoch": 0.014895005217343861, + "grad_norm": 0.12230301648378372, + "learning_rate": 0.0002, + "loss": 0.0731, + "step": 8190 + }, + { + "epoch": 0.014913192036901057, + "grad_norm": 0.033857300877571106, + "learning_rate": 0.0002, + "loss": 0.029, + "step": 8200 + }, + { + "epoch": 0.014931378856458254, + "grad_norm": 0.04827893525362015, + "learning_rate": 0.0002, + "loss": 0.1369, + "step": 8210 + }, + { + "epoch": 0.01494956567601545, + "grad_norm": 0.056212421506643295, + "learning_rate": 0.0002, + "loss": 0.0879, + "step": 8220 + }, + { + "epoch": 0.014967752495572646, + "grad_norm": 0.03163846209645271, + "learning_rate": 0.0002, + "loss": 0.0698, + "step": 8230 + }, + { + "epoch": 0.014985939315129843, + "grad_norm": 0.09394920617341995, + "learning_rate": 0.0002, + "loss": 0.0688, + "step": 8240 + }, + { + "epoch": 0.015004126134687039, + "grad_norm": 0.024936649948358536, + "learning_rate": 0.0002, + "loss": 0.0211, + "step": 8250 + }, + { + "epoch": 0.015022312954244235, + "grad_norm": 4.499615669250488, + "learning_rate": 0.0002, + "loss": 2.7596, + "step": 8260 + }, + { + "epoch": 0.015040499773801431, + "grad_norm": 9.221298217773438, + "learning_rate": 0.0002, + "loss": 0.9135, + "step": 8270 + }, + { + "epoch": 0.015058686593358628, + "grad_norm": 0.5199778079986572, + "learning_rate": 0.0002, + "loss": 0.1441, + "step": 8280 + }, + { + "epoch": 0.015076873412915824, + "grad_norm": 0.07028087228536606, + "learning_rate": 0.0002, + "loss": 0.0793, + "step": 8290 + }, + { + "epoch": 0.01509506023247302, + "grad_norm": 0.0003307730657979846, + "learning_rate": 0.0002, + "loss": 0.0006, + "step": 8300 + }, + { + "epoch": 0.015113247052030217, + "grad_norm": 0.7940683960914612, + "learning_rate": 0.0002, + "loss": 0.7233, + "step": 8310 + }, + { + "epoch": 0.015131433871587413, + "grad_norm": 0.09774448722600937, + "learning_rate": 0.0002, + "loss": 0.1451, + "step": 8320 + }, + { + "epoch": 0.015149620691144611, + "grad_norm": 0.3088306188583374, + "learning_rate": 0.0002, + "loss": 0.0986, + "step": 8330 + }, + { + "epoch": 0.015167807510701807, + "grad_norm": 0.08629265427589417, + "learning_rate": 0.0002, + "loss": 0.0581, + "step": 8340 + }, + { + "epoch": 0.015185994330259004, + "grad_norm": 0.0011582528240978718, + "learning_rate": 0.0002, + "loss": 0.0008, + "step": 8350 + }, + { + "epoch": 0.0152041811498162, + "grad_norm": 0.48978063464164734, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 8360 + }, + { + "epoch": 0.015222367969373396, + "grad_norm": 0.2633112668991089, + "learning_rate": 0.0002, + "loss": 0.1354, + "step": 8370 + }, + { + "epoch": 0.015240554788930593, + "grad_norm": 0.058184925466775894, + "learning_rate": 0.0002, + "loss": 0.0963, + "step": 8380 + }, + { + "epoch": 0.015258741608487789, + "grad_norm": 0.397290974855423, + "learning_rate": 0.0002, + "loss": 0.0915, + "step": 8390 + }, + { + "epoch": 0.015276928428044985, + "grad_norm": 0.0013334077084437013, + "learning_rate": 0.0002, + "loss": 0.0117, + "step": 8400 + }, + { + "epoch": 0.015295115247602182, + "grad_norm": 3.2027626037597656, + "learning_rate": 0.0002, + "loss": 0.3642, + "step": 8410 + }, + { + "epoch": 0.015313302067159378, + "grad_norm": 0.4110456705093384, + "learning_rate": 0.0002, + "loss": 0.1347, + "step": 8420 + }, + { + "epoch": 0.015331488886716574, + "grad_norm": 0.19789688289165497, + "learning_rate": 0.0002, + "loss": 0.0946, + "step": 8430 + }, + { + "epoch": 0.01534967570627377, + "grad_norm": 0.15914630889892578, + "learning_rate": 0.0002, + "loss": 0.0619, + "step": 8440 + }, + { + "epoch": 0.015367862525830967, + "grad_norm": 0.004021051339805126, + "learning_rate": 0.0002, + "loss": 0.0081, + "step": 8450 + }, + { + "epoch": 0.015386049345388163, + "grad_norm": 0.25250542163848877, + "learning_rate": 0.0002, + "loss": 0.2409, + "step": 8460 + }, + { + "epoch": 0.01540423616494536, + "grad_norm": 0.16660314798355103, + "learning_rate": 0.0002, + "loss": 0.1041, + "step": 8470 + }, + { + "epoch": 0.015422422984502556, + "grad_norm": 0.09435573220252991, + "learning_rate": 0.0002, + "loss": 0.0838, + "step": 8480 + }, + { + "epoch": 0.015440609804059752, + "grad_norm": 0.1622086614370346, + "learning_rate": 0.0002, + "loss": 0.0648, + "step": 8490 + }, + { + "epoch": 0.015458796623616948, + "grad_norm": 0.002267120871692896, + "learning_rate": 0.0002, + "loss": 0.0057, + "step": 8500 + }, + { + "epoch": 0.015476983443174146, + "grad_norm": 0.11559420824050903, + "learning_rate": 0.0002, + "loss": 0.2994, + "step": 8510 + }, + { + "epoch": 0.015495170262731343, + "grad_norm": 0.18291179835796356, + "learning_rate": 0.0002, + "loss": 0.0908, + "step": 8520 + }, + { + "epoch": 0.015513357082288539, + "grad_norm": 0.14989323914051056, + "learning_rate": 0.0002, + "loss": 0.0912, + "step": 8530 + }, + { + "epoch": 0.015531543901845735, + "grad_norm": 0.09752708673477173, + "learning_rate": 0.0002, + "loss": 0.0586, + "step": 8540 + }, + { + "epoch": 0.015549730721402932, + "grad_norm": 0.0005314307054504752, + "learning_rate": 0.0002, + "loss": 0.0026, + "step": 8550 + }, + { + "epoch": 0.015567917540960128, + "grad_norm": 0.18309178948402405, + "learning_rate": 0.0002, + "loss": 0.3059, + "step": 8560 + }, + { + "epoch": 0.015586104360517324, + "grad_norm": 0.8144251108169556, + "learning_rate": 0.0002, + "loss": 0.1103, + "step": 8570 + }, + { + "epoch": 0.01560429118007452, + "grad_norm": 0.0331404022872448, + "learning_rate": 0.0002, + "loss": 0.0898, + "step": 8580 + }, + { + "epoch": 0.015622477999631717, + "grad_norm": 0.1460132598876953, + "learning_rate": 0.0002, + "loss": 0.0656, + "step": 8590 + }, + { + "epoch": 0.015640664819188913, + "grad_norm": 0.013606027700006962, + "learning_rate": 0.0002, + "loss": 0.0076, + "step": 8600 + }, + { + "epoch": 0.01565885163874611, + "grad_norm": 0.22224061191082, + "learning_rate": 0.0002, + "loss": 0.2609, + "step": 8610 + }, + { + "epoch": 0.015677038458303306, + "grad_norm": 0.22729800641536713, + "learning_rate": 0.0002, + "loss": 0.1028, + "step": 8620 + }, + { + "epoch": 0.015695225277860502, + "grad_norm": 0.0848810002207756, + "learning_rate": 0.0002, + "loss": 0.0871, + "step": 8630 + }, + { + "epoch": 0.0157134120974177, + "grad_norm": 0.17896370589733124, + "learning_rate": 0.0002, + "loss": 0.0636, + "step": 8640 + }, + { + "epoch": 0.015731598916974895, + "grad_norm": 0.006263076793402433, + "learning_rate": 0.0002, + "loss": 0.0068, + "step": 8650 + }, + { + "epoch": 0.01574978573653209, + "grad_norm": 0.29927679896354675, + "learning_rate": 0.0002, + "loss": 0.2761, + "step": 8660 + }, + { + "epoch": 0.015767972556089287, + "grad_norm": 0.05662700906395912, + "learning_rate": 0.0002, + "loss": 0.1029, + "step": 8670 + }, + { + "epoch": 0.015786159375646484, + "grad_norm": 0.09140895307064056, + "learning_rate": 0.0002, + "loss": 0.0854, + "step": 8680 + }, + { + "epoch": 0.01580434619520368, + "grad_norm": 0.21034927666187286, + "learning_rate": 0.0002, + "loss": 0.0691, + "step": 8690 + }, + { + "epoch": 0.015822533014760876, + "grad_norm": 0.0010229075560346246, + "learning_rate": 0.0002, + "loss": 0.0033, + "step": 8700 + }, + { + "epoch": 0.015840719834318073, + "grad_norm": 0.0626237690448761, + "learning_rate": 0.0002, + "loss": 0.3583, + "step": 8710 + }, + { + "epoch": 0.01585890665387527, + "grad_norm": 0.10027278959751129, + "learning_rate": 0.0002, + "loss": 0.0959, + "step": 8720 + }, + { + "epoch": 0.015877093473432465, + "grad_norm": 0.0870286151766777, + "learning_rate": 0.0002, + "loss": 0.0767, + "step": 8730 + }, + { + "epoch": 0.01589528029298966, + "grad_norm": 0.16106969118118286, + "learning_rate": 0.0002, + "loss": 0.0763, + "step": 8740 + }, + { + "epoch": 0.015913467112546858, + "grad_norm": 0.0022529088892042637, + "learning_rate": 0.0002, + "loss": 0.0108, + "step": 8750 + }, + { + "epoch": 0.015931653932104058, + "grad_norm": 0.06070050224661827, + "learning_rate": 0.0002, + "loss": 0.2606, + "step": 8760 + }, + { + "epoch": 0.015949840751661254, + "grad_norm": 0.09406338632106781, + "learning_rate": 0.0002, + "loss": 0.1062, + "step": 8770 + }, + { + "epoch": 0.01596802757121845, + "grad_norm": 0.1367248147726059, + "learning_rate": 0.0002, + "loss": 0.0757, + "step": 8780 + }, + { + "epoch": 0.015986214390775647, + "grad_norm": 0.26938319206237793, + "learning_rate": 0.0002, + "loss": 0.0733, + "step": 8790 + }, + { + "epoch": 0.016004401210332843, + "grad_norm": 0.011559409089386463, + "learning_rate": 0.0002, + "loss": 0.0176, + "step": 8800 + }, + { + "epoch": 0.01602258802989004, + "grad_norm": 0.12351766228675842, + "learning_rate": 0.0002, + "loss": 0.24, + "step": 8810 + }, + { + "epoch": 0.016040774849447235, + "grad_norm": 0.08965809643268585, + "learning_rate": 0.0002, + "loss": 0.0947, + "step": 8820 + }, + { + "epoch": 0.016058961669004432, + "grad_norm": 0.027005961164832115, + "learning_rate": 0.0002, + "loss": 0.0738, + "step": 8830 + }, + { + "epoch": 0.016077148488561628, + "grad_norm": 0.18656685948371887, + "learning_rate": 0.0002, + "loss": 0.0667, + "step": 8840 + }, + { + "epoch": 0.016095335308118824, + "grad_norm": 0.003148626768961549, + "learning_rate": 0.0002, + "loss": 0.0119, + "step": 8850 + }, + { + "epoch": 0.01611352212767602, + "grad_norm": 0.07959452271461487, + "learning_rate": 0.0002, + "loss": 0.275, + "step": 8860 + }, + { + "epoch": 0.016131708947233217, + "grad_norm": 0.19433775544166565, + "learning_rate": 0.0002, + "loss": 0.0872, + "step": 8870 + }, + { + "epoch": 0.016149895766790413, + "grad_norm": 0.1376393735408783, + "learning_rate": 0.0002, + "loss": 0.0779, + "step": 8880 + }, + { + "epoch": 0.01616808258634761, + "grad_norm": 0.18282419443130493, + "learning_rate": 0.0002, + "loss": 0.068, + "step": 8890 + }, + { + "epoch": 0.016186269405904806, + "grad_norm": 0.0112565653398633, + "learning_rate": 0.0002, + "loss": 0.0137, + "step": 8900 + }, + { + "epoch": 0.016204456225462002, + "grad_norm": 0.08975637704133987, + "learning_rate": 0.0002, + "loss": 0.23, + "step": 8910 + }, + { + "epoch": 0.0162226430450192, + "grad_norm": 0.19316238164901733, + "learning_rate": 0.0002, + "loss": 0.0975, + "step": 8920 + }, + { + "epoch": 0.016240829864576395, + "grad_norm": 0.1870724856853485, + "learning_rate": 0.0002, + "loss": 0.0862, + "step": 8930 + }, + { + "epoch": 0.01625901668413359, + "grad_norm": 0.19031721353530884, + "learning_rate": 0.0002, + "loss": 0.0735, + "step": 8940 + }, + { + "epoch": 0.016277203503690788, + "grad_norm": 0.015979783609509468, + "learning_rate": 0.0002, + "loss": 0.0249, + "step": 8950 + }, + { + "epoch": 0.016295390323247984, + "grad_norm": 0.09105712175369263, + "learning_rate": 0.0002, + "loss": 0.1573, + "step": 8960 + }, + { + "epoch": 0.01631357714280518, + "grad_norm": 0.13035650551319122, + "learning_rate": 0.0002, + "loss": 0.0958, + "step": 8970 + }, + { + "epoch": 0.016331763962362376, + "grad_norm": 0.18613573908805847, + "learning_rate": 0.0002, + "loss": 0.088, + "step": 8980 + }, + { + "epoch": 0.016349950781919573, + "grad_norm": 0.2518664300441742, + "learning_rate": 0.0002, + "loss": 0.0725, + "step": 8990 + }, + { + "epoch": 0.01636813760147677, + "grad_norm": 0.03324449062347412, + "learning_rate": 0.0002, + "loss": 0.0256, + "step": 9000 + }, + { + "epoch": 0.016386324421033965, + "grad_norm": 0.08766523003578186, + "learning_rate": 0.0002, + "loss": 0.1531, + "step": 9010 + }, + { + "epoch": 0.01640451124059116, + "grad_norm": 0.14177583158016205, + "learning_rate": 0.0002, + "loss": 0.0861, + "step": 9020 + }, + { + "epoch": 0.016422698060148358, + "grad_norm": 0.1354762315750122, + "learning_rate": 0.0002, + "loss": 0.0782, + "step": 9030 + }, + { + "epoch": 0.016440884879705554, + "grad_norm": 0.15894347429275513, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 9040 + }, + { + "epoch": 0.01645907169926275, + "grad_norm": 0.02154761180281639, + "learning_rate": 0.0002, + "loss": 0.0156, + "step": 9050 + }, + { + "epoch": 0.016477258518819947, + "grad_norm": 0.06432317197322845, + "learning_rate": 0.0002, + "loss": 0.1384, + "step": 9060 + }, + { + "epoch": 0.016495445338377143, + "grad_norm": 0.12112505733966827, + "learning_rate": 0.0002, + "loss": 0.093, + "step": 9070 + }, + { + "epoch": 0.01651363215793434, + "grad_norm": 0.10628003627061844, + "learning_rate": 0.0002, + "loss": 0.0738, + "step": 9080 + }, + { + "epoch": 0.016531818977491536, + "grad_norm": 0.1930958330631256, + "learning_rate": 0.0002, + "loss": 0.0678, + "step": 9090 + }, + { + "epoch": 0.016550005797048732, + "grad_norm": 0.03878525644540787, + "learning_rate": 0.0002, + "loss": 0.0235, + "step": 9100 + }, + { + "epoch": 0.016568192616605932, + "grad_norm": 0.0920896977186203, + "learning_rate": 0.0002, + "loss": 0.1661, + "step": 9110 + }, + { + "epoch": 0.016586379436163128, + "grad_norm": 0.11687818914651871, + "learning_rate": 0.0002, + "loss": 0.0847, + "step": 9120 + }, + { + "epoch": 0.016604566255720325, + "grad_norm": 0.10511167347431183, + "learning_rate": 0.0002, + "loss": 0.0832, + "step": 9130 + }, + { + "epoch": 0.01662275307527752, + "grad_norm": 0.26365017890930176, + "learning_rate": 0.0002, + "loss": 0.0717, + "step": 9140 + }, + { + "epoch": 0.016640939894834717, + "grad_norm": 0.02445841394364834, + "learning_rate": 0.0002, + "loss": 0.0233, + "step": 9150 + }, + { + "epoch": 0.016659126714391913, + "grad_norm": 0.08213133364915848, + "learning_rate": 0.0002, + "loss": 0.1439, + "step": 9160 + }, + { + "epoch": 0.01667731353394911, + "grad_norm": 0.17025598883628845, + "learning_rate": 0.0002, + "loss": 0.0852, + "step": 9170 + }, + { + "epoch": 0.016695500353506306, + "grad_norm": 0.098059743642807, + "learning_rate": 0.0002, + "loss": 0.0761, + "step": 9180 + }, + { + "epoch": 0.016713687173063502, + "grad_norm": 0.18436011672019958, + "learning_rate": 0.0002, + "loss": 0.0674, + "step": 9190 + }, + { + "epoch": 0.0167318739926207, + "grad_norm": 0.011012010276317596, + "learning_rate": 0.0002, + "loss": 0.0221, + "step": 9200 + }, + { + "epoch": 0.016750060812177895, + "grad_norm": 0.07544030994176865, + "learning_rate": 0.0002, + "loss": 0.161, + "step": 9210 + }, + { + "epoch": 0.01676824763173509, + "grad_norm": 0.16041946411132812, + "learning_rate": 0.0002, + "loss": 0.0824, + "step": 9220 + }, + { + "epoch": 0.016786434451292288, + "grad_norm": 0.17295844852924347, + "learning_rate": 0.0002, + "loss": 0.0797, + "step": 9230 + }, + { + "epoch": 0.016804621270849484, + "grad_norm": 0.1818791776895523, + "learning_rate": 0.0002, + "loss": 0.0683, + "step": 9240 + }, + { + "epoch": 0.01682280809040668, + "grad_norm": 0.019515013322234154, + "learning_rate": 0.0002, + "loss": 0.0188, + "step": 9250 + }, + { + "epoch": 0.016840994909963877, + "grad_norm": 0.15059705078601837, + "learning_rate": 0.0002, + "loss": 0.1743, + "step": 9260 + }, + { + "epoch": 0.016859181729521073, + "grad_norm": 0.1481601595878601, + "learning_rate": 0.0002, + "loss": 0.0906, + "step": 9270 + }, + { + "epoch": 0.01687736854907827, + "grad_norm": 0.07433108985424042, + "learning_rate": 0.0002, + "loss": 0.08, + "step": 9280 + }, + { + "epoch": 0.016895555368635466, + "grad_norm": 0.1752692312002182, + "learning_rate": 0.0002, + "loss": 0.06, + "step": 9290 + }, + { + "epoch": 0.016913742188192662, + "grad_norm": 0.027612384408712387, + "learning_rate": 0.0002, + "loss": 0.0157, + "step": 9300 + }, + { + "epoch": 0.016931929007749858, + "grad_norm": 0.08575212955474854, + "learning_rate": 0.0002, + "loss": 0.1679, + "step": 9310 + }, + { + "epoch": 0.016950115827307054, + "grad_norm": 0.11127147823572159, + "learning_rate": 0.0002, + "loss": 0.0848, + "step": 9320 + }, + { + "epoch": 0.01696830264686425, + "grad_norm": 0.08989393711090088, + "learning_rate": 0.0002, + "loss": 0.0823, + "step": 9330 + }, + { + "epoch": 0.016986489466421447, + "grad_norm": 0.18898548185825348, + "learning_rate": 0.0002, + "loss": 0.0687, + "step": 9340 + }, + { + "epoch": 0.017004676285978643, + "grad_norm": 0.023646721616387367, + "learning_rate": 0.0002, + "loss": 0.0244, + "step": 9350 + }, + { + "epoch": 0.01702286310553584, + "grad_norm": 0.11511775106191635, + "learning_rate": 0.0002, + "loss": 0.1642, + "step": 9360 + }, + { + "epoch": 0.017041049925093036, + "grad_norm": 0.1458021104335785, + "learning_rate": 0.0002, + "loss": 0.084, + "step": 9370 + }, + { + "epoch": 0.017059236744650232, + "grad_norm": 0.060528095811605453, + "learning_rate": 0.0002, + "loss": 0.0809, + "step": 9380 + }, + { + "epoch": 0.01707742356420743, + "grad_norm": 0.16314280033111572, + "learning_rate": 0.0002, + "loss": 0.0661, + "step": 9390 + }, + { + "epoch": 0.017095610383764625, + "grad_norm": 0.03078557923436165, + "learning_rate": 0.0002, + "loss": 0.015, + "step": 9400 + }, + { + "epoch": 0.01711379720332182, + "grad_norm": 0.11488370597362518, + "learning_rate": 0.0002, + "loss": 0.1712, + "step": 9410 + }, + { + "epoch": 0.017131984022879018, + "grad_norm": 0.0972781702876091, + "learning_rate": 0.0002, + "loss": 0.0856, + "step": 9420 + }, + { + "epoch": 0.017150170842436214, + "grad_norm": 0.08523645251989365, + "learning_rate": 0.0002, + "loss": 0.0744, + "step": 9430 + }, + { + "epoch": 0.01716835766199341, + "grad_norm": 0.18629521131515503, + "learning_rate": 0.0002, + "loss": 0.0659, + "step": 9440 + }, + { + "epoch": 0.01718654448155061, + "grad_norm": 0.00908618327230215, + "learning_rate": 0.0002, + "loss": 0.0219, + "step": 9450 + }, + { + "epoch": 0.017204731301107806, + "grad_norm": 0.05552325397729874, + "learning_rate": 0.0002, + "loss": 0.1377, + "step": 9460 + }, + { + "epoch": 0.017222918120665003, + "grad_norm": 0.16133128106594086, + "learning_rate": 0.0002, + "loss": 0.0885, + "step": 9470 + }, + { + "epoch": 0.0172411049402222, + "grad_norm": 0.0965205654501915, + "learning_rate": 0.0002, + "loss": 0.0713, + "step": 9480 + }, + { + "epoch": 0.017259291759779395, + "grad_norm": 0.21675604581832886, + "learning_rate": 0.0002, + "loss": 0.0658, + "step": 9490 + }, + { + "epoch": 0.01727747857933659, + "grad_norm": 0.043898243457078934, + "learning_rate": 0.0002, + "loss": 0.0213, + "step": 9500 + }, + { + "epoch": 0.017295665398893788, + "grad_norm": 0.0968618243932724, + "learning_rate": 0.0002, + "loss": 0.1391, + "step": 9510 + }, + { + "epoch": 0.017313852218450984, + "grad_norm": 0.15061378479003906, + "learning_rate": 0.0002, + "loss": 0.0879, + "step": 9520 + }, + { + "epoch": 0.01733203903800818, + "grad_norm": 0.08481590449810028, + "learning_rate": 0.0002, + "loss": 0.0771, + "step": 9530 + }, + { + "epoch": 0.017350225857565377, + "grad_norm": 0.20935995876789093, + "learning_rate": 0.0002, + "loss": 0.0705, + "step": 9540 + }, + { + "epoch": 0.017368412677122573, + "grad_norm": 0.04010302573442459, + "learning_rate": 0.0002, + "loss": 0.0257, + "step": 9550 + }, + { + "epoch": 0.01738659949667977, + "grad_norm": 0.10532956570386887, + "learning_rate": 0.0002, + "loss": 0.1528, + "step": 9560 + }, + { + "epoch": 0.017404786316236966, + "grad_norm": 0.1484638750553131, + "learning_rate": 0.0002, + "loss": 0.0847, + "step": 9570 + }, + { + "epoch": 0.017422973135794162, + "grad_norm": 0.05873465910553932, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 9580 + }, + { + "epoch": 0.01744115995535136, + "grad_norm": 0.1689092516899109, + "learning_rate": 0.0002, + "loss": 0.0673, + "step": 9590 + }, + { + "epoch": 0.017459346774908555, + "grad_norm": 0.014237391762435436, + "learning_rate": 0.0002, + "loss": 0.0165, + "step": 9600 + }, + { + "epoch": 0.01747753359446575, + "grad_norm": 0.06250491738319397, + "learning_rate": 0.0002, + "loss": 0.1635, + "step": 9610 + }, + { + "epoch": 0.017495720414022947, + "grad_norm": 0.08895017951726913, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 9620 + }, + { + "epoch": 0.017513907233580144, + "grad_norm": 0.08614445477724075, + "learning_rate": 0.0002, + "loss": 0.0852, + "step": 9630 + }, + { + "epoch": 0.01753209405313734, + "grad_norm": 0.25440698862075806, + "learning_rate": 0.0002, + "loss": 0.0735, + "step": 9640 + }, + { + "epoch": 0.017550280872694536, + "grad_norm": 0.015447271056473255, + "learning_rate": 0.0002, + "loss": 0.0199, + "step": 9650 + }, + { + "epoch": 0.017568467692251732, + "grad_norm": 0.08685171604156494, + "learning_rate": 0.0002, + "loss": 0.1721, + "step": 9660 + }, + { + "epoch": 0.01758665451180893, + "grad_norm": 0.1007658839225769, + "learning_rate": 0.0002, + "loss": 0.0858, + "step": 9670 + }, + { + "epoch": 0.017604841331366125, + "grad_norm": 0.1291055977344513, + "learning_rate": 0.0002, + "loss": 0.0817, + "step": 9680 + }, + { + "epoch": 0.01762302815092332, + "grad_norm": 0.21103522181510925, + "learning_rate": 0.0002, + "loss": 0.0707, + "step": 9690 + }, + { + "epoch": 0.017641214970480518, + "grad_norm": 0.027955593541264534, + "learning_rate": 0.0002, + "loss": 0.0199, + "step": 9700 + }, + { + "epoch": 0.017659401790037714, + "grad_norm": 0.06710019707679749, + "learning_rate": 0.0002, + "loss": 0.1623, + "step": 9710 + }, + { + "epoch": 0.01767758860959491, + "grad_norm": 0.09083720296621323, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 9720 + }, + { + "epoch": 0.017695775429152107, + "grad_norm": 0.07230041921138763, + "learning_rate": 0.0002, + "loss": 0.0767, + "step": 9730 + }, + { + "epoch": 0.017713962248709303, + "grad_norm": 0.19016912579536438, + "learning_rate": 0.0002, + "loss": 0.0648, + "step": 9740 + }, + { + "epoch": 0.0177321490682665, + "grad_norm": 0.03999534249305725, + "learning_rate": 0.0002, + "loss": 0.0216, + "step": 9750 + }, + { + "epoch": 0.017750335887823696, + "grad_norm": 0.08057496696710587, + "learning_rate": 0.0002, + "loss": 0.1251, + "step": 9760 + }, + { + "epoch": 0.017768522707380892, + "grad_norm": 0.16494789719581604, + "learning_rate": 0.0002, + "loss": 0.0896, + "step": 9770 + }, + { + "epoch": 0.017786709526938088, + "grad_norm": 0.07119818776845932, + "learning_rate": 0.0002, + "loss": 0.0749, + "step": 9780 + }, + { + "epoch": 0.017804896346495288, + "grad_norm": 0.1790028065443039, + "learning_rate": 0.0002, + "loss": 0.0718, + "step": 9790 + }, + { + "epoch": 0.017823083166052484, + "grad_norm": 0.055643875151872635, + "learning_rate": 0.0002, + "loss": 0.0294, + "step": 9800 + }, + { + "epoch": 0.01784126998560968, + "grad_norm": 0.15530900657176971, + "learning_rate": 0.0002, + "loss": 0.1343, + "step": 9810 + }, + { + "epoch": 0.017859456805166877, + "grad_norm": 0.08989892899990082, + "learning_rate": 0.0002, + "loss": 0.0778, + "step": 9820 + }, + { + "epoch": 0.017877643624724073, + "grad_norm": 0.038054581731557846, + "learning_rate": 0.0002, + "loss": 0.0842, + "step": 9830 + }, + { + "epoch": 0.01789583044428127, + "grad_norm": 0.12264154851436615, + "learning_rate": 0.0002, + "loss": 0.065, + "step": 9840 + }, + { + "epoch": 0.017914017263838466, + "grad_norm": 0.03432893753051758, + "learning_rate": 0.0002, + "loss": 0.02, + "step": 9850 + }, + { + "epoch": 0.017932204083395662, + "grad_norm": 0.0516468770802021, + "learning_rate": 0.0002, + "loss": 0.1339, + "step": 9860 + }, + { + "epoch": 0.01795039090295286, + "grad_norm": 0.11306226998567581, + "learning_rate": 0.0002, + "loss": 0.0842, + "step": 9870 + }, + { + "epoch": 0.017968577722510055, + "grad_norm": 0.051579318940639496, + "learning_rate": 0.0002, + "loss": 0.0795, + "step": 9880 + }, + { + "epoch": 0.01798676454206725, + "grad_norm": 0.19050930440425873, + "learning_rate": 0.0002, + "loss": 0.0673, + "step": 9890 + }, + { + "epoch": 0.018004951361624447, + "grad_norm": 0.015286738984286785, + "learning_rate": 0.0002, + "loss": 0.0169, + "step": 9900 + }, + { + "epoch": 0.018023138181181644, + "grad_norm": 0.16055025160312653, + "learning_rate": 0.0002, + "loss": 0.1655, + "step": 9910 + }, + { + "epoch": 0.01804132500073884, + "grad_norm": 0.05445674806833267, + "learning_rate": 0.0002, + "loss": 0.0786, + "step": 9920 + }, + { + "epoch": 0.018059511820296036, + "grad_norm": 0.07221481204032898, + "learning_rate": 0.0002, + "loss": 0.0726, + "step": 9930 + }, + { + "epoch": 0.018077698639853233, + "grad_norm": 0.15800146758556366, + "learning_rate": 0.0002, + "loss": 0.0607, + "step": 9940 + }, + { + "epoch": 0.01809588545941043, + "grad_norm": 0.007713336031883955, + "learning_rate": 0.0002, + "loss": 0.0148, + "step": 9950 + }, + { + "epoch": 0.018114072278967625, + "grad_norm": 0.04677269607782364, + "learning_rate": 0.0002, + "loss": 0.1718, + "step": 9960 + }, + { + "epoch": 0.01813225909852482, + "grad_norm": 0.1699189841747284, + "learning_rate": 0.0002, + "loss": 0.0865, + "step": 9970 + }, + { + "epoch": 0.018150445918082018, + "grad_norm": 0.04046279937028885, + "learning_rate": 0.0002, + "loss": 0.0781, + "step": 9980 + }, + { + "epoch": 0.018168632737639214, + "grad_norm": 0.164504736661911, + "learning_rate": 0.0002, + "loss": 0.0645, + "step": 9990 + }, + { + "epoch": 0.01818681955719641, + "grad_norm": 0.014479747042059898, + "learning_rate": 0.0002, + "loss": 0.0186, + "step": 10000 + }, + { + "epoch": 0.018205006376753607, + "grad_norm": 0.051388438791036606, + "learning_rate": 0.0002, + "loss": 0.1414, + "step": 10010 + }, + { + "epoch": 0.018223193196310803, + "grad_norm": 0.11734543740749359, + "learning_rate": 0.0002, + "loss": 0.0894, + "step": 10020 + }, + { + "epoch": 0.018241380015868, + "grad_norm": 0.022312749177217484, + "learning_rate": 0.0002, + "loss": 0.0775, + "step": 10030 + }, + { + "epoch": 0.018259566835425196, + "grad_norm": 0.1579144448041916, + "learning_rate": 0.0002, + "loss": 0.0668, + "step": 10040 + }, + { + "epoch": 0.018277753654982392, + "grad_norm": 0.02757895737886429, + "learning_rate": 0.0002, + "loss": 0.0197, + "step": 10050 + }, + { + "epoch": 0.01829594047453959, + "grad_norm": 0.07557844370603561, + "learning_rate": 0.0002, + "loss": 0.1526, + "step": 10060 + }, + { + "epoch": 0.018314127294096785, + "grad_norm": 0.1216227188706398, + "learning_rate": 0.0002, + "loss": 0.0871, + "step": 10070 + }, + { + "epoch": 0.01833231411365398, + "grad_norm": 0.04201141744852066, + "learning_rate": 0.0002, + "loss": 0.0723, + "step": 10080 + }, + { + "epoch": 0.018350500933211177, + "grad_norm": 0.151902437210083, + "learning_rate": 0.0002, + "loss": 0.063, + "step": 10090 + }, + { + "epoch": 0.018368687752768374, + "grad_norm": 0.028730260208249092, + "learning_rate": 0.0002, + "loss": 0.0154, + "step": 10100 + }, + { + "epoch": 0.01838687457232557, + "grad_norm": 0.0815989300608635, + "learning_rate": 0.0002, + "loss": 0.1439, + "step": 10110 + }, + { + "epoch": 0.018405061391882766, + "grad_norm": 0.16359028220176697, + "learning_rate": 0.0002, + "loss": 0.0901, + "step": 10120 + }, + { + "epoch": 0.018423248211439962, + "grad_norm": 0.055030226707458496, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 10130 + }, + { + "epoch": 0.018441435030997162, + "grad_norm": 0.17064853012561798, + "learning_rate": 0.0002, + "loss": 0.0713, + "step": 10140 + }, + { + "epoch": 0.01845962185055436, + "grad_norm": 0.024902408942580223, + "learning_rate": 0.0002, + "loss": 0.0201, + "step": 10150 + }, + { + "epoch": 0.018477808670111555, + "grad_norm": 0.037377748638391495, + "learning_rate": 0.0002, + "loss": 0.1394, + "step": 10160 + }, + { + "epoch": 0.01849599548966875, + "grad_norm": 0.14072410762310028, + "learning_rate": 0.0002, + "loss": 0.088, + "step": 10170 + }, + { + "epoch": 0.018514182309225947, + "grad_norm": 0.07339414954185486, + "learning_rate": 0.0002, + "loss": 0.0739, + "step": 10180 + }, + { + "epoch": 0.018532369128783144, + "grad_norm": 0.166766956448555, + "learning_rate": 0.0002, + "loss": 0.0648, + "step": 10190 + }, + { + "epoch": 0.01855055594834034, + "grad_norm": 0.009605699218809605, + "learning_rate": 0.0002, + "loss": 0.0148, + "step": 10200 + }, + { + "epoch": 0.018568742767897536, + "grad_norm": 0.045747216790914536, + "learning_rate": 0.0002, + "loss": 0.1426, + "step": 10210 + }, + { + "epoch": 0.018586929587454733, + "grad_norm": 0.09927495568990707, + "learning_rate": 0.0002, + "loss": 0.0757, + "step": 10220 + }, + { + "epoch": 0.01860511640701193, + "grad_norm": 0.032050736248493195, + "learning_rate": 0.0002, + "loss": 0.0732, + "step": 10230 + }, + { + "epoch": 0.018623303226569125, + "grad_norm": 0.14915086328983307, + "learning_rate": 0.0002, + "loss": 0.0619, + "step": 10240 + }, + { + "epoch": 0.01864149004612632, + "grad_norm": 0.019674960523843765, + "learning_rate": 0.0002, + "loss": 0.0176, + "step": 10250 + }, + { + "epoch": 0.018659676865683518, + "grad_norm": 0.0990150198340416, + "learning_rate": 0.0002, + "loss": 0.156, + "step": 10260 + }, + { + "epoch": 0.018677863685240714, + "grad_norm": 0.1409665048122406, + "learning_rate": 0.0002, + "loss": 0.0843, + "step": 10270 + }, + { + "epoch": 0.01869605050479791, + "grad_norm": 0.0232121329754591, + "learning_rate": 0.0002, + "loss": 0.0712, + "step": 10280 + }, + { + "epoch": 0.018714237324355107, + "grad_norm": 0.14811532199382782, + "learning_rate": 0.0002, + "loss": 0.0649, + "step": 10290 + }, + { + "epoch": 0.018732424143912303, + "grad_norm": 0.025812385603785515, + "learning_rate": 0.0002, + "loss": 0.0191, + "step": 10300 + }, + { + "epoch": 0.0187506109634695, + "grad_norm": 0.03710811957716942, + "learning_rate": 0.0002, + "loss": 0.1323, + "step": 10310 + }, + { + "epoch": 0.018768797783026696, + "grad_norm": 0.16586032509803772, + "learning_rate": 0.0002, + "loss": 0.0868, + "step": 10320 + }, + { + "epoch": 0.018786984602583892, + "grad_norm": 0.09154761582612991, + "learning_rate": 0.0002, + "loss": 0.0762, + "step": 10330 + }, + { + "epoch": 0.01880517142214109, + "grad_norm": 0.20400644838809967, + "learning_rate": 0.0002, + "loss": 0.072, + "step": 10340 + }, + { + "epoch": 0.018823358241698285, + "grad_norm": 0.04426256939768791, + "learning_rate": 0.0002, + "loss": 0.0281, + "step": 10350 + }, + { + "epoch": 0.01884154506125548, + "grad_norm": 0.10118848830461502, + "learning_rate": 0.0002, + "loss": 0.116, + "step": 10360 + }, + { + "epoch": 0.018859731880812677, + "grad_norm": 0.11934473365545273, + "learning_rate": 0.0002, + "loss": 0.0779, + "step": 10370 + }, + { + "epoch": 0.018877918700369874, + "grad_norm": 0.04116957262158394, + "learning_rate": 0.0002, + "loss": 0.0811, + "step": 10380 + }, + { + "epoch": 0.01889610551992707, + "grad_norm": 0.16668827831745148, + "learning_rate": 0.0002, + "loss": 0.064, + "step": 10390 + }, + { + "epoch": 0.018914292339484266, + "grad_norm": 0.04703928530216217, + "learning_rate": 0.0002, + "loss": 0.0189, + "step": 10400 + }, + { + "epoch": 0.018932479159041463, + "grad_norm": 0.10670439153909683, + "learning_rate": 0.0002, + "loss": 0.1329, + "step": 10410 + }, + { + "epoch": 0.01895066597859866, + "grad_norm": 0.033486492931842804, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 10420 + }, + { + "epoch": 0.018968852798155855, + "grad_norm": 0.03778929263353348, + "learning_rate": 0.0002, + "loss": 0.0739, + "step": 10430 + }, + { + "epoch": 0.01898703961771305, + "grad_norm": 0.1499231606721878, + "learning_rate": 0.0002, + "loss": 0.0691, + "step": 10440 + }, + { + "epoch": 0.019005226437270248, + "grad_norm": 0.020496509969234467, + "learning_rate": 0.0002, + "loss": 0.0166, + "step": 10450 + }, + { + "epoch": 0.019023413256827444, + "grad_norm": 0.07973606884479523, + "learning_rate": 0.0002, + "loss": 0.1647, + "step": 10460 + }, + { + "epoch": 0.01904160007638464, + "grad_norm": 0.2187214344739914, + "learning_rate": 0.0002, + "loss": 0.0851, + "step": 10470 + }, + { + "epoch": 0.01905978689594184, + "grad_norm": 0.05374719575047493, + "learning_rate": 0.0002, + "loss": 0.0763, + "step": 10480 + }, + { + "epoch": 0.019077973715499037, + "grad_norm": 0.20388802886009216, + "learning_rate": 0.0002, + "loss": 0.0661, + "step": 10490 + }, + { + "epoch": 0.019096160535056233, + "grad_norm": 0.023114027455449104, + "learning_rate": 0.0002, + "loss": 0.0196, + "step": 10500 + }, + { + "epoch": 0.01911434735461343, + "grad_norm": 0.07263924926519394, + "learning_rate": 0.0002, + "loss": 0.1397, + "step": 10510 + }, + { + "epoch": 0.019132534174170625, + "grad_norm": 0.13590484857559204, + "learning_rate": 0.0002, + "loss": 0.0866, + "step": 10520 + }, + { + "epoch": 0.019150720993727822, + "grad_norm": 0.03279007971286774, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 10530 + }, + { + "epoch": 0.019168907813285018, + "grad_norm": 0.16929341852664948, + "learning_rate": 0.0002, + "loss": 0.0638, + "step": 10540 + }, + { + "epoch": 0.019187094632842214, + "grad_norm": 0.043504901230335236, + "learning_rate": 0.0002, + "loss": 0.0217, + "step": 10550 + }, + { + "epoch": 0.01920528145239941, + "grad_norm": 0.05582214519381523, + "learning_rate": 0.0002, + "loss": 0.1454, + "step": 10560 + }, + { + "epoch": 0.019223468271956607, + "grad_norm": 0.12112174928188324, + "learning_rate": 0.0002, + "loss": 0.0773, + "step": 10570 + }, + { + "epoch": 0.019241655091513803, + "grad_norm": 0.028584860265254974, + "learning_rate": 0.0002, + "loss": 0.0766, + "step": 10580 + }, + { + "epoch": 0.019259841911071, + "grad_norm": 0.14817841351032257, + "learning_rate": 0.0002, + "loss": 0.07, + "step": 10590 + }, + { + "epoch": 0.019278028730628196, + "grad_norm": 0.0354049950838089, + "learning_rate": 0.0002, + "loss": 0.0205, + "step": 10600 + }, + { + "epoch": 0.019296215550185392, + "grad_norm": 0.0580359622836113, + "learning_rate": 0.0002, + "loss": 0.126, + "step": 10610 + }, + { + "epoch": 0.01931440236974259, + "grad_norm": 0.1495518982410431, + "learning_rate": 0.0002, + "loss": 0.0759, + "step": 10620 + }, + { + "epoch": 0.019332589189299785, + "grad_norm": 0.029057197272777557, + "learning_rate": 0.0002, + "loss": 0.0751, + "step": 10630 + }, + { + "epoch": 0.01935077600885698, + "grad_norm": 0.17057828605175018, + "learning_rate": 0.0002, + "loss": 0.0675, + "step": 10640 + }, + { + "epoch": 0.019368962828414178, + "grad_norm": 0.029123524203896523, + "learning_rate": 0.0002, + "loss": 0.0207, + "step": 10650 + }, + { + "epoch": 0.019387149647971374, + "grad_norm": 0.06929099559783936, + "learning_rate": 0.0002, + "loss": 0.1272, + "step": 10660 + }, + { + "epoch": 0.01940533646752857, + "grad_norm": 0.0806749165058136, + "learning_rate": 0.0002, + "loss": 0.0825, + "step": 10670 + }, + { + "epoch": 0.019423523287085766, + "grad_norm": 0.025454839691519737, + "learning_rate": 0.0002, + "loss": 0.0786, + "step": 10680 + }, + { + "epoch": 0.019441710106642963, + "grad_norm": 0.1879327893257141, + "learning_rate": 0.0002, + "loss": 0.0664, + "step": 10690 + }, + { + "epoch": 0.01945989692620016, + "grad_norm": 0.03334587439894676, + "learning_rate": 0.0002, + "loss": 0.0217, + "step": 10700 + }, + { + "epoch": 0.019478083745757355, + "grad_norm": 0.05760979652404785, + "learning_rate": 0.0002, + "loss": 0.141, + "step": 10710 + }, + { + "epoch": 0.01949627056531455, + "grad_norm": 0.03565089777112007, + "learning_rate": 0.0002, + "loss": 0.0849, + "step": 10720 + }, + { + "epoch": 0.019514457384871748, + "grad_norm": 0.1484966278076172, + "learning_rate": 0.0002, + "loss": 0.0839, + "step": 10730 + }, + { + "epoch": 0.019532644204428944, + "grad_norm": 0.22200991213321686, + "learning_rate": 0.0002, + "loss": 0.0673, + "step": 10740 + }, + { + "epoch": 0.01955083102398614, + "grad_norm": 0.017915472388267517, + "learning_rate": 0.0002, + "loss": 0.0203, + "step": 10750 + }, + { + "epoch": 0.019569017843543337, + "grad_norm": 0.11213338375091553, + "learning_rate": 0.0002, + "loss": 0.126, + "step": 10760 + }, + { + "epoch": 0.019587204663100533, + "grad_norm": 0.1563912183046341, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 10770 + }, + { + "epoch": 0.01960539148265773, + "grad_norm": 0.02315036952495575, + "learning_rate": 0.0002, + "loss": 0.0801, + "step": 10780 + }, + { + "epoch": 0.019623578302214926, + "grad_norm": 0.14482071995735168, + "learning_rate": 0.0002, + "loss": 0.0701, + "step": 10790 + }, + { + "epoch": 0.019641765121772122, + "grad_norm": 0.0369495190680027, + "learning_rate": 0.0002, + "loss": 0.022, + "step": 10800 + }, + { + "epoch": 0.01965995194132932, + "grad_norm": 0.0659516304731369, + "learning_rate": 0.0002, + "loss": 0.1282, + "step": 10810 + }, + { + "epoch": 0.019678138760886515, + "grad_norm": 0.09046377241611481, + "learning_rate": 0.0002, + "loss": 0.0748, + "step": 10820 + }, + { + "epoch": 0.019696325580443715, + "grad_norm": 0.05669049918651581, + "learning_rate": 0.0002, + "loss": 0.0808, + "step": 10830 + }, + { + "epoch": 0.01971451240000091, + "grad_norm": 0.16696439683437347, + "learning_rate": 0.0002, + "loss": 0.0696, + "step": 10840 + }, + { + "epoch": 0.019732699219558107, + "grad_norm": 0.02596648782491684, + "learning_rate": 0.0002, + "loss": 0.0189, + "step": 10850 + }, + { + "epoch": 0.019750886039115303, + "grad_norm": 0.030568787828087807, + "learning_rate": 0.0002, + "loss": 0.1431, + "step": 10860 + }, + { + "epoch": 0.0197690728586725, + "grad_norm": 0.11519906669855118, + "learning_rate": 0.0002, + "loss": 0.0832, + "step": 10870 + }, + { + "epoch": 0.019787259678229696, + "grad_norm": 0.12018325924873352, + "learning_rate": 0.0002, + "loss": 0.0769, + "step": 10880 + }, + { + "epoch": 0.019805446497786892, + "grad_norm": 0.15875691175460815, + "learning_rate": 0.0002, + "loss": 0.0679, + "step": 10890 + }, + { + "epoch": 0.01982363331734409, + "grad_norm": 0.02812560275197029, + "learning_rate": 0.0002, + "loss": 0.0236, + "step": 10900 + }, + { + "epoch": 0.019841820136901285, + "grad_norm": 0.039342913776636124, + "learning_rate": 0.0002, + "loss": 0.1433, + "step": 10910 + }, + { + "epoch": 0.01986000695645848, + "grad_norm": 0.1218978762626648, + "learning_rate": 0.0002, + "loss": 0.0785, + "step": 10920 + }, + { + "epoch": 0.019878193776015678, + "grad_norm": 0.02437124028801918, + "learning_rate": 0.0002, + "loss": 0.081, + "step": 10930 + }, + { + "epoch": 0.019896380595572874, + "grad_norm": 0.16295987367630005, + "learning_rate": 0.0002, + "loss": 0.0615, + "step": 10940 + }, + { + "epoch": 0.01991456741513007, + "grad_norm": 0.03147517144680023, + "learning_rate": 0.0002, + "loss": 0.0167, + "step": 10950 + }, + { + "epoch": 0.019932754234687267, + "grad_norm": 0.051139310002326965, + "learning_rate": 0.0002, + "loss": 0.1486, + "step": 10960 + }, + { + "epoch": 0.019950941054244463, + "grad_norm": 0.10385333746671677, + "learning_rate": 0.0002, + "loss": 0.0835, + "step": 10970 + }, + { + "epoch": 0.01996912787380166, + "grad_norm": 0.029570510610938072, + "learning_rate": 0.0002, + "loss": 0.0808, + "step": 10980 + }, + { + "epoch": 0.019987314693358856, + "grad_norm": 0.1457994282245636, + "learning_rate": 0.0002, + "loss": 0.062, + "step": 10990 + }, + { + "epoch": 0.020005501512916052, + "grad_norm": 0.013582763262093067, + "learning_rate": 0.0002, + "loss": 0.0149, + "step": 11000 + }, + { + "epoch": 0.020023688332473248, + "grad_norm": 0.13736847043037415, + "learning_rate": 0.0002, + "loss": 0.164, + "step": 11010 + }, + { + "epoch": 0.020041875152030444, + "grad_norm": 0.146778866648674, + "learning_rate": 0.0002, + "loss": 0.0865, + "step": 11020 + }, + { + "epoch": 0.02006006197158764, + "grad_norm": 0.09848106652498245, + "learning_rate": 0.0002, + "loss": 0.0785, + "step": 11030 + }, + { + "epoch": 0.020078248791144837, + "grad_norm": 0.19981160759925842, + "learning_rate": 0.0002, + "loss": 0.0738, + "step": 11040 + }, + { + "epoch": 0.020096435610702033, + "grad_norm": 0.0248726736754179, + "learning_rate": 0.0002, + "loss": 0.0232, + "step": 11050 + }, + { + "epoch": 0.02011462243025923, + "grad_norm": 0.09688897430896759, + "learning_rate": 0.0002, + "loss": 0.1361, + "step": 11060 + }, + { + "epoch": 0.020132809249816426, + "grad_norm": 0.09953918308019638, + "learning_rate": 0.0002, + "loss": 0.0815, + "step": 11070 + }, + { + "epoch": 0.020150996069373622, + "grad_norm": 0.05801590532064438, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 11080 + }, + { + "epoch": 0.02016918288893082, + "grad_norm": 0.2029600441455841, + "learning_rate": 0.0002, + "loss": 0.0661, + "step": 11090 + }, + { + "epoch": 0.020187369708488015, + "grad_norm": 0.026677627116441727, + "learning_rate": 0.0002, + "loss": 0.022, + "step": 11100 + }, + { + "epoch": 0.02020555652804521, + "grad_norm": 0.054907217621803284, + "learning_rate": 0.0002, + "loss": 0.1356, + "step": 11110 + }, + { + "epoch": 0.020223743347602408, + "grad_norm": 0.16302120685577393, + "learning_rate": 0.0002, + "loss": 0.0721, + "step": 11120 + }, + { + "epoch": 0.020241930167159604, + "grad_norm": 0.03393812105059624, + "learning_rate": 0.0002, + "loss": 0.0733, + "step": 11130 + }, + { + "epoch": 0.0202601169867168, + "grad_norm": 0.16455304622650146, + "learning_rate": 0.0002, + "loss": 0.062, + "step": 11140 + }, + { + "epoch": 0.020278303806273996, + "grad_norm": 0.026239484548568726, + "learning_rate": 0.0002, + "loss": 0.0148, + "step": 11150 + }, + { + "epoch": 0.020296490625831193, + "grad_norm": 0.10048040002584457, + "learning_rate": 0.0002, + "loss": 0.1398, + "step": 11160 + }, + { + "epoch": 0.020314677445388393, + "grad_norm": 0.14221400022506714, + "learning_rate": 0.0002, + "loss": 0.0782, + "step": 11170 + }, + { + "epoch": 0.02033286426494559, + "grad_norm": 0.08432412147521973, + "learning_rate": 0.0002, + "loss": 0.0807, + "step": 11180 + }, + { + "epoch": 0.020351051084502785, + "grad_norm": 0.172295480966568, + "learning_rate": 0.0002, + "loss": 0.0655, + "step": 11190 + }, + { + "epoch": 0.02036923790405998, + "grad_norm": 0.023976756259799004, + "learning_rate": 0.0002, + "loss": 0.0218, + "step": 11200 + }, + { + "epoch": 0.020387424723617178, + "grad_norm": 0.03286349028348923, + "learning_rate": 0.0002, + "loss": 0.1441, + "step": 11210 + }, + { + "epoch": 0.020405611543174374, + "grad_norm": 0.04403531551361084, + "learning_rate": 0.0002, + "loss": 0.0825, + "step": 11220 + }, + { + "epoch": 0.02042379836273157, + "grad_norm": 0.0398452989757061, + "learning_rate": 0.0002, + "loss": 0.0755, + "step": 11230 + }, + { + "epoch": 0.020441985182288767, + "grad_norm": 0.15185104310512543, + "learning_rate": 0.0002, + "loss": 0.0591, + "step": 11240 + }, + { + "epoch": 0.020460172001845963, + "grad_norm": 0.005839187186211348, + "learning_rate": 0.0002, + "loss": 0.0172, + "step": 11250 + }, + { + "epoch": 0.02047835882140316, + "grad_norm": 0.031195368617773056, + "learning_rate": 0.0002, + "loss": 0.1594, + "step": 11260 + }, + { + "epoch": 0.020496545640960356, + "grad_norm": 0.1997426599264145, + "learning_rate": 0.0002, + "loss": 0.0797, + "step": 11270 + }, + { + "epoch": 0.020514732460517552, + "grad_norm": 0.03075752593576908, + "learning_rate": 0.0002, + "loss": 0.0798, + "step": 11280 + }, + { + "epoch": 0.02053291928007475, + "grad_norm": 0.17717675864696503, + "learning_rate": 0.0002, + "loss": 0.0654, + "step": 11290 + }, + { + "epoch": 0.020551106099631945, + "grad_norm": 0.036260057240724564, + "learning_rate": 0.0002, + "loss": 0.0196, + "step": 11300 + }, + { + "epoch": 0.02056929291918914, + "grad_norm": 0.11961262673139572, + "learning_rate": 0.0002, + "loss": 0.1313, + "step": 11310 + }, + { + "epoch": 0.020587479738746337, + "grad_norm": 0.12344212830066681, + "learning_rate": 0.0002, + "loss": 0.0798, + "step": 11320 + }, + { + "epoch": 0.020605666558303534, + "grad_norm": 0.12796273827552795, + "learning_rate": 0.0002, + "loss": 0.0693, + "step": 11330 + }, + { + "epoch": 0.02062385337786073, + "grad_norm": 0.12038332223892212, + "learning_rate": 0.0002, + "loss": 0.0637, + "step": 11340 + }, + { + "epoch": 0.020642040197417926, + "grad_norm": 0.013724497519433498, + "learning_rate": 0.0002, + "loss": 0.0134, + "step": 11350 + }, + { + "epoch": 0.020660227016975122, + "grad_norm": 0.030014917254447937, + "learning_rate": 0.0002, + "loss": 0.1355, + "step": 11360 + }, + { + "epoch": 0.02067841383653232, + "grad_norm": 0.05455614998936653, + "learning_rate": 0.0002, + "loss": 0.0824, + "step": 11370 + }, + { + "epoch": 0.020696600656089515, + "grad_norm": 0.09036605060100555, + "learning_rate": 0.0002, + "loss": 0.0712, + "step": 11380 + }, + { + "epoch": 0.02071478747564671, + "grad_norm": 0.15607796609401703, + "learning_rate": 0.0002, + "loss": 0.0613, + "step": 11390 + }, + { + "epoch": 0.020732974295203908, + "grad_norm": 0.029900453984737396, + "learning_rate": 0.0002, + "loss": 0.0216, + "step": 11400 + }, + { + "epoch": 0.020751161114761104, + "grad_norm": 0.06108042970299721, + "learning_rate": 0.0002, + "loss": 0.1223, + "step": 11410 + }, + { + "epoch": 0.0207693479343183, + "grad_norm": 0.052377600222826004, + "learning_rate": 0.0002, + "loss": 0.0795, + "step": 11420 + }, + { + "epoch": 0.020787534753875497, + "grad_norm": 0.063735231757164, + "learning_rate": 0.0002, + "loss": 0.0746, + "step": 11430 + }, + { + "epoch": 0.020805721573432693, + "grad_norm": 0.16977328062057495, + "learning_rate": 0.0002, + "loss": 0.0634, + "step": 11440 + }, + { + "epoch": 0.02082390839298989, + "grad_norm": 0.04451785981655121, + "learning_rate": 0.0002, + "loss": 0.0298, + "step": 11450 + }, + { + "epoch": 0.020842095212547086, + "grad_norm": 1.1584863662719727, + "learning_rate": 0.0002, + "loss": 0.1133, + "step": 11460 + }, + { + "epoch": 0.020860282032104282, + "grad_norm": 0.09867832064628601, + "learning_rate": 0.0002, + "loss": 0.0774, + "step": 11470 + }, + { + "epoch": 0.020878468851661478, + "grad_norm": 0.05493566766381264, + "learning_rate": 0.0002, + "loss": 0.0752, + "step": 11480 + }, + { + "epoch": 0.020896655671218674, + "grad_norm": 0.2149093896150589, + "learning_rate": 0.0002, + "loss": 0.0682, + "step": 11490 + }, + { + "epoch": 0.02091484249077587, + "grad_norm": 0.02243107184767723, + "learning_rate": 0.0002, + "loss": 0.0191, + "step": 11500 + }, + { + "epoch": 0.02093302931033307, + "grad_norm": 0.27817150950431824, + "learning_rate": 0.0002, + "loss": 0.1658, + "step": 11510 + }, + { + "epoch": 0.020951216129890267, + "grad_norm": 0.14467410743236542, + "learning_rate": 0.0002, + "loss": 0.083, + "step": 11520 + }, + { + "epoch": 0.020969402949447463, + "grad_norm": 0.1027064323425293, + "learning_rate": 0.0002, + "loss": 0.0825, + "step": 11530 + }, + { + "epoch": 0.02098758976900466, + "grad_norm": 0.2156657725572586, + "learning_rate": 0.0002, + "loss": 0.0694, + "step": 11540 + }, + { + "epoch": 0.021005776588561856, + "grad_norm": 0.023746902123093605, + "learning_rate": 0.0002, + "loss": 0.024, + "step": 11550 + }, + { + "epoch": 0.021023963408119052, + "grad_norm": 0.19738778471946716, + "learning_rate": 0.0002, + "loss": 0.1473, + "step": 11560 + }, + { + "epoch": 0.02104215022767625, + "grad_norm": 0.19759760797023773, + "learning_rate": 0.0002, + "loss": 0.078, + "step": 11570 + }, + { + "epoch": 0.021060337047233445, + "grad_norm": 9.88092041015625, + "learning_rate": 0.0002, + "loss": 0.0866, + "step": 11580 + }, + { + "epoch": 0.02107852386679064, + "grad_norm": 0.22301238775253296, + "learning_rate": 0.0002, + "loss": 0.0685, + "step": 11590 + }, + { + "epoch": 0.021096710686347837, + "grad_norm": 0.023191403597593307, + "learning_rate": 0.0002, + "loss": 0.0468, + "step": 11600 + }, + { + "epoch": 0.021114897505905034, + "grad_norm": 0.10442623496055603, + "learning_rate": 0.0002, + "loss": 0.2046, + "step": 11610 + }, + { + "epoch": 0.02113308432546223, + "grad_norm": 0.18771864473819733, + "learning_rate": 0.0002, + "loss": 0.0805, + "step": 11620 + }, + { + "epoch": 0.021151271145019426, + "grad_norm": 0.05516243353486061, + "learning_rate": 0.0002, + "loss": 0.0795, + "step": 11630 + }, + { + "epoch": 0.021169457964576623, + "grad_norm": 0.21308554708957672, + "learning_rate": 0.0002, + "loss": 0.0725, + "step": 11640 + }, + { + "epoch": 0.02118764478413382, + "grad_norm": 0.010607315227389336, + "learning_rate": 0.0002, + "loss": 0.0241, + "step": 11650 + }, + { + "epoch": 0.021205831603691015, + "grad_norm": 0.0542677640914917, + "learning_rate": 0.0002, + "loss": 0.1648, + "step": 11660 + }, + { + "epoch": 0.02122401842324821, + "grad_norm": 0.11239166557788849, + "learning_rate": 0.0002, + "loss": 0.0825, + "step": 11670 + }, + { + "epoch": 0.021242205242805408, + "grad_norm": 0.032700493931770325, + "learning_rate": 0.0002, + "loss": 0.0727, + "step": 11680 + }, + { + "epoch": 0.021260392062362604, + "grad_norm": 0.2005159705877304, + "learning_rate": 0.0002, + "loss": 0.0708, + "step": 11690 + }, + { + "epoch": 0.0212785788819198, + "grad_norm": 0.01741277053952217, + "learning_rate": 0.0002, + "loss": 0.0232, + "step": 11700 + }, + { + "epoch": 0.021296765701476997, + "grad_norm": 0.04048267379403114, + "learning_rate": 0.0002, + "loss": 0.1403, + "step": 11710 + }, + { + "epoch": 0.021314952521034193, + "grad_norm": 0.18796616792678833, + "learning_rate": 0.0002, + "loss": 0.0886, + "step": 11720 + }, + { + "epoch": 0.02133313934059139, + "grad_norm": 0.06360754370689392, + "learning_rate": 0.0002, + "loss": 0.0731, + "step": 11730 + }, + { + "epoch": 0.021351326160148586, + "grad_norm": 0.14168913662433624, + "learning_rate": 0.0002, + "loss": 0.0622, + "step": 11740 + }, + { + "epoch": 0.021369512979705782, + "grad_norm": 0.012988853268325329, + "learning_rate": 0.0002, + "loss": 0.0144, + "step": 11750 + }, + { + "epoch": 0.02138769979926298, + "grad_norm": 0.09176674485206604, + "learning_rate": 0.0002, + "loss": 0.1574, + "step": 11760 + }, + { + "epoch": 0.021405886618820175, + "grad_norm": 0.11934395134449005, + "learning_rate": 0.0002, + "loss": 0.079, + "step": 11770 + }, + { + "epoch": 0.02142407343837737, + "grad_norm": 0.11853605508804321, + "learning_rate": 0.0002, + "loss": 0.076, + "step": 11780 + }, + { + "epoch": 0.021442260257934567, + "grad_norm": 0.1625816971063614, + "learning_rate": 0.0002, + "loss": 0.0649, + "step": 11790 + }, + { + "epoch": 0.021460447077491764, + "grad_norm": 0.023221928626298904, + "learning_rate": 0.0002, + "loss": 0.0228, + "step": 11800 + }, + { + "epoch": 0.02147863389704896, + "grad_norm": 0.0494253933429718, + "learning_rate": 0.0002, + "loss": 0.1418, + "step": 11810 + }, + { + "epoch": 0.021496820716606156, + "grad_norm": 0.18250688910484314, + "learning_rate": 0.0002, + "loss": 0.0827, + "step": 11820 + }, + { + "epoch": 0.021515007536163352, + "grad_norm": 0.13340160250663757, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 11830 + }, + { + "epoch": 0.02153319435572055, + "grad_norm": 0.15497778356075287, + "learning_rate": 0.0002, + "loss": 0.0613, + "step": 11840 + }, + { + "epoch": 0.021551381175277745, + "grad_norm": 0.03259354829788208, + "learning_rate": 0.0002, + "loss": 0.023, + "step": 11850 + }, + { + "epoch": 0.021569567994834945, + "grad_norm": 0.09126435220241547, + "learning_rate": 0.0002, + "loss": 0.1235, + "step": 11860 + }, + { + "epoch": 0.02158775481439214, + "grad_norm": 0.13455496728420258, + "learning_rate": 0.0002, + "loss": 0.0806, + "step": 11870 + }, + { + "epoch": 0.021605941633949338, + "grad_norm": 0.10817539691925049, + "learning_rate": 0.0002, + "loss": 0.0829, + "step": 11880 + }, + { + "epoch": 0.021624128453506534, + "grad_norm": 0.1913878321647644, + "learning_rate": 0.0002, + "loss": 0.0636, + "step": 11890 + }, + { + "epoch": 0.02164231527306373, + "grad_norm": 0.025634530931711197, + "learning_rate": 0.0002, + "loss": 0.0216, + "step": 11900 + }, + { + "epoch": 0.021660502092620926, + "grad_norm": 0.10507725924253464, + "learning_rate": 0.0002, + "loss": 0.1326, + "step": 11910 + }, + { + "epoch": 0.021678688912178123, + "grad_norm": 0.09721452742815018, + "learning_rate": 0.0002, + "loss": 0.0857, + "step": 11920 + }, + { + "epoch": 0.02169687573173532, + "grad_norm": 0.028759269043803215, + "learning_rate": 0.0002, + "loss": 0.0751, + "step": 11930 + }, + { + "epoch": 0.021715062551292515, + "grad_norm": 0.17618104815483093, + "learning_rate": 0.0002, + "loss": 0.062, + "step": 11940 + }, + { + "epoch": 0.02173324937084971, + "grad_norm": 0.02503124624490738, + "learning_rate": 0.0002, + "loss": 0.0182, + "step": 11950 + }, + { + "epoch": 0.021751436190406908, + "grad_norm": 0.10976126044988632, + "learning_rate": 0.0002, + "loss": 0.1564, + "step": 11960 + }, + { + "epoch": 0.021769623009964104, + "grad_norm": 0.0833989605307579, + "learning_rate": 0.0002, + "loss": 0.0784, + "step": 11970 + }, + { + "epoch": 0.0217878098295213, + "grad_norm": 0.06359647959470749, + "learning_rate": 0.0002, + "loss": 0.0795, + "step": 11980 + }, + { + "epoch": 0.021805996649078497, + "grad_norm": 0.1677824705839157, + "learning_rate": 0.0002, + "loss": 0.0699, + "step": 11990 + }, + { + "epoch": 0.021824183468635693, + "grad_norm": 0.018009621649980545, + "learning_rate": 0.0002, + "loss": 0.0185, + "step": 12000 + }, + { + "epoch": 0.02184237028819289, + "grad_norm": 0.12256644666194916, + "learning_rate": 0.0002, + "loss": 0.1839, + "step": 12010 + }, + { + "epoch": 0.021860557107750086, + "grad_norm": 0.11677028983831406, + "learning_rate": 0.0002, + "loss": 0.0829, + "step": 12020 + }, + { + "epoch": 0.021878743927307282, + "grad_norm": 0.12885046005249023, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 12030 + }, + { + "epoch": 0.02189693074686448, + "grad_norm": 0.1394425481557846, + "learning_rate": 0.0002, + "loss": 0.0668, + "step": 12040 + }, + { + "epoch": 0.021915117566421675, + "grad_norm": 0.024974076077342033, + "learning_rate": 0.0002, + "loss": 0.0192, + "step": 12050 + }, + { + "epoch": 0.02193330438597887, + "grad_norm": 0.11284986138343811, + "learning_rate": 0.0002, + "loss": 0.1492, + "step": 12060 + }, + { + "epoch": 0.021951491205536067, + "grad_norm": 0.0605492927134037, + "learning_rate": 0.0002, + "loss": 0.0787, + "step": 12070 + }, + { + "epoch": 0.021969678025093264, + "grad_norm": 0.040298718959093094, + "learning_rate": 0.0002, + "loss": 0.0778, + "step": 12080 + }, + { + "epoch": 0.02198786484465046, + "grad_norm": 0.1555332988500595, + "learning_rate": 0.0002, + "loss": 0.0683, + "step": 12090 + }, + { + "epoch": 0.022006051664207656, + "grad_norm": 0.022474724799394608, + "learning_rate": 0.0002, + "loss": 0.0139, + "step": 12100 + }, + { + "epoch": 0.022024238483764853, + "grad_norm": 0.08212363719940186, + "learning_rate": 0.0002, + "loss": 0.1513, + "step": 12110 + }, + { + "epoch": 0.02204242530332205, + "grad_norm": 0.16297335922718048, + "learning_rate": 0.0002, + "loss": 0.087, + "step": 12120 + }, + { + "epoch": 0.022060612122879245, + "grad_norm": 0.026817265897989273, + "learning_rate": 0.0002, + "loss": 0.0763, + "step": 12130 + }, + { + "epoch": 0.02207879894243644, + "grad_norm": 0.15199647843837738, + "learning_rate": 0.0002, + "loss": 0.0632, + "step": 12140 + }, + { + "epoch": 0.022096985761993638, + "grad_norm": 0.021619049832224846, + "learning_rate": 0.0002, + "loss": 0.0221, + "step": 12150 + }, + { + "epoch": 0.022115172581550834, + "grad_norm": 0.071327805519104, + "learning_rate": 0.0002, + "loss": 0.138, + "step": 12160 + }, + { + "epoch": 0.02213335940110803, + "grad_norm": 0.07506705075502396, + "learning_rate": 0.0002, + "loss": 0.0802, + "step": 12170 + }, + { + "epoch": 0.022151546220665227, + "grad_norm": 0.05193526670336723, + "learning_rate": 0.0002, + "loss": 0.0726, + "step": 12180 + }, + { + "epoch": 0.022169733040222423, + "grad_norm": 0.125730961561203, + "learning_rate": 0.0002, + "loss": 0.0658, + "step": 12190 + }, + { + "epoch": 0.022187919859779623, + "grad_norm": 0.01939002424478531, + "learning_rate": 0.0002, + "loss": 0.0174, + "step": 12200 + }, + { + "epoch": 0.02220610667933682, + "grad_norm": 0.05645585432648659, + "learning_rate": 0.0002, + "loss": 0.1447, + "step": 12210 + }, + { + "epoch": 0.022224293498894016, + "grad_norm": 0.12416274845600128, + "learning_rate": 0.0002, + "loss": 0.0727, + "step": 12220 + }, + { + "epoch": 0.022242480318451212, + "grad_norm": 0.05618545040488243, + "learning_rate": 0.0002, + "loss": 0.0801, + "step": 12230 + }, + { + "epoch": 0.022260667138008408, + "grad_norm": 0.12334968894720078, + "learning_rate": 0.0002, + "loss": 0.0598, + "step": 12240 + }, + { + "epoch": 0.022278853957565604, + "grad_norm": 0.024331970140337944, + "learning_rate": 0.0002, + "loss": 0.0179, + "step": 12250 + }, + { + "epoch": 0.0222970407771228, + "grad_norm": 0.05856281518936157, + "learning_rate": 0.0002, + "loss": 0.126, + "step": 12260 + }, + { + "epoch": 0.022315227596679997, + "grad_norm": 0.07432300597429276, + "learning_rate": 0.0002, + "loss": 0.0839, + "step": 12270 + }, + { + "epoch": 0.022333414416237193, + "grad_norm": 0.07249715179204941, + "learning_rate": 0.0002, + "loss": 0.0815, + "step": 12280 + }, + { + "epoch": 0.02235160123579439, + "grad_norm": 0.14335612952709198, + "learning_rate": 0.0002, + "loss": 0.0605, + "step": 12290 + }, + { + "epoch": 0.022369788055351586, + "grad_norm": 0.03603110462427139, + "learning_rate": 0.0002, + "loss": 0.0185, + "step": 12300 + }, + { + "epoch": 0.022387974874908782, + "grad_norm": 0.08532091230154037, + "learning_rate": 0.0002, + "loss": 0.1339, + "step": 12310 + }, + { + "epoch": 0.02240616169446598, + "grad_norm": 0.13663236796855927, + "learning_rate": 0.0002, + "loss": 0.0761, + "step": 12320 + }, + { + "epoch": 0.022424348514023175, + "grad_norm": 0.10088011622428894, + "learning_rate": 0.0002, + "loss": 0.0737, + "step": 12330 + }, + { + "epoch": 0.02244253533358037, + "grad_norm": 0.17186152935028076, + "learning_rate": 0.0002, + "loss": 0.0661, + "step": 12340 + }, + { + "epoch": 0.022460722153137568, + "grad_norm": 0.01941334828734398, + "learning_rate": 0.0002, + "loss": 0.0135, + "step": 12350 + }, + { + "epoch": 0.022478908972694764, + "grad_norm": 0.12438862770795822, + "learning_rate": 0.0002, + "loss": 0.1474, + "step": 12360 + }, + { + "epoch": 0.02249709579225196, + "grad_norm": 0.08050791174173355, + "learning_rate": 0.0002, + "loss": 0.0792, + "step": 12370 + }, + { + "epoch": 0.022515282611809156, + "grad_norm": 0.04660952091217041, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 12380 + }, + { + "epoch": 0.022533469431366353, + "grad_norm": 0.16433311998844147, + "learning_rate": 0.0002, + "loss": 0.0691, + "step": 12390 + }, + { + "epoch": 0.02255165625092355, + "grad_norm": 0.04376552626490593, + "learning_rate": 0.0002, + "loss": 0.0219, + "step": 12400 + }, + { + "epoch": 0.022569843070480745, + "grad_norm": 0.06648654490709305, + "learning_rate": 0.0002, + "loss": 0.1346, + "step": 12410 + }, + { + "epoch": 0.02258802989003794, + "grad_norm": 0.11318199336528778, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 12420 + }, + { + "epoch": 0.022606216709595138, + "grad_norm": 0.0922408252954483, + "learning_rate": 0.0002, + "loss": 0.0819, + "step": 12430 + }, + { + "epoch": 0.022624403529152334, + "grad_norm": 0.1696896106004715, + "learning_rate": 0.0002, + "loss": 0.0642, + "step": 12440 + }, + { + "epoch": 0.02264259034870953, + "grad_norm": 0.03212421387434006, + "learning_rate": 0.0002, + "loss": 0.0247, + "step": 12450 + }, + { + "epoch": 0.022660777168266727, + "grad_norm": 0.12295889109373093, + "learning_rate": 0.0002, + "loss": 0.1504, + "step": 12460 + }, + { + "epoch": 0.022678963987823923, + "grad_norm": 0.10351194441318512, + "learning_rate": 0.0002, + "loss": 0.0757, + "step": 12470 + }, + { + "epoch": 0.02269715080738112, + "grad_norm": 0.022580118849873543, + "learning_rate": 0.0002, + "loss": 0.0756, + "step": 12480 + }, + { + "epoch": 0.022715337626938316, + "grad_norm": 0.16330066323280334, + "learning_rate": 0.0002, + "loss": 0.0645, + "step": 12490 + }, + { + "epoch": 0.022733524446495512, + "grad_norm": 0.021431026980280876, + "learning_rate": 0.0002, + "loss": 0.0224, + "step": 12500 + }, + { + "epoch": 0.02275171126605271, + "grad_norm": 0.053853604942560196, + "learning_rate": 0.0002, + "loss": 0.1304, + "step": 12510 + }, + { + "epoch": 0.022769898085609905, + "grad_norm": 0.129705548286438, + "learning_rate": 0.0002, + "loss": 0.0799, + "step": 12520 + }, + { + "epoch": 0.0227880849051671, + "grad_norm": 0.027473529800772667, + "learning_rate": 0.0002, + "loss": 0.0771, + "step": 12530 + }, + { + "epoch": 0.0228062717247243, + "grad_norm": 0.2045305222272873, + "learning_rate": 0.0002, + "loss": 0.0615, + "step": 12540 + }, + { + "epoch": 0.022824458544281497, + "grad_norm": 0.041042860597372055, + "learning_rate": 0.0002, + "loss": 0.026, + "step": 12550 + }, + { + "epoch": 0.022842645363838694, + "grad_norm": 0.05624527484178543, + "learning_rate": 0.0002, + "loss": 0.1327, + "step": 12560 + }, + { + "epoch": 0.02286083218339589, + "grad_norm": 0.09647081047296524, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 12570 + }, + { + "epoch": 0.022879019002953086, + "grad_norm": 0.03362264856696129, + "learning_rate": 0.0002, + "loss": 0.076, + "step": 12580 + }, + { + "epoch": 0.022897205822510282, + "grad_norm": 0.1459503322839737, + "learning_rate": 0.0002, + "loss": 0.0603, + "step": 12590 + }, + { + "epoch": 0.02291539264206748, + "grad_norm": 0.025729481130838394, + "learning_rate": 0.0002, + "loss": 0.0196, + "step": 12600 + }, + { + "epoch": 0.022933579461624675, + "grad_norm": 0.19940927624702454, + "learning_rate": 0.0002, + "loss": 0.1298, + "step": 12610 + }, + { + "epoch": 0.02295176628118187, + "grad_norm": 0.13796600699424744, + "learning_rate": 0.0002, + "loss": 0.086, + "step": 12620 + }, + { + "epoch": 0.022969953100739068, + "grad_norm": 0.08884158730506897, + "learning_rate": 0.0002, + "loss": 0.0808, + "step": 12630 + }, + { + "epoch": 0.022988139920296264, + "grad_norm": 0.15814751386642456, + "learning_rate": 0.0002, + "loss": 0.0658, + "step": 12640 + }, + { + "epoch": 0.02300632673985346, + "grad_norm": 0.03503837063908577, + "learning_rate": 0.0002, + "loss": 0.0232, + "step": 12650 + }, + { + "epoch": 0.023024513559410657, + "grad_norm": 0.09701854735612869, + "learning_rate": 0.0002, + "loss": 0.136, + "step": 12660 + }, + { + "epoch": 0.023042700378967853, + "grad_norm": 0.13909977674484253, + "learning_rate": 0.0002, + "loss": 0.0839, + "step": 12670 + }, + { + "epoch": 0.02306088719852505, + "grad_norm": 0.03152406960725784, + "learning_rate": 0.0002, + "loss": 0.0733, + "step": 12680 + }, + { + "epoch": 0.023079074018082246, + "grad_norm": 0.13872750103473663, + "learning_rate": 0.0002, + "loss": 0.0604, + "step": 12690 + }, + { + "epoch": 0.023097260837639442, + "grad_norm": 0.03626656159758568, + "learning_rate": 0.0002, + "loss": 0.0234, + "step": 12700 + }, + { + "epoch": 0.023115447657196638, + "grad_norm": 0.10111619532108307, + "learning_rate": 0.0002, + "loss": 0.1507, + "step": 12710 + }, + { + "epoch": 0.023133634476753834, + "grad_norm": 0.09038366377353668, + "learning_rate": 0.0002, + "loss": 0.0839, + "step": 12720 + }, + { + "epoch": 0.02315182129631103, + "grad_norm": 0.026116544380784035, + "learning_rate": 0.0002, + "loss": 0.0777, + "step": 12730 + }, + { + "epoch": 0.023170008115868227, + "grad_norm": 0.2067679613828659, + "learning_rate": 0.0002, + "loss": 0.0661, + "step": 12740 + }, + { + "epoch": 0.023188194935425423, + "grad_norm": 0.02005072310566902, + "learning_rate": 0.0002, + "loss": 0.0165, + "step": 12750 + }, + { + "epoch": 0.02320638175498262, + "grad_norm": 0.03261101245880127, + "learning_rate": 0.0002, + "loss": 0.159, + "step": 12760 + }, + { + "epoch": 0.023224568574539816, + "grad_norm": 0.1416555494070053, + "learning_rate": 0.0002, + "loss": 0.0856, + "step": 12770 + }, + { + "epoch": 0.023242755394097012, + "grad_norm": 0.09400717914104462, + "learning_rate": 0.0002, + "loss": 0.0745, + "step": 12780 + }, + { + "epoch": 0.02326094221365421, + "grad_norm": 0.17093195021152496, + "learning_rate": 0.0002, + "loss": 0.0691, + "step": 12790 + }, + { + "epoch": 0.023279129033211405, + "grad_norm": 0.0209200382232666, + "learning_rate": 0.0002, + "loss": 0.0168, + "step": 12800 + }, + { + "epoch": 0.0232973158527686, + "grad_norm": 0.10523302853107452, + "learning_rate": 0.0002, + "loss": 0.1628, + "step": 12810 + }, + { + "epoch": 0.023315502672325798, + "grad_norm": 0.06932856142520905, + "learning_rate": 0.0002, + "loss": 0.079, + "step": 12820 + }, + { + "epoch": 0.023333689491882994, + "grad_norm": 0.03244032710790634, + "learning_rate": 0.0002, + "loss": 0.0699, + "step": 12830 + }, + { + "epoch": 0.02335187631144019, + "grad_norm": 0.13403338193893433, + "learning_rate": 0.0002, + "loss": 0.0619, + "step": 12840 + }, + { + "epoch": 0.023370063130997386, + "grad_norm": 0.034033093601465225, + "learning_rate": 0.0002, + "loss": 0.0166, + "step": 12850 + }, + { + "epoch": 0.023388249950554583, + "grad_norm": 0.07277385890483856, + "learning_rate": 0.0002, + "loss": 0.1377, + "step": 12860 + }, + { + "epoch": 0.02340643677011178, + "grad_norm": 0.10873163491487503, + "learning_rate": 0.0002, + "loss": 0.0895, + "step": 12870 + }, + { + "epoch": 0.023424623589668975, + "grad_norm": 0.06244732066988945, + "learning_rate": 0.0002, + "loss": 0.0745, + "step": 12880 + }, + { + "epoch": 0.023442810409226175, + "grad_norm": 0.1937248259782791, + "learning_rate": 0.0002, + "loss": 0.0633, + "step": 12890 + }, + { + "epoch": 0.02346099722878337, + "grad_norm": 0.03432930260896683, + "learning_rate": 0.0002, + "loss": 0.0246, + "step": 12900 + }, + { + "epoch": 0.023479184048340568, + "grad_norm": 0.33358234167099, + "learning_rate": 0.0002, + "loss": 0.1249, + "step": 12910 + }, + { + "epoch": 0.023497370867897764, + "grad_norm": 0.12039615213871002, + "learning_rate": 0.0002, + "loss": 0.0734, + "step": 12920 + }, + { + "epoch": 0.02351555768745496, + "grad_norm": 0.02666555717587471, + "learning_rate": 0.0002, + "loss": 0.0849, + "step": 12930 + }, + { + "epoch": 0.023533744507012157, + "grad_norm": 0.128091961145401, + "learning_rate": 0.0002, + "loss": 0.0647, + "step": 12940 + }, + { + "epoch": 0.023551931326569353, + "grad_norm": 0.030916422605514526, + "learning_rate": 0.0002, + "loss": 0.0217, + "step": 12950 + }, + { + "epoch": 0.02357011814612655, + "grad_norm": 0.09280567616224289, + "learning_rate": 0.0002, + "loss": 0.1281, + "step": 12960 + }, + { + "epoch": 0.023588304965683746, + "grad_norm": 0.09032955765724182, + "learning_rate": 0.0002, + "loss": 0.0834, + "step": 12970 + }, + { + "epoch": 0.023606491785240942, + "grad_norm": 0.3660918176174164, + "learning_rate": 0.0002, + "loss": 0.0776, + "step": 12980 + }, + { + "epoch": 0.02362467860479814, + "grad_norm": 0.15715408325195312, + "learning_rate": 0.0002, + "loss": 0.0611, + "step": 12990 + }, + { + "epoch": 0.023642865424355335, + "grad_norm": 0.03867153823375702, + "learning_rate": 0.0002, + "loss": 0.0214, + "step": 13000 + }, + { + "epoch": 0.02366105224391253, + "grad_norm": 0.37568527460098267, + "learning_rate": 0.0002, + "loss": 0.2529, + "step": 13010 + }, + { + "epoch": 0.023679239063469727, + "grad_norm": 0.14888867735862732, + "learning_rate": 0.0002, + "loss": 0.0801, + "step": 13020 + }, + { + "epoch": 0.023697425883026924, + "grad_norm": 0.04271422699093819, + "learning_rate": 0.0002, + "loss": 0.0863, + "step": 13030 + }, + { + "epoch": 0.02371561270258412, + "grad_norm": 0.190608948469162, + "learning_rate": 0.0002, + "loss": 0.0773, + "step": 13040 + }, + { + "epoch": 0.023733799522141316, + "grad_norm": 0.020333535969257355, + "learning_rate": 0.0002, + "loss": 0.0201, + "step": 13050 + }, + { + "epoch": 0.023751986341698512, + "grad_norm": 0.143577441573143, + "learning_rate": 0.0002, + "loss": 0.1709, + "step": 13060 + }, + { + "epoch": 0.02377017316125571, + "grad_norm": 0.09225071966648102, + "learning_rate": 0.0002, + "loss": 0.0854, + "step": 13070 + }, + { + "epoch": 0.023788359980812905, + "grad_norm": 0.08655473589897156, + "learning_rate": 0.0002, + "loss": 0.0727, + "step": 13080 + }, + { + "epoch": 0.0238065468003701, + "grad_norm": 0.14465250074863434, + "learning_rate": 0.0002, + "loss": 0.0632, + "step": 13090 + }, + { + "epoch": 0.023824733619927298, + "grad_norm": 0.019399341195821762, + "learning_rate": 0.0002, + "loss": 0.0204, + "step": 13100 + }, + { + "epoch": 0.023842920439484494, + "grad_norm": 0.09221036732196808, + "learning_rate": 0.0002, + "loss": 0.1646, + "step": 13110 + }, + { + "epoch": 0.02386110725904169, + "grad_norm": 0.1308157742023468, + "learning_rate": 0.0002, + "loss": 0.089, + "step": 13120 + }, + { + "epoch": 0.023879294078598887, + "grad_norm": 0.04212506487965584, + "learning_rate": 0.0002, + "loss": 0.0776, + "step": 13130 + }, + { + "epoch": 0.023897480898156083, + "grad_norm": 0.13541243970394135, + "learning_rate": 0.0002, + "loss": 0.0694, + "step": 13140 + }, + { + "epoch": 0.02391566771771328, + "grad_norm": 0.016859933733940125, + "learning_rate": 0.0002, + "loss": 0.0191, + "step": 13150 + }, + { + "epoch": 0.023933854537270476, + "grad_norm": 0.1553143709897995, + "learning_rate": 0.0002, + "loss": 0.1653, + "step": 13160 + }, + { + "epoch": 0.023952041356827672, + "grad_norm": 0.07960142940282822, + "learning_rate": 0.0002, + "loss": 0.0938, + "step": 13170 + }, + { + "epoch": 0.023970228176384868, + "grad_norm": 0.0719163790345192, + "learning_rate": 0.0002, + "loss": 0.0767, + "step": 13180 + }, + { + "epoch": 0.023988414995942065, + "grad_norm": 0.14845407009124756, + "learning_rate": 0.0002, + "loss": 0.0642, + "step": 13190 + }, + { + "epoch": 0.02400660181549926, + "grad_norm": 0.01817360520362854, + "learning_rate": 0.0002, + "loss": 0.0229, + "step": 13200 + }, + { + "epoch": 0.024024788635056457, + "grad_norm": 0.03876543045043945, + "learning_rate": 0.0002, + "loss": 0.1377, + "step": 13210 + }, + { + "epoch": 0.024042975454613653, + "grad_norm": 0.05972164496779442, + "learning_rate": 0.0002, + "loss": 0.0802, + "step": 13220 + }, + { + "epoch": 0.024061162274170853, + "grad_norm": 0.09239703416824341, + "learning_rate": 0.0002, + "loss": 0.0816, + "step": 13230 + }, + { + "epoch": 0.02407934909372805, + "grad_norm": 0.15912885963916779, + "learning_rate": 0.0002, + "loss": 0.0598, + "step": 13240 + }, + { + "epoch": 0.024097535913285246, + "grad_norm": 0.024279551580548286, + "learning_rate": 0.0002, + "loss": 0.0235, + "step": 13250 + }, + { + "epoch": 0.024115722732842442, + "grad_norm": 0.06568270921707153, + "learning_rate": 0.0002, + "loss": 0.1255, + "step": 13260 + }, + { + "epoch": 0.02413390955239964, + "grad_norm": 0.04041383042931557, + "learning_rate": 0.0002, + "loss": 0.0718, + "step": 13270 + }, + { + "epoch": 0.024152096371956835, + "grad_norm": 0.046768829226493835, + "learning_rate": 0.0002, + "loss": 0.0741, + "step": 13280 + }, + { + "epoch": 0.02417028319151403, + "grad_norm": 0.21418194472789764, + "learning_rate": 0.0002, + "loss": 0.0683, + "step": 13290 + }, + { + "epoch": 0.024188470011071227, + "grad_norm": 0.04398053511977196, + "learning_rate": 0.0002, + "loss": 0.0262, + "step": 13300 + }, + { + "epoch": 0.024206656830628424, + "grad_norm": 0.1672079861164093, + "learning_rate": 0.0002, + "loss": 0.1408, + "step": 13310 + }, + { + "epoch": 0.02422484365018562, + "grad_norm": 0.05705881491303444, + "learning_rate": 0.0002, + "loss": 0.0773, + "step": 13320 + }, + { + "epoch": 0.024243030469742816, + "grad_norm": 0.0667627677321434, + "learning_rate": 0.0002, + "loss": 0.0823, + "step": 13330 + }, + { + "epoch": 0.024261217289300013, + "grad_norm": 0.16610710322856903, + "learning_rate": 0.0002, + "loss": 0.0682, + "step": 13340 + }, + { + "epoch": 0.02427940410885721, + "grad_norm": 0.028300171718001366, + "learning_rate": 0.0002, + "loss": 0.0185, + "step": 13350 + }, + { + "epoch": 0.024297590928414405, + "grad_norm": 0.10226302593946457, + "learning_rate": 0.0002, + "loss": 0.1406, + "step": 13360 + }, + { + "epoch": 0.0243157777479716, + "grad_norm": 0.0939667820930481, + "learning_rate": 0.0002, + "loss": 0.0755, + "step": 13370 + }, + { + "epoch": 0.024333964567528798, + "grad_norm": 0.029998745769262314, + "learning_rate": 0.0002, + "loss": 0.0748, + "step": 13380 + }, + { + "epoch": 0.024352151387085994, + "grad_norm": 0.1240144744515419, + "learning_rate": 0.0002, + "loss": 0.0639, + "step": 13390 + }, + { + "epoch": 0.02437033820664319, + "grad_norm": 0.017499787732958794, + "learning_rate": 0.0002, + "loss": 0.0156, + "step": 13400 + }, + { + "epoch": 0.024388525026200387, + "grad_norm": 0.11781036853790283, + "learning_rate": 0.0002, + "loss": 0.1385, + "step": 13410 + }, + { + "epoch": 0.024406711845757583, + "grad_norm": 0.09330960363149643, + "learning_rate": 0.0002, + "loss": 0.0789, + "step": 13420 + }, + { + "epoch": 0.02442489866531478, + "grad_norm": 0.03347505256533623, + "learning_rate": 0.0002, + "loss": 0.0742, + "step": 13430 + }, + { + "epoch": 0.024443085484871976, + "grad_norm": 0.18877847492694855, + "learning_rate": 0.0002, + "loss": 0.0701, + "step": 13440 + }, + { + "epoch": 0.024461272304429172, + "grad_norm": 0.03831986337900162, + "learning_rate": 0.0002, + "loss": 0.0243, + "step": 13450 + }, + { + "epoch": 0.02447945912398637, + "grad_norm": 0.07360157370567322, + "learning_rate": 0.0002, + "loss": 0.1237, + "step": 13460 + }, + { + "epoch": 0.024497645943543565, + "grad_norm": 0.0442088283598423, + "learning_rate": 0.0002, + "loss": 0.0742, + "step": 13470 + }, + { + "epoch": 0.02451583276310076, + "grad_norm": 0.07053640484809875, + "learning_rate": 0.0002, + "loss": 0.0793, + "step": 13480 + }, + { + "epoch": 0.024534019582657957, + "grad_norm": 0.20134539902210236, + "learning_rate": 0.0002, + "loss": 0.0621, + "step": 13490 + }, + { + "epoch": 0.024552206402215154, + "grad_norm": 0.016353536397218704, + "learning_rate": 0.0002, + "loss": 0.0204, + "step": 13500 + }, + { + "epoch": 0.02457039322177235, + "grad_norm": 0.15373657643795013, + "learning_rate": 0.0002, + "loss": 0.1446, + "step": 13510 + }, + { + "epoch": 0.024588580041329546, + "grad_norm": 2.457998037338257, + "learning_rate": 0.0002, + "loss": 0.0959, + "step": 13520 + }, + { + "epoch": 0.024606766860886743, + "grad_norm": 0.11631426215171814, + "learning_rate": 0.0002, + "loss": 0.0718, + "step": 13530 + }, + { + "epoch": 0.02462495368044394, + "grad_norm": 0.15928395092487335, + "learning_rate": 0.0002, + "loss": 0.0638, + "step": 13540 + }, + { + "epoch": 0.024643140500001135, + "grad_norm": 0.01724998839199543, + "learning_rate": 0.0002, + "loss": 0.0127, + "step": 13550 + }, + { + "epoch": 0.02466132731955833, + "grad_norm": 0.10434440523386002, + "learning_rate": 0.0002, + "loss": 0.1676, + "step": 13560 + }, + { + "epoch": 0.02467951413911553, + "grad_norm": 0.09029936045408249, + "learning_rate": 0.0002, + "loss": 0.0792, + "step": 13570 + }, + { + "epoch": 0.024697700958672728, + "grad_norm": 0.07413540780544281, + "learning_rate": 0.0002, + "loss": 0.074, + "step": 13580 + }, + { + "epoch": 0.024715887778229924, + "grad_norm": 0.15171368420124054, + "learning_rate": 0.0002, + "loss": 0.0646, + "step": 13590 + }, + { + "epoch": 0.02473407459778712, + "grad_norm": 0.03615165874361992, + "learning_rate": 0.0002, + "loss": 0.0253, + "step": 13600 + }, + { + "epoch": 0.024752261417344316, + "grad_norm": 0.08074207603931427, + "learning_rate": 0.0002, + "loss": 0.1251, + "step": 13610 + }, + { + "epoch": 0.024770448236901513, + "grad_norm": 0.12725302577018738, + "learning_rate": 0.0002, + "loss": 0.0868, + "step": 13620 + }, + { + "epoch": 0.02478863505645871, + "grad_norm": 0.02872832864522934, + "learning_rate": 0.0002, + "loss": 0.072, + "step": 13630 + }, + { + "epoch": 0.024806821876015905, + "grad_norm": 0.14573116600513458, + "learning_rate": 0.0002, + "loss": 0.06, + "step": 13640 + }, + { + "epoch": 0.0248250086955731, + "grad_norm": 0.039421938359737396, + "learning_rate": 0.0002, + "loss": 0.0259, + "step": 13650 + }, + { + "epoch": 0.024843195515130298, + "grad_norm": 0.08786037564277649, + "learning_rate": 0.0002, + "loss": 0.1255, + "step": 13660 + }, + { + "epoch": 0.024861382334687494, + "grad_norm": 0.7118334174156189, + "learning_rate": 0.0002, + "loss": 0.1096, + "step": 13670 + }, + { + "epoch": 0.02487956915424469, + "grad_norm": 0.05718977376818657, + "learning_rate": 0.0002, + "loss": 0.1057, + "step": 13680 + }, + { + "epoch": 0.024897755973801887, + "grad_norm": 0.19388055801391602, + "learning_rate": 0.0002, + "loss": 0.0668, + "step": 13690 + }, + { + "epoch": 0.024915942793359083, + "grad_norm": 0.02519839070737362, + "learning_rate": 0.0002, + "loss": 0.0182, + "step": 13700 + }, + { + "epoch": 0.02493412961291628, + "grad_norm": 0.15939857065677643, + "learning_rate": 0.0002, + "loss": 0.1685, + "step": 13710 + }, + { + "epoch": 0.024952316432473476, + "grad_norm": 0.07893367856740952, + "learning_rate": 0.0002, + "loss": 0.0781, + "step": 13720 + }, + { + "epoch": 0.024970503252030672, + "grad_norm": 0.0573757067322731, + "learning_rate": 0.0002, + "loss": 0.0819, + "step": 13730 + }, + { + "epoch": 0.02498869007158787, + "grad_norm": 0.1089317575097084, + "learning_rate": 0.0002, + "loss": 0.0645, + "step": 13740 + }, + { + "epoch": 0.025006876891145065, + "grad_norm": 0.03239568695425987, + "learning_rate": 0.0002, + "loss": 0.0199, + "step": 13750 + }, + { + "epoch": 0.02502506371070226, + "grad_norm": 0.04015114903450012, + "learning_rate": 0.0002, + "loss": 0.146, + "step": 13760 + }, + { + "epoch": 0.025043250530259457, + "grad_norm": 0.15218386054039001, + "learning_rate": 0.0002, + "loss": 0.0854, + "step": 13770 + }, + { + "epoch": 0.025061437349816654, + "grad_norm": 0.04461386427283287, + "learning_rate": 0.0002, + "loss": 0.0734, + "step": 13780 + }, + { + "epoch": 0.02507962416937385, + "grad_norm": 0.17443357408046722, + "learning_rate": 0.0002, + "loss": 0.0677, + "step": 13790 + }, + { + "epoch": 0.025097810988931046, + "grad_norm": 1.0899302959442139, + "learning_rate": 0.0002, + "loss": 0.0312, + "step": 13800 + }, + { + "epoch": 0.025115997808488243, + "grad_norm": 0.04115718603134155, + "learning_rate": 0.0002, + "loss": 0.1392, + "step": 13810 + }, + { + "epoch": 0.02513418462804544, + "grad_norm": 0.06605038046836853, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 13820 + }, + { + "epoch": 0.025152371447602635, + "grad_norm": 0.115416020154953, + "learning_rate": 0.0002, + "loss": 0.0709, + "step": 13830 + }, + { + "epoch": 0.02517055826715983, + "grad_norm": 0.1582881212234497, + "learning_rate": 0.0002, + "loss": 0.066, + "step": 13840 + }, + { + "epoch": 0.025188745086717028, + "grad_norm": 0.037643156945705414, + "learning_rate": 0.0002, + "loss": 0.0226, + "step": 13850 + }, + { + "epoch": 0.025206931906274224, + "grad_norm": 0.08343279361724854, + "learning_rate": 0.0002, + "loss": 0.1197, + "step": 13860 + }, + { + "epoch": 0.02522511872583142, + "grad_norm": 0.13482169806957245, + "learning_rate": 0.0002, + "loss": 0.0799, + "step": 13870 + }, + { + "epoch": 0.025243305545388617, + "grad_norm": 0.10373103618621826, + "learning_rate": 0.0002, + "loss": 0.075, + "step": 13880 + }, + { + "epoch": 0.025261492364945813, + "grad_norm": 0.1348303109407425, + "learning_rate": 0.0002, + "loss": 0.0603, + "step": 13890 + }, + { + "epoch": 0.02527967918450301, + "grad_norm": 0.058479245752096176, + "learning_rate": 0.0002, + "loss": 0.0252, + "step": 13900 + }, + { + "epoch": 0.025297866004060206, + "grad_norm": 0.19177350401878357, + "learning_rate": 0.0002, + "loss": 0.122, + "step": 13910 + }, + { + "epoch": 0.025316052823617406, + "grad_norm": 0.11044300347566605, + "learning_rate": 0.0002, + "loss": 0.0724, + "step": 13920 + }, + { + "epoch": 0.025334239643174602, + "grad_norm": 0.05279375612735748, + "learning_rate": 0.0002, + "loss": 0.0836, + "step": 13930 + }, + { + "epoch": 0.025352426462731798, + "grad_norm": 0.12162257730960846, + "learning_rate": 0.0002, + "loss": 0.0615, + "step": 13940 + }, + { + "epoch": 0.025370613282288994, + "grad_norm": 0.026728983968496323, + "learning_rate": 0.0002, + "loss": 0.0207, + "step": 13950 + }, + { + "epoch": 0.02538880010184619, + "grad_norm": 0.08440329879522324, + "learning_rate": 0.0002, + "loss": 0.1171, + "step": 13960 + }, + { + "epoch": 0.025406986921403387, + "grad_norm": 0.10090481489896774, + "learning_rate": 0.0002, + "loss": 0.0851, + "step": 13970 + }, + { + "epoch": 0.025425173740960583, + "grad_norm": 0.03063822351396084, + "learning_rate": 0.0002, + "loss": 0.0783, + "step": 13980 + }, + { + "epoch": 0.02544336056051778, + "grad_norm": 0.14754973351955414, + "learning_rate": 0.0002, + "loss": 0.0662, + "step": 13990 + }, + { + "epoch": 0.025461547380074976, + "grad_norm": 0.04844941198825836, + "learning_rate": 0.0002, + "loss": 0.0204, + "step": 14000 + }, + { + "epoch": 0.025479734199632172, + "grad_norm": 0.08291894942522049, + "learning_rate": 0.0002, + "loss": 0.13, + "step": 14010 + }, + { + "epoch": 0.02549792101918937, + "grad_norm": 0.05875542387366295, + "learning_rate": 0.0002, + "loss": 0.0732, + "step": 14020 + }, + { + "epoch": 0.025516107838746565, + "grad_norm": 0.04103298857808113, + "learning_rate": 0.0002, + "loss": 0.0796, + "step": 14030 + }, + { + "epoch": 0.02553429465830376, + "grad_norm": 0.20349934697151184, + "learning_rate": 0.0002, + "loss": 0.0672, + "step": 14040 + }, + { + "epoch": 0.025552481477860958, + "grad_norm": 0.05419473722577095, + "learning_rate": 0.0002, + "loss": 0.0231, + "step": 14050 + }, + { + "epoch": 0.025570668297418154, + "grad_norm": 0.05501960590481758, + "learning_rate": 0.0002, + "loss": 0.1281, + "step": 14060 + }, + { + "epoch": 0.02558885511697535, + "grad_norm": 0.07140739262104034, + "learning_rate": 0.0002, + "loss": 0.0746, + "step": 14070 + }, + { + "epoch": 0.025607041936532546, + "grad_norm": 0.04564960300922394, + "learning_rate": 0.0002, + "loss": 0.0746, + "step": 14080 + }, + { + "epoch": 0.025625228756089743, + "grad_norm": 0.16987308859825134, + "learning_rate": 0.0002, + "loss": 0.0642, + "step": 14090 + }, + { + "epoch": 0.02564341557564694, + "grad_norm": 0.017460890114307404, + "learning_rate": 0.0002, + "loss": 0.0218, + "step": 14100 + }, + { + "epoch": 0.025661602395204135, + "grad_norm": 0.15666340291500092, + "learning_rate": 0.0002, + "loss": 0.1572, + "step": 14110 + }, + { + "epoch": 0.02567978921476133, + "grad_norm": 0.06847309321165085, + "learning_rate": 0.0002, + "loss": 0.0744, + "step": 14120 + }, + { + "epoch": 0.025697976034318528, + "grad_norm": 0.03678276389837265, + "learning_rate": 0.0002, + "loss": 0.0718, + "step": 14130 + }, + { + "epoch": 0.025716162853875724, + "grad_norm": 0.1861123889684677, + "learning_rate": 0.0002, + "loss": 0.06, + "step": 14140 + }, + { + "epoch": 0.02573434967343292, + "grad_norm": 0.010294788517057896, + "learning_rate": 0.0002, + "loss": 0.0183, + "step": 14150 + }, + { + "epoch": 0.025752536492990117, + "grad_norm": 0.0643458440899849, + "learning_rate": 0.0002, + "loss": 0.1594, + "step": 14160 + }, + { + "epoch": 0.025770723312547313, + "grad_norm": 0.10639938712120056, + "learning_rate": 0.0002, + "loss": 0.0761, + "step": 14170 + }, + { + "epoch": 0.02578891013210451, + "grad_norm": 0.056529924273490906, + "learning_rate": 0.0002, + "loss": 0.082, + "step": 14180 + }, + { + "epoch": 0.025807096951661706, + "grad_norm": 0.18884658813476562, + "learning_rate": 0.0002, + "loss": 0.0683, + "step": 14190 + }, + { + "epoch": 0.025825283771218902, + "grad_norm": 0.035667784512043, + "learning_rate": 0.0002, + "loss": 0.0263, + "step": 14200 + }, + { + "epoch": 0.0258434705907761, + "grad_norm": 0.14650103449821472, + "learning_rate": 0.0002, + "loss": 0.1314, + "step": 14210 + }, + { + "epoch": 0.025861657410333295, + "grad_norm": 0.12219654768705368, + "learning_rate": 0.0002, + "loss": 0.0755, + "step": 14220 + }, + { + "epoch": 0.02587984422989049, + "grad_norm": 0.05271647870540619, + "learning_rate": 0.0002, + "loss": 0.0789, + "step": 14230 + }, + { + "epoch": 0.025898031049447687, + "grad_norm": 0.1669916957616806, + "learning_rate": 0.0002, + "loss": 0.0641, + "step": 14240 + }, + { + "epoch": 0.025916217869004884, + "grad_norm": 0.035175371915102005, + "learning_rate": 0.0002, + "loss": 0.0222, + "step": 14250 + }, + { + "epoch": 0.025934404688562084, + "grad_norm": 0.14658409357070923, + "learning_rate": 0.0002, + "loss": 0.1382, + "step": 14260 + }, + { + "epoch": 0.02595259150811928, + "grad_norm": 0.07525639981031418, + "learning_rate": 0.0002, + "loss": 0.0815, + "step": 14270 + }, + { + "epoch": 0.025970778327676476, + "grad_norm": 0.02428872510790825, + "learning_rate": 0.0002, + "loss": 0.076, + "step": 14280 + }, + { + "epoch": 0.025988965147233672, + "grad_norm": 0.1825665533542633, + "learning_rate": 0.0002, + "loss": 0.0652, + "step": 14290 + }, + { + "epoch": 0.02600715196679087, + "grad_norm": 0.033867619931697845, + "learning_rate": 0.0002, + "loss": 0.0206, + "step": 14300 + }, + { + "epoch": 0.026025338786348065, + "grad_norm": 0.051891107112169266, + "learning_rate": 0.0002, + "loss": 0.1576, + "step": 14310 + }, + { + "epoch": 0.02604352560590526, + "grad_norm": 0.1111353188753128, + "learning_rate": 0.0002, + "loss": 0.0889, + "step": 14320 + }, + { + "epoch": 0.026061712425462458, + "grad_norm": 0.04253942146897316, + "learning_rate": 0.0002, + "loss": 0.079, + "step": 14330 + }, + { + "epoch": 0.026079899245019654, + "grad_norm": 0.17151106894016266, + "learning_rate": 0.0002, + "loss": 0.0678, + "step": 14340 + }, + { + "epoch": 0.02609808606457685, + "grad_norm": 0.03877005726099014, + "learning_rate": 0.0002, + "loss": 0.0206, + "step": 14350 + }, + { + "epoch": 0.026116272884134047, + "grad_norm": 0.03517235442996025, + "learning_rate": 0.0002, + "loss": 0.1343, + "step": 14360 + }, + { + "epoch": 0.026134459703691243, + "grad_norm": 0.08157488703727722, + "learning_rate": 0.0002, + "loss": 0.0764, + "step": 14370 + }, + { + "epoch": 0.02615264652324844, + "grad_norm": 0.03245632350444794, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 14380 + }, + { + "epoch": 0.026170833342805636, + "grad_norm": 0.20079655945301056, + "learning_rate": 0.0002, + "loss": 0.0691, + "step": 14390 + }, + { + "epoch": 0.026189020162362832, + "grad_norm": 0.03477077558636665, + "learning_rate": 0.0002, + "loss": 0.0232, + "step": 14400 + }, + { + "epoch": 0.026207206981920028, + "grad_norm": 0.14853888750076294, + "learning_rate": 0.0002, + "loss": 0.1436, + "step": 14410 + }, + { + "epoch": 0.026225393801477224, + "grad_norm": 0.12416905164718628, + "learning_rate": 0.0002, + "loss": 0.0755, + "step": 14420 + }, + { + "epoch": 0.02624358062103442, + "grad_norm": 0.03126871958374977, + "learning_rate": 0.0002, + "loss": 0.0762, + "step": 14430 + }, + { + "epoch": 0.026261767440591617, + "grad_norm": 0.20726743340492249, + "learning_rate": 0.0002, + "loss": 0.0614, + "step": 14440 + }, + { + "epoch": 0.026279954260148813, + "grad_norm": 0.039617493748664856, + "learning_rate": 0.0002, + "loss": 0.0181, + "step": 14450 + }, + { + "epoch": 0.02629814107970601, + "grad_norm": 0.08146277070045471, + "learning_rate": 0.0002, + "loss": 0.132, + "step": 14460 + }, + { + "epoch": 0.026316327899263206, + "grad_norm": 0.07181694358587265, + "learning_rate": 0.0002, + "loss": 0.0706, + "step": 14470 + }, + { + "epoch": 0.026334514718820402, + "grad_norm": 0.04080040752887726, + "learning_rate": 0.0002, + "loss": 0.0792, + "step": 14480 + }, + { + "epoch": 0.0263527015383776, + "grad_norm": 0.1903056502342224, + "learning_rate": 0.0002, + "loss": 0.0647, + "step": 14490 + }, + { + "epoch": 0.026370888357934795, + "grad_norm": 0.027256207540631294, + "learning_rate": 0.0002, + "loss": 0.0202, + "step": 14500 + }, + { + "epoch": 0.02638907517749199, + "grad_norm": 0.1434287130832672, + "learning_rate": 0.0002, + "loss": 0.1262, + "step": 14510 + }, + { + "epoch": 0.026407261997049188, + "grad_norm": 0.06977452337741852, + "learning_rate": 0.0002, + "loss": 0.0722, + "step": 14520 + }, + { + "epoch": 0.026425448816606384, + "grad_norm": 0.03453589975833893, + "learning_rate": 0.0002, + "loss": 0.0778, + "step": 14530 + }, + { + "epoch": 0.02644363563616358, + "grad_norm": 0.1455768346786499, + "learning_rate": 0.0002, + "loss": 0.0678, + "step": 14540 + }, + { + "epoch": 0.026461822455720777, + "grad_norm": 0.02977900207042694, + "learning_rate": 0.0002, + "loss": 0.0227, + "step": 14550 + }, + { + "epoch": 0.026480009275277973, + "grad_norm": 0.06667467951774597, + "learning_rate": 0.0002, + "loss": 0.1345, + "step": 14560 + }, + { + "epoch": 0.02649819609483517, + "grad_norm": 0.05125528201460838, + "learning_rate": 0.0002, + "loss": 0.078, + "step": 14570 + }, + { + "epoch": 0.026516382914392365, + "grad_norm": 0.02796974405646324, + "learning_rate": 0.0002, + "loss": 0.0782, + "step": 14580 + }, + { + "epoch": 0.026534569733949562, + "grad_norm": 0.18518763780593872, + "learning_rate": 0.0002, + "loss": 0.0722, + "step": 14590 + }, + { + "epoch": 0.02655275655350676, + "grad_norm": 0.01827179454267025, + "learning_rate": 0.0002, + "loss": 0.0193, + "step": 14600 + }, + { + "epoch": 0.026570943373063958, + "grad_norm": 0.1146678775548935, + "learning_rate": 0.0002, + "loss": 0.1651, + "step": 14610 + }, + { + "epoch": 0.026589130192621154, + "grad_norm": 3.385193109512329, + "learning_rate": 0.0002, + "loss": 0.2165, + "step": 14620 + }, + { + "epoch": 0.02660731701217835, + "grad_norm": 0.3052279055118561, + "learning_rate": 0.0002, + "loss": 0.1489, + "step": 14630 + }, + { + "epoch": 0.026625503831735547, + "grad_norm": 0.12762853503227234, + "learning_rate": 0.0002, + "loss": 0.0693, + "step": 14640 + }, + { + "epoch": 0.026643690651292743, + "grad_norm": 0.003925936296582222, + "learning_rate": 0.0002, + "loss": 0.0078, + "step": 14650 + }, + { + "epoch": 0.02666187747084994, + "grad_norm": 0.28632932901382446, + "learning_rate": 0.0002, + "loss": 0.2533, + "step": 14660 + }, + { + "epoch": 0.026680064290407136, + "grad_norm": 0.037552788853645325, + "learning_rate": 0.0002, + "loss": 0.0852, + "step": 14670 + }, + { + "epoch": 0.026698251109964332, + "grad_norm": 0.0911126434803009, + "learning_rate": 0.0002, + "loss": 0.0751, + "step": 14680 + }, + { + "epoch": 0.02671643792952153, + "grad_norm": 0.18434865772724152, + "learning_rate": 0.0002, + "loss": 0.084, + "step": 14690 + }, + { + "epoch": 0.026734624749078725, + "grad_norm": 0.03813793510198593, + "learning_rate": 0.0002, + "loss": 0.0165, + "step": 14700 + }, + { + "epoch": 0.02675281156863592, + "grad_norm": 0.04764392226934433, + "learning_rate": 0.0002, + "loss": 0.1642, + "step": 14710 + }, + { + "epoch": 0.026770998388193117, + "grad_norm": 0.04611713066697121, + "learning_rate": 0.0002, + "loss": 0.088, + "step": 14720 + }, + { + "epoch": 0.026789185207750314, + "grad_norm": 0.07171179354190826, + "learning_rate": 0.0002, + "loss": 0.1417, + "step": 14730 + }, + { + "epoch": 0.02680737202730751, + "grad_norm": 0.14135649800300598, + "learning_rate": 0.0002, + "loss": 0.0692, + "step": 14740 + }, + { + "epoch": 0.026825558846864706, + "grad_norm": 0.004508219193667173, + "learning_rate": 0.0002, + "loss": 0.016, + "step": 14750 + }, + { + "epoch": 0.026843745666421902, + "grad_norm": 0.09732682257890701, + "learning_rate": 0.0002, + "loss": 0.2089, + "step": 14760 + }, + { + "epoch": 0.0268619324859791, + "grad_norm": 0.12676575779914856, + "learning_rate": 0.0002, + "loss": 0.0849, + "step": 14770 + }, + { + "epoch": 0.026880119305536295, + "grad_norm": 0.0696650817990303, + "learning_rate": 0.0002, + "loss": 0.08, + "step": 14780 + }, + { + "epoch": 0.02689830612509349, + "grad_norm": 0.17883484065532684, + "learning_rate": 0.0002, + "loss": 0.0682, + "step": 14790 + }, + { + "epoch": 0.026916492944650688, + "grad_norm": 0.0567975677549839, + "learning_rate": 0.0002, + "loss": 0.0149, + "step": 14800 + }, + { + "epoch": 0.026934679764207884, + "grad_norm": 0.4884565472602844, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 14810 + }, + { + "epoch": 0.02695286658376508, + "grad_norm": 0.0742981806397438, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 14820 + }, + { + "epoch": 0.026971053403322277, + "grad_norm": 0.030466781929135323, + "learning_rate": 0.0002, + "loss": 0.0818, + "step": 14830 + }, + { + "epoch": 0.026989240222879473, + "grad_norm": 0.13108357787132263, + "learning_rate": 0.0002, + "loss": 0.065, + "step": 14840 + }, + { + "epoch": 0.02700742704243667, + "grad_norm": 0.019065184518694878, + "learning_rate": 0.0002, + "loss": 0.0168, + "step": 14850 + }, + { + "epoch": 0.027025613861993866, + "grad_norm": 0.21891777217388153, + "learning_rate": 0.0002, + "loss": 0.1456, + "step": 14860 + }, + { + "epoch": 0.027043800681551062, + "grad_norm": 0.0836934968829155, + "learning_rate": 0.0002, + "loss": 0.0781, + "step": 14870 + }, + { + "epoch": 0.027061987501108258, + "grad_norm": 0.0643845945596695, + "learning_rate": 0.0002, + "loss": 0.0795, + "step": 14880 + }, + { + "epoch": 0.027080174320665455, + "grad_norm": 0.27108556032180786, + "learning_rate": 0.0002, + "loss": 0.0722, + "step": 14890 + }, + { + "epoch": 0.02709836114022265, + "grad_norm": 0.008289041928946972, + "learning_rate": 0.0002, + "loss": 0.0201, + "step": 14900 + }, + { + "epoch": 0.027116547959779847, + "grad_norm": 0.03284185752272606, + "learning_rate": 0.0002, + "loss": 0.1509, + "step": 14910 + }, + { + "epoch": 0.027134734779337043, + "grad_norm": 0.051129039376974106, + "learning_rate": 0.0002, + "loss": 0.0831, + "step": 14920 + }, + { + "epoch": 0.02715292159889424, + "grad_norm": 0.046401191502809525, + "learning_rate": 0.0002, + "loss": 0.0694, + "step": 14930 + }, + { + "epoch": 0.027171108418451436, + "grad_norm": 0.19945313036441803, + "learning_rate": 0.0002, + "loss": 0.0734, + "step": 14940 + }, + { + "epoch": 0.027189295238008636, + "grad_norm": 0.03877973556518555, + "learning_rate": 0.0002, + "loss": 0.026, + "step": 14950 + }, + { + "epoch": 0.027207482057565832, + "grad_norm": 0.19090695679187775, + "learning_rate": 0.0002, + "loss": 0.136, + "step": 14960 + }, + { + "epoch": 0.02722566887712303, + "grad_norm": 0.11352288722991943, + "learning_rate": 0.0002, + "loss": 0.0797, + "step": 14970 + }, + { + "epoch": 0.027243855696680225, + "grad_norm": 0.055218834429979324, + "learning_rate": 0.0002, + "loss": 0.0763, + "step": 14980 + }, + { + "epoch": 0.02726204251623742, + "grad_norm": 0.1060803234577179, + "learning_rate": 0.0002, + "loss": 0.059, + "step": 14990 + }, + { + "epoch": 0.027280229335794617, + "grad_norm": 0.03370797634124756, + "learning_rate": 0.0002, + "loss": 0.0172, + "step": 15000 + }, + { + "epoch": 0.027298416155351814, + "grad_norm": 0.19884982705116272, + "learning_rate": 0.0002, + "loss": 0.1408, + "step": 15010 + }, + { + "epoch": 0.02731660297490901, + "grad_norm": 0.1186273992061615, + "learning_rate": 0.0002, + "loss": 0.0769, + "step": 15020 + }, + { + "epoch": 0.027334789794466206, + "grad_norm": 0.0494297556579113, + "learning_rate": 0.0002, + "loss": 0.0818, + "step": 15030 + }, + { + "epoch": 0.027352976614023403, + "grad_norm": 0.17990480363368988, + "learning_rate": 0.0002, + "loss": 0.06, + "step": 15040 + }, + { + "epoch": 0.0273711634335806, + "grad_norm": 0.015269913710653782, + "learning_rate": 0.0002, + "loss": 0.0143, + "step": 15050 + }, + { + "epoch": 0.027389350253137795, + "grad_norm": 0.1387794464826584, + "learning_rate": 0.0002, + "loss": 0.171, + "step": 15060 + }, + { + "epoch": 0.02740753707269499, + "grad_norm": 0.11648393422365189, + "learning_rate": 0.0002, + "loss": 0.0769, + "step": 15070 + }, + { + "epoch": 0.027425723892252188, + "grad_norm": 0.04039733111858368, + "learning_rate": 0.0002, + "loss": 0.0707, + "step": 15080 + }, + { + "epoch": 0.027443910711809384, + "grad_norm": 0.19274230301380157, + "learning_rate": 0.0002, + "loss": 0.0657, + "step": 15090 + }, + { + "epoch": 0.02746209753136658, + "grad_norm": 0.03266929090023041, + "learning_rate": 0.0002, + "loss": 0.0155, + "step": 15100 + }, + { + "epoch": 0.027480284350923777, + "grad_norm": 0.44524702429771423, + "learning_rate": 0.0002, + "loss": 0.3075, + "step": 15110 + }, + { + "epoch": 0.027498471170480973, + "grad_norm": 0.15604422986507416, + "learning_rate": 0.0002, + "loss": 0.0874, + "step": 15120 + }, + { + "epoch": 0.02751665799003817, + "grad_norm": 0.043061114847660065, + "learning_rate": 0.0002, + "loss": 0.0814, + "step": 15130 + }, + { + "epoch": 0.027534844809595366, + "grad_norm": 0.2331482172012329, + "learning_rate": 0.0002, + "loss": 0.0638, + "step": 15140 + }, + { + "epoch": 0.027553031629152562, + "grad_norm": 0.011037157848477364, + "learning_rate": 0.0002, + "loss": 0.0197, + "step": 15150 + }, + { + "epoch": 0.02757121844870976, + "grad_norm": 0.0758776143193245, + "learning_rate": 0.0002, + "loss": 0.1481, + "step": 15160 + }, + { + "epoch": 0.027589405268266955, + "grad_norm": 0.18878699839115143, + "learning_rate": 0.0002, + "loss": 0.083, + "step": 15170 + }, + { + "epoch": 0.02760759208782415, + "grad_norm": 0.042469121515750885, + "learning_rate": 0.0002, + "loss": 0.0799, + "step": 15180 + }, + { + "epoch": 0.027625778907381347, + "grad_norm": 0.1603335440158844, + "learning_rate": 0.0002, + "loss": 0.0579, + "step": 15190 + }, + { + "epoch": 0.027643965726938544, + "grad_norm": 0.03533349186182022, + "learning_rate": 0.0002, + "loss": 0.0195, + "step": 15200 + }, + { + "epoch": 0.02766215254649574, + "grad_norm": 0.2014724314212799, + "learning_rate": 0.0002, + "loss": 0.1443, + "step": 15210 + }, + { + "epoch": 0.027680339366052936, + "grad_norm": 0.04604899883270264, + "learning_rate": 0.0002, + "loss": 0.0701, + "step": 15220 + }, + { + "epoch": 0.027698526185610133, + "grad_norm": 0.04726789519190788, + "learning_rate": 0.0002, + "loss": 0.078, + "step": 15230 + }, + { + "epoch": 0.02771671300516733, + "grad_norm": 0.16189764440059662, + "learning_rate": 0.0002, + "loss": 0.0686, + "step": 15240 + }, + { + "epoch": 0.027734899824724525, + "grad_norm": 0.018077973276376724, + "learning_rate": 0.0002, + "loss": 0.0155, + "step": 15250 + }, + { + "epoch": 0.02775308664428172, + "grad_norm": 0.09486963599920273, + "learning_rate": 0.0002, + "loss": 0.1695, + "step": 15260 + }, + { + "epoch": 0.027771273463838918, + "grad_norm": 0.19950449466705322, + "learning_rate": 0.0002, + "loss": 0.0784, + "step": 15270 + }, + { + "epoch": 0.027789460283396114, + "grad_norm": 0.03350493311882019, + "learning_rate": 0.0002, + "loss": 0.0797, + "step": 15280 + }, + { + "epoch": 0.027807647102953314, + "grad_norm": 0.14408868551254272, + "learning_rate": 0.0002, + "loss": 0.0624, + "step": 15290 + }, + { + "epoch": 0.02782583392251051, + "grad_norm": 0.03824521601200104, + "learning_rate": 0.0002, + "loss": 0.0182, + "step": 15300 + }, + { + "epoch": 0.027844020742067706, + "grad_norm": 0.051167964935302734, + "learning_rate": 0.0002, + "loss": 0.1342, + "step": 15310 + }, + { + "epoch": 0.027862207561624903, + "grad_norm": 0.08440420031547546, + "learning_rate": 0.0002, + "loss": 0.0775, + "step": 15320 + }, + { + "epoch": 0.0278803943811821, + "grad_norm": 0.05162487551569939, + "learning_rate": 0.0002, + "loss": 0.0824, + "step": 15330 + }, + { + "epoch": 0.027898581200739295, + "grad_norm": 0.1576220989227295, + "learning_rate": 0.0002, + "loss": 0.0607, + "step": 15340 + }, + { + "epoch": 0.02791676802029649, + "grad_norm": 0.03840797394514084, + "learning_rate": 0.0002, + "loss": 0.0197, + "step": 15350 + }, + { + "epoch": 0.027934954839853688, + "grad_norm": 0.1418246179819107, + "learning_rate": 0.0002, + "loss": 0.151, + "step": 15360 + }, + { + "epoch": 0.027953141659410884, + "grad_norm": 0.07326096296310425, + "learning_rate": 0.0002, + "loss": 0.0764, + "step": 15370 + }, + { + "epoch": 0.02797132847896808, + "grad_norm": 0.0582844614982605, + "learning_rate": 0.0002, + "loss": 0.0745, + "step": 15380 + }, + { + "epoch": 0.027989515298525277, + "grad_norm": 0.2234935164451599, + "learning_rate": 0.0002, + "loss": 0.0687, + "step": 15390 + }, + { + "epoch": 0.028007702118082473, + "grad_norm": 0.04384669288992882, + "learning_rate": 0.0002, + "loss": 0.023, + "step": 15400 + }, + { + "epoch": 0.02802588893763967, + "grad_norm": 0.14306089282035828, + "learning_rate": 0.0002, + "loss": 0.1477, + "step": 15410 + }, + { + "epoch": 0.028044075757196866, + "grad_norm": 0.1326105296611786, + "learning_rate": 0.0002, + "loss": 0.0784, + "step": 15420 + }, + { + "epoch": 0.028062262576754062, + "grad_norm": 0.05531894043087959, + "learning_rate": 0.0002, + "loss": 0.0813, + "step": 15430 + }, + { + "epoch": 0.02808044939631126, + "grad_norm": 0.14875297248363495, + "learning_rate": 0.0002, + "loss": 0.0622, + "step": 15440 + }, + { + "epoch": 0.028098636215868455, + "grad_norm": 0.03749268501996994, + "learning_rate": 0.0002, + "loss": 0.0181, + "step": 15450 + }, + { + "epoch": 0.02811682303542565, + "grad_norm": 0.05747106671333313, + "learning_rate": 0.0002, + "loss": 0.1157, + "step": 15460 + }, + { + "epoch": 0.028135009854982847, + "grad_norm": 0.06197863444685936, + "learning_rate": 0.0002, + "loss": 0.0779, + "step": 15470 + }, + { + "epoch": 0.028153196674540044, + "grad_norm": 0.09997677057981491, + "learning_rate": 0.0002, + "loss": 0.0784, + "step": 15480 + }, + { + "epoch": 0.02817138349409724, + "grad_norm": 0.18067684769630432, + "learning_rate": 0.0002, + "loss": 0.0728, + "step": 15490 + }, + { + "epoch": 0.028189570313654436, + "grad_norm": 0.03378088399767876, + "learning_rate": 0.0002, + "loss": 0.0252, + "step": 15500 + }, + { + "epoch": 0.028207757133211633, + "grad_norm": 0.14048723876476288, + "learning_rate": 0.0002, + "loss": 0.1392, + "step": 15510 + }, + { + "epoch": 0.02822594395276883, + "grad_norm": 0.09573493152856827, + "learning_rate": 0.0002, + "loss": 0.0751, + "step": 15520 + }, + { + "epoch": 0.028244130772326025, + "grad_norm": 0.11000777781009674, + "learning_rate": 0.0002, + "loss": 0.08, + "step": 15530 + }, + { + "epoch": 0.02826231759188322, + "grad_norm": 0.17712855339050293, + "learning_rate": 0.0002, + "loss": 0.0658, + "step": 15540 + }, + { + "epoch": 0.028280504411440418, + "grad_norm": 0.0183733981102705, + "learning_rate": 0.0002, + "loss": 0.0188, + "step": 15550 + }, + { + "epoch": 0.028298691230997614, + "grad_norm": 0.15027762949466705, + "learning_rate": 0.0002, + "loss": 0.1235, + "step": 15560 + }, + { + "epoch": 0.02831687805055481, + "grad_norm": 0.10586661100387573, + "learning_rate": 0.0002, + "loss": 0.0791, + "step": 15570 + }, + { + "epoch": 0.028335064870112007, + "grad_norm": 0.031083540990948677, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 15580 + }, + { + "epoch": 0.028353251689669203, + "grad_norm": 0.12294827401638031, + "learning_rate": 0.0002, + "loss": 0.0615, + "step": 15590 + }, + { + "epoch": 0.0283714385092264, + "grad_norm": 0.03652534633874893, + "learning_rate": 0.0002, + "loss": 0.0203, + "step": 15600 + }, + { + "epoch": 0.028389625328783596, + "grad_norm": 0.046638645231723785, + "learning_rate": 0.0002, + "loss": 0.1327, + "step": 15610 + }, + { + "epoch": 0.028407812148340792, + "grad_norm": 0.07200415432453156, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 15620 + }, + { + "epoch": 0.028425998967897992, + "grad_norm": 0.040679559111595154, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 15630 + }, + { + "epoch": 0.028444185787455188, + "grad_norm": 0.1572960615158081, + "learning_rate": 0.0002, + "loss": 0.0637, + "step": 15640 + }, + { + "epoch": 0.028462372607012384, + "grad_norm": 0.036091506481170654, + "learning_rate": 0.0002, + "loss": 0.0266, + "step": 15650 + }, + { + "epoch": 0.02848055942656958, + "grad_norm": 0.10555437207221985, + "learning_rate": 0.0002, + "loss": 0.1093, + "step": 15660 + }, + { + "epoch": 0.028498746246126777, + "grad_norm": 0.08854329586029053, + "learning_rate": 0.0002, + "loss": 0.0741, + "step": 15670 + }, + { + "epoch": 0.028516933065683973, + "grad_norm": 0.02908560261130333, + "learning_rate": 0.0002, + "loss": 0.0732, + "step": 15680 + }, + { + "epoch": 0.02853511988524117, + "grad_norm": 0.1568380743265152, + "learning_rate": 0.0002, + "loss": 0.0586, + "step": 15690 + }, + { + "epoch": 0.028553306704798366, + "grad_norm": 0.04985487833619118, + "learning_rate": 0.0002, + "loss": 0.0247, + "step": 15700 + }, + { + "epoch": 0.028571493524355562, + "grad_norm": 0.07582605630159378, + "learning_rate": 0.0002, + "loss": 0.1196, + "step": 15710 + }, + { + "epoch": 0.02858968034391276, + "grad_norm": 0.02401849813759327, + "learning_rate": 0.0002, + "loss": 0.075, + "step": 15720 + }, + { + "epoch": 0.028607867163469955, + "grad_norm": 0.032545965164899826, + "learning_rate": 0.0002, + "loss": 0.0761, + "step": 15730 + }, + { + "epoch": 0.02862605398302715, + "grad_norm": 0.1098649650812149, + "learning_rate": 0.0002, + "loss": 0.0599, + "step": 15740 + }, + { + "epoch": 0.028644240802584348, + "grad_norm": 0.021166007965803146, + "learning_rate": 0.0002, + "loss": 0.0169, + "step": 15750 + }, + { + "epoch": 0.028662427622141544, + "grad_norm": 0.0823541134595871, + "learning_rate": 0.0002, + "loss": 0.1337, + "step": 15760 + }, + { + "epoch": 0.02868061444169874, + "grad_norm": 0.1009572371840477, + "learning_rate": 0.0002, + "loss": 0.0779, + "step": 15770 + }, + { + "epoch": 0.028698801261255937, + "grad_norm": 0.09160738438367844, + "learning_rate": 0.0002, + "loss": 0.0766, + "step": 15780 + }, + { + "epoch": 0.028716988080813133, + "grad_norm": 0.14419673383235931, + "learning_rate": 0.0002, + "loss": 0.0594, + "step": 15790 + }, + { + "epoch": 0.02873517490037033, + "grad_norm": 0.01628550887107849, + "learning_rate": 0.0002, + "loss": 0.0218, + "step": 15800 + }, + { + "epoch": 0.028753361719927525, + "grad_norm": 0.15207678079605103, + "learning_rate": 0.0002, + "loss": 0.1262, + "step": 15810 + }, + { + "epoch": 0.028771548539484722, + "grad_norm": 0.14951761066913605, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 15820 + }, + { + "epoch": 0.028789735359041918, + "grad_norm": 0.028078215196728706, + "learning_rate": 0.0002, + "loss": 0.0783, + "step": 15830 + }, + { + "epoch": 0.028807922178599114, + "grad_norm": 0.16079741716384888, + "learning_rate": 0.0002, + "loss": 0.0633, + "step": 15840 + }, + { + "epoch": 0.02882610899815631, + "grad_norm": 0.04218870773911476, + "learning_rate": 0.0002, + "loss": 0.0217, + "step": 15850 + }, + { + "epoch": 0.028844295817713507, + "grad_norm": 0.13758492469787598, + "learning_rate": 0.0002, + "loss": 0.1358, + "step": 15860 + }, + { + "epoch": 0.028862482637270703, + "grad_norm": 0.10366559028625488, + "learning_rate": 0.0002, + "loss": 0.0787, + "step": 15870 + }, + { + "epoch": 0.0288806694568279, + "grad_norm": 0.04433147609233856, + "learning_rate": 0.0002, + "loss": 0.0781, + "step": 15880 + }, + { + "epoch": 0.028898856276385096, + "grad_norm": 0.16709402203559875, + "learning_rate": 0.0002, + "loss": 0.0684, + "step": 15890 + }, + { + "epoch": 0.028917043095942292, + "grad_norm": 0.03370310738682747, + "learning_rate": 0.0002, + "loss": 0.0191, + "step": 15900 + }, + { + "epoch": 0.02893522991549949, + "grad_norm": 0.15469267964363098, + "learning_rate": 0.0002, + "loss": 0.1487, + "step": 15910 + }, + { + "epoch": 0.028953416735056685, + "grad_norm": 0.19974654912948608, + "learning_rate": 0.0002, + "loss": 0.0769, + "step": 15920 + }, + { + "epoch": 0.02897160355461388, + "grad_norm": 0.04307623952627182, + "learning_rate": 0.0002, + "loss": 0.075, + "step": 15930 + }, + { + "epoch": 0.028989790374171077, + "grad_norm": 0.21828149259090424, + "learning_rate": 0.0002, + "loss": 0.0691, + "step": 15940 + }, + { + "epoch": 0.029007977193728274, + "grad_norm": 0.0268656387925148, + "learning_rate": 0.0002, + "loss": 0.022, + "step": 15950 + }, + { + "epoch": 0.02902616401328547, + "grad_norm": 0.11213699728250504, + "learning_rate": 0.0002, + "loss": 0.1326, + "step": 15960 + }, + { + "epoch": 0.029044350832842666, + "grad_norm": 0.2018963098526001, + "learning_rate": 0.0002, + "loss": 0.0772, + "step": 15970 + }, + { + "epoch": 0.029062537652399866, + "grad_norm": 0.06034110113978386, + "learning_rate": 0.0002, + "loss": 0.0712, + "step": 15980 + }, + { + "epoch": 0.029080724471957062, + "grad_norm": 0.1817707121372223, + "learning_rate": 0.0002, + "loss": 0.0692, + "step": 15990 + }, + { + "epoch": 0.02909891129151426, + "grad_norm": 0.03466440737247467, + "learning_rate": 0.0002, + "loss": 0.0205, + "step": 16000 + }, + { + "epoch": 0.029117098111071455, + "grad_norm": 0.1375580132007599, + "learning_rate": 0.0002, + "loss": 0.1499, + "step": 16010 + }, + { + "epoch": 0.02913528493062865, + "grad_norm": 0.14308910071849823, + "learning_rate": 0.0002, + "loss": 0.083, + "step": 16020 + }, + { + "epoch": 0.029153471750185848, + "grad_norm": 0.041022926568984985, + "learning_rate": 0.0002, + "loss": 0.0785, + "step": 16030 + }, + { + "epoch": 0.029171658569743044, + "grad_norm": 0.1701498180627823, + "learning_rate": 0.0002, + "loss": 0.0656, + "step": 16040 + }, + { + "epoch": 0.02918984538930024, + "grad_norm": 0.023075805976986885, + "learning_rate": 0.0002, + "loss": 0.0225, + "step": 16050 + }, + { + "epoch": 0.029208032208857437, + "grad_norm": 0.05303549766540527, + "learning_rate": 0.0002, + "loss": 0.1369, + "step": 16060 + }, + { + "epoch": 0.029226219028414633, + "grad_norm": 0.044178470969200134, + "learning_rate": 0.0002, + "loss": 0.0754, + "step": 16070 + }, + { + "epoch": 0.02924440584797183, + "grad_norm": 0.03951259329915047, + "learning_rate": 0.0002, + "loss": 0.0759, + "step": 16080 + }, + { + "epoch": 0.029262592667529026, + "grad_norm": 0.13762067258358002, + "learning_rate": 0.0002, + "loss": 0.0605, + "step": 16090 + }, + { + "epoch": 0.029280779487086222, + "grad_norm": 0.021227868273854256, + "learning_rate": 0.0002, + "loss": 0.0173, + "step": 16100 + }, + { + "epoch": 0.029298966306643418, + "grad_norm": 0.19493195414543152, + "learning_rate": 0.0002, + "loss": 0.1307, + "step": 16110 + }, + { + "epoch": 0.029317153126200615, + "grad_norm": 0.09980791062116623, + "learning_rate": 0.0002, + "loss": 0.0724, + "step": 16120 + }, + { + "epoch": 0.02933533994575781, + "grad_norm": 0.08762095868587494, + "learning_rate": 0.0002, + "loss": 0.0734, + "step": 16130 + }, + { + "epoch": 0.029353526765315007, + "grad_norm": 0.14261308312416077, + "learning_rate": 0.0002, + "loss": 0.071, + "step": 16140 + }, + { + "epoch": 0.029371713584872203, + "grad_norm": 0.033154651522636414, + "learning_rate": 0.0002, + "loss": 0.0238, + "step": 16150 + }, + { + "epoch": 0.0293899004044294, + "grad_norm": 0.1422877162694931, + "learning_rate": 0.0002, + "loss": 0.1285, + "step": 16160 + }, + { + "epoch": 0.029408087223986596, + "grad_norm": 0.1342266947031021, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 16170 + }, + { + "epoch": 0.029426274043543792, + "grad_norm": 0.031525906175374985, + "learning_rate": 0.0002, + "loss": 0.0772, + "step": 16180 + }, + { + "epoch": 0.02944446086310099, + "grad_norm": 0.14790122210979462, + "learning_rate": 0.0002, + "loss": 0.0627, + "step": 16190 + }, + { + "epoch": 0.029462647682658185, + "grad_norm": 0.025354932993650436, + "learning_rate": 0.0002, + "loss": 0.0212, + "step": 16200 + }, + { + "epoch": 0.02948083450221538, + "grad_norm": 0.1287624090909958, + "learning_rate": 0.0002, + "loss": 0.1457, + "step": 16210 + }, + { + "epoch": 0.029499021321772578, + "grad_norm": 0.1079782247543335, + "learning_rate": 0.0002, + "loss": 0.0819, + "step": 16220 + }, + { + "epoch": 0.029517208141329774, + "grad_norm": 0.04884497448801994, + "learning_rate": 0.0002, + "loss": 0.0843, + "step": 16230 + }, + { + "epoch": 0.02953539496088697, + "grad_norm": 0.14452646672725677, + "learning_rate": 0.0002, + "loss": 0.0664, + "step": 16240 + }, + { + "epoch": 0.029553581780444167, + "grad_norm": 0.029236188158392906, + "learning_rate": 0.0002, + "loss": 0.0182, + "step": 16250 + }, + { + "epoch": 0.029571768600001363, + "grad_norm": 0.18048252165317535, + "learning_rate": 0.0002, + "loss": 0.1382, + "step": 16260 + }, + { + "epoch": 0.02958995541955856, + "grad_norm": 0.08402508497238159, + "learning_rate": 0.0002, + "loss": 0.078, + "step": 16270 + }, + { + "epoch": 0.029608142239115755, + "grad_norm": 0.07740433514118195, + "learning_rate": 0.0002, + "loss": 0.0776, + "step": 16280 + }, + { + "epoch": 0.029626329058672952, + "grad_norm": 0.1414123773574829, + "learning_rate": 0.0002, + "loss": 0.0611, + "step": 16290 + }, + { + "epoch": 0.029644515878230148, + "grad_norm": 0.03296574577689171, + "learning_rate": 0.0002, + "loss": 0.0228, + "step": 16300 + }, + { + "epoch": 0.029662702697787344, + "grad_norm": 0.09312735497951508, + "learning_rate": 0.0002, + "loss": 0.1213, + "step": 16310 + }, + { + "epoch": 0.029680889517344544, + "grad_norm": 0.07857484370470047, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 16320 + }, + { + "epoch": 0.02969907633690174, + "grad_norm": 0.0680379793047905, + "learning_rate": 0.0002, + "loss": 0.0774, + "step": 16330 + }, + { + "epoch": 0.029717263156458937, + "grad_norm": 0.18506748974323273, + "learning_rate": 0.0002, + "loss": 0.0675, + "step": 16340 + }, + { + "epoch": 0.029735449976016133, + "grad_norm": 0.029233543202280998, + "learning_rate": 0.0002, + "loss": 0.0187, + "step": 16350 + }, + { + "epoch": 0.02975363679557333, + "grad_norm": 0.1133171021938324, + "learning_rate": 0.0002, + "loss": 0.1217, + "step": 16360 + }, + { + "epoch": 0.029771823615130526, + "grad_norm": 0.06985988467931747, + "learning_rate": 0.0002, + "loss": 0.0761, + "step": 16370 + }, + { + "epoch": 0.029790010434687722, + "grad_norm": 0.13158757984638214, + "learning_rate": 0.0002, + "loss": 0.0764, + "step": 16380 + }, + { + "epoch": 0.02980819725424492, + "grad_norm": 0.19751304388046265, + "learning_rate": 0.0002, + "loss": 0.0652, + "step": 16390 + }, + { + "epoch": 0.029826384073802115, + "grad_norm": 0.019567493349313736, + "learning_rate": 0.0002, + "loss": 0.0166, + "step": 16400 + }, + { + "epoch": 0.02984457089335931, + "grad_norm": 0.1859702467918396, + "learning_rate": 0.0002, + "loss": 0.1482, + "step": 16410 + }, + { + "epoch": 0.029862757712916507, + "grad_norm": 0.03211350366473198, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 16420 + }, + { + "epoch": 0.029880944532473704, + "grad_norm": 0.10664219409227371, + "learning_rate": 0.0002, + "loss": 0.075, + "step": 16430 + }, + { + "epoch": 0.0298991313520309, + "grad_norm": 0.18254978954792023, + "learning_rate": 0.0002, + "loss": 0.0666, + "step": 16440 + }, + { + "epoch": 0.029917318171588096, + "grad_norm": 0.03076091594994068, + "learning_rate": 0.0002, + "loss": 0.0217, + "step": 16450 + }, + { + "epoch": 0.029935504991145293, + "grad_norm": 0.11172248423099518, + "learning_rate": 0.0002, + "loss": 0.1115, + "step": 16460 + }, + { + "epoch": 0.02995369181070249, + "grad_norm": 0.1121174767613411, + "learning_rate": 0.0002, + "loss": 0.0838, + "step": 16470 + }, + { + "epoch": 0.029971878630259685, + "grad_norm": 0.05544061213731766, + "learning_rate": 0.0002, + "loss": 0.0773, + "step": 16480 + }, + { + "epoch": 0.02999006544981688, + "grad_norm": 0.13899610936641693, + "learning_rate": 0.0002, + "loss": 0.0648, + "step": 16490 + }, + { + "epoch": 0.030008252269374078, + "grad_norm": 0.031017031520605087, + "learning_rate": 0.0002, + "loss": 0.0205, + "step": 16500 + }, + { + "epoch": 0.030026439088931274, + "grad_norm": 0.5919166803359985, + "learning_rate": 0.0002, + "loss": 0.1454, + "step": 16510 + }, + { + "epoch": 0.03004462590848847, + "grad_norm": 2.5127646923065186, + "learning_rate": 0.0002, + "loss": 0.0925, + "step": 16520 + }, + { + "epoch": 0.030062812728045667, + "grad_norm": 0.12587642669677734, + "learning_rate": 0.0002, + "loss": 0.0896, + "step": 16530 + }, + { + "epoch": 0.030080999547602863, + "grad_norm": 0.29352524876594543, + "learning_rate": 0.0002, + "loss": 0.0692, + "step": 16540 + }, + { + "epoch": 0.03009918636716006, + "grad_norm": 0.012585405260324478, + "learning_rate": 0.0002, + "loss": 0.021, + "step": 16550 + }, + { + "epoch": 0.030117373186717256, + "grad_norm": 2.432018756866455, + "learning_rate": 0.0002, + "loss": 0.239, + "step": 16560 + }, + { + "epoch": 0.030135560006274452, + "grad_norm": 0.09337054193019867, + "learning_rate": 0.0002, + "loss": 0.0859, + "step": 16570 + }, + { + "epoch": 0.030153746825831648, + "grad_norm": 0.05135548114776611, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 16580 + }, + { + "epoch": 0.030171933645388845, + "grad_norm": 0.15056684613227844, + "learning_rate": 0.0002, + "loss": 0.0697, + "step": 16590 + }, + { + "epoch": 0.03019012046494604, + "grad_norm": 5.883757694391534e-05, + "learning_rate": 0.0002, + "loss": 0.0085, + "step": 16600 + }, + { + "epoch": 0.030208307284503237, + "grad_norm": 1.0368543863296509, + "learning_rate": 0.0002, + "loss": 0.1861, + "step": 16610 + }, + { + "epoch": 0.030226494104060433, + "grad_norm": 0.07987317442893982, + "learning_rate": 0.0002, + "loss": 0.0938, + "step": 16620 + }, + { + "epoch": 0.03024468092361763, + "grad_norm": 0.02812887355685234, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 16630 + }, + { + "epoch": 0.030262867743174826, + "grad_norm": 0.24061231315135956, + "learning_rate": 0.0002, + "loss": 0.0653, + "step": 16640 + }, + { + "epoch": 0.030281054562732022, + "grad_norm": 0.0402507558465004, + "learning_rate": 0.0002, + "loss": 0.0266, + "step": 16650 + }, + { + "epoch": 0.030299241382289222, + "grad_norm": 0.13552093505859375, + "learning_rate": 0.0002, + "loss": 0.1709, + "step": 16660 + }, + { + "epoch": 0.03031742820184642, + "grad_norm": 0.6093604564666748, + "learning_rate": 0.0002, + "loss": 0.0857, + "step": 16670 + }, + { + "epoch": 0.030335615021403615, + "grad_norm": 0.11608528345823288, + "learning_rate": 0.0002, + "loss": 0.0874, + "step": 16680 + }, + { + "epoch": 0.03035380184096081, + "grad_norm": 0.23376339673995972, + "learning_rate": 0.0002, + "loss": 0.0688, + "step": 16690 + }, + { + "epoch": 0.030371988660518007, + "grad_norm": 0.03484225273132324, + "learning_rate": 0.0002, + "loss": 0.0172, + "step": 16700 + }, + { + "epoch": 0.030390175480075204, + "grad_norm": 0.30532532930374146, + "learning_rate": 0.0002, + "loss": 0.1686, + "step": 16710 + }, + { + "epoch": 0.0304083622996324, + "grad_norm": 0.05142231658101082, + "learning_rate": 0.0002, + "loss": 0.0766, + "step": 16720 + }, + { + "epoch": 0.030426549119189596, + "grad_norm": 0.08218207955360413, + "learning_rate": 0.0002, + "loss": 0.0839, + "step": 16730 + }, + { + "epoch": 0.030444735938746793, + "grad_norm": 0.15296520292758942, + "learning_rate": 0.0002, + "loss": 0.0717, + "step": 16740 + }, + { + "epoch": 0.03046292275830399, + "grad_norm": 0.009951476007699966, + "learning_rate": 0.0002, + "loss": 0.0103, + "step": 16750 + }, + { + "epoch": 0.030481109577861185, + "grad_norm": 0.18752850592136383, + "learning_rate": 0.0002, + "loss": 0.2382, + "step": 16760 + }, + { + "epoch": 0.03049929639741838, + "grad_norm": 0.1473335325717926, + "learning_rate": 0.0002, + "loss": 0.0975, + "step": 16770 + }, + { + "epoch": 0.030517483216975578, + "grad_norm": 0.04578230902552605, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 16780 + }, + { + "epoch": 0.030535670036532774, + "grad_norm": 0.2557182312011719, + "learning_rate": 0.0002, + "loss": 0.0691, + "step": 16790 + }, + { + "epoch": 0.03055385685608997, + "grad_norm": 1.473021388053894, + "learning_rate": 0.0002, + "loss": 0.2088, + "step": 16800 + }, + { + "epoch": 0.030572043675647167, + "grad_norm": 1.0227181911468506, + "learning_rate": 0.0002, + "loss": 0.7207, + "step": 16810 + }, + { + "epoch": 0.030590230495204363, + "grad_norm": 0.11395780742168427, + "learning_rate": 0.0002, + "loss": 0.0943, + "step": 16820 + }, + { + "epoch": 0.03060841731476156, + "grad_norm": 6.501937389373779, + "learning_rate": 0.0002, + "loss": 0.0871, + "step": 16830 + }, + { + "epoch": 0.030626604134318756, + "grad_norm": 0.17187578976154327, + "learning_rate": 0.0002, + "loss": 0.0672, + "step": 16840 + }, + { + "epoch": 0.030644790953875952, + "grad_norm": 0.03396519273519516, + "learning_rate": 0.0002, + "loss": 0.0224, + "step": 16850 + }, + { + "epoch": 0.03066297777343315, + "grad_norm": 3.397012948989868, + "learning_rate": 0.0002, + "loss": 0.1641, + "step": 16860 + }, + { + "epoch": 0.030681164592990345, + "grad_norm": 0.44838130474090576, + "learning_rate": 0.0002, + "loss": 0.0868, + "step": 16870 + }, + { + "epoch": 0.03069935141254754, + "grad_norm": 0.08598771691322327, + "learning_rate": 0.0002, + "loss": 0.0766, + "step": 16880 + }, + { + "epoch": 0.030717538232104737, + "grad_norm": 0.15339739620685577, + "learning_rate": 0.0002, + "loss": 0.0609, + "step": 16890 + }, + { + "epoch": 0.030735725051661934, + "grad_norm": 0.04086040332913399, + "learning_rate": 0.0002, + "loss": 0.0218, + "step": 16900 + }, + { + "epoch": 0.03075391187121913, + "grad_norm": 0.40313076972961426, + "learning_rate": 0.0002, + "loss": 0.2017, + "step": 16910 + }, + { + "epoch": 0.030772098690776326, + "grad_norm": 0.2068721503019333, + "learning_rate": 0.0002, + "loss": 0.0906, + "step": 16920 + }, + { + "epoch": 0.030790285510333523, + "grad_norm": 0.12770770490169525, + "learning_rate": 0.0002, + "loss": 0.0801, + "step": 16930 + }, + { + "epoch": 0.03080847232989072, + "grad_norm": 17.294641494750977, + "learning_rate": 0.0002, + "loss": 0.0701, + "step": 16940 + }, + { + "epoch": 0.030826659149447915, + "grad_norm": 0.04612286388874054, + "learning_rate": 0.0002, + "loss": 0.0287, + "step": 16950 + }, + { + "epoch": 0.03084484596900511, + "grad_norm": 0.10311487317085266, + "learning_rate": 0.0002, + "loss": 0.136, + "step": 16960 + }, + { + "epoch": 0.030863032788562308, + "grad_norm": 0.20878446102142334, + "learning_rate": 0.0002, + "loss": 0.0886, + "step": 16970 + }, + { + "epoch": 0.030881219608119504, + "grad_norm": 1.412353515625, + "learning_rate": 0.0002, + "loss": 0.0843, + "step": 16980 + }, + { + "epoch": 0.0308994064276767, + "grad_norm": 0.27046918869018555, + "learning_rate": 0.0002, + "loss": 0.0755, + "step": 16990 + }, + { + "epoch": 0.030917593247233897, + "grad_norm": 0.5227788090705872, + "learning_rate": 0.0002, + "loss": 0.0234, + "step": 17000 + }, + { + "epoch": 0.030935780066791096, + "grad_norm": 0.16006655991077423, + "learning_rate": 0.0002, + "loss": 0.183, + "step": 17010 + }, + { + "epoch": 0.030953966886348293, + "grad_norm": 0.1297607421875, + "learning_rate": 0.0002, + "loss": 0.0868, + "step": 17020 + }, + { + "epoch": 0.03097215370590549, + "grad_norm": 11.198999404907227, + "learning_rate": 0.0002, + "loss": 0.0998, + "step": 17030 + }, + { + "epoch": 0.030990340525462685, + "grad_norm": 0.39887136220932007, + "learning_rate": 0.0002, + "loss": 0.0898, + "step": 17040 + }, + { + "epoch": 0.03100852734501988, + "grad_norm": 0.009262642823159695, + "learning_rate": 0.0002, + "loss": 0.0215, + "step": 17050 + }, + { + "epoch": 0.031026714164577078, + "grad_norm": 0.15820527076721191, + "learning_rate": 0.0002, + "loss": 0.2017, + "step": 17060 + }, + { + "epoch": 0.031044900984134274, + "grad_norm": 0.11645558476448059, + "learning_rate": 0.0002, + "loss": 0.085, + "step": 17070 + }, + { + "epoch": 0.03106308780369147, + "grad_norm": 0.03981775790452957, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 17080 + }, + { + "epoch": 0.031081274623248667, + "grad_norm": 0.1584177166223526, + "learning_rate": 0.0002, + "loss": 0.0635, + "step": 17090 + }, + { + "epoch": 0.031099461442805863, + "grad_norm": 0.0005907397717237473, + "learning_rate": 0.0002, + "loss": 0.006, + "step": 17100 + }, + { + "epoch": 0.03111764826236306, + "grad_norm": 0.05344061553478241, + "learning_rate": 0.0002, + "loss": 0.3098, + "step": 17110 + }, + { + "epoch": 0.031135835081920256, + "grad_norm": 0.05249408632516861, + "learning_rate": 0.0002, + "loss": 0.1002, + "step": 17120 + }, + { + "epoch": 0.031154021901477452, + "grad_norm": 0.04177263006567955, + "learning_rate": 0.0002, + "loss": 0.0969, + "step": 17130 + }, + { + "epoch": 0.03117220872103465, + "grad_norm": 0.18396486341953278, + "learning_rate": 0.0002, + "loss": 0.0727, + "step": 17140 + }, + { + "epoch": 0.031190395540591845, + "grad_norm": 0.0019848416559398174, + "learning_rate": 0.0002, + "loss": 0.0092, + "step": 17150 + }, + { + "epoch": 0.03120858236014904, + "grad_norm": 0.23747271299362183, + "learning_rate": 0.0002, + "loss": 0.3243, + "step": 17160 + }, + { + "epoch": 0.031226769179706237, + "grad_norm": 0.2365376353263855, + "learning_rate": 0.0002, + "loss": 0.094, + "step": 17170 + }, + { + "epoch": 0.031244955999263434, + "grad_norm": 0.21784919500350952, + "learning_rate": 0.0002, + "loss": 0.0795, + "step": 17180 + }, + { + "epoch": 0.03126314281882063, + "grad_norm": 0.27253153920173645, + "learning_rate": 0.0002, + "loss": 0.0748, + "step": 17190 + }, + { + "epoch": 0.031281329638377826, + "grad_norm": 0.004298684187233448, + "learning_rate": 0.0002, + "loss": 0.014, + "step": 17200 + }, + { + "epoch": 0.03129951645793502, + "grad_norm": 0.267871230840683, + "learning_rate": 0.0002, + "loss": 0.2938, + "step": 17210 + }, + { + "epoch": 0.03131770327749222, + "grad_norm": 0.1428530067205429, + "learning_rate": 0.0002, + "loss": 0.0901, + "step": 17220 + }, + { + "epoch": 0.031335890097049415, + "grad_norm": 0.10623782873153687, + "learning_rate": 0.0002, + "loss": 0.0752, + "step": 17230 + }, + { + "epoch": 0.03135407691660661, + "grad_norm": 0.2869247496128082, + "learning_rate": 0.0002, + "loss": 0.0707, + "step": 17240 + }, + { + "epoch": 0.03137226373616381, + "grad_norm": 0.011321209371089935, + "learning_rate": 0.0002, + "loss": 0.0168, + "step": 17250 + }, + { + "epoch": 0.031390450555721004, + "grad_norm": 0.09432020783424377, + "learning_rate": 0.0002, + "loss": 0.2046, + "step": 17260 + }, + { + "epoch": 0.0314086373752782, + "grad_norm": 0.190867081284523, + "learning_rate": 0.0002, + "loss": 0.0866, + "step": 17270 + }, + { + "epoch": 0.0314268241948354, + "grad_norm": 0.14274829626083374, + "learning_rate": 0.0002, + "loss": 0.0796, + "step": 17280 + }, + { + "epoch": 0.03144501101439259, + "grad_norm": 0.29910504817962646, + "learning_rate": 0.0002, + "loss": 0.0711, + "step": 17290 + }, + { + "epoch": 0.03146319783394979, + "grad_norm": 0.031730011105537415, + "learning_rate": 0.0002, + "loss": 0.0217, + "step": 17300 + }, + { + "epoch": 0.031481384653506986, + "grad_norm": 0.23042625188827515, + "learning_rate": 0.0002, + "loss": 0.1491, + "step": 17310 + }, + { + "epoch": 0.03149957147306418, + "grad_norm": 0.15560220181941986, + "learning_rate": 0.0002, + "loss": 0.0761, + "step": 17320 + }, + { + "epoch": 0.03151775829262138, + "grad_norm": 0.051929160952568054, + "learning_rate": 0.0002, + "loss": 0.0893, + "step": 17330 + }, + { + "epoch": 0.031535945112178575, + "grad_norm": 0.16162756085395813, + "learning_rate": 0.0002, + "loss": 0.0623, + "step": 17340 + }, + { + "epoch": 0.03155413193173577, + "grad_norm": 0.019480068236589432, + "learning_rate": 0.0002, + "loss": 0.0137, + "step": 17350 + }, + { + "epoch": 0.03157231875129297, + "grad_norm": 0.24700693786144257, + "learning_rate": 0.0002, + "loss": 0.1481, + "step": 17360 + }, + { + "epoch": 0.031590505570850164, + "grad_norm": 0.17574873566627502, + "learning_rate": 0.0002, + "loss": 0.079, + "step": 17370 + }, + { + "epoch": 0.03160869239040736, + "grad_norm": 0.10368580371141434, + "learning_rate": 0.0002, + "loss": 0.0811, + "step": 17380 + }, + { + "epoch": 0.031626879209964556, + "grad_norm": 0.23330622911453247, + "learning_rate": 0.0002, + "loss": 0.0669, + "step": 17390 + }, + { + "epoch": 0.03164506602952175, + "grad_norm": 0.031393859535455704, + "learning_rate": 0.0002, + "loss": 0.0183, + "step": 17400 + }, + { + "epoch": 0.03166325284907895, + "grad_norm": 0.22080129384994507, + "learning_rate": 0.0002, + "loss": 0.1567, + "step": 17410 + }, + { + "epoch": 0.031681439668636145, + "grad_norm": 0.177025705575943, + "learning_rate": 0.0002, + "loss": 0.0798, + "step": 17420 + }, + { + "epoch": 0.03169962648819334, + "grad_norm": 0.054285600781440735, + "learning_rate": 0.0002, + "loss": 0.0709, + "step": 17430 + }, + { + "epoch": 0.03171781330775054, + "grad_norm": 0.20625421404838562, + "learning_rate": 0.0002, + "loss": 0.0592, + "step": 17440 + }, + { + "epoch": 0.031736000127307734, + "grad_norm": 0.042640089988708496, + "learning_rate": 0.0002, + "loss": 0.0199, + "step": 17450 + }, + { + "epoch": 0.03175418694686493, + "grad_norm": 0.2505437731742859, + "learning_rate": 0.0002, + "loss": 0.131, + "step": 17460 + }, + { + "epoch": 0.03177237376642213, + "grad_norm": 0.24848629534244537, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 17470 + }, + { + "epoch": 0.03179056058597932, + "grad_norm": 0.056854844093322754, + "learning_rate": 0.0002, + "loss": 0.0779, + "step": 17480 + }, + { + "epoch": 0.03180874740553652, + "grad_norm": 0.23022660613059998, + "learning_rate": 0.0002, + "loss": 0.0703, + "step": 17490 + }, + { + "epoch": 0.031826934225093716, + "grad_norm": 0.033501993864774704, + "learning_rate": 0.0002, + "loss": 0.0229, + "step": 17500 + }, + { + "epoch": 0.03184512104465091, + "grad_norm": 0.25061148405075073, + "learning_rate": 0.0002, + "loss": 0.1588, + "step": 17510 + }, + { + "epoch": 0.031863307864208115, + "grad_norm": 0.21534167230129242, + "learning_rate": 0.0002, + "loss": 0.079, + "step": 17520 + }, + { + "epoch": 0.03188149468376531, + "grad_norm": 0.04823959991335869, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 17530 + }, + { + "epoch": 0.03189968150332251, + "grad_norm": 0.23680952191352844, + "learning_rate": 0.0002, + "loss": 0.0617, + "step": 17540 + }, + { + "epoch": 0.031917868322879704, + "grad_norm": 0.016636351123452187, + "learning_rate": 0.0002, + "loss": 0.0143, + "step": 17550 + }, + { + "epoch": 0.0319360551424369, + "grad_norm": 0.3684225082397461, + "learning_rate": 0.0002, + "loss": 0.2011, + "step": 17560 + }, + { + "epoch": 0.0319542419619941, + "grad_norm": 0.07126643508672714, + "learning_rate": 0.0002, + "loss": 0.0792, + "step": 17570 + }, + { + "epoch": 0.03197242878155129, + "grad_norm": 0.05354290455579758, + "learning_rate": 0.0002, + "loss": 0.0831, + "step": 17580 + }, + { + "epoch": 0.03199061560110849, + "grad_norm": 0.20318995416164398, + "learning_rate": 0.0002, + "loss": 0.0617, + "step": 17590 + }, + { + "epoch": 0.032008802420665686, + "grad_norm": 0.021502351388335228, + "learning_rate": 0.0002, + "loss": 0.0137, + "step": 17600 + }, + { + "epoch": 0.03202698924022288, + "grad_norm": 0.3471545875072479, + "learning_rate": 0.0002, + "loss": 0.1823, + "step": 17610 + }, + { + "epoch": 0.03204517605978008, + "grad_norm": 0.23191972076892853, + "learning_rate": 0.0002, + "loss": 0.0837, + "step": 17620 + }, + { + "epoch": 0.032063362879337275, + "grad_norm": 0.0479818731546402, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 17630 + }, + { + "epoch": 0.03208154969889447, + "grad_norm": 0.2193339467048645, + "learning_rate": 0.0002, + "loss": 0.068, + "step": 17640 + }, + { + "epoch": 0.03209973651845167, + "grad_norm": 0.03661821037530899, + "learning_rate": 0.0002, + "loss": 0.0234, + "step": 17650 + }, + { + "epoch": 0.032117923338008864, + "grad_norm": 0.10396943986415863, + "learning_rate": 0.0002, + "loss": 0.1295, + "step": 17660 + }, + { + "epoch": 0.03213611015756606, + "grad_norm": 0.16999179124832153, + "learning_rate": 0.0002, + "loss": 0.0823, + "step": 17670 + }, + { + "epoch": 0.032154296977123256, + "grad_norm": 0.09069819748401642, + "learning_rate": 0.0002, + "loss": 0.0748, + "step": 17680 + }, + { + "epoch": 0.03217248379668045, + "grad_norm": 0.24210433661937714, + "learning_rate": 0.0002, + "loss": 0.0611, + "step": 17690 + }, + { + "epoch": 0.03219067061623765, + "grad_norm": 0.028281020000576973, + "learning_rate": 0.0002, + "loss": 0.018, + "step": 17700 + }, + { + "epoch": 0.032208857435794845, + "grad_norm": 0.4133516252040863, + "learning_rate": 0.0002, + "loss": 0.1704, + "step": 17710 + }, + { + "epoch": 0.03222704425535204, + "grad_norm": 0.20207400619983673, + "learning_rate": 0.0002, + "loss": 0.0804, + "step": 17720 + }, + { + "epoch": 0.03224523107490924, + "grad_norm": 0.043604232370853424, + "learning_rate": 0.0002, + "loss": 0.0929, + "step": 17730 + }, + { + "epoch": 0.032263417894466434, + "grad_norm": 0.1995580494403839, + "learning_rate": 0.0002, + "loss": 0.062, + "step": 17740 + }, + { + "epoch": 0.03228160471402363, + "grad_norm": 0.03241848200559616, + "learning_rate": 0.0002, + "loss": 0.0137, + "step": 17750 + }, + { + "epoch": 0.03229979153358083, + "grad_norm": 0.28819000720977783, + "learning_rate": 0.0002, + "loss": 0.1696, + "step": 17760 + }, + { + "epoch": 0.03231797835313802, + "grad_norm": 0.2625056803226471, + "learning_rate": 0.0002, + "loss": 0.0704, + "step": 17770 + }, + { + "epoch": 0.03233616517269522, + "grad_norm": 0.03986202925443649, + "learning_rate": 0.0002, + "loss": 0.0848, + "step": 17780 + }, + { + "epoch": 0.032354351992252416, + "grad_norm": 0.24770867824554443, + "learning_rate": 0.0002, + "loss": 0.0608, + "step": 17790 + }, + { + "epoch": 0.03237253881180961, + "grad_norm": 0.031353630125522614, + "learning_rate": 0.0002, + "loss": 0.0145, + "step": 17800 + }, + { + "epoch": 0.03239072563136681, + "grad_norm": 0.2273588478565216, + "learning_rate": 0.0002, + "loss": 0.1765, + "step": 17810 + }, + { + "epoch": 0.032408912450924005, + "grad_norm": 0.19741755723953247, + "learning_rate": 0.0002, + "loss": 0.0818, + "step": 17820 + }, + { + "epoch": 0.0324270992704812, + "grad_norm": 0.03193483129143715, + "learning_rate": 0.0002, + "loss": 0.0737, + "step": 17830 + }, + { + "epoch": 0.0324452860900384, + "grad_norm": 0.13962946832180023, + "learning_rate": 0.0002, + "loss": 0.0575, + "step": 17840 + }, + { + "epoch": 0.03246347290959559, + "grad_norm": 0.01755092851817608, + "learning_rate": 0.0002, + "loss": 0.0159, + "step": 17850 + }, + { + "epoch": 0.03248165972915279, + "grad_norm": 0.21713244915008545, + "learning_rate": 0.0002, + "loss": 0.1476, + "step": 17860 + }, + { + "epoch": 0.032499846548709986, + "grad_norm": 0.15362155437469482, + "learning_rate": 0.0002, + "loss": 0.0747, + "step": 17870 + }, + { + "epoch": 0.03251803336826718, + "grad_norm": 0.02643916755914688, + "learning_rate": 0.0002, + "loss": 0.0793, + "step": 17880 + }, + { + "epoch": 0.03253622018782438, + "grad_norm": 0.2702760100364685, + "learning_rate": 0.0002, + "loss": 0.0641, + "step": 17890 + }, + { + "epoch": 0.032554407007381575, + "grad_norm": 0.05910428613424301, + "learning_rate": 0.0002, + "loss": 0.022, + "step": 17900 + }, + { + "epoch": 0.03257259382693877, + "grad_norm": 0.17692551016807556, + "learning_rate": 0.0002, + "loss": 0.1407, + "step": 17910 + }, + { + "epoch": 0.03259078064649597, + "grad_norm": 0.19877870380878448, + "learning_rate": 0.0002, + "loss": 0.0798, + "step": 17920 + }, + { + "epoch": 0.032608967466053164, + "grad_norm": 0.06731924414634705, + "learning_rate": 0.0002, + "loss": 0.0798, + "step": 17930 + }, + { + "epoch": 0.03262715428561036, + "grad_norm": 0.20342952013015747, + "learning_rate": 0.0002, + "loss": 0.0571, + "step": 17940 + }, + { + "epoch": 0.03264534110516756, + "grad_norm": 0.06299301236867905, + "learning_rate": 0.0002, + "loss": 0.0154, + "step": 17950 + }, + { + "epoch": 0.03266352792472475, + "grad_norm": 0.30317986011505127, + "learning_rate": 0.0002, + "loss": 0.1496, + "step": 17960 + }, + { + "epoch": 0.03268171474428195, + "grad_norm": 0.2737327218055725, + "learning_rate": 0.0002, + "loss": 0.0777, + "step": 17970 + }, + { + "epoch": 0.032699901563839145, + "grad_norm": 0.03226702660322189, + "learning_rate": 0.0002, + "loss": 0.0799, + "step": 17980 + }, + { + "epoch": 0.03271808838339634, + "grad_norm": 0.20195341110229492, + "learning_rate": 0.0002, + "loss": 0.0654, + "step": 17990 + }, + { + "epoch": 0.03273627520295354, + "grad_norm": 0.03351292014122009, + "learning_rate": 0.0002, + "loss": 0.0194, + "step": 18000 + }, + { + "epoch": 0.032754462022510734, + "grad_norm": 0.2281372845172882, + "learning_rate": 0.0002, + "loss": 0.154, + "step": 18010 + }, + { + "epoch": 0.03277264884206793, + "grad_norm": 0.19263891875743866, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 18020 + }, + { + "epoch": 0.03279083566162513, + "grad_norm": 0.04183288663625717, + "learning_rate": 0.0002, + "loss": 0.0842, + "step": 18030 + }, + { + "epoch": 0.03280902248118232, + "grad_norm": 0.284759521484375, + "learning_rate": 0.0002, + "loss": 0.067, + "step": 18040 + }, + { + "epoch": 0.03282720930073952, + "grad_norm": 0.02972390688955784, + "learning_rate": 0.0002, + "loss": 0.016, + "step": 18050 + }, + { + "epoch": 0.032845396120296716, + "grad_norm": 0.28630614280700684, + "learning_rate": 0.0002, + "loss": 0.1866, + "step": 18060 + }, + { + "epoch": 0.03286358293985391, + "grad_norm": 0.16426514089107513, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 18070 + }, + { + "epoch": 0.03288176975941111, + "grad_norm": 0.05643441155552864, + "learning_rate": 0.0002, + "loss": 0.0773, + "step": 18080 + }, + { + "epoch": 0.032899956578968305, + "grad_norm": 0.19082742929458618, + "learning_rate": 0.0002, + "loss": 0.0582, + "step": 18090 + }, + { + "epoch": 0.0329181433985255, + "grad_norm": 0.017512233927845955, + "learning_rate": 0.0002, + "loss": 0.0174, + "step": 18100 + }, + { + "epoch": 0.0329363302180827, + "grad_norm": 0.22619640827178955, + "learning_rate": 0.0002, + "loss": 0.166, + "step": 18110 + }, + { + "epoch": 0.032954517037639894, + "grad_norm": 0.10430974513292313, + "learning_rate": 0.0002, + "loss": 0.0716, + "step": 18120 + }, + { + "epoch": 0.03297270385719709, + "grad_norm": 0.07371710985898972, + "learning_rate": 0.0002, + "loss": 0.0733, + "step": 18130 + }, + { + "epoch": 0.032990890676754286, + "grad_norm": 0.19163483381271362, + "learning_rate": 0.0002, + "loss": 0.0609, + "step": 18140 + }, + { + "epoch": 0.03300907749631148, + "grad_norm": 0.03743975609540939, + "learning_rate": 0.0002, + "loss": 0.017, + "step": 18150 + }, + { + "epoch": 0.03302726431586868, + "grad_norm": 0.19496546685695648, + "learning_rate": 0.0002, + "loss": 0.1622, + "step": 18160 + }, + { + "epoch": 0.033045451135425875, + "grad_norm": 0.13054883480072021, + "learning_rate": 0.0002, + "loss": 0.0728, + "step": 18170 + }, + { + "epoch": 0.03306363795498307, + "grad_norm": 0.10058756172657013, + "learning_rate": 0.0002, + "loss": 0.0738, + "step": 18180 + }, + { + "epoch": 0.03308182477454027, + "grad_norm": 0.220932736992836, + "learning_rate": 0.0002, + "loss": 0.063, + "step": 18190 + }, + { + "epoch": 0.033100011594097464, + "grad_norm": 0.04396356642246246, + "learning_rate": 0.0002, + "loss": 0.0207, + "step": 18200 + }, + { + "epoch": 0.03311819841365467, + "grad_norm": 0.23554326593875885, + "learning_rate": 0.0002, + "loss": 0.1484, + "step": 18210 + }, + { + "epoch": 0.033136385233211864, + "grad_norm": 0.11277181655168533, + "learning_rate": 0.0002, + "loss": 0.0763, + "step": 18220 + }, + { + "epoch": 0.03315457205276906, + "grad_norm": 0.05176365375518799, + "learning_rate": 0.0002, + "loss": 0.076, + "step": 18230 + }, + { + "epoch": 0.033172758872326256, + "grad_norm": 0.1521395444869995, + "learning_rate": 0.0002, + "loss": 0.0605, + "step": 18240 + }, + { + "epoch": 0.03319094569188345, + "grad_norm": 0.04682580381631851, + "learning_rate": 0.0002, + "loss": 0.0149, + "step": 18250 + }, + { + "epoch": 0.03320913251144065, + "grad_norm": 0.16890883445739746, + "learning_rate": 0.0002, + "loss": 0.1402, + "step": 18260 + }, + { + "epoch": 0.033227319330997845, + "grad_norm": 0.17221559584140778, + "learning_rate": 0.0002, + "loss": 0.0819, + "step": 18270 + }, + { + "epoch": 0.03324550615055504, + "grad_norm": 0.07434559613466263, + "learning_rate": 0.0002, + "loss": 0.0784, + "step": 18280 + }, + { + "epoch": 0.03326369297011224, + "grad_norm": 0.1912834346294403, + "learning_rate": 0.0002, + "loss": 0.0614, + "step": 18290 + }, + { + "epoch": 0.033281879789669434, + "grad_norm": 0.04286884889006615, + "learning_rate": 0.0002, + "loss": 0.0185, + "step": 18300 + }, + { + "epoch": 0.03330006660922663, + "grad_norm": 0.29059842228889465, + "learning_rate": 0.0002, + "loss": 0.1357, + "step": 18310 + }, + { + "epoch": 0.03331825342878383, + "grad_norm": 0.2289486825466156, + "learning_rate": 0.0002, + "loss": 0.0865, + "step": 18320 + }, + { + "epoch": 0.03333644024834102, + "grad_norm": 0.027094636112451553, + "learning_rate": 0.0002, + "loss": 0.0841, + "step": 18330 + }, + { + "epoch": 0.03335462706789822, + "grad_norm": 0.21263600885868073, + "learning_rate": 0.0002, + "loss": 0.0628, + "step": 18340 + }, + { + "epoch": 0.033372813887455416, + "grad_norm": 0.03497980535030365, + "learning_rate": 0.0002, + "loss": 0.0158, + "step": 18350 + }, + { + "epoch": 0.03339100070701261, + "grad_norm": 0.20155973732471466, + "learning_rate": 0.0002, + "loss": 0.1523, + "step": 18360 + }, + { + "epoch": 0.03340918752656981, + "grad_norm": 0.03746286779642105, + "learning_rate": 0.0002, + "loss": 0.0781, + "step": 18370 + }, + { + "epoch": 0.033427374346127005, + "grad_norm": 0.06747066229581833, + "learning_rate": 0.0002, + "loss": 0.0792, + "step": 18380 + }, + { + "epoch": 0.0334455611656842, + "grad_norm": 0.23699060082435608, + "learning_rate": 0.0002, + "loss": 0.0651, + "step": 18390 + }, + { + "epoch": 0.0334637479852414, + "grad_norm": 0.047832150012254715, + "learning_rate": 0.0002, + "loss": 0.0181, + "step": 18400 + }, + { + "epoch": 0.033481934804798594, + "grad_norm": 0.3178698420524597, + "learning_rate": 0.0002, + "loss": 0.1537, + "step": 18410 + }, + { + "epoch": 0.03350012162435579, + "grad_norm": 0.16258081793785095, + "learning_rate": 0.0002, + "loss": 0.0722, + "step": 18420 + }, + { + "epoch": 0.033518308443912986, + "grad_norm": 0.02807716652750969, + "learning_rate": 0.0002, + "loss": 0.0844, + "step": 18430 + }, + { + "epoch": 0.03353649526347018, + "grad_norm": 0.16596710681915283, + "learning_rate": 0.0002, + "loss": 0.0607, + "step": 18440 + }, + { + "epoch": 0.03355468208302738, + "grad_norm": 0.04448723793029785, + "learning_rate": 0.0002, + "loss": 0.0183, + "step": 18450 + }, + { + "epoch": 0.033572868902584575, + "grad_norm": 0.39318934082984924, + "learning_rate": 0.0002, + "loss": 0.1497, + "step": 18460 + }, + { + "epoch": 0.03359105572214177, + "grad_norm": 0.17387263476848602, + "learning_rate": 0.0002, + "loss": 0.0787, + "step": 18470 + }, + { + "epoch": 0.03360924254169897, + "grad_norm": 0.14859163761138916, + "learning_rate": 0.0002, + "loss": 0.0837, + "step": 18480 + }, + { + "epoch": 0.033627429361256164, + "grad_norm": 0.24148601293563843, + "learning_rate": 0.0002, + "loss": 0.0655, + "step": 18490 + }, + { + "epoch": 0.03364561618081336, + "grad_norm": 0.04743284359574318, + "learning_rate": 0.0002, + "loss": 0.0174, + "step": 18500 + }, + { + "epoch": 0.03366380300037056, + "grad_norm": 0.25396591424942017, + "learning_rate": 0.0002, + "loss": 0.1438, + "step": 18510 + }, + { + "epoch": 0.03368198981992775, + "grad_norm": 0.1759178638458252, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 18520 + }, + { + "epoch": 0.03370017663948495, + "grad_norm": 0.06611669808626175, + "learning_rate": 0.0002, + "loss": 0.0787, + "step": 18530 + }, + { + "epoch": 0.033718363459042146, + "grad_norm": 0.22699445486068726, + "learning_rate": 0.0002, + "loss": 0.0697, + "step": 18540 + }, + { + "epoch": 0.03373655027859934, + "grad_norm": 0.02634899877011776, + "learning_rate": 0.0002, + "loss": 0.0189, + "step": 18550 + }, + { + "epoch": 0.03375473709815654, + "grad_norm": 0.3238360285758972, + "learning_rate": 0.0002, + "loss": 0.1496, + "step": 18560 + }, + { + "epoch": 0.033772923917713735, + "grad_norm": 0.16044601798057556, + "learning_rate": 0.0002, + "loss": 0.076, + "step": 18570 + }, + { + "epoch": 0.03379111073727093, + "grad_norm": 0.029841836541891098, + "learning_rate": 0.0002, + "loss": 0.0718, + "step": 18580 + }, + { + "epoch": 0.03380929755682813, + "grad_norm": 0.21851007640361786, + "learning_rate": 0.0002, + "loss": 0.0656, + "step": 18590 + }, + { + "epoch": 0.033827484376385324, + "grad_norm": 0.02096417360007763, + "learning_rate": 0.0002, + "loss": 0.0173, + "step": 18600 + }, + { + "epoch": 0.03384567119594252, + "grad_norm": 0.29625844955444336, + "learning_rate": 0.0002, + "loss": 0.1716, + "step": 18610 + }, + { + "epoch": 0.033863858015499716, + "grad_norm": 0.1510130614042282, + "learning_rate": 0.0002, + "loss": 0.0792, + "step": 18620 + }, + { + "epoch": 0.03388204483505691, + "grad_norm": 0.04192917421460152, + "learning_rate": 0.0002, + "loss": 0.0717, + "step": 18630 + }, + { + "epoch": 0.03390023165461411, + "grad_norm": 0.23139427602291107, + "learning_rate": 0.0002, + "loss": 0.0609, + "step": 18640 + }, + { + "epoch": 0.033918418474171305, + "grad_norm": 0.03887970373034477, + "learning_rate": 0.0002, + "loss": 0.0127, + "step": 18650 + }, + { + "epoch": 0.0339366052937285, + "grad_norm": 0.1315147578716278, + "learning_rate": 0.0002, + "loss": 0.1434, + "step": 18660 + }, + { + "epoch": 0.0339547921132857, + "grad_norm": 0.13328243792057037, + "learning_rate": 0.0002, + "loss": 0.0673, + "step": 18670 + }, + { + "epoch": 0.033972978932842894, + "grad_norm": 0.07161080092191696, + "learning_rate": 0.0002, + "loss": 0.0692, + "step": 18680 + }, + { + "epoch": 0.03399116575240009, + "grad_norm": 0.16019296646118164, + "learning_rate": 0.0002, + "loss": 0.0641, + "step": 18690 + }, + { + "epoch": 0.03400935257195729, + "grad_norm": 0.042882539331912994, + "learning_rate": 0.0002, + "loss": 0.0196, + "step": 18700 + }, + { + "epoch": 0.03402753939151448, + "grad_norm": 0.15019817650318146, + "learning_rate": 0.0002, + "loss": 0.1239, + "step": 18710 + }, + { + "epoch": 0.03404572621107168, + "grad_norm": 0.140267476439476, + "learning_rate": 0.0002, + "loss": 0.0715, + "step": 18720 + }, + { + "epoch": 0.034063913030628876, + "grad_norm": 0.060760073363780975, + "learning_rate": 0.0002, + "loss": 0.079, + "step": 18730 + }, + { + "epoch": 0.03408209985018607, + "grad_norm": 0.1783122718334198, + "learning_rate": 0.0002, + "loss": 0.0616, + "step": 18740 + }, + { + "epoch": 0.03410028666974327, + "grad_norm": 0.023139121010899544, + "learning_rate": 0.0002, + "loss": 0.0171, + "step": 18750 + }, + { + "epoch": 0.034118473489300465, + "grad_norm": 0.2645978331565857, + "learning_rate": 0.0002, + "loss": 0.1355, + "step": 18760 + }, + { + "epoch": 0.03413666030885766, + "grad_norm": 0.21009914577007294, + "learning_rate": 0.0002, + "loss": 0.0757, + "step": 18770 + }, + { + "epoch": 0.03415484712841486, + "grad_norm": 0.13494494557380676, + "learning_rate": 0.0002, + "loss": 0.0774, + "step": 18780 + }, + { + "epoch": 0.034173033947972054, + "grad_norm": 0.19806784391403198, + "learning_rate": 0.0002, + "loss": 0.0636, + "step": 18790 + }, + { + "epoch": 0.03419122076752925, + "grad_norm": 0.020482519641518593, + "learning_rate": 0.0002, + "loss": 0.0194, + "step": 18800 + }, + { + "epoch": 0.034209407587086446, + "grad_norm": 0.34826937317848206, + "learning_rate": 0.0002, + "loss": 0.1521, + "step": 18810 + }, + { + "epoch": 0.03422759440664364, + "grad_norm": 0.1293957680463791, + "learning_rate": 0.0002, + "loss": 0.0742, + "step": 18820 + }, + { + "epoch": 0.03424578122620084, + "grad_norm": 0.06574539095163345, + "learning_rate": 0.0002, + "loss": 0.0792, + "step": 18830 + }, + { + "epoch": 0.034263968045758035, + "grad_norm": 0.2005399614572525, + "learning_rate": 0.0002, + "loss": 0.0618, + "step": 18840 + }, + { + "epoch": 0.03428215486531523, + "grad_norm": 0.04699913039803505, + "learning_rate": 0.0002, + "loss": 0.0176, + "step": 18850 + }, + { + "epoch": 0.03430034168487243, + "grad_norm": 0.2593109905719757, + "learning_rate": 0.0002, + "loss": 0.1709, + "step": 18860 + }, + { + "epoch": 0.034318528504429624, + "grad_norm": 0.587365448474884, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 18870 + }, + { + "epoch": 0.03433671532398682, + "grad_norm": 0.0371614433825016, + "learning_rate": 0.0002, + "loss": 0.076, + "step": 18880 + }, + { + "epoch": 0.03435490214354402, + "grad_norm": 0.2164178341627121, + "learning_rate": 0.0002, + "loss": 0.0577, + "step": 18890 + }, + { + "epoch": 0.03437308896310122, + "grad_norm": 0.028071587905287743, + "learning_rate": 0.0002, + "loss": 0.0184, + "step": 18900 + }, + { + "epoch": 0.034391275782658416, + "grad_norm": 0.25464126467704773, + "learning_rate": 0.0002, + "loss": 0.1616, + "step": 18910 + }, + { + "epoch": 0.03440946260221561, + "grad_norm": 0.2830415368080139, + "learning_rate": 0.0002, + "loss": 0.0795, + "step": 18920 + }, + { + "epoch": 0.03442764942177281, + "grad_norm": 0.07880273461341858, + "learning_rate": 0.0002, + "loss": 0.0717, + "step": 18930 + }, + { + "epoch": 0.034445836241330005, + "grad_norm": 0.19671671092510223, + "learning_rate": 0.0002, + "loss": 0.0625, + "step": 18940 + }, + { + "epoch": 0.0344640230608872, + "grad_norm": 0.038350027054548264, + "learning_rate": 0.0002, + "loss": 0.0172, + "step": 18950 + }, + { + "epoch": 0.0344822098804444, + "grad_norm": 0.196768656373024, + "learning_rate": 0.0002, + "loss": 0.1586, + "step": 18960 + }, + { + "epoch": 0.034500396700001594, + "grad_norm": 0.1861678808927536, + "learning_rate": 0.0002, + "loss": 0.0871, + "step": 18970 + }, + { + "epoch": 0.03451858351955879, + "grad_norm": 0.1074979305267334, + "learning_rate": 0.0002, + "loss": 0.0697, + "step": 18980 + }, + { + "epoch": 0.03453677033911599, + "grad_norm": 0.18214645981788635, + "learning_rate": 0.0002, + "loss": 0.0594, + "step": 18990 + }, + { + "epoch": 0.03455495715867318, + "grad_norm": 0.035948049277067184, + "learning_rate": 0.0002, + "loss": 0.0177, + "step": 19000 + }, + { + "epoch": 0.03457314397823038, + "grad_norm": 0.2434094399213791, + "learning_rate": 0.0002, + "loss": 0.1402, + "step": 19010 + }, + { + "epoch": 0.034591330797787576, + "grad_norm": 0.06897670775651932, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 19020 + }, + { + "epoch": 0.03460951761734477, + "grad_norm": 0.13107649981975555, + "learning_rate": 0.0002, + "loss": 0.0826, + "step": 19030 + }, + { + "epoch": 0.03462770443690197, + "grad_norm": 0.1787865310907364, + "learning_rate": 0.0002, + "loss": 0.0619, + "step": 19040 + }, + { + "epoch": 0.034645891256459165, + "grad_norm": 0.0460963137447834, + "learning_rate": 0.0002, + "loss": 0.0203, + "step": 19050 + }, + { + "epoch": 0.03466407807601636, + "grad_norm": 0.20582084357738495, + "learning_rate": 0.0002, + "loss": 0.1325, + "step": 19060 + }, + { + "epoch": 0.03468226489557356, + "grad_norm": 0.16120313107967377, + "learning_rate": 0.0002, + "loss": 0.08, + "step": 19070 + }, + { + "epoch": 0.03470045171513075, + "grad_norm": 0.04322347044944763, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 19080 + }, + { + "epoch": 0.03471863853468795, + "grad_norm": 0.1764109879732132, + "learning_rate": 0.0002, + "loss": 0.0618, + "step": 19090 + }, + { + "epoch": 0.034736825354245146, + "grad_norm": 0.04453815147280693, + "learning_rate": 0.0002, + "loss": 0.0172, + "step": 19100 + }, + { + "epoch": 0.03475501217380234, + "grad_norm": 0.32023972272872925, + "learning_rate": 0.0002, + "loss": 0.1394, + "step": 19110 + }, + { + "epoch": 0.03477319899335954, + "grad_norm": 0.09920009225606918, + "learning_rate": 0.0002, + "loss": 0.0788, + "step": 19120 + }, + { + "epoch": 0.034791385812916735, + "grad_norm": 0.047868456691503525, + "learning_rate": 0.0002, + "loss": 0.0745, + "step": 19130 + }, + { + "epoch": 0.03480957263247393, + "grad_norm": 0.219430074095726, + "learning_rate": 0.0002, + "loss": 0.063, + "step": 19140 + }, + { + "epoch": 0.03482775945203113, + "grad_norm": 0.04879681020975113, + "learning_rate": 0.0002, + "loss": 0.0161, + "step": 19150 + }, + { + "epoch": 0.034845946271588324, + "grad_norm": 0.21360138058662415, + "learning_rate": 0.0002, + "loss": 0.1602, + "step": 19160 + }, + { + "epoch": 0.03486413309114552, + "grad_norm": 0.1391269713640213, + "learning_rate": 0.0002, + "loss": 0.0798, + "step": 19170 + }, + { + "epoch": 0.03488231991070272, + "grad_norm": 0.06293737888336182, + "learning_rate": 0.0002, + "loss": 0.0717, + "step": 19180 + }, + { + "epoch": 0.03490050673025991, + "grad_norm": 0.20241963863372803, + "learning_rate": 0.0002, + "loss": 0.0612, + "step": 19190 + }, + { + "epoch": 0.03491869354981711, + "grad_norm": 0.06246611103415489, + "learning_rate": 0.0002, + "loss": 0.0148, + "step": 19200 + }, + { + "epoch": 0.034936880369374305, + "grad_norm": 0.16479995846748352, + "learning_rate": 0.0002, + "loss": 0.1611, + "step": 19210 + }, + { + "epoch": 0.0349550671889315, + "grad_norm": 0.12036983668804169, + "learning_rate": 0.0002, + "loss": 0.0724, + "step": 19220 + }, + { + "epoch": 0.0349732540084887, + "grad_norm": 0.03939517214894295, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 19230 + }, + { + "epoch": 0.034991440828045894, + "grad_norm": 0.17047277092933655, + "learning_rate": 0.0002, + "loss": 0.066, + "step": 19240 + }, + { + "epoch": 0.03500962764760309, + "grad_norm": 0.031782686710357666, + "learning_rate": 0.0002, + "loss": 0.0203, + "step": 19250 + }, + { + "epoch": 0.03502781446716029, + "grad_norm": 0.2545730471611023, + "learning_rate": 0.0002, + "loss": 0.1716, + "step": 19260 + }, + { + "epoch": 0.03504600128671748, + "grad_norm": 0.11225811392068863, + "learning_rate": 0.0002, + "loss": 0.0791, + "step": 19270 + }, + { + "epoch": 0.03506418810627468, + "grad_norm": 0.049140989780426025, + "learning_rate": 0.0002, + "loss": 0.0784, + "step": 19280 + }, + { + "epoch": 0.035082374925831876, + "grad_norm": 0.16942913830280304, + "learning_rate": 0.0002, + "loss": 0.0638, + "step": 19290 + }, + { + "epoch": 0.03510056174538907, + "grad_norm": 0.03836115077137947, + "learning_rate": 0.0002, + "loss": 0.0193, + "step": 19300 + }, + { + "epoch": 0.03511874856494627, + "grad_norm": 0.13004787266254425, + "learning_rate": 0.0002, + "loss": 0.1477, + "step": 19310 + }, + { + "epoch": 0.035136935384503465, + "grad_norm": 0.2054329216480255, + "learning_rate": 0.0002, + "loss": 0.0792, + "step": 19320 + }, + { + "epoch": 0.03515512220406066, + "grad_norm": 0.06592074781656265, + "learning_rate": 0.0002, + "loss": 0.0784, + "step": 19330 + }, + { + "epoch": 0.03517330902361786, + "grad_norm": 0.19228027760982513, + "learning_rate": 0.0002, + "loss": 0.067, + "step": 19340 + }, + { + "epoch": 0.035191495843175054, + "grad_norm": 0.04050719738006592, + "learning_rate": 0.0002, + "loss": 0.017, + "step": 19350 + }, + { + "epoch": 0.03520968266273225, + "grad_norm": 0.28715401887893677, + "learning_rate": 0.0002, + "loss": 0.1499, + "step": 19360 + }, + { + "epoch": 0.035227869482289446, + "grad_norm": 0.13954712450504303, + "learning_rate": 0.0002, + "loss": 0.0787, + "step": 19370 + }, + { + "epoch": 0.03524605630184664, + "grad_norm": 0.08851815015077591, + "learning_rate": 0.0002, + "loss": 0.0739, + "step": 19380 + }, + { + "epoch": 0.03526424312140384, + "grad_norm": 0.1788545697927475, + "learning_rate": 0.0002, + "loss": 0.0576, + "step": 19390 + }, + { + "epoch": 0.035282429940961035, + "grad_norm": 0.03644658252596855, + "learning_rate": 0.0002, + "loss": 0.0143, + "step": 19400 + }, + { + "epoch": 0.03530061676051823, + "grad_norm": 0.3140568137168884, + "learning_rate": 0.0002, + "loss": 0.1498, + "step": 19410 + }, + { + "epoch": 0.03531880358007543, + "grad_norm": 0.14550529420375824, + "learning_rate": 0.0002, + "loss": 0.0748, + "step": 19420 + }, + { + "epoch": 0.035336990399632624, + "grad_norm": 0.10995481163263321, + "learning_rate": 0.0002, + "loss": 0.0801, + "step": 19430 + }, + { + "epoch": 0.03535517721918982, + "grad_norm": 0.17238560318946838, + "learning_rate": 0.0002, + "loss": 0.0608, + "step": 19440 + }, + { + "epoch": 0.03537336403874702, + "grad_norm": 0.031363293528556824, + "learning_rate": 0.0002, + "loss": 0.0154, + "step": 19450 + }, + { + "epoch": 0.03539155085830421, + "grad_norm": 0.14145390689373016, + "learning_rate": 0.0002, + "loss": 0.1511, + "step": 19460 + }, + { + "epoch": 0.03540973767786141, + "grad_norm": 0.19073855876922607, + "learning_rate": 0.0002, + "loss": 0.0725, + "step": 19470 + }, + { + "epoch": 0.035427924497418606, + "grad_norm": 0.15639430284500122, + "learning_rate": 0.0002, + "loss": 0.0836, + "step": 19480 + }, + { + "epoch": 0.0354461113169758, + "grad_norm": 0.2566238045692444, + "learning_rate": 0.0002, + "loss": 0.0617, + "step": 19490 + }, + { + "epoch": 0.035464298136533, + "grad_norm": 0.055755820125341415, + "learning_rate": 0.0002, + "loss": 0.0178, + "step": 19500 + }, + { + "epoch": 0.035482484956090195, + "grad_norm": 0.2835562527179718, + "learning_rate": 0.0002, + "loss": 0.1306, + "step": 19510 + }, + { + "epoch": 0.03550067177564739, + "grad_norm": 0.2310812920331955, + "learning_rate": 0.0002, + "loss": 0.0766, + "step": 19520 + }, + { + "epoch": 0.03551885859520459, + "grad_norm": 0.1287071257829666, + "learning_rate": 0.0002, + "loss": 0.0791, + "step": 19530 + }, + { + "epoch": 0.035537045414761784, + "grad_norm": 0.21308869123458862, + "learning_rate": 0.0002, + "loss": 0.0584, + "step": 19540 + }, + { + "epoch": 0.03555523223431898, + "grad_norm": 0.0662735179066658, + "learning_rate": 0.0002, + "loss": 0.0207, + "step": 19550 + }, + { + "epoch": 0.035573419053876176, + "grad_norm": 0.21706523001194, + "learning_rate": 0.0002, + "loss": 0.1308, + "step": 19560 + }, + { + "epoch": 0.03559160587343337, + "grad_norm": 0.09376335144042969, + "learning_rate": 0.0002, + "loss": 0.0677, + "step": 19570 + }, + { + "epoch": 0.035609792692990576, + "grad_norm": 0.1093437597155571, + "learning_rate": 0.0002, + "loss": 0.0741, + "step": 19580 + }, + { + "epoch": 0.03562797951254777, + "grad_norm": 0.21057911217212677, + "learning_rate": 0.0002, + "loss": 0.0637, + "step": 19590 + }, + { + "epoch": 0.03564616633210497, + "grad_norm": 0.04383830726146698, + "learning_rate": 0.0002, + "loss": 0.019, + "step": 19600 + }, + { + "epoch": 0.035664353151662165, + "grad_norm": 0.3657427132129669, + "learning_rate": 0.0002, + "loss": 0.1421, + "step": 19610 + }, + { + "epoch": 0.03568253997121936, + "grad_norm": 0.17154265940189362, + "learning_rate": 0.0002, + "loss": 0.0779, + "step": 19620 + }, + { + "epoch": 0.03570072679077656, + "grad_norm": 0.041993435472249985, + "learning_rate": 0.0002, + "loss": 0.0768, + "step": 19630 + }, + { + "epoch": 0.035718913610333754, + "grad_norm": 0.1658252775669098, + "learning_rate": 0.0002, + "loss": 0.0602, + "step": 19640 + }, + { + "epoch": 0.03573710042989095, + "grad_norm": 0.028523078188300133, + "learning_rate": 0.0002, + "loss": 0.0151, + "step": 19650 + }, + { + "epoch": 0.035755287249448146, + "grad_norm": 0.2624453902244568, + "learning_rate": 0.0002, + "loss": 0.1355, + "step": 19660 + }, + { + "epoch": 0.03577347406900534, + "grad_norm": 0.12055794149637222, + "learning_rate": 0.0002, + "loss": 0.079, + "step": 19670 + }, + { + "epoch": 0.03579166088856254, + "grad_norm": 0.043441224843263626, + "learning_rate": 0.0002, + "loss": 0.0722, + "step": 19680 + }, + { + "epoch": 0.035809847708119735, + "grad_norm": 0.2464340627193451, + "learning_rate": 0.0002, + "loss": 0.0673, + "step": 19690 + }, + { + "epoch": 0.03582803452767693, + "grad_norm": 0.04004153981804848, + "learning_rate": 0.0002, + "loss": 0.0212, + "step": 19700 + }, + { + "epoch": 0.03584622134723413, + "grad_norm": 0.3159453570842743, + "learning_rate": 0.0002, + "loss": 0.1806, + "step": 19710 + }, + { + "epoch": 0.035864408166791324, + "grad_norm": 0.11327318102121353, + "learning_rate": 0.0002, + "loss": 0.0748, + "step": 19720 + }, + { + "epoch": 0.03588259498634852, + "grad_norm": 0.0980909988284111, + "learning_rate": 0.0002, + "loss": 0.0807, + "step": 19730 + }, + { + "epoch": 0.03590078180590572, + "grad_norm": 0.15508098900318146, + "learning_rate": 0.0002, + "loss": 0.0576, + "step": 19740 + }, + { + "epoch": 0.03591896862546291, + "grad_norm": 0.019624806940555573, + "learning_rate": 0.0002, + "loss": 0.0135, + "step": 19750 + }, + { + "epoch": 0.03593715544502011, + "grad_norm": 0.20336109399795532, + "learning_rate": 0.0002, + "loss": 0.1702, + "step": 19760 + }, + { + "epoch": 0.035955342264577306, + "grad_norm": 0.12767620384693146, + "learning_rate": 0.0002, + "loss": 0.0776, + "step": 19770 + }, + { + "epoch": 0.0359735290841345, + "grad_norm": 0.19050805270671844, + "learning_rate": 0.0002, + "loss": 0.0838, + "step": 19780 + }, + { + "epoch": 0.0359917159036917, + "grad_norm": 0.17471866309642792, + "learning_rate": 0.0002, + "loss": 0.0561, + "step": 19790 + }, + { + "epoch": 0.036009902723248895, + "grad_norm": 0.044348277151584625, + "learning_rate": 0.0002, + "loss": 0.0159, + "step": 19800 + }, + { + "epoch": 0.03602808954280609, + "grad_norm": 0.30847081542015076, + "learning_rate": 0.0002, + "loss": 0.1686, + "step": 19810 + }, + { + "epoch": 0.03604627636236329, + "grad_norm": 0.08963622897863388, + "learning_rate": 0.0002, + "loss": 0.078, + "step": 19820 + }, + { + "epoch": 0.036064463181920484, + "grad_norm": 0.0580587275326252, + "learning_rate": 0.0002, + "loss": 0.0741, + "step": 19830 + }, + { + "epoch": 0.03608265000147768, + "grad_norm": 0.1698184460401535, + "learning_rate": 0.0002, + "loss": 0.0631, + "step": 19840 + }, + { + "epoch": 0.036100836821034876, + "grad_norm": 0.025531867519021034, + "learning_rate": 0.0002, + "loss": 0.0166, + "step": 19850 + }, + { + "epoch": 0.03611902364059207, + "grad_norm": 0.3544731140136719, + "learning_rate": 0.0002, + "loss": 0.1886, + "step": 19860 + }, + { + "epoch": 0.03613721046014927, + "grad_norm": 0.2552841901779175, + "learning_rate": 0.0002, + "loss": 0.0859, + "step": 19870 + }, + { + "epoch": 0.036155397279706465, + "grad_norm": 0.07771942019462585, + "learning_rate": 0.0002, + "loss": 0.0859, + "step": 19880 + }, + { + "epoch": 0.03617358409926366, + "grad_norm": 0.15945585072040558, + "learning_rate": 0.0002, + "loss": 0.0609, + "step": 19890 + }, + { + "epoch": 0.03619177091882086, + "grad_norm": 0.04583865404129028, + "learning_rate": 0.0002, + "loss": 0.0196, + "step": 19900 + }, + { + "epoch": 0.036209957738378054, + "grad_norm": 0.2110920548439026, + "learning_rate": 0.0002, + "loss": 0.1305, + "step": 19910 + }, + { + "epoch": 0.03622814455793525, + "grad_norm": 0.22165755927562714, + "learning_rate": 0.0002, + "loss": 0.0767, + "step": 19920 + }, + { + "epoch": 0.03624633137749245, + "grad_norm": 0.0866742879152298, + "learning_rate": 0.0002, + "loss": 0.0785, + "step": 19930 + }, + { + "epoch": 0.03626451819704964, + "grad_norm": 0.19838224351406097, + "learning_rate": 0.0002, + "loss": 0.0663, + "step": 19940 + }, + { + "epoch": 0.03628270501660684, + "grad_norm": 0.05543521046638489, + "learning_rate": 0.0002, + "loss": 0.023, + "step": 19950 + }, + { + "epoch": 0.036300891836164036, + "grad_norm": 0.20800183713436127, + "learning_rate": 0.0002, + "loss": 0.1468, + "step": 19960 + }, + { + "epoch": 0.03631907865572123, + "grad_norm": 0.14951092004776, + "learning_rate": 0.0002, + "loss": 0.0698, + "step": 19970 + }, + { + "epoch": 0.03633726547527843, + "grad_norm": 0.10162603855133057, + "learning_rate": 0.0002, + "loss": 0.0841, + "step": 19980 + }, + { + "epoch": 0.036355452294835625, + "grad_norm": 0.24774019420146942, + "learning_rate": 0.0002, + "loss": 0.0658, + "step": 19990 + }, + { + "epoch": 0.03637363911439282, + "grad_norm": 0.02705777995288372, + "learning_rate": 0.0002, + "loss": 0.02, + "step": 20000 + }, + { + "epoch": 0.03639182593395002, + "grad_norm": 0.2509992718696594, + "learning_rate": 0.0002, + "loss": 0.1529, + "step": 20010 + }, + { + "epoch": 0.036410012753507214, + "grad_norm": 0.2126697599887848, + "learning_rate": 0.0002, + "loss": 0.0716, + "step": 20020 + }, + { + "epoch": 0.03642819957306441, + "grad_norm": 0.1463591754436493, + "learning_rate": 0.0002, + "loss": 0.076, + "step": 20030 + }, + { + "epoch": 0.036446386392621606, + "grad_norm": 0.21879518032073975, + "learning_rate": 0.0002, + "loss": 0.0677, + "step": 20040 + }, + { + "epoch": 0.0364645732121788, + "grad_norm": 0.028337355703115463, + "learning_rate": 0.0002, + "loss": 0.0131, + "step": 20050 + }, + { + "epoch": 0.036482760031736, + "grad_norm": 0.335788756608963, + "learning_rate": 0.0002, + "loss": 0.1693, + "step": 20060 + }, + { + "epoch": 0.036500946851293195, + "grad_norm": 0.17615728080272675, + "learning_rate": 0.0002, + "loss": 0.0791, + "step": 20070 + }, + { + "epoch": 0.03651913367085039, + "grad_norm": 0.034229181706905365, + "learning_rate": 0.0002, + "loss": 0.0774, + "step": 20080 + }, + { + "epoch": 0.03653732049040759, + "grad_norm": 0.20637790858745575, + "learning_rate": 0.0002, + "loss": 0.0544, + "step": 20090 + }, + { + "epoch": 0.036555507309964784, + "grad_norm": 0.033659741282463074, + "learning_rate": 0.0002, + "loss": 0.0128, + "step": 20100 + }, + { + "epoch": 0.03657369412952198, + "grad_norm": 0.18249601125717163, + "learning_rate": 0.0002, + "loss": 0.1939, + "step": 20110 + }, + { + "epoch": 0.03659188094907918, + "grad_norm": 0.18065877258777618, + "learning_rate": 0.0002, + "loss": 0.0816, + "step": 20120 + }, + { + "epoch": 0.03661006776863637, + "grad_norm": 0.4361811876296997, + "learning_rate": 0.0002, + "loss": 0.0978, + "step": 20130 + }, + { + "epoch": 0.03662825458819357, + "grad_norm": 0.24488002061843872, + "learning_rate": 0.0002, + "loss": 0.0742, + "step": 20140 + }, + { + "epoch": 0.036646441407750766, + "grad_norm": 0.023062752559781075, + "learning_rate": 0.0002, + "loss": 0.0196, + "step": 20150 + }, + { + "epoch": 0.03666462822730796, + "grad_norm": 0.22796255350112915, + "learning_rate": 0.0002, + "loss": 0.1457, + "step": 20160 + }, + { + "epoch": 0.03668281504686516, + "grad_norm": 0.16665758192539215, + "learning_rate": 0.0002, + "loss": 0.138, + "step": 20170 + }, + { + "epoch": 0.036701001866422354, + "grad_norm": 0.0503946952521801, + "learning_rate": 0.0002, + "loss": 0.079, + "step": 20180 + }, + { + "epoch": 0.03671918868597955, + "grad_norm": 0.1672963798046112, + "learning_rate": 0.0002, + "loss": 0.0621, + "step": 20190 + }, + { + "epoch": 0.03673737550553675, + "grad_norm": 0.06765859574079514, + "learning_rate": 0.0002, + "loss": 0.0171, + "step": 20200 + }, + { + "epoch": 0.03675556232509394, + "grad_norm": 0.6076682806015015, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 20210 + }, + { + "epoch": 0.03677374914465114, + "grad_norm": 0.04764563590288162, + "learning_rate": 0.0002, + "loss": 0.0965, + "step": 20220 + }, + { + "epoch": 0.036791935964208336, + "grad_norm": 0.6847806572914124, + "learning_rate": 0.0002, + "loss": 0.0784, + "step": 20230 + }, + { + "epoch": 0.03681012278376553, + "grad_norm": 0.2678837478160858, + "learning_rate": 0.0002, + "loss": 0.069, + "step": 20240 + }, + { + "epoch": 0.03682830960332273, + "grad_norm": 0.039824239909648895, + "learning_rate": 0.0002, + "loss": 0.0206, + "step": 20250 + }, + { + "epoch": 0.036846496422879925, + "grad_norm": 0.19583609700202942, + "learning_rate": 0.0002, + "loss": 0.1588, + "step": 20260 + }, + { + "epoch": 0.03686468324243713, + "grad_norm": 0.08613055944442749, + "learning_rate": 0.0002, + "loss": 0.0777, + "step": 20270 + }, + { + "epoch": 0.036882870061994324, + "grad_norm": 0.028818165883421898, + "learning_rate": 0.0002, + "loss": 0.0704, + "step": 20280 + }, + { + "epoch": 0.03690105688155152, + "grad_norm": 0.19514115154743195, + "learning_rate": 0.0002, + "loss": 0.0654, + "step": 20290 + }, + { + "epoch": 0.03691924370110872, + "grad_norm": 0.043222617357969284, + "learning_rate": 0.0002, + "loss": 0.0216, + "step": 20300 + }, + { + "epoch": 0.03693743052066591, + "grad_norm": 0.2490546703338623, + "learning_rate": 0.0002, + "loss": 0.1472, + "step": 20310 + }, + { + "epoch": 0.03695561734022311, + "grad_norm": 0.16989269852638245, + "learning_rate": 0.0002, + "loss": 0.081, + "step": 20320 + }, + { + "epoch": 0.036973804159780306, + "grad_norm": 0.09191739559173584, + "learning_rate": 0.0002, + "loss": 0.0733, + "step": 20330 + }, + { + "epoch": 0.0369919909793375, + "grad_norm": 0.18435023725032806, + "learning_rate": 0.0002, + "loss": 0.0654, + "step": 20340 + }, + { + "epoch": 0.0370101777988947, + "grad_norm": 0.031144114211201668, + "learning_rate": 0.0002, + "loss": 0.0226, + "step": 20350 + }, + { + "epoch": 0.037028364618451895, + "grad_norm": 0.3244694769382477, + "learning_rate": 0.0002, + "loss": 0.1304, + "step": 20360 + }, + { + "epoch": 0.03704655143800909, + "grad_norm": 0.13787488639354706, + "learning_rate": 0.0002, + "loss": 0.0811, + "step": 20370 + }, + { + "epoch": 0.03706473825756629, + "grad_norm": 0.058523450046777725, + "learning_rate": 0.0002, + "loss": 0.0806, + "step": 20380 + }, + { + "epoch": 0.037082925077123484, + "grad_norm": 0.3001325726509094, + "learning_rate": 0.0002, + "loss": 0.0694, + "step": 20390 + }, + { + "epoch": 0.03710111189668068, + "grad_norm": 0.04447292909026146, + "learning_rate": 0.0002, + "loss": 0.0218, + "step": 20400 + }, + { + "epoch": 0.037119298716237877, + "grad_norm": 0.25786396861076355, + "learning_rate": 0.0002, + "loss": 0.1499, + "step": 20410 + }, + { + "epoch": 0.03713748553579507, + "grad_norm": 0.11381134390830994, + "learning_rate": 0.0002, + "loss": 0.0822, + "step": 20420 + }, + { + "epoch": 0.03715567235535227, + "grad_norm": 0.022713568061590195, + "learning_rate": 0.0002, + "loss": 0.0785, + "step": 20430 + }, + { + "epoch": 0.037173859174909465, + "grad_norm": 0.15770909190177917, + "learning_rate": 0.0002, + "loss": 0.0625, + "step": 20440 + }, + { + "epoch": 0.03719204599446666, + "grad_norm": 0.021412041038274765, + "learning_rate": 0.0002, + "loss": 0.0126, + "step": 20450 + }, + { + "epoch": 0.03721023281402386, + "grad_norm": 0.24260753393173218, + "learning_rate": 0.0002, + "loss": 0.1777, + "step": 20460 + }, + { + "epoch": 0.037228419633581054, + "grad_norm": 0.10953031480312347, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 20470 + }, + { + "epoch": 0.03724660645313825, + "grad_norm": 0.03975062072277069, + "learning_rate": 0.0002, + "loss": 0.0907, + "step": 20480 + }, + { + "epoch": 0.03726479327269545, + "grad_norm": 0.2025018036365509, + "learning_rate": 0.0002, + "loss": 0.0631, + "step": 20490 + }, + { + "epoch": 0.03728298009225264, + "grad_norm": 0.031849734485149384, + "learning_rate": 0.0002, + "loss": 0.0156, + "step": 20500 + }, + { + "epoch": 0.03730116691180984, + "grad_norm": 0.2650098502635956, + "learning_rate": 0.0002, + "loss": 0.1569, + "step": 20510 + }, + { + "epoch": 0.037319353731367036, + "grad_norm": 0.14113937318325043, + "learning_rate": 0.0002, + "loss": 0.0824, + "step": 20520 + }, + { + "epoch": 0.03733754055092423, + "grad_norm": 0.10276420414447784, + "learning_rate": 0.0002, + "loss": 0.0797, + "step": 20530 + }, + { + "epoch": 0.03735572737048143, + "grad_norm": 0.2258286476135254, + "learning_rate": 0.0002, + "loss": 0.0671, + "step": 20540 + }, + { + "epoch": 0.037373914190038625, + "grad_norm": 0.10343242436647415, + "learning_rate": 0.0002, + "loss": 0.0178, + "step": 20550 + }, + { + "epoch": 0.03739210100959582, + "grad_norm": 0.19423982501029968, + "learning_rate": 0.0002, + "loss": 0.1423, + "step": 20560 + }, + { + "epoch": 0.03741028782915302, + "grad_norm": 0.12046124786138535, + "learning_rate": 0.0002, + "loss": 0.0827, + "step": 20570 + }, + { + "epoch": 0.037428474648710214, + "grad_norm": 0.026751041412353516, + "learning_rate": 0.0002, + "loss": 0.0743, + "step": 20580 + }, + { + "epoch": 0.03744666146826741, + "grad_norm": 0.23576834797859192, + "learning_rate": 0.0002, + "loss": 0.0629, + "step": 20590 + }, + { + "epoch": 0.037464848287824606, + "grad_norm": 0.05146399885416031, + "learning_rate": 0.0002, + "loss": 0.0205, + "step": 20600 + }, + { + "epoch": 0.0374830351073818, + "grad_norm": 0.21750135719776154, + "learning_rate": 0.0002, + "loss": 0.1397, + "step": 20610 + }, + { + "epoch": 0.037501221926939, + "grad_norm": 0.08351115882396698, + "learning_rate": 0.0002, + "loss": 0.0801, + "step": 20620 + }, + { + "epoch": 0.037519408746496195, + "grad_norm": 0.07272092998027802, + "learning_rate": 0.0002, + "loss": 0.0881, + "step": 20630 + }, + { + "epoch": 0.03753759556605339, + "grad_norm": 0.23707769811153412, + "learning_rate": 0.0002, + "loss": 0.0706, + "step": 20640 + }, + { + "epoch": 0.03755578238561059, + "grad_norm": 0.05208323150873184, + "learning_rate": 0.0002, + "loss": 0.024, + "step": 20650 + }, + { + "epoch": 0.037573969205167784, + "grad_norm": 0.4163022041320801, + "learning_rate": 0.0002, + "loss": 0.159, + "step": 20660 + }, + { + "epoch": 0.03759215602472498, + "grad_norm": 0.1036575511097908, + "learning_rate": 0.0002, + "loss": 0.0814, + "step": 20670 + }, + { + "epoch": 0.03761034284428218, + "grad_norm": 0.09861626476049423, + "learning_rate": 0.0002, + "loss": 0.0828, + "step": 20680 + }, + { + "epoch": 0.03762852966383937, + "grad_norm": 0.1685744971036911, + "learning_rate": 0.0002, + "loss": 0.0597, + "step": 20690 + }, + { + "epoch": 0.03764671648339657, + "grad_norm": 0.02716050110757351, + "learning_rate": 0.0002, + "loss": 0.0164, + "step": 20700 + }, + { + "epoch": 0.037664903302953766, + "grad_norm": 0.46858713030815125, + "learning_rate": 0.0002, + "loss": 0.1596, + "step": 20710 + }, + { + "epoch": 0.03768309012251096, + "grad_norm": 0.15260715782642365, + "learning_rate": 0.0002, + "loss": 0.0835, + "step": 20720 + }, + { + "epoch": 0.03770127694206816, + "grad_norm": 0.2063397914171219, + "learning_rate": 0.0002, + "loss": 0.0845, + "step": 20730 + }, + { + "epoch": 0.037719463761625355, + "grad_norm": 0.16447599232196808, + "learning_rate": 0.0002, + "loss": 0.0595, + "step": 20740 + }, + { + "epoch": 0.03773765058118255, + "grad_norm": 0.020755184814333916, + "learning_rate": 0.0002, + "loss": 0.0164, + "step": 20750 + }, + { + "epoch": 0.03775583740073975, + "grad_norm": 0.23675021529197693, + "learning_rate": 0.0002, + "loss": 0.1634, + "step": 20760 + }, + { + "epoch": 0.037774024220296944, + "grad_norm": 0.08625516295433044, + "learning_rate": 0.0002, + "loss": 0.0685, + "step": 20770 + }, + { + "epoch": 0.03779221103985414, + "grad_norm": 0.043796882033348083, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 20780 + }, + { + "epoch": 0.037810397859411336, + "grad_norm": 0.20600435137748718, + "learning_rate": 0.0002, + "loss": 0.0651, + "step": 20790 + }, + { + "epoch": 0.03782858467896853, + "grad_norm": 0.04963940382003784, + "learning_rate": 0.0002, + "loss": 0.0202, + "step": 20800 + }, + { + "epoch": 0.03784677149852573, + "grad_norm": 0.34920167922973633, + "learning_rate": 0.0002, + "loss": 0.1494, + "step": 20810 + }, + { + "epoch": 0.037864958318082925, + "grad_norm": 0.18662041425704956, + "learning_rate": 0.0002, + "loss": 0.0823, + "step": 20820 + }, + { + "epoch": 0.03788314513764012, + "grad_norm": 0.12615887820720673, + "learning_rate": 0.0002, + "loss": 0.0856, + "step": 20830 + }, + { + "epoch": 0.03790133195719732, + "grad_norm": 0.1857282668352127, + "learning_rate": 0.0002, + "loss": 0.0676, + "step": 20840 + }, + { + "epoch": 0.037919518776754514, + "grad_norm": 0.05569197237491608, + "learning_rate": 0.0002, + "loss": 0.0181, + "step": 20850 + }, + { + "epoch": 0.03793770559631171, + "grad_norm": 0.29011765122413635, + "learning_rate": 0.0002, + "loss": 0.1418, + "step": 20860 + }, + { + "epoch": 0.03795589241586891, + "grad_norm": 0.14119744300842285, + "learning_rate": 0.0002, + "loss": 0.0812, + "step": 20870 + }, + { + "epoch": 0.0379740792354261, + "grad_norm": 0.039884984493255615, + "learning_rate": 0.0002, + "loss": 0.0781, + "step": 20880 + }, + { + "epoch": 0.0379922660549833, + "grad_norm": 0.23705685138702393, + "learning_rate": 0.0002, + "loss": 0.0621, + "step": 20890 + }, + { + "epoch": 0.038010452874540496, + "grad_norm": 0.07462739199399948, + "learning_rate": 0.0002, + "loss": 0.022, + "step": 20900 + }, + { + "epoch": 0.03802863969409769, + "grad_norm": 0.2610052824020386, + "learning_rate": 0.0002, + "loss": 0.1517, + "step": 20910 + }, + { + "epoch": 0.03804682651365489, + "grad_norm": 0.12775090336799622, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 20920 + }, + { + "epoch": 0.038065013333212085, + "grad_norm": 0.03661905974149704, + "learning_rate": 0.0002, + "loss": 0.0738, + "step": 20930 + }, + { + "epoch": 0.03808320015276928, + "grad_norm": 0.20907218754291534, + "learning_rate": 0.0002, + "loss": 0.0627, + "step": 20940 + }, + { + "epoch": 0.03810138697232648, + "grad_norm": 0.022804679349064827, + "learning_rate": 0.0002, + "loss": 0.0205, + "step": 20950 + }, + { + "epoch": 0.03811957379188368, + "grad_norm": 0.258284330368042, + "learning_rate": 0.0002, + "loss": 0.1428, + "step": 20960 + }, + { + "epoch": 0.03813776061144088, + "grad_norm": 0.1477317065000534, + "learning_rate": 0.0002, + "loss": 0.0789, + "step": 20970 + }, + { + "epoch": 0.03815594743099807, + "grad_norm": 0.0610325001180172, + "learning_rate": 0.0002, + "loss": 0.0836, + "step": 20980 + }, + { + "epoch": 0.03817413425055527, + "grad_norm": 0.18825507164001465, + "learning_rate": 0.0002, + "loss": 0.0621, + "step": 20990 + }, + { + "epoch": 0.038192321070112466, + "grad_norm": 0.03943372145295143, + "learning_rate": 0.0002, + "loss": 0.0185, + "step": 21000 + }, + { + "epoch": 0.03821050788966966, + "grad_norm": 0.34519344568252563, + "learning_rate": 0.0002, + "loss": 0.1345, + "step": 21010 + }, + { + "epoch": 0.03822869470922686, + "grad_norm": 0.09635084867477417, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 21020 + }, + { + "epoch": 0.038246881528784055, + "grad_norm": 0.032520972192287445, + "learning_rate": 0.0002, + "loss": 0.082, + "step": 21030 + }, + { + "epoch": 0.03826506834834125, + "grad_norm": 0.18068930506706238, + "learning_rate": 0.0002, + "loss": 0.0609, + "step": 21040 + }, + { + "epoch": 0.03828325516789845, + "grad_norm": 0.05550973862409592, + "learning_rate": 0.0002, + "loss": 0.0241, + "step": 21050 + }, + { + "epoch": 0.038301441987455644, + "grad_norm": 0.19561107456684113, + "learning_rate": 0.0002, + "loss": 0.1337, + "step": 21060 + }, + { + "epoch": 0.03831962880701284, + "grad_norm": 0.1852179914712906, + "learning_rate": 0.0002, + "loss": 0.0724, + "step": 21070 + }, + { + "epoch": 0.038337815626570036, + "grad_norm": 0.11915116757154465, + "learning_rate": 0.0002, + "loss": 0.0836, + "step": 21080 + }, + { + "epoch": 0.03835600244612723, + "grad_norm": 0.21116836369037628, + "learning_rate": 0.0002, + "loss": 0.0628, + "step": 21090 + }, + { + "epoch": 0.03837418926568443, + "grad_norm": 0.042745884507894516, + "learning_rate": 0.0002, + "loss": 0.0214, + "step": 21100 + }, + { + "epoch": 0.038392376085241625, + "grad_norm": 0.43089792132377625, + "learning_rate": 0.0002, + "loss": 0.1351, + "step": 21110 + }, + { + "epoch": 0.03841056290479882, + "grad_norm": 0.09607810527086258, + "learning_rate": 0.0002, + "loss": 0.0778, + "step": 21120 + }, + { + "epoch": 0.03842874972435602, + "grad_norm": 0.13603460788726807, + "learning_rate": 0.0002, + "loss": 0.0787, + "step": 21130 + }, + { + "epoch": 0.038446936543913214, + "grad_norm": 0.20110103487968445, + "learning_rate": 0.0002, + "loss": 0.067, + "step": 21140 + }, + { + "epoch": 0.03846512336347041, + "grad_norm": 0.042503997683525085, + "learning_rate": 0.0002, + "loss": 0.0194, + "step": 21150 + }, + { + "epoch": 0.03848331018302761, + "grad_norm": 0.2605084478855133, + "learning_rate": 0.0002, + "loss": 0.1374, + "step": 21160 + }, + { + "epoch": 0.0385014970025848, + "grad_norm": 0.09476794302463531, + "learning_rate": 0.0002, + "loss": 0.078, + "step": 21170 + }, + { + "epoch": 0.038519683822142, + "grad_norm": 0.03458428382873535, + "learning_rate": 0.0002, + "loss": 0.08, + "step": 21180 + }, + { + "epoch": 0.038537870641699196, + "grad_norm": 0.31196194887161255, + "learning_rate": 0.0002, + "loss": 0.0664, + "step": 21190 + }, + { + "epoch": 0.03855605746125639, + "grad_norm": 0.037113118916749954, + "learning_rate": 0.0002, + "loss": 0.0221, + "step": 21200 + }, + { + "epoch": 0.03857424428081359, + "grad_norm": 0.3699415922164917, + "learning_rate": 0.0002, + "loss": 0.1534, + "step": 21210 + }, + { + "epoch": 0.038592431100370785, + "grad_norm": 0.06454256922006607, + "learning_rate": 0.0002, + "loss": 0.0762, + "step": 21220 + }, + { + "epoch": 0.03861061791992798, + "grad_norm": 0.09858033806085587, + "learning_rate": 0.0002, + "loss": 0.0785, + "step": 21230 + }, + { + "epoch": 0.03862880473948518, + "grad_norm": 0.1482791304588318, + "learning_rate": 0.0002, + "loss": 0.062, + "step": 21240 + }, + { + "epoch": 0.038646991559042373, + "grad_norm": 0.031473588198423386, + "learning_rate": 0.0002, + "loss": 0.0163, + "step": 21250 + }, + { + "epoch": 0.03866517837859957, + "grad_norm": 0.09360513091087341, + "learning_rate": 0.0002, + "loss": 0.1397, + "step": 21260 + }, + { + "epoch": 0.038683365198156766, + "grad_norm": 0.10830901563167572, + "learning_rate": 0.0002, + "loss": 0.0789, + "step": 21270 + }, + { + "epoch": 0.03870155201771396, + "grad_norm": 0.08910014480352402, + "learning_rate": 0.0002, + "loss": 0.0758, + "step": 21280 + }, + { + "epoch": 0.03871973883727116, + "grad_norm": 0.21524523198604584, + "learning_rate": 0.0002, + "loss": 0.0628, + "step": 21290 + }, + { + "epoch": 0.038737925656828355, + "grad_norm": 0.03794678673148155, + "learning_rate": 0.0002, + "loss": 0.0229, + "step": 21300 + }, + { + "epoch": 0.03875611247638555, + "grad_norm": 0.46754345297813416, + "learning_rate": 0.0002, + "loss": 0.1291, + "step": 21310 + }, + { + "epoch": 0.03877429929594275, + "grad_norm": 0.07472983002662659, + "learning_rate": 0.0002, + "loss": 0.076, + "step": 21320 + }, + { + "epoch": 0.038792486115499944, + "grad_norm": 0.11820811778306961, + "learning_rate": 0.0002, + "loss": 0.0772, + "step": 21330 + }, + { + "epoch": 0.03881067293505714, + "grad_norm": 0.21140390634536743, + "learning_rate": 0.0002, + "loss": 0.0539, + "step": 21340 + }, + { + "epoch": 0.03882885975461434, + "grad_norm": 0.044819217175245285, + "learning_rate": 0.0002, + "loss": 0.0228, + "step": 21350 + }, + { + "epoch": 0.03884704657417153, + "grad_norm": 0.2267816811800003, + "learning_rate": 0.0002, + "loss": 0.1462, + "step": 21360 + }, + { + "epoch": 0.03886523339372873, + "grad_norm": 0.10087496787309647, + "learning_rate": 0.0002, + "loss": 0.0766, + "step": 21370 + }, + { + "epoch": 0.038883420213285926, + "grad_norm": 0.09982341527938843, + "learning_rate": 0.0002, + "loss": 0.0798, + "step": 21380 + }, + { + "epoch": 0.03890160703284312, + "grad_norm": 0.21729151904582977, + "learning_rate": 0.0002, + "loss": 0.0586, + "step": 21390 + }, + { + "epoch": 0.03891979385240032, + "grad_norm": 0.020691821351647377, + "learning_rate": 0.0002, + "loss": 0.0175, + "step": 21400 + }, + { + "epoch": 0.038937980671957514, + "grad_norm": 0.33531665802001953, + "learning_rate": 0.0002, + "loss": 0.149, + "step": 21410 + }, + { + "epoch": 0.03895616749151471, + "grad_norm": 0.11777795851230621, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 21420 + }, + { + "epoch": 0.03897435431107191, + "grad_norm": 0.07860718667507172, + "learning_rate": 0.0002, + "loss": 0.0788, + "step": 21430 + }, + { + "epoch": 0.0389925411306291, + "grad_norm": 0.16030597686767578, + "learning_rate": 0.0002, + "loss": 0.0581, + "step": 21440 + }, + { + "epoch": 0.0390107279501863, + "grad_norm": 0.01747356541454792, + "learning_rate": 0.0002, + "loss": 0.0185, + "step": 21450 + }, + { + "epoch": 0.039028914769743496, + "grad_norm": 0.2313859909772873, + "learning_rate": 0.0002, + "loss": 0.1383, + "step": 21460 + }, + { + "epoch": 0.03904710158930069, + "grad_norm": 0.14510080218315125, + "learning_rate": 0.0002, + "loss": 0.0805, + "step": 21470 + }, + { + "epoch": 0.03906528840885789, + "grad_norm": 0.04511871561408043, + "learning_rate": 0.0002, + "loss": 0.0793, + "step": 21480 + }, + { + "epoch": 0.039083475228415085, + "grad_norm": 0.24205265939235687, + "learning_rate": 0.0002, + "loss": 0.0624, + "step": 21490 + }, + { + "epoch": 0.03910166204797228, + "grad_norm": 0.08096791058778763, + "learning_rate": 0.0002, + "loss": 0.0208, + "step": 21500 + }, + { + "epoch": 0.03911984886752948, + "grad_norm": 0.14405490458011627, + "learning_rate": 0.0002, + "loss": 0.1189, + "step": 21510 + }, + { + "epoch": 0.039138035687086674, + "grad_norm": 0.06753374636173248, + "learning_rate": 0.0002, + "loss": 0.0772, + "step": 21520 + }, + { + "epoch": 0.03915622250664387, + "grad_norm": 0.029025042429566383, + "learning_rate": 0.0002, + "loss": 0.0761, + "step": 21530 + }, + { + "epoch": 0.039174409326201066, + "grad_norm": 0.2987070381641388, + "learning_rate": 0.0002, + "loss": 0.0656, + "step": 21540 + }, + { + "epoch": 0.03919259614575826, + "grad_norm": 0.04445091262459755, + "learning_rate": 0.0002, + "loss": 0.0241, + "step": 21550 + }, + { + "epoch": 0.03921078296531546, + "grad_norm": 0.34976306557655334, + "learning_rate": 0.0002, + "loss": 0.138, + "step": 21560 + }, + { + "epoch": 0.039228969784872655, + "grad_norm": 0.07521916925907135, + "learning_rate": 0.0002, + "loss": 0.0774, + "step": 21570 + }, + { + "epoch": 0.03924715660442985, + "grad_norm": 0.1445412039756775, + "learning_rate": 0.0002, + "loss": 0.087, + "step": 21580 + }, + { + "epoch": 0.03926534342398705, + "grad_norm": 0.2688128352165222, + "learning_rate": 0.0002, + "loss": 0.0712, + "step": 21590 + }, + { + "epoch": 0.039283530243544244, + "grad_norm": 0.05321233719587326, + "learning_rate": 0.0002, + "loss": 0.0245, + "step": 21600 + }, + { + "epoch": 0.03930171706310144, + "grad_norm": 0.44459134340286255, + "learning_rate": 0.0002, + "loss": 0.1524, + "step": 21610 + }, + { + "epoch": 0.03931990388265864, + "grad_norm": 0.13169553875923157, + "learning_rate": 0.0002, + "loss": 0.0726, + "step": 21620 + }, + { + "epoch": 0.03933809070221583, + "grad_norm": 0.0908237174153328, + "learning_rate": 0.0002, + "loss": 0.0785, + "step": 21630 + }, + { + "epoch": 0.03935627752177303, + "grad_norm": 0.18110623955726624, + "learning_rate": 0.0002, + "loss": 0.0606, + "step": 21640 + }, + { + "epoch": 0.03937446434133023, + "grad_norm": 0.021362677216529846, + "learning_rate": 0.0002, + "loss": 0.0175, + "step": 21650 + }, + { + "epoch": 0.03939265116088743, + "grad_norm": 0.27973899245262146, + "learning_rate": 0.0002, + "loss": 0.1641, + "step": 21660 + }, + { + "epoch": 0.039410837980444625, + "grad_norm": 0.09090718626976013, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 21670 + }, + { + "epoch": 0.03942902480000182, + "grad_norm": 0.13408254086971283, + "learning_rate": 0.0002, + "loss": 0.0769, + "step": 21680 + }, + { + "epoch": 0.03944721161955902, + "grad_norm": 0.2530055046081543, + "learning_rate": 0.0002, + "loss": 0.0729, + "step": 21690 + }, + { + "epoch": 0.039465398439116214, + "grad_norm": 0.027523871511220932, + "learning_rate": 0.0002, + "loss": 0.017, + "step": 21700 + }, + { + "epoch": 0.03948358525867341, + "grad_norm": 0.2520642578601837, + "learning_rate": 0.0002, + "loss": 0.1804, + "step": 21710 + }, + { + "epoch": 0.03950177207823061, + "grad_norm": 0.11017465591430664, + "learning_rate": 0.0002, + "loss": 0.0767, + "step": 21720 + }, + { + "epoch": 0.0395199588977878, + "grad_norm": 0.05129052326083183, + "learning_rate": 0.0002, + "loss": 0.0723, + "step": 21730 + }, + { + "epoch": 0.039538145717345, + "grad_norm": 0.1846659779548645, + "learning_rate": 0.0002, + "loss": 0.0619, + "step": 21740 + }, + { + "epoch": 0.039556332536902196, + "grad_norm": 0.014305013231933117, + "learning_rate": 0.0002, + "loss": 0.0171, + "step": 21750 + }, + { + "epoch": 0.03957451935645939, + "grad_norm": 0.21667814254760742, + "learning_rate": 0.0002, + "loss": 0.157, + "step": 21760 + }, + { + "epoch": 0.03959270617601659, + "grad_norm": 0.21456903219223022, + "learning_rate": 0.0002, + "loss": 0.0803, + "step": 21770 + }, + { + "epoch": 0.039610892995573785, + "grad_norm": 0.03621416166424751, + "learning_rate": 0.0002, + "loss": 0.0796, + "step": 21780 + }, + { + "epoch": 0.03962907981513098, + "grad_norm": 0.20819205045700073, + "learning_rate": 0.0002, + "loss": 0.0633, + "step": 21790 + }, + { + "epoch": 0.03964726663468818, + "grad_norm": 0.06860963255167007, + "learning_rate": 0.0002, + "loss": 0.0172, + "step": 21800 + }, + { + "epoch": 0.039665453454245374, + "grad_norm": 0.2568039894104004, + "learning_rate": 0.0002, + "loss": 0.134, + "step": 21810 + }, + { + "epoch": 0.03968364027380257, + "grad_norm": 0.08747372031211853, + "learning_rate": 0.0002, + "loss": 0.0753, + "step": 21820 + }, + { + "epoch": 0.039701827093359766, + "grad_norm": 0.13403570652008057, + "learning_rate": 0.0002, + "loss": 0.0807, + "step": 21830 + }, + { + "epoch": 0.03972001391291696, + "grad_norm": 0.20756667852401733, + "learning_rate": 0.0002, + "loss": 0.0625, + "step": 21840 + }, + { + "epoch": 0.03973820073247416, + "grad_norm": 0.03678170591592789, + "learning_rate": 0.0002, + "loss": 0.019, + "step": 21850 + }, + { + "epoch": 0.039756387552031355, + "grad_norm": 0.1847693920135498, + "learning_rate": 0.0002, + "loss": 0.1385, + "step": 21860 + }, + { + "epoch": 0.03977457437158855, + "grad_norm": 0.1627635508775711, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 21870 + }, + { + "epoch": 0.03979276119114575, + "grad_norm": 0.0535571426153183, + "learning_rate": 0.0002, + "loss": 0.0741, + "step": 21880 + }, + { + "epoch": 0.039810948010702944, + "grad_norm": 0.3128276765346527, + "learning_rate": 0.0002, + "loss": 0.0598, + "step": 21890 + }, + { + "epoch": 0.03982913483026014, + "grad_norm": 0.03369860351085663, + "learning_rate": 0.0002, + "loss": 0.0217, + "step": 21900 + }, + { + "epoch": 0.03984732164981734, + "grad_norm": 0.1962599903345108, + "learning_rate": 0.0002, + "loss": 0.1319, + "step": 21910 + }, + { + "epoch": 0.03986550846937453, + "grad_norm": 0.1397421509027481, + "learning_rate": 0.0002, + "loss": 0.068, + "step": 21920 + }, + { + "epoch": 0.03988369528893173, + "grad_norm": 0.10252605378627777, + "learning_rate": 0.0002, + "loss": 0.0736, + "step": 21930 + }, + { + "epoch": 0.039901882108488926, + "grad_norm": 0.22179432213306427, + "learning_rate": 0.0002, + "loss": 0.0625, + "step": 21940 + }, + { + "epoch": 0.03992006892804612, + "grad_norm": 0.06068069487810135, + "learning_rate": 0.0002, + "loss": 0.0242, + "step": 21950 + }, + { + "epoch": 0.03993825574760332, + "grad_norm": 0.20243950188159943, + "learning_rate": 0.0002, + "loss": 0.143, + "step": 21960 + }, + { + "epoch": 0.039956442567160515, + "grad_norm": 0.11786511540412903, + "learning_rate": 0.0002, + "loss": 0.0779, + "step": 21970 + }, + { + "epoch": 0.03997462938671771, + "grad_norm": 0.08299421519041061, + "learning_rate": 0.0002, + "loss": 0.0774, + "step": 21980 + }, + { + "epoch": 0.03999281620627491, + "grad_norm": 0.2844075858592987, + "learning_rate": 0.0002, + "loss": 0.0711, + "step": 21990 + }, + { + "epoch": 0.040011003025832104, + "grad_norm": 0.034433312714099884, + "learning_rate": 0.0002, + "loss": 0.0217, + "step": 22000 + }, + { + "epoch": 0.0400291898453893, + "grad_norm": 0.3878481388092041, + "learning_rate": 0.0002, + "loss": 0.1525, + "step": 22010 + }, + { + "epoch": 0.040047376664946496, + "grad_norm": 0.16157971322536469, + "learning_rate": 0.0002, + "loss": 0.0788, + "step": 22020 + }, + { + "epoch": 0.04006556348450369, + "grad_norm": 0.10347063094377518, + "learning_rate": 0.0002, + "loss": 0.0809, + "step": 22030 + }, + { + "epoch": 0.04008375030406089, + "grad_norm": 0.20982638001441956, + "learning_rate": 0.0002, + "loss": 0.0662, + "step": 22040 + }, + { + "epoch": 0.040101937123618085, + "grad_norm": 5.856126308441162, + "learning_rate": 0.0002, + "loss": 0.0578, + "step": 22050 + }, + { + "epoch": 0.04012012394317528, + "grad_norm": 0.21289357542991638, + "learning_rate": 0.0002, + "loss": 0.1257, + "step": 22060 + }, + { + "epoch": 0.04013831076273248, + "grad_norm": 0.040848907083272934, + "learning_rate": 0.0002, + "loss": 0.0783, + "step": 22070 + }, + { + "epoch": 0.040156497582289674, + "grad_norm": 0.056517478078603745, + "learning_rate": 0.0002, + "loss": 0.0693, + "step": 22080 + }, + { + "epoch": 0.04017468440184687, + "grad_norm": 0.274312287569046, + "learning_rate": 0.0002, + "loss": 0.0685, + "step": 22090 + }, + { + "epoch": 0.04019287122140407, + "grad_norm": 0.06353340297937393, + "learning_rate": 0.0002, + "loss": 0.0263, + "step": 22100 + }, + { + "epoch": 0.04021105804096126, + "grad_norm": 0.287201464176178, + "learning_rate": 0.0002, + "loss": 0.1425, + "step": 22110 + }, + { + "epoch": 0.04022924486051846, + "grad_norm": 0.0990116223692894, + "learning_rate": 0.0002, + "loss": 0.0732, + "step": 22120 + }, + { + "epoch": 0.040247431680075656, + "grad_norm": 0.03471527248620987, + "learning_rate": 0.0002, + "loss": 0.0806, + "step": 22130 + }, + { + "epoch": 0.04026561849963285, + "grad_norm": 0.16411902010440826, + "learning_rate": 0.0002, + "loss": 0.0646, + "step": 22140 + }, + { + "epoch": 0.04028380531919005, + "grad_norm": 0.032927367836236954, + "learning_rate": 0.0002, + "loss": 0.0225, + "step": 22150 + }, + { + "epoch": 0.040301992138747245, + "grad_norm": 0.31128716468811035, + "learning_rate": 0.0002, + "loss": 0.1227, + "step": 22160 + }, + { + "epoch": 0.04032017895830444, + "grad_norm": 0.14056596159934998, + "learning_rate": 0.0002, + "loss": 0.0866, + "step": 22170 + }, + { + "epoch": 0.04033836577786164, + "grad_norm": 0.10555677115917206, + "learning_rate": 0.0002, + "loss": 0.0785, + "step": 22180 + }, + { + "epoch": 0.040356552597418834, + "grad_norm": 0.25597816705703735, + "learning_rate": 0.0002, + "loss": 0.0667, + "step": 22190 + }, + { + "epoch": 0.04037473941697603, + "grad_norm": 0.04694845899939537, + "learning_rate": 0.0002, + "loss": 0.021, + "step": 22200 + }, + { + "epoch": 0.040392926236533226, + "grad_norm": 0.2536766529083252, + "learning_rate": 0.0002, + "loss": 0.1485, + "step": 22210 + }, + { + "epoch": 0.04041111305609042, + "grad_norm": 0.0536673367023468, + "learning_rate": 0.0002, + "loss": 0.0737, + "step": 22220 + }, + { + "epoch": 0.04042929987564762, + "grad_norm": 0.13121111690998077, + "learning_rate": 0.0002, + "loss": 0.0793, + "step": 22230 + }, + { + "epoch": 0.040447486695204815, + "grad_norm": 0.23850645124912262, + "learning_rate": 0.0002, + "loss": 0.0698, + "step": 22240 + }, + { + "epoch": 0.04046567351476201, + "grad_norm": 0.04178560525178909, + "learning_rate": 0.0002, + "loss": 0.0212, + "step": 22250 + }, + { + "epoch": 0.04048386033431921, + "grad_norm": 0.42834579944610596, + "learning_rate": 0.0002, + "loss": 0.1352, + "step": 22260 + }, + { + "epoch": 0.040502047153876404, + "grad_norm": 0.050178542733192444, + "learning_rate": 0.0002, + "loss": 0.0853, + "step": 22270 + }, + { + "epoch": 0.0405202339734336, + "grad_norm": 0.042758237570524216, + "learning_rate": 0.0002, + "loss": 0.0709, + "step": 22280 + }, + { + "epoch": 0.0405384207929908, + "grad_norm": 0.2604416012763977, + "learning_rate": 0.0002, + "loss": 0.0643, + "step": 22290 + }, + { + "epoch": 0.04055660761254799, + "grad_norm": 0.06166388466954231, + "learning_rate": 0.0002, + "loss": 0.0236, + "step": 22300 + }, + { + "epoch": 0.04057479443210519, + "grad_norm": 0.2337518334388733, + "learning_rate": 0.0002, + "loss": 0.132, + "step": 22310 + }, + { + "epoch": 0.040592981251662386, + "grad_norm": 0.15794694423675537, + "learning_rate": 0.0002, + "loss": 0.0739, + "step": 22320 + }, + { + "epoch": 0.04061116807121959, + "grad_norm": 0.12059915065765381, + "learning_rate": 0.0002, + "loss": 0.0743, + "step": 22330 + }, + { + "epoch": 0.040629354890776785, + "grad_norm": 0.25351977348327637, + "learning_rate": 0.0002, + "loss": 0.065, + "step": 22340 + }, + { + "epoch": 0.04064754171033398, + "grad_norm": 0.03265364468097687, + "learning_rate": 0.0002, + "loss": 0.02, + "step": 22350 + }, + { + "epoch": 0.04066572852989118, + "grad_norm": 0.22959749400615692, + "learning_rate": 0.0002, + "loss": 0.1278, + "step": 22360 + }, + { + "epoch": 0.040683915349448374, + "grad_norm": 0.11381889134645462, + "learning_rate": 0.0002, + "loss": 0.0823, + "step": 22370 + }, + { + "epoch": 0.04070210216900557, + "grad_norm": 0.03541165217757225, + "learning_rate": 0.0002, + "loss": 0.0809, + "step": 22380 + }, + { + "epoch": 0.04072028898856277, + "grad_norm": 0.20604047179222107, + "learning_rate": 0.0002, + "loss": 0.0693, + "step": 22390 + }, + { + "epoch": 0.04073847580811996, + "grad_norm": 0.051576532423496246, + "learning_rate": 0.0002, + "loss": 0.0213, + "step": 22400 + }, + { + "epoch": 0.04075666262767716, + "grad_norm": 0.208265483379364, + "learning_rate": 0.0002, + "loss": 0.1203, + "step": 22410 + }, + { + "epoch": 0.040774849447234356, + "grad_norm": 0.14376410841941833, + "learning_rate": 0.0002, + "loss": 0.0832, + "step": 22420 + }, + { + "epoch": 0.04079303626679155, + "grad_norm": 0.0634629875421524, + "learning_rate": 0.0002, + "loss": 0.0797, + "step": 22430 + }, + { + "epoch": 0.04081122308634875, + "grad_norm": 0.22782418131828308, + "learning_rate": 0.0002, + "loss": 0.0594, + "step": 22440 + }, + { + "epoch": 0.040829409905905945, + "grad_norm": 0.034153662621974945, + "learning_rate": 0.0002, + "loss": 0.0197, + "step": 22450 + }, + { + "epoch": 0.04084759672546314, + "grad_norm": 0.22994177043437958, + "learning_rate": 0.0002, + "loss": 0.1276, + "step": 22460 + }, + { + "epoch": 0.04086578354502034, + "grad_norm": 0.37397289276123047, + "learning_rate": 0.0002, + "loss": 0.0794, + "step": 22470 + }, + { + "epoch": 0.040883970364577533, + "grad_norm": 0.03585643321275711, + "learning_rate": 0.0002, + "loss": 0.0765, + "step": 22480 + }, + { + "epoch": 0.04090215718413473, + "grad_norm": 0.2266087681055069, + "learning_rate": 0.0002, + "loss": 0.0661, + "step": 22490 + }, + { + "epoch": 0.040920344003691926, + "grad_norm": 0.03867397829890251, + "learning_rate": 0.0002, + "loss": 0.0241, + "step": 22500 + }, + { + "epoch": 0.04093853082324912, + "grad_norm": 0.23483702540397644, + "learning_rate": 0.0002, + "loss": 0.1442, + "step": 22510 + }, + { + "epoch": 0.04095671764280632, + "grad_norm": 0.11447428911924362, + "learning_rate": 0.0002, + "loss": 0.0759, + "step": 22520 + }, + { + "epoch": 0.040974904462363515, + "grad_norm": 0.1060417965054512, + "learning_rate": 0.0002, + "loss": 0.0792, + "step": 22530 + }, + { + "epoch": 0.04099309128192071, + "grad_norm": 0.1915966123342514, + "learning_rate": 0.0002, + "loss": 0.0679, + "step": 22540 + }, + { + "epoch": 0.04101127810147791, + "grad_norm": 0.05328527092933655, + "learning_rate": 0.0002, + "loss": 0.0211, + "step": 22550 + }, + { + "epoch": 0.041029464921035104, + "grad_norm": 0.31612515449523926, + "learning_rate": 0.0002, + "loss": 0.1395, + "step": 22560 + }, + { + "epoch": 0.0410476517405923, + "grad_norm": 0.1860841065645218, + "learning_rate": 0.0002, + "loss": 0.0704, + "step": 22570 + }, + { + "epoch": 0.0410658385601495, + "grad_norm": 0.11183702945709229, + "learning_rate": 0.0002, + "loss": 0.073, + "step": 22580 + }, + { + "epoch": 0.04108402537970669, + "grad_norm": 0.2028307020664215, + "learning_rate": 0.0002, + "loss": 0.0592, + "step": 22590 + }, + { + "epoch": 0.04110221219926389, + "grad_norm": 0.032915905117988586, + "learning_rate": 0.0002, + "loss": 0.0211, + "step": 22600 + }, + { + "epoch": 0.041120399018821086, + "grad_norm": 0.2932131588459015, + "learning_rate": 0.0002, + "loss": 0.1542, + "step": 22610 + }, + { + "epoch": 0.04113858583837828, + "grad_norm": 0.08883325010538101, + "learning_rate": 0.0002, + "loss": 0.079, + "step": 22620 + }, + { + "epoch": 0.04115677265793548, + "grad_norm": 0.07874555885791779, + "learning_rate": 0.0002, + "loss": 0.0801, + "step": 22630 + }, + { + "epoch": 0.041174959477492674, + "grad_norm": 0.13785040378570557, + "learning_rate": 0.0002, + "loss": 0.0636, + "step": 22640 + }, + { + "epoch": 0.04119314629704987, + "grad_norm": 0.0321812778711319, + "learning_rate": 0.0002, + "loss": 0.0208, + "step": 22650 + }, + { + "epoch": 0.04121133311660707, + "grad_norm": 0.142785906791687, + "learning_rate": 0.0002, + "loss": 0.1292, + "step": 22660 + }, + { + "epoch": 0.04122951993616426, + "grad_norm": 0.15572668612003326, + "learning_rate": 0.0002, + "loss": 0.0774, + "step": 22670 + }, + { + "epoch": 0.04124770675572146, + "grad_norm": 0.033191781491041183, + "learning_rate": 0.0002, + "loss": 0.0805, + "step": 22680 + }, + { + "epoch": 0.041265893575278656, + "grad_norm": 0.23840776085853577, + "learning_rate": 0.0002, + "loss": 0.06, + "step": 22690 + }, + { + "epoch": 0.04128408039483585, + "grad_norm": 0.05943412706255913, + "learning_rate": 0.0002, + "loss": 0.0215, + "step": 22700 + }, + { + "epoch": 0.04130226721439305, + "grad_norm": 0.05142183229327202, + "learning_rate": 0.0002, + "loss": 0.1181, + "step": 22710 + }, + { + "epoch": 0.041320454033950245, + "grad_norm": 0.1583058387041092, + "learning_rate": 0.0002, + "loss": 0.0822, + "step": 22720 + }, + { + "epoch": 0.04133864085350744, + "grad_norm": 0.035809941589832306, + "learning_rate": 0.0002, + "loss": 0.0723, + "step": 22730 + }, + { + "epoch": 0.04135682767306464, + "grad_norm": 0.24066607654094696, + "learning_rate": 0.0002, + "loss": 0.0621, + "step": 22740 + }, + { + "epoch": 0.041375014492621834, + "grad_norm": 0.0327225998044014, + "learning_rate": 0.0002, + "loss": 0.0264, + "step": 22750 + }, + { + "epoch": 0.04139320131217903, + "grad_norm": 0.16599033772945404, + "learning_rate": 0.0002, + "loss": 0.1082, + "step": 22760 + }, + { + "epoch": 0.041411388131736226, + "grad_norm": 0.18834830820560455, + "learning_rate": 0.0002, + "loss": 0.0767, + "step": 22770 + }, + { + "epoch": 0.04142957495129342, + "grad_norm": 0.04162973538041115, + "learning_rate": 0.0002, + "loss": 0.0821, + "step": 22780 + }, + { + "epoch": 0.04144776177085062, + "grad_norm": 0.21065399050712585, + "learning_rate": 0.0002, + "loss": 0.0591, + "step": 22790 + }, + { + "epoch": 0.041465948590407815, + "grad_norm": 0.03744394704699516, + "learning_rate": 0.0002, + "loss": 0.0199, + "step": 22800 + }, + { + "epoch": 0.04148413540996501, + "grad_norm": 0.30440911650657654, + "learning_rate": 0.0002, + "loss": 0.1321, + "step": 22810 + }, + { + "epoch": 0.04150232222952221, + "grad_norm": 0.07215052098035812, + "learning_rate": 0.0002, + "loss": 0.077, + "step": 22820 + }, + { + "epoch": 0.041520509049079404, + "grad_norm": 0.0822744220495224, + "learning_rate": 0.0002, + "loss": 0.0695, + "step": 22830 + }, + { + "epoch": 0.0415386958686366, + "grad_norm": 0.20610104501247406, + "learning_rate": 0.0002, + "loss": 0.0668, + "step": 22840 + }, + { + "epoch": 0.0415568826881938, + "grad_norm": 0.05089128017425537, + "learning_rate": 0.0002, + "loss": 0.0275, + "step": 22850 + }, + { + "epoch": 0.04157506950775099, + "grad_norm": 0.23365797102451324, + "learning_rate": 0.0002, + "loss": 0.1308, + "step": 22860 + }, + { + "epoch": 0.04159325632730819, + "grad_norm": 0.03983612358570099, + "learning_rate": 0.0002, + "loss": 0.0738, + "step": 22870 + }, + { + "epoch": 0.041611443146865386, + "grad_norm": 0.12472117692232132, + "learning_rate": 0.0002, + "loss": 0.082, + "step": 22880 + }, + { + "epoch": 0.04162962996642258, + "grad_norm": 0.19599118828773499, + "learning_rate": 0.0002, + "loss": 0.0614, + "step": 22890 + }, + { + "epoch": 0.04164781678597978, + "grad_norm": 0.04077763110399246, + "learning_rate": 0.0002, + "loss": 0.0298, + "step": 22900 + }, + { + "epoch": 0.041666003605536975, + "grad_norm": 0.3027828633785248, + "learning_rate": 0.0002, + "loss": 0.1294, + "step": 22910 + }, + { + "epoch": 0.04168419042509417, + "grad_norm": 0.1551598757505417, + "learning_rate": 0.0002, + "loss": 0.0716, + "step": 22920 + }, + { + "epoch": 0.04170237724465137, + "grad_norm": 0.06512947380542755, + "learning_rate": 0.0002, + "loss": 0.071, + "step": 22930 + }, + { + "epoch": 0.041720564064208564, + "grad_norm": 0.2486017346382141, + "learning_rate": 0.0002, + "loss": 0.0726, + "step": 22940 + }, + { + "epoch": 0.04173875088376576, + "grad_norm": 0.0658118799328804, + "learning_rate": 0.0002, + "loss": 0.0211, + "step": 22950 + }, + { + "epoch": 0.041756937703322956, + "grad_norm": 0.18327641487121582, + "learning_rate": 0.0002, + "loss": 0.1307, + "step": 22960 + }, + { + "epoch": 0.04177512452288015, + "grad_norm": 0.06218123063445091, + "learning_rate": 0.0002, + "loss": 0.0839, + "step": 22970 + }, + { + "epoch": 0.04179331134243735, + "grad_norm": 0.07085203379392624, + "learning_rate": 0.0002, + "loss": 0.0786, + "step": 22980 + }, + { + "epoch": 0.041811498161994545, + "grad_norm": 0.19552426040172577, + "learning_rate": 0.0002, + "loss": 0.0646, + "step": 22990 + }, + { + "epoch": 0.04182968498155174, + "grad_norm": 0.06710335612297058, + "learning_rate": 0.0002, + "loss": 0.0252, + "step": 23000 + } + ], + "logging_steps": 10, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0398767809662812e+19, + "train_batch_size": 24, + "trial_name": null, + "trial_params": null +}