{ "best_metric": 0.08026164770126343, "best_model_checkpoint": "./vit-base-cifar10/checkpoint-4300", "epoch": 2.0, "eval_steps": 100, "global_step": 4376, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004570383912248629, "grad_norm": 7.92866325378418, "learning_rate": 0.00019954296160877515, "loss": 0.2503, "step": 10 }, { "epoch": 0.009140767824497258, "grad_norm": 0.6853145956993103, "learning_rate": 0.00019908592321755028, "loss": 0.2887, "step": 20 }, { "epoch": 0.013711151736745886, "grad_norm": 8.078825950622559, "learning_rate": 0.00019862888482632542, "loss": 0.3522, "step": 30 }, { "epoch": 0.018281535648994516, "grad_norm": 0.5457130670547485, "learning_rate": 0.00019817184643510056, "loss": 0.2236, "step": 40 }, { "epoch": 0.022851919561243144, "grad_norm": 7.070144176483154, "learning_rate": 0.0001977148080438757, "loss": 0.3022, "step": 50 }, { "epoch": 0.027422303473491772, "grad_norm": 6.712225437164307, "learning_rate": 0.00019725776965265083, "loss": 0.2947, "step": 60 }, { "epoch": 0.031992687385740404, "grad_norm": 0.18951156735420227, "learning_rate": 0.00019680073126142596, "loss": 0.2885, "step": 70 }, { "epoch": 0.03656307129798903, "grad_norm": 5.366819858551025, "learning_rate": 0.0001963436928702011, "loss": 0.2822, "step": 80 }, { "epoch": 0.04113345521023766, "grad_norm": 1.0978411436080933, "learning_rate": 0.00019588665447897624, "loss": 0.2124, "step": 90 }, { "epoch": 0.04570383912248629, "grad_norm": 0.3122899532318115, "learning_rate": 0.0001954296160877514, "loss": 0.1043, "step": 100 }, { "epoch": 0.04570383912248629, "eval_accuracy": 0.919, "eval_loss": 0.285547137260437, "eval_runtime": 71.7238, "eval_samples_per_second": 209.136, "eval_steps_per_second": 13.078, "step": 100 }, { "epoch": 0.050274223034734916, "grad_norm": 0.872364342212677, "learning_rate": 0.00019497257769652654, "loss": 0.1525, "step": 110 }, { "epoch": 0.054844606946983544, "grad_norm": 0.4638870358467102, "learning_rate": 0.00019451553930530167, "loss": 0.269, "step": 120 }, { "epoch": 0.05941499085923217, "grad_norm": 4.21981143951416, "learning_rate": 0.0001940585009140768, "loss": 0.3743, "step": 130 }, { "epoch": 0.06398537477148081, "grad_norm": 0.4475237727165222, "learning_rate": 0.00019360146252285195, "loss": 0.2608, "step": 140 }, { "epoch": 0.06855575868372943, "grad_norm": 6.892650127410889, "learning_rate": 0.00019314442413162706, "loss": 0.271, "step": 150 }, { "epoch": 0.07312614259597806, "grad_norm": 5.913930416107178, "learning_rate": 0.0001926873857404022, "loss": 0.1474, "step": 160 }, { "epoch": 0.07769652650822668, "grad_norm": 2.620058298110962, "learning_rate": 0.00019223034734917733, "loss": 0.1543, "step": 170 }, { "epoch": 0.08226691042047532, "grad_norm": 8.168417930603027, "learning_rate": 0.00019177330895795246, "loss": 0.4235, "step": 180 }, { "epoch": 0.08683729433272395, "grad_norm": 6.655405044555664, "learning_rate": 0.0001913162705667276, "loss": 0.2081, "step": 190 }, { "epoch": 0.09140767824497258, "grad_norm": 0.3367229104042053, "learning_rate": 0.00019085923217550274, "loss": 0.2671, "step": 200 }, { "epoch": 0.09140767824497258, "eval_accuracy": 0.9015333333333333, "eval_loss": 0.3649848401546478, "eval_runtime": 70.014, "eval_samples_per_second": 214.243, "eval_steps_per_second": 13.397, "step": 200 }, { "epoch": 0.09597806215722121, "grad_norm": 7.15602970123291, "learning_rate": 0.00019040219378427787, "loss": 0.5093, "step": 210 }, { "epoch": 0.10054844606946983, "grad_norm": 3.229771137237549, "learning_rate": 0.000189945155393053, "loss": 0.4777, "step": 220 }, { "epoch": 0.10511882998171847, "grad_norm": 7.438943862915039, "learning_rate": 0.00018948811700182815, "loss": 0.407, "step": 230 }, { "epoch": 0.10968921389396709, "grad_norm": 8.4913969039917, "learning_rate": 0.00018903107861060328, "loss": 0.338, "step": 240 }, { "epoch": 0.11425959780621572, "grad_norm": 1.3278025388717651, "learning_rate": 0.00018857404021937845, "loss": 0.3518, "step": 250 }, { "epoch": 0.11882998171846434, "grad_norm": 9.502877235412598, "learning_rate": 0.00018811700182815358, "loss": 0.4069, "step": 260 }, { "epoch": 0.12340036563071298, "grad_norm": 3.407360553741455, "learning_rate": 0.00018765996343692872, "loss": 0.2093, "step": 270 }, { "epoch": 0.12797074954296161, "grad_norm": 9.035057067871094, "learning_rate": 0.00018720292504570386, "loss": 0.3598, "step": 280 }, { "epoch": 0.13254113345521024, "grad_norm": 0.40073174238204956, "learning_rate": 0.000186745886654479, "loss": 0.2389, "step": 290 }, { "epoch": 0.13711151736745886, "grad_norm": 0.32648196816444397, "learning_rate": 0.00018628884826325413, "loss": 0.2935, "step": 300 }, { "epoch": 0.13711151736745886, "eval_accuracy": 0.9066666666666666, "eval_loss": 0.31670910120010376, "eval_runtime": 69.8802, "eval_samples_per_second": 214.653, "eval_steps_per_second": 13.423, "step": 300 }, { "epoch": 0.1416819012797075, "grad_norm": 5.271216869354248, "learning_rate": 0.00018583180987202927, "loss": 0.3934, "step": 310 }, { "epoch": 0.14625228519195613, "grad_norm": 6.032111167907715, "learning_rate": 0.0001853747714808044, "loss": 0.2739, "step": 320 }, { "epoch": 0.15082266910420475, "grad_norm": 1.9436883926391602, "learning_rate": 0.00018491773308957954, "loss": 0.2362, "step": 330 }, { "epoch": 0.15539305301645337, "grad_norm": 7.990470886230469, "learning_rate": 0.00018446069469835467, "loss": 0.422, "step": 340 }, { "epoch": 0.15996343692870202, "grad_norm": 3.6017348766326904, "learning_rate": 0.0001840036563071298, "loss": 0.3151, "step": 350 }, { "epoch": 0.16453382084095064, "grad_norm": 4.97476053237915, "learning_rate": 0.00018354661791590495, "loss": 0.486, "step": 360 }, { "epoch": 0.16910420475319926, "grad_norm": 5.418762683868408, "learning_rate": 0.00018308957952468008, "loss": 0.5847, "step": 370 }, { "epoch": 0.1736745886654479, "grad_norm": 2.143413782119751, "learning_rate": 0.00018263254113345522, "loss": 0.3188, "step": 380 }, { "epoch": 0.17824497257769653, "grad_norm": 5.15682315826416, "learning_rate": 0.00018217550274223036, "loss": 0.27, "step": 390 }, { "epoch": 0.18281535648994515, "grad_norm": 4.638512134552002, "learning_rate": 0.0001817184643510055, "loss": 0.27, "step": 400 }, { "epoch": 0.18281535648994515, "eval_accuracy": 0.8922, "eval_loss": 0.35180962085723877, "eval_runtime": 69.6193, "eval_samples_per_second": 215.458, "eval_steps_per_second": 13.473, "step": 400 }, { "epoch": 0.18738574040219377, "grad_norm": 7.933419704437256, "learning_rate": 0.00018126142595978063, "loss": 0.2467, "step": 410 }, { "epoch": 0.19195612431444242, "grad_norm": 2.749178886413574, "learning_rate": 0.00018080438756855577, "loss": 0.283, "step": 420 }, { "epoch": 0.19652650822669104, "grad_norm": 4.662679672241211, "learning_rate": 0.0001803473491773309, "loss": 0.3666, "step": 430 }, { "epoch": 0.20109689213893966, "grad_norm": 8.538412094116211, "learning_rate": 0.00017989031078610604, "loss": 0.3315, "step": 440 }, { "epoch": 0.2056672760511883, "grad_norm": 7.226283550262451, "learning_rate": 0.00017943327239488118, "loss": 0.2998, "step": 450 }, { "epoch": 0.21023765996343693, "grad_norm": 7.400086402893066, "learning_rate": 0.0001789762340036563, "loss": 0.3298, "step": 460 }, { "epoch": 0.21480804387568556, "grad_norm": 2.4320435523986816, "learning_rate": 0.00017851919561243145, "loss": 0.3493, "step": 470 }, { "epoch": 0.21937842778793418, "grad_norm": 2.891914129257202, "learning_rate": 0.00017806215722120658, "loss": 0.2929, "step": 480 }, { "epoch": 0.22394881170018283, "grad_norm": 4.258346080780029, "learning_rate": 0.00017760511882998172, "loss": 0.3847, "step": 490 }, { "epoch": 0.22851919561243145, "grad_norm": 3.8576903343200684, "learning_rate": 0.00017714808043875686, "loss": 0.3634, "step": 500 }, { "epoch": 0.22851919561243145, "eval_accuracy": 0.8953333333333333, "eval_loss": 0.36601880192756653, "eval_runtime": 70.2316, "eval_samples_per_second": 213.579, "eval_steps_per_second": 13.356, "step": 500 }, { "epoch": 0.23308957952468007, "grad_norm": 6.1643853187561035, "learning_rate": 0.000176691042047532, "loss": 0.3764, "step": 510 }, { "epoch": 0.2376599634369287, "grad_norm": 0.7884778380393982, "learning_rate": 0.00017623400365630713, "loss": 0.2016, "step": 520 }, { "epoch": 0.24223034734917734, "grad_norm": 3.4569339752197266, "learning_rate": 0.00017577696526508227, "loss": 0.3292, "step": 530 }, { "epoch": 0.24680073126142596, "grad_norm": 0.9117717146873474, "learning_rate": 0.0001753199268738574, "loss": 0.1951, "step": 540 }, { "epoch": 0.2513711151736746, "grad_norm": 11.449209213256836, "learning_rate": 0.00017486288848263254, "loss": 0.3511, "step": 550 }, { "epoch": 0.25594149908592323, "grad_norm": 3.824899435043335, "learning_rate": 0.00017440585009140768, "loss": 0.2262, "step": 560 }, { "epoch": 0.26051188299817185, "grad_norm": 7.360519886016846, "learning_rate": 0.0001739488117001828, "loss": 0.2491, "step": 570 }, { "epoch": 0.26508226691042047, "grad_norm": 6.038581848144531, "learning_rate": 0.00017349177330895795, "loss": 0.2548, "step": 580 }, { "epoch": 0.2696526508226691, "grad_norm": 3.3812975883483887, "learning_rate": 0.0001730347349177331, "loss": 0.4146, "step": 590 }, { "epoch": 0.2742230347349177, "grad_norm": 0.8644607067108154, "learning_rate": 0.00017257769652650825, "loss": 0.2559, "step": 600 }, { "epoch": 0.2742230347349177, "eval_accuracy": 0.8901333333333333, "eval_loss": 0.3964242935180664, "eval_runtime": 70.6896, "eval_samples_per_second": 212.195, "eval_steps_per_second": 13.269, "step": 600 }, { "epoch": 0.27879341864716634, "grad_norm": 0.19820252060890198, "learning_rate": 0.00017212065813528338, "loss": 0.2866, "step": 610 }, { "epoch": 0.283363802559415, "grad_norm": 4.149306774139404, "learning_rate": 0.00017166361974405852, "loss": 0.2711, "step": 620 }, { "epoch": 0.28793418647166363, "grad_norm": 0.7687764167785645, "learning_rate": 0.00017120658135283366, "loss": 0.2035, "step": 630 }, { "epoch": 0.29250457038391225, "grad_norm": 3.1452839374542236, "learning_rate": 0.0001707495429616088, "loss": 0.3511, "step": 640 }, { "epoch": 0.2970749542961609, "grad_norm": 4.324541091918945, "learning_rate": 0.00017029250457038393, "loss": 0.2869, "step": 650 }, { "epoch": 0.3016453382084095, "grad_norm": 0.06479712575674057, "learning_rate": 0.00016983546617915907, "loss": 0.1225, "step": 660 }, { "epoch": 0.3062157221206581, "grad_norm": 1.430450677871704, "learning_rate": 0.0001693784277879342, "loss": 0.2164, "step": 670 }, { "epoch": 0.31078610603290674, "grad_norm": 3.9292774200439453, "learning_rate": 0.00016892138939670934, "loss": 0.3039, "step": 680 }, { "epoch": 0.3153564899451554, "grad_norm": 5.680319309234619, "learning_rate": 0.00016846435100548448, "loss": 0.2702, "step": 690 }, { "epoch": 0.31992687385740404, "grad_norm": 0.42744606733322144, "learning_rate": 0.0001680073126142596, "loss": 0.197, "step": 700 }, { "epoch": 0.31992687385740404, "eval_accuracy": 0.9252666666666667, "eval_loss": 0.24806976318359375, "eval_runtime": 70.2676, "eval_samples_per_second": 213.47, "eval_steps_per_second": 13.349, "step": 700 }, { "epoch": 0.32449725776965266, "grad_norm": 5.1783447265625, "learning_rate": 0.00016755027422303475, "loss": 0.4019, "step": 710 }, { "epoch": 0.3290676416819013, "grad_norm": 4.345694541931152, "learning_rate": 0.00016709323583180986, "loss": 0.191, "step": 720 }, { "epoch": 0.3336380255941499, "grad_norm": 5.100950241088867, "learning_rate": 0.000166636197440585, "loss": 0.2976, "step": 730 }, { "epoch": 0.3382084095063985, "grad_norm": 6.052485942840576, "learning_rate": 0.00016617915904936016, "loss": 0.3266, "step": 740 }, { "epoch": 0.34277879341864714, "grad_norm": 2.7631659507751465, "learning_rate": 0.0001657221206581353, "loss": 0.3042, "step": 750 }, { "epoch": 0.3473491773308958, "grad_norm": 3.2634143829345703, "learning_rate": 0.00016526508226691043, "loss": 0.1501, "step": 760 }, { "epoch": 0.35191956124314444, "grad_norm": 5.064187049865723, "learning_rate": 0.00016480804387568557, "loss": 0.3669, "step": 770 }, { "epoch": 0.35648994515539306, "grad_norm": 6.092987060546875, "learning_rate": 0.0001643510054844607, "loss": 0.2448, "step": 780 }, { "epoch": 0.3610603290676417, "grad_norm": 4.66456413269043, "learning_rate": 0.00016389396709323584, "loss": 0.3027, "step": 790 }, { "epoch": 0.3656307129798903, "grad_norm": 6.245533466339111, "learning_rate": 0.00016343692870201098, "loss": 0.2594, "step": 800 }, { "epoch": 0.3656307129798903, "eval_accuracy": 0.923, "eval_loss": 0.24855366349220276, "eval_runtime": 70.4374, "eval_samples_per_second": 212.955, "eval_steps_per_second": 13.317, "step": 800 }, { "epoch": 0.3702010968921389, "grad_norm": 0.3337598741054535, "learning_rate": 0.0001629798903107861, "loss": 0.2141, "step": 810 }, { "epoch": 0.37477148080438755, "grad_norm": 3.9450998306274414, "learning_rate": 0.00016252285191956125, "loss": 0.2284, "step": 820 }, { "epoch": 0.3793418647166362, "grad_norm": 4.09628438949585, "learning_rate": 0.00016206581352833639, "loss": 0.2501, "step": 830 }, { "epoch": 0.38391224862888484, "grad_norm": 3.9277567863464355, "learning_rate": 0.00016160877513711152, "loss": 0.1436, "step": 840 }, { "epoch": 0.38848263254113347, "grad_norm": 1.291695237159729, "learning_rate": 0.00016115173674588666, "loss": 0.3047, "step": 850 }, { "epoch": 0.3930530164533821, "grad_norm": 3.195793867111206, "learning_rate": 0.0001606946983546618, "loss": 0.2684, "step": 860 }, { "epoch": 0.3976234003656307, "grad_norm": 9.900090217590332, "learning_rate": 0.00016023765996343693, "loss": 0.2006, "step": 870 }, { "epoch": 0.40219378427787933, "grad_norm": 7.173875331878662, "learning_rate": 0.00015978062157221207, "loss": 0.3154, "step": 880 }, { "epoch": 0.40676416819012795, "grad_norm": 6.029903411865234, "learning_rate": 0.0001593235831809872, "loss": 0.3175, "step": 890 }, { "epoch": 0.4113345521023766, "grad_norm": 6.628474235534668, "learning_rate": 0.00015886654478976234, "loss": 0.4545, "step": 900 }, { "epoch": 0.4113345521023766, "eval_accuracy": 0.9, "eval_loss": 0.327102929353714, "eval_runtime": 70.3538, "eval_samples_per_second": 213.208, "eval_steps_per_second": 13.333, "step": 900 }, { "epoch": 0.41590493601462525, "grad_norm": 4.372049808502197, "learning_rate": 0.00015840950639853748, "loss": 0.2577, "step": 910 }, { "epoch": 0.42047531992687387, "grad_norm": 5.5920562744140625, "learning_rate": 0.0001579524680073126, "loss": 0.2078, "step": 920 }, { "epoch": 0.4250457038391225, "grad_norm": 0.5012129545211792, "learning_rate": 0.00015749542961608778, "loss": 0.4423, "step": 930 }, { "epoch": 0.4296160877513711, "grad_norm": 4.091048717498779, "learning_rate": 0.0001570383912248629, "loss": 0.2831, "step": 940 }, { "epoch": 0.43418647166361973, "grad_norm": 3.677157163619995, "learning_rate": 0.00015658135283363805, "loss": 0.3525, "step": 950 }, { "epoch": 0.43875685557586835, "grad_norm": 2.8373067378997803, "learning_rate": 0.00015612431444241319, "loss": 0.1623, "step": 960 }, { "epoch": 0.443327239488117, "grad_norm": 7.7640156745910645, "learning_rate": 0.00015566727605118832, "loss": 0.2469, "step": 970 }, { "epoch": 0.44789762340036565, "grad_norm": 0.34190094470977783, "learning_rate": 0.00015521023765996346, "loss": 0.2173, "step": 980 }, { "epoch": 0.4524680073126143, "grad_norm": 0.42838719487190247, "learning_rate": 0.0001547531992687386, "loss": 0.1671, "step": 990 }, { "epoch": 0.4570383912248629, "grad_norm": 1.7106258869171143, "learning_rate": 0.00015429616087751373, "loss": 0.1243, "step": 1000 }, { "epoch": 0.4570383912248629, "eval_accuracy": 0.9268666666666666, "eval_loss": 0.24482281506061554, "eval_runtime": 70.0663, "eval_samples_per_second": 214.083, "eval_steps_per_second": 13.387, "step": 1000 }, { "epoch": 0.4616087751371115, "grad_norm": 4.664043426513672, "learning_rate": 0.00015383912248628884, "loss": 0.2915, "step": 1010 }, { "epoch": 0.46617915904936014, "grad_norm": 6.955049991607666, "learning_rate": 0.00015338208409506398, "loss": 0.2288, "step": 1020 }, { "epoch": 0.47074954296160876, "grad_norm": 6.842510223388672, "learning_rate": 0.00015292504570383911, "loss": 0.4217, "step": 1030 }, { "epoch": 0.4753199268738574, "grad_norm": 2.6210012435913086, "learning_rate": 0.00015246800731261425, "loss": 0.3455, "step": 1040 }, { "epoch": 0.47989031078610606, "grad_norm": 0.2048071324825287, "learning_rate": 0.0001520109689213894, "loss": 0.1599, "step": 1050 }, { "epoch": 0.4844606946983547, "grad_norm": 2.204970121383667, "learning_rate": 0.00015155393053016452, "loss": 0.2539, "step": 1060 }, { "epoch": 0.4890310786106033, "grad_norm": 4.208988189697266, "learning_rate": 0.00015109689213893966, "loss": 0.1719, "step": 1070 }, { "epoch": 0.4936014625228519, "grad_norm": 1.6201022863388062, "learning_rate": 0.00015063985374771482, "loss": 0.103, "step": 1080 }, { "epoch": 0.49817184643510054, "grad_norm": 2.211272954940796, "learning_rate": 0.00015018281535648996, "loss": 0.1956, "step": 1090 }, { "epoch": 0.5027422303473492, "grad_norm": 7.531051158905029, "learning_rate": 0.0001497257769652651, "loss": 0.3593, "step": 1100 }, { "epoch": 0.5027422303473492, "eval_accuracy": 0.9354, "eval_loss": 0.2118164449930191, "eval_runtime": 70.235, "eval_samples_per_second": 213.569, "eval_steps_per_second": 13.355, "step": 1100 }, { "epoch": 0.5073126142595978, "grad_norm": 3.182407855987549, "learning_rate": 0.00014926873857404023, "loss": 0.3023, "step": 1110 }, { "epoch": 0.5118829981718465, "grad_norm": 3.975743293762207, "learning_rate": 0.00014881170018281537, "loss": 0.2358, "step": 1120 }, { "epoch": 0.5164533820840951, "grad_norm": 4.823325157165527, "learning_rate": 0.0001483546617915905, "loss": 0.2031, "step": 1130 }, { "epoch": 0.5210237659963437, "grad_norm": 0.8731510043144226, "learning_rate": 0.00014789762340036564, "loss": 0.2254, "step": 1140 }, { "epoch": 0.5255941499085923, "grad_norm": 1.2150533199310303, "learning_rate": 0.00014744058500914078, "loss": 0.312, "step": 1150 }, { "epoch": 0.5301645338208409, "grad_norm": 1.9706271886825562, "learning_rate": 0.00014698354661791591, "loss": 0.1619, "step": 1160 }, { "epoch": 0.5347349177330896, "grad_norm": 0.36833953857421875, "learning_rate": 0.00014652650822669105, "loss": 0.1976, "step": 1170 }, { "epoch": 0.5393053016453382, "grad_norm": 3.9230244159698486, "learning_rate": 0.0001460694698354662, "loss": 0.2928, "step": 1180 }, { "epoch": 0.5438756855575868, "grad_norm": 4.378619194030762, "learning_rate": 0.00014561243144424132, "loss": 0.2008, "step": 1190 }, { "epoch": 0.5484460694698354, "grad_norm": 0.8093172907829285, "learning_rate": 0.00014515539305301646, "loss": 0.1375, "step": 1200 }, { "epoch": 0.5484460694698354, "eval_accuracy": 0.9348666666666666, "eval_loss": 0.22052045166492462, "eval_runtime": 70.7854, "eval_samples_per_second": 211.908, "eval_steps_per_second": 13.251, "step": 1200 }, { "epoch": 0.553016453382084, "grad_norm": 8.147080421447754, "learning_rate": 0.0001446983546617916, "loss": 0.1968, "step": 1210 }, { "epoch": 0.5575868372943327, "grad_norm": 4.382139682769775, "learning_rate": 0.00014424131627056673, "loss": 0.2315, "step": 1220 }, { "epoch": 0.5621572212065814, "grad_norm": 2.122556209564209, "learning_rate": 0.00014378427787934187, "loss": 0.1646, "step": 1230 }, { "epoch": 0.56672760511883, "grad_norm": 4.841265678405762, "learning_rate": 0.000143327239488117, "loss": 0.2223, "step": 1240 }, { "epoch": 0.5712979890310786, "grad_norm": 0.27766382694244385, "learning_rate": 0.00014287020109689214, "loss": 0.2387, "step": 1250 }, { "epoch": 0.5758683729433273, "grad_norm": 1.7641736268997192, "learning_rate": 0.00014241316270566728, "loss": 0.1726, "step": 1260 }, { "epoch": 0.5804387568555759, "grad_norm": 6.29493522644043, "learning_rate": 0.00014195612431444244, "loss": 0.4514, "step": 1270 }, { "epoch": 0.5850091407678245, "grad_norm": 0.2854464054107666, "learning_rate": 0.00014149908592321758, "loss": 0.1004, "step": 1280 }, { "epoch": 0.5895795246800731, "grad_norm": 5.928961753845215, "learning_rate": 0.00014104204753199271, "loss": 0.2575, "step": 1290 }, { "epoch": 0.5941499085923218, "grad_norm": 0.3155369758605957, "learning_rate": 0.00014058500914076782, "loss": 0.1521, "step": 1300 }, { "epoch": 0.5941499085923218, "eval_accuracy": 0.9376, "eval_loss": 0.200880229473114, "eval_runtime": 70.0018, "eval_samples_per_second": 214.28, "eval_steps_per_second": 13.4, "step": 1300 }, { "epoch": 0.5987202925045704, "grad_norm": 5.347257614135742, "learning_rate": 0.00014012797074954296, "loss": 0.178, "step": 1310 }, { "epoch": 0.603290676416819, "grad_norm": 8.616045951843262, "learning_rate": 0.0001396709323583181, "loss": 0.2389, "step": 1320 }, { "epoch": 0.6078610603290676, "grad_norm": 3.013582229614258, "learning_rate": 0.00013921389396709323, "loss": 0.1769, "step": 1330 }, { "epoch": 0.6124314442413162, "grad_norm": 5.738295078277588, "learning_rate": 0.00013875685557586837, "loss": 0.1847, "step": 1340 }, { "epoch": 0.6170018281535649, "grad_norm": 0.47330769896507263, "learning_rate": 0.0001382998171846435, "loss": 0.1451, "step": 1350 }, { "epoch": 0.6215722120658135, "grad_norm": 4.576214790344238, "learning_rate": 0.00013784277879341864, "loss": 0.2656, "step": 1360 }, { "epoch": 0.6261425959780622, "grad_norm": 2.6387856006622314, "learning_rate": 0.00013738574040219378, "loss": 0.1181, "step": 1370 }, { "epoch": 0.6307129798903108, "grad_norm": 0.5952144265174866, "learning_rate": 0.00013692870201096892, "loss": 0.1286, "step": 1380 }, { "epoch": 0.6352833638025595, "grad_norm": 5.721958160400391, "learning_rate": 0.00013647166361974405, "loss": 0.2129, "step": 1390 }, { "epoch": 0.6398537477148081, "grad_norm": 1.397820234298706, "learning_rate": 0.0001360146252285192, "loss": 0.1237, "step": 1400 }, { "epoch": 0.6398537477148081, "eval_accuracy": 0.9444666666666667, "eval_loss": 0.18025201559066772, "eval_runtime": 69.8721, "eval_samples_per_second": 214.678, "eval_steps_per_second": 13.425, "step": 1400 }, { "epoch": 0.6444241316270567, "grad_norm": 2.166053533554077, "learning_rate": 0.00013555758683729432, "loss": 0.2256, "step": 1410 }, { "epoch": 0.6489945155393053, "grad_norm": 1.5744013786315918, "learning_rate": 0.0001351005484460695, "loss": 0.085, "step": 1420 }, { "epoch": 0.6535648994515539, "grad_norm": 5.0064167976379395, "learning_rate": 0.00013464351005484462, "loss": 0.1947, "step": 1430 }, { "epoch": 0.6581352833638026, "grad_norm": 6.537513732910156, "learning_rate": 0.00013418647166361976, "loss": 0.095, "step": 1440 }, { "epoch": 0.6627056672760512, "grad_norm": 10.165826797485352, "learning_rate": 0.0001337294332723949, "loss": 0.2335, "step": 1450 }, { "epoch": 0.6672760511882998, "grad_norm": 2.5049922466278076, "learning_rate": 0.00013327239488117003, "loss": 0.1958, "step": 1460 }, { "epoch": 0.6718464351005484, "grad_norm": 6.945699214935303, "learning_rate": 0.00013281535648994517, "loss": 0.141, "step": 1470 }, { "epoch": 0.676416819012797, "grad_norm": 0.31921499967575073, "learning_rate": 0.0001323583180987203, "loss": 0.1275, "step": 1480 }, { "epoch": 0.6809872029250457, "grad_norm": 6.356455326080322, "learning_rate": 0.00013190127970749544, "loss": 0.2715, "step": 1490 }, { "epoch": 0.6855575868372943, "grad_norm": 0.11626709252595901, "learning_rate": 0.00013144424131627058, "loss": 0.2214, "step": 1500 }, { "epoch": 0.6855575868372943, "eval_accuracy": 0.9394666666666667, "eval_loss": 0.20262379944324493, "eval_runtime": 70.7416, "eval_samples_per_second": 212.039, "eval_steps_per_second": 13.26, "step": 1500 }, { "epoch": 0.6901279707495429, "grad_norm": 10.25921630859375, "learning_rate": 0.00013098720292504572, "loss": 0.3417, "step": 1510 }, { "epoch": 0.6946983546617916, "grad_norm": 0.09503093361854553, "learning_rate": 0.00013053016453382085, "loss": 0.1734, "step": 1520 }, { "epoch": 0.6992687385740403, "grad_norm": 4.171358108520508, "learning_rate": 0.000130073126142596, "loss": 0.1565, "step": 1530 }, { "epoch": 0.7038391224862889, "grad_norm": 1.6032589673995972, "learning_rate": 0.00012961608775137112, "loss": 0.279, "step": 1540 }, { "epoch": 0.7084095063985375, "grad_norm": 4.150899410247803, "learning_rate": 0.00012915904936014626, "loss": 0.261, "step": 1550 }, { "epoch": 0.7129798903107861, "grad_norm": 3.3375895023345947, "learning_rate": 0.0001287020109689214, "loss": 0.2531, "step": 1560 }, { "epoch": 0.7175502742230347, "grad_norm": 6.167548179626465, "learning_rate": 0.00012824497257769653, "loss": 0.2602, "step": 1570 }, { "epoch": 0.7221206581352834, "grad_norm": 2.9471616744995117, "learning_rate": 0.00012778793418647167, "loss": 0.2944, "step": 1580 }, { "epoch": 0.726691042047532, "grad_norm": 1.4548298120498657, "learning_rate": 0.0001273308957952468, "loss": 0.0746, "step": 1590 }, { "epoch": 0.7312614259597806, "grad_norm": 6.375178337097168, "learning_rate": 0.00012687385740402194, "loss": 0.1324, "step": 1600 }, { "epoch": 0.7312614259597806, "eval_accuracy": 0.9493333333333334, "eval_loss": 0.16354645788669586, "eval_runtime": 70.3467, "eval_samples_per_second": 213.23, "eval_steps_per_second": 13.334, "step": 1600 }, { "epoch": 0.7358318098720292, "grad_norm": 6.716129302978516, "learning_rate": 0.00012641681901279708, "loss": 0.2021, "step": 1610 }, { "epoch": 0.7404021937842779, "grad_norm": 0.047931790351867676, "learning_rate": 0.00012595978062157222, "loss": 0.1456, "step": 1620 }, { "epoch": 0.7449725776965265, "grad_norm": 0.08262607455253601, "learning_rate": 0.00012550274223034735, "loss": 0.1167, "step": 1630 }, { "epoch": 0.7495429616087751, "grad_norm": 0.21918249130249023, "learning_rate": 0.0001250457038391225, "loss": 0.1558, "step": 1640 }, { "epoch": 0.7541133455210237, "grad_norm": 3.03836989402771, "learning_rate": 0.00012458866544789763, "loss": 0.1506, "step": 1650 }, { "epoch": 0.7586837294332724, "grad_norm": 9.57582950592041, "learning_rate": 0.00012413162705667276, "loss": 0.2193, "step": 1660 }, { "epoch": 0.7632541133455211, "grad_norm": 0.2596251964569092, "learning_rate": 0.0001236745886654479, "loss": 0.1041, "step": 1670 }, { "epoch": 0.7678244972577697, "grad_norm": 11.114243507385254, "learning_rate": 0.00012321755027422303, "loss": 0.1528, "step": 1680 }, { "epoch": 0.7723948811700183, "grad_norm": 5.171649932861328, "learning_rate": 0.00012276051188299817, "loss": 0.1856, "step": 1690 }, { "epoch": 0.7769652650822669, "grad_norm": 4.471177577972412, "learning_rate": 0.0001223034734917733, "loss": 0.1864, "step": 1700 }, { "epoch": 0.7769652650822669, "eval_accuracy": 0.9493333333333334, "eval_loss": 0.16721826791763306, "eval_runtime": 70.5466, "eval_samples_per_second": 212.625, "eval_steps_per_second": 13.296, "step": 1700 }, { "epoch": 0.7815356489945156, "grad_norm": 4.236210346221924, "learning_rate": 0.00012184643510054844, "loss": 0.1898, "step": 1710 }, { "epoch": 0.7861060329067642, "grad_norm": 3.0980677604675293, "learning_rate": 0.0001213893967093236, "loss": 0.1925, "step": 1720 }, { "epoch": 0.7906764168190128, "grad_norm": 2.8537585735321045, "learning_rate": 0.00012093235831809873, "loss": 0.1141, "step": 1730 }, { "epoch": 0.7952468007312614, "grad_norm": 3.942676305770874, "learning_rate": 0.00012047531992687387, "loss": 0.2004, "step": 1740 }, { "epoch": 0.79981718464351, "grad_norm": 2.8048367500305176, "learning_rate": 0.000120018281535649, "loss": 0.2308, "step": 1750 }, { "epoch": 0.8043875685557587, "grad_norm": 2.2184743881225586, "learning_rate": 0.00011956124314442414, "loss": 0.1032, "step": 1760 }, { "epoch": 0.8089579524680073, "grad_norm": 12.887242317199707, "learning_rate": 0.00011910420475319928, "loss": 0.2534, "step": 1770 }, { "epoch": 0.8135283363802559, "grad_norm": 0.6864253878593445, "learning_rate": 0.00011864716636197441, "loss": 0.1879, "step": 1780 }, { "epoch": 0.8180987202925045, "grad_norm": 3.042908191680908, "learning_rate": 0.00011819012797074955, "loss": 0.2072, "step": 1790 }, { "epoch": 0.8226691042047533, "grad_norm": 0.42097190022468567, "learning_rate": 0.00011773308957952469, "loss": 0.128, "step": 1800 }, { "epoch": 0.8226691042047533, "eval_accuracy": 0.9409333333333333, "eval_loss": 0.2014516144990921, "eval_runtime": 71.2106, "eval_samples_per_second": 210.643, "eval_steps_per_second": 13.172, "step": 1800 }, { "epoch": 0.8272394881170019, "grad_norm": 5.429555892944336, "learning_rate": 0.00011727605118829984, "loss": 0.099, "step": 1810 }, { "epoch": 0.8318098720292505, "grad_norm": 6.33099889755249, "learning_rate": 0.00011681901279707497, "loss": 0.1546, "step": 1820 }, { "epoch": 0.8363802559414991, "grad_norm": 5.382350921630859, "learning_rate": 0.00011636197440585011, "loss": 0.2645, "step": 1830 }, { "epoch": 0.8409506398537477, "grad_norm": 2.201632022857666, "learning_rate": 0.00011590493601462524, "loss": 0.2078, "step": 1840 }, { "epoch": 0.8455210237659964, "grad_norm": 7.094603538513184, "learning_rate": 0.00011544789762340038, "loss": 0.3097, "step": 1850 }, { "epoch": 0.850091407678245, "grad_norm": 0.3324418067932129, "learning_rate": 0.00011499085923217552, "loss": 0.1623, "step": 1860 }, { "epoch": 0.8546617915904936, "grad_norm": 0.12678645551204681, "learning_rate": 0.00011453382084095065, "loss": 0.1275, "step": 1870 }, { "epoch": 0.8592321755027422, "grad_norm": 2.2118465900421143, "learning_rate": 0.00011407678244972578, "loss": 0.1834, "step": 1880 }, { "epoch": 0.8638025594149908, "grad_norm": 11.703352928161621, "learning_rate": 0.00011361974405850091, "loss": 0.165, "step": 1890 }, { "epoch": 0.8683729433272395, "grad_norm": 5.437178611755371, "learning_rate": 0.00011316270566727605, "loss": 0.121, "step": 1900 }, { "epoch": 0.8683729433272395, "eval_accuracy": 0.9451333333333334, "eval_loss": 0.17528271675109863, "eval_runtime": 70.5119, "eval_samples_per_second": 212.73, "eval_steps_per_second": 13.303, "step": 1900 }, { "epoch": 0.8729433272394881, "grad_norm": 2.402738332748413, "learning_rate": 0.00011270566727605119, "loss": 0.1896, "step": 1910 }, { "epoch": 0.8775137111517367, "grad_norm": 0.6686537861824036, "learning_rate": 0.00011224862888482632, "loss": 0.1535, "step": 1920 }, { "epoch": 0.8820840950639853, "grad_norm": 3.365333080291748, "learning_rate": 0.00011179159049360146, "loss": 0.1135, "step": 1930 }, { "epoch": 0.886654478976234, "grad_norm": 3.6913065910339355, "learning_rate": 0.0001113345521023766, "loss": 0.1391, "step": 1940 }, { "epoch": 0.8912248628884827, "grad_norm": 7.079347133636475, "learning_rate": 0.00011087751371115173, "loss": 0.1528, "step": 1950 }, { "epoch": 0.8957952468007313, "grad_norm": 2.3773577213287354, "learning_rate": 0.00011042047531992688, "loss": 0.1345, "step": 1960 }, { "epoch": 0.9003656307129799, "grad_norm": 3.536985158920288, "learning_rate": 0.00010996343692870202, "loss": 0.1762, "step": 1970 }, { "epoch": 0.9049360146252285, "grad_norm": 0.4026005268096924, "learning_rate": 0.00010950639853747715, "loss": 0.2182, "step": 1980 }, { "epoch": 0.9095063985374772, "grad_norm": 2.521723747253418, "learning_rate": 0.00010904936014625229, "loss": 0.1531, "step": 1990 }, { "epoch": 0.9140767824497258, "grad_norm": 0.42068588733673096, "learning_rate": 0.00010859232175502743, "loss": 0.1918, "step": 2000 }, { "epoch": 0.9140767824497258, "eval_accuracy": 0.9588, "eval_loss": 0.13700534403324127, "eval_runtime": 70.8331, "eval_samples_per_second": 211.766, "eval_steps_per_second": 13.242, "step": 2000 }, { "epoch": 0.9186471663619744, "grad_norm": 0.27603089809417725, "learning_rate": 0.00010813528336380256, "loss": 0.172, "step": 2010 }, { "epoch": 0.923217550274223, "grad_norm": 1.674926996231079, "learning_rate": 0.0001076782449725777, "loss": 0.1129, "step": 2020 }, { "epoch": 0.9277879341864717, "grad_norm": 0.18493302166461945, "learning_rate": 0.00010722120658135284, "loss": 0.0895, "step": 2030 }, { "epoch": 0.9323583180987203, "grad_norm": 4.929255485534668, "learning_rate": 0.00010676416819012797, "loss": 0.0942, "step": 2040 }, { "epoch": 0.9369287020109689, "grad_norm": 2.887568950653076, "learning_rate": 0.00010630712979890312, "loss": 0.1769, "step": 2050 }, { "epoch": 0.9414990859232175, "grad_norm": 2.5681095123291016, "learning_rate": 0.00010585009140767826, "loss": 0.1939, "step": 2060 }, { "epoch": 0.9460694698354661, "grad_norm": 9.683144569396973, "learning_rate": 0.0001053930530164534, "loss": 0.212, "step": 2070 }, { "epoch": 0.9506398537477148, "grad_norm": 0.2908894419670105, "learning_rate": 0.00010493601462522853, "loss": 0.1635, "step": 2080 }, { "epoch": 0.9552102376599635, "grad_norm": 5.623505115509033, "learning_rate": 0.00010447897623400367, "loss": 0.0803, "step": 2090 }, { "epoch": 0.9597806215722121, "grad_norm": 1.6659337282180786, "learning_rate": 0.0001040219378427788, "loss": 0.1658, "step": 2100 }, { "epoch": 0.9597806215722121, "eval_accuracy": 0.9534666666666667, "eval_loss": 0.15428349375724792, "eval_runtime": 70.1951, "eval_samples_per_second": 213.69, "eval_steps_per_second": 13.363, "step": 2100 }, { "epoch": 0.9643510054844607, "grad_norm": 3.0254101753234863, "learning_rate": 0.00010356489945155394, "loss": 0.2228, "step": 2110 }, { "epoch": 0.9689213893967094, "grad_norm": 4.310238838195801, "learning_rate": 0.00010310786106032908, "loss": 0.1241, "step": 2120 }, { "epoch": 0.973491773308958, "grad_norm": 3.578296422958374, "learning_rate": 0.00010265082266910421, "loss": 0.212, "step": 2130 }, { "epoch": 0.9780621572212066, "grad_norm": 1.9950228929519653, "learning_rate": 0.00010219378427787935, "loss": 0.2508, "step": 2140 }, { "epoch": 0.9826325411334552, "grad_norm": 0.7868995666503906, "learning_rate": 0.0001017367458866545, "loss": 0.1755, "step": 2150 }, { "epoch": 0.9872029250457038, "grad_norm": 2.1217503547668457, "learning_rate": 0.00010127970749542961, "loss": 0.1431, "step": 2160 }, { "epoch": 0.9917733089579525, "grad_norm": 0.1202094554901123, "learning_rate": 0.00010082266910420475, "loss": 0.0914, "step": 2170 }, { "epoch": 0.9963436928702011, "grad_norm": 4.719986438751221, "learning_rate": 0.00010036563071297988, "loss": 0.137, "step": 2180 }, { "epoch": 1.0009140767824498, "grad_norm": 0.4999386668205261, "learning_rate": 9.990859232175503e-05, "loss": 0.1686, "step": 2190 }, { "epoch": 1.0054844606946984, "grad_norm": 0.2870078980922699, "learning_rate": 9.945155393053017e-05, "loss": 0.1088, "step": 2200 }, { "epoch": 1.0054844606946984, "eval_accuracy": 0.9577333333333333, "eval_loss": 0.1361219733953476, "eval_runtime": 70.2423, "eval_samples_per_second": 213.547, "eval_steps_per_second": 13.354, "step": 2200 }, { "epoch": 1.010054844606947, "grad_norm": 0.8209459781646729, "learning_rate": 9.89945155393053e-05, "loss": 0.1142, "step": 2210 }, { "epoch": 1.0146252285191957, "grad_norm": 9.744668006896973, "learning_rate": 9.853747714808045e-05, "loss": 0.0546, "step": 2220 }, { "epoch": 1.0191956124314443, "grad_norm": 0.15120500326156616, "learning_rate": 9.808043875685559e-05, "loss": 0.0184, "step": 2230 }, { "epoch": 1.023765996343693, "grad_norm": 3.5596585273742676, "learning_rate": 9.762340036563071e-05, "loss": 0.1219, "step": 2240 }, { "epoch": 1.0283363802559415, "grad_norm": 4.676599025726318, "learning_rate": 9.716636197440585e-05, "loss": 0.0767, "step": 2250 }, { "epoch": 1.0329067641681902, "grad_norm": 0.02072470262646675, "learning_rate": 9.670932358318099e-05, "loss": 0.0152, "step": 2260 }, { "epoch": 1.0374771480804388, "grad_norm": 0.09274908900260925, "learning_rate": 9.625228519195612e-05, "loss": 0.0423, "step": 2270 }, { "epoch": 1.0420475319926874, "grad_norm": 2.4434385299682617, "learning_rate": 9.579524680073126e-05, "loss": 0.0651, "step": 2280 }, { "epoch": 1.046617915904936, "grad_norm": 0.028794042766094208, "learning_rate": 9.53382084095064e-05, "loss": 0.0635, "step": 2290 }, { "epoch": 1.0511882998171846, "grad_norm": 1.619289755821228, "learning_rate": 9.488117001828155e-05, "loss": 0.0916, "step": 2300 }, { "epoch": 1.0511882998171846, "eval_accuracy": 0.9596666666666667, "eval_loss": 0.13929586112499237, "eval_runtime": 70.231, "eval_samples_per_second": 213.581, "eval_steps_per_second": 13.356, "step": 2300 }, { "epoch": 1.0557586837294333, "grad_norm": 4.361185073852539, "learning_rate": 9.442413162705668e-05, "loss": 0.039, "step": 2310 }, { "epoch": 1.0603290676416819, "grad_norm": 0.0270086620002985, "learning_rate": 9.396709323583182e-05, "loss": 0.087, "step": 2320 }, { "epoch": 1.0648994515539305, "grad_norm": 0.09628736972808838, "learning_rate": 9.351005484460696e-05, "loss": 0.0222, "step": 2330 }, { "epoch": 1.0694698354661791, "grad_norm": 4.285031318664551, "learning_rate": 9.305301645338209e-05, "loss": 0.0478, "step": 2340 }, { "epoch": 1.0740402193784278, "grad_norm": 0.06926529854536057, "learning_rate": 9.259597806215723e-05, "loss": 0.0531, "step": 2350 }, { "epoch": 1.0786106032906764, "grad_norm": 0.01788182742893696, "learning_rate": 9.213893967093236e-05, "loss": 0.0723, "step": 2360 }, { "epoch": 1.083180987202925, "grad_norm": 0.1285097748041153, "learning_rate": 9.16819012797075e-05, "loss": 0.0565, "step": 2370 }, { "epoch": 1.0877513711151736, "grad_norm": 0.013244764879345894, "learning_rate": 9.122486288848264e-05, "loss": 0.0363, "step": 2380 }, { "epoch": 1.0923217550274222, "grad_norm": 2.3318049907684326, "learning_rate": 9.076782449725777e-05, "loss": 0.0584, "step": 2390 }, { "epoch": 1.0968921389396709, "grad_norm": 0.3406066298484802, "learning_rate": 9.031078610603291e-05, "loss": 0.005, "step": 2400 }, { "epoch": 1.0968921389396709, "eval_accuracy": 0.9620666666666666, "eval_loss": 0.12949973344802856, "eval_runtime": 70.9675, "eval_samples_per_second": 211.364, "eval_steps_per_second": 13.217, "step": 2400 }, { "epoch": 1.1014625228519195, "grad_norm": 0.09992707520723343, "learning_rate": 8.985374771480805e-05, "loss": 0.006, "step": 2410 }, { "epoch": 1.106032906764168, "grad_norm": 3.9798166751861572, "learning_rate": 8.939670932358318e-05, "loss": 0.0458, "step": 2420 }, { "epoch": 1.1106032906764167, "grad_norm": 0.20624032616615295, "learning_rate": 8.893967093235832e-05, "loss": 0.0366, "step": 2430 }, { "epoch": 1.1151736745886653, "grad_norm": 0.03891080617904663, "learning_rate": 8.848263254113346e-05, "loss": 0.0118, "step": 2440 }, { "epoch": 1.1197440585009142, "grad_norm": 7.250652313232422, "learning_rate": 8.802559414990859e-05, "loss": 0.0622, "step": 2450 }, { "epoch": 1.1243144424131628, "grad_norm": 2.0701119899749756, "learning_rate": 8.756855575868373e-05, "loss": 0.0504, "step": 2460 }, { "epoch": 1.1288848263254114, "grad_norm": 4.752568244934082, "learning_rate": 8.711151736745888e-05, "loss": 0.0581, "step": 2470 }, { "epoch": 1.13345521023766, "grad_norm": 0.023835673928260803, "learning_rate": 8.665447897623402e-05, "loss": 0.0401, "step": 2480 }, { "epoch": 1.1380255941499087, "grad_norm": 0.009058034047484398, "learning_rate": 8.619744058500915e-05, "loss": 0.0406, "step": 2490 }, { "epoch": 1.1425959780621573, "grad_norm": 0.2688920795917511, "learning_rate": 8.574040219378429e-05, "loss": 0.0294, "step": 2500 }, { "epoch": 1.1425959780621573, "eval_accuracy": 0.9639333333333333, "eval_loss": 0.1327054649591446, "eval_runtime": 70.9468, "eval_samples_per_second": 211.426, "eval_steps_per_second": 13.221, "step": 2500 }, { "epoch": 1.147166361974406, "grad_norm": 0.007774589583277702, "learning_rate": 8.528336380255942e-05, "loss": 0.0427, "step": 2510 }, { "epoch": 1.1517367458866545, "grad_norm": 5.866430759429932, "learning_rate": 8.482632541133455e-05, "loss": 0.0595, "step": 2520 }, { "epoch": 1.1563071297989032, "grad_norm": 0.11866763979196548, "learning_rate": 8.436928702010968e-05, "loss": 0.0673, "step": 2530 }, { "epoch": 1.1608775137111518, "grad_norm": 5.5359978675842285, "learning_rate": 8.391224862888482e-05, "loss": 0.1446, "step": 2540 }, { "epoch": 1.1654478976234004, "grad_norm": 0.016985343769192696, "learning_rate": 8.345521023765997e-05, "loss": 0.0559, "step": 2550 }, { "epoch": 1.170018281535649, "grad_norm": 0.7032074928283691, "learning_rate": 8.29981718464351e-05, "loss": 0.0093, "step": 2560 }, { "epoch": 1.1745886654478976, "grad_norm": 0.14500297605991364, "learning_rate": 8.254113345521024e-05, "loss": 0.0591, "step": 2570 }, { "epoch": 1.1791590493601463, "grad_norm": 4.615384578704834, "learning_rate": 8.208409506398538e-05, "loss": 0.0167, "step": 2580 }, { "epoch": 1.1837294332723949, "grad_norm": 3.747305154800415, "learning_rate": 8.162705667276052e-05, "loss": 0.1315, "step": 2590 }, { "epoch": 1.1882998171846435, "grad_norm": 6.55547571182251, "learning_rate": 8.117001828153565e-05, "loss": 0.0939, "step": 2600 }, { "epoch": 1.1882998171846435, "eval_accuracy": 0.9621333333333333, "eval_loss": 0.1408853828907013, "eval_runtime": 70.3628, "eval_samples_per_second": 213.181, "eval_steps_per_second": 13.331, "step": 2600 }, { "epoch": 1.1928702010968921, "grad_norm": 0.018332751467823982, "learning_rate": 8.071297989031079e-05, "loss": 0.0422, "step": 2610 }, { "epoch": 1.1974405850091407, "grad_norm": 8.07509708404541, "learning_rate": 8.025594149908592e-05, "loss": 0.0548, "step": 2620 }, { "epoch": 1.2020109689213894, "grad_norm": 0.015664540231227875, "learning_rate": 7.979890310786106e-05, "loss": 0.0201, "step": 2630 }, { "epoch": 1.206581352833638, "grad_norm": 0.34166190028190613, "learning_rate": 7.934186471663621e-05, "loss": 0.0063, "step": 2640 }, { "epoch": 1.2111517367458866, "grad_norm": 0.024543585255742073, "learning_rate": 7.888482632541135e-05, "loss": 0.067, "step": 2650 }, { "epoch": 1.2157221206581352, "grad_norm": 5.0015788078308105, "learning_rate": 7.842778793418648e-05, "loss": 0.0677, "step": 2660 }, { "epoch": 1.2202925045703839, "grad_norm": 0.3825192153453827, "learning_rate": 7.79707495429616e-05, "loss": 0.0447, "step": 2670 }, { "epoch": 1.2248628884826325, "grad_norm": 1.0526628494262695, "learning_rate": 7.751371115173674e-05, "loss": 0.0264, "step": 2680 }, { "epoch": 1.229433272394881, "grad_norm": 0.015143281780183315, "learning_rate": 7.705667276051188e-05, "loss": 0.0182, "step": 2690 }, { "epoch": 1.2340036563071297, "grad_norm": 6.874438285827637, "learning_rate": 7.659963436928702e-05, "loss": 0.0756, "step": 2700 }, { "epoch": 1.2340036563071297, "eval_accuracy": 0.9682, "eval_loss": 0.1201971173286438, "eval_runtime": 70.6459, "eval_samples_per_second": 212.327, "eval_steps_per_second": 13.277, "step": 2700 }, { "epoch": 1.2385740402193783, "grad_norm": 0.02307463437318802, "learning_rate": 7.614259597806215e-05, "loss": 0.033, "step": 2710 }, { "epoch": 1.2431444241316272, "grad_norm": 0.01136768702417612, "learning_rate": 7.56855575868373e-05, "loss": 0.0762, "step": 2720 }, { "epoch": 1.2477148080438756, "grad_norm": 5.031988620758057, "learning_rate": 7.522851919561244e-05, "loss": 0.0692, "step": 2730 }, { "epoch": 1.2522851919561244, "grad_norm": 0.028815852478146553, "learning_rate": 7.477148080438758e-05, "loss": 0.008, "step": 2740 }, { "epoch": 1.2568555758683728, "grad_norm": 10.7840576171875, "learning_rate": 7.431444241316271e-05, "loss": 0.0711, "step": 2750 }, { "epoch": 1.2614259597806217, "grad_norm": 3.8280370235443115, "learning_rate": 7.385740402193785e-05, "loss": 0.0581, "step": 2760 }, { "epoch": 1.26599634369287, "grad_norm": 0.03191199526190758, "learning_rate": 7.340036563071298e-05, "loss": 0.0882, "step": 2770 }, { "epoch": 1.270566727605119, "grad_norm": 0.010684626176953316, "learning_rate": 7.294332723948812e-05, "loss": 0.0228, "step": 2780 }, { "epoch": 1.2751371115173675, "grad_norm": 0.01165696233510971, "learning_rate": 7.248628884826326e-05, "loss": 0.0364, "step": 2790 }, { "epoch": 1.2797074954296161, "grad_norm": 5.020371913909912, "learning_rate": 7.20292504570384e-05, "loss": 0.0466, "step": 2800 }, { "epoch": 1.2797074954296161, "eval_accuracy": 0.964, "eval_loss": 0.1273525506258011, "eval_runtime": 70.5245, "eval_samples_per_second": 212.692, "eval_steps_per_second": 13.3, "step": 2800 }, { "epoch": 1.2842778793418648, "grad_norm": 3.339799165725708, "learning_rate": 7.157221206581353e-05, "loss": 0.0331, "step": 2810 }, { "epoch": 1.2888482632541134, "grad_norm": 2.2458271980285645, "learning_rate": 7.111517367458867e-05, "loss": 0.0139, "step": 2820 }, { "epoch": 1.293418647166362, "grad_norm": 0.03158143162727356, "learning_rate": 7.06581352833638e-05, "loss": 0.0316, "step": 2830 }, { "epoch": 1.2979890310786106, "grad_norm": 0.012245237827301025, "learning_rate": 7.020109689213894e-05, "loss": 0.0501, "step": 2840 }, { "epoch": 1.3025594149908593, "grad_norm": 6.688177585601807, "learning_rate": 6.974405850091408e-05, "loss": 0.0349, "step": 2850 }, { "epoch": 1.3071297989031079, "grad_norm": 0.01068816240876913, "learning_rate": 6.928702010968921e-05, "loss": 0.0373, "step": 2860 }, { "epoch": 1.3117001828153565, "grad_norm": 0.017882896587252617, "learning_rate": 6.882998171846435e-05, "loss": 0.0272, "step": 2870 }, { "epoch": 1.3162705667276051, "grad_norm": 0.009448254480957985, "learning_rate": 6.837294332723948e-05, "loss": 0.045, "step": 2880 }, { "epoch": 1.3208409506398537, "grad_norm": 8.753164291381836, "learning_rate": 6.791590493601463e-05, "loss": 0.1252, "step": 2890 }, { "epoch": 1.3254113345521024, "grad_norm": 0.02937444858253002, "learning_rate": 6.745886654478977e-05, "loss": 0.0565, "step": 2900 }, { "epoch": 1.3254113345521024, "eval_accuracy": 0.9662666666666667, "eval_loss": 0.12496425956487656, "eval_runtime": 70.9263, "eval_samples_per_second": 211.487, "eval_steps_per_second": 13.225, "step": 2900 }, { "epoch": 1.329981718464351, "grad_norm": 0.7451736927032471, "learning_rate": 6.700182815356491e-05, "loss": 0.0298, "step": 2910 }, { "epoch": 1.3345521023765996, "grad_norm": 0.014505515806376934, "learning_rate": 6.654478976234004e-05, "loss": 0.0954, "step": 2920 }, { "epoch": 1.3391224862888482, "grad_norm": 6.800475597381592, "learning_rate": 6.608775137111518e-05, "loss": 0.0584, "step": 2930 }, { "epoch": 1.3436928702010968, "grad_norm": 0.08044274151325226, "learning_rate": 6.563071297989032e-05, "loss": 0.0944, "step": 2940 }, { "epoch": 1.3482632541133455, "grad_norm": 0.051912058144807816, "learning_rate": 6.517367458866545e-05, "loss": 0.0128, "step": 2950 }, { "epoch": 1.352833638025594, "grad_norm": 0.8656260967254639, "learning_rate": 6.471663619744059e-05, "loss": 0.1187, "step": 2960 }, { "epoch": 1.3574040219378427, "grad_norm": 0.004978422075510025, "learning_rate": 6.425959780621573e-05, "loss": 0.0228, "step": 2970 }, { "epoch": 1.3619744058500913, "grad_norm": 0.5934199094772339, "learning_rate": 6.380255941499086e-05, "loss": 0.0038, "step": 2980 }, { "epoch": 1.3665447897623402, "grad_norm": 3.7717771530151367, "learning_rate": 6.3345521023766e-05, "loss": 0.087, "step": 2990 }, { "epoch": 1.3711151736745886, "grad_norm": 0.01153448224067688, "learning_rate": 6.288848263254114e-05, "loss": 0.0609, "step": 3000 }, { "epoch": 1.3711151736745886, "eval_accuracy": 0.9656666666666667, "eval_loss": 0.12994171679019928, "eval_runtime": 71.2707, "eval_samples_per_second": 210.465, "eval_steps_per_second": 13.161, "step": 3000 }, { "epoch": 1.3756855575868374, "grad_norm": 0.17517925798892975, "learning_rate": 6.243144424131627e-05, "loss": 0.026, "step": 3010 }, { "epoch": 1.3802559414990858, "grad_norm": 0.11696294695138931, "learning_rate": 6.197440585009141e-05, "loss": 0.025, "step": 3020 }, { "epoch": 1.3848263254113347, "grad_norm": 0.007365319412201643, "learning_rate": 6.151736745886654e-05, "loss": 0.0251, "step": 3030 }, { "epoch": 1.389396709323583, "grad_norm": 0.3844846189022064, "learning_rate": 6.106032906764168e-05, "loss": 0.0252, "step": 3040 }, { "epoch": 1.393967093235832, "grad_norm": 0.014660494402050972, "learning_rate": 6.0603290676416824e-05, "loss": 0.0274, "step": 3050 }, { "epoch": 1.3985374771480805, "grad_norm": 4.684697151184082, "learning_rate": 6.014625228519196e-05, "loss": 0.0294, "step": 3060 }, { "epoch": 1.4031078610603291, "grad_norm": 0.05706701800227165, "learning_rate": 5.96892138939671e-05, "loss": 0.0053, "step": 3070 }, { "epoch": 1.4076782449725778, "grad_norm": 0.010988248512148857, "learning_rate": 5.923217550274224e-05, "loss": 0.0227, "step": 3080 }, { "epoch": 1.4122486288848264, "grad_norm": 0.009499771520495415, "learning_rate": 5.8775137111517377e-05, "loss": 0.0061, "step": 3090 }, { "epoch": 1.416819012797075, "grad_norm": 0.35233938694000244, "learning_rate": 5.83180987202925e-05, "loss": 0.0201, "step": 3100 }, { "epoch": 1.416819012797075, "eval_accuracy": 0.9685333333333334, "eval_loss": 0.12030760943889618, "eval_runtime": 70.6066, "eval_samples_per_second": 212.445, "eval_steps_per_second": 13.285, "step": 3100 }, { "epoch": 1.4213893967093236, "grad_norm": 0.680586576461792, "learning_rate": 5.786106032906764e-05, "loss": 0.0825, "step": 3110 }, { "epoch": 1.4259597806215722, "grad_norm": 1.1647635698318481, "learning_rate": 5.740402193784278e-05, "loss": 0.0668, "step": 3120 }, { "epoch": 1.4305301645338209, "grad_norm": 0.3994854986667633, "learning_rate": 5.6946983546617915e-05, "loss": 0.054, "step": 3130 }, { "epoch": 1.4351005484460695, "grad_norm": 0.012723923660814762, "learning_rate": 5.648994515539305e-05, "loss": 0.1109, "step": 3140 }, { "epoch": 1.4396709323583181, "grad_norm": 0.02893258072435856, "learning_rate": 5.603290676416819e-05, "loss": 0.0447, "step": 3150 }, { "epoch": 1.4442413162705667, "grad_norm": 2.296046495437622, "learning_rate": 5.557586837294333e-05, "loss": 0.0139, "step": 3160 }, { "epoch": 1.4488117001828154, "grad_norm": 0.0075446791015565395, "learning_rate": 5.511882998171847e-05, "loss": 0.0254, "step": 3170 }, { "epoch": 1.453382084095064, "grad_norm": 4.601187705993652, "learning_rate": 5.47074954296161e-05, "loss": 0.0619, "step": 3180 }, { "epoch": 1.4579524680073126, "grad_norm": 0.028089461848139763, "learning_rate": 5.425045703839122e-05, "loss": 0.0324, "step": 3190 }, { "epoch": 1.4625228519195612, "grad_norm": 0.00567116541787982, "learning_rate": 5.3793418647166363e-05, "loss": 0.0258, "step": 3200 }, { "epoch": 1.4625228519195612, "eval_accuracy": 0.9692666666666667, "eval_loss": 0.11664163321256638, "eval_runtime": 70.1343, "eval_samples_per_second": 213.875, "eval_steps_per_second": 13.374, "step": 3200 }, { "epoch": 1.4670932358318098, "grad_norm": 0.008565380237996578, "learning_rate": 5.33363802559415e-05, "loss": 0.1077, "step": 3210 }, { "epoch": 1.4716636197440585, "grad_norm": 6.626660346984863, "learning_rate": 5.2879341864716636e-05, "loss": 0.0685, "step": 3220 }, { "epoch": 1.476234003656307, "grad_norm": 5.336280822753906, "learning_rate": 5.242230347349177e-05, "loss": 0.0734, "step": 3230 }, { "epoch": 1.4808043875685557, "grad_norm": 2.9951882362365723, "learning_rate": 5.196526508226691e-05, "loss": 0.0216, "step": 3240 }, { "epoch": 1.4853747714808043, "grad_norm": 0.44102242588996887, "learning_rate": 5.150822669104205e-05, "loss": 0.0036, "step": 3250 }, { "epoch": 1.489945155393053, "grad_norm": 3.0561587810516357, "learning_rate": 5.105118829981719e-05, "loss": 0.0808, "step": 3260 }, { "epoch": 1.4945155393053016, "grad_norm": 0.09664315730333328, "learning_rate": 5.0594149908592325e-05, "loss": 0.0796, "step": 3270 }, { "epoch": 1.4990859232175504, "grad_norm": 0.01629328727722168, "learning_rate": 5.013711151736746e-05, "loss": 0.003, "step": 3280 }, { "epoch": 1.5036563071297988, "grad_norm": 0.008465313352644444, "learning_rate": 4.96800731261426e-05, "loss": 0.1065, "step": 3290 }, { "epoch": 1.5082266910420477, "grad_norm": 2.8709588050842285, "learning_rate": 4.9223034734917734e-05, "loss": 0.0913, "step": 3300 }, { "epoch": 1.5082266910420477, "eval_accuracy": 0.9736, "eval_loss": 0.10086899250745773, "eval_runtime": 70.4855, "eval_samples_per_second": 212.81, "eval_steps_per_second": 13.308, "step": 3300 }, { "epoch": 1.512797074954296, "grad_norm": 0.006857722532004118, "learning_rate": 4.876599634369287e-05, "loss": 0.0035, "step": 3310 }, { "epoch": 1.517367458866545, "grad_norm": 0.046820204704999924, "learning_rate": 4.830895795246801e-05, "loss": 0.0234, "step": 3320 }, { "epoch": 1.5219378427787933, "grad_norm": 0.009667345322668552, "learning_rate": 4.785191956124315e-05, "loss": 0.0304, "step": 3330 }, { "epoch": 1.5265082266910421, "grad_norm": 5.041330814361572, "learning_rate": 4.739488117001829e-05, "loss": 0.0812, "step": 3340 }, { "epoch": 1.5310786106032905, "grad_norm": 0.12366422265768051, "learning_rate": 4.693784277879342e-05, "loss": 0.0913, "step": 3350 }, { "epoch": 1.5356489945155394, "grad_norm": 0.19020313024520874, "learning_rate": 4.648080438756856e-05, "loss": 0.0027, "step": 3360 }, { "epoch": 1.5402193784277878, "grad_norm": 0.008719071745872498, "learning_rate": 4.6023765996343696e-05, "loss": 0.0087, "step": 3370 }, { "epoch": 1.5447897623400366, "grad_norm": 5.164638996124268, "learning_rate": 4.556672760511883e-05, "loss": 0.0509, "step": 3380 }, { "epoch": 1.5493601462522852, "grad_norm": 1.9091380834579468, "learning_rate": 4.510968921389397e-05, "loss": 0.0622, "step": 3390 }, { "epoch": 1.5539305301645339, "grad_norm": 0.015776393935084343, "learning_rate": 4.4652650822669105e-05, "loss": 0.0235, "step": 3400 }, { "epoch": 1.5539305301645339, "eval_accuracy": 0.9732, "eval_loss": 0.0964307188987732, "eval_runtime": 71.1476, "eval_samples_per_second": 210.829, "eval_steps_per_second": 13.184, "step": 3400 }, { "epoch": 1.5585009140767825, "grad_norm": 0.04088925942778587, "learning_rate": 4.419561243144424e-05, "loss": 0.0579, "step": 3410 }, { "epoch": 1.563071297989031, "grad_norm": 0.04248817265033722, "learning_rate": 4.3738574040219385e-05, "loss": 0.0368, "step": 3420 }, { "epoch": 1.5676416819012797, "grad_norm": 0.06178814917802811, "learning_rate": 4.328153564899452e-05, "loss": 0.0356, "step": 3430 }, { "epoch": 1.5722120658135283, "grad_norm": 0.014863620512187481, "learning_rate": 4.282449725776965e-05, "loss": 0.0173, "step": 3440 }, { "epoch": 1.576782449725777, "grad_norm": 0.0053153312765061855, "learning_rate": 4.236745886654479e-05, "loss": 0.0295, "step": 3450 }, { "epoch": 1.5813528336380256, "grad_norm": 0.03949157893657684, "learning_rate": 4.191042047531993e-05, "loss": 0.0751, "step": 3460 }, { "epoch": 1.5859232175502742, "grad_norm": 0.01701487973332405, "learning_rate": 4.145338208409507e-05, "loss": 0.0311, "step": 3470 }, { "epoch": 1.5904936014625228, "grad_norm": 0.06288379430770874, "learning_rate": 4.09963436928702e-05, "loss": 0.0779, "step": 3480 }, { "epoch": 1.5950639853747715, "grad_norm": 12.93021297454834, "learning_rate": 4.053930530164534e-05, "loss": 0.0274, "step": 3490 }, { "epoch": 1.59963436928702, "grad_norm": 0.03469611704349518, "learning_rate": 4.008226691042048e-05, "loss": 0.0089, "step": 3500 }, { "epoch": 1.59963436928702, "eval_accuracy": 0.9747333333333333, "eval_loss": 0.09657016396522522, "eval_runtime": 70.2849, "eval_samples_per_second": 213.417, "eval_steps_per_second": 13.346, "step": 3500 }, { "epoch": 1.6042047531992687, "grad_norm": 0.01145760528743267, "learning_rate": 3.962522851919561e-05, "loss": 0.0226, "step": 3510 }, { "epoch": 1.6087751371115173, "grad_norm": 0.006313066463917494, "learning_rate": 3.916819012797075e-05, "loss": 0.1061, "step": 3520 }, { "epoch": 1.6133455210237662, "grad_norm": 0.006270520854741335, "learning_rate": 3.8711151736745885e-05, "loss": 0.0439, "step": 3530 }, { "epoch": 1.6179159049360146, "grad_norm": 2.423236608505249, "learning_rate": 3.825411334552103e-05, "loss": 0.0273, "step": 3540 }, { "epoch": 1.6224862888482634, "grad_norm": 0.025253351777791977, "learning_rate": 3.7797074954296165e-05, "loss": 0.0265, "step": 3550 }, { "epoch": 1.6270566727605118, "grad_norm": 0.016615109518170357, "learning_rate": 3.73400365630713e-05, "loss": 0.0226, "step": 3560 }, { "epoch": 1.6316270566727606, "grad_norm": 0.006486339028924704, "learning_rate": 3.688299817184644e-05, "loss": 0.0108, "step": 3570 }, { "epoch": 1.636197440585009, "grad_norm": 0.019093792885541916, "learning_rate": 3.6425959780621574e-05, "loss": 0.0916, "step": 3580 }, { "epoch": 1.6407678244972579, "grad_norm": 0.014140899293124676, "learning_rate": 3.596892138939671e-05, "loss": 0.1538, "step": 3590 }, { "epoch": 1.6453382084095063, "grad_norm": 0.009632795117795467, "learning_rate": 3.551188299817185e-05, "loss": 0.0455, "step": 3600 }, { "epoch": 1.6453382084095063, "eval_accuracy": 0.9748, "eval_loss": 0.09634851664304733, "eval_runtime": 70.4832, "eval_samples_per_second": 212.817, "eval_steps_per_second": 13.308, "step": 3600 }, { "epoch": 1.6499085923217551, "grad_norm": 0.538037896156311, "learning_rate": 3.505484460694698e-05, "loss": 0.0632, "step": 3610 }, { "epoch": 1.6544789762340035, "grad_norm": 2.536642551422119, "learning_rate": 3.459780621572212e-05, "loss": 0.0215, "step": 3620 }, { "epoch": 1.6590493601462524, "grad_norm": 0.014462544582784176, "learning_rate": 3.414076782449726e-05, "loss": 0.0783, "step": 3630 }, { "epoch": 1.6636197440585008, "grad_norm": 0.04104587808251381, "learning_rate": 3.36837294332724e-05, "loss": 0.0468, "step": 3640 }, { "epoch": 1.6681901279707496, "grad_norm": 12.753653526306152, "learning_rate": 3.3226691042047536e-05, "loss": 0.096, "step": 3650 }, { "epoch": 1.672760511882998, "grad_norm": 0.6275530457496643, "learning_rate": 3.2769652650822665e-05, "loss": 0.0459, "step": 3660 }, { "epoch": 1.6773308957952469, "grad_norm": 0.028450943529605865, "learning_rate": 3.231261425959781e-05, "loss": 0.0129, "step": 3670 }, { "epoch": 1.6819012797074955, "grad_norm": 3.766301393508911, "learning_rate": 3.1855575868372945e-05, "loss": 0.01, "step": 3680 }, { "epoch": 1.686471663619744, "grad_norm": 1.72735595703125, "learning_rate": 3.139853747714808e-05, "loss": 0.0162, "step": 3690 }, { "epoch": 1.6910420475319927, "grad_norm": 0.012926338240504265, "learning_rate": 3.094149908592322e-05, "loss": 0.0271, "step": 3700 }, { "epoch": 1.6910420475319927, "eval_accuracy": 0.9762666666666666, "eval_loss": 0.0874376893043518, "eval_runtime": 71.1478, "eval_samples_per_second": 210.829, "eval_steps_per_second": 13.184, "step": 3700 }, { "epoch": 1.6956124314442413, "grad_norm": 0.20961987972259521, "learning_rate": 3.0484460694698358e-05, "loss": 0.0269, "step": 3710 }, { "epoch": 1.70018281535649, "grad_norm": 5.752171039581299, "learning_rate": 3.0027422303473497e-05, "loss": 0.0117, "step": 3720 }, { "epoch": 1.7047531992687386, "grad_norm": 0.02492084540426731, "learning_rate": 2.9570383912248627e-05, "loss": 0.0494, "step": 3730 }, { "epoch": 1.7093235831809872, "grad_norm": 1.6408967971801758, "learning_rate": 2.9113345521023767e-05, "loss": 0.0079, "step": 3740 }, { "epoch": 1.7138939670932358, "grad_norm": 0.010781402699649334, "learning_rate": 2.8656307129798903e-05, "loss": 0.0288, "step": 3750 }, { "epoch": 1.7184643510054844, "grad_norm": 0.007589300163090229, "learning_rate": 2.8199268738574043e-05, "loss": 0.0304, "step": 3760 }, { "epoch": 1.723034734917733, "grad_norm": 0.0105056157335639, "learning_rate": 2.774223034734918e-05, "loss": 0.0023, "step": 3770 }, { "epoch": 1.7276051188299817, "grad_norm": 0.028248343616724014, "learning_rate": 2.7285191956124316e-05, "loss": 0.0156, "step": 3780 }, { "epoch": 1.7321755027422303, "grad_norm": 0.004776041954755783, "learning_rate": 2.6828153564899456e-05, "loss": 0.0113, "step": 3790 }, { "epoch": 1.736745886654479, "grad_norm": 7.163562297821045, "learning_rate": 2.637111517367459e-05, "loss": 0.0407, "step": 3800 }, { "epoch": 1.736745886654479, "eval_accuracy": 0.9761333333333333, "eval_loss": 0.08977096527814865, "eval_runtime": 70.6674, "eval_samples_per_second": 212.262, "eval_steps_per_second": 13.273, "step": 3800 }, { "epoch": 1.7413162705667276, "grad_norm": 1.1980034112930298, "learning_rate": 2.5914076782449725e-05, "loss": 0.008, "step": 3810 }, { "epoch": 1.7458866544789764, "grad_norm": 5.438980579376221, "learning_rate": 2.5457038391224865e-05, "loss": 0.0558, "step": 3820 }, { "epoch": 1.7504570383912248, "grad_norm": 0.02217746712267399, "learning_rate": 2.5e-05, "loss": 0.0057, "step": 3830 }, { "epoch": 1.7550274223034736, "grad_norm": 0.595504641532898, "learning_rate": 2.4542961608775138e-05, "loss": 0.0551, "step": 3840 }, { "epoch": 1.759597806215722, "grad_norm": 0.09388578683137894, "learning_rate": 2.4085923217550274e-05, "loss": 0.0744, "step": 3850 }, { "epoch": 1.7641681901279709, "grad_norm": 2.807389736175537, "learning_rate": 2.362888482632541e-05, "loss": 0.0051, "step": 3860 }, { "epoch": 1.7687385740402193, "grad_norm": 0.006797166541218758, "learning_rate": 2.317184643510055e-05, "loss": 0.0069, "step": 3870 }, { "epoch": 1.7733089579524681, "grad_norm": 0.0043932488188147545, "learning_rate": 2.2714808043875687e-05, "loss": 0.0435, "step": 3880 }, { "epoch": 1.7778793418647165, "grad_norm": 0.009305426850914955, "learning_rate": 2.2257769652650823e-05, "loss": 0.0189, "step": 3890 }, { "epoch": 1.7824497257769654, "grad_norm": 0.003490304574370384, "learning_rate": 2.180073126142596e-05, "loss": 0.1095, "step": 3900 }, { "epoch": 1.7824497257769654, "eval_accuracy": 0.976, "eval_loss": 0.08494840562343597, "eval_runtime": 71.5803, "eval_samples_per_second": 209.555, "eval_steps_per_second": 13.104, "step": 3900 }, { "epoch": 1.7870201096892138, "grad_norm": 0.00673332205042243, "learning_rate": 2.13436928702011e-05, "loss": 0.0653, "step": 3910 }, { "epoch": 1.7915904936014626, "grad_norm": 0.9469023942947388, "learning_rate": 2.0886654478976232e-05, "loss": 0.0077, "step": 3920 }, { "epoch": 1.796160877513711, "grad_norm": 0.03154715150594711, "learning_rate": 2.0429616087751372e-05, "loss": 0.0323, "step": 3930 }, { "epoch": 1.8007312614259599, "grad_norm": 0.020610906183719635, "learning_rate": 1.997257769652651e-05, "loss": 0.0211, "step": 3940 }, { "epoch": 1.8053016453382082, "grad_norm": 0.014532508328557014, "learning_rate": 1.9515539305301648e-05, "loss": 0.0357, "step": 3950 }, { "epoch": 1.809872029250457, "grad_norm": 0.020481685176491737, "learning_rate": 1.905850091407678e-05, "loss": 0.0498, "step": 3960 }, { "epoch": 1.8144424131627057, "grad_norm": 0.018279431387782097, "learning_rate": 1.860146252285192e-05, "loss": 0.0559, "step": 3970 }, { "epoch": 1.8190127970749543, "grad_norm": 0.03680342435836792, "learning_rate": 1.8144424131627057e-05, "loss": 0.0176, "step": 3980 }, { "epoch": 1.823583180987203, "grad_norm": 0.014550072140991688, "learning_rate": 1.7687385740402197e-05, "loss": 0.1098, "step": 3990 }, { "epoch": 1.8281535648994516, "grad_norm": 0.01788398250937462, "learning_rate": 1.723034734917733e-05, "loss": 0.0327, "step": 4000 }, { "epoch": 1.8281535648994516, "eval_accuracy": 0.9745333333333334, "eval_loss": 0.0925898626446724, "eval_runtime": 71.855, "eval_samples_per_second": 208.754, "eval_steps_per_second": 13.054, "step": 4000 }, { "epoch": 1.8327239488117002, "grad_norm": 1.894504189491272, "learning_rate": 1.677330895795247e-05, "loss": 0.0373, "step": 4010 }, { "epoch": 1.8372943327239488, "grad_norm": 0.005655787419527769, "learning_rate": 1.6316270566727607e-05, "loss": 0.0067, "step": 4020 }, { "epoch": 1.8418647166361974, "grad_norm": 4.612732410430908, "learning_rate": 1.5859232175502743e-05, "loss": 0.0183, "step": 4030 }, { "epoch": 1.846435100548446, "grad_norm": 0.7349024415016174, "learning_rate": 1.540219378427788e-05, "loss": 0.0021, "step": 4040 }, { "epoch": 1.8510054844606947, "grad_norm": 0.03837637975811958, "learning_rate": 1.4945155393053017e-05, "loss": 0.0031, "step": 4050 }, { "epoch": 1.8555758683729433, "grad_norm": 0.0046151746064424515, "learning_rate": 1.4488117001828156e-05, "loss": 0.1252, "step": 4060 }, { "epoch": 1.860146252285192, "grad_norm": 0.01960400864481926, "learning_rate": 1.403107861060329e-05, "loss": 0.0024, "step": 4070 }, { "epoch": 1.8647166361974405, "grad_norm": 0.012547549791634083, "learning_rate": 1.3574040219378428e-05, "loss": 0.0207, "step": 4080 }, { "epoch": 1.8692870201096892, "grad_norm": 0.0962536633014679, "learning_rate": 1.3117001828153566e-05, "loss": 0.0475, "step": 4090 }, { "epoch": 1.8738574040219378, "grad_norm": 0.012872631661593914, "learning_rate": 1.2659963436928701e-05, "loss": 0.0427, "step": 4100 }, { "epoch": 1.8738574040219378, "eval_accuracy": 0.9768666666666667, "eval_loss": 0.08114204555749893, "eval_runtime": 70.3536, "eval_samples_per_second": 213.209, "eval_steps_per_second": 13.333, "step": 4100 }, { "epoch": 1.8784277879341866, "grad_norm": 0.004386584740132093, "learning_rate": 1.220292504570384e-05, "loss": 0.0148, "step": 4110 }, { "epoch": 1.882998171846435, "grad_norm": 7.613697528839111, "learning_rate": 1.1745886654478977e-05, "loss": 0.0598, "step": 4120 }, { "epoch": 1.8875685557586839, "grad_norm": 0.19733187556266785, "learning_rate": 1.1288848263254114e-05, "loss": 0.0254, "step": 4130 }, { "epoch": 1.8921389396709323, "grad_norm": 0.003689356381073594, "learning_rate": 1.0831809872029252e-05, "loss": 0.0044, "step": 4140 }, { "epoch": 1.8967093235831811, "grad_norm": 1.0105313062667847, "learning_rate": 1.0374771480804388e-05, "loss": 0.03, "step": 4150 }, { "epoch": 1.9012797074954295, "grad_norm": 0.02574901282787323, "learning_rate": 9.917733089579526e-06, "loss": 0.012, "step": 4160 }, { "epoch": 1.9058500914076784, "grad_norm": 4.768786907196045, "learning_rate": 9.460694698354663e-06, "loss": 0.0395, "step": 4170 }, { "epoch": 1.9104204753199268, "grad_norm": 0.004033361561596394, "learning_rate": 9.0036563071298e-06, "loss": 0.012, "step": 4180 }, { "epoch": 1.9149908592321756, "grad_norm": 0.013113109394907951, "learning_rate": 8.546617915904936e-06, "loss": 0.0672, "step": 4190 }, { "epoch": 1.919561243144424, "grad_norm": 0.294810950756073, "learning_rate": 8.089579524680074e-06, "loss": 0.003, "step": 4200 }, { "epoch": 1.919561243144424, "eval_accuracy": 0.9761333333333333, "eval_loss": 0.08205202966928482, "eval_runtime": 70.0255, "eval_samples_per_second": 214.208, "eval_steps_per_second": 13.395, "step": 4200 }, { "epoch": 1.9241316270566728, "grad_norm": 0.05837790668010712, "learning_rate": 7.63254113345521e-06, "loss": 0.0415, "step": 4210 }, { "epoch": 1.9287020109689212, "grad_norm": 1.440628170967102, "learning_rate": 7.175502742230347e-06, "loss": 0.0267, "step": 4220 }, { "epoch": 1.93327239488117, "grad_norm": 0.04937027022242546, "learning_rate": 6.7184643510054855e-06, "loss": 0.0385, "step": 4230 }, { "epoch": 1.9378427787934185, "grad_norm": 0.006680316291749477, "learning_rate": 6.261425959780622e-06, "loss": 0.0025, "step": 4240 }, { "epoch": 1.9424131627056673, "grad_norm": 0.004382527898997068, "learning_rate": 5.804387568555759e-06, "loss": 0.0843, "step": 4250 }, { "epoch": 1.946983546617916, "grad_norm": 0.06263825297355652, "learning_rate": 5.3473491773308956e-06, "loss": 0.0359, "step": 4260 }, { "epoch": 1.9515539305301646, "grad_norm": 0.017276106402277946, "learning_rate": 4.890310786106033e-06, "loss": 0.0264, "step": 4270 }, { "epoch": 1.9561243144424132, "grad_norm": 0.7312209606170654, "learning_rate": 4.43327239488117e-06, "loss": 0.0128, "step": 4280 }, { "epoch": 1.9606946983546618, "grad_norm": 0.007708389312028885, "learning_rate": 3.976234003656307e-06, "loss": 0.0356, "step": 4290 }, { "epoch": 1.9652650822669104, "grad_norm": 0.004229346755892038, "learning_rate": 3.5191956124314446e-06, "loss": 0.0182, "step": 4300 }, { "epoch": 1.9652650822669104, "eval_accuracy": 0.9772666666666666, "eval_loss": 0.08026164770126343, "eval_runtime": 69.5428, "eval_samples_per_second": 215.695, "eval_steps_per_second": 13.488, "step": 4300 }, { "epoch": 1.969835466179159, "grad_norm": 0.0215240940451622, "learning_rate": 3.0621572212065814e-06, "loss": 0.062, "step": 4310 }, { "epoch": 1.9744058500914077, "grad_norm": 0.022770356386899948, "learning_rate": 2.6051188299817187e-06, "loss": 0.0424, "step": 4320 }, { "epoch": 1.9789762340036563, "grad_norm": 0.01893909089267254, "learning_rate": 2.148080438756856e-06, "loss": 0.0031, "step": 4330 }, { "epoch": 1.983546617915905, "grad_norm": 7.610752105712891, "learning_rate": 1.691042047531993e-06, "loss": 0.0827, "step": 4340 }, { "epoch": 1.9881170018281535, "grad_norm": 0.008086251094937325, "learning_rate": 1.2340036563071298e-06, "loss": 0.0087, "step": 4350 }, { "epoch": 1.9926873857404022, "grad_norm": 4.746099948883057, "learning_rate": 7.769652650822669e-07, "loss": 0.0487, "step": 4360 }, { "epoch": 1.9972577696526508, "grad_norm": 0.3847046494483948, "learning_rate": 3.1992687385740404e-07, "loss": 0.0442, "step": 4370 }, { "epoch": 2.0, "step": 4376, "total_flos": 5.42482821328896e+18, "train_loss": 0.13969962793986365, "train_runtime": 4718.8358, "train_samples_per_second": 14.834, "train_steps_per_second": 0.927 }, { "epoch": 2.0, "eval_accuracy": 0.9772666666666666, "eval_loss": 0.08026164770126343, "eval_runtime": 73.8075, "eval_samples_per_second": 203.231, "eval_steps_per_second": 12.709, "step": 4376 } ], "logging_steps": 10, "max_steps": 4376, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.42482821328896e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }