{ "best_metric": 0.10886295884847641, "best_model_checkpoint": "./vit-base-brain-tumor-detection/checkpoint-3700", "epoch": 20.0, "eval_steps": 100, "global_step": 6400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03125, "grad_norm": 1.5038443803787231, "learning_rate": 0.0001996875, "loss": 1.1434, "step": 10 }, { "epoch": 0.0625, "grad_norm": 0.8518216013908386, "learning_rate": 0.000199375, "loss": 0.9712, "step": 20 }, { "epoch": 0.09375, "grad_norm": 2.0169894695281982, "learning_rate": 0.00019906250000000002, "loss": 0.9002, "step": 30 }, { "epoch": 0.125, "grad_norm": 1.4780454635620117, "learning_rate": 0.00019875, "loss": 0.9304, "step": 40 }, { "epoch": 0.15625, "grad_norm": 0.8681000471115112, "learning_rate": 0.00019843750000000002, "loss": 0.9191, "step": 50 }, { "epoch": 0.1875, "grad_norm": 0.9179439544677734, "learning_rate": 0.000198125, "loss": 0.929, "step": 60 }, { "epoch": 0.21875, "grad_norm": 1.7340352535247803, "learning_rate": 0.0001978125, "loss": 0.878, "step": 70 }, { "epoch": 0.25, "grad_norm": 1.0093904733657837, "learning_rate": 0.00019750000000000003, "loss": 0.9233, "step": 80 }, { "epoch": 0.28125, "grad_norm": 2.5266435146331787, "learning_rate": 0.00019718750000000002, "loss": 0.9763, "step": 90 }, { "epoch": 0.3125, "grad_norm": 3.176175594329834, "learning_rate": 0.000196875, "loss": 0.8826, "step": 100 }, { "epoch": 0.3125, "eval_accuracy": 0.575, "eval_loss": 0.9027458429336548, "eval_runtime": 6.7552, "eval_samples_per_second": 189.483, "eval_steps_per_second": 23.685, "step": 100 }, { "epoch": 0.34375, "grad_norm": 2.2321112155914307, "learning_rate": 0.00019656250000000001, "loss": 0.8653, "step": 110 }, { "epoch": 0.375, "grad_norm": 2.9958410263061523, "learning_rate": 0.00019625, "loss": 0.8645, "step": 120 }, { "epoch": 0.40625, "grad_norm": 1.3575609922409058, "learning_rate": 0.0001959375, "loss": 0.8028, "step": 130 }, { "epoch": 0.4375, "grad_norm": 2.6939704418182373, "learning_rate": 0.00019562500000000003, "loss": 0.7853, "step": 140 }, { "epoch": 0.46875, "grad_norm": 3.4280309677124023, "learning_rate": 0.0001953125, "loss": 0.8574, "step": 150 }, { "epoch": 0.5, "grad_norm": 2.5754916667938232, "learning_rate": 0.000195, "loss": 0.9127, "step": 160 }, { "epoch": 0.53125, "grad_norm": 1.0515691041946411, "learning_rate": 0.0001946875, "loss": 0.6715, "step": 170 }, { "epoch": 0.5625, "grad_norm": 4.097135066986084, "learning_rate": 0.00019437500000000002, "loss": 0.9268, "step": 180 }, { "epoch": 0.59375, "grad_norm": 1.1513463258743286, "learning_rate": 0.0001940625, "loss": 1.1014, "step": 190 }, { "epoch": 0.625, "grad_norm": 2.027341604232788, "learning_rate": 0.00019375000000000002, "loss": 0.8908, "step": 200 }, { "epoch": 0.625, "eval_accuracy": 0.5984375, "eval_loss": 0.8483627438545227, "eval_runtime": 6.1939, "eval_samples_per_second": 206.656, "eval_steps_per_second": 25.832, "step": 200 }, { "epoch": 0.65625, "grad_norm": 3.054396629333496, "learning_rate": 0.0001934375, "loss": 1.0798, "step": 210 }, { "epoch": 0.6875, "grad_norm": 1.4708349704742432, "learning_rate": 0.000193125, "loss": 0.9002, "step": 220 }, { "epoch": 0.71875, "grad_norm": 3.5276074409484863, "learning_rate": 0.00019281250000000003, "loss": 0.7594, "step": 230 }, { "epoch": 0.75, "grad_norm": 1.7735655307769775, "learning_rate": 0.00019250000000000002, "loss": 0.8413, "step": 240 }, { "epoch": 0.78125, "grad_norm": 2.533834457397461, "learning_rate": 0.0001921875, "loss": 0.6925, "step": 250 }, { "epoch": 0.8125, "grad_norm": 1.8991681337356567, "learning_rate": 0.00019187500000000002, "loss": 0.7569, "step": 260 }, { "epoch": 0.84375, "grad_norm": 1.8567323684692383, "learning_rate": 0.0001915625, "loss": 0.8479, "step": 270 }, { "epoch": 0.875, "grad_norm": 2.904021739959717, "learning_rate": 0.00019125000000000001, "loss": 0.7737, "step": 280 }, { "epoch": 0.90625, "grad_norm": 2.1224987506866455, "learning_rate": 0.00019093750000000003, "loss": 0.7647, "step": 290 }, { "epoch": 0.9375, "grad_norm": 2.221484661102295, "learning_rate": 0.000190625, "loss": 0.8229, "step": 300 }, { "epoch": 0.9375, "eval_accuracy": 0.66953125, "eval_loss": 0.7514046430587769, "eval_runtime": 6.6282, "eval_samples_per_second": 193.115, "eval_steps_per_second": 24.139, "step": 300 }, { "epoch": 0.96875, "grad_norm": 2.0415520668029785, "learning_rate": 0.0001903125, "loss": 0.8343, "step": 310 }, { "epoch": 1.0, "grad_norm": 1.7857444286346436, "learning_rate": 0.00019, "loss": 0.7851, "step": 320 }, { "epoch": 1.03125, "grad_norm": 3.682004451751709, "learning_rate": 0.00018968750000000002, "loss": 0.659, "step": 330 }, { "epoch": 1.0625, "grad_norm": 3.909653902053833, "learning_rate": 0.000189375, "loss": 0.5986, "step": 340 }, { "epoch": 1.09375, "grad_norm": 2.665677785873413, "learning_rate": 0.00018906250000000002, "loss": 0.599, "step": 350 }, { "epoch": 1.125, "grad_norm": 3.378941059112549, "learning_rate": 0.00018875, "loss": 0.6162, "step": 360 }, { "epoch": 1.15625, "grad_norm": 2.2860324382781982, "learning_rate": 0.0001884375, "loss": 0.5133, "step": 370 }, { "epoch": 1.1875, "grad_norm": 4.41148567199707, "learning_rate": 0.000188125, "loss": 0.4787, "step": 380 }, { "epoch": 1.21875, "grad_norm": 2.802339792251587, "learning_rate": 0.00018781250000000002, "loss": 0.8346, "step": 390 }, { "epoch": 1.25, "grad_norm": 3.0579328536987305, "learning_rate": 0.0001875, "loss": 0.5299, "step": 400 }, { "epoch": 1.25, "eval_accuracy": 0.71640625, "eval_loss": 0.6797709465026855, "eval_runtime": 6.1605, "eval_samples_per_second": 207.776, "eval_steps_per_second": 25.972, "step": 400 }, { "epoch": 1.28125, "grad_norm": 4.516961574554443, "learning_rate": 0.00018718750000000002, "loss": 0.5706, "step": 410 }, { "epoch": 1.3125, "grad_norm": 4.825928688049316, "learning_rate": 0.000186875, "loss": 0.4748, "step": 420 }, { "epoch": 1.34375, "grad_norm": 4.58062219619751, "learning_rate": 0.00018656250000000001, "loss": 0.589, "step": 430 }, { "epoch": 1.375, "grad_norm": 3.5321483612060547, "learning_rate": 0.00018625, "loss": 0.4854, "step": 440 }, { "epoch": 1.40625, "grad_norm": 2.8487465381622314, "learning_rate": 0.0001859375, "loss": 0.4736, "step": 450 }, { "epoch": 1.4375, "grad_norm": 3.3944597244262695, "learning_rate": 0.000185625, "loss": 0.5127, "step": 460 }, { "epoch": 1.46875, "grad_norm": 3.1018853187561035, "learning_rate": 0.0001853125, "loss": 0.5043, "step": 470 }, { "epoch": 1.5, "grad_norm": 1.7340843677520752, "learning_rate": 0.00018500000000000002, "loss": 0.4293, "step": 480 }, { "epoch": 1.53125, "grad_norm": 5.125267505645752, "learning_rate": 0.0001846875, "loss": 0.4616, "step": 490 }, { "epoch": 1.5625, "grad_norm": 4.208590030670166, "learning_rate": 0.000184375, "loss": 0.5207, "step": 500 }, { "epoch": 1.5625, "eval_accuracy": 0.7375, "eval_loss": 0.6465662717819214, "eval_runtime": 6.4735, "eval_samples_per_second": 197.729, "eval_steps_per_second": 24.716, "step": 500 }, { "epoch": 1.59375, "grad_norm": 3.470857620239258, "learning_rate": 0.0001840625, "loss": 0.5758, "step": 510 }, { "epoch": 1.625, "grad_norm": 2.353254556655884, "learning_rate": 0.00018375, "loss": 0.4954, "step": 520 }, { "epoch": 1.65625, "grad_norm": 5.26539421081543, "learning_rate": 0.0001834375, "loss": 0.4687, "step": 530 }, { "epoch": 1.6875, "grad_norm": 4.430329322814941, "learning_rate": 0.00018312500000000002, "loss": 0.3183, "step": 540 }, { "epoch": 1.71875, "grad_norm": 3.066425323486328, "learning_rate": 0.0001828125, "loss": 0.4647, "step": 550 }, { "epoch": 1.75, "grad_norm": 3.959084987640381, "learning_rate": 0.0001825, "loss": 0.3383, "step": 560 }, { "epoch": 1.78125, "grad_norm": 7.477952480316162, "learning_rate": 0.0001821875, "loss": 0.4317, "step": 570 }, { "epoch": 1.8125, "grad_norm": 3.7508246898651123, "learning_rate": 0.00018187500000000002, "loss": 0.621, "step": 580 }, { "epoch": 1.84375, "grad_norm": 6.189945220947266, "learning_rate": 0.0001815625, "loss": 0.5086, "step": 590 }, { "epoch": 1.875, "grad_norm": 6.064743995666504, "learning_rate": 0.00018125000000000001, "loss": 0.4967, "step": 600 }, { "epoch": 1.875, "eval_accuracy": 0.74609375, "eval_loss": 0.6302900314331055, "eval_runtime": 6.8591, "eval_samples_per_second": 186.614, "eval_steps_per_second": 23.327, "step": 600 }, { "epoch": 1.90625, "grad_norm": 2.6553187370300293, "learning_rate": 0.0001809375, "loss": 0.4273, "step": 610 }, { "epoch": 1.9375, "grad_norm": 8.86166000366211, "learning_rate": 0.000180625, "loss": 0.4264, "step": 620 }, { "epoch": 1.96875, "grad_norm": 5.616076946258545, "learning_rate": 0.00018031250000000003, "loss": 0.4907, "step": 630 }, { "epoch": 2.0, "grad_norm": 4.036799907684326, "learning_rate": 0.00018, "loss": 0.4537, "step": 640 }, { "epoch": 2.03125, "grad_norm": 4.771281719207764, "learning_rate": 0.0001796875, "loss": 0.2492, "step": 650 }, { "epoch": 2.0625, "grad_norm": 3.654841423034668, "learning_rate": 0.000179375, "loss": 0.2568, "step": 660 }, { "epoch": 2.09375, "grad_norm": 3.043989896774292, "learning_rate": 0.0001790625, "loss": 0.2354, "step": 670 }, { "epoch": 2.125, "grad_norm": 6.0935516357421875, "learning_rate": 0.00017875, "loss": 0.2106, "step": 680 }, { "epoch": 2.15625, "grad_norm": 2.231640577316284, "learning_rate": 0.00017843750000000002, "loss": 0.2804, "step": 690 }, { "epoch": 2.1875, "grad_norm": 8.500765800476074, "learning_rate": 0.000178125, "loss": 0.3977, "step": 700 }, { "epoch": 2.1875, "eval_accuracy": 0.771875, "eval_loss": 0.7239754796028137, "eval_runtime": 6.8507, "eval_samples_per_second": 186.843, "eval_steps_per_second": 23.355, "step": 700 }, { "epoch": 2.21875, "grad_norm": 5.513547420501709, "learning_rate": 0.0001778125, "loss": 0.3942, "step": 710 }, { "epoch": 2.25, "grad_norm": 6.819497585296631, "learning_rate": 0.0001775, "loss": 0.2489, "step": 720 }, { "epoch": 2.28125, "grad_norm": 2.9887595176696777, "learning_rate": 0.00017718750000000002, "loss": 0.2054, "step": 730 }, { "epoch": 2.3125, "grad_norm": 2.461519479751587, "learning_rate": 0.000176875, "loss": 0.2252, "step": 740 }, { "epoch": 2.34375, "grad_norm": 1.7205028533935547, "learning_rate": 0.00017656250000000002, "loss": 0.1975, "step": 750 }, { "epoch": 2.375, "grad_norm": 0.9731912612915039, "learning_rate": 0.00017625, "loss": 0.2049, "step": 760 }, { "epoch": 2.40625, "grad_norm": 4.950948715209961, "learning_rate": 0.0001759375, "loss": 0.283, "step": 770 }, { "epoch": 2.4375, "grad_norm": 3.5972580909729004, "learning_rate": 0.00017562500000000003, "loss": 0.2172, "step": 780 }, { "epoch": 2.46875, "grad_norm": 8.976056098937988, "learning_rate": 0.0001753125, "loss": 0.2733, "step": 790 }, { "epoch": 2.5, "grad_norm": 1.2202197313308716, "learning_rate": 0.000175, "loss": 0.2744, "step": 800 }, { "epoch": 2.5, "eval_accuracy": 0.8734375, "eval_loss": 0.35440793633461, "eval_runtime": 6.2483, "eval_samples_per_second": 204.854, "eval_steps_per_second": 25.607, "step": 800 }, { "epoch": 2.53125, "grad_norm": 7.248142719268799, "learning_rate": 0.0001746875, "loss": 0.2173, "step": 810 }, { "epoch": 2.5625, "grad_norm": 6.068428039550781, "learning_rate": 0.000174375, "loss": 0.2172, "step": 820 }, { "epoch": 2.59375, "grad_norm": 5.297908306121826, "learning_rate": 0.0001740625, "loss": 0.3827, "step": 830 }, { "epoch": 2.625, "grad_norm": 8.936563491821289, "learning_rate": 0.00017375000000000002, "loss": 0.2638, "step": 840 }, { "epoch": 2.65625, "grad_norm": 5.005488395690918, "learning_rate": 0.0001734375, "loss": 0.281, "step": 850 }, { "epoch": 2.6875, "grad_norm": 0.3621249794960022, "learning_rate": 0.000173125, "loss": 0.1424, "step": 860 }, { "epoch": 2.71875, "grad_norm": 2.321965217590332, "learning_rate": 0.0001728125, "loss": 0.2586, "step": 870 }, { "epoch": 2.75, "grad_norm": 9.775487899780273, "learning_rate": 0.00017250000000000002, "loss": 0.185, "step": 880 }, { "epoch": 2.78125, "grad_norm": 0.6502866744995117, "learning_rate": 0.0001721875, "loss": 0.1635, "step": 890 }, { "epoch": 2.8125, "grad_norm": 3.4629428386688232, "learning_rate": 0.00017187500000000002, "loss": 0.4271, "step": 900 }, { "epoch": 2.8125, "eval_accuracy": 0.89375, "eval_loss": 0.30369627475738525, "eval_runtime": 6.7624, "eval_samples_per_second": 189.281, "eval_steps_per_second": 23.66, "step": 900 }, { "epoch": 2.84375, "grad_norm": 1.4681894779205322, "learning_rate": 0.0001715625, "loss": 0.193, "step": 910 }, { "epoch": 2.875, "grad_norm": 2.8025732040405273, "learning_rate": 0.00017125, "loss": 0.2532, "step": 920 }, { "epoch": 2.90625, "grad_norm": 5.300601482391357, "learning_rate": 0.00017093750000000003, "loss": 0.178, "step": 930 }, { "epoch": 2.9375, "grad_norm": 0.9630404710769653, "learning_rate": 0.00017062500000000001, "loss": 0.2453, "step": 940 }, { "epoch": 2.96875, "grad_norm": 3.942497968673706, "learning_rate": 0.0001703125, "loss": 0.2302, "step": 950 }, { "epoch": 3.0, "grad_norm": 5.0087809562683105, "learning_rate": 0.00017, "loss": 0.2989, "step": 960 }, { "epoch": 3.03125, "grad_norm": 6.783499717712402, "learning_rate": 0.0001696875, "loss": 0.3932, "step": 970 }, { "epoch": 3.0625, "grad_norm": 4.156474590301514, "learning_rate": 0.000169375, "loss": 0.1148, "step": 980 }, { "epoch": 3.09375, "grad_norm": 0.22207698225975037, "learning_rate": 0.00016906250000000002, "loss": 0.0925, "step": 990 }, { "epoch": 3.125, "grad_norm": 7.46227502822876, "learning_rate": 0.00016875, "loss": 0.2484, "step": 1000 }, { "epoch": 3.125, "eval_accuracy": 0.86015625, "eval_loss": 0.4111490249633789, "eval_runtime": 6.2118, "eval_samples_per_second": 206.058, "eval_steps_per_second": 25.757, "step": 1000 }, { "epoch": 3.15625, "grad_norm": 1.8679814338684082, "learning_rate": 0.0001684375, "loss": 0.1024, "step": 1010 }, { "epoch": 3.1875, "grad_norm": 1.4344357252120972, "learning_rate": 0.000168125, "loss": 0.1532, "step": 1020 }, { "epoch": 3.21875, "grad_norm": 5.7756218910217285, "learning_rate": 0.00016781250000000002, "loss": 0.1433, "step": 1030 }, { "epoch": 3.25, "grad_norm": 6.2325358390808105, "learning_rate": 0.0001675, "loss": 0.2056, "step": 1040 }, { "epoch": 3.28125, "grad_norm": 0.7062513828277588, "learning_rate": 0.00016718750000000002, "loss": 0.1883, "step": 1050 }, { "epoch": 3.3125, "grad_norm": 0.5934551954269409, "learning_rate": 0.000166875, "loss": 0.1311, "step": 1060 }, { "epoch": 3.34375, "grad_norm": 2.90659236907959, "learning_rate": 0.0001665625, "loss": 0.0926, "step": 1070 }, { "epoch": 3.375, "grad_norm": 0.12364790588617325, "learning_rate": 0.00016625000000000003, "loss": 0.0998, "step": 1080 }, { "epoch": 3.40625, "grad_norm": 8.838610649108887, "learning_rate": 0.00016593750000000002, "loss": 0.1322, "step": 1090 }, { "epoch": 3.4375, "grad_norm": 0.9064333438873291, "learning_rate": 0.000165625, "loss": 0.0797, "step": 1100 }, { "epoch": 3.4375, "eval_accuracy": 0.8953125, "eval_loss": 0.3781999349594116, "eval_runtime": 6.8762, "eval_samples_per_second": 186.149, "eval_steps_per_second": 23.269, "step": 1100 }, { "epoch": 3.46875, "grad_norm": 0.37031790614128113, "learning_rate": 0.0001653125, "loss": 0.1157, "step": 1110 }, { "epoch": 3.5, "grad_norm": 10.903687477111816, "learning_rate": 0.000165, "loss": 0.1616, "step": 1120 }, { "epoch": 3.53125, "grad_norm": 7.575557231903076, "learning_rate": 0.0001646875, "loss": 0.0864, "step": 1130 }, { "epoch": 3.5625, "grad_norm": 0.7798458337783813, "learning_rate": 0.00016437500000000002, "loss": 0.1479, "step": 1140 }, { "epoch": 3.59375, "grad_norm": 0.30808526277542114, "learning_rate": 0.0001640625, "loss": 0.091, "step": 1150 }, { "epoch": 3.625, "grad_norm": 1.3779064416885376, "learning_rate": 0.00016375, "loss": 0.0918, "step": 1160 }, { "epoch": 3.65625, "grad_norm": 10.742758750915527, "learning_rate": 0.0001634375, "loss": 0.213, "step": 1170 }, { "epoch": 3.6875, "grad_norm": 4.325877666473389, "learning_rate": 0.00016312500000000002, "loss": 0.1672, "step": 1180 }, { "epoch": 3.71875, "grad_norm": 0.06019480153918266, "learning_rate": 0.0001628125, "loss": 0.1229, "step": 1190 }, { "epoch": 3.75, "grad_norm": 0.060391902923583984, "learning_rate": 0.00016250000000000002, "loss": 0.0662, "step": 1200 }, { "epoch": 3.75, "eval_accuracy": 0.9171875, "eval_loss": 0.309553325176239, "eval_runtime": 6.3504, "eval_samples_per_second": 201.561, "eval_steps_per_second": 25.195, "step": 1200 }, { "epoch": 3.78125, "grad_norm": 7.1998066902160645, "learning_rate": 0.0001621875, "loss": 0.2368, "step": 1210 }, { "epoch": 3.8125, "grad_norm": 2.0582878589630127, "learning_rate": 0.000161875, "loss": 0.1161, "step": 1220 }, { "epoch": 3.84375, "grad_norm": 0.4987049698829651, "learning_rate": 0.0001615625, "loss": 0.1798, "step": 1230 }, { "epoch": 3.875, "grad_norm": 6.303996562957764, "learning_rate": 0.00016125000000000002, "loss": 0.2193, "step": 1240 }, { "epoch": 3.90625, "grad_norm": 0.1131618395447731, "learning_rate": 0.0001609375, "loss": 0.1626, "step": 1250 }, { "epoch": 3.9375, "grad_norm": 8.112652778625488, "learning_rate": 0.00016062500000000001, "loss": 0.1033, "step": 1260 }, { "epoch": 3.96875, "grad_norm": 0.14582502841949463, "learning_rate": 0.0001603125, "loss": 0.0548, "step": 1270 }, { "epoch": 4.0, "grad_norm": 7.206060409545898, "learning_rate": 0.00016, "loss": 0.1758, "step": 1280 }, { "epoch": 4.03125, "grad_norm": 0.12302320450544357, "learning_rate": 0.0001596875, "loss": 0.0118, "step": 1290 }, { "epoch": 4.0625, "grad_norm": 1.9936612844467163, "learning_rate": 0.000159375, "loss": 0.0894, "step": 1300 }, { "epoch": 4.0625, "eval_accuracy": 0.92890625, "eval_loss": 0.281791090965271, "eval_runtime": 6.6912, "eval_samples_per_second": 191.295, "eval_steps_per_second": 23.912, "step": 1300 }, { "epoch": 4.09375, "grad_norm": 15.060647010803223, "learning_rate": 0.0001590625, "loss": 0.0328, "step": 1310 }, { "epoch": 4.125, "grad_norm": 0.06893154978752136, "learning_rate": 0.00015875, "loss": 0.0508, "step": 1320 }, { "epoch": 4.15625, "grad_norm": 0.38513150811195374, "learning_rate": 0.00015843750000000002, "loss": 0.054, "step": 1330 }, { "epoch": 4.1875, "grad_norm": 9.78172492980957, "learning_rate": 0.000158125, "loss": 0.0225, "step": 1340 }, { "epoch": 4.21875, "grad_norm": 0.9426405429840088, "learning_rate": 0.00015781250000000002, "loss": 0.0897, "step": 1350 }, { "epoch": 4.25, "grad_norm": 0.03851567581295967, "learning_rate": 0.0001575, "loss": 0.0723, "step": 1360 }, { "epoch": 4.28125, "grad_norm": 3.465240001678467, "learning_rate": 0.0001571875, "loss": 0.0281, "step": 1370 }, { "epoch": 4.3125, "grad_norm": 4.168702125549316, "learning_rate": 0.000156875, "loss": 0.0334, "step": 1380 }, { "epoch": 4.34375, "grad_norm": 0.022729417309165, "learning_rate": 0.00015656250000000002, "loss": 0.0786, "step": 1390 }, { "epoch": 4.375, "grad_norm": 4.370222568511963, "learning_rate": 0.00015625, "loss": 0.1005, "step": 1400 }, { "epoch": 4.375, "eval_accuracy": 0.946875, "eval_loss": 0.21635571122169495, "eval_runtime": 6.8886, "eval_samples_per_second": 185.815, "eval_steps_per_second": 23.227, "step": 1400 }, { "epoch": 4.40625, "grad_norm": 2.6576523780822754, "learning_rate": 0.00015593750000000002, "loss": 0.0235, "step": 1410 }, { "epoch": 4.4375, "grad_norm": 9.146688461303711, "learning_rate": 0.000155625, "loss": 0.0842, "step": 1420 }, { "epoch": 4.46875, "grad_norm": 0.019813504070043564, "learning_rate": 0.00015531250000000001, "loss": 0.092, "step": 1430 }, { "epoch": 4.5, "grad_norm": 0.2571779191493988, "learning_rate": 0.000155, "loss": 0.0473, "step": 1440 }, { "epoch": 4.53125, "grad_norm": 5.699305057525635, "learning_rate": 0.0001546875, "loss": 0.1053, "step": 1450 }, { "epoch": 4.5625, "grad_norm": 0.04671861603856087, "learning_rate": 0.000154375, "loss": 0.0068, "step": 1460 }, { "epoch": 4.59375, "grad_norm": 0.1444374918937683, "learning_rate": 0.0001540625, "loss": 0.0946, "step": 1470 }, { "epoch": 4.625, "grad_norm": 0.06924466788768768, "learning_rate": 0.00015375000000000002, "loss": 0.0213, "step": 1480 }, { "epoch": 4.65625, "grad_norm": 0.03169933706521988, "learning_rate": 0.0001534375, "loss": 0.24, "step": 1490 }, { "epoch": 4.6875, "grad_norm": 4.107417106628418, "learning_rate": 0.000153125, "loss": 0.0997, "step": 1500 }, { "epoch": 4.6875, "eval_accuracy": 0.9109375, "eval_loss": 0.3378385901451111, "eval_runtime": 6.6178, "eval_samples_per_second": 193.418, "eval_steps_per_second": 24.177, "step": 1500 }, { "epoch": 4.71875, "grad_norm": 0.2504006028175354, "learning_rate": 0.0001528125, "loss": 0.0903, "step": 1510 }, { "epoch": 4.75, "grad_norm": 3.6016788482666016, "learning_rate": 0.0001525, "loss": 0.1676, "step": 1520 }, { "epoch": 4.78125, "grad_norm": 0.02008502557873726, "learning_rate": 0.0001521875, "loss": 0.0731, "step": 1530 }, { "epoch": 4.8125, "grad_norm": 0.019816860556602478, "learning_rate": 0.00015187500000000002, "loss": 0.0559, "step": 1540 }, { "epoch": 4.84375, "grad_norm": 1.7278194427490234, "learning_rate": 0.0001515625, "loss": 0.0306, "step": 1550 }, { "epoch": 4.875, "grad_norm": 0.19783662259578705, "learning_rate": 0.00015125, "loss": 0.1335, "step": 1560 }, { "epoch": 4.90625, "grad_norm": 11.390993118286133, "learning_rate": 0.0001509375, "loss": 0.1606, "step": 1570 }, { "epoch": 4.9375, "grad_norm": 4.290327072143555, "learning_rate": 0.00015062500000000002, "loss": 0.0807, "step": 1580 }, { "epoch": 4.96875, "grad_norm": 0.28723642230033875, "learning_rate": 0.0001503125, "loss": 0.1673, "step": 1590 }, { "epoch": 5.0, "grad_norm": 0.05478620529174805, "learning_rate": 0.00015000000000000001, "loss": 0.0715, "step": 1600 }, { "epoch": 5.0, "eval_accuracy": 0.91328125, "eval_loss": 0.3626965880393982, "eval_runtime": 6.9646, "eval_samples_per_second": 183.788, "eval_steps_per_second": 22.973, "step": 1600 }, { "epoch": 5.03125, "grad_norm": 3.0136938095092773, "learning_rate": 0.0001496875, "loss": 0.1965, "step": 1610 }, { "epoch": 5.0625, "grad_norm": 0.11938930302858353, "learning_rate": 0.00014937499999999999, "loss": 0.0065, "step": 1620 }, { "epoch": 5.09375, "grad_norm": 0.04192354157567024, "learning_rate": 0.00014906250000000003, "loss": 0.0813, "step": 1630 }, { "epoch": 5.125, "grad_norm": 0.018585534766316414, "learning_rate": 0.00014875, "loss": 0.0973, "step": 1640 }, { "epoch": 5.15625, "grad_norm": 3.9032442569732666, "learning_rate": 0.0001484375, "loss": 0.0587, "step": 1650 }, { "epoch": 5.1875, "grad_norm": 0.04510480910539627, "learning_rate": 0.000148125, "loss": 0.1506, "step": 1660 }, { "epoch": 5.21875, "grad_norm": 0.2749982178211212, "learning_rate": 0.0001478125, "loss": 0.1367, "step": 1670 }, { "epoch": 5.25, "grad_norm": 6.217327117919922, "learning_rate": 0.0001475, "loss": 0.0734, "step": 1680 }, { "epoch": 5.28125, "grad_norm": 0.11703751981258392, "learning_rate": 0.00014718750000000002, "loss": 0.0961, "step": 1690 }, { "epoch": 5.3125, "grad_norm": 1.1948308944702148, "learning_rate": 0.000146875, "loss": 0.0567, "step": 1700 }, { "epoch": 5.3125, "eval_accuracy": 0.9234375, "eval_loss": 0.30611464381217957, "eval_runtime": 7.2211, "eval_samples_per_second": 177.258, "eval_steps_per_second": 22.157, "step": 1700 }, { "epoch": 5.34375, "grad_norm": 0.025335168465971947, "learning_rate": 0.0001465625, "loss": 0.0511, "step": 1710 }, { "epoch": 5.375, "grad_norm": 0.033230509608983994, "learning_rate": 0.00014625, "loss": 0.0857, "step": 1720 }, { "epoch": 5.40625, "grad_norm": 0.027418823912739754, "learning_rate": 0.00014593750000000002, "loss": 0.0054, "step": 1730 }, { "epoch": 5.4375, "grad_norm": 7.358951568603516, "learning_rate": 0.000145625, "loss": 0.0582, "step": 1740 }, { "epoch": 5.46875, "grad_norm": 0.06502091139554977, "learning_rate": 0.00014531250000000002, "loss": 0.11, "step": 1750 }, { "epoch": 5.5, "grad_norm": 3.4256505966186523, "learning_rate": 0.000145, "loss": 0.1004, "step": 1760 }, { "epoch": 5.53125, "grad_norm": 0.023563764989376068, "learning_rate": 0.0001446875, "loss": 0.0066, "step": 1770 }, { "epoch": 5.5625, "grad_norm": 0.049036819487810135, "learning_rate": 0.00014437500000000003, "loss": 0.0271, "step": 1780 }, { "epoch": 5.59375, "grad_norm": 0.02042466588318348, "learning_rate": 0.0001440625, "loss": 0.0132, "step": 1790 }, { "epoch": 5.625, "grad_norm": 0.05669878423213959, "learning_rate": 0.00014375, "loss": 0.0558, "step": 1800 }, { "epoch": 5.625, "eval_accuracy": 0.94609375, "eval_loss": 0.23927736282348633, "eval_runtime": 6.5481, "eval_samples_per_second": 195.476, "eval_steps_per_second": 24.434, "step": 1800 }, { "epoch": 5.65625, "grad_norm": 0.9565289616584778, "learning_rate": 0.0001434375, "loss": 0.0366, "step": 1810 }, { "epoch": 5.6875, "grad_norm": 0.024000531062483788, "learning_rate": 0.000143125, "loss": 0.14, "step": 1820 }, { "epoch": 5.71875, "grad_norm": 6.16231107711792, "learning_rate": 0.0001428125, "loss": 0.1586, "step": 1830 }, { "epoch": 5.75, "grad_norm": 10.281587600708008, "learning_rate": 0.00014250000000000002, "loss": 0.1454, "step": 1840 }, { "epoch": 5.78125, "grad_norm": 8.064495086669922, "learning_rate": 0.0001421875, "loss": 0.1354, "step": 1850 }, { "epoch": 5.8125, "grad_norm": 0.27762091159820557, "learning_rate": 0.000141875, "loss": 0.0356, "step": 1860 }, { "epoch": 5.84375, "grad_norm": 7.199831962585449, "learning_rate": 0.0001415625, "loss": 0.0485, "step": 1870 }, { "epoch": 5.875, "grad_norm": 0.013454968109726906, "learning_rate": 0.00014125000000000002, "loss": 0.0317, "step": 1880 }, { "epoch": 5.90625, "grad_norm": 0.8754172921180725, "learning_rate": 0.0001409375, "loss": 0.0198, "step": 1890 }, { "epoch": 5.9375, "grad_norm": 0.06731715798377991, "learning_rate": 0.00014062500000000002, "loss": 0.0061, "step": 1900 }, { "epoch": 5.9375, "eval_accuracy": 0.95859375, "eval_loss": 0.17380020022392273, "eval_runtime": 6.5259, "eval_samples_per_second": 196.142, "eval_steps_per_second": 24.518, "step": 1900 }, { "epoch": 5.96875, "grad_norm": 1.3012233972549438, "learning_rate": 0.0001403125, "loss": 0.1023, "step": 1910 }, { "epoch": 6.0, "grad_norm": 0.058877017349004745, "learning_rate": 0.00014, "loss": 0.0555, "step": 1920 }, { "epoch": 6.03125, "grad_norm": 1.3218984603881836, "learning_rate": 0.00013968750000000003, "loss": 0.1671, "step": 1930 }, { "epoch": 6.0625, "grad_norm": 0.7441987991333008, "learning_rate": 0.000139375, "loss": 0.046, "step": 1940 }, { "epoch": 6.09375, "grad_norm": 0.3219761550426483, "learning_rate": 0.0001390625, "loss": 0.0741, "step": 1950 }, { "epoch": 6.125, "grad_norm": 0.3803882598876953, "learning_rate": 0.00013875, "loss": 0.1102, "step": 1960 }, { "epoch": 6.15625, "grad_norm": 0.034619808197021484, "learning_rate": 0.0001384375, "loss": 0.0844, "step": 1970 }, { "epoch": 6.1875, "grad_norm": 0.03565617650747299, "learning_rate": 0.000138125, "loss": 0.0162, "step": 1980 }, { "epoch": 6.21875, "grad_norm": 0.05813159421086311, "learning_rate": 0.00013781250000000002, "loss": 0.0303, "step": 1990 }, { "epoch": 6.25, "grad_norm": 0.16888560354709625, "learning_rate": 0.0001375, "loss": 0.0449, "step": 2000 }, { "epoch": 6.25, "eval_accuracy": 0.94921875, "eval_loss": 0.20937061309814453, "eval_runtime": 6.6824, "eval_samples_per_second": 191.549, "eval_steps_per_second": 23.944, "step": 2000 }, { "epoch": 6.28125, "grad_norm": 0.13594718277454376, "learning_rate": 0.0001371875, "loss": 0.0127, "step": 2010 }, { "epoch": 6.3125, "grad_norm": 5.2705183029174805, "learning_rate": 0.000136875, "loss": 0.032, "step": 2020 }, { "epoch": 6.34375, "grad_norm": 11.641499519348145, "learning_rate": 0.00013656250000000002, "loss": 0.063, "step": 2030 }, { "epoch": 6.375, "grad_norm": 0.017323823645710945, "learning_rate": 0.00013625, "loss": 0.0106, "step": 2040 }, { "epoch": 6.40625, "grad_norm": 0.029373859986662865, "learning_rate": 0.00013593750000000002, "loss": 0.0032, "step": 2050 }, { "epoch": 6.4375, "grad_norm": 0.3746764063835144, "learning_rate": 0.000135625, "loss": 0.0606, "step": 2060 }, { "epoch": 6.46875, "grad_norm": 0.1948755830526352, "learning_rate": 0.0001353125, "loss": 0.0763, "step": 2070 }, { "epoch": 6.5, "grad_norm": 0.017781907692551613, "learning_rate": 0.00013500000000000003, "loss": 0.1066, "step": 2080 }, { "epoch": 6.53125, "grad_norm": 7.0899577140808105, "learning_rate": 0.00013468750000000001, "loss": 0.0805, "step": 2090 }, { "epoch": 6.5625, "grad_norm": 0.2712390720844269, "learning_rate": 0.000134375, "loss": 0.0073, "step": 2100 }, { "epoch": 6.5625, "eval_accuracy": 0.95390625, "eval_loss": 0.18335095047950745, "eval_runtime": 7.1604, "eval_samples_per_second": 178.761, "eval_steps_per_second": 22.345, "step": 2100 }, { "epoch": 6.59375, "grad_norm": 18.94267463684082, "learning_rate": 0.0001340625, "loss": 0.0978, "step": 2110 }, { "epoch": 6.625, "grad_norm": 12.507984161376953, "learning_rate": 0.00013375, "loss": 0.1564, "step": 2120 }, { "epoch": 6.65625, "grad_norm": 0.11404982954263687, "learning_rate": 0.0001334375, "loss": 0.2087, "step": 2130 }, { "epoch": 6.6875, "grad_norm": 5.735910415649414, "learning_rate": 0.00013312500000000002, "loss": 0.1009, "step": 2140 }, { "epoch": 6.71875, "grad_norm": 0.24378864467144012, "learning_rate": 0.0001328125, "loss": 0.117, "step": 2150 }, { "epoch": 6.75, "grad_norm": 1.378218412399292, "learning_rate": 0.0001325, "loss": 0.0559, "step": 2160 }, { "epoch": 6.78125, "grad_norm": 0.11509452760219574, "learning_rate": 0.0001321875, "loss": 0.1116, "step": 2170 }, { "epoch": 6.8125, "grad_norm": 0.024781059473752975, "learning_rate": 0.00013187500000000002, "loss": 0.0162, "step": 2180 }, { "epoch": 6.84375, "grad_norm": 0.013253854587674141, "learning_rate": 0.0001315625, "loss": 0.0124, "step": 2190 }, { "epoch": 6.875, "grad_norm": 0.009435000829398632, "learning_rate": 0.00013125000000000002, "loss": 0.0425, "step": 2200 }, { "epoch": 6.875, "eval_accuracy": 0.9265625, "eval_loss": 0.2847265601158142, "eval_runtime": 6.4944, "eval_samples_per_second": 197.092, "eval_steps_per_second": 24.637, "step": 2200 }, { "epoch": 6.90625, "grad_norm": 0.009621995501220226, "learning_rate": 0.0001309375, "loss": 0.0092, "step": 2210 }, { "epoch": 6.9375, "grad_norm": 13.595647811889648, "learning_rate": 0.000130625, "loss": 0.033, "step": 2220 }, { "epoch": 6.96875, "grad_norm": 0.046547386795282364, "learning_rate": 0.0001303125, "loss": 0.0697, "step": 2230 }, { "epoch": 7.0, "grad_norm": 0.018974941223859787, "learning_rate": 0.00013000000000000002, "loss": 0.0325, "step": 2240 }, { "epoch": 7.03125, "grad_norm": 0.011768043972551823, "learning_rate": 0.0001296875, "loss": 0.1392, "step": 2250 }, { "epoch": 7.0625, "grad_norm": 0.10145868360996246, "learning_rate": 0.00012937500000000001, "loss": 0.0034, "step": 2260 }, { "epoch": 7.09375, "grad_norm": 2.6255838871002197, "learning_rate": 0.0001290625, "loss": 0.0205, "step": 2270 }, { "epoch": 7.125, "grad_norm": 7.713711261749268, "learning_rate": 0.00012875, "loss": 0.085, "step": 2280 }, { "epoch": 7.15625, "grad_norm": 0.015717368572950363, "learning_rate": 0.0001284375, "loss": 0.073, "step": 2290 }, { "epoch": 7.1875, "grad_norm": 0.023385796695947647, "learning_rate": 0.000128125, "loss": 0.0397, "step": 2300 }, { "epoch": 7.1875, "eval_accuracy": 0.9125, "eval_loss": 0.40313416719436646, "eval_runtime": 7.0181, "eval_samples_per_second": 182.385, "eval_steps_per_second": 22.798, "step": 2300 }, { "epoch": 7.21875, "grad_norm": 0.05634606257081032, "learning_rate": 0.0001278125, "loss": 0.0661, "step": 2310 }, { "epoch": 7.25, "grad_norm": 0.0715768113732338, "learning_rate": 0.0001275, "loss": 0.0194, "step": 2320 }, { "epoch": 7.28125, "grad_norm": 0.03949156031012535, "learning_rate": 0.00012718750000000002, "loss": 0.0039, "step": 2330 }, { "epoch": 7.3125, "grad_norm": 0.011410553939640522, "learning_rate": 0.000126875, "loss": 0.0305, "step": 2340 }, { "epoch": 7.34375, "grad_norm": 0.00789843499660492, "learning_rate": 0.0001265625, "loss": 0.0022, "step": 2350 }, { "epoch": 7.375, "grad_norm": 0.6067203283309937, "learning_rate": 0.00012625, "loss": 0.0022, "step": 2360 }, { "epoch": 7.40625, "grad_norm": 3.3476336002349854, "learning_rate": 0.0001259375, "loss": 0.0642, "step": 2370 }, { "epoch": 7.4375, "grad_norm": 0.017286688089370728, "learning_rate": 0.000125625, "loss": 0.0081, "step": 2380 }, { "epoch": 7.46875, "grad_norm": 0.007009466178715229, "learning_rate": 0.00012531250000000002, "loss": 0.0308, "step": 2390 }, { "epoch": 7.5, "grad_norm": 5.178075313568115, "learning_rate": 0.000125, "loss": 0.0284, "step": 2400 }, { "epoch": 7.5, "eval_accuracy": 0.940625, "eval_loss": 0.29945191740989685, "eval_runtime": 6.4921, "eval_samples_per_second": 197.161, "eval_steps_per_second": 24.645, "step": 2400 }, { "epoch": 7.53125, "grad_norm": 0.03435547277331352, "learning_rate": 0.0001246875, "loss": 0.0786, "step": 2410 }, { "epoch": 7.5625, "grad_norm": 0.2063717395067215, "learning_rate": 0.000124375, "loss": 0.0575, "step": 2420 }, { "epoch": 7.59375, "grad_norm": 0.021375322714447975, "learning_rate": 0.00012406250000000001, "loss": 0.0017, "step": 2430 }, { "epoch": 7.625, "grad_norm": 0.010769976302981377, "learning_rate": 0.00012375, "loss": 0.0437, "step": 2440 }, { "epoch": 7.65625, "grad_norm": 0.01713966764509678, "learning_rate": 0.0001234375, "loss": 0.0018, "step": 2450 }, { "epoch": 7.6875, "grad_norm": 0.007944900542497635, "learning_rate": 0.000123125, "loss": 0.011, "step": 2460 }, { "epoch": 7.71875, "grad_norm": 0.041198715567588806, "learning_rate": 0.0001228125, "loss": 0.0019, "step": 2470 }, { "epoch": 7.75, "grad_norm": 0.009142044000327587, "learning_rate": 0.00012250000000000002, "loss": 0.0287, "step": 2480 }, { "epoch": 7.78125, "grad_norm": 0.00957415159791708, "learning_rate": 0.0001221875, "loss": 0.0015, "step": 2490 }, { "epoch": 7.8125, "grad_norm": 0.018495427444577217, "learning_rate": 0.00012187500000000001, "loss": 0.0158, "step": 2500 }, { "epoch": 7.8125, "eval_accuracy": 0.96640625, "eval_loss": 0.19092732667922974, "eval_runtime": 6.9695, "eval_samples_per_second": 183.658, "eval_steps_per_second": 22.957, "step": 2500 }, { "epoch": 7.84375, "grad_norm": 0.005418677814304829, "learning_rate": 0.00012156250000000001, "loss": 0.0196, "step": 2510 }, { "epoch": 7.875, "grad_norm": 0.30978670716285706, "learning_rate": 0.00012124999999999999, "loss": 0.0327, "step": 2520 }, { "epoch": 7.90625, "grad_norm": 0.006560006178915501, "learning_rate": 0.00012093750000000002, "loss": 0.0079, "step": 2530 }, { "epoch": 7.9375, "grad_norm": 0.006091834977269173, "learning_rate": 0.000120625, "loss": 0.0086, "step": 2540 }, { "epoch": 7.96875, "grad_norm": 4.461112022399902, "learning_rate": 0.0001203125, "loss": 0.0603, "step": 2550 }, { "epoch": 8.0, "grad_norm": 18.875268936157227, "learning_rate": 0.00012, "loss": 0.0695, "step": 2560 }, { "epoch": 8.03125, "grad_norm": 0.006029163487255573, "learning_rate": 0.0001196875, "loss": 0.0015, "step": 2570 }, { "epoch": 8.0625, "grad_norm": 0.00949636660516262, "learning_rate": 0.00011937500000000001, "loss": 0.0029, "step": 2580 }, { "epoch": 8.09375, "grad_norm": 0.011131886392831802, "learning_rate": 0.00011906250000000001, "loss": 0.0037, "step": 2590 }, { "epoch": 8.125, "grad_norm": 0.43485090136528015, "learning_rate": 0.00011875, "loss": 0.006, "step": 2600 }, { "epoch": 8.125, "eval_accuracy": 0.9296875, "eval_loss": 0.35239508748054504, "eval_runtime": 6.7581, "eval_samples_per_second": 189.401, "eval_steps_per_second": 23.675, "step": 2600 }, { "epoch": 8.15625, "grad_norm": 0.004519434180110693, "learning_rate": 0.0001184375, "loss": 0.0774, "step": 2610 }, { "epoch": 8.1875, "grad_norm": 6.793065547943115, "learning_rate": 0.000118125, "loss": 0.0311, "step": 2620 }, { "epoch": 8.21875, "grad_norm": 0.0822053775191307, "learning_rate": 0.00011781250000000001, "loss": 0.0164, "step": 2630 }, { "epoch": 8.25, "grad_norm": 0.0055194334127008915, "learning_rate": 0.00011750000000000001, "loss": 0.0019, "step": 2640 }, { "epoch": 8.28125, "grad_norm": 0.016379250213503838, "learning_rate": 0.00011718750000000001, "loss": 0.0206, "step": 2650 }, { "epoch": 8.3125, "grad_norm": 0.00409812992438674, "learning_rate": 0.000116875, "loss": 0.0017, "step": 2660 }, { "epoch": 8.34375, "grad_norm": 0.004736763890832663, "learning_rate": 0.0001165625, "loss": 0.0173, "step": 2670 }, { "epoch": 8.375, "grad_norm": 0.006192313041538, "learning_rate": 0.00011625000000000002, "loss": 0.0059, "step": 2680 }, { "epoch": 8.40625, "grad_norm": 0.019787801429629326, "learning_rate": 0.0001159375, "loss": 0.0559, "step": 2690 }, { "epoch": 8.4375, "grad_norm": 0.945681095123291, "learning_rate": 0.000115625, "loss": 0.0017, "step": 2700 }, { "epoch": 8.4375, "eval_accuracy": 0.96171875, "eval_loss": 0.19076545536518097, "eval_runtime": 6.3936, "eval_samples_per_second": 200.199, "eval_steps_per_second": 25.025, "step": 2700 }, { "epoch": 8.46875, "grad_norm": 0.3741857409477234, "learning_rate": 0.0001153125, "loss": 0.001, "step": 2710 }, { "epoch": 8.5, "grad_norm": 0.0038401270285248756, "learning_rate": 0.00011499999999999999, "loss": 0.0011, "step": 2720 }, { "epoch": 8.53125, "grad_norm": 0.005083560012280941, "learning_rate": 0.00011468750000000002, "loss": 0.0008, "step": 2730 }, { "epoch": 8.5625, "grad_norm": 0.0035322927869856358, "learning_rate": 0.00011437500000000002, "loss": 0.0183, "step": 2740 }, { "epoch": 8.59375, "grad_norm": 0.005947966128587723, "learning_rate": 0.0001140625, "loss": 0.0011, "step": 2750 }, { "epoch": 8.625, "grad_norm": 5.917221546173096, "learning_rate": 0.00011375, "loss": 0.0021, "step": 2760 }, { "epoch": 8.65625, "grad_norm": 0.0036874141078442335, "learning_rate": 0.0001134375, "loss": 0.0186, "step": 2770 }, { "epoch": 8.6875, "grad_norm": 0.005372929852455854, "learning_rate": 0.00011312500000000001, "loss": 0.0241, "step": 2780 }, { "epoch": 8.71875, "grad_norm": 0.004914106801152229, "learning_rate": 0.00011281250000000001, "loss": 0.0421, "step": 2790 }, { "epoch": 8.75, "grad_norm": 0.02046949602663517, "learning_rate": 0.00011250000000000001, "loss": 0.0026, "step": 2800 }, { "epoch": 8.75, "eval_accuracy": 0.9625, "eval_loss": 0.17868757247924805, "eval_runtime": 6.3446, "eval_samples_per_second": 201.746, "eval_steps_per_second": 25.218, "step": 2800 }, { "epoch": 8.78125, "grad_norm": 0.023118741810321808, "learning_rate": 0.0001121875, "loss": 0.0016, "step": 2810 }, { "epoch": 8.8125, "grad_norm": 16.51854133605957, "learning_rate": 0.000111875, "loss": 0.0685, "step": 2820 }, { "epoch": 8.84375, "grad_norm": 0.011586804874241352, "learning_rate": 0.00011156250000000001, "loss": 0.0323, "step": 2830 }, { "epoch": 8.875, "grad_norm": 0.018752580508589745, "learning_rate": 0.00011125000000000001, "loss": 0.0012, "step": 2840 }, { "epoch": 8.90625, "grad_norm": 0.014984137378633022, "learning_rate": 0.0001109375, "loss": 0.0288, "step": 2850 }, { "epoch": 8.9375, "grad_norm": 12.684847831726074, "learning_rate": 0.00011065625, "loss": 0.1146, "step": 2860 }, { "epoch": 8.96875, "grad_norm": 0.09825449436903, "learning_rate": 0.00011034375000000001, "loss": 0.0147, "step": 2870 }, { "epoch": 9.0, "grad_norm": 0.011579761281609535, "learning_rate": 0.00011003125000000001, "loss": 0.0012, "step": 2880 }, { "epoch": 9.03125, "grad_norm": 0.009486192837357521, "learning_rate": 0.00010971875000000001, "loss": 0.0257, "step": 2890 }, { "epoch": 9.0625, "grad_norm": 0.005146791692823172, "learning_rate": 0.00010940624999999999, "loss": 0.001, "step": 2900 }, { "epoch": 9.0625, "eval_accuracy": 0.96875, "eval_loss": 0.1328631341457367, "eval_runtime": 6.918, "eval_samples_per_second": 185.024, "eval_steps_per_second": 23.128, "step": 2900 }, { "epoch": 9.09375, "grad_norm": 0.005191614385694265, "learning_rate": 0.00010909374999999999, "loss": 0.0013, "step": 2910 }, { "epoch": 9.125, "grad_norm": 0.024972369894385338, "learning_rate": 0.00010878125000000002, "loss": 0.0356, "step": 2920 }, { "epoch": 9.15625, "grad_norm": 0.00825554970651865, "learning_rate": 0.00010846875, "loss": 0.0009, "step": 2930 }, { "epoch": 9.1875, "grad_norm": 0.004267066717147827, "learning_rate": 0.00010815625, "loss": 0.0395, "step": 2940 }, { "epoch": 9.21875, "grad_norm": 0.7409490346908569, "learning_rate": 0.00010784375, "loss": 0.0037, "step": 2950 }, { "epoch": 9.25, "grad_norm": 0.006523266900330782, "learning_rate": 0.00010753124999999999, "loss": 0.0205, "step": 2960 }, { "epoch": 9.28125, "grad_norm": 0.00338526233099401, "learning_rate": 0.00010721875000000001, "loss": 0.0185, "step": 2970 }, { "epoch": 9.3125, "grad_norm": 0.00550084188580513, "learning_rate": 0.00010690625000000001, "loss": 0.0135, "step": 2980 }, { "epoch": 9.34375, "grad_norm": 0.0031597930938005447, "learning_rate": 0.00010659375, "loss": 0.0012, "step": 2990 }, { "epoch": 9.375, "grad_norm": 0.004799798596650362, "learning_rate": 0.00010628125, "loss": 0.0497, "step": 3000 }, { "epoch": 9.375, "eval_accuracy": 0.959375, "eval_loss": 0.1878364086151123, "eval_runtime": 6.9355, "eval_samples_per_second": 184.558, "eval_steps_per_second": 23.07, "step": 3000 }, { "epoch": 9.40625, "grad_norm": 0.03460833802819252, "learning_rate": 0.00010596875, "loss": 0.002, "step": 3010 }, { "epoch": 9.4375, "grad_norm": 2.7818896770477295, "learning_rate": 0.00010565625000000001, "loss": 0.0333, "step": 3020 }, { "epoch": 9.46875, "grad_norm": 0.00796230137348175, "learning_rate": 0.00010534375000000001, "loss": 0.041, "step": 3030 }, { "epoch": 9.5, "grad_norm": 0.0041164797730743885, "learning_rate": 0.00010503125000000001, "loss": 0.071, "step": 3040 }, { "epoch": 9.53125, "grad_norm": 0.004923259373754263, "learning_rate": 0.00010471875, "loss": 0.0016, "step": 3050 }, { "epoch": 9.5625, "grad_norm": 0.03983521834015846, "learning_rate": 0.00010440625, "loss": 0.0245, "step": 3060 }, { "epoch": 9.59375, "grad_norm": 0.005190184339880943, "learning_rate": 0.00010409375, "loss": 0.0134, "step": 3070 }, { "epoch": 9.625, "grad_norm": 0.05828193947672844, "learning_rate": 0.00010378125, "loss": 0.0599, "step": 3080 }, { "epoch": 9.65625, "grad_norm": 0.0039481050334870815, "learning_rate": 0.00010346875, "loss": 0.0581, "step": 3090 }, { "epoch": 9.6875, "grad_norm": 0.007265688385814428, "learning_rate": 0.00010315625, "loss": 0.09, "step": 3100 }, { "epoch": 9.6875, "eval_accuracy": 0.96484375, "eval_loss": 0.1753551959991455, "eval_runtime": 6.4623, "eval_samples_per_second": 198.073, "eval_steps_per_second": 24.759, "step": 3100 }, { "epoch": 9.71875, "grad_norm": 0.009442336857318878, "learning_rate": 0.00010284374999999999, "loss": 0.0009, "step": 3110 }, { "epoch": 9.75, "grad_norm": 0.0029557254165410995, "learning_rate": 0.00010253125000000002, "loss": 0.0011, "step": 3120 }, { "epoch": 9.78125, "grad_norm": 0.6488510370254517, "learning_rate": 0.00010221875, "loss": 0.0207, "step": 3130 }, { "epoch": 9.8125, "grad_norm": 0.008899745531380177, "learning_rate": 0.00010190625, "loss": 0.0106, "step": 3140 }, { "epoch": 9.84375, "grad_norm": 0.002760515082627535, "learning_rate": 0.00010159375, "loss": 0.0172, "step": 3150 }, { "epoch": 9.875, "grad_norm": 0.9725448489189148, "learning_rate": 0.00010128125, "loss": 0.0065, "step": 3160 }, { "epoch": 9.90625, "grad_norm": 0.002948402427136898, "learning_rate": 0.00010096875000000001, "loss": 0.0008, "step": 3170 }, { "epoch": 9.9375, "grad_norm": 0.0027872335631400347, "learning_rate": 0.00010065625000000001, "loss": 0.0023, "step": 3180 }, { "epoch": 9.96875, "grad_norm": 0.00609046733006835, "learning_rate": 0.00010034375000000001, "loss": 0.0385, "step": 3190 }, { "epoch": 10.0, "grad_norm": 0.025308266282081604, "learning_rate": 0.00010003125, "loss": 0.0046, "step": 3200 }, { "epoch": 10.0, "eval_accuracy": 0.9671875, "eval_loss": 0.15844720602035522, "eval_runtime": 6.5917, "eval_samples_per_second": 194.184, "eval_steps_per_second": 24.273, "step": 3200 }, { "epoch": 10.03125, "grad_norm": 0.0036033540964126587, "learning_rate": 9.971875000000001e-05, "loss": 0.0007, "step": 3210 }, { "epoch": 10.0625, "grad_norm": 0.005487027112394571, "learning_rate": 9.940625000000001e-05, "loss": 0.0893, "step": 3220 }, { "epoch": 10.09375, "grad_norm": 0.017159543931484222, "learning_rate": 9.909375e-05, "loss": 0.003, "step": 3230 }, { "epoch": 10.125, "grad_norm": 0.006799894850701094, "learning_rate": 9.878125e-05, "loss": 0.0026, "step": 3240 }, { "epoch": 10.15625, "grad_norm": 5.967584609985352, "learning_rate": 9.846875e-05, "loss": 0.035, "step": 3250 }, { "epoch": 10.1875, "grad_norm": 0.027164561673998833, "learning_rate": 9.815625e-05, "loss": 0.0187, "step": 3260 }, { "epoch": 10.21875, "grad_norm": 0.010049635544419289, "learning_rate": 9.784375e-05, "loss": 0.0012, "step": 3270 }, { "epoch": 10.25, "grad_norm": 0.006957762409001589, "learning_rate": 9.753125e-05, "loss": 0.0009, "step": 3280 }, { "epoch": 10.28125, "grad_norm": 0.009408445097506046, "learning_rate": 9.721875e-05, "loss": 0.0016, "step": 3290 }, { "epoch": 10.3125, "grad_norm": 0.004588890355080366, "learning_rate": 9.690625000000001e-05, "loss": 0.0006, "step": 3300 }, { "epoch": 10.3125, "eval_accuracy": 0.96484375, "eval_loss": 0.20075881481170654, "eval_runtime": 6.9603, "eval_samples_per_second": 183.901, "eval_steps_per_second": 22.988, "step": 3300 }, { "epoch": 10.34375, "grad_norm": 0.0026115712244063616, "learning_rate": 9.659375e-05, "loss": 0.0008, "step": 3310 }, { "epoch": 10.375, "grad_norm": 0.010643471032381058, "learning_rate": 9.628125e-05, "loss": 0.0424, "step": 3320 }, { "epoch": 10.40625, "grad_norm": 23.894906997680664, "learning_rate": 9.596875000000001e-05, "loss": 0.0637, "step": 3330 }, { "epoch": 10.4375, "grad_norm": 0.00423228507861495, "learning_rate": 9.565625e-05, "loss": 0.0012, "step": 3340 }, { "epoch": 10.46875, "grad_norm": 0.0030155859421938658, "learning_rate": 9.534375000000001e-05, "loss": 0.0186, "step": 3350 }, { "epoch": 10.5, "grad_norm": 0.003306414932012558, "learning_rate": 9.503125000000001e-05, "loss": 0.0363, "step": 3360 }, { "epoch": 10.53125, "grad_norm": 0.0035443324595689774, "learning_rate": 9.471875e-05, "loss": 0.0269, "step": 3370 }, { "epoch": 10.5625, "grad_norm": 0.00464298902079463, "learning_rate": 9.440625000000001e-05, "loss": 0.0006, "step": 3380 }, { "epoch": 10.59375, "grad_norm": 0.003652532584965229, "learning_rate": 9.409375000000001e-05, "loss": 0.0006, "step": 3390 }, { "epoch": 10.625, "grad_norm": 0.0041074915789067745, "learning_rate": 9.378125e-05, "loss": 0.0008, "step": 3400 }, { "epoch": 10.625, "eval_accuracy": 0.975, "eval_loss": 0.12715043127536774, "eval_runtime": 6.4535, "eval_samples_per_second": 198.341, "eval_steps_per_second": 24.793, "step": 3400 }, { "epoch": 10.65625, "grad_norm": 0.003479533363133669, "learning_rate": 9.346875e-05, "loss": 0.0022, "step": 3410 }, { "epoch": 10.6875, "grad_norm": 0.002636971650645137, "learning_rate": 9.315625e-05, "loss": 0.0264, "step": 3420 }, { "epoch": 10.71875, "grad_norm": 0.0028728495817631483, "learning_rate": 9.284375e-05, "loss": 0.0121, "step": 3430 }, { "epoch": 10.75, "grad_norm": 0.0024031987413764, "learning_rate": 9.253125e-05, "loss": 0.0147, "step": 3440 }, { "epoch": 10.78125, "grad_norm": 11.363987922668457, "learning_rate": 9.221875000000002e-05, "loss": 0.0628, "step": 3450 }, { "epoch": 10.8125, "grad_norm": 0.01323883980512619, "learning_rate": 9.190625e-05, "loss": 0.0016, "step": 3460 }, { "epoch": 10.84375, "grad_norm": 0.005075179971754551, "learning_rate": 9.159375e-05, "loss": 0.0298, "step": 3470 }, { "epoch": 10.875, "grad_norm": 0.03815023973584175, "learning_rate": 9.128125000000001e-05, "loss": 0.0006, "step": 3480 }, { "epoch": 10.90625, "grad_norm": 0.028709406033158302, "learning_rate": 9.096875e-05, "loss": 0.0165, "step": 3490 }, { "epoch": 10.9375, "grad_norm": 0.002887293929234147, "learning_rate": 9.065625000000001e-05, "loss": 0.028, "step": 3500 }, { "epoch": 10.9375, "eval_accuracy": 0.9765625, "eval_loss": 0.14528806507587433, "eval_runtime": 6.3552, "eval_samples_per_second": 201.41, "eval_steps_per_second": 25.176, "step": 3500 }, { "epoch": 10.96875, "grad_norm": 0.0032660234719514847, "learning_rate": 9.034375000000001e-05, "loss": 0.0008, "step": 3510 }, { "epoch": 11.0, "grad_norm": 0.003932199906557798, "learning_rate": 9.003125e-05, "loss": 0.0327, "step": 3520 }, { "epoch": 11.03125, "grad_norm": 0.0021477933041751385, "learning_rate": 8.971875000000001e-05, "loss": 0.0005, "step": 3530 }, { "epoch": 11.0625, "grad_norm": 0.002401479985564947, "learning_rate": 8.940625000000001e-05, "loss": 0.0306, "step": 3540 }, { "epoch": 11.09375, "grad_norm": 8.878641128540039, "learning_rate": 8.909375000000001e-05, "loss": 0.0523, "step": 3550 }, { "epoch": 11.125, "grad_norm": 0.0029539538081735373, "learning_rate": 8.878125e-05, "loss": 0.0006, "step": 3560 }, { "epoch": 11.15625, "grad_norm": 0.004306386224925518, "learning_rate": 8.846875e-05, "loss": 0.001, "step": 3570 }, { "epoch": 11.1875, "grad_norm": 0.013514366932213306, "learning_rate": 8.815625e-05, "loss": 0.0005, "step": 3580 }, { "epoch": 11.21875, "grad_norm": 0.013187670148909092, "learning_rate": 8.784375e-05, "loss": 0.0005, "step": 3590 }, { "epoch": 11.25, "grad_norm": 0.002293806755915284, "learning_rate": 8.753125e-05, "loss": 0.0005, "step": 3600 }, { "epoch": 11.25, "eval_accuracy": 0.975, "eval_loss": 0.1256314218044281, "eval_runtime": 6.8185, "eval_samples_per_second": 187.725, "eval_steps_per_second": 23.466, "step": 3600 }, { "epoch": 11.28125, "grad_norm": 0.0030747319106012583, "learning_rate": 8.721875e-05, "loss": 0.0213, "step": 3610 }, { "epoch": 11.3125, "grad_norm": 0.002191092586144805, "learning_rate": 8.690625e-05, "loss": 0.0014, "step": 3620 }, { "epoch": 11.34375, "grad_norm": 15.502256393432617, "learning_rate": 8.659375e-05, "loss": 0.0188, "step": 3630 }, { "epoch": 11.375, "grad_norm": 0.002841190667822957, "learning_rate": 8.628125e-05, "loss": 0.0006, "step": 3640 }, { "epoch": 11.40625, "grad_norm": 0.009304801933467388, "learning_rate": 8.596875000000001e-05, "loss": 0.0005, "step": 3650 }, { "epoch": 11.4375, "grad_norm": 0.002438412746414542, "learning_rate": 8.565625e-05, "loss": 0.0004, "step": 3660 }, { "epoch": 11.46875, "grad_norm": 0.0025553186424076557, "learning_rate": 8.534375e-05, "loss": 0.0005, "step": 3670 }, { "epoch": 11.5, "grad_norm": 0.007872486487030983, "learning_rate": 8.503125000000001e-05, "loss": 0.0004, "step": 3680 }, { "epoch": 11.53125, "grad_norm": 0.003905347315594554, "learning_rate": 8.471875e-05, "loss": 0.0004, "step": 3690 }, { "epoch": 11.5625, "grad_norm": 0.0032572925556451082, "learning_rate": 8.440625000000001e-05, "loss": 0.0005, "step": 3700 }, { "epoch": 11.5625, "eval_accuracy": 0.97890625, "eval_loss": 0.10886295884847641, "eval_runtime": 6.8932, "eval_samples_per_second": 185.689, "eval_steps_per_second": 23.211, "step": 3700 }, { "epoch": 11.59375, "grad_norm": 0.005474507808685303, "learning_rate": 8.409375000000001e-05, "loss": 0.0004, "step": 3710 }, { "epoch": 11.625, "grad_norm": 0.003425732720643282, "learning_rate": 8.378125e-05, "loss": 0.0005, "step": 3720 }, { "epoch": 11.65625, "grad_norm": 0.010524573735892773, "learning_rate": 8.346875e-05, "loss": 0.0004, "step": 3730 }, { "epoch": 11.6875, "grad_norm": 0.002859619678929448, "learning_rate": 8.315625e-05, "loss": 0.0004, "step": 3740 }, { "epoch": 11.71875, "grad_norm": 0.0019814125262200832, "learning_rate": 8.284375e-05, "loss": 0.0003, "step": 3750 }, { "epoch": 11.75, "grad_norm": 0.0019714718218892813, "learning_rate": 8.253125e-05, "loss": 0.0004, "step": 3760 }, { "epoch": 11.78125, "grad_norm": 0.0027500391006469727, "learning_rate": 8.221875e-05, "loss": 0.0004, "step": 3770 }, { "epoch": 11.8125, "grad_norm": 0.001707090763375163, "learning_rate": 8.190625e-05, "loss": 0.0004, "step": 3780 }, { "epoch": 11.84375, "grad_norm": 0.0019228786695748568, "learning_rate": 8.159375e-05, "loss": 0.0004, "step": 3790 }, { "epoch": 11.875, "grad_norm": 0.0019343816675245762, "learning_rate": 8.128125000000001e-05, "loss": 0.0004, "step": 3800 }, { "epoch": 11.875, "eval_accuracy": 0.978125, "eval_loss": 0.1097874864935875, "eval_runtime": 6.3764, "eval_samples_per_second": 200.739, "eval_steps_per_second": 25.092, "step": 3800 }, { "epoch": 11.90625, "grad_norm": 0.001760054030455649, "learning_rate": 8.096875e-05, "loss": 0.0003, "step": 3810 }, { "epoch": 11.9375, "grad_norm": 0.005902810953557491, "learning_rate": 8.065625e-05, "loss": 0.0024, "step": 3820 }, { "epoch": 11.96875, "grad_norm": 0.0037092138081789017, "learning_rate": 8.034375000000001e-05, "loss": 0.0464, "step": 3830 }, { "epoch": 12.0, "grad_norm": 0.016846604645252228, "learning_rate": 8.003125e-05, "loss": 0.0023, "step": 3840 }, { "epoch": 12.03125, "grad_norm": 0.0020532661583274603, "learning_rate": 7.971875000000001e-05, "loss": 0.0004, "step": 3850 }, { "epoch": 12.0625, "grad_norm": 0.005589602515101433, "learning_rate": 7.940625000000001e-05, "loss": 0.0003, "step": 3860 }, { "epoch": 12.09375, "grad_norm": 0.008196796290576458, "learning_rate": 7.909375e-05, "loss": 0.0025, "step": 3870 }, { "epoch": 12.125, "grad_norm": 0.7199010848999023, "learning_rate": 7.878125000000001e-05, "loss": 0.0313, "step": 3880 }, { "epoch": 12.15625, "grad_norm": 0.0015094159170985222, "learning_rate": 7.846875e-05, "loss": 0.0007, "step": 3890 }, { "epoch": 12.1875, "grad_norm": 0.019164152443408966, "learning_rate": 7.815625e-05, "loss": 0.0003, "step": 3900 }, { "epoch": 12.1875, "eval_accuracy": 0.9625, "eval_loss": 0.17790503799915314, "eval_runtime": 6.2858, "eval_samples_per_second": 203.634, "eval_steps_per_second": 25.454, "step": 3900 }, { "epoch": 12.21875, "grad_norm": 0.002245397539809346, "learning_rate": 7.784375e-05, "loss": 0.0009, "step": 3910 }, { "epoch": 12.25, "grad_norm": 0.0015030098147690296, "learning_rate": 7.753125e-05, "loss": 0.0003, "step": 3920 }, { "epoch": 12.28125, "grad_norm": 0.11134529858827591, "learning_rate": 7.721875e-05, "loss": 0.0005, "step": 3930 }, { "epoch": 12.3125, "grad_norm": 0.002319012302905321, "learning_rate": 7.690625e-05, "loss": 0.0332, "step": 3940 }, { "epoch": 12.34375, "grad_norm": 0.002300011459738016, "learning_rate": 7.659375000000002e-05, "loss": 0.0005, "step": 3950 }, { "epoch": 12.375, "grad_norm": 0.004005583468824625, "learning_rate": 7.628125e-05, "loss": 0.0006, "step": 3960 }, { "epoch": 12.40625, "grad_norm": 0.0016996270278468728, "learning_rate": 7.596875e-05, "loss": 0.0052, "step": 3970 }, { "epoch": 12.4375, "grad_norm": 0.0016149221919476986, "learning_rate": 7.565625000000001e-05, "loss": 0.0003, "step": 3980 }, { "epoch": 12.46875, "grad_norm": 0.004814577754586935, "learning_rate": 7.534375e-05, "loss": 0.0041, "step": 3990 }, { "epoch": 12.5, "grad_norm": 0.010618665255606174, "learning_rate": 7.503125000000001e-05, "loss": 0.0163, "step": 4000 }, { "epoch": 12.5, "eval_accuracy": 0.95390625, "eval_loss": 0.25004318356513977, "eval_runtime": 6.3222, "eval_samples_per_second": 202.461, "eval_steps_per_second": 25.308, "step": 4000 }, { "epoch": 12.53125, "grad_norm": 0.010156257078051567, "learning_rate": 7.471875000000001e-05, "loss": 0.0349, "step": 4010 }, { "epoch": 12.5625, "grad_norm": 0.015218588523566723, "learning_rate": 7.440625e-05, "loss": 0.1117, "step": 4020 }, { "epoch": 12.59375, "grad_norm": 0.00266676745377481, "learning_rate": 7.409375000000001e-05, "loss": 0.0004, "step": 4030 }, { "epoch": 12.625, "grad_norm": 0.001848011976107955, "learning_rate": 7.378125000000001e-05, "loss": 0.001, "step": 4040 }, { "epoch": 12.65625, "grad_norm": 0.014798123389482498, "learning_rate": 7.346875000000001e-05, "loss": 0.0005, "step": 4050 }, { "epoch": 12.6875, "grad_norm": 0.0023186809848994017, "learning_rate": 7.315625e-05, "loss": 0.0371, "step": 4060 }, { "epoch": 12.71875, "grad_norm": 0.001521400292403996, "learning_rate": 7.284375e-05, "loss": 0.0036, "step": 4070 }, { "epoch": 12.75, "grad_norm": 0.0016448964597657323, "learning_rate": 7.253125e-05, "loss": 0.0003, "step": 4080 }, { "epoch": 12.78125, "grad_norm": 0.003130651544779539, "learning_rate": 7.221875e-05, "loss": 0.0004, "step": 4090 }, { "epoch": 12.8125, "grad_norm": 0.001429844181984663, "learning_rate": 7.190625e-05, "loss": 0.0003, "step": 4100 }, { "epoch": 12.8125, "eval_accuracy": 0.9734375, "eval_loss": 0.1555672138929367, "eval_runtime": 6.821, "eval_samples_per_second": 187.655, "eval_steps_per_second": 23.457, "step": 4100 }, { "epoch": 12.84375, "grad_norm": 0.0016585810808464885, "learning_rate": 7.159375e-05, "loss": 0.0097, "step": 4110 }, { "epoch": 12.875, "grad_norm": 0.0025756254326552153, "learning_rate": 7.128125e-05, "loss": 0.0003, "step": 4120 }, { "epoch": 12.90625, "grad_norm": 0.0014616530388593674, "learning_rate": 7.096875e-05, "loss": 0.0011, "step": 4130 }, { "epoch": 12.9375, "grad_norm": 0.001520119491033256, "learning_rate": 7.065625e-05, "loss": 0.0004, "step": 4140 }, { "epoch": 12.96875, "grad_norm": 0.0017162116710096598, "learning_rate": 7.034375000000001e-05, "loss": 0.0381, "step": 4150 }, { "epoch": 13.0, "grad_norm": 0.0027610217221081257, "learning_rate": 7.003125e-05, "loss": 0.0286, "step": 4160 }, { "epoch": 13.03125, "grad_norm": 0.0013741106959059834, "learning_rate": 6.971875e-05, "loss": 0.0099, "step": 4170 }, { "epoch": 13.0625, "grad_norm": 0.08995310217142105, "learning_rate": 6.940625000000001e-05, "loss": 0.0005, "step": 4180 }, { "epoch": 13.09375, "grad_norm": 0.002924903528764844, "learning_rate": 6.909375e-05, "loss": 0.0003, "step": 4190 }, { "epoch": 13.125, "grad_norm": 0.013759560883045197, "learning_rate": 6.878125000000001e-05, "loss": 0.0003, "step": 4200 }, { "epoch": 13.125, "eval_accuracy": 0.97421875, "eval_loss": 0.12048967182636261, "eval_runtime": 7.0166, "eval_samples_per_second": 182.425, "eval_steps_per_second": 22.803, "step": 4200 }, { "epoch": 13.15625, "grad_norm": 1.506090760231018, "learning_rate": 6.846875000000001e-05, "loss": 0.001, "step": 4210 }, { "epoch": 13.1875, "grad_norm": 0.001895511755719781, "learning_rate": 6.815624999999999e-05, "loss": 0.0003, "step": 4220 }, { "epoch": 13.21875, "grad_norm": 0.006083915941417217, "learning_rate": 6.784375e-05, "loss": 0.0004, "step": 4230 }, { "epoch": 13.25, "grad_norm": 19.973133087158203, "learning_rate": 6.753125e-05, "loss": 0.0203, "step": 4240 }, { "epoch": 13.28125, "grad_norm": 0.0028971272986382246, "learning_rate": 6.721875e-05, "loss": 0.0003, "step": 4250 }, { "epoch": 13.3125, "grad_norm": 0.022618748247623444, "learning_rate": 6.690625e-05, "loss": 0.0003, "step": 4260 }, { "epoch": 13.34375, "grad_norm": 0.001784573425538838, "learning_rate": 6.659375e-05, "loss": 0.0004, "step": 4270 }, { "epoch": 13.375, "grad_norm": 0.004388020373880863, "learning_rate": 6.628125e-05, "loss": 0.0003, "step": 4280 }, { "epoch": 13.40625, "grad_norm": 0.0013094112509861588, "learning_rate": 6.596875e-05, "loss": 0.0003, "step": 4290 }, { "epoch": 13.4375, "grad_norm": 0.001405878458172083, "learning_rate": 6.565625000000001e-05, "loss": 0.0002, "step": 4300 }, { "epoch": 13.4375, "eval_accuracy": 0.971875, "eval_loss": 0.15426388382911682, "eval_runtime": 6.7768, "eval_samples_per_second": 188.88, "eval_steps_per_second": 23.61, "step": 4300 }, { "epoch": 13.46875, "grad_norm": 0.001250360975973308, "learning_rate": 6.534375e-05, "loss": 0.0003, "step": 4310 }, { "epoch": 13.5, "grad_norm": 0.0014933838974684477, "learning_rate": 6.503125e-05, "loss": 0.0003, "step": 4320 }, { "epoch": 13.53125, "grad_norm": 0.0013753235107287765, "learning_rate": 6.471875000000001e-05, "loss": 0.0003, "step": 4330 }, { "epoch": 13.5625, "grad_norm": 0.0015354104107245803, "learning_rate": 6.440625e-05, "loss": 0.0004, "step": 4340 }, { "epoch": 13.59375, "grad_norm": 0.0018886132165789604, "learning_rate": 6.409375000000001e-05, "loss": 0.0145, "step": 4350 }, { "epoch": 13.625, "grad_norm": 0.0012300664093345404, "learning_rate": 6.378125000000001e-05, "loss": 0.0003, "step": 4360 }, { "epoch": 13.65625, "grad_norm": 0.0015221609501168132, "learning_rate": 6.346875e-05, "loss": 0.0248, "step": 4370 }, { "epoch": 13.6875, "grad_norm": 0.0017438657814636827, "learning_rate": 6.315625000000001e-05, "loss": 0.0003, "step": 4380 }, { "epoch": 13.71875, "grad_norm": 0.0020069123711436987, "learning_rate": 6.284375e-05, "loss": 0.0002, "step": 4390 }, { "epoch": 13.75, "grad_norm": 0.0013785591581836343, "learning_rate": 6.253125e-05, "loss": 0.0002, "step": 4400 }, { "epoch": 13.75, "eval_accuracy": 0.975, "eval_loss": 0.15483084321022034, "eval_runtime": 6.2696, "eval_samples_per_second": 204.159, "eval_steps_per_second": 25.52, "step": 4400 }, { "epoch": 13.78125, "grad_norm": 1.8585741519927979, "learning_rate": 6.221875e-05, "loss": 0.0007, "step": 4410 }, { "epoch": 13.8125, "grad_norm": 0.0011989879421889782, "learning_rate": 6.190625e-05, "loss": 0.0003, "step": 4420 }, { "epoch": 13.84375, "grad_norm": 10.47256851196289, "learning_rate": 6.159375e-05, "loss": 0.0275, "step": 4430 }, { "epoch": 13.875, "grad_norm": 1.398241639137268, "learning_rate": 6.128125e-05, "loss": 0.0011, "step": 4440 }, { "epoch": 13.90625, "grad_norm": 0.0019299676641821861, "learning_rate": 6.096875000000001e-05, "loss": 0.0002, "step": 4450 }, { "epoch": 13.9375, "grad_norm": 0.0016532372683286667, "learning_rate": 6.065625e-05, "loss": 0.0003, "step": 4460 }, { "epoch": 13.96875, "grad_norm": 0.002245826181024313, "learning_rate": 6.034375e-05, "loss": 0.0259, "step": 4470 }, { "epoch": 14.0, "grad_norm": 0.002995037008076906, "learning_rate": 6.0031250000000006e-05, "loss": 0.0003, "step": 4480 }, { "epoch": 14.03125, "grad_norm": 0.0015550617827102542, "learning_rate": 5.971875e-05, "loss": 0.0002, "step": 4490 }, { "epoch": 14.0625, "grad_norm": 0.0018337038345634937, "learning_rate": 5.940625000000001e-05, "loss": 0.0003, "step": 4500 }, { "epoch": 14.0625, "eval_accuracy": 0.975, "eval_loss": 0.14965741336345673, "eval_runtime": 6.9272, "eval_samples_per_second": 184.78, "eval_steps_per_second": 23.097, "step": 4500 }, { "epoch": 14.09375, "grad_norm": 0.0013705631718039513, "learning_rate": 5.909375e-05, "loss": 0.0002, "step": 4510 }, { "epoch": 14.125, "grad_norm": 0.0015554011333733797, "learning_rate": 5.8781249999999996e-05, "loss": 0.0003, "step": 4520 }, { "epoch": 14.15625, "grad_norm": 0.0012009346392005682, "learning_rate": 5.846875000000001e-05, "loss": 0.0002, "step": 4530 }, { "epoch": 14.1875, "grad_norm": 0.0011792039731517434, "learning_rate": 5.815625e-05, "loss": 0.0002, "step": 4540 }, { "epoch": 14.21875, "grad_norm": 9.245038032531738, "learning_rate": 5.784375000000001e-05, "loss": 0.0038, "step": 4550 }, { "epoch": 14.25, "grad_norm": 0.0013690210180357099, "learning_rate": 5.7531250000000006e-05, "loss": 0.0003, "step": 4560 }, { "epoch": 14.28125, "grad_norm": 0.0013083881931379437, "learning_rate": 5.721875e-05, "loss": 0.0012, "step": 4570 }, { "epoch": 14.3125, "grad_norm": 34.58968734741211, "learning_rate": 5.6906250000000004e-05, "loss": 0.02, "step": 4580 }, { "epoch": 14.34375, "grad_norm": 0.0016999391373246908, "learning_rate": 5.6593750000000003e-05, "loss": 0.0002, "step": 4590 }, { "epoch": 14.375, "grad_norm": 0.0012809026520699263, "learning_rate": 5.628125000000001e-05, "loss": 0.0002, "step": 4600 }, { "epoch": 14.375, "eval_accuracy": 0.9640625, "eval_loss": 0.23174042999744415, "eval_runtime": 6.8846, "eval_samples_per_second": 185.923, "eval_steps_per_second": 23.24, "step": 4600 }, { "epoch": 14.40625, "grad_norm": 0.001300526550039649, "learning_rate": 5.596875e-05, "loss": 0.0006, "step": 4610 }, { "epoch": 14.4375, "grad_norm": 0.0010748986387625337, "learning_rate": 5.565625e-05, "loss": 0.0002, "step": 4620 }, { "epoch": 14.46875, "grad_norm": 0.0012471479130908847, "learning_rate": 5.534375000000001e-05, "loss": 0.0012, "step": 4630 }, { "epoch": 14.5, "grad_norm": 0.003839879296720028, "learning_rate": 5.503125e-05, "loss": 0.0002, "step": 4640 }, { "epoch": 14.53125, "grad_norm": 0.0013317788252606988, "learning_rate": 5.4718750000000005e-05, "loss": 0.0003, "step": 4650 }, { "epoch": 14.5625, "grad_norm": 0.001501008402556181, "learning_rate": 5.4406250000000004e-05, "loss": 0.0193, "step": 4660 }, { "epoch": 14.59375, "grad_norm": 0.0018197267781943083, "learning_rate": 5.409375e-05, "loss": 0.0035, "step": 4670 }, { "epoch": 14.625, "grad_norm": 0.0025837391149252653, "learning_rate": 5.378125e-05, "loss": 0.0005, "step": 4680 }, { "epoch": 14.65625, "grad_norm": 0.0016741983126848936, "learning_rate": 5.346875e-05, "loss": 0.0004, "step": 4690 }, { "epoch": 14.6875, "grad_norm": 0.036828938871622086, "learning_rate": 5.315625000000001e-05, "loss": 0.0003, "step": 4700 }, { "epoch": 14.6875, "eval_accuracy": 0.978125, "eval_loss": 0.14183716475963593, "eval_runtime": 6.7893, "eval_samples_per_second": 188.532, "eval_steps_per_second": 23.566, "step": 4700 }, { "epoch": 14.71875, "grad_norm": 0.0009561299229972064, "learning_rate": 5.284375e-05, "loss": 0.0002, "step": 4710 }, { "epoch": 14.75, "grad_norm": 0.001365609117783606, "learning_rate": 5.253125e-05, "loss": 0.0002, "step": 4720 }, { "epoch": 14.78125, "grad_norm": 0.0010248490143567324, "learning_rate": 5.2218750000000006e-05, "loss": 0.0002, "step": 4730 }, { "epoch": 14.8125, "grad_norm": 0.001031559775583446, "learning_rate": 5.190625e-05, "loss": 0.0002, "step": 4740 }, { "epoch": 14.84375, "grad_norm": 0.001064546755515039, "learning_rate": 5.159375000000001e-05, "loss": 0.0002, "step": 4750 }, { "epoch": 14.875, "grad_norm": 0.0011686970246955752, "learning_rate": 5.128125e-05, "loss": 0.0002, "step": 4760 }, { "epoch": 14.90625, "grad_norm": 0.0009815421653911471, "learning_rate": 5.0968749999999995e-05, "loss": 0.0002, "step": 4770 }, { "epoch": 14.9375, "grad_norm": 0.05007686838507652, "learning_rate": 5.065625000000001e-05, "loss": 0.0004, "step": 4780 }, { "epoch": 14.96875, "grad_norm": 0.001136138685978949, "learning_rate": 5.034375e-05, "loss": 0.0308, "step": 4790 }, { "epoch": 15.0, "grad_norm": 0.0011941984994336963, "learning_rate": 5.0031250000000007e-05, "loss": 0.0002, "step": 4800 }, { "epoch": 15.0, "eval_accuracy": 0.9734375, "eval_loss": 0.15367402136325836, "eval_runtime": 6.8694, "eval_samples_per_second": 186.333, "eval_steps_per_second": 23.292, "step": 4800 }, { "epoch": 15.03125, "grad_norm": 0.0012780999531969428, "learning_rate": 4.9718750000000006e-05, "loss": 0.0003, "step": 4810 }, { "epoch": 15.0625, "grad_norm": 0.0019433508859947324, "learning_rate": 4.9406250000000005e-05, "loss": 0.0002, "step": 4820 }, { "epoch": 15.09375, "grad_norm": 0.0011221400927752256, "learning_rate": 4.9093750000000004e-05, "loss": 0.0002, "step": 4830 }, { "epoch": 15.125, "grad_norm": 0.0010336939012631774, "learning_rate": 4.878125e-05, "loss": 0.0002, "step": 4840 }, { "epoch": 15.15625, "grad_norm": 0.11716494709253311, "learning_rate": 4.846875e-05, "loss": 0.0002, "step": 4850 }, { "epoch": 15.1875, "grad_norm": 0.0021654649171978235, "learning_rate": 4.815625e-05, "loss": 0.0002, "step": 4860 }, { "epoch": 15.21875, "grad_norm": 0.0010919362539425492, "learning_rate": 4.784375e-05, "loss": 0.0002, "step": 4870 }, { "epoch": 15.25, "grad_norm": 0.00111663737334311, "learning_rate": 4.753125000000001e-05, "loss": 0.0002, "step": 4880 }, { "epoch": 15.28125, "grad_norm": 0.0014220753218978643, "learning_rate": 4.721875e-05, "loss": 0.0002, "step": 4890 }, { "epoch": 15.3125, "grad_norm": 0.001022492302581668, "learning_rate": 4.690625e-05, "loss": 0.0002, "step": 4900 }, { "epoch": 15.3125, "eval_accuracy": 0.978125, "eval_loss": 0.14259929955005646, "eval_runtime": 6.2156, "eval_samples_per_second": 205.935, "eval_steps_per_second": 25.742, "step": 4900 }, { "epoch": 15.34375, "grad_norm": 0.001057266490533948, "learning_rate": 4.6593750000000004e-05, "loss": 0.0002, "step": 4910 }, { "epoch": 15.375, "grad_norm": 0.001144145498983562, "learning_rate": 4.6281250000000003e-05, "loss": 0.0002, "step": 4920 }, { "epoch": 15.40625, "grad_norm": 0.0009692934690974653, "learning_rate": 4.596875e-05, "loss": 0.0002, "step": 4930 }, { "epoch": 15.4375, "grad_norm": 0.000945160398259759, "learning_rate": 4.565625e-05, "loss": 0.0002, "step": 4940 }, { "epoch": 15.46875, "grad_norm": 0.0009887829655781388, "learning_rate": 4.534375e-05, "loss": 0.0002, "step": 4950 }, { "epoch": 15.5, "grad_norm": 0.002527805743739009, "learning_rate": 4.503125e-05, "loss": 0.0002, "step": 4960 }, { "epoch": 15.53125, "grad_norm": 0.0008647911017760634, "learning_rate": 4.4718750000000006e-05, "loss": 0.0002, "step": 4970 }, { "epoch": 15.5625, "grad_norm": 0.0011423344258219004, "learning_rate": 4.4406250000000005e-05, "loss": 0.0002, "step": 4980 }, { "epoch": 15.59375, "grad_norm": 0.0022281960118561983, "learning_rate": 4.409375e-05, "loss": 0.0002, "step": 4990 }, { "epoch": 15.625, "grad_norm": 0.0011202392634004354, "learning_rate": 4.3781250000000004e-05, "loss": 0.0002, "step": 5000 }, { "epoch": 15.625, "eval_accuracy": 0.98203125, "eval_loss": 0.12530331313610077, "eval_runtime": 6.8051, "eval_samples_per_second": 188.096, "eval_steps_per_second": 23.512, "step": 5000 }, { "epoch": 15.65625, "grad_norm": 0.0008281685295514762, "learning_rate": 4.346875e-05, "loss": 0.0002, "step": 5010 }, { "epoch": 15.6875, "grad_norm": 0.001133206533268094, "learning_rate": 4.315625e-05, "loss": 0.0002, "step": 5020 }, { "epoch": 15.71875, "grad_norm": 0.000926899432670325, "learning_rate": 4.284375000000001e-05, "loss": 0.0002, "step": 5030 }, { "epoch": 15.75, "grad_norm": 0.0009977484587579966, "learning_rate": 4.253125e-05, "loss": 0.0006, "step": 5040 }, { "epoch": 15.78125, "grad_norm": 0.0011439846130087972, "learning_rate": 4.221875e-05, "loss": 0.0428, "step": 5050 }, { "epoch": 15.8125, "grad_norm": 0.0012359012616798282, "learning_rate": 4.1906250000000006e-05, "loss": 0.0002, "step": 5060 }, { "epoch": 15.84375, "grad_norm": 0.0009622338111512363, "learning_rate": 4.1593750000000005e-05, "loss": 0.0002, "step": 5070 }, { "epoch": 15.875, "grad_norm": 0.0010638950625434518, "learning_rate": 4.1281250000000004e-05, "loss": 0.0002, "step": 5080 }, { "epoch": 15.90625, "grad_norm": 0.0011404824908822775, "learning_rate": 4.096875e-05, "loss": 0.0002, "step": 5090 }, { "epoch": 15.9375, "grad_norm": 0.0012403588043525815, "learning_rate": 4.065625e-05, "loss": 0.0002, "step": 5100 }, { "epoch": 15.9375, "eval_accuracy": 0.98359375, "eval_loss": 0.1128150224685669, "eval_runtime": 6.2783, "eval_samples_per_second": 203.878, "eval_steps_per_second": 25.485, "step": 5100 }, { "epoch": 15.96875, "grad_norm": 0.001123543013818562, "learning_rate": 4.034375e-05, "loss": 0.0002, "step": 5110 }, { "epoch": 16.0, "grad_norm": 0.001036747358739376, "learning_rate": 4.003125e-05, "loss": 0.0002, "step": 5120 }, { "epoch": 16.03125, "grad_norm": 0.0013284431770443916, "learning_rate": 3.9718750000000007e-05, "loss": 0.0002, "step": 5130 }, { "epoch": 16.0625, "grad_norm": 0.0008586233016103506, "learning_rate": 3.940625e-05, "loss": 0.0002, "step": 5140 }, { "epoch": 16.09375, "grad_norm": 0.0012248932616785169, "learning_rate": 3.909375e-05, "loss": 0.0002, "step": 5150 }, { "epoch": 16.125, "grad_norm": 0.006662206724286079, "learning_rate": 3.8781250000000004e-05, "loss": 0.0002, "step": 5160 }, { "epoch": 16.15625, "grad_norm": 0.0011745645897462964, "learning_rate": 3.846875e-05, "loss": 0.0002, "step": 5170 }, { "epoch": 16.1875, "grad_norm": 0.0010919731575995684, "learning_rate": 3.815625e-05, "loss": 0.0002, "step": 5180 }, { "epoch": 16.21875, "grad_norm": 0.0010604038834571838, "learning_rate": 3.784375e-05, "loss": 0.0002, "step": 5190 }, { "epoch": 16.25, "grad_norm": 0.0011869663139805198, "learning_rate": 3.753125e-05, "loss": 0.0002, "step": 5200 }, { "epoch": 16.25, "eval_accuracy": 0.98046875, "eval_loss": 0.1246190294623375, "eval_runtime": 6.8551, "eval_samples_per_second": 186.722, "eval_steps_per_second": 23.34, "step": 5200 }, { "epoch": 16.28125, "grad_norm": 0.0010713781230151653, "learning_rate": 3.721875e-05, "loss": 0.0002, "step": 5210 }, { "epoch": 16.3125, "grad_norm": 0.0007901139324530959, "learning_rate": 3.6906250000000006e-05, "loss": 0.0002, "step": 5220 }, { "epoch": 16.34375, "grad_norm": 0.001095872139558196, "learning_rate": 3.6593750000000005e-05, "loss": 0.0002, "step": 5230 }, { "epoch": 16.375, "grad_norm": 0.0010994484182447195, "learning_rate": 3.628125e-05, "loss": 0.0002, "step": 5240 }, { "epoch": 16.40625, "grad_norm": 0.0017003176035359502, "learning_rate": 3.5968750000000004e-05, "loss": 0.0002, "step": 5250 }, { "epoch": 16.4375, "grad_norm": 0.0007925952086225152, "learning_rate": 3.565625e-05, "loss": 0.0002, "step": 5260 }, { "epoch": 16.46875, "grad_norm": 0.0008510766783729196, "learning_rate": 3.534375e-05, "loss": 0.0002, "step": 5270 }, { "epoch": 16.5, "grad_norm": 0.18106159567832947, "learning_rate": 3.503125e-05, "loss": 0.0002, "step": 5280 }, { "epoch": 16.53125, "grad_norm": 0.0008954983204603195, "learning_rate": 3.471875e-05, "loss": 0.0002, "step": 5290 }, { "epoch": 16.5625, "grad_norm": 0.0011244250927120447, "learning_rate": 3.440625e-05, "loss": 0.0002, "step": 5300 }, { "epoch": 16.5625, "eval_accuracy": 0.9828125, "eval_loss": 0.11365531384944916, "eval_runtime": 6.8269, "eval_samples_per_second": 187.495, "eval_steps_per_second": 23.437, "step": 5300 }, { "epoch": 16.59375, "grad_norm": 0.0009858476696535945, "learning_rate": 3.4093750000000005e-05, "loss": 0.0002, "step": 5310 }, { "epoch": 16.625, "grad_norm": 0.0012397761456668377, "learning_rate": 3.3781250000000005e-05, "loss": 0.0002, "step": 5320 }, { "epoch": 16.65625, "grad_norm": 0.0008871417958289385, "learning_rate": 3.3468750000000004e-05, "loss": 0.0002, "step": 5330 }, { "epoch": 16.6875, "grad_norm": 0.0007771385135129094, "learning_rate": 3.315625e-05, "loss": 0.0002, "step": 5340 }, { "epoch": 16.71875, "grad_norm": 0.0007905489183031023, "learning_rate": 3.284375e-05, "loss": 0.0002, "step": 5350 }, { "epoch": 16.75, "grad_norm": 0.0009036400006152689, "learning_rate": 3.253125e-05, "loss": 0.0001, "step": 5360 }, { "epoch": 16.78125, "grad_norm": 0.0007690058555454016, "learning_rate": 3.221875e-05, "loss": 0.0002, "step": 5370 }, { "epoch": 16.8125, "grad_norm": 0.0011280475882813334, "learning_rate": 3.1906250000000006e-05, "loss": 0.0002, "step": 5380 }, { "epoch": 16.84375, "grad_norm": 0.000766371435020119, "learning_rate": 3.159375e-05, "loss": 0.0002, "step": 5390 }, { "epoch": 16.875, "grad_norm": 0.005487964954227209, "learning_rate": 3.128125e-05, "loss": 0.0001, "step": 5400 }, { "epoch": 16.875, "eval_accuracy": 0.984375, "eval_loss": 0.11014194786548615, "eval_runtime": 6.2479, "eval_samples_per_second": 204.867, "eval_steps_per_second": 25.608, "step": 5400 }, { "epoch": 16.90625, "grad_norm": 0.0009019103599712253, "learning_rate": 3.0968750000000004e-05, "loss": 0.0002, "step": 5410 }, { "epoch": 16.9375, "grad_norm": 0.0008425983251072466, "learning_rate": 3.065625e-05, "loss": 0.0001, "step": 5420 }, { "epoch": 16.96875, "grad_norm": 0.000717274087946862, "learning_rate": 3.0343750000000006e-05, "loss": 0.0002, "step": 5430 }, { "epoch": 17.0, "grad_norm": 0.0007335466798394918, "learning_rate": 3.0031249999999998e-05, "loss": 0.0001, "step": 5440 }, { "epoch": 17.03125, "grad_norm": 0.0015981667675077915, "learning_rate": 2.971875e-05, "loss": 0.0001, "step": 5450 }, { "epoch": 17.0625, "grad_norm": 0.000765695353038609, "learning_rate": 2.9406250000000003e-05, "loss": 0.0002, "step": 5460 }, { "epoch": 17.09375, "grad_norm": 0.0017417181516066194, "learning_rate": 2.9093750000000002e-05, "loss": 0.0002, "step": 5470 }, { "epoch": 17.125, "grad_norm": 0.0010076581966131926, "learning_rate": 2.8781250000000005e-05, "loss": 0.0001, "step": 5480 }, { "epoch": 17.15625, "grad_norm": 0.0008663997869007289, "learning_rate": 2.846875e-05, "loss": 0.0002, "step": 5490 }, { "epoch": 17.1875, "grad_norm": 0.0007040807977318764, "learning_rate": 2.815625e-05, "loss": 0.0001, "step": 5500 }, { "epoch": 17.1875, "eval_accuracy": 0.984375, "eval_loss": 0.11123112589120865, "eval_runtime": 7.2338, "eval_samples_per_second": 176.947, "eval_steps_per_second": 22.118, "step": 5500 }, { "epoch": 17.21875, "grad_norm": 0.0008027940057218075, "learning_rate": 2.7843750000000003e-05, "loss": 0.0001, "step": 5510 }, { "epoch": 17.25, "grad_norm": 0.0009467356721870601, "learning_rate": 2.7531250000000002e-05, "loss": 0.0002, "step": 5520 }, { "epoch": 17.28125, "grad_norm": 0.0007668856414966285, "learning_rate": 2.7218750000000004e-05, "loss": 0.0002, "step": 5530 }, { "epoch": 17.3125, "grad_norm": 0.0011194231919944286, "learning_rate": 2.690625e-05, "loss": 0.0002, "step": 5540 }, { "epoch": 17.34375, "grad_norm": 0.0010725741740316153, "learning_rate": 2.659375e-05, "loss": 0.0002, "step": 5550 }, { "epoch": 17.375, "grad_norm": 0.0013631158508360386, "learning_rate": 2.6281250000000002e-05, "loss": 0.0002, "step": 5560 }, { "epoch": 17.40625, "grad_norm": 0.0008963380823843181, "learning_rate": 2.5968750000000004e-05, "loss": 0.0001, "step": 5570 }, { "epoch": 17.4375, "grad_norm": 0.0008231993415392935, "learning_rate": 2.5656250000000004e-05, "loss": 0.0001, "step": 5580 }, { "epoch": 17.46875, "grad_norm": 0.0007719449349679053, "learning_rate": 2.534375e-05, "loss": 0.0002, "step": 5590 }, { "epoch": 17.5, "grad_norm": 0.0008315558661706746, "learning_rate": 2.5031250000000002e-05, "loss": 0.0001, "step": 5600 }, { "epoch": 17.5, "eval_accuracy": 0.984375, "eval_loss": 0.11211228370666504, "eval_runtime": 6.9482, "eval_samples_per_second": 184.221, "eval_steps_per_second": 23.028, "step": 5600 }, { "epoch": 17.53125, "grad_norm": 0.0007284819148480892, "learning_rate": 2.471875e-05, "loss": 0.0001, "step": 5610 }, { "epoch": 17.5625, "grad_norm": 0.0006902694585733116, "learning_rate": 2.440625e-05, "loss": 0.0001, "step": 5620 }, { "epoch": 17.59375, "grad_norm": 0.0007927055121399462, "learning_rate": 2.409375e-05, "loss": 0.0001, "step": 5630 }, { "epoch": 17.625, "grad_norm": 0.0008816330228000879, "learning_rate": 2.3781250000000002e-05, "loss": 0.0001, "step": 5640 }, { "epoch": 17.65625, "grad_norm": 0.00072521495167166, "learning_rate": 2.346875e-05, "loss": 0.0001, "step": 5650 }, { "epoch": 17.6875, "grad_norm": 0.0009029438951984048, "learning_rate": 2.315625e-05, "loss": 0.0001, "step": 5660 }, { "epoch": 17.71875, "grad_norm": 0.0007223158027045429, "learning_rate": 2.284375e-05, "loss": 0.0001, "step": 5670 }, { "epoch": 17.75, "grad_norm": 0.0007876714807935059, "learning_rate": 2.2531250000000002e-05, "loss": 0.0001, "step": 5680 }, { "epoch": 17.78125, "grad_norm": 0.0010673877550289035, "learning_rate": 2.221875e-05, "loss": 0.0001, "step": 5690 }, { "epoch": 17.8125, "grad_norm": 0.001492728479206562, "learning_rate": 2.190625e-05, "loss": 0.0001, "step": 5700 }, { "epoch": 17.8125, "eval_accuracy": 0.98359375, "eval_loss": 0.11293692886829376, "eval_runtime": 6.3413, "eval_samples_per_second": 201.851, "eval_steps_per_second": 25.231, "step": 5700 }, { "epoch": 17.84375, "grad_norm": 0.0008830283186398447, "learning_rate": 2.1593750000000003e-05, "loss": 0.0001, "step": 5710 }, { "epoch": 17.875, "grad_norm": 0.0006816980894654989, "learning_rate": 2.128125e-05, "loss": 0.0001, "step": 5720 }, { "epoch": 17.90625, "grad_norm": 0.0008512111380696297, "learning_rate": 2.096875e-05, "loss": 0.0001, "step": 5730 }, { "epoch": 17.9375, "grad_norm": 0.0008296071318909526, "learning_rate": 2.065625e-05, "loss": 0.0001, "step": 5740 }, { "epoch": 17.96875, "grad_norm": 0.0007265993626788259, "learning_rate": 2.034375e-05, "loss": 0.0001, "step": 5750 }, { "epoch": 18.0, "grad_norm": 0.001100354827940464, "learning_rate": 2.0031250000000002e-05, "loss": 0.0001, "step": 5760 }, { "epoch": 18.03125, "grad_norm": 0.0008238213486038148, "learning_rate": 1.9718749999999998e-05, "loss": 0.0001, "step": 5770 }, { "epoch": 18.0625, "grad_norm": 0.0010971089359372854, "learning_rate": 1.940625e-05, "loss": 0.0002, "step": 5780 }, { "epoch": 18.09375, "grad_norm": 0.0007329233339987695, "learning_rate": 1.9093750000000003e-05, "loss": 0.0002, "step": 5790 }, { "epoch": 18.125, "grad_norm": 0.0008615129627287388, "learning_rate": 1.878125e-05, "loss": 0.0001, "step": 5800 }, { "epoch": 18.125, "eval_accuracy": 0.984375, "eval_loss": 0.11349210888147354, "eval_runtime": 6.4474, "eval_samples_per_second": 198.53, "eval_steps_per_second": 24.816, "step": 5800 }, { "epoch": 18.15625, "grad_norm": 0.0007490671123377979, "learning_rate": 1.846875e-05, "loss": 0.0001, "step": 5810 }, { "epoch": 18.1875, "grad_norm": 0.0011704419739544392, "learning_rate": 1.815625e-05, "loss": 0.0001, "step": 5820 }, { "epoch": 18.21875, "grad_norm": 0.0007434096769429743, "learning_rate": 1.784375e-05, "loss": 0.0002, "step": 5830 }, { "epoch": 18.25, "grad_norm": 0.0007305287872441113, "learning_rate": 1.7531250000000003e-05, "loss": 0.0001, "step": 5840 }, { "epoch": 18.28125, "grad_norm": 0.0011461430694907904, "learning_rate": 1.7218750000000002e-05, "loss": 0.0001, "step": 5850 }, { "epoch": 18.3125, "grad_norm": 0.001021283445879817, "learning_rate": 1.690625e-05, "loss": 0.0001, "step": 5860 }, { "epoch": 18.34375, "grad_norm": 0.0006852642400190234, "learning_rate": 1.659375e-05, "loss": 0.0001, "step": 5870 }, { "epoch": 18.375, "grad_norm": 0.0007609901949763298, "learning_rate": 1.628125e-05, "loss": 0.0001, "step": 5880 }, { "epoch": 18.40625, "grad_norm": 0.0038062427192926407, "learning_rate": 1.5968750000000002e-05, "loss": 0.0001, "step": 5890 }, { "epoch": 18.4375, "grad_norm": 0.0007008612737990916, "learning_rate": 1.565625e-05, "loss": 0.0001, "step": 5900 }, { "epoch": 18.4375, "eval_accuracy": 0.984375, "eval_loss": 0.11404496431350708, "eval_runtime": 6.2975, "eval_samples_per_second": 203.256, "eval_steps_per_second": 25.407, "step": 5900 }, { "epoch": 18.46875, "grad_norm": 0.001011393149383366, "learning_rate": 1.534375e-05, "loss": 0.0001, "step": 5910 }, { "epoch": 18.5, "grad_norm": 0.0007079701754264534, "learning_rate": 1.503125e-05, "loss": 0.0001, "step": 5920 }, { "epoch": 18.53125, "grad_norm": 0.001830734545364976, "learning_rate": 1.471875e-05, "loss": 0.0001, "step": 5930 }, { "epoch": 18.5625, "grad_norm": 0.0007542877574451268, "learning_rate": 1.4406250000000001e-05, "loss": 0.0001, "step": 5940 }, { "epoch": 18.59375, "grad_norm": 0.0006455339025706053, "learning_rate": 1.409375e-05, "loss": 0.0001, "step": 5950 }, { "epoch": 18.625, "grad_norm": 0.0009898885618895292, "learning_rate": 1.3781250000000001e-05, "loss": 0.0001, "step": 5960 }, { "epoch": 18.65625, "grad_norm": 0.0009699007496237755, "learning_rate": 1.3468749999999999e-05, "loss": 0.0001, "step": 5970 }, { "epoch": 18.6875, "grad_norm": 0.0007313139503821731, "learning_rate": 1.3156250000000001e-05, "loss": 0.0001, "step": 5980 }, { "epoch": 18.71875, "grad_norm": 0.0010632864432409406, "learning_rate": 1.2843750000000002e-05, "loss": 0.0001, "step": 5990 }, { "epoch": 18.75, "grad_norm": 0.000874130695592612, "learning_rate": 1.253125e-05, "loss": 0.0001, "step": 6000 }, { "epoch": 18.75, "eval_accuracy": 0.984375, "eval_loss": 0.11457158625125885, "eval_runtime": 6.7696, "eval_samples_per_second": 189.082, "eval_steps_per_second": 23.635, "step": 6000 }, { "epoch": 18.78125, "grad_norm": 0.0007607897859998047, "learning_rate": 1.221875e-05, "loss": 0.0001, "step": 6010 }, { "epoch": 18.8125, "grad_norm": 0.0006213558372110128, "learning_rate": 1.1906250000000001e-05, "loss": 0.0001, "step": 6020 }, { "epoch": 18.84375, "grad_norm": 0.0009066364727914333, "learning_rate": 1.159375e-05, "loss": 0.0001, "step": 6030 }, { "epoch": 18.875, "grad_norm": 0.0008698303136043251, "learning_rate": 1.128125e-05, "loss": 0.0001, "step": 6040 }, { "epoch": 18.90625, "grad_norm": 0.0008234538836404681, "learning_rate": 1.096875e-05, "loss": 0.0001, "step": 6050 }, { "epoch": 18.9375, "grad_norm": 0.0009270088630728424, "learning_rate": 1.0656250000000002e-05, "loss": 0.0001, "step": 6060 }, { "epoch": 18.96875, "grad_norm": 0.0007937824120745063, "learning_rate": 1.034375e-05, "loss": 0.0001, "step": 6070 }, { "epoch": 19.0, "grad_norm": 0.0009614995797164738, "learning_rate": 1.003125e-05, "loss": 0.0001, "step": 6080 }, { "epoch": 19.03125, "grad_norm": 0.0006743675330653787, "learning_rate": 9.71875e-06, "loss": 0.0001, "step": 6090 }, { "epoch": 19.0625, "grad_norm": 0.0007780918967910111, "learning_rate": 9.40625e-06, "loss": 0.0001, "step": 6100 }, { "epoch": 19.0625, "eval_accuracy": 0.984375, "eval_loss": 0.11495751142501831, "eval_runtime": 6.295, "eval_samples_per_second": 203.337, "eval_steps_per_second": 25.417, "step": 6100 }, { "epoch": 19.09375, "grad_norm": 0.0010844800854101777, "learning_rate": 9.09375e-06, "loss": 0.0001, "step": 6110 }, { "epoch": 19.125, "grad_norm": 0.0007763968897052109, "learning_rate": 8.78125e-06, "loss": 0.0001, "step": 6120 }, { "epoch": 19.15625, "grad_norm": 0.0006313940975815058, "learning_rate": 8.468750000000001e-06, "loss": 0.0001, "step": 6130 }, { "epoch": 19.1875, "grad_norm": 0.0010176225332543254, "learning_rate": 8.15625e-06, "loss": 0.0001, "step": 6140 }, { "epoch": 19.21875, "grad_norm": 0.0006113633280619979, "learning_rate": 7.84375e-06, "loss": 0.0001, "step": 6150 }, { "epoch": 19.25, "grad_norm": 0.0009647855767980218, "learning_rate": 7.531250000000001e-06, "loss": 0.0001, "step": 6160 }, { "epoch": 19.28125, "grad_norm": 0.0008165627950802445, "learning_rate": 7.21875e-06, "loss": 0.0001, "step": 6170 }, { "epoch": 19.3125, "grad_norm": 0.000816655985545367, "learning_rate": 6.90625e-06, "loss": 0.0001, "step": 6180 }, { "epoch": 19.34375, "grad_norm": 0.0008548317127861083, "learning_rate": 6.59375e-06, "loss": 0.0001, "step": 6190 }, { "epoch": 19.375, "grad_norm": 0.0006300067761912942, "learning_rate": 6.281249999999999e-06, "loss": 0.0001, "step": 6200 }, { "epoch": 19.375, "eval_accuracy": 0.984375, "eval_loss": 0.11527726799249649, "eval_runtime": 6.2032, "eval_samples_per_second": 206.344, "eval_steps_per_second": 25.793, "step": 6200 }, { "epoch": 19.40625, "grad_norm": 0.0009761661058291793, "learning_rate": 5.96875e-06, "loss": 0.0001, "step": 6210 }, { "epoch": 19.4375, "grad_norm": 0.0007255289820022881, "learning_rate": 5.65625e-06, "loss": 0.0001, "step": 6220 }, { "epoch": 19.46875, "grad_norm": 0.0006587054231204093, "learning_rate": 5.34375e-06, "loss": 0.0001, "step": 6230 }, { "epoch": 19.5, "grad_norm": 0.0009364929865114391, "learning_rate": 5.03125e-06, "loss": 0.0001, "step": 6240 }, { "epoch": 19.53125, "grad_norm": 0.0006768327439203858, "learning_rate": 4.71875e-06, "loss": 0.0001, "step": 6250 }, { "epoch": 19.5625, "grad_norm": 0.0015788966557011008, "learning_rate": 4.40625e-06, "loss": 0.0001, "step": 6260 }, { "epoch": 19.59375, "grad_norm": 0.0006980461766943336, "learning_rate": 4.09375e-06, "loss": 0.0001, "step": 6270 }, { "epoch": 19.625, "grad_norm": 0.0008529416518285871, "learning_rate": 3.78125e-06, "loss": 0.0001, "step": 6280 }, { "epoch": 19.65625, "grad_norm": 0.0008394841570407152, "learning_rate": 3.4687500000000005e-06, "loss": 0.0001, "step": 6290 }, { "epoch": 19.6875, "grad_norm": 0.0008695558062754571, "learning_rate": 3.15625e-06, "loss": 0.0001, "step": 6300 }, { "epoch": 19.6875, "eval_accuracy": 0.984375, "eval_loss": 0.11545456945896149, "eval_runtime": 6.236, "eval_samples_per_second": 205.259, "eval_steps_per_second": 25.657, "step": 6300 }, { "epoch": 19.71875, "grad_norm": 0.0012889541685581207, "learning_rate": 2.84375e-06, "loss": 0.0001, "step": 6310 }, { "epoch": 19.75, "grad_norm": 0.0006377049721777439, "learning_rate": 2.53125e-06, "loss": 0.0001, "step": 6320 }, { "epoch": 19.78125, "grad_norm": 0.0007706707692705095, "learning_rate": 2.21875e-06, "loss": 0.0001, "step": 6330 }, { "epoch": 19.8125, "grad_norm": 0.0007259553531184793, "learning_rate": 1.90625e-06, "loss": 0.0001, "step": 6340 }, { "epoch": 19.84375, "grad_norm": 0.0009117970475926995, "learning_rate": 1.5937500000000002e-06, "loss": 0.0001, "step": 6350 }, { "epoch": 19.875, "grad_norm": 0.0008638539584353566, "learning_rate": 1.28125e-06, "loss": 0.0001, "step": 6360 }, { "epoch": 19.90625, "grad_norm": 0.0006554033607244492, "learning_rate": 9.6875e-07, "loss": 0.0001, "step": 6370 }, { "epoch": 19.9375, "grad_norm": 0.00083553371950984, "learning_rate": 6.5625e-07, "loss": 0.0001, "step": 6380 }, { "epoch": 19.96875, "grad_norm": 0.0006699099321849644, "learning_rate": 3.4375000000000004e-07, "loss": 0.0001, "step": 6390 }, { "epoch": 20.0, "grad_norm": 0.000951381167396903, "learning_rate": 3.1250000000000005e-08, "loss": 0.0001, "step": 6400 }, { "epoch": 20.0, "eval_accuracy": 0.984375, "eval_loss": 0.11548350006341934, "eval_runtime": 6.596, "eval_samples_per_second": 194.058, "eval_steps_per_second": 24.257, "step": 6400 }, { "epoch": 20.0, "step": 6400, "total_flos": 7.935321977546342e+18, "train_loss": 0.10612523018964566, "train_runtime": 3045.4016, "train_samples_per_second": 33.624, "train_steps_per_second": 2.102 } ], "logging_steps": 10, "max_steps": 6400, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.935321977546342e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }