{ "best_metric": 0.910958904109589, "best_model_checkpoint": "swinv2-tiny-patch4-window8-256-finetuned-5emotions\\checkpoint-5281", "epoch": 24.99881656804734, "eval_steps": 500, "global_step": 5281, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.047337278106508875, "grad_norm": 7.075885772705078, "learning_rate": 6.765899864682003e-07, "loss": 1.691, "step": 10 }, { "epoch": 0.09467455621301775, "grad_norm": 9.890098571777344, "learning_rate": 1.3531799729364006e-06, "loss": 1.6712, "step": 20 }, { "epoch": 0.14201183431952663, "grad_norm": 7.415971755981445, "learning_rate": 2.029769959404601e-06, "loss": 1.6787, "step": 30 }, { "epoch": 0.1893491124260355, "grad_norm": 6.3063764572143555, "learning_rate": 2.7063599458728013e-06, "loss": 1.6685, "step": 40 }, { "epoch": 0.23668639053254437, "grad_norm": 8.763900756835938, "learning_rate": 3.3829499323410016e-06, "loss": 1.6143, "step": 50 }, { "epoch": 0.28402366863905326, "grad_norm": 6.661700248718262, "learning_rate": 4.059539918809202e-06, "loss": 1.5849, "step": 60 }, { "epoch": 0.33136094674556216, "grad_norm": 7.178672790527344, "learning_rate": 4.736129905277402e-06, "loss": 1.5502, "step": 70 }, { "epoch": 0.378698224852071, "grad_norm": 5.857969284057617, "learning_rate": 5.4127198917456026e-06, "loss": 1.5274, "step": 80 }, { "epoch": 0.4260355029585799, "grad_norm": 6.652136325836182, "learning_rate": 6.089309878213803e-06, "loss": 1.4915, "step": 90 }, { "epoch": 0.47337278106508873, "grad_norm": 6.222568035125732, "learning_rate": 6.765899864682003e-06, "loss": 1.4063, "step": 100 }, { "epoch": 0.5207100591715976, "grad_norm": 6.365822792053223, "learning_rate": 7.442489851150203e-06, "loss": 1.3811, "step": 110 }, { "epoch": 0.5680473372781065, "grad_norm": 7.9343414306640625, "learning_rate": 8.119079837618404e-06, "loss": 1.3026, "step": 120 }, { "epoch": 0.6153846153846154, "grad_norm": 9.204723358154297, "learning_rate": 8.795669824086604e-06, "loss": 1.2516, "step": 130 }, { "epoch": 0.6627218934911243, "grad_norm": 7.836040496826172, "learning_rate": 9.472259810554804e-06, "loss": 1.1664, "step": 140 }, { "epoch": 0.7100591715976331, "grad_norm": 10.82960319519043, "learning_rate": 1.0148849797023005e-05, "loss": 1.182, "step": 150 }, { "epoch": 0.757396449704142, "grad_norm": 13.981669425964355, "learning_rate": 1.0825439783491205e-05, "loss": 1.0992, "step": 160 }, { "epoch": 0.8047337278106509, "grad_norm": 17.63448715209961, "learning_rate": 1.1502029769959405e-05, "loss": 1.036, "step": 170 }, { "epoch": 0.8520710059171598, "grad_norm": 13.67409610748291, "learning_rate": 1.2178619756427606e-05, "loss": 1.0372, "step": 180 }, { "epoch": 0.8994082840236687, "grad_norm": 10.486079216003418, "learning_rate": 1.2855209742895804e-05, "loss": 0.9888, "step": 190 }, { "epoch": 0.9467455621301775, "grad_norm": 10.388420104980469, "learning_rate": 1.3531799729364006e-05, "loss": 0.98, "step": 200 }, { "epoch": 0.9940828402366864, "grad_norm": 11.530645370483398, "learning_rate": 1.4208389715832207e-05, "loss": 0.9665, "step": 210 }, { "epoch": 0.9988165680473373, "eval_accuracy": 0.6835616438356165, "eval_loss": 0.8002648949623108, "eval_runtime": 6.3464, "eval_samples_per_second": 230.051, "eval_steps_per_second": 28.835, "step": 211 }, { "epoch": 1.0414201183431953, "grad_norm": 14.492610931396484, "learning_rate": 1.4884979702300405e-05, "loss": 0.9431, "step": 220 }, { "epoch": 1.0887573964497042, "grad_norm": 11.852544784545898, 
"learning_rate": 1.5561569688768607e-05, "loss": 0.8959, "step": 230 }, { "epoch": 1.136094674556213, "grad_norm": 11.708285331726074, "learning_rate": 1.6238159675236808e-05, "loss": 0.9688, "step": 240 }, { "epoch": 1.183431952662722, "grad_norm": 14.45132827758789, "learning_rate": 1.6914749661705008e-05, "loss": 0.8888, "step": 250 }, { "epoch": 1.2307692307692308, "grad_norm": 14.281059265136719, "learning_rate": 1.759133964817321e-05, "loss": 0.846, "step": 260 }, { "epoch": 1.2781065088757395, "grad_norm": 14.860888481140137, "learning_rate": 1.826792963464141e-05, "loss": 0.8906, "step": 270 }, { "epoch": 1.3254437869822486, "grad_norm": 10.605212211608887, "learning_rate": 1.894451962110961e-05, "loss": 0.8169, "step": 280 }, { "epoch": 1.3727810650887573, "grad_norm": 12.668191909790039, "learning_rate": 1.962110960757781e-05, "loss": 0.836, "step": 290 }, { "epoch": 1.4201183431952662, "grad_norm": 10.248248100280762, "learning_rate": 2.029769959404601e-05, "loss": 0.7644, "step": 300 }, { "epoch": 1.467455621301775, "grad_norm": 9.778542518615723, "learning_rate": 2.097428958051421e-05, "loss": 0.7755, "step": 310 }, { "epoch": 1.514792899408284, "grad_norm": 9.64427661895752, "learning_rate": 2.165087956698241e-05, "loss": 0.7387, "step": 320 }, { "epoch": 1.5621301775147929, "grad_norm": 10.04445743560791, "learning_rate": 2.232746955345061e-05, "loss": 0.7605, "step": 330 }, { "epoch": 1.6094674556213018, "grad_norm": 13.125927925109863, "learning_rate": 2.300405953991881e-05, "loss": 0.6781, "step": 340 }, { "epoch": 1.6568047337278107, "grad_norm": 13.797953605651855, "learning_rate": 2.368064952638701e-05, "loss": 0.6551, "step": 350 }, { "epoch": 1.7041420118343196, "grad_norm": 14.754645347595215, "learning_rate": 2.435723951285521e-05, "loss": 0.7542, "step": 360 }, { "epoch": 1.7514792899408285, "grad_norm": 13.914559364318848, "learning_rate": 2.5033829499323412e-05, "loss": 0.8104, "step": 370 }, { "epoch": 1.7988165680473371, "grad_norm": 11.46696662902832, "learning_rate": 2.571041948579161e-05, "loss": 0.6945, "step": 380 }, { "epoch": 1.8461538461538463, "grad_norm": 10.812294960021973, "learning_rate": 2.638700947225981e-05, "loss": 0.6711, "step": 390 }, { "epoch": 1.893491124260355, "grad_norm": 15.02450180053711, "learning_rate": 2.7063599458728013e-05, "loss": 0.7345, "step": 400 }, { "epoch": 1.940828402366864, "grad_norm": 11.53946590423584, "learning_rate": 2.7740189445196213e-05, "loss": 0.723, "step": 410 }, { "epoch": 1.9881656804733727, "grad_norm": 8.337069511413574, "learning_rate": 2.8416779431664413e-05, "loss": 0.6443, "step": 420 }, { "epoch": 1.9976331360946746, "eval_accuracy": 0.8246575342465754, "eval_loss": 0.4562951624393463, "eval_runtime": 6.3186, "eval_samples_per_second": 231.065, "eval_steps_per_second": 28.962, "step": 422 }, { "epoch": 2.035502958579882, "grad_norm": 9.038360595703125, "learning_rate": 2.9093369418132617e-05, "loss": 0.6256, "step": 430 }, { "epoch": 2.0828402366863905, "grad_norm": 12.379063606262207, "learning_rate": 2.976995940460081e-05, "loss": 0.5998, "step": 440 }, { "epoch": 2.1301775147928996, "grad_norm": 12.626445770263672, "learning_rate": 3.044654939106901e-05, "loss": 0.6456, "step": 450 }, { "epoch": 2.1775147928994083, "grad_norm": 10.665410995483398, "learning_rate": 3.1123139377537215e-05, "loss": 0.6145, "step": 460 }, { "epoch": 2.224852071005917, "grad_norm": 11.917645454406738, "learning_rate": 3.1799729364005415e-05, "loss": 0.6494, "step": 470 }, { "epoch": 2.272189349112426, "grad_norm": 
14.427268981933594, "learning_rate": 3.2476319350473615e-05, "loss": 0.5967, "step": 480 }, { "epoch": 2.3195266272189348, "grad_norm": 14.22167682647705, "learning_rate": 3.3152909336941816e-05, "loss": 0.6356, "step": 490 }, { "epoch": 2.366863905325444, "grad_norm": 15.034667015075684, "learning_rate": 3.3829499323410016e-05, "loss": 0.6583, "step": 500 }, { "epoch": 2.4142011834319526, "grad_norm": 11.716626167297363, "learning_rate": 3.4506089309878216e-05, "loss": 0.5703, "step": 510 }, { "epoch": 2.4615384615384617, "grad_norm": 8.812618255615234, "learning_rate": 3.518267929634642e-05, "loss": 0.5495, "step": 520 }, { "epoch": 2.5088757396449703, "grad_norm": 19.886188507080078, "learning_rate": 3.585926928281462e-05, "loss": 0.7156, "step": 530 }, { "epoch": 2.556213017751479, "grad_norm": 10.014534950256348, "learning_rate": 3.653585926928282e-05, "loss": 0.7279, "step": 540 }, { "epoch": 2.603550295857988, "grad_norm": 8.2186861038208, "learning_rate": 3.721244925575101e-05, "loss": 0.6629, "step": 550 }, { "epoch": 2.6508875739644973, "grad_norm": 11.415748596191406, "learning_rate": 3.788903924221922e-05, "loss": 0.6031, "step": 560 }, { "epoch": 2.698224852071006, "grad_norm": 6.490344047546387, "learning_rate": 3.856562922868742e-05, "loss": 0.5158, "step": 570 }, { "epoch": 2.7455621301775146, "grad_norm": 10.63316822052002, "learning_rate": 3.924221921515562e-05, "loss": 0.6533, "step": 580 }, { "epoch": 2.7928994082840237, "grad_norm": 9.291253089904785, "learning_rate": 3.991880920162382e-05, "loss": 0.5501, "step": 590 }, { "epoch": 2.8402366863905324, "grad_norm": 10.60273551940918, "learning_rate": 4.059539918809202e-05, "loss": 0.5719, "step": 600 }, { "epoch": 2.8875739644970415, "grad_norm": 10.603645324707031, "learning_rate": 4.127198917456021e-05, "loss": 0.4905, "step": 610 }, { "epoch": 2.93491124260355, "grad_norm": 17.47416877746582, "learning_rate": 4.194857916102842e-05, "loss": 0.7037, "step": 620 }, { "epoch": 2.9822485207100593, "grad_norm": 9.434072494506836, "learning_rate": 4.262516914749662e-05, "loss": 0.5815, "step": 630 }, { "epoch": 2.996449704142012, "eval_accuracy": 0.8568493150684932, "eval_loss": 0.3556749224662781, "eval_runtime": 6.2074, "eval_samples_per_second": 235.204, "eval_steps_per_second": 29.481, "step": 633 }, { "epoch": 3.029585798816568, "grad_norm": 13.81190299987793, "learning_rate": 4.330175913396482e-05, "loss": 0.5877, "step": 640 }, { "epoch": 3.076923076923077, "grad_norm": 8.872483253479004, "learning_rate": 4.397834912043302e-05, "loss": 0.55, "step": 650 }, { "epoch": 3.1242603550295858, "grad_norm": 11.748785972595215, "learning_rate": 4.465493910690122e-05, "loss": 0.6155, "step": 660 }, { "epoch": 3.171597633136095, "grad_norm": 13.621400833129883, "learning_rate": 4.5331529093369415e-05, "loss": 0.5907, "step": 670 }, { "epoch": 3.2189349112426036, "grad_norm": 10.422270774841309, "learning_rate": 4.600811907983762e-05, "loss": 0.6022, "step": 680 }, { "epoch": 3.2662721893491122, "grad_norm": 12.192015647888184, "learning_rate": 4.668470906630582e-05, "loss": 0.558, "step": 690 }, { "epoch": 3.3136094674556213, "grad_norm": 5.769958972930908, "learning_rate": 4.736129905277402e-05, "loss": 0.5257, "step": 700 }, { "epoch": 3.36094674556213, "grad_norm": 11.664800643920898, "learning_rate": 4.803788903924222e-05, "loss": 0.6242, "step": 710 }, { "epoch": 3.408284023668639, "grad_norm": 10.007041931152344, "learning_rate": 4.871447902571042e-05, "loss": 0.5789, "step": 720 }, { "epoch": 3.455621301775148, 
"grad_norm": 18.98644256591797, "learning_rate": 4.9391069012178623e-05, "loss": 0.4632, "step": 730 }, { "epoch": 3.502958579881657, "grad_norm": 9.949424743652344, "learning_rate": 4.999247667770087e-05, "loss": 0.5657, "step": 740 }, { "epoch": 3.5502958579881656, "grad_norm": 7.471621513366699, "learning_rate": 4.99172434547096e-05, "loss": 0.4076, "step": 750 }, { "epoch": 3.5976331360946747, "grad_norm": 9.102510452270508, "learning_rate": 4.9842010231718327e-05, "loss": 0.532, "step": 760 }, { "epoch": 3.6449704142011834, "grad_norm": 9.587445259094238, "learning_rate": 4.976677700872706e-05, "loss": 0.5685, "step": 770 }, { "epoch": 3.6923076923076925, "grad_norm": 10.277064323425293, "learning_rate": 4.969154378573578e-05, "loss": 0.5004, "step": 780 }, { "epoch": 3.739644970414201, "grad_norm": 15.665764808654785, "learning_rate": 4.9616310562744514e-05, "loss": 0.5571, "step": 790 }, { "epoch": 3.78698224852071, "grad_norm": 9.643716812133789, "learning_rate": 4.954107733975324e-05, "loss": 0.5235, "step": 800 }, { "epoch": 3.834319526627219, "grad_norm": 12.600419044494629, "learning_rate": 4.9465844116761964e-05, "loss": 0.5579, "step": 810 }, { "epoch": 3.8816568047337277, "grad_norm": 9.6210298538208, "learning_rate": 4.939061089377069e-05, "loss": 0.4711, "step": 820 }, { "epoch": 3.9289940828402368, "grad_norm": 10.485040664672852, "learning_rate": 4.9315377670779414e-05, "loss": 0.4848, "step": 830 }, { "epoch": 3.9763313609467454, "grad_norm": 7.453371524810791, "learning_rate": 4.9240144447788145e-05, "loss": 0.474, "step": 840 }, { "epoch": 4.0, "eval_accuracy": 0.8726027397260274, "eval_loss": 0.35826006531715393, "eval_runtime": 6.1388, "eval_samples_per_second": 237.833, "eval_steps_per_second": 29.811, "step": 845 }, { "epoch": 4.023668639053255, "grad_norm": 8.34096908569336, "learning_rate": 4.916491122479687e-05, "loss": 0.5093, "step": 850 }, { "epoch": 4.071005917159764, "grad_norm": 7.713958263397217, "learning_rate": 4.90896780018056e-05, "loss": 0.4608, "step": 860 }, { "epoch": 4.118343195266272, "grad_norm": 9.734159469604492, "learning_rate": 4.9014444778814326e-05, "loss": 0.4247, "step": 870 }, { "epoch": 4.165680473372781, "grad_norm": 7.637202739715576, "learning_rate": 4.893921155582306e-05, "loss": 0.554, "step": 880 }, { "epoch": 4.21301775147929, "grad_norm": 12.172405242919922, "learning_rate": 4.886397833283178e-05, "loss": 0.4859, "step": 890 }, { "epoch": 4.260355029585799, "grad_norm": 9.40637493133545, "learning_rate": 4.878874510984051e-05, "loss": 0.5068, "step": 900 }, { "epoch": 4.3076923076923075, "grad_norm": 5.2307209968566895, "learning_rate": 4.871351188684923e-05, "loss": 0.4318, "step": 910 }, { "epoch": 4.355029585798817, "grad_norm": 13.809428215026855, "learning_rate": 4.8638278663857964e-05, "loss": 0.5231, "step": 920 }, { "epoch": 4.402366863905326, "grad_norm": 9.841399192810059, "learning_rate": 4.856304544086669e-05, "loss": 0.4441, "step": 930 }, { "epoch": 4.449704142011834, "grad_norm": 7.034471035003662, "learning_rate": 4.8487812217875414e-05, "loss": 0.5421, "step": 940 }, { "epoch": 4.497041420118343, "grad_norm": 6.35905122756958, "learning_rate": 4.8412578994884145e-05, "loss": 0.5084, "step": 950 }, { "epoch": 4.544378698224852, "grad_norm": 8.407711029052734, "learning_rate": 4.833734577189287e-05, "loss": 0.4067, "step": 960 }, { "epoch": 4.591715976331361, "grad_norm": 7.5561113357543945, "learning_rate": 4.8262112548901595e-05, "loss": 0.4881, "step": 970 }, { "epoch": 4.6390532544378695, "grad_norm": 
7.843471050262451, "learning_rate": 4.818687932591032e-05, "loss": 0.4175, "step": 980 }, { "epoch": 4.686390532544379, "grad_norm": 11.301685333251953, "learning_rate": 4.811164610291905e-05, "loss": 0.4423, "step": 990 }, { "epoch": 4.733727810650888, "grad_norm": 7.472105503082275, "learning_rate": 4.8036412879927776e-05, "loss": 0.4525, "step": 1000 }, { "epoch": 4.781065088757396, "grad_norm": 9.092314720153809, "learning_rate": 4.796117965693651e-05, "loss": 0.5699, "step": 1010 }, { "epoch": 4.828402366863905, "grad_norm": 12.238302230834961, "learning_rate": 4.788594643394523e-05, "loss": 0.4524, "step": 1020 }, { "epoch": 4.875739644970414, "grad_norm": 5.100959777832031, "learning_rate": 4.7810713210953964e-05, "loss": 0.3866, "step": 1030 }, { "epoch": 4.923076923076923, "grad_norm": 9.616569519042969, "learning_rate": 4.773547998796269e-05, "loss": 0.3577, "step": 1040 }, { "epoch": 4.970414201183432, "grad_norm": 9.995213508605957, "learning_rate": 4.7660246764971413e-05, "loss": 0.5819, "step": 1050 }, { "epoch": 4.998816568047337, "eval_accuracy": 0.8671232876712329, "eval_loss": 0.34042322635650635, "eval_runtime": 6.4475, "eval_samples_per_second": 226.444, "eval_steps_per_second": 28.383, "step": 1056 }, { "epoch": 5.017751479289941, "grad_norm": 6.531469345092773, "learning_rate": 4.758501354198014e-05, "loss": 0.4182, "step": 1060 }, { "epoch": 5.06508875739645, "grad_norm": 11.092623710632324, "learning_rate": 4.750978031898887e-05, "loss": 0.4458, "step": 1070 }, { "epoch": 5.112426035502959, "grad_norm": 12.276275634765625, "learning_rate": 4.7434547095997595e-05, "loss": 0.5101, "step": 1080 }, { "epoch": 5.159763313609467, "grad_norm": 10.82636833190918, "learning_rate": 4.735931387300632e-05, "loss": 0.4708, "step": 1090 }, { "epoch": 5.207100591715976, "grad_norm": 9.973958015441895, "learning_rate": 4.728408065001505e-05, "loss": 0.5191, "step": 1100 }, { "epoch": 5.254437869822485, "grad_norm": 9.460865020751953, "learning_rate": 4.7208847427023776e-05, "loss": 0.4285, "step": 1110 }, { "epoch": 5.3017751479289945, "grad_norm": 15.347735404968262, "learning_rate": 4.713361420403251e-05, "loss": 0.4579, "step": 1120 }, { "epoch": 5.349112426035503, "grad_norm": 14.214599609375, "learning_rate": 4.7058380981041225e-05, "loss": 0.4787, "step": 1130 }, { "epoch": 5.396449704142012, "grad_norm": 9.042417526245117, "learning_rate": 4.698314775804996e-05, "loss": 0.4146, "step": 1140 }, { "epoch": 5.443786982248521, "grad_norm": 8.627814292907715, "learning_rate": 4.690791453505868e-05, "loss": 0.394, "step": 1150 }, { "epoch": 5.491124260355029, "grad_norm": 8.060114860534668, "learning_rate": 4.683268131206741e-05, "loss": 0.412, "step": 1160 }, { "epoch": 5.538461538461538, "grad_norm": 8.569971084594727, "learning_rate": 4.675744808907614e-05, "loss": 0.443, "step": 1170 }, { "epoch": 5.585798816568047, "grad_norm": 31.7719669342041, "learning_rate": 4.668221486608487e-05, "loss": 0.4424, "step": 1180 }, { "epoch": 5.633136094674557, "grad_norm": 10.994864463806152, "learning_rate": 4.6606981643093595e-05, "loss": 0.4072, "step": 1190 }, { "epoch": 5.680473372781065, "grad_norm": 12.489917755126953, "learning_rate": 4.653174842010232e-05, "loss": 0.4193, "step": 1200 }, { "epoch": 5.727810650887574, "grad_norm": 6.0672760009765625, "learning_rate": 4.6456515197111044e-05, "loss": 0.463, "step": 1210 }, { "epoch": 5.775147928994083, "grad_norm": 9.66230297088623, "learning_rate": 4.6381281974119776e-05, "loss": 0.3863, "step": 1220 }, { "epoch": 5.822485207100591, 
"grad_norm": 12.802431106567383, "learning_rate": 4.63060487511285e-05, "loss": 0.4471, "step": 1230 }, { "epoch": 5.8698224852071, "grad_norm": 10.842957496643066, "learning_rate": 4.6230815528137225e-05, "loss": 0.5186, "step": 1240 }, { "epoch": 5.9171597633136095, "grad_norm": 8.612702369689941, "learning_rate": 4.615558230514596e-05, "loss": 0.4908, "step": 1250 }, { "epoch": 5.964497041420119, "grad_norm": 8.768792152404785, "learning_rate": 4.608034908215468e-05, "loss": 0.4557, "step": 1260 }, { "epoch": 5.997633136094675, "eval_accuracy": 0.8993150684931507, "eval_loss": 0.2699526846408844, "eval_runtime": 6.396, "eval_samples_per_second": 228.266, "eval_steps_per_second": 28.611, "step": 1267 }, { "epoch": 6.011834319526627, "grad_norm": 6.778576374053955, "learning_rate": 4.600511585916341e-05, "loss": 0.4647, "step": 1270 }, { "epoch": 6.059171597633136, "grad_norm": 5.115172863006592, "learning_rate": 4.592988263617213e-05, "loss": 0.4053, "step": 1280 }, { "epoch": 6.106508875739645, "grad_norm": 7.163010120391846, "learning_rate": 4.585464941318086e-05, "loss": 0.4136, "step": 1290 }, { "epoch": 6.153846153846154, "grad_norm": 5.242615699768066, "learning_rate": 4.577941619018959e-05, "loss": 0.4233, "step": 1300 }, { "epoch": 6.201183431952662, "grad_norm": 7.148778915405273, "learning_rate": 4.570418296719832e-05, "loss": 0.3791, "step": 1310 }, { "epoch": 6.2485207100591715, "grad_norm": 6.911210060119629, "learning_rate": 4.5628949744207044e-05, "loss": 0.3933, "step": 1320 }, { "epoch": 6.295857988165681, "grad_norm": 7.753135681152344, "learning_rate": 4.5553716521215776e-05, "loss": 0.428, "step": 1330 }, { "epoch": 6.34319526627219, "grad_norm": 5.933778762817383, "learning_rate": 4.54784832982245e-05, "loss": 0.4668, "step": 1340 }, { "epoch": 6.390532544378698, "grad_norm": 7.8352556228637695, "learning_rate": 4.5403250075233225e-05, "loss": 0.3272, "step": 1350 }, { "epoch": 6.437869822485207, "grad_norm": 11.419840812683105, "learning_rate": 4.532801685224195e-05, "loss": 0.3954, "step": 1360 }, { "epoch": 6.485207100591716, "grad_norm": 9.681208610534668, "learning_rate": 4.5252783629250675e-05, "loss": 0.5153, "step": 1370 }, { "epoch": 6.5325443786982245, "grad_norm": 6.971587657928467, "learning_rate": 4.5177550406259406e-05, "loss": 0.4247, "step": 1380 }, { "epoch": 6.579881656804734, "grad_norm": 6.286644458770752, "learning_rate": 4.510231718326813e-05, "loss": 0.4618, "step": 1390 }, { "epoch": 6.627218934911243, "grad_norm": 11.171966552734375, "learning_rate": 4.502708396027686e-05, "loss": 0.4352, "step": 1400 }, { "epoch": 6.674556213017752, "grad_norm": 10.539188385009766, "learning_rate": 4.495185073728559e-05, "loss": 0.3841, "step": 1410 }, { "epoch": 6.72189349112426, "grad_norm": 5.127812385559082, "learning_rate": 4.487661751429432e-05, "loss": 0.3388, "step": 1420 }, { "epoch": 6.769230769230769, "grad_norm": 10.178089141845703, "learning_rate": 4.480138429130304e-05, "loss": 0.4024, "step": 1430 }, { "epoch": 6.816568047337278, "grad_norm": 5.93577766418457, "learning_rate": 4.472615106831177e-05, "loss": 0.4173, "step": 1440 }, { "epoch": 6.8639053254437865, "grad_norm": 5.2099609375, "learning_rate": 4.4650917845320493e-05, "loss": 0.3462, "step": 1450 }, { "epoch": 6.911242603550296, "grad_norm": 7.551539897918701, "learning_rate": 4.4575684622329225e-05, "loss": 0.4034, "step": 1460 }, { "epoch": 6.958579881656805, "grad_norm": 10.478506088256836, "learning_rate": 4.450045139933795e-05, "loss": 0.4021, "step": 1470 }, { "epoch": 
6.9964497041420115, "eval_accuracy": 0.8917808219178082, "eval_loss": 0.3158508837223053, "eval_runtime": 6.1877, "eval_samples_per_second": 235.95, "eval_steps_per_second": 29.575, "step": 1478 }, { "epoch": 7.005917159763314, "grad_norm": 10.892561912536621, "learning_rate": 4.4425218176346675e-05, "loss": 0.3283, "step": 1480 }, { "epoch": 7.053254437869822, "grad_norm": 8.013442993164062, "learning_rate": 4.4349984953355406e-05, "loss": 0.4517, "step": 1490 }, { "epoch": 7.100591715976331, "grad_norm": 6.160177230834961, "learning_rate": 4.427475173036413e-05, "loss": 0.4099, "step": 1500 }, { "epoch": 7.14792899408284, "grad_norm": 8.48135757446289, "learning_rate": 4.4199518507372856e-05, "loss": 0.4019, "step": 1510 }, { "epoch": 7.195266272189349, "grad_norm": 10.302865982055664, "learning_rate": 4.412428528438158e-05, "loss": 0.3329, "step": 1520 }, { "epoch": 7.242603550295858, "grad_norm": 10.503307342529297, "learning_rate": 4.404905206139031e-05, "loss": 0.394, "step": 1530 }, { "epoch": 7.289940828402367, "grad_norm": 7.577216148376465, "learning_rate": 4.397381883839904e-05, "loss": 0.4075, "step": 1540 }, { "epoch": 7.337278106508876, "grad_norm": 12.196857452392578, "learning_rate": 4.389858561540777e-05, "loss": 0.3919, "step": 1550 }, { "epoch": 7.384615384615385, "grad_norm": 6.480340003967285, "learning_rate": 4.382335239241649e-05, "loss": 0.3562, "step": 1560 }, { "epoch": 7.431952662721893, "grad_norm": 4.814269542694092, "learning_rate": 4.3748119169425225e-05, "loss": 0.3232, "step": 1570 }, { "epoch": 7.479289940828402, "grad_norm": 8.813551902770996, "learning_rate": 4.367288594643394e-05, "loss": 0.3947, "step": 1580 }, { "epoch": 7.5266272189349115, "grad_norm": 10.225379943847656, "learning_rate": 4.3597652723442675e-05, "loss": 0.4059, "step": 1590 }, { "epoch": 7.57396449704142, "grad_norm": 9.415613174438477, "learning_rate": 4.35224195004514e-05, "loss": 0.3371, "step": 1600 }, { "epoch": 7.621301775147929, "grad_norm": 6.129647731781006, "learning_rate": 4.344718627746013e-05, "loss": 0.3652, "step": 1610 }, { "epoch": 7.668639053254438, "grad_norm": 9.19030475616455, "learning_rate": 4.3371953054468856e-05, "loss": 0.3562, "step": 1620 }, { "epoch": 7.715976331360947, "grad_norm": 12.973560333251953, "learning_rate": 4.329671983147758e-05, "loss": 0.3804, "step": 1630 }, { "epoch": 7.763313609467455, "grad_norm": 7.263617515563965, "learning_rate": 4.322148660848631e-05, "loss": 0.3808, "step": 1640 }, { "epoch": 7.810650887573964, "grad_norm": 6.532052516937256, "learning_rate": 4.314625338549504e-05, "loss": 0.468, "step": 1650 }, { "epoch": 7.8579881656804735, "grad_norm": 8.766283988952637, "learning_rate": 4.307102016250376e-05, "loss": 0.4145, "step": 1660 }, { "epoch": 7.905325443786982, "grad_norm": 5.956889629364014, "learning_rate": 4.2995786939512487e-05, "loss": 0.4047, "step": 1670 }, { "epoch": 7.952662721893491, "grad_norm": 6.531178951263428, "learning_rate": 4.292055371652122e-05, "loss": 0.3396, "step": 1680 }, { "epoch": 8.0, "grad_norm": 8.662644386291504, "learning_rate": 4.284532049352994e-05, "loss": 0.3209, "step": 1690 }, { "epoch": 8.0, "eval_accuracy": 0.8972602739726028, "eval_loss": 0.3082219660282135, "eval_runtime": 6.2922, "eval_samples_per_second": 232.034, "eval_steps_per_second": 29.084, "step": 1690 }, { "epoch": 8.04733727810651, "grad_norm": 12.477700233459473, "learning_rate": 4.2770087270538674e-05, "loss": 0.3262, "step": 1700 }, { "epoch": 8.094674556213018, "grad_norm": 6.367954730987549, "learning_rate": 
4.26948540475474e-05, "loss": 0.3579, "step": 1710 }, { "epoch": 8.142011834319527, "grad_norm": 7.339391708374023, "learning_rate": 4.261962082455613e-05, "loss": 0.3993, "step": 1720 }, { "epoch": 8.189349112426035, "grad_norm": 7.060799598693848, "learning_rate": 4.2544387601564856e-05, "loss": 0.3702, "step": 1730 }, { "epoch": 8.236686390532544, "grad_norm": 7.423877239227295, "learning_rate": 4.246915437857358e-05, "loss": 0.4548, "step": 1740 }, { "epoch": 8.284023668639053, "grad_norm": 7.742123603820801, "learning_rate": 4.2393921155582305e-05, "loss": 0.3914, "step": 1750 }, { "epoch": 8.331360946745562, "grad_norm": 3.941162109375, "learning_rate": 4.231868793259104e-05, "loss": 0.3953, "step": 1760 }, { "epoch": 8.378698224852071, "grad_norm": 7.15812349319458, "learning_rate": 4.224345470959976e-05, "loss": 0.3871, "step": 1770 }, { "epoch": 8.42603550295858, "grad_norm": 11.954395294189453, "learning_rate": 4.2168221486608486e-05, "loss": 0.3919, "step": 1780 }, { "epoch": 8.47337278106509, "grad_norm": 7.049565315246582, "learning_rate": 4.209298826361722e-05, "loss": 0.3539, "step": 1790 }, { "epoch": 8.520710059171599, "grad_norm": 8.527347564697266, "learning_rate": 4.201775504062594e-05, "loss": 0.3883, "step": 1800 }, { "epoch": 8.568047337278106, "grad_norm": 9.178783416748047, "learning_rate": 4.194252181763467e-05, "loss": 0.4226, "step": 1810 }, { "epoch": 8.615384615384615, "grad_norm": 10.065650939941406, "learning_rate": 4.186728859464339e-05, "loss": 0.3773, "step": 1820 }, { "epoch": 8.662721893491124, "grad_norm": 5.588104724884033, "learning_rate": 4.1792055371652124e-05, "loss": 0.3921, "step": 1830 }, { "epoch": 8.710059171597633, "grad_norm": 4.505855083465576, "learning_rate": 4.171682214866085e-05, "loss": 0.3483, "step": 1840 }, { "epoch": 8.757396449704142, "grad_norm": 10.081398963928223, "learning_rate": 4.164158892566958e-05, "loss": 0.3312, "step": 1850 }, { "epoch": 8.804733727810651, "grad_norm": 7.667760848999023, "learning_rate": 4.1566355702678305e-05, "loss": 0.2838, "step": 1860 }, { "epoch": 8.85207100591716, "grad_norm": 11.876665115356445, "learning_rate": 4.149112247968704e-05, "loss": 0.4328, "step": 1870 }, { "epoch": 8.899408284023668, "grad_norm": 7.79551887512207, "learning_rate": 4.141588925669576e-05, "loss": 0.4616, "step": 1880 }, { "epoch": 8.946745562130177, "grad_norm": 6.006857395172119, "learning_rate": 4.1340656033704486e-05, "loss": 0.3389, "step": 1890 }, { "epoch": 8.994082840236686, "grad_norm": 9.194988250732422, "learning_rate": 4.126542281071321e-05, "loss": 0.3479, "step": 1900 }, { "epoch": 8.998816568047337, "eval_accuracy": 0.9027397260273973, "eval_loss": 0.28129294514656067, "eval_runtime": 6.217, "eval_samples_per_second": 234.84, "eval_steps_per_second": 29.435, "step": 1901 }, { "epoch": 9.041420118343195, "grad_norm": 7.2297163009643555, "learning_rate": 4.1190189587721936e-05, "loss": 0.3223, "step": 1910 }, { "epoch": 9.088757396449704, "grad_norm": 9.67817211151123, "learning_rate": 4.111495636473067e-05, "loss": 0.3681, "step": 1920 }, { "epoch": 9.136094674556213, "grad_norm": 6.748856544494629, "learning_rate": 4.103972314173939e-05, "loss": 0.351, "step": 1930 }, { "epoch": 9.183431952662723, "grad_norm": 3.9139935970306396, "learning_rate": 4.0964489918748124e-05, "loss": 0.39, "step": 1940 }, { "epoch": 9.23076923076923, "grad_norm": 5.222900390625, "learning_rate": 4.088925669575685e-05, "loss": 0.3132, "step": 1950 }, { "epoch": 9.278106508875739, "grad_norm": 11.637986183166504, 
"learning_rate": 4.081402347276558e-05, "loss": 0.3373, "step": 1960 }, { "epoch": 9.325443786982248, "grad_norm": 10.712813377380371, "learning_rate": 4.07387902497743e-05, "loss": 0.3424, "step": 1970 }, { "epoch": 9.372781065088757, "grad_norm": 7.3563947677612305, "learning_rate": 4.066355702678303e-05, "loss": 0.3709, "step": 1980 }, { "epoch": 9.420118343195266, "grad_norm": 8.500737190246582, "learning_rate": 4.0588323803791755e-05, "loss": 0.3398, "step": 1990 }, { "epoch": 9.467455621301776, "grad_norm": 10.802979469299316, "learning_rate": 4.0513090580800486e-05, "loss": 0.308, "step": 2000 }, { "epoch": 9.514792899408285, "grad_norm": 7.362417697906494, "learning_rate": 4.043785735780921e-05, "loss": 0.3193, "step": 2010 }, { "epoch": 9.562130177514792, "grad_norm": 5.569155693054199, "learning_rate": 4.0362624134817936e-05, "loss": 0.3028, "step": 2020 }, { "epoch": 9.609467455621301, "grad_norm": 8.995447158813477, "learning_rate": 4.028739091182667e-05, "loss": 0.4206, "step": 2030 }, { "epoch": 9.65680473372781, "grad_norm": 5.864706993103027, "learning_rate": 4.021215768883539e-05, "loss": 0.2987, "step": 2040 }, { "epoch": 9.70414201183432, "grad_norm": 8.34255313873291, "learning_rate": 4.013692446584412e-05, "loss": 0.4161, "step": 2050 }, { "epoch": 9.751479289940828, "grad_norm": 8.392521858215332, "learning_rate": 4.006169124285284e-05, "loss": 0.4073, "step": 2060 }, { "epoch": 9.798816568047338, "grad_norm": 6.388725280761719, "learning_rate": 3.998645801986157e-05, "loss": 0.3513, "step": 2070 }, { "epoch": 9.846153846153847, "grad_norm": 5.696859836578369, "learning_rate": 3.99112247968703e-05, "loss": 0.3219, "step": 2080 }, { "epoch": 9.893491124260356, "grad_norm": 8.325499534606934, "learning_rate": 3.983599157387903e-05, "loss": 0.394, "step": 2090 }, { "epoch": 9.940828402366863, "grad_norm": 11.819910049438477, "learning_rate": 3.9760758350887755e-05, "loss": 0.4085, "step": 2100 }, { "epoch": 9.988165680473372, "grad_norm": 6.419707298278809, "learning_rate": 3.9685525127896486e-05, "loss": 0.3429, "step": 2110 }, { "epoch": 9.997633136094674, "eval_accuracy": 0.8924657534246575, "eval_loss": 0.3318786323070526, "eval_runtime": 6.1733, "eval_samples_per_second": 236.501, "eval_steps_per_second": 29.644, "step": 2112 }, { "epoch": 10.035502958579881, "grad_norm": 4.879507064819336, "learning_rate": 3.9610291904905204e-05, "loss": 0.2607, "step": 2120 }, { "epoch": 10.08284023668639, "grad_norm": 10.089688301086426, "learning_rate": 3.9535058681913936e-05, "loss": 0.3887, "step": 2130 }, { "epoch": 10.1301775147929, "grad_norm": 6.6358819007873535, "learning_rate": 3.945982545892266e-05, "loss": 0.3926, "step": 2140 }, { "epoch": 10.177514792899409, "grad_norm": 4.718569755554199, "learning_rate": 3.938459223593139e-05, "loss": 0.2977, "step": 2150 }, { "epoch": 10.224852071005918, "grad_norm": 4.798628807067871, "learning_rate": 3.930935901294012e-05, "loss": 0.3167, "step": 2160 }, { "epoch": 10.272189349112425, "grad_norm": 12.527241706848145, "learning_rate": 3.923412578994884e-05, "loss": 0.3498, "step": 2170 }, { "epoch": 10.319526627218934, "grad_norm": 19.981807708740234, "learning_rate": 3.915889256695757e-05, "loss": 0.3791, "step": 2180 }, { "epoch": 10.366863905325443, "grad_norm": 5.31036901473999, "learning_rate": 3.90836593439663e-05, "loss": 0.3635, "step": 2190 }, { "epoch": 10.414201183431953, "grad_norm": 7.329598426818848, "learning_rate": 3.900842612097502e-05, "loss": 0.2612, "step": 2200 }, { "epoch": 10.461538461538462, "grad_norm": 
10.241847038269043, "learning_rate": 3.893319289798375e-05, "loss": 0.3508, "step": 2210 }, { "epoch": 10.50887573964497, "grad_norm": 9.222640991210938, "learning_rate": 3.885795967499248e-05, "loss": 0.4113, "step": 2220 }, { "epoch": 10.55621301775148, "grad_norm": 5.4523115158081055, "learning_rate": 3.8782726452001204e-05, "loss": 0.312, "step": 2230 }, { "epoch": 10.603550295857989, "grad_norm": 25.376020431518555, "learning_rate": 3.8707493229009936e-05, "loss": 0.382, "step": 2240 }, { "epoch": 10.650887573964496, "grad_norm": 7.494572162628174, "learning_rate": 3.863226000601866e-05, "loss": 0.3078, "step": 2250 }, { "epoch": 10.698224852071005, "grad_norm": 9.24726390838623, "learning_rate": 3.855702678302739e-05, "loss": 0.3368, "step": 2260 }, { "epoch": 10.745562130177515, "grad_norm": 7.74558162689209, "learning_rate": 3.848179356003611e-05, "loss": 0.2912, "step": 2270 }, { "epoch": 10.792899408284024, "grad_norm": 7.557544708251953, "learning_rate": 3.840656033704484e-05, "loss": 0.3268, "step": 2280 }, { "epoch": 10.840236686390533, "grad_norm": 9.215229988098145, "learning_rate": 3.8331327114053566e-05, "loss": 0.4372, "step": 2290 }, { "epoch": 10.887573964497042, "grad_norm": 9.268451690673828, "learning_rate": 3.82560938910623e-05, "loss": 0.3564, "step": 2300 }, { "epoch": 10.934911242603551, "grad_norm": 4.07456111907959, "learning_rate": 3.818086066807102e-05, "loss": 0.3003, "step": 2310 }, { "epoch": 10.982248520710058, "grad_norm": 8.930679321289062, "learning_rate": 3.810562744507975e-05, "loss": 0.3341, "step": 2320 }, { "epoch": 10.996449704142012, "eval_accuracy": 0.8972602739726028, "eval_loss": 0.2900165021419525, "eval_runtime": 6.2027, "eval_samples_per_second": 235.379, "eval_steps_per_second": 29.503, "step": 2323 }, { "epoch": 11.029585798816568, "grad_norm": 7.869425296783447, "learning_rate": 3.803039422208848e-05, "loss": 0.3134, "step": 2330 }, { "epoch": 11.076923076923077, "grad_norm": 8.941612243652344, "learning_rate": 3.7955160999097204e-05, "loss": 0.3465, "step": 2340 }, { "epoch": 11.124260355029586, "grad_norm": 8.30190372467041, "learning_rate": 3.787992777610593e-05, "loss": 0.2489, "step": 2350 }, { "epoch": 11.171597633136095, "grad_norm": 8.490402221679688, "learning_rate": 3.7804694553114653e-05, "loss": 0.326, "step": 2360 }, { "epoch": 11.218934911242604, "grad_norm": 19.662193298339844, "learning_rate": 3.7729461330123385e-05, "loss": 0.3444, "step": 2370 }, { "epoch": 11.266272189349113, "grad_norm": 9.445649147033691, "learning_rate": 3.765422810713211e-05, "loss": 0.3185, "step": 2380 }, { "epoch": 11.31360946745562, "grad_norm": 4.701760292053223, "learning_rate": 3.757899488414084e-05, "loss": 0.3665, "step": 2390 }, { "epoch": 11.36094674556213, "grad_norm": 5.095606327056885, "learning_rate": 3.7503761661149566e-05, "loss": 0.2736, "step": 2400 }, { "epoch": 11.408284023668639, "grad_norm": 10.870713233947754, "learning_rate": 3.74285284381583e-05, "loss": 0.2966, "step": 2410 }, { "epoch": 11.455621301775148, "grad_norm": 6.850511074066162, "learning_rate": 3.7353295215167016e-05, "loss": 0.2624, "step": 2420 }, { "epoch": 11.502958579881657, "grad_norm": 10.627695083618164, "learning_rate": 3.727806199217575e-05, "loss": 0.3767, "step": 2430 }, { "epoch": 11.550295857988166, "grad_norm": 8.704399108886719, "learning_rate": 3.720282876918447e-05, "loss": 0.3127, "step": 2440 }, { "epoch": 11.597633136094675, "grad_norm": 7.4766716957092285, "learning_rate": 3.71275955461932e-05, "loss": 0.3015, "step": 2450 }, { "epoch": 
11.644970414201183, "grad_norm": 8.510762214660645, "learning_rate": 3.705236232320193e-05, "loss": 0.3406, "step": 2460 }, { "epoch": 11.692307692307692, "grad_norm": 9.42719841003418, "learning_rate": 3.697712910021065e-05, "loss": 0.3085, "step": 2470 }, { "epoch": 11.7396449704142, "grad_norm": 6.386455535888672, "learning_rate": 3.6901895877219385e-05, "loss": 0.3426, "step": 2480 }, { "epoch": 11.78698224852071, "grad_norm": 7.612992286682129, "learning_rate": 3.682666265422811e-05, "loss": 0.3567, "step": 2490 }, { "epoch": 11.834319526627219, "grad_norm": 8.440069198608398, "learning_rate": 3.6751429431236835e-05, "loss": 0.3288, "step": 2500 }, { "epoch": 11.881656804733728, "grad_norm": 7.730615615844727, "learning_rate": 3.667619620824556e-05, "loss": 0.3253, "step": 2510 }, { "epoch": 11.928994082840237, "grad_norm": 7.29069185256958, "learning_rate": 3.660096298525429e-05, "loss": 0.306, "step": 2520 }, { "epoch": 11.976331360946746, "grad_norm": 8.983368873596191, "learning_rate": 3.6525729762263016e-05, "loss": 0.2937, "step": 2530 }, { "epoch": 12.0, "eval_accuracy": 0.8993150684931507, "eval_loss": 0.3500230312347412, "eval_runtime": 6.3178, "eval_samples_per_second": 231.093, "eval_steps_per_second": 28.966, "step": 2535 }, { "epoch": 12.023668639053254, "grad_norm": 6.368637561798096, "learning_rate": 3.645049653927175e-05, "loss": 0.3998, "step": 2540 }, { "epoch": 12.071005917159763, "grad_norm": 14.744524955749512, "learning_rate": 3.637526331628047e-05, "loss": 0.324, "step": 2550 }, { "epoch": 12.118343195266272, "grad_norm": 4.304303169250488, "learning_rate": 3.63000300932892e-05, "loss": 0.3538, "step": 2560 }, { "epoch": 12.165680473372781, "grad_norm": 11.705492973327637, "learning_rate": 3.622479687029793e-05, "loss": 0.3422, "step": 2570 }, { "epoch": 12.21301775147929, "grad_norm": 9.357977867126465, "learning_rate": 3.614956364730665e-05, "loss": 0.2732, "step": 2580 }, { "epoch": 12.2603550295858, "grad_norm": 12.46599006652832, "learning_rate": 3.607433042431538e-05, "loss": 0.4473, "step": 2590 }, { "epoch": 12.307692307692308, "grad_norm": 20.074487686157227, "learning_rate": 3.59990972013241e-05, "loss": 0.2837, "step": 2600 }, { "epoch": 12.355029585798816, "grad_norm": 4.281162738800049, "learning_rate": 3.5923863978332834e-05, "loss": 0.3035, "step": 2610 }, { "epoch": 12.402366863905325, "grad_norm": 10.390352249145508, "learning_rate": 3.584863075534156e-05, "loss": 0.2636, "step": 2620 }, { "epoch": 12.449704142011834, "grad_norm": 3.76784348487854, "learning_rate": 3.577339753235029e-05, "loss": 0.3388, "step": 2630 }, { "epoch": 12.497041420118343, "grad_norm": 9.673295021057129, "learning_rate": 3.5698164309359016e-05, "loss": 0.2947, "step": 2640 }, { "epoch": 12.544378698224852, "grad_norm": 6.6694722175598145, "learning_rate": 3.562293108636774e-05, "loss": 0.3453, "step": 2650 }, { "epoch": 12.591715976331361, "grad_norm": 7.178610324859619, "learning_rate": 3.5547697863376465e-05, "loss": 0.3383, "step": 2660 }, { "epoch": 12.63905325443787, "grad_norm": 10.715120315551758, "learning_rate": 3.54724646403852e-05, "loss": 0.3222, "step": 2670 }, { "epoch": 12.68639053254438, "grad_norm": 6.3047285079956055, "learning_rate": 3.539723141739392e-05, "loss": 0.3521, "step": 2680 }, { "epoch": 12.733727810650887, "grad_norm": 6.073225021362305, "learning_rate": 3.532199819440265e-05, "loss": 0.2904, "step": 2690 }, { "epoch": 12.781065088757396, "grad_norm": 9.05847454071045, "learning_rate": 3.524676497141138e-05, "loss": 0.3764, "step": 2700 
}, { "epoch": 12.828402366863905, "grad_norm": 6.264795303344727, "learning_rate": 3.51715317484201e-05, "loss": 0.3159, "step": 2710 }, { "epoch": 12.875739644970414, "grad_norm": 7.125365257263184, "learning_rate": 3.5096298525428834e-05, "loss": 0.2996, "step": 2720 }, { "epoch": 12.923076923076923, "grad_norm": 9.880492210388184, "learning_rate": 3.502106530243755e-05, "loss": 0.3283, "step": 2730 }, { "epoch": 12.970414201183432, "grad_norm": 14.802063941955566, "learning_rate": 3.4945832079446284e-05, "loss": 0.3478, "step": 2740 }, { "epoch": 12.998816568047337, "eval_accuracy": 0.9013698630136986, "eval_loss": 0.3168272078037262, "eval_runtime": 6.174, "eval_samples_per_second": 236.475, "eval_steps_per_second": 29.64, "step": 2746 }, { "epoch": 13.017751479289942, "grad_norm": 9.61425495147705, "learning_rate": 3.487059885645501e-05, "loss": 0.2884, "step": 2750 }, { "epoch": 13.065088757396449, "grad_norm": 7.737671375274658, "learning_rate": 3.479536563346374e-05, "loss": 0.3732, "step": 2760 }, { "epoch": 13.112426035502958, "grad_norm": 7.558273792266846, "learning_rate": 3.4720132410472465e-05, "loss": 0.2859, "step": 2770 }, { "epoch": 13.159763313609467, "grad_norm": 7.560544013977051, "learning_rate": 3.46448991874812e-05, "loss": 0.2986, "step": 2780 }, { "epoch": 13.207100591715976, "grad_norm": 7.7973480224609375, "learning_rate": 3.456966596448992e-05, "loss": 0.291, "step": 2790 }, { "epoch": 13.254437869822485, "grad_norm": 9.302266120910645, "learning_rate": 3.449443274149865e-05, "loss": 0.3669, "step": 2800 }, { "epoch": 13.301775147928995, "grad_norm": 5.183737277984619, "learning_rate": 3.441919951850737e-05, "loss": 0.293, "step": 2810 }, { "epoch": 13.349112426035504, "grad_norm": 6.064436912536621, "learning_rate": 3.43439662955161e-05, "loss": 0.2944, "step": 2820 }, { "epoch": 13.396449704142011, "grad_norm": 9.409137725830078, "learning_rate": 3.426873307252483e-05, "loss": 0.3103, "step": 2830 }, { "epoch": 13.44378698224852, "grad_norm": 20.371089935302734, "learning_rate": 3.419349984953356e-05, "loss": 0.2879, "step": 2840 }, { "epoch": 13.49112426035503, "grad_norm": 9.97218132019043, "learning_rate": 3.4118266626542284e-05, "loss": 0.2959, "step": 2850 }, { "epoch": 13.538461538461538, "grad_norm": 7.915639400482178, "learning_rate": 3.404303340355101e-05, "loss": 0.2929, "step": 2860 }, { "epoch": 13.585798816568047, "grad_norm": 6.3162641525268555, "learning_rate": 3.396780018055974e-05, "loss": 0.2773, "step": 2870 }, { "epoch": 13.633136094674557, "grad_norm": 7.813812732696533, "learning_rate": 3.389256695756846e-05, "loss": 0.2795, "step": 2880 }, { "epoch": 13.680473372781066, "grad_norm": 13.80722427368164, "learning_rate": 3.381733373457719e-05, "loss": 0.3648, "step": 2890 }, { "epoch": 13.727810650887575, "grad_norm": 9.83273696899414, "learning_rate": 3.3742100511585915e-05, "loss": 0.2952, "step": 2900 }, { "epoch": 13.775147928994082, "grad_norm": 10.903112411499023, "learning_rate": 3.3666867288594646e-05, "loss": 0.269, "step": 2910 }, { "epoch": 13.822485207100591, "grad_norm": 4.992847919464111, "learning_rate": 3.359163406560337e-05, "loss": 0.3689, "step": 2920 }, { "epoch": 13.8698224852071, "grad_norm": 7.029762268066406, "learning_rate": 3.35164008426121e-05, "loss": 0.3296, "step": 2930 }, { "epoch": 13.91715976331361, "grad_norm": 15.533370018005371, "learning_rate": 3.344116761962083e-05, "loss": 0.2764, "step": 2940 }, { "epoch": 13.964497041420119, "grad_norm": 14.553123474121094, "learning_rate": 3.336593439662956e-05, 
"loss": 0.3148, "step": 2950 }, { "epoch": 13.997633136094674, "eval_accuracy": 0.9054794520547945, "eval_loss": 0.3071611225605011, "eval_runtime": 6.0563, "eval_samples_per_second": 241.071, "eval_steps_per_second": 30.216, "step": 2957 }, { "epoch": 14.011834319526628, "grad_norm": 6.9089035987854, "learning_rate": 3.329070117363828e-05, "loss": 0.3318, "step": 2960 }, { "epoch": 14.059171597633137, "grad_norm": 7.897435665130615, "learning_rate": 3.321546795064701e-05, "loss": 0.2537, "step": 2970 }, { "epoch": 14.106508875739644, "grad_norm": 12.082826614379883, "learning_rate": 3.314023472765573e-05, "loss": 0.2685, "step": 2980 }, { "epoch": 14.153846153846153, "grad_norm": 8.465901374816895, "learning_rate": 3.306500150466446e-05, "loss": 0.3849, "step": 2990 }, { "epoch": 14.201183431952662, "grad_norm": 9.606731414794922, "learning_rate": 3.298976828167319e-05, "loss": 0.3219, "step": 3000 }, { "epoch": 14.248520710059172, "grad_norm": 5.763510704040527, "learning_rate": 3.2914535058681914e-05, "loss": 0.2798, "step": 3010 }, { "epoch": 14.29585798816568, "grad_norm": 7.898010730743408, "learning_rate": 3.2839301835690646e-05, "loss": 0.353, "step": 3020 }, { "epoch": 14.34319526627219, "grad_norm": 4.139184951782227, "learning_rate": 3.276406861269937e-05, "loss": 0.3145, "step": 3030 }, { "epoch": 14.390532544378699, "grad_norm": 10.472068786621094, "learning_rate": 3.2688835389708096e-05, "loss": 0.2997, "step": 3040 }, { "epoch": 14.437869822485208, "grad_norm": 6.952048301696777, "learning_rate": 3.261360216671682e-05, "loss": 0.2931, "step": 3050 }, { "epoch": 14.485207100591715, "grad_norm": 11.008207321166992, "learning_rate": 3.253836894372555e-05, "loss": 0.2891, "step": 3060 }, { "epoch": 14.532544378698224, "grad_norm": 4.314377784729004, "learning_rate": 3.246313572073428e-05, "loss": 0.2922, "step": 3070 }, { "epoch": 14.579881656804734, "grad_norm": 6.738071441650391, "learning_rate": 3.238790249774301e-05, "loss": 0.2226, "step": 3080 }, { "epoch": 14.627218934911243, "grad_norm": 5.609333038330078, "learning_rate": 3.231266927475173e-05, "loss": 0.2366, "step": 3090 }, { "epoch": 14.674556213017752, "grad_norm": 5.399454116821289, "learning_rate": 3.223743605176046e-05, "loss": 0.32, "step": 3100 }, { "epoch": 14.721893491124261, "grad_norm": 13.962152481079102, "learning_rate": 3.216220282876918e-05, "loss": 0.3652, "step": 3110 }, { "epoch": 14.76923076923077, "grad_norm": 8.14931869506836, "learning_rate": 3.2086969605777914e-05, "loss": 0.2513, "step": 3120 }, { "epoch": 14.816568047337277, "grad_norm": 6.72014045715332, "learning_rate": 3.201173638278664e-05, "loss": 0.3068, "step": 3130 }, { "epoch": 14.863905325443787, "grad_norm": 9.025717735290527, "learning_rate": 3.1936503159795364e-05, "loss": 0.2845, "step": 3140 }, { "epoch": 14.911242603550296, "grad_norm": 3.6108787059783936, "learning_rate": 3.1861269936804096e-05, "loss": 0.2868, "step": 3150 }, { "epoch": 14.958579881656805, "grad_norm": 12.648404121398926, "learning_rate": 3.178603671381282e-05, "loss": 0.2896, "step": 3160 }, { "epoch": 14.996449704142012, "eval_accuracy": 0.9061643835616439, "eval_loss": 0.30652791261672974, "eval_runtime": 6.136, "eval_samples_per_second": 237.938, "eval_steps_per_second": 29.824, "step": 3168 }, { "epoch": 15.005917159763314, "grad_norm": 5.476109027862549, "learning_rate": 3.171080349082155e-05, "loss": 0.3452, "step": 3170 }, { "epoch": 15.053254437869823, "grad_norm": 8.330878257751465, "learning_rate": 3.163557026783028e-05, "loss": 0.2456, "step": 
3180 }, { "epoch": 15.100591715976332, "grad_norm": 8.56313705444336, "learning_rate": 3.1560337044839e-05, "loss": 0.2296, "step": 3190 }, { "epoch": 15.14792899408284, "grad_norm": 10.402885437011719, "learning_rate": 3.1485103821847726e-05, "loss": 0.2862, "step": 3200 }, { "epoch": 15.195266272189349, "grad_norm": 7.497808933258057, "learning_rate": 3.140987059885646e-05, "loss": 0.3389, "step": 3210 }, { "epoch": 15.242603550295858, "grad_norm": 7.207127094268799, "learning_rate": 3.133463737586518e-05, "loss": 0.2575, "step": 3220 }, { "epoch": 15.289940828402367, "grad_norm": 4.729502201080322, "learning_rate": 3.1259404152873914e-05, "loss": 0.2308, "step": 3230 }, { "epoch": 15.337278106508876, "grad_norm": 10.251791954040527, "learning_rate": 3.118417092988264e-05, "loss": 0.2549, "step": 3240 }, { "epoch": 15.384615384615385, "grad_norm": 4.962519645690918, "learning_rate": 3.1108937706891364e-05, "loss": 0.2448, "step": 3250 }, { "epoch": 15.431952662721894, "grad_norm": 8.956313133239746, "learning_rate": 3.103370448390009e-05, "loss": 0.2278, "step": 3260 }, { "epoch": 15.479289940828401, "grad_norm": 5.445577144622803, "learning_rate": 3.0958471260908813e-05, "loss": 0.3195, "step": 3270 }, { "epoch": 15.52662721893491, "grad_norm": 8.691884994506836, "learning_rate": 3.0883238037917545e-05, "loss": 0.2816, "step": 3280 }, { "epoch": 15.57396449704142, "grad_norm": 4.890760898590088, "learning_rate": 3.080800481492627e-05, "loss": 0.2479, "step": 3290 }, { "epoch": 15.621301775147929, "grad_norm": 10.502642631530762, "learning_rate": 3.0732771591935e-05, "loss": 0.2368, "step": 3300 }, { "epoch": 15.668639053254438, "grad_norm": 11.197770118713379, "learning_rate": 3.0657538368943726e-05, "loss": 0.396, "step": 3310 }, { "epoch": 15.715976331360947, "grad_norm": 7.301953315734863, "learning_rate": 3.058230514595246e-05, "loss": 0.2605, "step": 3320 }, { "epoch": 15.763313609467456, "grad_norm": 9.391778945922852, "learning_rate": 3.0507071922961183e-05, "loss": 0.2318, "step": 3330 }, { "epoch": 15.810650887573965, "grad_norm": 11.96308708190918, "learning_rate": 3.0431838699969904e-05, "loss": 0.3574, "step": 3340 }, { "epoch": 15.857988165680473, "grad_norm": 6.631661415100098, "learning_rate": 3.0356605476978632e-05, "loss": 0.2773, "step": 3350 }, { "epoch": 15.905325443786982, "grad_norm": 7.179072380065918, "learning_rate": 3.028137225398736e-05, "loss": 0.3573, "step": 3360 }, { "epoch": 15.95266272189349, "grad_norm": 9.855470657348633, "learning_rate": 3.020613903099609e-05, "loss": 0.3077, "step": 3370 }, { "epoch": 16.0, "grad_norm": 4.808469772338867, "learning_rate": 3.0130905808004817e-05, "loss": 0.3149, "step": 3380 }, { "epoch": 16.0, "eval_accuracy": 0.9082191780821918, "eval_loss": 0.2928474545478821, "eval_runtime": 6.1031, "eval_samples_per_second": 239.221, "eval_steps_per_second": 29.985, "step": 3380 }, { "epoch": 16.047337278106507, "grad_norm": 5.927903175354004, "learning_rate": 3.0055672585013545e-05, "loss": 0.2194, "step": 3390 }, { "epoch": 16.09467455621302, "grad_norm": 9.440893173217773, "learning_rate": 2.9980439362022273e-05, "loss": 0.2311, "step": 3400 }, { "epoch": 16.142011834319526, "grad_norm": 10.132343292236328, "learning_rate": 2.9905206139031e-05, "loss": 0.2608, "step": 3410 }, { "epoch": 16.189349112426036, "grad_norm": 9.294024467468262, "learning_rate": 2.9829972916039723e-05, "loss": 0.3056, "step": 3420 }, { "epoch": 16.236686390532544, "grad_norm": 6.507917404174805, "learning_rate": 2.975473969304845e-05, "loss": 0.1905, 
"step": 3430 }, { "epoch": 16.284023668639055, "grad_norm": 8.411003112792969, "learning_rate": 2.967950647005718e-05, "loss": 0.3232, "step": 3440 }, { "epoch": 16.331360946745562, "grad_norm": 5.495641708374023, "learning_rate": 2.9604273247065907e-05, "loss": 0.2718, "step": 3450 }, { "epoch": 16.37869822485207, "grad_norm": 9.734967231750488, "learning_rate": 2.9529040024074632e-05, "loss": 0.2966, "step": 3460 }, { "epoch": 16.42603550295858, "grad_norm": 3.004697799682617, "learning_rate": 2.945380680108336e-05, "loss": 0.2234, "step": 3470 }, { "epoch": 16.473372781065088, "grad_norm": 13.730050086975098, "learning_rate": 2.937857357809209e-05, "loss": 0.3209, "step": 3480 }, { "epoch": 16.5207100591716, "grad_norm": 5.133395195007324, "learning_rate": 2.930334035510081e-05, "loss": 0.2561, "step": 3490 }, { "epoch": 16.568047337278106, "grad_norm": 5.885538101196289, "learning_rate": 2.9228107132109538e-05, "loss": 0.3166, "step": 3500 }, { "epoch": 16.615384615384617, "grad_norm": 8.295323371887207, "learning_rate": 2.9152873909118266e-05, "loss": 0.2634, "step": 3510 }, { "epoch": 16.662721893491124, "grad_norm": 8.664441108703613, "learning_rate": 2.9077640686126994e-05, "loss": 0.2488, "step": 3520 }, { "epoch": 16.71005917159763, "grad_norm": 13.536978721618652, "learning_rate": 2.9002407463135723e-05, "loss": 0.2616, "step": 3530 }, { "epoch": 16.757396449704142, "grad_norm": 8.778542518615723, "learning_rate": 2.892717424014445e-05, "loss": 0.3111, "step": 3540 }, { "epoch": 16.80473372781065, "grad_norm": 4.70704460144043, "learning_rate": 2.885194101715318e-05, "loss": 0.2381, "step": 3550 }, { "epoch": 16.85207100591716, "grad_norm": 13.269988059997559, "learning_rate": 2.8776707794161907e-05, "loss": 0.2824, "step": 3560 }, { "epoch": 16.899408284023668, "grad_norm": 3.4718408584594727, "learning_rate": 2.870147457117063e-05, "loss": 0.2517, "step": 3570 }, { "epoch": 16.94674556213018, "grad_norm": 4.911701679229736, "learning_rate": 2.8626241348179357e-05, "loss": 0.2842, "step": 3580 }, { "epoch": 16.994082840236686, "grad_norm": 14.3350248336792, "learning_rate": 2.8551008125188085e-05, "loss": 0.2734, "step": 3590 }, { "epoch": 16.99881656804734, "eval_accuracy": 0.9095890410958904, "eval_loss": 0.2769572138786316, "eval_runtime": 6.3128, "eval_samples_per_second": 231.275, "eval_steps_per_second": 28.989, "step": 3591 }, { "epoch": 17.041420118343197, "grad_norm": 9.578266143798828, "learning_rate": 2.847577490219681e-05, "loss": 0.2216, "step": 3600 }, { "epoch": 17.088757396449704, "grad_norm": 10.65328311920166, "learning_rate": 2.8400541679205538e-05, "loss": 0.2713, "step": 3610 }, { "epoch": 17.13609467455621, "grad_norm": 13.547807693481445, "learning_rate": 2.8325308456214266e-05, "loss": 0.2578, "step": 3620 }, { "epoch": 17.183431952662723, "grad_norm": 5.553393363952637, "learning_rate": 2.8250075233222994e-05, "loss": 0.3016, "step": 3630 }, { "epoch": 17.23076923076923, "grad_norm": 9.82513427734375, "learning_rate": 2.8174842010231723e-05, "loss": 0.281, "step": 3640 }, { "epoch": 17.27810650887574, "grad_norm": 3.8038620948791504, "learning_rate": 2.8099608787240444e-05, "loss": 0.2876, "step": 3650 }, { "epoch": 17.325443786982248, "grad_norm": 4.463418006896973, "learning_rate": 2.8024375564249172e-05, "loss": 0.2434, "step": 3660 }, { "epoch": 17.37278106508876, "grad_norm": 4.446181297302246, "learning_rate": 2.79491423412579e-05, "loss": 0.2434, "step": 3670 }, { "epoch": 17.420118343195266, "grad_norm": 12.428364753723145, "learning_rate": 
2.787390911826663e-05, "loss": 0.2706, "step": 3680 }, { "epoch": 17.467455621301774, "grad_norm": 9.818281173706055, "learning_rate": 2.7798675895275357e-05, "loss": 0.232, "step": 3690 }, { "epoch": 17.514792899408285, "grad_norm": 19.56150245666504, "learning_rate": 2.7723442672284085e-05, "loss": 0.2981, "step": 3700 }, { "epoch": 17.562130177514792, "grad_norm": 8.730667114257812, "learning_rate": 2.764820944929281e-05, "loss": 0.2427, "step": 3710 }, { "epoch": 17.609467455621303, "grad_norm": 11.973594665527344, "learning_rate": 2.7572976226301534e-05, "loss": 0.2359, "step": 3720 }, { "epoch": 17.65680473372781, "grad_norm": 2.578996419906616, "learning_rate": 2.7497743003310263e-05, "loss": 0.2783, "step": 3730 }, { "epoch": 17.70414201183432, "grad_norm": 9.876580238342285, "learning_rate": 2.7422509780318987e-05, "loss": 0.2268, "step": 3740 }, { "epoch": 17.75147928994083, "grad_norm": 5.562457084655762, "learning_rate": 2.7347276557327716e-05, "loss": 0.2296, "step": 3750 }, { "epoch": 17.798816568047336, "grad_norm": 6.533483505249023, "learning_rate": 2.7272043334336444e-05, "loss": 0.2818, "step": 3760 }, { "epoch": 17.846153846153847, "grad_norm": 7.880773544311523, "learning_rate": 2.7196810111345172e-05, "loss": 0.2865, "step": 3770 }, { "epoch": 17.893491124260354, "grad_norm": 13.510115623474121, "learning_rate": 2.71215768883539e-05, "loss": 0.3133, "step": 3780 }, { "epoch": 17.940828402366865, "grad_norm": 6.314772605895996, "learning_rate": 2.704634366536263e-05, "loss": 0.2102, "step": 3790 }, { "epoch": 17.988165680473372, "grad_norm": 4.932859420776367, "learning_rate": 2.697111044237135e-05, "loss": 0.2344, "step": 3800 }, { "epoch": 17.997633136094674, "eval_accuracy": 0.8952054794520548, "eval_loss": 0.3737930953502655, "eval_runtime": 6.2965, "eval_samples_per_second": 231.875, "eval_steps_per_second": 29.064, "step": 3802 }, { "epoch": 18.035502958579883, "grad_norm": 8.163798332214355, "learning_rate": 2.6895877219380078e-05, "loss": 0.349, "step": 3810 }, { "epoch": 18.08284023668639, "grad_norm": 8.841765403747559, "learning_rate": 2.6820643996388806e-05, "loss": 0.2864, "step": 3820 }, { "epoch": 18.130177514792898, "grad_norm": 5.997651100158691, "learning_rate": 2.6745410773397534e-05, "loss": 0.2941, "step": 3830 }, { "epoch": 18.17751479289941, "grad_norm": 5.4760332107543945, "learning_rate": 2.6670177550406263e-05, "loss": 0.2216, "step": 3840 }, { "epoch": 18.224852071005916, "grad_norm": 6.478240489959717, "learning_rate": 2.6594944327414987e-05, "loss": 0.2874, "step": 3850 }, { "epoch": 18.272189349112427, "grad_norm": 12.63205623626709, "learning_rate": 2.6519711104423716e-05, "loss": 0.2338, "step": 3860 }, { "epoch": 18.319526627218934, "grad_norm": 9.010831832885742, "learning_rate": 2.6444477881432444e-05, "loss": 0.3293, "step": 3870 }, { "epoch": 18.366863905325445, "grad_norm": 6.102337837219238, "learning_rate": 2.6369244658441165e-05, "loss": 0.3229, "step": 3880 }, { "epoch": 18.414201183431953, "grad_norm": 9.948938369750977, "learning_rate": 2.6294011435449893e-05, "loss": 0.2604, "step": 3890 }, { "epoch": 18.46153846153846, "grad_norm": 8.575167655944824, "learning_rate": 2.621877821245862e-05, "loss": 0.2205, "step": 3900 }, { "epoch": 18.50887573964497, "grad_norm": 7.808337688446045, "learning_rate": 2.614354498946735e-05, "loss": 0.1802, "step": 3910 }, { "epoch": 18.556213017751478, "grad_norm": 11.38652515411377, "learning_rate": 2.6068311766476078e-05, "loss": 0.2161, "step": 3920 }, { "epoch": 18.60355029585799, 
"grad_norm": 7.173455715179443, "learning_rate": 2.5993078543484806e-05, "loss": 0.2973, "step": 3930 }, { "epoch": 18.650887573964496, "grad_norm": 10.973929405212402, "learning_rate": 2.5917845320493534e-05, "loss": 0.2557, "step": 3940 }, { "epoch": 18.698224852071007, "grad_norm": 6.697062015533447, "learning_rate": 2.5842612097502256e-05, "loss": 0.2371, "step": 3950 }, { "epoch": 18.745562130177515, "grad_norm": 11.82797908782959, "learning_rate": 2.5767378874510984e-05, "loss": 0.2639, "step": 3960 }, { "epoch": 18.792899408284022, "grad_norm": 4.322720050811768, "learning_rate": 2.5692145651519712e-05, "loss": 0.2212, "step": 3970 }, { "epoch": 18.840236686390533, "grad_norm": 5.201810836791992, "learning_rate": 2.561691242852844e-05, "loss": 0.2003, "step": 3980 }, { "epoch": 18.88757396449704, "grad_norm": 7.236006736755371, "learning_rate": 2.554167920553717e-05, "loss": 0.3897, "step": 3990 }, { "epoch": 18.93491124260355, "grad_norm": 7.327210426330566, "learning_rate": 2.5466445982545893e-05, "loss": 0.1939, "step": 4000 }, { "epoch": 18.98224852071006, "grad_norm": 16.192811965942383, "learning_rate": 2.539121275955462e-05, "loss": 0.2872, "step": 4010 }, { "epoch": 18.996449704142012, "eval_accuracy": 0.9061643835616439, "eval_loss": 0.3222917914390564, "eval_runtime": 6.2252, "eval_samples_per_second": 234.532, "eval_steps_per_second": 29.397, "step": 4013 }, { "epoch": 19.02958579881657, "grad_norm": 14.001523971557617, "learning_rate": 2.531597953656335e-05, "loss": 0.2899, "step": 4020 }, { "epoch": 19.076923076923077, "grad_norm": 12.866436004638672, "learning_rate": 2.524074631357207e-05, "loss": 0.2249, "step": 4030 }, { "epoch": 19.124260355029588, "grad_norm": 12.653215408325195, "learning_rate": 2.51655130905808e-05, "loss": 0.2008, "step": 4040 }, { "epoch": 19.171597633136095, "grad_norm": 6.0526604652404785, "learning_rate": 2.5090279867589527e-05, "loss": 0.2264, "step": 4050 }, { "epoch": 19.218934911242602, "grad_norm": 7.189617156982422, "learning_rate": 2.5015046644598256e-05, "loss": 0.2785, "step": 4060 }, { "epoch": 19.266272189349113, "grad_norm": 6.08707332611084, "learning_rate": 2.4939813421606984e-05, "loss": 0.2666, "step": 4070 }, { "epoch": 19.31360946745562, "grad_norm": 8.810041427612305, "learning_rate": 2.4864580198615712e-05, "loss": 0.2561, "step": 4080 }, { "epoch": 19.36094674556213, "grad_norm": 5.877760410308838, "learning_rate": 2.4789346975624437e-05, "loss": 0.1829, "step": 4090 }, { "epoch": 19.40828402366864, "grad_norm": 4.540722846984863, "learning_rate": 2.4714113752633165e-05, "loss": 0.2082, "step": 4100 }, { "epoch": 19.45562130177515, "grad_norm": 10.91895866394043, "learning_rate": 2.4638880529641893e-05, "loss": 0.2264, "step": 4110 }, { "epoch": 19.502958579881657, "grad_norm": 18.722084045410156, "learning_rate": 2.4563647306650618e-05, "loss": 0.2649, "step": 4120 }, { "epoch": 19.550295857988164, "grad_norm": 5.907430648803711, "learning_rate": 2.4488414083659346e-05, "loss": 0.1769, "step": 4130 }, { "epoch": 19.597633136094675, "grad_norm": 12.51977825164795, "learning_rate": 2.441318086066807e-05, "loss": 0.2895, "step": 4140 }, { "epoch": 19.644970414201183, "grad_norm": 9.822182655334473, "learning_rate": 2.43379476376768e-05, "loss": 0.2349, "step": 4150 }, { "epoch": 19.692307692307693, "grad_norm": 6.536006450653076, "learning_rate": 2.4262714414685524e-05, "loss": 0.2949, "step": 4160 }, { "epoch": 19.7396449704142, "grad_norm": 6.116447448730469, "learning_rate": 2.4187481191694252e-05, "loss": 0.2438, 
"step": 4170 }, { "epoch": 19.78698224852071, "grad_norm": 8.528430938720703, "learning_rate": 2.411224796870298e-05, "loss": 0.274, "step": 4180 }, { "epoch": 19.83431952662722, "grad_norm": 9.427675247192383, "learning_rate": 2.403701474571171e-05, "loss": 0.2848, "step": 4190 }, { "epoch": 19.881656804733726, "grad_norm": 5.054657459259033, "learning_rate": 2.3961781522720433e-05, "loss": 0.24, "step": 4200 }, { "epoch": 19.928994082840237, "grad_norm": 12.677891731262207, "learning_rate": 2.388654829972916e-05, "loss": 0.2593, "step": 4210 }, { "epoch": 19.976331360946745, "grad_norm": 5.84495735168457, "learning_rate": 2.381131507673789e-05, "loss": 0.2486, "step": 4220 }, { "epoch": 20.0, "eval_accuracy": 0.9068493150684932, "eval_loss": 0.32860177755355835, "eval_runtime": 6.1011, "eval_samples_per_second": 239.302, "eval_steps_per_second": 29.995, "step": 4225 }, { "epoch": 20.023668639053255, "grad_norm": 6.881824970245361, "learning_rate": 2.3736081853746618e-05, "loss": 0.2215, "step": 4230 }, { "epoch": 20.071005917159763, "grad_norm": 10.07770824432373, "learning_rate": 2.3660848630755343e-05, "loss": 0.244, "step": 4240 }, { "epoch": 20.118343195266274, "grad_norm": 4.6197919845581055, "learning_rate": 2.358561540776407e-05, "loss": 0.2289, "step": 4250 }, { "epoch": 20.16568047337278, "grad_norm": 8.33582592010498, "learning_rate": 2.35103821847728e-05, "loss": 0.1889, "step": 4260 }, { "epoch": 20.21301775147929, "grad_norm": 8.195116996765137, "learning_rate": 2.3435148961781524e-05, "loss": 0.2015, "step": 4270 }, { "epoch": 20.2603550295858, "grad_norm": 6.473872661590576, "learning_rate": 2.335991573879025e-05, "loss": 0.2306, "step": 4280 }, { "epoch": 20.307692307692307, "grad_norm": 4.936031341552734, "learning_rate": 2.3284682515798977e-05, "loss": 0.2311, "step": 4290 }, { "epoch": 20.355029585798817, "grad_norm": 16.449352264404297, "learning_rate": 2.3209449292807705e-05, "loss": 0.2129, "step": 4300 }, { "epoch": 20.402366863905325, "grad_norm": 7.029664516448975, "learning_rate": 2.3134216069816433e-05, "loss": 0.2211, "step": 4310 }, { "epoch": 20.449704142011836, "grad_norm": 7.797490119934082, "learning_rate": 2.3058982846825158e-05, "loss": 0.2305, "step": 4320 }, { "epoch": 20.497041420118343, "grad_norm": 13.063493728637695, "learning_rate": 2.2983749623833886e-05, "loss": 0.2916, "step": 4330 }, { "epoch": 20.54437869822485, "grad_norm": 9.06458568572998, "learning_rate": 2.2908516400842614e-05, "loss": 0.2342, "step": 4340 }, { "epoch": 20.59171597633136, "grad_norm": 7.881487846374512, "learning_rate": 2.283328317785134e-05, "loss": 0.2041, "step": 4350 }, { "epoch": 20.63905325443787, "grad_norm": 10.349453926086426, "learning_rate": 2.2758049954860067e-05, "loss": 0.2949, "step": 4360 }, { "epoch": 20.68639053254438, "grad_norm": 12.278468132019043, "learning_rate": 2.2682816731868795e-05, "loss": 0.2607, "step": 4370 }, { "epoch": 20.733727810650887, "grad_norm": 11.949197769165039, "learning_rate": 2.2607583508877524e-05, "loss": 0.2741, "step": 4380 }, { "epoch": 20.781065088757398, "grad_norm": 13.006739616394043, "learning_rate": 2.253235028588625e-05, "loss": 0.2845, "step": 4390 }, { "epoch": 20.828402366863905, "grad_norm": 6.179040908813477, "learning_rate": 2.2457117062894977e-05, "loss": 0.2518, "step": 4400 }, { "epoch": 20.875739644970416, "grad_norm": 8.708568572998047, "learning_rate": 2.23818838399037e-05, "loss": 0.254, "step": 4410 }, { "epoch": 20.923076923076923, "grad_norm": 8.595051765441895, "learning_rate": 
2.230665061691243e-05, "loss": 0.2462, "step": 4420 }, { "epoch": 20.97041420118343, "grad_norm": 8.650654792785645, "learning_rate": 2.2231417393921154e-05, "loss": 0.2818, "step": 4430 }, { "epoch": 20.99881656804734, "eval_accuracy": 0.8938356164383562, "eval_loss": 0.3853361904621124, "eval_runtime": 6.105, "eval_samples_per_second": 239.147, "eval_steps_per_second": 29.975, "step": 4436 }, { "epoch": 21.01775147928994, "grad_norm": 7.857712268829346, "learning_rate": 2.2156184170929883e-05, "loss": 0.2664, "step": 4440 }, { "epoch": 21.06508875739645, "grad_norm": 7.22745943069458, "learning_rate": 2.208095094793861e-05, "loss": 0.2062, "step": 4450 }, { "epoch": 21.11242603550296, "grad_norm": 2.6673853397369385, "learning_rate": 2.200571772494734e-05, "loss": 0.2239, "step": 4460 }, { "epoch": 21.159763313609467, "grad_norm": 4.8849005699157715, "learning_rate": 2.1930484501956064e-05, "loss": 0.1985, "step": 4470 }, { "epoch": 21.207100591715978, "grad_norm": 22.471643447875977, "learning_rate": 2.1855251278964792e-05, "loss": 0.2331, "step": 4480 }, { "epoch": 21.254437869822485, "grad_norm": 12.047694206237793, "learning_rate": 2.178001805597352e-05, "loss": 0.193, "step": 4490 }, { "epoch": 21.301775147928993, "grad_norm": 8.459744453430176, "learning_rate": 2.170478483298225e-05, "loss": 0.2698, "step": 4500 }, { "epoch": 21.349112426035504, "grad_norm": 5.106344699859619, "learning_rate": 2.1629551609990973e-05, "loss": 0.2626, "step": 4510 }, { "epoch": 21.39644970414201, "grad_norm": 8.469663619995117, "learning_rate": 2.15543183869997e-05, "loss": 0.208, "step": 4520 }, { "epoch": 21.443786982248522, "grad_norm": 4.838006496429443, "learning_rate": 2.147908516400843e-05, "loss": 0.2903, "step": 4530 }, { "epoch": 21.49112426035503, "grad_norm": 5.432097911834717, "learning_rate": 2.1403851941017154e-05, "loss": 0.2337, "step": 4540 }, { "epoch": 21.53846153846154, "grad_norm": 6.889484882354736, "learning_rate": 2.132861871802588e-05, "loss": 0.2269, "step": 4550 }, { "epoch": 21.585798816568047, "grad_norm": 8.73716926574707, "learning_rate": 2.1253385495034607e-05, "loss": 0.2631, "step": 4560 }, { "epoch": 21.633136094674555, "grad_norm": 2.4893133640289307, "learning_rate": 2.1178152272043336e-05, "loss": 0.2186, "step": 4570 }, { "epoch": 21.680473372781066, "grad_norm": 7.44368839263916, "learning_rate": 2.110291904905206e-05, "loss": 0.2052, "step": 4580 }, { "epoch": 21.727810650887573, "grad_norm": 12.204940795898438, "learning_rate": 2.102768582606079e-05, "loss": 0.2862, "step": 4590 }, { "epoch": 21.775147928994084, "grad_norm": 7.419914722442627, "learning_rate": 2.0952452603069517e-05, "loss": 0.2568, "step": 4600 }, { "epoch": 21.82248520710059, "grad_norm": 7.833005905151367, "learning_rate": 2.0877219380078245e-05, "loss": 0.2727, "step": 4610 }, { "epoch": 21.869822485207102, "grad_norm": 3.8460819721221924, "learning_rate": 2.080198615708697e-05, "loss": 0.2306, "step": 4620 }, { "epoch": 21.91715976331361, "grad_norm": 12.018167495727539, "learning_rate": 2.0726752934095698e-05, "loss": 0.2729, "step": 4630 }, { "epoch": 21.964497041420117, "grad_norm": 7.023700714111328, "learning_rate": 2.0651519711104426e-05, "loss": 0.2845, "step": 4640 }, { "epoch": 21.997633136094674, "eval_accuracy": 0.8938356164383562, "eval_loss": 0.3902602195739746, "eval_runtime": 6.3485, "eval_samples_per_second": 229.975, "eval_steps_per_second": 28.826, "step": 4647 }, { "epoch": 22.011834319526628, "grad_norm": 11.811697006225586, "learning_rate": 2.0576286488113154e-05, 
"loss": 0.3063, "step": 4650 }, { "epoch": 22.059171597633135, "grad_norm": 4.944943428039551, "learning_rate": 2.050105326512188e-05, "loss": 0.2157, "step": 4660 }, { "epoch": 22.106508875739646, "grad_norm": 5.4949517250061035, "learning_rate": 2.0425820042130607e-05, "loss": 0.2373, "step": 4670 }, { "epoch": 22.153846153846153, "grad_norm": 6.9762163162231445, "learning_rate": 2.0350586819139332e-05, "loss": 0.2378, "step": 4680 }, { "epoch": 22.201183431952664, "grad_norm": 6.753002643585205, "learning_rate": 2.027535359614806e-05, "loss": 0.1939, "step": 4690 }, { "epoch": 22.24852071005917, "grad_norm": 4.161319732666016, "learning_rate": 2.0200120373156785e-05, "loss": 0.242, "step": 4700 }, { "epoch": 22.29585798816568, "grad_norm": 5.062042713165283, "learning_rate": 2.0124887150165513e-05, "loss": 0.2782, "step": 4710 }, { "epoch": 22.34319526627219, "grad_norm": 9.755287170410156, "learning_rate": 2.004965392717424e-05, "loss": 0.2009, "step": 4720 }, { "epoch": 22.390532544378697, "grad_norm": 6.668210506439209, "learning_rate": 1.997442070418297e-05, "loss": 0.1617, "step": 4730 }, { "epoch": 22.437869822485208, "grad_norm": 3.9158642292022705, "learning_rate": 1.9899187481191694e-05, "loss": 0.2013, "step": 4740 }, { "epoch": 22.485207100591715, "grad_norm": 7.47080659866333, "learning_rate": 1.9823954258200423e-05, "loss": 0.2057, "step": 4750 }, { "epoch": 22.532544378698226, "grad_norm": 17.479690551757812, "learning_rate": 1.974872103520915e-05, "loss": 0.2964, "step": 4760 }, { "epoch": 22.579881656804734, "grad_norm": 9.807324409484863, "learning_rate": 1.9673487812217876e-05, "loss": 0.2519, "step": 4770 }, { "epoch": 22.62721893491124, "grad_norm": 8.961894035339355, "learning_rate": 1.9598254589226604e-05, "loss": 0.2724, "step": 4780 }, { "epoch": 22.674556213017752, "grad_norm": 3.2384064197540283, "learning_rate": 1.9523021366235332e-05, "loss": 0.187, "step": 4790 }, { "epoch": 22.72189349112426, "grad_norm": 5.056863307952881, "learning_rate": 1.944778814324406e-05, "loss": 0.2512, "step": 4800 }, { "epoch": 22.76923076923077, "grad_norm": 9.88666820526123, "learning_rate": 1.9372554920252785e-05, "loss": 0.2003, "step": 4810 }, { "epoch": 22.816568047337277, "grad_norm": 15.032508850097656, "learning_rate": 1.929732169726151e-05, "loss": 0.2665, "step": 4820 }, { "epoch": 22.86390532544379, "grad_norm": 6.520040035247803, "learning_rate": 1.9222088474270238e-05, "loss": 0.2592, "step": 4830 }, { "epoch": 22.911242603550296, "grad_norm": 5.046426296234131, "learning_rate": 1.9146855251278966e-05, "loss": 0.2219, "step": 4840 }, { "epoch": 22.958579881656803, "grad_norm": 11.43876838684082, "learning_rate": 1.907162202828769e-05, "loss": 0.227, "step": 4850 }, { "epoch": 22.996449704142012, "eval_accuracy": 0.9041095890410958, "eval_loss": 0.3559742569923401, "eval_runtime": 6.1037, "eval_samples_per_second": 239.198, "eval_steps_per_second": 29.982, "step": 4858 }, { "epoch": 23.005917159763314, "grad_norm": 5.062148571014404, "learning_rate": 1.899638880529642e-05, "loss": 0.1716, "step": 4860 }, { "epoch": 23.05325443786982, "grad_norm": 13.177910804748535, "learning_rate": 1.8921155582305147e-05, "loss": 0.2274, "step": 4870 }, { "epoch": 23.100591715976332, "grad_norm": 10.63724136352539, "learning_rate": 1.8845922359313875e-05, "loss": 0.2376, "step": 4880 }, { "epoch": 23.14792899408284, "grad_norm": 11.315512657165527, "learning_rate": 1.87706891363226e-05, "loss": 0.2451, "step": 4890 }, { "epoch": 23.19526627218935, "grad_norm": 9.915947914123535, 
"learning_rate": 1.869545591333133e-05, "loss": 0.265, "step": 4900 }, { "epoch": 23.242603550295858, "grad_norm": 7.371302604675293, "learning_rate": 1.8620222690340057e-05, "loss": 0.203, "step": 4910 }, { "epoch": 23.28994082840237, "grad_norm": 10.347346305847168, "learning_rate": 1.8544989467348785e-05, "loss": 0.2058, "step": 4920 }, { "epoch": 23.337278106508876, "grad_norm": 7.930377006530762, "learning_rate": 1.846975624435751e-05, "loss": 0.1881, "step": 4930 }, { "epoch": 23.384615384615383, "grad_norm": 7.690789699554443, "learning_rate": 1.8394523021366238e-05, "loss": 0.2058, "step": 4940 }, { "epoch": 23.431952662721894, "grad_norm": 9.262539863586426, "learning_rate": 1.8319289798374963e-05, "loss": 0.2432, "step": 4950 }, { "epoch": 23.4792899408284, "grad_norm": 6.507819652557373, "learning_rate": 1.8244056575383687e-05, "loss": 0.1951, "step": 4960 }, { "epoch": 23.526627218934912, "grad_norm": 5.187134742736816, "learning_rate": 1.8168823352392416e-05, "loss": 0.1948, "step": 4970 }, { "epoch": 23.57396449704142, "grad_norm": 5.986237525939941, "learning_rate": 1.8093590129401144e-05, "loss": 0.1896, "step": 4980 }, { "epoch": 23.62130177514793, "grad_norm": 3.2465999126434326, "learning_rate": 1.8018356906409872e-05, "loss": 0.2521, "step": 4990 }, { "epoch": 23.668639053254438, "grad_norm": 6.972270488739014, "learning_rate": 1.7943123683418597e-05, "loss": 0.2162, "step": 5000 }, { "epoch": 23.715976331360945, "grad_norm": 10.68996524810791, "learning_rate": 1.7867890460427325e-05, "loss": 0.228, "step": 5010 }, { "epoch": 23.763313609467456, "grad_norm": 13.406333923339844, "learning_rate": 1.7792657237436053e-05, "loss": 0.2184, "step": 5020 }, { "epoch": 23.810650887573964, "grad_norm": 10.20108699798584, "learning_rate": 1.771742401444478e-05, "loss": 0.2286, "step": 5030 }, { "epoch": 23.857988165680474, "grad_norm": 4.646299839019775, "learning_rate": 1.7642190791453506e-05, "loss": 0.177, "step": 5040 }, { "epoch": 23.90532544378698, "grad_norm": 11.070876121520996, "learning_rate": 1.7566957568462234e-05, "loss": 0.232, "step": 5050 }, { "epoch": 23.952662721893493, "grad_norm": 9.572555541992188, "learning_rate": 1.7491724345470962e-05, "loss": 0.2007, "step": 5060 }, { "epoch": 24.0, "grad_norm": 5.681567668914795, "learning_rate": 1.7416491122479687e-05, "loss": 0.1909, "step": 5070 }, { "epoch": 24.0, "eval_accuracy": 0.9054794520547945, "eval_loss": 0.3623672127723694, "eval_runtime": 6.2394, "eval_samples_per_second": 233.996, "eval_steps_per_second": 29.33, "step": 5070 }, { "epoch": 24.047337278106507, "grad_norm": 9.697016716003418, "learning_rate": 1.7341257899488415e-05, "loss": 0.2439, "step": 5080 }, { "epoch": 24.09467455621302, "grad_norm": 10.163914680480957, "learning_rate": 1.726602467649714e-05, "loss": 0.1802, "step": 5090 }, { "epoch": 24.142011834319526, "grad_norm": 11.584846496582031, "learning_rate": 1.719079145350587e-05, "loss": 0.2823, "step": 5100 }, { "epoch": 24.189349112426036, "grad_norm": 4.525638103485107, "learning_rate": 1.7115558230514597e-05, "loss": 0.1716, "step": 5110 }, { "epoch": 24.236686390532544, "grad_norm": 7.053022861480713, "learning_rate": 1.704032500752332e-05, "loss": 0.218, "step": 5120 }, { "epoch": 24.284023668639055, "grad_norm": 2.7923426628112793, "learning_rate": 1.696509178453205e-05, "loss": 0.185, "step": 5130 }, { "epoch": 24.331360946745562, "grad_norm": 3.2291653156280518, "learning_rate": 1.6889858561540778e-05, "loss": 0.2196, "step": 5140 }, { "epoch": 24.37869822485207, "grad_norm": 
11.007999420166016, "learning_rate": 1.6814625338549506e-05, "loss": 0.2367, "step": 5150 }, { "epoch": 24.42603550295858, "grad_norm": 10.4671049118042, "learning_rate": 1.673939211555823e-05, "loss": 0.2754, "step": 5160 }, { "epoch": 24.473372781065088, "grad_norm": 11.023184776306152, "learning_rate": 1.666415889256696e-05, "loss": 0.2092, "step": 5170 }, { "epoch": 24.5207100591716, "grad_norm": 7.405954360961914, "learning_rate": 1.6588925669575687e-05, "loss": 0.2387, "step": 5180 }, { "epoch": 24.568047337278106, "grad_norm": 2.6797077655792236, "learning_rate": 1.6513692446584412e-05, "loss": 0.2842, "step": 5190 }, { "epoch": 24.615384615384617, "grad_norm": 2.8351101875305176, "learning_rate": 1.643845922359314e-05, "loss": 0.2395, "step": 5200 }, { "epoch": 24.662721893491124, "grad_norm": 5.248380661010742, "learning_rate": 1.636322600060187e-05, "loss": 0.2047, "step": 5210 }, { "epoch": 24.71005917159763, "grad_norm": 11.657218933105469, "learning_rate": 1.6287992777610593e-05, "loss": 0.2442, "step": 5220 }, { "epoch": 24.757396449704142, "grad_norm": 8.078208923339844, "learning_rate": 1.6212759554619318e-05, "loss": 0.1842, "step": 5230 }, { "epoch": 24.80473372781065, "grad_norm": 7.111977577209473, "learning_rate": 1.6137526331628046e-05, "loss": 0.2838, "step": 5240 }, { "epoch": 24.85207100591716, "grad_norm": 5.9829535484313965, "learning_rate": 1.6062293108636774e-05, "loss": 0.2379, "step": 5250 }, { "epoch": 24.899408284023668, "grad_norm": 7.217136859893799, "learning_rate": 1.5987059885645502e-05, "loss": 0.2162, "step": 5260 }, { "epoch": 24.94674556213018, "grad_norm": 3.860224485397339, "learning_rate": 1.5911826662654227e-05, "loss": 0.2123, "step": 5270 }, { "epoch": 24.994082840236686, "grad_norm": 7.911783695220947, "learning_rate": 1.5836593439662955e-05, "loss": 0.1972, "step": 5280 }, { "epoch": 24.99881656804734, "eval_accuracy": 0.910958904109589, "eval_loss": 0.3805873990058899, "eval_runtime": 6.2415, "eval_samples_per_second": 233.919, "eval_steps_per_second": 29.32, "step": 5281 } ], "logging_steps": 10, "max_steps": 7385, "num_input_tokens_seen": 0, "num_train_epochs": 35, "save_steps": 500, "total_flos": 5.493880885130035e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }
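A minimal sketch of how this training-state JSON could be inspected programmatically, assuming it is saved under the conventional Hugging Face Trainer name "trainer_state.json" (the path and filename here are an assumption; adjust as needed). It only relies on the fields visible above: "log_history" entries carrying either a per-step "loss" or a per-epoch "eval_accuracy".

import json

# Assumption: the JSON above is stored as "trainer_state.json" in the
# current directory; change the path to match your checkpoint folder.
with open("trainer_state.json") as f:
    state = json.load(f)

# "log_history" mixes per-step training records (containing "loss") and
# per-epoch evaluation records (containing "eval_accuracy"), as seen above.
train_records = [r for r in state["log_history"] if "loss" in r]
eval_records = [r for r in state["log_history"] if "eval_accuracy" in r]

# Report the evaluation record with the highest accuracy.
best = max(eval_records, key=lambda r: r["eval_accuracy"])
print(f"training steps logged: {len(train_records)}")
print(f"best eval_accuracy: {best['eval_accuracy']:.4f} at step {best['step']}")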