{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 250, "global_step": 22156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004513450081242101, "grad_norm": 7.729167461395264, "learning_rate": 6.768953068592058e-07, "loss": 5.9501, "step": 50 }, { "epoch": 0.009026900162484202, "grad_norm": 13.082584381103516, "learning_rate": 1.3537906137184116e-06, "loss": 5.9447, "step": 100 }, { "epoch": 0.013540350243726304, "grad_norm": 18.259796142578125, "learning_rate": 2.0306859205776177e-06, "loss": 5.9165, "step": 150 }, { "epoch": 0.018053800324968405, "grad_norm": 22.386247634887695, "learning_rate": 2.7075812274368233e-06, "loss": 5.8394, "step": 200 }, { "epoch": 0.022567250406210507, "grad_norm": 34.2584228515625, "learning_rate": 3.384476534296029e-06, "loss": 5.6527, "step": 250 }, { "epoch": 0.022567250406210507, "eval_exact_match": 2.0056764427625353, "eval_f1": 8.992036775611602, "eval_runtime": 152.2365, "eval_samples_per_second": 70.929, "eval_steps_per_second": 17.736, "step": 250 }, { "epoch": 0.02708070048745261, "grad_norm": 35.096343994140625, "learning_rate": 4.061371841155235e-06, "loss": 5.4208, "step": 300 }, { "epoch": 0.03159415056869471, "grad_norm": 32.929325103759766, "learning_rate": 4.73826714801444e-06, "loss": 5.1947, "step": 350 }, { "epoch": 0.03610760064993681, "grad_norm": 24.735565185546875, "learning_rate": 5.4151624548736465e-06, "loss": 4.9405, "step": 400 }, { "epoch": 0.040621050731178915, "grad_norm": 22.857040405273438, "learning_rate": 6.092057761732852e-06, "loss": 4.7322, "step": 450 }, { "epoch": 0.04513450081242101, "grad_norm": 24.466981887817383, "learning_rate": 6.768953068592058e-06, "loss": 4.4225, "step": 500 }, { "epoch": 0.04513450081242101, "eval_exact_match": 5.771050141911069, "eval_f1": 13.440407627304385, "eval_runtime": 143.7394, "eval_samples_per_second": 75.122, "eval_steps_per_second": 18.784, "step": 500 }, { "epoch": 0.04964795089366312, "grad_norm": 21.847347259521484, "learning_rate": 7.445848375451264e-06, "loss": 4.077, "step": 550 }, { "epoch": 0.05416140097490522, "grad_norm": 27.222633361816406, "learning_rate": 8.12274368231047e-06, "loss": 3.5837, "step": 600 }, { "epoch": 0.058674851056147316, "grad_norm": 29.45089340209961, "learning_rate": 8.799638989169675e-06, "loss": 3.0981, "step": 650 }, { "epoch": 0.06318830113738942, "grad_norm": 29.15781593322754, "learning_rate": 9.47653429602888e-06, "loss": 2.823, "step": 700 }, { "epoch": 0.06770175121863152, "grad_norm": 20.824087142944336, "learning_rate": 1.0153429602888087e-05, "loss": 2.4615, "step": 750 }, { "epoch": 0.06770175121863152, "eval_exact_match": 52.82876064333018, "eval_f1": 63.363202801168775, "eval_runtime": 143.757, "eval_samples_per_second": 75.113, "eval_steps_per_second": 18.782, "step": 750 }, { "epoch": 0.07221520129987362, "grad_norm": 33.184410095214844, "learning_rate": 1.0830324909747293e-05, "loss": 2.2565, "step": 800 }, { "epoch": 0.07672865138111573, "grad_norm": 27.845844268798828, "learning_rate": 1.15072202166065e-05, "loss": 1.8158, "step": 850 }, { "epoch": 0.08124210146235783, "grad_norm": 18.29555320739746, "learning_rate": 1.2184115523465704e-05, "loss": 1.7871, "step": 900 }, { "epoch": 0.08575555154359993, "grad_norm": 23.45345687866211, "learning_rate": 1.2861010830324909e-05, "loss": 1.6184, "step": 950 }, { "epoch": 0.09026900162484203, "grad_norm": 23.513124465942383, "learning_rate": 1.3537906137184115e-05, "loss": 1.577, "step": 1000 }, { "epoch": 0.09026900162484203, "eval_exact_match": 65.42100283822138, "eval_f1": 76.31429412241118, "eval_runtime": 143.6001, "eval_samples_per_second": 75.195, "eval_steps_per_second": 18.802, "step": 1000 }, { "epoch": 0.09478245170608413, "grad_norm": 13.316262245178223, "learning_rate": 1.4214801444043322e-05, "loss": 1.5352, "step": 1050 }, { "epoch": 0.09929590178732624, "grad_norm": 34.4163932800293, "learning_rate": 1.4891696750902528e-05, "loss": 1.6355, "step": 1100 }, { "epoch": 0.10380935186856834, "grad_norm": 33.572750091552734, "learning_rate": 1.5568592057761735e-05, "loss": 1.5064, "step": 1150 }, { "epoch": 0.10832280194981043, "grad_norm": 13.494843482971191, "learning_rate": 1.624548736462094e-05, "loss": 1.4047, "step": 1200 }, { "epoch": 0.11283625203105253, "grad_norm": 21.778579711914062, "learning_rate": 1.6922382671480144e-05, "loss": 1.2552, "step": 1250 }, { "epoch": 0.11283625203105253, "eval_exact_match": 69.5364238410596, "eval_f1": 80.48975233211664, "eval_runtime": 143.6325, "eval_samples_per_second": 75.178, "eval_steps_per_second": 18.798, "step": 1250 }, { "epoch": 0.11734970211229463, "grad_norm": 17.293298721313477, "learning_rate": 1.759927797833935e-05, "loss": 1.2779, "step": 1300 }, { "epoch": 0.12186315219353674, "grad_norm": 14.029269218444824, "learning_rate": 1.8276173285198557e-05, "loss": 1.2871, "step": 1350 }, { "epoch": 0.12637660227477884, "grad_norm": 12.971822738647461, "learning_rate": 1.895306859205776e-05, "loss": 1.1974, "step": 1400 }, { "epoch": 0.13089005235602094, "grad_norm": 30.38484001159668, "learning_rate": 1.9629963898916967e-05, "loss": 1.355, "step": 1450 }, { "epoch": 0.13540350243726304, "grad_norm": 29.467548370361328, "learning_rate": 2.0306859205776173e-05, "loss": 1.1713, "step": 1500 }, { "epoch": 0.13540350243726304, "eval_exact_match": 72.57332071901608, "eval_f1": 82.65844387552723, "eval_runtime": 143.752, "eval_samples_per_second": 75.115, "eval_steps_per_second": 18.782, "step": 1500 }, { "epoch": 0.13991695251850514, "grad_norm": 12.46554183959961, "learning_rate": 2.098375451263538e-05, "loss": 1.1677, "step": 1550 }, { "epoch": 0.14443040259974724, "grad_norm": 25.593875885009766, "learning_rate": 2.1660649819494586e-05, "loss": 1.3478, "step": 1600 }, { "epoch": 0.14894385268098934, "grad_norm": 16.136869430541992, "learning_rate": 2.2337545126353793e-05, "loss": 1.1676, "step": 1650 }, { "epoch": 0.15345730276223146, "grad_norm": 16.83846664428711, "learning_rate": 2.3014440433213e-05, "loss": 1.1741, "step": 1700 }, { "epoch": 0.15797075284347356, "grad_norm": 17.464096069335938, "learning_rate": 2.3691335740072202e-05, "loss": 1.2104, "step": 1750 }, { "epoch": 0.15797075284347356, "eval_exact_match": 74.64522232734153, "eval_f1": 84.72687223622708, "eval_runtime": 143.5946, "eval_samples_per_second": 75.198, "eval_steps_per_second": 18.803, "step": 1750 }, { "epoch": 0.16248420292471566, "grad_norm": 13.083732604980469, "learning_rate": 2.436823104693141e-05, "loss": 1.1279, "step": 1800 }, { "epoch": 0.16699765300595776, "grad_norm": 15.166335105895996, "learning_rate": 2.5045126353790615e-05, "loss": 1.1395, "step": 1850 }, { "epoch": 0.17151110308719986, "grad_norm": 8.829039573669434, "learning_rate": 2.5722021660649818e-05, "loss": 1.1374, "step": 1900 }, { "epoch": 0.17602455316844196, "grad_norm": 28.089391708374023, "learning_rate": 2.6398916967509024e-05, "loss": 1.2106, "step": 1950 }, { "epoch": 0.18053800324968405, "grad_norm": 13.704926490783691, "learning_rate": 2.707581227436823e-05, "loss": 1.2369, "step": 2000 }, { "epoch": 0.18053800324968405, "eval_exact_match": 77.96594134342479, "eval_f1": 86.52116394116426, "eval_runtime": 143.6954, "eval_samples_per_second": 75.145, "eval_steps_per_second": 18.79, "step": 2000 }, { "epoch": 0.18505145333092615, "grad_norm": 13.062108993530273, "learning_rate": 2.7752707581227437e-05, "loss": 1.0064, "step": 2050 }, { "epoch": 0.18956490341216825, "grad_norm": 25.21763801574707, "learning_rate": 2.8429602888086644e-05, "loss": 1.1127, "step": 2100 }, { "epoch": 0.19407835349341035, "grad_norm": 7.10919189453125, "learning_rate": 2.910649819494585e-05, "loss": 1.054, "step": 2150 }, { "epoch": 0.19859180357465248, "grad_norm": 14.997174263000488, "learning_rate": 2.9783393501805057e-05, "loss": 1.1187, "step": 2200 }, { "epoch": 0.20310525365589457, "grad_norm": 9.683287620544434, "learning_rate": 2.9948846539618856e-05, "loss": 0.9916, "step": 2250 }, { "epoch": 0.20310525365589457, "eval_exact_match": 77.21854304635761, "eval_f1": 85.59348158373206, "eval_runtime": 143.7109, "eval_samples_per_second": 75.137, "eval_steps_per_second": 18.788, "step": 2250 }, { "epoch": 0.20761870373713667, "grad_norm": 18.712541580200195, "learning_rate": 2.987362086258776e-05, "loss": 1.1756, "step": 2300 }, { "epoch": 0.21213215381837877, "grad_norm": 8.502235412597656, "learning_rate": 2.979839518555667e-05, "loss": 1.0872, "step": 2350 }, { "epoch": 0.21664560389962087, "grad_norm": 16.61508560180664, "learning_rate": 2.9723169508525577e-05, "loss": 1.1148, "step": 2400 }, { "epoch": 0.22115905398086297, "grad_norm": 9.676267623901367, "learning_rate": 2.9647943831494482e-05, "loss": 1.0559, "step": 2450 }, { "epoch": 0.22567250406210507, "grad_norm": 11.562779426574707, "learning_rate": 2.957271815446339e-05, "loss": 1.0628, "step": 2500 }, { "epoch": 0.22567250406210507, "eval_exact_match": 79.3755912961211, "eval_f1": 87.42406194378296, "eval_runtime": 143.6631, "eval_samples_per_second": 75.162, "eval_steps_per_second": 18.794, "step": 2500 }, { "epoch": 0.23018595414334717, "grad_norm": 20.24138832092285, "learning_rate": 2.9497492477432297e-05, "loss": 1.0549, "step": 2550 }, { "epoch": 0.23469940422458926, "grad_norm": 24.723041534423828, "learning_rate": 2.9422266800401203e-05, "loss": 1.137, "step": 2600 }, { "epoch": 0.2392128543058314, "grad_norm": 14.101241111755371, "learning_rate": 2.9347041123370113e-05, "loss": 1.0199, "step": 2650 }, { "epoch": 0.2437263043870735, "grad_norm": 7.032845497131348, "learning_rate": 2.927181544633902e-05, "loss": 1.0601, "step": 2700 }, { "epoch": 0.2482397544683156, "grad_norm": 13.543634414672852, "learning_rate": 2.9196589769307924e-05, "loss": 1.0534, "step": 2750 }, { "epoch": 0.2482397544683156, "eval_exact_match": 79.57426679280984, "eval_f1": 87.74281924363757, "eval_runtime": 143.6905, "eval_samples_per_second": 75.148, "eval_steps_per_second": 18.79, "step": 2750 }, { "epoch": 0.2527532045495577, "grad_norm": 7.890726566314697, "learning_rate": 2.9121364092276833e-05, "loss": 1.0196, "step": 2800 }, { "epoch": 0.2572666546307998, "grad_norm": 12.943625450134277, "learning_rate": 2.904613841524574e-05, "loss": 1.0178, "step": 2850 }, { "epoch": 0.2617801047120419, "grad_norm": 9.828871726989746, "learning_rate": 2.897091273821464e-05, "loss": 1.0482, "step": 2900 }, { "epoch": 0.266293554793284, "grad_norm": 28.693660736083984, "learning_rate": 2.889568706118355e-05, "loss": 0.9897, "step": 2950 }, { "epoch": 0.2708070048745261, "grad_norm": 10.408865928649902, "learning_rate": 2.8820461384152457e-05, "loss": 0.9407, "step": 3000 }, { "epoch": 0.2708070048745261, "eval_exact_match": 80.37842951750237, "eval_f1": 88.60122498039404, "eval_runtime": 143.7217, "eval_samples_per_second": 75.131, "eval_steps_per_second": 18.786, "step": 3000 }, { "epoch": 0.2753204549557682, "grad_norm": 11.135859489440918, "learning_rate": 2.8745235707121363e-05, "loss": 0.9192, "step": 3050 }, { "epoch": 0.2798339050370103, "grad_norm": 10.159820556640625, "learning_rate": 2.8670010030090272e-05, "loss": 0.9232, "step": 3100 }, { "epoch": 0.2843473551182524, "grad_norm": 6.99199914932251, "learning_rate": 2.8594784353059178e-05, "loss": 0.9188, "step": 3150 }, { "epoch": 0.2888608051994945, "grad_norm": 11.692395210266113, "learning_rate": 2.8519558676028083e-05, "loss": 0.979, "step": 3200 }, { "epoch": 0.2933742552807366, "grad_norm": 12.289103507995605, "learning_rate": 2.8444332998996993e-05, "loss": 0.8573, "step": 3250 }, { "epoch": 0.2933742552807366, "eval_exact_match": 81.44749290444655, "eval_f1": 88.89880962072144, "eval_runtime": 143.7737, "eval_samples_per_second": 75.104, "eval_steps_per_second": 18.78, "step": 3250 }, { "epoch": 0.29788770536197867, "grad_norm": 23.986677169799805, "learning_rate": 2.83691073219659e-05, "loss": 0.879, "step": 3300 }, { "epoch": 0.30240115544322077, "grad_norm": 10.214922904968262, "learning_rate": 2.8293881644934804e-05, "loss": 0.9526, "step": 3350 }, { "epoch": 0.3069146055244629, "grad_norm": 11.930830955505371, "learning_rate": 2.8218655967903714e-05, "loss": 1.0308, "step": 3400 }, { "epoch": 0.311428055605705, "grad_norm": 7.23566198348999, "learning_rate": 2.814343029087262e-05, "loss": 0.9849, "step": 3450 }, { "epoch": 0.3159415056869471, "grad_norm": 10.037336349487305, "learning_rate": 2.8068204613841525e-05, "loss": 0.8577, "step": 3500 }, { "epoch": 0.3159415056869471, "eval_exact_match": 82.28949858088932, "eval_f1": 89.36851469763961, "eval_runtime": 143.8541, "eval_samples_per_second": 75.062, "eval_steps_per_second": 18.769, "step": 3500 }, { "epoch": 0.3204549557681892, "grad_norm": 19.438182830810547, "learning_rate": 2.799297893681043e-05, "loss": 0.9586, "step": 3550 }, { "epoch": 0.3249684058494313, "grad_norm": 11.259856224060059, "learning_rate": 2.7917753259779337e-05, "loss": 0.923, "step": 3600 }, { "epoch": 0.3294818559306734, "grad_norm": 16.151851654052734, "learning_rate": 2.7842527582748243e-05, "loss": 0.9551, "step": 3650 }, { "epoch": 0.3339953060119155, "grad_norm": 12.570643424987793, "learning_rate": 2.7767301905717152e-05, "loss": 0.9377, "step": 3700 }, { "epoch": 0.3385087560931576, "grad_norm": 13.616052627563477, "learning_rate": 2.7692076228686058e-05, "loss": 0.7839, "step": 3750 }, { "epoch": 0.3385087560931576, "eval_exact_match": 81.69347209082308, "eval_f1": 89.48678563794635, "eval_runtime": 143.7031, "eval_samples_per_second": 75.141, "eval_steps_per_second": 18.789, "step": 3750 }, { "epoch": 0.3430222061743997, "grad_norm": 20.124753952026367, "learning_rate": 2.7616850551654964e-05, "loss": 0.9229, "step": 3800 }, { "epoch": 0.3475356562556418, "grad_norm": 7.816183090209961, "learning_rate": 2.7541624874623873e-05, "loss": 1.0717, "step": 3850 }, { "epoch": 0.3520491063368839, "grad_norm": 5.988482475280762, "learning_rate": 2.746639919759278e-05, "loss": 0.9611, "step": 3900 }, { "epoch": 0.356562556418126, "grad_norm": 13.177979469299316, "learning_rate": 2.7391173520561685e-05, "loss": 0.9207, "step": 3950 }, { "epoch": 0.3610760064993681, "grad_norm": 11.034092903137207, "learning_rate": 2.7315947843530594e-05, "loss": 0.9395, "step": 4000 }, { "epoch": 0.3610760064993681, "eval_exact_match": 82.74361400189214, "eval_f1": 90.06638594360132, "eval_runtime": 143.7901, "eval_samples_per_second": 75.096, "eval_steps_per_second": 18.777, "step": 4000 }, { "epoch": 0.3655894565806102, "grad_norm": 5.799317359924316, "learning_rate": 2.72407221664995e-05, "loss": 0.9385, "step": 4050 }, { "epoch": 0.3701029066618523, "grad_norm": 13.385774612426758, "learning_rate": 2.7165496489468405e-05, "loss": 0.9356, "step": 4100 }, { "epoch": 0.3746163567430944, "grad_norm": 22.11754608154297, "learning_rate": 2.7090270812437315e-05, "loss": 0.8532, "step": 4150 }, { "epoch": 0.3791298068243365, "grad_norm": 4.648535251617432, "learning_rate": 2.701504513540622e-05, "loss": 1.0032, "step": 4200 }, { "epoch": 0.3836432569055786, "grad_norm": 24.29154396057129, "learning_rate": 2.6939819458375123e-05, "loss": 0.8606, "step": 4250 }, { "epoch": 0.3836432569055786, "eval_exact_match": 82.58278145695364, "eval_f1": 89.65557078580815, "eval_runtime": 143.6035, "eval_samples_per_second": 75.193, "eval_steps_per_second": 18.802, "step": 4250 }, { "epoch": 0.3881567069868207, "grad_norm": 21.021242141723633, "learning_rate": 2.6864593781344032e-05, "loss": 0.9006, "step": 4300 }, { "epoch": 0.39267015706806285, "grad_norm": 26.466794967651367, "learning_rate": 2.6789368104312938e-05, "loss": 0.9426, "step": 4350 }, { "epoch": 0.39718360714930495, "grad_norm": 6.325038433074951, "learning_rate": 2.6714142427281844e-05, "loss": 0.8097, "step": 4400 }, { "epoch": 0.40169705723054705, "grad_norm": 8.038667678833008, "learning_rate": 2.6638916750250753e-05, "loss": 0.9053, "step": 4450 }, { "epoch": 0.40621050731178915, "grad_norm": 10.573040008544922, "learning_rate": 2.656369107321966e-05, "loss": 0.8459, "step": 4500 }, { "epoch": 0.40621050731178915, "eval_exact_match": 82.60170293282876, "eval_f1": 89.80136126079411, "eval_runtime": 143.5994, "eval_samples_per_second": 75.195, "eval_steps_per_second": 18.802, "step": 4500 }, { "epoch": 0.41072395739303125, "grad_norm": 7.336009979248047, "learning_rate": 2.6488465396188565e-05, "loss": 0.8691, "step": 4550 }, { "epoch": 0.41523740747427335, "grad_norm": 13.7125825881958, "learning_rate": 2.6413239719157474e-05, "loss": 0.8486, "step": 4600 }, { "epoch": 0.41975085755551544, "grad_norm": 12.19320011138916, "learning_rate": 2.633801404212638e-05, "loss": 0.9189, "step": 4650 }, { "epoch": 0.42426430763675754, "grad_norm": 11.599879264831543, "learning_rate": 2.6262788365095286e-05, "loss": 0.942, "step": 4700 }, { "epoch": 0.42877775771799964, "grad_norm": 9.139724731445312, "learning_rate": 2.6187562688064195e-05, "loss": 0.9565, "step": 4750 }, { "epoch": 0.42877775771799964, "eval_exact_match": 83.66130558183538, "eval_f1": 90.80495165338898, "eval_runtime": 143.5723, "eval_samples_per_second": 75.209, "eval_steps_per_second": 18.806, "step": 4750 }, { "epoch": 0.43329120779924174, "grad_norm": 10.977174758911133, "learning_rate": 2.61123370110331e-05, "loss": 0.8751, "step": 4800 }, { "epoch": 0.43780465788048384, "grad_norm": 13.7095947265625, "learning_rate": 2.6037111334002007e-05, "loss": 0.7547, "step": 4850 }, { "epoch": 0.44231810796172594, "grad_norm": 26.184358596801758, "learning_rate": 2.5961885656970912e-05, "loss": 0.9376, "step": 4900 }, { "epoch": 0.44683155804296804, "grad_norm": 21.620555877685547, "learning_rate": 2.5886659979939818e-05, "loss": 0.8013, "step": 4950 }, { "epoch": 0.45134500812421013, "grad_norm": 12.163994789123535, "learning_rate": 2.5811434302908724e-05, "loss": 0.8731, "step": 5000 }, { "epoch": 0.45134500812421013, "eval_exact_match": 82.37464522232735, "eval_f1": 89.97068346136126, "eval_runtime": 143.7072, "eval_samples_per_second": 75.139, "eval_steps_per_second": 18.788, "step": 5000 }, { "epoch": 0.45585845820545223, "grad_norm": 5.3110175132751465, "learning_rate": 2.5736208625877633e-05, "loss": 0.9062, "step": 5050 }, { "epoch": 0.46037190828669433, "grad_norm": 17.91867446899414, "learning_rate": 2.566098294884654e-05, "loss": 0.8749, "step": 5100 }, { "epoch": 0.46488535836793643, "grad_norm": 34.21914291381836, "learning_rate": 2.5585757271815445e-05, "loss": 0.8915, "step": 5150 }, { "epoch": 0.46939880844917853, "grad_norm": 8.76441478729248, "learning_rate": 2.5510531594784354e-05, "loss": 0.8023, "step": 5200 }, { "epoch": 0.4739122585304206, "grad_norm": 20.71419334411621, "learning_rate": 2.543530591775326e-05, "loss": 0.8114, "step": 5250 }, { "epoch": 0.4739122585304206, "eval_exact_match": 82.36518448438979, "eval_f1": 90.09893335252144, "eval_runtime": 143.9381, "eval_samples_per_second": 75.018, "eval_steps_per_second": 18.758, "step": 5250 }, { "epoch": 0.4784257086116628, "grad_norm": 14.544415473937988, "learning_rate": 2.536008024072217e-05, "loss": 0.8045, "step": 5300 }, { "epoch": 0.4829391586929049, "grad_norm": 40.066375732421875, "learning_rate": 2.5284854563691075e-05, "loss": 0.8651, "step": 5350 }, { "epoch": 0.487452608774147, "grad_norm": 16.154937744140625, "learning_rate": 2.520962888665998e-05, "loss": 0.9995, "step": 5400 }, { "epoch": 0.4919660588553891, "grad_norm": 6.674190044403076, "learning_rate": 2.513440320962889e-05, "loss": 0.8231, "step": 5450 }, { "epoch": 0.4964795089366312, "grad_norm": 6.037493705749512, "learning_rate": 2.5059177532597796e-05, "loss": 0.8369, "step": 5500 }, { "epoch": 0.4964795089366312, "eval_exact_match": 83.20719016083254, "eval_f1": 90.18831406264282, "eval_runtime": 143.9261, "eval_samples_per_second": 75.025, "eval_steps_per_second": 18.76, "step": 5500 }, { "epoch": 0.5009929590178732, "grad_norm": 8.621197700500488, "learning_rate": 2.4983951855566702e-05, "loss": 0.8002, "step": 5550 }, { "epoch": 0.5055064090991154, "grad_norm": 19.25075340270996, "learning_rate": 2.4908726178535608e-05, "loss": 0.8039, "step": 5600 }, { "epoch": 0.5100198591803574, "grad_norm": 14.201600074768066, "learning_rate": 2.4833500501504514e-05, "loss": 0.7525, "step": 5650 }, { "epoch": 0.5145333092615996, "grad_norm": 30.636154174804688, "learning_rate": 2.475827482447342e-05, "loss": 0.7711, "step": 5700 }, { "epoch": 0.5190467593428417, "grad_norm": 8.79736042022705, "learning_rate": 2.468304914744233e-05, "loss": 0.8997, "step": 5750 }, { "epoch": 0.5190467593428417, "eval_exact_match": 83.66130558183538, "eval_f1": 90.84316221305555, "eval_runtime": 143.8756, "eval_samples_per_second": 75.051, "eval_steps_per_second": 18.766, "step": 5750 }, { "epoch": 0.5235602094240838, "grad_norm": 14.871445655822754, "learning_rate": 2.4607823470411234e-05, "loss": 0.7639, "step": 5800 }, { "epoch": 0.5280736595053259, "grad_norm": 6.112968444824219, "learning_rate": 2.453259779338014e-05, "loss": 0.8643, "step": 5850 }, { "epoch": 0.532587109586568, "grad_norm": 6.213535785675049, "learning_rate": 2.445737211634905e-05, "loss": 1.0582, "step": 5900 }, { "epoch": 0.5371005596678101, "grad_norm": 4.76146936416626, "learning_rate": 2.4382146439317955e-05, "loss": 0.818, "step": 5950 }, { "epoch": 0.5416140097490522, "grad_norm": 8.690106391906738, "learning_rate": 2.430692076228686e-05, "loss": 0.8871, "step": 6000 }, { "epoch": 0.5416140097490522, "eval_exact_match": 84.03027436140019, "eval_f1": 90.88429950527104, "eval_runtime": 143.8461, "eval_samples_per_second": 75.066, "eval_steps_per_second": 18.77, "step": 6000 }, { "epoch": 0.5461274598302943, "grad_norm": 18.575305938720703, "learning_rate": 2.423169508525577e-05, "loss": 0.9143, "step": 6050 }, { "epoch": 0.5506409099115364, "grad_norm": 2.229325294494629, "learning_rate": 2.4156469408224676e-05, "loss": 0.9387, "step": 6100 }, { "epoch": 0.5551543599927785, "grad_norm": 9.413180351257324, "learning_rate": 2.4081243731193582e-05, "loss": 0.8657, "step": 6150 }, { "epoch": 0.5596678100740206, "grad_norm": 5.644939422607422, "learning_rate": 2.400601805416249e-05, "loss": 0.8445, "step": 6200 }, { "epoch": 0.5641812601552627, "grad_norm": 23.247257232666016, "learning_rate": 2.3930792377131394e-05, "loss": 0.8342, "step": 6250 }, { "epoch": 0.5641812601552627, "eval_exact_match": 84.12488174077578, "eval_f1": 91.10403462345704, "eval_runtime": 144.0358, "eval_samples_per_second": 74.967, "eval_steps_per_second": 18.745, "step": 6250 }, { "epoch": 0.5686947102365048, "grad_norm": 19.933300018310547, "learning_rate": 2.38555667001003e-05, "loss": 0.8348, "step": 6300 }, { "epoch": 0.5732081603177469, "grad_norm": 7.6812872886657715, "learning_rate": 2.378034102306921e-05, "loss": 0.7582, "step": 6350 }, { "epoch": 0.577721610398989, "grad_norm": 13.01408863067627, "learning_rate": 2.3705115346038115e-05, "loss": 0.7245, "step": 6400 }, { "epoch": 0.5822350604802311, "grad_norm": 23.909793853759766, "learning_rate": 2.362988966900702e-05, "loss": 0.7503, "step": 6450 }, { "epoch": 0.5867485105614731, "grad_norm": 5.6074323654174805, "learning_rate": 2.355466399197593e-05, "loss": 0.8251, "step": 6500 }, { "epoch": 0.5867485105614731, "eval_exact_match": 84.31409649952697, "eval_f1": 91.10669916586389, "eval_runtime": 143.8718, "eval_samples_per_second": 75.053, "eval_steps_per_second": 18.767, "step": 6500 }, { "epoch": 0.5912619606427153, "grad_norm": 13.117137908935547, "learning_rate": 2.3479438314944836e-05, "loss": 0.7903, "step": 6550 }, { "epoch": 0.5957754107239573, "grad_norm": 4.99781608581543, "learning_rate": 2.340421263791374e-05, "loss": 0.8538, "step": 6600 }, { "epoch": 0.6002888608051995, "grad_norm": 7.639380931854248, "learning_rate": 2.332898696088265e-05, "loss": 0.8154, "step": 6650 }, { "epoch": 0.6048023108864415, "grad_norm": 30.98665428161621, "learning_rate": 2.3253761283851556e-05, "loss": 0.723, "step": 6700 }, { "epoch": 0.6093157609676837, "grad_norm": 30.613746643066406, "learning_rate": 2.3178535606820462e-05, "loss": 0.8682, "step": 6750 }, { "epoch": 0.6093157609676837, "eval_exact_match": 83.72753074739829, "eval_f1": 90.83640909549077, "eval_runtime": 143.634, "eval_samples_per_second": 75.177, "eval_steps_per_second": 18.798, "step": 6750 }, { "epoch": 0.6138292110489258, "grad_norm": 8.440532684326172, "learning_rate": 2.310330992978937e-05, "loss": 0.8729, "step": 6800 }, { "epoch": 0.6183426611301679, "grad_norm": 5.947940826416016, "learning_rate": 2.3028084252758277e-05, "loss": 0.747, "step": 6850 }, { "epoch": 0.62285611121141, "grad_norm": 16.59714698791504, "learning_rate": 2.295285857572718e-05, "loss": 0.8015, "step": 6900 }, { "epoch": 0.6273695612926521, "grad_norm": 5.211153507232666, "learning_rate": 2.287763289869609e-05, "loss": 0.8957, "step": 6950 }, { "epoch": 0.6318830113738942, "grad_norm": 4.547276496887207, "learning_rate": 2.2802407221664995e-05, "loss": 0.9019, "step": 7000 }, { "epoch": 0.6318830113738942, "eval_exact_match": 84.82497634815516, "eval_f1": 91.52669904904272, "eval_runtime": 145.3332, "eval_samples_per_second": 74.298, "eval_steps_per_second": 18.578, "step": 7000 }, { "epoch": 0.6363964614551363, "grad_norm": 11.642155647277832, "learning_rate": 2.27271815446339e-05, "loss": 0.7963, "step": 7050 }, { "epoch": 0.6409099115363784, "grad_norm": 7.39171028137207, "learning_rate": 2.265195586760281e-05, "loss": 0.7953, "step": 7100 }, { "epoch": 0.6454233616176205, "grad_norm": 13.905296325683594, "learning_rate": 2.2576730190571716e-05, "loss": 0.7865, "step": 7150 }, { "epoch": 0.6499368116988626, "grad_norm": 5.167139530181885, "learning_rate": 2.250150451354062e-05, "loss": 0.8087, "step": 7200 }, { "epoch": 0.6544502617801047, "grad_norm": 27.534217834472656, "learning_rate": 2.242627883650953e-05, "loss": 0.9436, "step": 7250 }, { "epoch": 0.6544502617801047, "eval_exact_match": 84.06811731315042, "eval_f1": 91.07524033930977, "eval_runtime": 143.7651, "eval_samples_per_second": 75.109, "eval_steps_per_second": 18.781, "step": 7250 }, { "epoch": 0.6589637118613468, "grad_norm": 15.742715835571289, "learning_rate": 2.2351053159478437e-05, "loss": 0.8499, "step": 7300 }, { "epoch": 0.6634771619425889, "grad_norm": 16.15327262878418, "learning_rate": 2.2275827482447342e-05, "loss": 0.846, "step": 7350 }, { "epoch": 0.667990612023831, "grad_norm": 17.383888244628906, "learning_rate": 2.220060180541625e-05, "loss": 0.7903, "step": 7400 }, { "epoch": 0.6725040621050731, "grad_norm": 7.484638214111328, "learning_rate": 2.2125376128385157e-05, "loss": 0.7664, "step": 7450 }, { "epoch": 0.6770175121863152, "grad_norm": 10.082265853881836, "learning_rate": 2.2050150451354063e-05, "loss": 0.9177, "step": 7500 }, { "epoch": 0.6770175121863152, "eval_exact_match": 84.49385052034059, "eval_f1": 91.10452090726004, "eval_runtime": 143.5023, "eval_samples_per_second": 75.246, "eval_steps_per_second": 18.815, "step": 7500 }, { "epoch": 0.6815309622675573, "grad_norm": 10.778836250305176, "learning_rate": 2.1974924774322973e-05, "loss": 0.7529, "step": 7550 }, { "epoch": 0.6860444123487994, "grad_norm": 12.894726753234863, "learning_rate": 2.1899699097291875e-05, "loss": 0.8783, "step": 7600 }, { "epoch": 0.6905578624300415, "grad_norm": 7.819123268127441, "learning_rate": 2.182447342026078e-05, "loss": 0.9095, "step": 7650 }, { "epoch": 0.6950713125112836, "grad_norm": 24.68296241760254, "learning_rate": 2.174924774322969e-05, "loss": 0.8804, "step": 7700 }, { "epoch": 0.6995847625925258, "grad_norm": 9.52649974822998, "learning_rate": 2.1674022066198596e-05, "loss": 0.8028, "step": 7750 }, { "epoch": 0.6995847625925258, "eval_exact_match": 85.59129612109744, "eval_f1": 91.93623152881347, "eval_runtime": 143.4073, "eval_samples_per_second": 75.296, "eval_steps_per_second": 18.827, "step": 7750 }, { "epoch": 0.7040982126737678, "grad_norm": 10.898487091064453, "learning_rate": 2.1598796389167502e-05, "loss": 0.8282, "step": 7800 }, { "epoch": 0.70861166275501, "grad_norm": 6.693902969360352, "learning_rate": 2.152357071213641e-05, "loss": 0.775, "step": 7850 }, { "epoch": 0.713125112836252, "grad_norm": 10.00558090209961, "learning_rate": 2.1448345035105317e-05, "loss": 0.6894, "step": 7900 }, { "epoch": 0.7176385629174942, "grad_norm": 2.776298761367798, "learning_rate": 2.1373119358074223e-05, "loss": 0.8409, "step": 7950 }, { "epoch": 0.7221520129987362, "grad_norm": 13.581101417541504, "learning_rate": 2.1297893681043132e-05, "loss": 0.8222, "step": 8000 }, { "epoch": 0.7221520129987362, "eval_exact_match": 84.57899716177862, "eval_f1": 91.38107826122027, "eval_runtime": 143.4879, "eval_samples_per_second": 75.254, "eval_steps_per_second": 18.817, "step": 8000 }, { "epoch": 0.7266654630799784, "grad_norm": 6.5702223777771, "learning_rate": 2.1222668004012038e-05, "loss": 0.6735, "step": 8050 }, { "epoch": 0.7311789131612204, "grad_norm": 18.275623321533203, "learning_rate": 2.1147442326980944e-05, "loss": 0.8389, "step": 8100 }, { "epoch": 0.7356923632424626, "grad_norm": 15.205418586730957, "learning_rate": 2.1072216649949853e-05, "loss": 0.7803, "step": 8150 }, { "epoch": 0.7402058133237046, "grad_norm": 8.31666088104248, "learning_rate": 2.099699097291876e-05, "loss": 0.7081, "step": 8200 }, { "epoch": 0.7447192634049468, "grad_norm": 9.174483299255371, "learning_rate": 2.092176529588766e-05, "loss": 0.826, "step": 8250 }, { "epoch": 0.7447192634049468, "eval_exact_match": 84.76821192052981, "eval_f1": 91.56620229706857, "eval_runtime": 143.4859, "eval_samples_per_second": 75.255, "eval_steps_per_second": 18.817, "step": 8250 }, { "epoch": 0.7492327134861888, "grad_norm": 5.849365234375, "learning_rate": 2.084653961885657e-05, "loss": 0.7826, "step": 8300 }, { "epoch": 0.753746163567431, "grad_norm": 8.80666446685791, "learning_rate": 2.0771313941825476e-05, "loss": 0.8931, "step": 8350 }, { "epoch": 0.758259613648673, "grad_norm": 7.301697731018066, "learning_rate": 2.0696088264794382e-05, "loss": 0.6788, "step": 8400 }, { "epoch": 0.7627730637299152, "grad_norm": 9.519810676574707, "learning_rate": 2.062086258776329e-05, "loss": 0.7928, "step": 8450 }, { "epoch": 0.7672865138111572, "grad_norm": 8.138936996459961, "learning_rate": 2.0545636910732197e-05, "loss": 0.8625, "step": 8500 }, { "epoch": 0.7672865138111572, "eval_exact_match": 85.19394512771996, "eval_f1": 91.95119129750337, "eval_runtime": 143.5272, "eval_samples_per_second": 75.233, "eval_steps_per_second": 18.812, "step": 8500 }, { "epoch": 0.7717999638923994, "grad_norm": 3.691103935241699, "learning_rate": 2.0470411233701103e-05, "loss": 0.7947, "step": 8550 }, { "epoch": 0.7763134139736414, "grad_norm": 14.496338844299316, "learning_rate": 2.0395185556670012e-05, "loss": 0.8135, "step": 8600 }, { "epoch": 0.7808268640548836, "grad_norm": 6.248403072357178, "learning_rate": 2.0319959879638918e-05, "loss": 0.8594, "step": 8650 }, { "epoch": 0.7853403141361257, "grad_norm": 16.819801330566406, "learning_rate": 2.0244734202607824e-05, "loss": 0.8171, "step": 8700 }, { "epoch": 0.7898537642173677, "grad_norm": 7.9061079025268555, "learning_rate": 2.0169508525576733e-05, "loss": 0.6517, "step": 8750 }, { "epoch": 0.7898537642173677, "eval_exact_match": 85.6480605487228, "eval_f1": 91.95890910573651, "eval_runtime": 143.4989, "eval_samples_per_second": 75.248, "eval_steps_per_second": 18.815, "step": 8750 }, { "epoch": 0.7943672142986099, "grad_norm": 12.129390716552734, "learning_rate": 2.009428284854564e-05, "loss": 0.7358, "step": 8800 }, { "epoch": 0.798880664379852, "grad_norm": 7.113585472106934, "learning_rate": 2.0019057171514545e-05, "loss": 0.8517, "step": 8850 }, { "epoch": 0.8033941144610941, "grad_norm": 10.407898902893066, "learning_rate": 1.994383149448345e-05, "loss": 0.733, "step": 8900 }, { "epoch": 0.8079075645423361, "grad_norm": 15.745281219482422, "learning_rate": 1.9868605817452356e-05, "loss": 0.693, "step": 8950 }, { "epoch": 0.8124210146235783, "grad_norm": 6.876597881317139, "learning_rate": 1.9793380140421262e-05, "loss": 0.817, "step": 9000 }, { "epoch": 0.8124210146235783, "eval_exact_match": 84.87228003784296, "eval_f1": 91.5110749584356, "eval_runtime": 143.5544, "eval_samples_per_second": 75.219, "eval_steps_per_second": 18.808, "step": 9000 }, { "epoch": 0.8169344647048203, "grad_norm": 7.4037065505981445, "learning_rate": 1.971815446339017e-05, "loss": 0.8677, "step": 9050 }, { "epoch": 0.8214479147860625, "grad_norm": 4.559969902038574, "learning_rate": 1.9642928786359077e-05, "loss": 0.7798, "step": 9100 }, { "epoch": 0.8259613648673045, "grad_norm": 7.184974670410156, "learning_rate": 1.9567703109327983e-05, "loss": 0.7705, "step": 9150 }, { "epoch": 0.8304748149485467, "grad_norm": 8.206283569335938, "learning_rate": 1.9492477432296892e-05, "loss": 0.8398, "step": 9200 }, { "epoch": 0.8349882650297887, "grad_norm": 7.29602575302124, "learning_rate": 1.9417251755265798e-05, "loss": 0.7574, "step": 9250 }, { "epoch": 0.8349882650297887, "eval_exact_match": 85.06149479659413, "eval_f1": 91.53106503540634, "eval_runtime": 143.5779, "eval_samples_per_second": 75.207, "eval_steps_per_second": 18.805, "step": 9250 }, { "epoch": 0.8395017151110309, "grad_norm": 10.167183876037598, "learning_rate": 1.9342026078234704e-05, "loss": 0.7103, "step": 9300 }, { "epoch": 0.8440151651922729, "grad_norm": 6.271793365478516, "learning_rate": 1.9266800401203613e-05, "loss": 0.8364, "step": 9350 }, { "epoch": 0.8485286152735151, "grad_norm": 11.07026481628418, "learning_rate": 1.919157472417252e-05, "loss": 0.726, "step": 9400 }, { "epoch": 0.8530420653547571, "grad_norm": 5.571475028991699, "learning_rate": 1.9116349047141425e-05, "loss": 0.7205, "step": 9450 }, { "epoch": 0.8575555154359993, "grad_norm": 3.4866223335266113, "learning_rate": 1.9041123370110334e-05, "loss": 0.7832, "step": 9500 }, { "epoch": 0.8575555154359993, "eval_exact_match": 85.44938505203406, "eval_f1": 91.84507576310226, "eval_runtime": 143.5379, "eval_samples_per_second": 75.228, "eval_steps_per_second": 18.81, "step": 9500 }, { "epoch": 0.8620689655172413, "grad_norm": 2.2408883571624756, "learning_rate": 1.896589769307924e-05, "loss": 0.7533, "step": 9550 }, { "epoch": 0.8665824155984835, "grad_norm": 13.415377616882324, "learning_rate": 1.8890672016048142e-05, "loss": 0.7669, "step": 9600 }, { "epoch": 0.8710958656797256, "grad_norm": 4.730581760406494, "learning_rate": 1.881544633901705e-05, "loss": 0.7468, "step": 9650 }, { "epoch": 0.8756093157609677, "grad_norm": 6.725691318511963, "learning_rate": 1.8740220661985957e-05, "loss": 0.7426, "step": 9700 }, { "epoch": 0.8801227658422098, "grad_norm": 8.169360160827637, "learning_rate": 1.8664994984954863e-05, "loss": 0.8436, "step": 9750 }, { "epoch": 0.8801227658422098, "eval_exact_match": 84.88174077578051, "eval_f1": 91.83275837323971, "eval_runtime": 143.4694, "eval_samples_per_second": 75.263, "eval_steps_per_second": 18.819, "step": 9750 }, { "epoch": 0.8846362159234519, "grad_norm": 1.717469334602356, "learning_rate": 1.8589769307923772e-05, "loss": 0.7889, "step": 9800 }, { "epoch": 0.889149666004694, "grad_norm": 20.31835174560547, "learning_rate": 1.8514543630892678e-05, "loss": 0.7648, "step": 9850 }, { "epoch": 0.8936631160859361, "grad_norm": 15.77481746673584, "learning_rate": 1.8439317953861584e-05, "loss": 0.7259, "step": 9900 }, { "epoch": 0.8981765661671782, "grad_norm": 3.87709641456604, "learning_rate": 1.8364092276830493e-05, "loss": 0.7866, "step": 9950 }, { "epoch": 0.9026900162484203, "grad_norm": 8.835536003112793, "learning_rate": 1.82888665997994e-05, "loss": 0.6775, "step": 10000 }, { "epoch": 0.9026900162484203, "eval_exact_match": 85.99810785241249, "eval_f1": 91.99639894905705, "eval_runtime": 143.5348, "eval_samples_per_second": 75.229, "eval_steps_per_second": 18.811, "step": 10000 }, { "epoch": 0.9072034663296624, "grad_norm": 12.733137130737305, "learning_rate": 1.8213640922768305e-05, "loss": 0.7703, "step": 10050 }, { "epoch": 0.9117169164109045, "grad_norm": 12.40443229675293, "learning_rate": 1.8138415245737214e-05, "loss": 0.7619, "step": 10100 }, { "epoch": 0.9162303664921466, "grad_norm": 6.346498012542725, "learning_rate": 1.806318956870612e-05, "loss": 0.7605, "step": 10150 }, { "epoch": 0.9207438165733887, "grad_norm": 5.372687816619873, "learning_rate": 1.7987963891675026e-05, "loss": 0.6591, "step": 10200 }, { "epoch": 0.9252572666546308, "grad_norm": 4.377304553985596, "learning_rate": 1.7912738214643932e-05, "loss": 0.8404, "step": 10250 }, { "epoch": 0.9252572666546308, "eval_exact_match": 85.89403973509934, "eval_f1": 92.11360231029698, "eval_runtime": 144.2084, "eval_samples_per_second": 74.878, "eval_steps_per_second": 18.723, "step": 10250 }, { "epoch": 0.9297707167358729, "grad_norm": 11.62856388092041, "learning_rate": 1.7837512537612838e-05, "loss": 0.7712, "step": 10300 }, { "epoch": 0.934284166817115, "grad_norm": 5.723257541656494, "learning_rate": 1.7762286860581743e-05, "loss": 0.7171, "step": 10350 }, { "epoch": 0.9387976168983571, "grad_norm": 6.060873031616211, "learning_rate": 1.7687061183550653e-05, "loss": 0.7324, "step": 10400 }, { "epoch": 0.9433110669795992, "grad_norm": 4.51533842086792, "learning_rate": 1.761183550651956e-05, "loss": 0.7633, "step": 10450 }, { "epoch": 0.9478245170608413, "grad_norm": 11.809548377990723, "learning_rate": 1.7536609829488464e-05, "loss": 0.8111, "step": 10500 }, { "epoch": 0.9478245170608413, "eval_exact_match": 85.09933774834437, "eval_f1": 91.93045017438146, "eval_runtime": 143.4235, "eval_samples_per_second": 75.288, "eval_steps_per_second": 18.825, "step": 10500 }, { "epoch": 0.9523379671420834, "grad_norm": 15.76356029510498, "learning_rate": 1.7461384152457374e-05, "loss": 0.7013, "step": 10550 }, { "epoch": 0.9568514172233256, "grad_norm": 23.272687911987305, "learning_rate": 1.738615847542628e-05, "loss": 0.8229, "step": 10600 }, { "epoch": 0.9613648673045676, "grad_norm": 16.758358001708984, "learning_rate": 1.7310932798395185e-05, "loss": 0.7479, "step": 10650 }, { "epoch": 0.9658783173858098, "grad_norm": 14.670035362243652, "learning_rate": 1.7235707121364094e-05, "loss": 0.8, "step": 10700 }, { "epoch": 0.9703917674670518, "grad_norm": 5.1286821365356445, "learning_rate": 1.7160481444333e-05, "loss": 0.8522, "step": 10750 }, { "epoch": 0.9703917674670518, "eval_exact_match": 85.07095553453169, "eval_f1": 92.09316544794538, "eval_runtime": 143.6698, "eval_samples_per_second": 75.158, "eval_steps_per_second": 18.793, "step": 10750 }, { "epoch": 0.974905217548294, "grad_norm": 8.131464004516602, "learning_rate": 1.7085255767301906e-05, "loss": 0.7949, "step": 10800 }, { "epoch": 0.979418667629536, "grad_norm": 22.16661834716797, "learning_rate": 1.7010030090270815e-05, "loss": 0.7486, "step": 10850 }, { "epoch": 0.9839321177107782, "grad_norm": 5.554388046264648, "learning_rate": 1.693480441323972e-05, "loss": 0.7604, "step": 10900 }, { "epoch": 0.9884455677920202, "grad_norm": 8.525761604309082, "learning_rate": 1.6859578736208624e-05, "loss": 0.766, "step": 10950 }, { "epoch": 0.9929590178732624, "grad_norm": 10.504690170288086, "learning_rate": 1.6784353059177533e-05, "loss": 0.7166, "step": 11000 }, { "epoch": 0.9929590178732624, "eval_exact_match": 85.04257332071901, "eval_f1": 92.02241474371678, "eval_runtime": 143.4783, "eval_samples_per_second": 75.259, "eval_steps_per_second": 18.818, "step": 11000 }, { "epoch": 0.9974724679545044, "grad_norm": 7.378440856933594, "learning_rate": 1.670912738214644e-05, "loss": 0.7187, "step": 11050 }, { "epoch": 1.0019859180357464, "grad_norm": 3.172842502593994, "learning_rate": 1.6633901705115345e-05, "loss": 0.7532, "step": 11100 }, { "epoch": 1.0064993681169887, "grad_norm": 2.3299856185913086, "learning_rate": 1.6558676028084254e-05, "loss": 0.497, "step": 11150 }, { "epoch": 1.0110128181982307, "grad_norm": 8.6509428024292, "learning_rate": 1.648345035105316e-05, "loss": 0.4497, "step": 11200 }, { "epoch": 1.0155262682794728, "grad_norm": 9.68758773803711, "learning_rate": 1.6408224674022065e-05, "loss": 0.6154, "step": 11250 }, { "epoch": 1.0155262682794728, "eval_exact_match": 86.08325449385052, "eval_f1": 92.38528194318762, "eval_runtime": 143.3576, "eval_samples_per_second": 75.322, "eval_steps_per_second": 18.834, "step": 11250 }, { "epoch": 1.0200397183607148, "grad_norm": 3.34212064743042, "learning_rate": 1.6332998996990975e-05, "loss": 0.5372, "step": 11300 }, { "epoch": 1.024553168441957, "grad_norm": 5.384337425231934, "learning_rate": 1.625777331995988e-05, "loss": 0.5464, "step": 11350 }, { "epoch": 1.0290666185231991, "grad_norm": 19.279573440551758, "learning_rate": 1.6182547642928786e-05, "loss": 0.5558, "step": 11400 }, { "epoch": 1.0335800686044412, "grad_norm": 5.5248308181762695, "learning_rate": 1.6107321965897696e-05, "loss": 0.4981, "step": 11450 }, { "epoch": 1.0380935186856832, "grad_norm": 5.657703399658203, "learning_rate": 1.60320962888666e-05, "loss": 0.6565, "step": 11500 }, { "epoch": 1.0380935186856832, "eval_exact_match": 85.58183538315988, "eval_f1": 92.11318103014378, "eval_runtime": 143.5023, "eval_samples_per_second": 75.246, "eval_steps_per_second": 18.815, "step": 11500 }, { "epoch": 1.0426069687669255, "grad_norm": 6.387887954711914, "learning_rate": 1.5956870611835507e-05, "loss": 0.588, "step": 11550 }, { "epoch": 1.0471204188481675, "grad_norm": 1.7305879592895508, "learning_rate": 1.5881644934804413e-05, "loss": 0.5747, "step": 11600 }, { "epoch": 1.0516338689294096, "grad_norm": 14.716680526733398, "learning_rate": 1.580641925777332e-05, "loss": 0.5632, "step": 11650 }, { "epoch": 1.0561473190106518, "grad_norm": 9.127685546875, "learning_rate": 1.5731193580742225e-05, "loss": 0.4897, "step": 11700 }, { "epoch": 1.0606607690918939, "grad_norm": 8.541461944580078, "learning_rate": 1.5655967903711134e-05, "loss": 0.544, "step": 11750 }, { "epoch": 1.0606607690918939, "eval_exact_match": 86.16840113528855, "eval_f1": 92.442978713336, "eval_runtime": 143.2851, "eval_samples_per_second": 75.36, "eval_steps_per_second": 18.844, "step": 11750 }, { "epoch": 1.065174219173136, "grad_norm": 6.538851737976074, "learning_rate": 1.558074222668004e-05, "loss": 0.5202, "step": 11800 }, { "epoch": 1.069687669254378, "grad_norm": 7.314679145812988, "learning_rate": 1.5505516549648946e-05, "loss": 0.54, "step": 11850 }, { "epoch": 1.0742011193356202, "grad_norm": 2.3385446071624756, "learning_rate": 1.5430290872617855e-05, "loss": 0.5192, "step": 11900 }, { "epoch": 1.0787145694168623, "grad_norm": 26.518877029418945, "learning_rate": 1.535506519558676e-05, "loss": 0.5435, "step": 11950 }, { "epoch": 1.0832280194981043, "grad_norm": 39.6591682434082, "learning_rate": 1.5279839518555667e-05, "loss": 0.5149, "step": 12000 }, { "epoch": 1.0832280194981043, "eval_exact_match": 85.80889309366131, "eval_f1": 92.20769990556119, "eval_runtime": 143.6087, "eval_samples_per_second": 75.19, "eval_steps_per_second": 18.801, "step": 12000 }, { "epoch": 1.0877414695793464, "grad_norm": 11.38036823272705, "learning_rate": 1.5204613841524576e-05, "loss": 0.526, "step": 12050 }, { "epoch": 1.0922549196605886, "grad_norm": 21.02750587463379, "learning_rate": 1.512938816449348e-05, "loss": 0.471, "step": 12100 }, { "epoch": 1.0967683697418307, "grad_norm": 23.10146713256836, "learning_rate": 1.5054162487462386e-05, "loss": 0.6643, "step": 12150 }, { "epoch": 1.1012818198230727, "grad_norm": 29.241615295410156, "learning_rate": 1.4978936810431293e-05, "loss": 0.5231, "step": 12200 }, { "epoch": 1.1057952699043148, "grad_norm": 3.3990285396575928, "learning_rate": 1.49037111334002e-05, "loss": 0.4051, "step": 12250 }, { "epoch": 1.1057952699043148, "eval_exact_match": 85.56291390728477, "eval_f1": 92.0566633980034, "eval_runtime": 143.183, "eval_samples_per_second": 75.414, "eval_steps_per_second": 18.857, "step": 12250 }, { "epoch": 1.110308719985557, "grad_norm": 3.075737237930298, "learning_rate": 1.4828485456369108e-05, "loss": 0.5637, "step": 12300 }, { "epoch": 1.114822170066799, "grad_norm": 2.9517650604248047, "learning_rate": 1.4753259779338014e-05, "loss": 0.566, "step": 12350 }, { "epoch": 1.119335620148041, "grad_norm": 23.89853858947754, "learning_rate": 1.4678034102306922e-05, "loss": 0.4439, "step": 12400 }, { "epoch": 1.1238490702292832, "grad_norm": 8.486159324645996, "learning_rate": 1.4602808425275828e-05, "loss": 0.5538, "step": 12450 }, { "epoch": 1.1283625203105254, "grad_norm": 3.5648648738861084, "learning_rate": 1.4527582748244733e-05, "loss": 0.5173, "step": 12500 }, { "epoch": 1.1283625203105254, "eval_exact_match": 85.4872280037843, "eval_f1": 92.15147631309604, "eval_runtime": 143.2081, "eval_samples_per_second": 75.401, "eval_steps_per_second": 18.854, "step": 12500 }, { "epoch": 1.1328759703917675, "grad_norm": 10.259268760681152, "learning_rate": 1.4452357071213641e-05, "loss": 0.5925, "step": 12550 }, { "epoch": 1.1373894204730095, "grad_norm": 6.570536136627197, "learning_rate": 1.4377131394182548e-05, "loss": 0.4594, "step": 12600 }, { "epoch": 1.1419028705542518, "grad_norm": 6.687112808227539, "learning_rate": 1.4301905717151454e-05, "loss": 0.4994, "step": 12650 }, { "epoch": 1.1464163206354938, "grad_norm": 14.550410270690918, "learning_rate": 1.4226680040120362e-05, "loss": 0.5775, "step": 12700 }, { "epoch": 1.1509297707167359, "grad_norm": 12.998605728149414, "learning_rate": 1.4151454363089268e-05, "loss": 0.5285, "step": 12750 }, { "epoch": 1.1509297707167359, "eval_exact_match": 85.93188268684958, "eval_f1": 92.11716297833141, "eval_runtime": 143.1084, "eval_samples_per_second": 75.453, "eval_steps_per_second": 18.867, "step": 12750 }, { "epoch": 1.155443220797978, "grad_norm": 26.985210418701172, "learning_rate": 1.4076228686058175e-05, "loss": 0.5092, "step": 12800 }, { "epoch": 1.1599566708792202, "grad_norm": 15.445883750915527, "learning_rate": 1.4001003009027081e-05, "loss": 0.5178, "step": 12850 }, { "epoch": 1.1644701209604622, "grad_norm": 8.596466064453125, "learning_rate": 1.3925777331995989e-05, "loss": 0.5742, "step": 12900 }, { "epoch": 1.1689835710417043, "grad_norm": 3.9060676097869873, "learning_rate": 1.3850551654964896e-05, "loss": 0.5112, "step": 12950 }, { "epoch": 1.1734970211229463, "grad_norm": 2.3090436458587646, "learning_rate": 1.3775325977933802e-05, "loss": 0.4802, "step": 13000 }, { "epoch": 1.1734970211229463, "eval_exact_match": 86.3670766319773, "eval_f1": 92.39148643540621, "eval_runtime": 143.1331, "eval_samples_per_second": 75.44, "eval_steps_per_second": 18.864, "step": 13000 }, { "epoch": 1.1780104712041886, "grad_norm": 4.289682865142822, "learning_rate": 1.370010030090271e-05, "loss": 0.4555, "step": 13050 }, { "epoch": 1.1825239212854306, "grad_norm": 23.45159149169922, "learning_rate": 1.3624874623871615e-05, "loss": 0.6034, "step": 13100 }, { "epoch": 1.1870373713666726, "grad_norm": 14.170953750610352, "learning_rate": 1.3549648946840521e-05, "loss": 0.4946, "step": 13150 }, { "epoch": 1.191550821447915, "grad_norm": 7.408278942108154, "learning_rate": 1.3474423269809429e-05, "loss": 0.5625, "step": 13200 }, { "epoch": 1.196064271529157, "grad_norm": 4.187251567840576, "learning_rate": 1.3399197592778336e-05, "loss": 0.5344, "step": 13250 }, { "epoch": 1.196064271529157, "eval_exact_match": 85.66698202459791, "eval_f1": 92.29328625942796, "eval_runtime": 143.2334, "eval_samples_per_second": 75.387, "eval_steps_per_second": 18.85, "step": 13250 }, { "epoch": 1.200577721610399, "grad_norm": 9.739165306091309, "learning_rate": 1.3323971915747242e-05, "loss": 0.5319, "step": 13300 }, { "epoch": 1.205091171691641, "grad_norm": 3.0962629318237305, "learning_rate": 1.324874623871615e-05, "loss": 0.5455, "step": 13350 }, { "epoch": 1.209604621772883, "grad_norm": 10.260982513427734, "learning_rate": 1.3173520561685057e-05, "loss": 0.5922, "step": 13400 }, { "epoch": 1.2141180718541253, "grad_norm": 17.95406150817871, "learning_rate": 1.3098294884653961e-05, "loss": 0.6416, "step": 13450 }, { "epoch": 1.2186315219353674, "grad_norm": 9.253098487854004, "learning_rate": 1.3023069207622869e-05, "loss": 0.4543, "step": 13500 }, { "epoch": 1.2186315219353674, "eval_exact_match": 86.20624408703878, "eval_f1": 92.32664235875168, "eval_runtime": 143.2002, "eval_samples_per_second": 75.405, "eval_steps_per_second": 18.855, "step": 13500 }, { "epoch": 1.2231449720166094, "grad_norm": 9.202949523925781, "learning_rate": 1.2947843530591776e-05, "loss": 0.6569, "step": 13550 }, { "epoch": 1.2276584220978517, "grad_norm": 16.244760513305664, "learning_rate": 1.2872617853560682e-05, "loss": 0.5605, "step": 13600 }, { "epoch": 1.2321718721790937, "grad_norm": 2.6242430210113525, "learning_rate": 1.279739217652959e-05, "loss": 0.546, "step": 13650 }, { "epoch": 1.2366853222603358, "grad_norm": 19.960708618164062, "learning_rate": 1.2722166499498497e-05, "loss": 0.5916, "step": 13700 }, { "epoch": 1.2411987723415778, "grad_norm": 14.39201545715332, "learning_rate": 1.2646940822467401e-05, "loss": 0.4647, "step": 13750 }, { "epoch": 1.2411987723415778, "eval_exact_match": 86.27246925260171, "eval_f1": 92.45848778749898, "eval_runtime": 143.112, "eval_samples_per_second": 75.451, "eval_steps_per_second": 18.866, "step": 13750 }, { "epoch": 1.24571222242282, "grad_norm": 8.58752155303955, "learning_rate": 1.2571715145436309e-05, "loss": 0.5224, "step": 13800 }, { "epoch": 1.2502256725040621, "grad_norm": 5.419035911560059, "learning_rate": 1.2496489468405216e-05, "loss": 0.6743, "step": 13850 }, { "epoch": 1.2547391225853042, "grad_norm": 7.52559232711792, "learning_rate": 1.2421263791374122e-05, "loss": 0.5955, "step": 13900 }, { "epoch": 1.2592525726665462, "grad_norm": 15.449511528015137, "learning_rate": 1.234603811434303e-05, "loss": 0.5824, "step": 13950 }, { "epoch": 1.2637660227477885, "grad_norm": 11.266414642333984, "learning_rate": 1.2270812437311937e-05, "loss": 0.5101, "step": 14000 }, { "epoch": 1.2637660227477885, "eval_exact_match": 86.12109744560075, "eval_f1": 92.2146350604068, "eval_runtime": 145.9619, "eval_samples_per_second": 73.978, "eval_steps_per_second": 18.498, "step": 14000 }, { "epoch": 1.2682794728290305, "grad_norm": 7.292428970336914, "learning_rate": 1.2195586760280843e-05, "loss": 0.4962, "step": 14050 }, { "epoch": 1.2727929229102726, "grad_norm": 1.1534169912338257, "learning_rate": 1.2120361083249749e-05, "loss": 0.5691, "step": 14100 }, { "epoch": 1.2773063729915148, "grad_norm": 3.5983633995056152, "learning_rate": 1.2045135406218656e-05, "loss": 0.5114, "step": 14150 }, { "epoch": 1.2818198230727569, "grad_norm": 5.006545543670654, "learning_rate": 1.1969909729187562e-05, "loss": 0.5259, "step": 14200 }, { "epoch": 1.286333273153999, "grad_norm": 24.31420135498047, "learning_rate": 1.189468405215647e-05, "loss": 0.4771, "step": 14250 }, { "epoch": 1.286333273153999, "eval_exact_match": 86.59413434247871, "eval_f1": 92.49677313517446, "eval_runtime": 145.8932, "eval_samples_per_second": 74.013, "eval_steps_per_second": 18.507, "step": 14250 }, { "epoch": 1.290846723235241, "grad_norm": 8.447436332702637, "learning_rate": 1.1819458375125377e-05, "loss": 0.5835, "step": 14300 }, { "epoch": 1.295360173316483, "grad_norm": 23.178955078125, "learning_rate": 1.1744232698094283e-05, "loss": 0.5554, "step": 14350 }, { "epoch": 1.2998736233977253, "grad_norm": 16.500057220458984, "learning_rate": 1.166900702106319e-05, "loss": 0.4928, "step": 14400 }, { "epoch": 1.3043870734789673, "grad_norm": 8.389457702636719, "learning_rate": 1.1593781344032097e-05, "loss": 0.6872, "step": 14450 }, { "epoch": 1.3089005235602094, "grad_norm": 5.315954685211182, "learning_rate": 1.1518555667001002e-05, "loss": 0.5394, "step": 14500 }, { "epoch": 1.3089005235602094, "eval_exact_match": 86.45222327341533, "eval_f1": 92.58788732745475, "eval_runtime": 145.7277, "eval_samples_per_second": 74.097, "eval_steps_per_second": 18.528, "step": 14500 }, { "epoch": 1.3134139736414516, "grad_norm": 2.0151515007019043, "learning_rate": 1.144332998996991e-05, "loss": 0.4745, "step": 14550 }, { "epoch": 1.3179274237226937, "grad_norm": 6.995370864868164, "learning_rate": 1.1368104312938817e-05, "loss": 0.536, "step": 14600 }, { "epoch": 1.3224408738039357, "grad_norm": 4.453261852264404, "learning_rate": 1.1292878635907723e-05, "loss": 0.4909, "step": 14650 }, { "epoch": 1.3269543238851778, "grad_norm": 3.472259998321533, "learning_rate": 1.121765295887663e-05, "loss": 0.586, "step": 14700 }, { "epoch": 1.3314677739664198, "grad_norm": 15.908103942871094, "learning_rate": 1.1142427281845537e-05, "loss": 0.528, "step": 14750 }, { "epoch": 1.3314677739664198, "eval_exact_match": 85.86565752128666, "eval_f1": 92.39225966105154, "eval_runtime": 280.4149, "eval_samples_per_second": 38.507, "eval_steps_per_second": 9.629, "step": 14750 }, { "epoch": 1.335981224047662, "grad_norm": 12.543098449707031, "learning_rate": 1.1067201604814443e-05, "loss": 0.4912, "step": 14800 }, { "epoch": 1.340494674128904, "grad_norm": 24.144222259521484, "learning_rate": 1.099197592778335e-05, "loss": 0.5626, "step": 14850 }, { "epoch": 1.3450081242101461, "grad_norm": 25.347875595092773, "learning_rate": 1.0916750250752258e-05, "loss": 0.4955, "step": 14900 }, { "epoch": 1.3495215742913884, "grad_norm": 5.940708637237549, "learning_rate": 1.0841524573721163e-05, "loss": 0.4859, "step": 14950 }, { "epoch": 1.3540350243726305, "grad_norm": 30.9013671875, "learning_rate": 1.0766298896690071e-05, "loss": 0.4788, "step": 15000 }, { "epoch": 1.3540350243726305, "eval_exact_match": 86.6414380321665, "eval_f1": 92.62032707644155, "eval_runtime": 145.6846, "eval_samples_per_second": 74.119, "eval_steps_per_second": 18.533, "step": 15000 }, { "epoch": 1.3585484744538725, "grad_norm": 12.216713905334473, "learning_rate": 1.0691073219658978e-05, "loss": 0.4977, "step": 15050 }, { "epoch": 1.3630619245351148, "grad_norm": 40.2611083984375, "learning_rate": 1.0615847542627883e-05, "loss": 0.4972, "step": 15100 }, { "epoch": 1.3675753746163568, "grad_norm": 10.3711519241333, "learning_rate": 1.054062186559679e-05, "loss": 0.4955, "step": 15150 }, { "epoch": 1.3720888246975989, "grad_norm": 2.400322914123535, "learning_rate": 1.0465396188565698e-05, "loss": 0.4868, "step": 15200 }, { "epoch": 1.376602274778841, "grad_norm": 3.9988925457000732, "learning_rate": 1.0390170511534603e-05, "loss": 0.5888, "step": 15250 }, { "epoch": 1.376602274778841, "eval_exact_match": 85.96026490066225, "eval_f1": 92.33197764854948, "eval_runtime": 145.4185, "eval_samples_per_second": 74.255, "eval_steps_per_second": 18.567, "step": 15250 }, { "epoch": 1.381115724860083, "grad_norm": 1.6575514078140259, "learning_rate": 1.0314944834503511e-05, "loss": 0.5143, "step": 15300 }, { "epoch": 1.3856291749413252, "grad_norm": 5.943323612213135, "learning_rate": 1.0239719157472419e-05, "loss": 0.4725, "step": 15350 }, { "epoch": 1.3901426250225672, "grad_norm": 21.014570236206055, "learning_rate": 1.0164493480441324e-05, "loss": 0.5131, "step": 15400 }, { "epoch": 1.3946560751038093, "grad_norm": 4.148115634918213, "learning_rate": 1.008926780341023e-05, "loss": 0.4323, "step": 15450 }, { "epoch": 1.3991695251850516, "grad_norm": 8.95993423461914, "learning_rate": 1.0014042126379138e-05, "loss": 0.6072, "step": 15500 }, { "epoch": 1.3991695251850516, "eval_exact_match": 85.76158940397352, "eval_f1": 92.24988076673156, "eval_runtime": 145.3259, "eval_samples_per_second": 74.302, "eval_steps_per_second": 18.579, "step": 15500 }, { "epoch": 1.4036829752662936, "grad_norm": 5.6876959800720215, "learning_rate": 9.938816449348044e-06, "loss": 0.5692, "step": 15550 }, { "epoch": 1.4081964253475356, "grad_norm": 6.91029167175293, "learning_rate": 9.863590772316951e-06, "loss": 0.5801, "step": 15600 }, { "epoch": 1.4127098754287777, "grad_norm": 8.116116523742676, "learning_rate": 9.788365095285859e-06, "loss": 0.4394, "step": 15650 }, { "epoch": 1.4172233255100197, "grad_norm": 7.001738548278809, "learning_rate": 9.713139418254764e-06, "loss": 0.5607, "step": 15700 }, { "epoch": 1.421736775591262, "grad_norm": 21.804443359375, "learning_rate": 9.637913741223672e-06, "loss": 0.5191, "step": 15750 }, { "epoch": 1.421736775591262, "eval_exact_match": 85.97918637653737, "eval_f1": 92.37275066667881, "eval_runtime": 145.3503, "eval_samples_per_second": 74.289, "eval_steps_per_second": 18.576, "step": 15750 }, { "epoch": 1.426250225672504, "grad_norm": 11.133319854736328, "learning_rate": 9.562688064192578e-06, "loss": 0.5256, "step": 15800 }, { "epoch": 1.430763675753746, "grad_norm": 5.4904632568359375, "learning_rate": 9.487462387161484e-06, "loss": 0.5177, "step": 15850 }, { "epoch": 1.4352771258349883, "grad_norm": 9.791414260864258, "learning_rate": 9.412236710130391e-06, "loss": 0.5814, "step": 15900 }, { "epoch": 1.4397905759162304, "grad_norm": 3.3400447368621826, "learning_rate": 9.337011033099299e-06, "loss": 0.5374, "step": 15950 }, { "epoch": 1.4443040259974724, "grad_norm": 23.98038673400879, "learning_rate": 9.261785356068205e-06, "loss": 0.4973, "step": 16000 }, { "epoch": 1.4443040259974724, "eval_exact_match": 84.85335856196784, "eval_f1": 92.03377983249271, "eval_runtime": 145.2676, "eval_samples_per_second": 74.332, "eval_steps_per_second": 18.586, "step": 16000 }, { "epoch": 1.4488174760787147, "grad_norm": 9.59720516204834, "learning_rate": 9.186559679037112e-06, "loss": 0.534, "step": 16050 }, { "epoch": 1.4533309261599567, "grad_norm": 10.079476356506348, "learning_rate": 9.111334002006018e-06, "loss": 0.511, "step": 16100 }, { "epoch": 1.4578443762411988, "grad_norm": 3.377192497253418, "learning_rate": 9.036108324974924e-06, "loss": 0.5187, "step": 16150 }, { "epoch": 1.4623578263224408, "grad_norm": 10.79287052154541, "learning_rate": 8.960882647943831e-06, "loss": 0.5499, "step": 16200 }, { "epoch": 1.4668712764036829, "grad_norm": 15.751055717468262, "learning_rate": 8.885656970912739e-06, "loss": 0.536, "step": 16250 }, { "epoch": 1.4668712764036829, "eval_exact_match": 86.40491958372753, "eval_f1": 92.48354485469106, "eval_runtime": 145.2653, "eval_samples_per_second": 74.333, "eval_steps_per_second": 18.587, "step": 16250 }, { "epoch": 1.4713847264849251, "grad_norm": 3.9903676509857178, "learning_rate": 8.810431293881645e-06, "loss": 0.5385, "step": 16300 }, { "epoch": 1.4758981765661672, "grad_norm": 6.5839080810546875, "learning_rate": 8.735205616850552e-06, "loss": 0.5092, "step": 16350 }, { "epoch": 1.4804116266474092, "grad_norm": 13.69189453125, "learning_rate": 8.65997993981946e-06, "loss": 0.5999, "step": 16400 }, { "epoch": 1.4849250767286515, "grad_norm": 11.840332984924316, "learning_rate": 8.584754262788364e-06, "loss": 0.57, "step": 16450 }, { "epoch": 1.4894385268098935, "grad_norm": 11.86502742767334, "learning_rate": 8.509528585757271e-06, "loss": 0.4635, "step": 16500 }, { "epoch": 1.4894385268098935, "eval_exact_match": 85.49668874172185, "eval_f1": 92.28753097702375, "eval_runtime": 145.2047, "eval_samples_per_second": 74.364, "eval_steps_per_second": 18.594, "step": 16500 }, { "epoch": 1.4939519768911356, "grad_norm": 17.741037368774414, "learning_rate": 8.434302908726179e-06, "loss": 0.53, "step": 16550 }, { "epoch": 1.4984654269723776, "grad_norm": 7.774323463439941, "learning_rate": 8.359077231695085e-06, "loss": 0.575, "step": 16600 }, { "epoch": 1.5029788770536197, "grad_norm": 4.973544597625732, "learning_rate": 8.283851554663992e-06, "loss": 0.6468, "step": 16650 }, { "epoch": 1.507492327134862, "grad_norm": 5.228555202484131, "learning_rate": 8.2086258776329e-06, "loss": 0.4901, "step": 16700 }, { "epoch": 1.512005777216104, "grad_norm": 3.4082319736480713, "learning_rate": 8.133400200601806e-06, "loss": 0.5492, "step": 16750 }, { "epoch": 1.512005777216104, "eval_exact_match": 86.10217596972564, "eval_f1": 92.39499925755595, "eval_runtime": 145.1347, "eval_samples_per_second": 74.4, "eval_steps_per_second": 18.603, "step": 16750 }, { "epoch": 1.516519227297346, "grad_norm": 20.74472427368164, "learning_rate": 8.058174523570712e-06, "loss": 0.4685, "step": 16800 }, { "epoch": 1.5210326773785883, "grad_norm": 4.805381774902344, "learning_rate": 7.982948846539619e-06, "loss": 0.474, "step": 16850 }, { "epoch": 1.5255461274598303, "grad_norm": 1.6686218976974487, "learning_rate": 7.907723169508525e-06, "loss": 0.5617, "step": 16900 }, { "epoch": 1.5300595775410724, "grad_norm": 14.378780364990234, "learning_rate": 7.832497492477432e-06, "loss": 0.4829, "step": 16950 }, { "epoch": 1.5345730276223146, "grad_norm": 9.25706672668457, "learning_rate": 7.75727181544634e-06, "loss": 0.5525, "step": 17000 }, { "epoch": 1.5345730276223146, "eval_exact_match": 85.89403973509934, "eval_f1": 92.2175102581889, "eval_runtime": 145.4454, "eval_samples_per_second": 74.241, "eval_steps_per_second": 18.564, "step": 17000 }, { "epoch": 1.5390864777035564, "grad_norm": 10.210553169250488, "learning_rate": 7.682046138415246e-06, "loss": 0.5284, "step": 17050 }, { "epoch": 1.5435999277847987, "grad_norm": 18.55254364013672, "learning_rate": 7.6068204613841525e-06, "loss": 0.4863, "step": 17100 }, { "epoch": 1.5481133778660408, "grad_norm": 9.640850067138672, "learning_rate": 7.53159478435306e-06, "loss": 0.6163, "step": 17150 }, { "epoch": 1.5526268279472828, "grad_norm": 7.999804496765137, "learning_rate": 7.456369107321966e-06, "loss": 0.518, "step": 17200 }, { "epoch": 1.557140278028525, "grad_norm": 9.162345886230469, "learning_rate": 7.3811434302908725e-06, "loss": 0.5001, "step": 17250 }, { "epoch": 1.557140278028525, "eval_exact_match": 85.9035004730369, "eval_f1": 92.33731500742522, "eval_runtime": 145.3659, "eval_samples_per_second": 74.282, "eval_steps_per_second": 18.574, "step": 17250 }, { "epoch": 1.561653728109767, "grad_norm": 5.462348461151123, "learning_rate": 7.30591775325978e-06, "loss": 0.4179, "step": 17300 }, { "epoch": 1.5661671781910091, "grad_norm": 17.67523765563965, "learning_rate": 7.230692076228686e-06, "loss": 0.4662, "step": 17350 }, { "epoch": 1.5706806282722514, "grad_norm": 4.397737503051758, "learning_rate": 7.155466399197593e-06, "loss": 0.4614, "step": 17400 }, { "epoch": 1.5751940783534935, "grad_norm": 7.665886402130127, "learning_rate": 7.0802407221665e-06, "loss": 0.5263, "step": 17450 }, { "epoch": 1.5797075284347355, "grad_norm": 10.627632141113281, "learning_rate": 7.005015045135407e-06, "loss": 0.4021, "step": 17500 }, { "epoch": 1.5797075284347355, "eval_exact_match": 85.98864711447493, "eval_f1": 92.40008411758966, "eval_runtime": 145.3304, "eval_samples_per_second": 74.3, "eval_steps_per_second": 18.578, "step": 17500 }, { "epoch": 1.5842209785159778, "grad_norm": 7.057243347167969, "learning_rate": 6.929789368104313e-06, "loss": 0.5326, "step": 17550 }, { "epoch": 1.5887344285972196, "grad_norm": 8.216778755187988, "learning_rate": 6.85456369107322e-06, "loss": 0.6029, "step": 17600 }, { "epoch": 1.5932478786784618, "grad_norm": 3.943422794342041, "learning_rate": 6.779338014042127e-06, "loss": 0.5518, "step": 17650 }, { "epoch": 1.597761328759704, "grad_norm": 12.350107192993164, "learning_rate": 6.704112337011033e-06, "loss": 0.5368, "step": 17700 }, { "epoch": 1.602274778840946, "grad_norm": 6.516546249389648, "learning_rate": 6.62888665997994e-06, "loss": 0.504, "step": 17750 }, { "epoch": 1.602274778840946, "eval_exact_match": 86.12109744560075, "eval_f1": 92.37865780721518, "eval_runtime": 145.3649, "eval_samples_per_second": 74.282, "eval_steps_per_second": 18.574, "step": 17750 }, { "epoch": 1.6067882289221882, "grad_norm": 3.5462801456451416, "learning_rate": 6.553660982948847e-06, "loss": 0.4613, "step": 17800 }, { "epoch": 1.6113016790034302, "grad_norm": 9.32242488861084, "learning_rate": 6.478435305917753e-06, "loss": 0.5149, "step": 17850 }, { "epoch": 1.6158151290846723, "grad_norm": 4.5879597663879395, "learning_rate": 6.40320962888666e-06, "loss": 0.4189, "step": 17900 }, { "epoch": 1.6203285791659146, "grad_norm": 10.474478721618652, "learning_rate": 6.327983951855567e-06, "loss": 0.5182, "step": 17950 }, { "epoch": 1.6248420292471564, "grad_norm": 4.693137168884277, "learning_rate": 6.252758274824474e-06, "loss": 0.6212, "step": 18000 }, { "epoch": 1.6248420292471564, "eval_exact_match": 85.93188268684958, "eval_f1": 92.33148266916612, "eval_runtime": 145.2483, "eval_samples_per_second": 74.342, "eval_steps_per_second": 18.589, "step": 18000 }, { "epoch": 1.6293554793283986, "grad_norm": 20.36956214904785, "learning_rate": 6.17753259779338e-06, "loss": 0.6194, "step": 18050 }, { "epoch": 1.6338689294096407, "grad_norm": 8.450358390808105, "learning_rate": 6.102306920762287e-06, "loss": 0.518, "step": 18100 }, { "epoch": 1.6383823794908827, "grad_norm": 4.681309223175049, "learning_rate": 6.027081243731194e-06, "loss": 0.4853, "step": 18150 }, { "epoch": 1.642895829572125, "grad_norm": 6.480415344238281, "learning_rate": 5.9518555667001e-06, "loss": 0.5101, "step": 18200 }, { "epoch": 1.647409279653367, "grad_norm": 11.254326820373535, "learning_rate": 5.876629889669007e-06, "loss": 0.5095, "step": 18250 }, { "epoch": 1.647409279653367, "eval_exact_match": 86.03595080416272, "eval_f1": 92.38608711058883, "eval_runtime": 145.2161, "eval_samples_per_second": 74.358, "eval_steps_per_second": 18.593, "step": 18250 }, { "epoch": 1.651922729734609, "grad_norm": 7.776529788970947, "learning_rate": 5.801404212637914e-06, "loss": 0.5322, "step": 18300 }, { "epoch": 1.6564361798158513, "grad_norm": 17.068607330322266, "learning_rate": 5.72617853560682e-06, "loss": 0.4755, "step": 18350 }, { "epoch": 1.6609496298970934, "grad_norm": 20.472034454345703, "learning_rate": 5.650952858575727e-06, "loss": 0.6017, "step": 18400 }, { "epoch": 1.6654630799783354, "grad_norm": 4.904719352722168, "learning_rate": 5.575727181544634e-06, "loss": 0.4662, "step": 18450 }, { "epoch": 1.6699765300595777, "grad_norm": 7.219258785247803, "learning_rate": 5.500501504513541e-06, "loss": 0.4781, "step": 18500 }, { "epoch": 1.6699765300595777, "eval_exact_match": 85.55345316934721, "eval_f1": 92.32101574735367, "eval_runtime": 145.1791, "eval_samples_per_second": 74.377, "eval_steps_per_second": 18.598, "step": 18500 }, { "epoch": 1.6744899801408195, "grad_norm": 8.811306953430176, "learning_rate": 5.425275827482447e-06, "loss": 0.5161, "step": 18550 }, { "epoch": 1.6790034302220618, "grad_norm": 4.903675079345703, "learning_rate": 5.350050150451354e-06, "loss": 0.5215, "step": 18600 }, { "epoch": 1.6835168803033038, "grad_norm": 2.371656894683838, "learning_rate": 5.274824473420261e-06, "loss": 0.486, "step": 18650 }, { "epoch": 1.6880303303845459, "grad_norm": 8.991338729858398, "learning_rate": 5.199598796389167e-06, "loss": 0.5043, "step": 18700 }, { "epoch": 1.6925437804657881, "grad_norm": 5.460509777069092, "learning_rate": 5.124373119358074e-06, "loss": 0.5344, "step": 18750 }, { "epoch": 1.6925437804657881, "eval_exact_match": 86.02649006622516, "eval_f1": 92.44635703301584, "eval_runtime": 145.2855, "eval_samples_per_second": 74.323, "eval_steps_per_second": 18.584, "step": 18750 }, { "epoch": 1.6970572305470302, "grad_norm": 6.287936210632324, "learning_rate": 5.049147442326981e-06, "loss": 0.4446, "step": 18800 }, { "epoch": 1.7015706806282722, "grad_norm": 2.3766534328460693, "learning_rate": 4.973921765295887e-06, "loss": 0.4618, "step": 18850 }, { "epoch": 1.7060841307095145, "grad_norm": 6.606088161468506, "learning_rate": 4.898696088264794e-06, "loss": 0.498, "step": 18900 }, { "epoch": 1.7105975807907563, "grad_norm": 7.917613506317139, "learning_rate": 4.8234704112337015e-06, "loss": 0.473, "step": 18950 }, { "epoch": 1.7151110308719986, "grad_norm": 13.437002182006836, "learning_rate": 4.748244734202608e-06, "loss": 0.5217, "step": 19000 }, { "epoch": 1.7151110308719986, "eval_exact_match": 86.2251655629139, "eval_f1": 92.51045927542914, "eval_runtime": 145.2525, "eval_samples_per_second": 74.34, "eval_steps_per_second": 18.588, "step": 19000 }, { "epoch": 1.7196244809532406, "grad_norm": 21.756275177001953, "learning_rate": 4.673019057171515e-06, "loss": 0.5129, "step": 19050 }, { "epoch": 1.7241379310344827, "grad_norm": 4.2581377029418945, "learning_rate": 4.5977933801404215e-06, "loss": 0.5202, "step": 19100 }, { "epoch": 1.728651381115725, "grad_norm": 9.396230697631836, "learning_rate": 4.522567703109328e-06, "loss": 0.5118, "step": 19150 }, { "epoch": 1.733164831196967, "grad_norm": 9.545235633850098, "learning_rate": 4.447342026078235e-06, "loss": 0.4611, "step": 19200 }, { "epoch": 1.737678281278209, "grad_norm": 4.104794502258301, "learning_rate": 4.3721163490471416e-06, "loss": 0.5737, "step": 19250 }, { "epoch": 1.737678281278209, "eval_exact_match": 86.23462630085146, "eval_f1": 92.57135940815057, "eval_runtime": 145.5739, "eval_samples_per_second": 74.175, "eval_steps_per_second": 18.547, "step": 19250 }, { "epoch": 1.7421917313594513, "grad_norm": 8.624117851257324, "learning_rate": 4.296890672016048e-06, "loss": 0.5349, "step": 19300 }, { "epoch": 1.7467051814406933, "grad_norm": 4.802499771118164, "learning_rate": 4.221664994984955e-06, "loss": 0.5332, "step": 19350 }, { "epoch": 1.7512186315219354, "grad_norm": 4.347715854644775, "learning_rate": 4.146439317953862e-06, "loss": 0.474, "step": 19400 }, { "epoch": 1.7557320816031776, "grad_norm": 21.51348114013672, "learning_rate": 4.071213640922768e-06, "loss": 0.5182, "step": 19450 }, { "epoch": 1.7602455316844194, "grad_norm": 3.002976655960083, "learning_rate": 3.995987963891676e-06, "loss": 0.4243, "step": 19500 }, { "epoch": 1.7602455316844194, "eval_exact_match": 86.44276253547777, "eval_f1": 92.67282094003843, "eval_runtime": 145.268, "eval_samples_per_second": 74.332, "eval_steps_per_second": 18.586, "step": 19500 }, { "epoch": 1.7647589817656617, "grad_norm": 6.619145393371582, "learning_rate": 3.920762286860582e-06, "loss": 0.5754, "step": 19550 }, { "epoch": 1.7692724318469037, "grad_norm": 8.654962539672852, "learning_rate": 3.845536609829488e-06, "loss": 0.4927, "step": 19600 }, { "epoch": 1.7737858819281458, "grad_norm": 2.102865695953369, "learning_rate": 3.7703109327983955e-06, "loss": 0.417, "step": 19650 }, { "epoch": 1.778299332009388, "grad_norm": 9.824490547180176, "learning_rate": 3.695085255767302e-06, "loss": 0.5777, "step": 19700 }, { "epoch": 1.78281278209063, "grad_norm": 22.286598205566406, "learning_rate": 3.619859578736209e-06, "loss": 0.4338, "step": 19750 }, { "epoch": 1.78281278209063, "eval_exact_match": 86.39545884578997, "eval_f1": 92.63887659164942, "eval_runtime": 145.3006, "eval_samples_per_second": 74.315, "eval_steps_per_second": 18.582, "step": 19750 }, { "epoch": 1.7873262321718721, "grad_norm": 0.6572410464286804, "learning_rate": 3.5446339017051155e-06, "loss": 0.4549, "step": 19800 }, { "epoch": 1.7918396822531144, "grad_norm": 15.171038627624512, "learning_rate": 3.469408224674022e-06, "loss": 0.452, "step": 19850 }, { "epoch": 1.7963531323343562, "grad_norm": 13.550349235534668, "learning_rate": 3.394182547642929e-06, "loss": 0.4901, "step": 19900 }, { "epoch": 1.8008665824155985, "grad_norm": 9.970057487487793, "learning_rate": 3.3189568706118356e-06, "loss": 0.4424, "step": 19950 }, { "epoch": 1.8053800324968405, "grad_norm": 3.251477003097534, "learning_rate": 3.2437311935807422e-06, "loss": 0.5214, "step": 20000 }, { "epoch": 1.8053800324968405, "eval_exact_match": 86.10217596972564, "eval_f1": 92.55287274359681, "eval_runtime": 145.2611, "eval_samples_per_second": 74.335, "eval_steps_per_second": 18.587, "step": 20000 }, { "epoch": 1.8098934825780826, "grad_norm": 3.656310558319092, "learning_rate": 3.1685055165496493e-06, "loss": 0.4794, "step": 20050 }, { "epoch": 1.8144069326593248, "grad_norm": 6.139503479003906, "learning_rate": 3.0932798395185556e-06, "loss": 0.4485, "step": 20100 }, { "epoch": 1.8189203827405669, "grad_norm": 6.566440582275391, "learning_rate": 3.0180541624874623e-06, "loss": 0.456, "step": 20150 }, { "epoch": 1.823433832821809, "grad_norm": 6.406381130218506, "learning_rate": 2.9428284854563694e-06, "loss": 0.4988, "step": 20200 }, { "epoch": 1.8279472829030512, "grad_norm": 4.750673770904541, "learning_rate": 2.8676028084252757e-06, "loss": 0.5102, "step": 20250 }, { "epoch": 1.8279472829030512, "eval_exact_match": 86.0643330179754, "eval_f1": 92.48790625031062, "eval_runtime": 145.6532, "eval_samples_per_second": 74.135, "eval_steps_per_second": 18.537, "step": 20250 }, { "epoch": 1.8324607329842932, "grad_norm": 2.9004476070404053, "learning_rate": 2.7923771313941828e-06, "loss": 0.4172, "step": 20300 }, { "epoch": 1.8369741830655353, "grad_norm": 5.453982353210449, "learning_rate": 2.7171514543630894e-06, "loss": 0.4328, "step": 20350 }, { "epoch": 1.8414876331467775, "grad_norm": 6.562243461608887, "learning_rate": 2.6419257773319957e-06, "loss": 0.5946, "step": 20400 }, { "epoch": 1.8460010832280194, "grad_norm": 2.186967134475708, "learning_rate": 2.566700100300903e-06, "loss": 0.5174, "step": 20450 }, { "epoch": 1.8505145333092616, "grad_norm": 18.740962982177734, "learning_rate": 2.4914744232698095e-06, "loss": 0.5614, "step": 20500 }, { "epoch": 1.8505145333092616, "eval_exact_match": 86.24408703878902, "eval_f1": 92.6215979315234, "eval_runtime": 145.5466, "eval_samples_per_second": 74.189, "eval_steps_per_second": 18.551, "step": 20500 }, { "epoch": 1.8550279833905037, "grad_norm": 8.845901489257812, "learning_rate": 2.416248746238716e-06, "loss": 0.5174, "step": 20550 }, { "epoch": 1.8595414334717457, "grad_norm": 2.0935425758361816, "learning_rate": 2.341023069207623e-06, "loss": 0.4191, "step": 20600 }, { "epoch": 1.864054883552988, "grad_norm": 5.225878715515137, "learning_rate": 2.2657973921765295e-06, "loss": 0.5168, "step": 20650 }, { "epoch": 1.86856833363423, "grad_norm": 3.618779182434082, "learning_rate": 2.1905717151454362e-06, "loss": 0.5116, "step": 20700 }, { "epoch": 1.873081783715472, "grad_norm": 14.580885887145996, "learning_rate": 2.115346038114343e-06, "loss": 0.5247, "step": 20750 }, { "epoch": 1.873081783715472, "eval_exact_match": 86.39545884578997, "eval_f1": 92.68082620123108, "eval_runtime": 144.0826, "eval_samples_per_second": 74.943, "eval_steps_per_second": 18.739, "step": 20750 }, { "epoch": 1.8775952337967143, "grad_norm": 6.323169708251953, "learning_rate": 2.04012036108325e-06, "loss": 0.5007, "step": 20800 }, { "epoch": 1.8821086838779562, "grad_norm": 7.055742263793945, "learning_rate": 1.9648946840521567e-06, "loss": 0.5387, "step": 20850 }, { "epoch": 1.8866221339591984, "grad_norm": 6.097321033477783, "learning_rate": 1.8896690070210632e-06, "loss": 0.4795, "step": 20900 }, { "epoch": 1.8911355840404405, "grad_norm": 20.577049255371094, "learning_rate": 1.81444332998997e-06, "loss": 0.5636, "step": 20950 }, { "epoch": 1.8956490341216825, "grad_norm": 11.891510009765625, "learning_rate": 1.7392176529588768e-06, "loss": 0.5768, "step": 21000 }, { "epoch": 1.8956490341216825, "eval_exact_match": 86.28192999053927, "eval_f1": 92.59481897569101, "eval_runtime": 143.5237, "eval_samples_per_second": 75.235, "eval_steps_per_second": 18.812, "step": 21000 }, { "epoch": 1.9001624842029248, "grad_norm": 28.808475494384766, "learning_rate": 1.6639919759277832e-06, "loss": 0.5267, "step": 21050 }, { "epoch": 1.9046759342841668, "grad_norm": 11.045042991638184, "learning_rate": 1.5887662988966901e-06, "loss": 0.5508, "step": 21100 }, { "epoch": 1.9091893843654089, "grad_norm": 4.862325191497803, "learning_rate": 1.5135406218655968e-06, "loss": 0.4697, "step": 21150 }, { "epoch": 1.9137028344466511, "grad_norm": 13.387544631958008, "learning_rate": 1.4383149448345037e-06, "loss": 0.4416, "step": 21200 }, { "epoch": 1.9182162845278932, "grad_norm": 5.2306342124938965, "learning_rate": 1.3630892678034104e-06, "loss": 0.4124, "step": 21250 }, { "epoch": 1.9182162845278932, "eval_exact_match": 86.26300851466415, "eval_f1": 92.60255176769716, "eval_runtime": 143.5894, "eval_samples_per_second": 75.201, "eval_steps_per_second": 18.804, "step": 21250 }, { "epoch": 1.9227297346091352, "grad_norm": 29.35004234313965, "learning_rate": 1.2878635907723169e-06, "loss": 0.5344, "step": 21300 }, { "epoch": 1.9272431846903775, "grad_norm": 8.634255409240723, "learning_rate": 1.2126379137412237e-06, "loss": 0.4815, "step": 21350 }, { "epoch": 1.9317566347716193, "grad_norm": 8.262895584106445, "learning_rate": 1.1374122367101304e-06, "loss": 0.4939, "step": 21400 }, { "epoch": 1.9362700848528616, "grad_norm": 12.539655685424805, "learning_rate": 1.0621865596790371e-06, "loss": 0.4957, "step": 21450 }, { "epoch": 1.9407835349341036, "grad_norm": 9.728516578674316, "learning_rate": 9.86960882647944e-07, "loss": 0.4587, "step": 21500 }, { "epoch": 1.9407835349341036, "eval_exact_match": 86.14001892147587, "eval_f1": 92.57583651293868, "eval_runtime": 143.5952, "eval_samples_per_second": 75.197, "eval_steps_per_second": 18.803, "step": 21500 }, { "epoch": 1.9452969850153456, "grad_norm": 9.588170051574707, "learning_rate": 9.117352056168506e-07, "loss": 0.5395, "step": 21550 }, { "epoch": 1.949810435096588, "grad_norm": 14.394529342651367, "learning_rate": 8.365095285857573e-07, "loss": 0.5462, "step": 21600 }, { "epoch": 1.95432388517783, "grad_norm": 17.767173767089844, "learning_rate": 7.612838515546641e-07, "loss": 0.4525, "step": 21650 }, { "epoch": 1.958837335259072, "grad_norm": 15.962186813354492, "learning_rate": 6.860581745235707e-07, "loss": 0.6349, "step": 21700 }, { "epoch": 1.9633507853403143, "grad_norm": 3.5042107105255127, "learning_rate": 6.108324974924774e-07, "loss": 0.4903, "step": 21750 }, { "epoch": 1.9633507853403143, "eval_exact_match": 86.30085146641439, "eval_f1": 92.62412689139829, "eval_runtime": 143.4548, "eval_samples_per_second": 75.271, "eval_steps_per_second": 18.821, "step": 21750 }, { "epoch": 1.967864235421556, "grad_norm": 3.967465877532959, "learning_rate": 5.356068204613842e-07, "loss": 0.4477, "step": 21800 }, { "epoch": 1.9723776855027984, "grad_norm": 13.418035507202148, "learning_rate": 4.603811434302909e-07, "loss": 0.4418, "step": 21850 }, { "epoch": 1.9768911355840404, "grad_norm": 8.183111190795898, "learning_rate": 3.851554663991976e-07, "loss": 0.5417, "step": 21900 }, { "epoch": 1.9814045856652824, "grad_norm": 5.646338939666748, "learning_rate": 3.099297893681043e-07, "loss": 0.5065, "step": 21950 }, { "epoch": 1.9859180357465247, "grad_norm": 14.187732696533203, "learning_rate": 2.3470411233701103e-07, "loss": 0.4325, "step": 22000 }, { "epoch": 1.9859180357465247, "eval_exact_match": 86.35761589403974, "eval_f1": 92.66264597808306, "eval_runtime": 143.145, "eval_samples_per_second": 75.434, "eval_steps_per_second": 18.862, "step": 22000 }, { "epoch": 1.9904314858277667, "grad_norm": 24.069639205932617, "learning_rate": 1.5947843530591774e-07, "loss": 0.4079, "step": 22050 }, { "epoch": 1.9949449359090088, "grad_norm": 9.552345275878906, "learning_rate": 8.425275827482447e-08, "loss": 0.4418, "step": 22100 }, { "epoch": 1.999458385990251, "grad_norm": 11.15715503692627, "learning_rate": 9.027081243731194e-09, "loss": 0.519, "step": 22150 }, { "epoch": 2.0, "step": 22156, "total_flos": 4.529540706059981e+16, "train_loss": 0.8456309766066937, "train_runtime": 23960.3876, "train_samples_per_second": 7.397, "train_steps_per_second": 0.925 } ], "logging_steps": 50, "max_steps": 22156, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.529540706059981e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }