ModernBERT_SquadV1_4k / trainer_state.json
sraj's picture
Upload 12 files
a874f72 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 250,
"global_step": 22156,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004513450081242101,
"grad_norm": 7.729167461395264,
"learning_rate": 6.768953068592058e-07,
"loss": 5.9501,
"step": 50
},
{
"epoch": 0.009026900162484202,
"grad_norm": 13.082584381103516,
"learning_rate": 1.3537906137184116e-06,
"loss": 5.9447,
"step": 100
},
{
"epoch": 0.013540350243726304,
"grad_norm": 18.259796142578125,
"learning_rate": 2.0306859205776177e-06,
"loss": 5.9165,
"step": 150
},
{
"epoch": 0.018053800324968405,
"grad_norm": 22.386247634887695,
"learning_rate": 2.7075812274368233e-06,
"loss": 5.8394,
"step": 200
},
{
"epoch": 0.022567250406210507,
"grad_norm": 34.2584228515625,
"learning_rate": 3.384476534296029e-06,
"loss": 5.6527,
"step": 250
},
{
"epoch": 0.022567250406210507,
"eval_exact_match": 2.0056764427625353,
"eval_f1": 8.992036775611602,
"eval_runtime": 152.2365,
"eval_samples_per_second": 70.929,
"eval_steps_per_second": 17.736,
"step": 250
},
{
"epoch": 0.02708070048745261,
"grad_norm": 35.096343994140625,
"learning_rate": 4.061371841155235e-06,
"loss": 5.4208,
"step": 300
},
{
"epoch": 0.03159415056869471,
"grad_norm": 32.929325103759766,
"learning_rate": 4.73826714801444e-06,
"loss": 5.1947,
"step": 350
},
{
"epoch": 0.03610760064993681,
"grad_norm": 24.735565185546875,
"learning_rate": 5.4151624548736465e-06,
"loss": 4.9405,
"step": 400
},
{
"epoch": 0.040621050731178915,
"grad_norm": 22.857040405273438,
"learning_rate": 6.092057761732852e-06,
"loss": 4.7322,
"step": 450
},
{
"epoch": 0.04513450081242101,
"grad_norm": 24.466981887817383,
"learning_rate": 6.768953068592058e-06,
"loss": 4.4225,
"step": 500
},
{
"epoch": 0.04513450081242101,
"eval_exact_match": 5.771050141911069,
"eval_f1": 13.440407627304385,
"eval_runtime": 143.7394,
"eval_samples_per_second": 75.122,
"eval_steps_per_second": 18.784,
"step": 500
},
{
"epoch": 0.04964795089366312,
"grad_norm": 21.847347259521484,
"learning_rate": 7.445848375451264e-06,
"loss": 4.077,
"step": 550
},
{
"epoch": 0.05416140097490522,
"grad_norm": 27.222633361816406,
"learning_rate": 8.12274368231047e-06,
"loss": 3.5837,
"step": 600
},
{
"epoch": 0.058674851056147316,
"grad_norm": 29.45089340209961,
"learning_rate": 8.799638989169675e-06,
"loss": 3.0981,
"step": 650
},
{
"epoch": 0.06318830113738942,
"grad_norm": 29.15781593322754,
"learning_rate": 9.47653429602888e-06,
"loss": 2.823,
"step": 700
},
{
"epoch": 0.06770175121863152,
"grad_norm": 20.824087142944336,
"learning_rate": 1.0153429602888087e-05,
"loss": 2.4615,
"step": 750
},
{
"epoch": 0.06770175121863152,
"eval_exact_match": 52.82876064333018,
"eval_f1": 63.363202801168775,
"eval_runtime": 143.757,
"eval_samples_per_second": 75.113,
"eval_steps_per_second": 18.782,
"step": 750
},
{
"epoch": 0.07221520129987362,
"grad_norm": 33.184410095214844,
"learning_rate": 1.0830324909747293e-05,
"loss": 2.2565,
"step": 800
},
{
"epoch": 0.07672865138111573,
"grad_norm": 27.845844268798828,
"learning_rate": 1.15072202166065e-05,
"loss": 1.8158,
"step": 850
},
{
"epoch": 0.08124210146235783,
"grad_norm": 18.29555320739746,
"learning_rate": 1.2184115523465704e-05,
"loss": 1.7871,
"step": 900
},
{
"epoch": 0.08575555154359993,
"grad_norm": 23.45345687866211,
"learning_rate": 1.2861010830324909e-05,
"loss": 1.6184,
"step": 950
},
{
"epoch": 0.09026900162484203,
"grad_norm": 23.513124465942383,
"learning_rate": 1.3537906137184115e-05,
"loss": 1.577,
"step": 1000
},
{
"epoch": 0.09026900162484203,
"eval_exact_match": 65.42100283822138,
"eval_f1": 76.31429412241118,
"eval_runtime": 143.6001,
"eval_samples_per_second": 75.195,
"eval_steps_per_second": 18.802,
"step": 1000
},
{
"epoch": 0.09478245170608413,
"grad_norm": 13.316262245178223,
"learning_rate": 1.4214801444043322e-05,
"loss": 1.5352,
"step": 1050
},
{
"epoch": 0.09929590178732624,
"grad_norm": 34.4163932800293,
"learning_rate": 1.4891696750902528e-05,
"loss": 1.6355,
"step": 1100
},
{
"epoch": 0.10380935186856834,
"grad_norm": 33.572750091552734,
"learning_rate": 1.5568592057761735e-05,
"loss": 1.5064,
"step": 1150
},
{
"epoch": 0.10832280194981043,
"grad_norm": 13.494843482971191,
"learning_rate": 1.624548736462094e-05,
"loss": 1.4047,
"step": 1200
},
{
"epoch": 0.11283625203105253,
"grad_norm": 21.778579711914062,
"learning_rate": 1.6922382671480144e-05,
"loss": 1.2552,
"step": 1250
},
{
"epoch": 0.11283625203105253,
"eval_exact_match": 69.5364238410596,
"eval_f1": 80.48975233211664,
"eval_runtime": 143.6325,
"eval_samples_per_second": 75.178,
"eval_steps_per_second": 18.798,
"step": 1250
},
{
"epoch": 0.11734970211229463,
"grad_norm": 17.293298721313477,
"learning_rate": 1.759927797833935e-05,
"loss": 1.2779,
"step": 1300
},
{
"epoch": 0.12186315219353674,
"grad_norm": 14.029269218444824,
"learning_rate": 1.8276173285198557e-05,
"loss": 1.2871,
"step": 1350
},
{
"epoch": 0.12637660227477884,
"grad_norm": 12.971822738647461,
"learning_rate": 1.895306859205776e-05,
"loss": 1.1974,
"step": 1400
},
{
"epoch": 0.13089005235602094,
"grad_norm": 30.38484001159668,
"learning_rate": 1.9629963898916967e-05,
"loss": 1.355,
"step": 1450
},
{
"epoch": 0.13540350243726304,
"grad_norm": 29.467548370361328,
"learning_rate": 2.0306859205776173e-05,
"loss": 1.1713,
"step": 1500
},
{
"epoch": 0.13540350243726304,
"eval_exact_match": 72.57332071901608,
"eval_f1": 82.65844387552723,
"eval_runtime": 143.752,
"eval_samples_per_second": 75.115,
"eval_steps_per_second": 18.782,
"step": 1500
},
{
"epoch": 0.13991695251850514,
"grad_norm": 12.46554183959961,
"learning_rate": 2.098375451263538e-05,
"loss": 1.1677,
"step": 1550
},
{
"epoch": 0.14443040259974724,
"grad_norm": 25.593875885009766,
"learning_rate": 2.1660649819494586e-05,
"loss": 1.3478,
"step": 1600
},
{
"epoch": 0.14894385268098934,
"grad_norm": 16.136869430541992,
"learning_rate": 2.2337545126353793e-05,
"loss": 1.1676,
"step": 1650
},
{
"epoch": 0.15345730276223146,
"grad_norm": 16.83846664428711,
"learning_rate": 2.3014440433213e-05,
"loss": 1.1741,
"step": 1700
},
{
"epoch": 0.15797075284347356,
"grad_norm": 17.464096069335938,
"learning_rate": 2.3691335740072202e-05,
"loss": 1.2104,
"step": 1750
},
{
"epoch": 0.15797075284347356,
"eval_exact_match": 74.64522232734153,
"eval_f1": 84.72687223622708,
"eval_runtime": 143.5946,
"eval_samples_per_second": 75.198,
"eval_steps_per_second": 18.803,
"step": 1750
},
{
"epoch": 0.16248420292471566,
"grad_norm": 13.083732604980469,
"learning_rate": 2.436823104693141e-05,
"loss": 1.1279,
"step": 1800
},
{
"epoch": 0.16699765300595776,
"grad_norm": 15.166335105895996,
"learning_rate": 2.5045126353790615e-05,
"loss": 1.1395,
"step": 1850
},
{
"epoch": 0.17151110308719986,
"grad_norm": 8.829039573669434,
"learning_rate": 2.5722021660649818e-05,
"loss": 1.1374,
"step": 1900
},
{
"epoch": 0.17602455316844196,
"grad_norm": 28.089391708374023,
"learning_rate": 2.6398916967509024e-05,
"loss": 1.2106,
"step": 1950
},
{
"epoch": 0.18053800324968405,
"grad_norm": 13.704926490783691,
"learning_rate": 2.707581227436823e-05,
"loss": 1.2369,
"step": 2000
},
{
"epoch": 0.18053800324968405,
"eval_exact_match": 77.96594134342479,
"eval_f1": 86.52116394116426,
"eval_runtime": 143.6954,
"eval_samples_per_second": 75.145,
"eval_steps_per_second": 18.79,
"step": 2000
},
{
"epoch": 0.18505145333092615,
"grad_norm": 13.062108993530273,
"learning_rate": 2.7752707581227437e-05,
"loss": 1.0064,
"step": 2050
},
{
"epoch": 0.18956490341216825,
"grad_norm": 25.21763801574707,
"learning_rate": 2.8429602888086644e-05,
"loss": 1.1127,
"step": 2100
},
{
"epoch": 0.19407835349341035,
"grad_norm": 7.10919189453125,
"learning_rate": 2.910649819494585e-05,
"loss": 1.054,
"step": 2150
},
{
"epoch": 0.19859180357465248,
"grad_norm": 14.997174263000488,
"learning_rate": 2.9783393501805057e-05,
"loss": 1.1187,
"step": 2200
},
{
"epoch": 0.20310525365589457,
"grad_norm": 9.683287620544434,
"learning_rate": 2.9948846539618856e-05,
"loss": 0.9916,
"step": 2250
},
{
"epoch": 0.20310525365589457,
"eval_exact_match": 77.21854304635761,
"eval_f1": 85.59348158373206,
"eval_runtime": 143.7109,
"eval_samples_per_second": 75.137,
"eval_steps_per_second": 18.788,
"step": 2250
},
{
"epoch": 0.20761870373713667,
"grad_norm": 18.712541580200195,
"learning_rate": 2.987362086258776e-05,
"loss": 1.1756,
"step": 2300
},
{
"epoch": 0.21213215381837877,
"grad_norm": 8.502235412597656,
"learning_rate": 2.979839518555667e-05,
"loss": 1.0872,
"step": 2350
},
{
"epoch": 0.21664560389962087,
"grad_norm": 16.61508560180664,
"learning_rate": 2.9723169508525577e-05,
"loss": 1.1148,
"step": 2400
},
{
"epoch": 0.22115905398086297,
"grad_norm": 9.676267623901367,
"learning_rate": 2.9647943831494482e-05,
"loss": 1.0559,
"step": 2450
},
{
"epoch": 0.22567250406210507,
"grad_norm": 11.562779426574707,
"learning_rate": 2.957271815446339e-05,
"loss": 1.0628,
"step": 2500
},
{
"epoch": 0.22567250406210507,
"eval_exact_match": 79.3755912961211,
"eval_f1": 87.42406194378296,
"eval_runtime": 143.6631,
"eval_samples_per_second": 75.162,
"eval_steps_per_second": 18.794,
"step": 2500
},
{
"epoch": 0.23018595414334717,
"grad_norm": 20.24138832092285,
"learning_rate": 2.9497492477432297e-05,
"loss": 1.0549,
"step": 2550
},
{
"epoch": 0.23469940422458926,
"grad_norm": 24.723041534423828,
"learning_rate": 2.9422266800401203e-05,
"loss": 1.137,
"step": 2600
},
{
"epoch": 0.2392128543058314,
"grad_norm": 14.101241111755371,
"learning_rate": 2.9347041123370113e-05,
"loss": 1.0199,
"step": 2650
},
{
"epoch": 0.2437263043870735,
"grad_norm": 7.032845497131348,
"learning_rate": 2.927181544633902e-05,
"loss": 1.0601,
"step": 2700
},
{
"epoch": 0.2482397544683156,
"grad_norm": 13.543634414672852,
"learning_rate": 2.9196589769307924e-05,
"loss": 1.0534,
"step": 2750
},
{
"epoch": 0.2482397544683156,
"eval_exact_match": 79.57426679280984,
"eval_f1": 87.74281924363757,
"eval_runtime": 143.6905,
"eval_samples_per_second": 75.148,
"eval_steps_per_second": 18.79,
"step": 2750
},
{
"epoch": 0.2527532045495577,
"grad_norm": 7.890726566314697,
"learning_rate": 2.9121364092276833e-05,
"loss": 1.0196,
"step": 2800
},
{
"epoch": 0.2572666546307998,
"grad_norm": 12.943625450134277,
"learning_rate": 2.904613841524574e-05,
"loss": 1.0178,
"step": 2850
},
{
"epoch": 0.2617801047120419,
"grad_norm": 9.828871726989746,
"learning_rate": 2.897091273821464e-05,
"loss": 1.0482,
"step": 2900
},
{
"epoch": 0.266293554793284,
"grad_norm": 28.693660736083984,
"learning_rate": 2.889568706118355e-05,
"loss": 0.9897,
"step": 2950
},
{
"epoch": 0.2708070048745261,
"grad_norm": 10.408865928649902,
"learning_rate": 2.8820461384152457e-05,
"loss": 0.9407,
"step": 3000
},
{
"epoch": 0.2708070048745261,
"eval_exact_match": 80.37842951750237,
"eval_f1": 88.60122498039404,
"eval_runtime": 143.7217,
"eval_samples_per_second": 75.131,
"eval_steps_per_second": 18.786,
"step": 3000
},
{
"epoch": 0.2753204549557682,
"grad_norm": 11.135859489440918,
"learning_rate": 2.8745235707121363e-05,
"loss": 0.9192,
"step": 3050
},
{
"epoch": 0.2798339050370103,
"grad_norm": 10.159820556640625,
"learning_rate": 2.8670010030090272e-05,
"loss": 0.9232,
"step": 3100
},
{
"epoch": 0.2843473551182524,
"grad_norm": 6.99199914932251,
"learning_rate": 2.8594784353059178e-05,
"loss": 0.9188,
"step": 3150
},
{
"epoch": 0.2888608051994945,
"grad_norm": 11.692395210266113,
"learning_rate": 2.8519558676028083e-05,
"loss": 0.979,
"step": 3200
},
{
"epoch": 0.2933742552807366,
"grad_norm": 12.289103507995605,
"learning_rate": 2.8444332998996993e-05,
"loss": 0.8573,
"step": 3250
},
{
"epoch": 0.2933742552807366,
"eval_exact_match": 81.44749290444655,
"eval_f1": 88.89880962072144,
"eval_runtime": 143.7737,
"eval_samples_per_second": 75.104,
"eval_steps_per_second": 18.78,
"step": 3250
},
{
"epoch": 0.29788770536197867,
"grad_norm": 23.986677169799805,
"learning_rate": 2.83691073219659e-05,
"loss": 0.879,
"step": 3300
},
{
"epoch": 0.30240115544322077,
"grad_norm": 10.214922904968262,
"learning_rate": 2.8293881644934804e-05,
"loss": 0.9526,
"step": 3350
},
{
"epoch": 0.3069146055244629,
"grad_norm": 11.930830955505371,
"learning_rate": 2.8218655967903714e-05,
"loss": 1.0308,
"step": 3400
},
{
"epoch": 0.311428055605705,
"grad_norm": 7.23566198348999,
"learning_rate": 2.814343029087262e-05,
"loss": 0.9849,
"step": 3450
},
{
"epoch": 0.3159415056869471,
"grad_norm": 10.037336349487305,
"learning_rate": 2.8068204613841525e-05,
"loss": 0.8577,
"step": 3500
},
{
"epoch": 0.3159415056869471,
"eval_exact_match": 82.28949858088932,
"eval_f1": 89.36851469763961,
"eval_runtime": 143.8541,
"eval_samples_per_second": 75.062,
"eval_steps_per_second": 18.769,
"step": 3500
},
{
"epoch": 0.3204549557681892,
"grad_norm": 19.438182830810547,
"learning_rate": 2.799297893681043e-05,
"loss": 0.9586,
"step": 3550
},
{
"epoch": 0.3249684058494313,
"grad_norm": 11.259856224060059,
"learning_rate": 2.7917753259779337e-05,
"loss": 0.923,
"step": 3600
},
{
"epoch": 0.3294818559306734,
"grad_norm": 16.151851654052734,
"learning_rate": 2.7842527582748243e-05,
"loss": 0.9551,
"step": 3650
},
{
"epoch": 0.3339953060119155,
"grad_norm": 12.570643424987793,
"learning_rate": 2.7767301905717152e-05,
"loss": 0.9377,
"step": 3700
},
{
"epoch": 0.3385087560931576,
"grad_norm": 13.616052627563477,
"learning_rate": 2.7692076228686058e-05,
"loss": 0.7839,
"step": 3750
},
{
"epoch": 0.3385087560931576,
"eval_exact_match": 81.69347209082308,
"eval_f1": 89.48678563794635,
"eval_runtime": 143.7031,
"eval_samples_per_second": 75.141,
"eval_steps_per_second": 18.789,
"step": 3750
},
{
"epoch": 0.3430222061743997,
"grad_norm": 20.124753952026367,
"learning_rate": 2.7616850551654964e-05,
"loss": 0.9229,
"step": 3800
},
{
"epoch": 0.3475356562556418,
"grad_norm": 7.816183090209961,
"learning_rate": 2.7541624874623873e-05,
"loss": 1.0717,
"step": 3850
},
{
"epoch": 0.3520491063368839,
"grad_norm": 5.988482475280762,
"learning_rate": 2.746639919759278e-05,
"loss": 0.9611,
"step": 3900
},
{
"epoch": 0.356562556418126,
"grad_norm": 13.177979469299316,
"learning_rate": 2.7391173520561685e-05,
"loss": 0.9207,
"step": 3950
},
{
"epoch": 0.3610760064993681,
"grad_norm": 11.034092903137207,
"learning_rate": 2.7315947843530594e-05,
"loss": 0.9395,
"step": 4000
},
{
"epoch": 0.3610760064993681,
"eval_exact_match": 82.74361400189214,
"eval_f1": 90.06638594360132,
"eval_runtime": 143.7901,
"eval_samples_per_second": 75.096,
"eval_steps_per_second": 18.777,
"step": 4000
},
{
"epoch": 0.3655894565806102,
"grad_norm": 5.799317359924316,
"learning_rate": 2.72407221664995e-05,
"loss": 0.9385,
"step": 4050
},
{
"epoch": 0.3701029066618523,
"grad_norm": 13.385774612426758,
"learning_rate": 2.7165496489468405e-05,
"loss": 0.9356,
"step": 4100
},
{
"epoch": 0.3746163567430944,
"grad_norm": 22.11754608154297,
"learning_rate": 2.7090270812437315e-05,
"loss": 0.8532,
"step": 4150
},
{
"epoch": 0.3791298068243365,
"grad_norm": 4.648535251617432,
"learning_rate": 2.701504513540622e-05,
"loss": 1.0032,
"step": 4200
},
{
"epoch": 0.3836432569055786,
"grad_norm": 24.29154396057129,
"learning_rate": 2.6939819458375123e-05,
"loss": 0.8606,
"step": 4250
},
{
"epoch": 0.3836432569055786,
"eval_exact_match": 82.58278145695364,
"eval_f1": 89.65557078580815,
"eval_runtime": 143.6035,
"eval_samples_per_second": 75.193,
"eval_steps_per_second": 18.802,
"step": 4250
},
{
"epoch": 0.3881567069868207,
"grad_norm": 21.021242141723633,
"learning_rate": 2.6864593781344032e-05,
"loss": 0.9006,
"step": 4300
},
{
"epoch": 0.39267015706806285,
"grad_norm": 26.466794967651367,
"learning_rate": 2.6789368104312938e-05,
"loss": 0.9426,
"step": 4350
},
{
"epoch": 0.39718360714930495,
"grad_norm": 6.325038433074951,
"learning_rate": 2.6714142427281844e-05,
"loss": 0.8097,
"step": 4400
},
{
"epoch": 0.40169705723054705,
"grad_norm": 8.038667678833008,
"learning_rate": 2.6638916750250753e-05,
"loss": 0.9053,
"step": 4450
},
{
"epoch": 0.40621050731178915,
"grad_norm": 10.573040008544922,
"learning_rate": 2.656369107321966e-05,
"loss": 0.8459,
"step": 4500
},
{
"epoch": 0.40621050731178915,
"eval_exact_match": 82.60170293282876,
"eval_f1": 89.80136126079411,
"eval_runtime": 143.5994,
"eval_samples_per_second": 75.195,
"eval_steps_per_second": 18.802,
"step": 4500
},
{
"epoch": 0.41072395739303125,
"grad_norm": 7.336009979248047,
"learning_rate": 2.6488465396188565e-05,
"loss": 0.8691,
"step": 4550
},
{
"epoch": 0.41523740747427335,
"grad_norm": 13.7125825881958,
"learning_rate": 2.6413239719157474e-05,
"loss": 0.8486,
"step": 4600
},
{
"epoch": 0.41975085755551544,
"grad_norm": 12.19320011138916,
"learning_rate": 2.633801404212638e-05,
"loss": 0.9189,
"step": 4650
},
{
"epoch": 0.42426430763675754,
"grad_norm": 11.599879264831543,
"learning_rate": 2.6262788365095286e-05,
"loss": 0.942,
"step": 4700
},
{
"epoch": 0.42877775771799964,
"grad_norm": 9.139724731445312,
"learning_rate": 2.6187562688064195e-05,
"loss": 0.9565,
"step": 4750
},
{
"epoch": 0.42877775771799964,
"eval_exact_match": 83.66130558183538,
"eval_f1": 90.80495165338898,
"eval_runtime": 143.5723,
"eval_samples_per_second": 75.209,
"eval_steps_per_second": 18.806,
"step": 4750
},
{
"epoch": 0.43329120779924174,
"grad_norm": 10.977174758911133,
"learning_rate": 2.61123370110331e-05,
"loss": 0.8751,
"step": 4800
},
{
"epoch": 0.43780465788048384,
"grad_norm": 13.7095947265625,
"learning_rate": 2.6037111334002007e-05,
"loss": 0.7547,
"step": 4850
},
{
"epoch": 0.44231810796172594,
"grad_norm": 26.184358596801758,
"learning_rate": 2.5961885656970912e-05,
"loss": 0.9376,
"step": 4900
},
{
"epoch": 0.44683155804296804,
"grad_norm": 21.620555877685547,
"learning_rate": 2.5886659979939818e-05,
"loss": 0.8013,
"step": 4950
},
{
"epoch": 0.45134500812421013,
"grad_norm": 12.163994789123535,
"learning_rate": 2.5811434302908724e-05,
"loss": 0.8731,
"step": 5000
},
{
"epoch": 0.45134500812421013,
"eval_exact_match": 82.37464522232735,
"eval_f1": 89.97068346136126,
"eval_runtime": 143.7072,
"eval_samples_per_second": 75.139,
"eval_steps_per_second": 18.788,
"step": 5000
},
{
"epoch": 0.45585845820545223,
"grad_norm": 5.3110175132751465,
"learning_rate": 2.5736208625877633e-05,
"loss": 0.9062,
"step": 5050
},
{
"epoch": 0.46037190828669433,
"grad_norm": 17.91867446899414,
"learning_rate": 2.566098294884654e-05,
"loss": 0.8749,
"step": 5100
},
{
"epoch": 0.46488535836793643,
"grad_norm": 34.21914291381836,
"learning_rate": 2.5585757271815445e-05,
"loss": 0.8915,
"step": 5150
},
{
"epoch": 0.46939880844917853,
"grad_norm": 8.76441478729248,
"learning_rate": 2.5510531594784354e-05,
"loss": 0.8023,
"step": 5200
},
{
"epoch": 0.4739122585304206,
"grad_norm": 20.71419334411621,
"learning_rate": 2.543530591775326e-05,
"loss": 0.8114,
"step": 5250
},
{
"epoch": 0.4739122585304206,
"eval_exact_match": 82.36518448438979,
"eval_f1": 90.09893335252144,
"eval_runtime": 143.9381,
"eval_samples_per_second": 75.018,
"eval_steps_per_second": 18.758,
"step": 5250
},
{
"epoch": 0.4784257086116628,
"grad_norm": 14.544415473937988,
"learning_rate": 2.536008024072217e-05,
"loss": 0.8045,
"step": 5300
},
{
"epoch": 0.4829391586929049,
"grad_norm": 40.066375732421875,
"learning_rate": 2.5284854563691075e-05,
"loss": 0.8651,
"step": 5350
},
{
"epoch": 0.487452608774147,
"grad_norm": 16.154937744140625,
"learning_rate": 2.520962888665998e-05,
"loss": 0.9995,
"step": 5400
},
{
"epoch": 0.4919660588553891,
"grad_norm": 6.674190044403076,
"learning_rate": 2.513440320962889e-05,
"loss": 0.8231,
"step": 5450
},
{
"epoch": 0.4964795089366312,
"grad_norm": 6.037493705749512,
"learning_rate": 2.5059177532597796e-05,
"loss": 0.8369,
"step": 5500
},
{
"epoch": 0.4964795089366312,
"eval_exact_match": 83.20719016083254,
"eval_f1": 90.18831406264282,
"eval_runtime": 143.9261,
"eval_samples_per_second": 75.025,
"eval_steps_per_second": 18.76,
"step": 5500
},
{
"epoch": 0.5009929590178732,
"grad_norm": 8.621197700500488,
"learning_rate": 2.4983951855566702e-05,
"loss": 0.8002,
"step": 5550
},
{
"epoch": 0.5055064090991154,
"grad_norm": 19.25075340270996,
"learning_rate": 2.4908726178535608e-05,
"loss": 0.8039,
"step": 5600
},
{
"epoch": 0.5100198591803574,
"grad_norm": 14.201600074768066,
"learning_rate": 2.4833500501504514e-05,
"loss": 0.7525,
"step": 5650
},
{
"epoch": 0.5145333092615996,
"grad_norm": 30.636154174804688,
"learning_rate": 2.475827482447342e-05,
"loss": 0.7711,
"step": 5700
},
{
"epoch": 0.5190467593428417,
"grad_norm": 8.79736042022705,
"learning_rate": 2.468304914744233e-05,
"loss": 0.8997,
"step": 5750
},
{
"epoch": 0.5190467593428417,
"eval_exact_match": 83.66130558183538,
"eval_f1": 90.84316221305555,
"eval_runtime": 143.8756,
"eval_samples_per_second": 75.051,
"eval_steps_per_second": 18.766,
"step": 5750
},
{
"epoch": 0.5235602094240838,
"grad_norm": 14.871445655822754,
"learning_rate": 2.4607823470411234e-05,
"loss": 0.7639,
"step": 5800
},
{
"epoch": 0.5280736595053259,
"grad_norm": 6.112968444824219,
"learning_rate": 2.453259779338014e-05,
"loss": 0.8643,
"step": 5850
},
{
"epoch": 0.532587109586568,
"grad_norm": 6.213535785675049,
"learning_rate": 2.445737211634905e-05,
"loss": 1.0582,
"step": 5900
},
{
"epoch": 0.5371005596678101,
"grad_norm": 4.76146936416626,
"learning_rate": 2.4382146439317955e-05,
"loss": 0.818,
"step": 5950
},
{
"epoch": 0.5416140097490522,
"grad_norm": 8.690106391906738,
"learning_rate": 2.430692076228686e-05,
"loss": 0.8871,
"step": 6000
},
{
"epoch": 0.5416140097490522,
"eval_exact_match": 84.03027436140019,
"eval_f1": 90.88429950527104,
"eval_runtime": 143.8461,
"eval_samples_per_second": 75.066,
"eval_steps_per_second": 18.77,
"step": 6000
},
{
"epoch": 0.5461274598302943,
"grad_norm": 18.575305938720703,
"learning_rate": 2.423169508525577e-05,
"loss": 0.9143,
"step": 6050
},
{
"epoch": 0.5506409099115364,
"grad_norm": 2.229325294494629,
"learning_rate": 2.4156469408224676e-05,
"loss": 0.9387,
"step": 6100
},
{
"epoch": 0.5551543599927785,
"grad_norm": 9.413180351257324,
"learning_rate": 2.4081243731193582e-05,
"loss": 0.8657,
"step": 6150
},
{
"epoch": 0.5596678100740206,
"grad_norm": 5.644939422607422,
"learning_rate": 2.400601805416249e-05,
"loss": 0.8445,
"step": 6200
},
{
"epoch": 0.5641812601552627,
"grad_norm": 23.247257232666016,
"learning_rate": 2.3930792377131394e-05,
"loss": 0.8342,
"step": 6250
},
{
"epoch": 0.5641812601552627,
"eval_exact_match": 84.12488174077578,
"eval_f1": 91.10403462345704,
"eval_runtime": 144.0358,
"eval_samples_per_second": 74.967,
"eval_steps_per_second": 18.745,
"step": 6250
},
{
"epoch": 0.5686947102365048,
"grad_norm": 19.933300018310547,
"learning_rate": 2.38555667001003e-05,
"loss": 0.8348,
"step": 6300
},
{
"epoch": 0.5732081603177469,
"grad_norm": 7.6812872886657715,
"learning_rate": 2.378034102306921e-05,
"loss": 0.7582,
"step": 6350
},
{
"epoch": 0.577721610398989,
"grad_norm": 13.01408863067627,
"learning_rate": 2.3705115346038115e-05,
"loss": 0.7245,
"step": 6400
},
{
"epoch": 0.5822350604802311,
"grad_norm": 23.909793853759766,
"learning_rate": 2.362988966900702e-05,
"loss": 0.7503,
"step": 6450
},
{
"epoch": 0.5867485105614731,
"grad_norm": 5.6074323654174805,
"learning_rate": 2.355466399197593e-05,
"loss": 0.8251,
"step": 6500
},
{
"epoch": 0.5867485105614731,
"eval_exact_match": 84.31409649952697,
"eval_f1": 91.10669916586389,
"eval_runtime": 143.8718,
"eval_samples_per_second": 75.053,
"eval_steps_per_second": 18.767,
"step": 6500
},
{
"epoch": 0.5912619606427153,
"grad_norm": 13.117137908935547,
"learning_rate": 2.3479438314944836e-05,
"loss": 0.7903,
"step": 6550
},
{
"epoch": 0.5957754107239573,
"grad_norm": 4.99781608581543,
"learning_rate": 2.340421263791374e-05,
"loss": 0.8538,
"step": 6600
},
{
"epoch": 0.6002888608051995,
"grad_norm": 7.639380931854248,
"learning_rate": 2.332898696088265e-05,
"loss": 0.8154,
"step": 6650
},
{
"epoch": 0.6048023108864415,
"grad_norm": 30.98665428161621,
"learning_rate": 2.3253761283851556e-05,
"loss": 0.723,
"step": 6700
},
{
"epoch": 0.6093157609676837,
"grad_norm": 30.613746643066406,
"learning_rate": 2.3178535606820462e-05,
"loss": 0.8682,
"step": 6750
},
{
"epoch": 0.6093157609676837,
"eval_exact_match": 83.72753074739829,
"eval_f1": 90.83640909549077,
"eval_runtime": 143.634,
"eval_samples_per_second": 75.177,
"eval_steps_per_second": 18.798,
"step": 6750
},
{
"epoch": 0.6138292110489258,
"grad_norm": 8.440532684326172,
"learning_rate": 2.310330992978937e-05,
"loss": 0.8729,
"step": 6800
},
{
"epoch": 0.6183426611301679,
"grad_norm": 5.947940826416016,
"learning_rate": 2.3028084252758277e-05,
"loss": 0.747,
"step": 6850
},
{
"epoch": 0.62285611121141,
"grad_norm": 16.59714698791504,
"learning_rate": 2.295285857572718e-05,
"loss": 0.8015,
"step": 6900
},
{
"epoch": 0.6273695612926521,
"grad_norm": 5.211153507232666,
"learning_rate": 2.287763289869609e-05,
"loss": 0.8957,
"step": 6950
},
{
"epoch": 0.6318830113738942,
"grad_norm": 4.547276496887207,
"learning_rate": 2.2802407221664995e-05,
"loss": 0.9019,
"step": 7000
},
{
"epoch": 0.6318830113738942,
"eval_exact_match": 84.82497634815516,
"eval_f1": 91.52669904904272,
"eval_runtime": 145.3332,
"eval_samples_per_second": 74.298,
"eval_steps_per_second": 18.578,
"step": 7000
},
{
"epoch": 0.6363964614551363,
"grad_norm": 11.642155647277832,
"learning_rate": 2.27271815446339e-05,
"loss": 0.7963,
"step": 7050
},
{
"epoch": 0.6409099115363784,
"grad_norm": 7.39171028137207,
"learning_rate": 2.265195586760281e-05,
"loss": 0.7953,
"step": 7100
},
{
"epoch": 0.6454233616176205,
"grad_norm": 13.905296325683594,
"learning_rate": 2.2576730190571716e-05,
"loss": 0.7865,
"step": 7150
},
{
"epoch": 0.6499368116988626,
"grad_norm": 5.167139530181885,
"learning_rate": 2.250150451354062e-05,
"loss": 0.8087,
"step": 7200
},
{
"epoch": 0.6544502617801047,
"grad_norm": 27.534217834472656,
"learning_rate": 2.242627883650953e-05,
"loss": 0.9436,
"step": 7250
},
{
"epoch": 0.6544502617801047,
"eval_exact_match": 84.06811731315042,
"eval_f1": 91.07524033930977,
"eval_runtime": 143.7651,
"eval_samples_per_second": 75.109,
"eval_steps_per_second": 18.781,
"step": 7250
},
{
"epoch": 0.6589637118613468,
"grad_norm": 15.742715835571289,
"learning_rate": 2.2351053159478437e-05,
"loss": 0.8499,
"step": 7300
},
{
"epoch": 0.6634771619425889,
"grad_norm": 16.15327262878418,
"learning_rate": 2.2275827482447342e-05,
"loss": 0.846,
"step": 7350
},
{
"epoch": 0.667990612023831,
"grad_norm": 17.383888244628906,
"learning_rate": 2.220060180541625e-05,
"loss": 0.7903,
"step": 7400
},
{
"epoch": 0.6725040621050731,
"grad_norm": 7.484638214111328,
"learning_rate": 2.2125376128385157e-05,
"loss": 0.7664,
"step": 7450
},
{
"epoch": 0.6770175121863152,
"grad_norm": 10.082265853881836,
"learning_rate": 2.2050150451354063e-05,
"loss": 0.9177,
"step": 7500
},
{
"epoch": 0.6770175121863152,
"eval_exact_match": 84.49385052034059,
"eval_f1": 91.10452090726004,
"eval_runtime": 143.5023,
"eval_samples_per_second": 75.246,
"eval_steps_per_second": 18.815,
"step": 7500
},
{
"epoch": 0.6815309622675573,
"grad_norm": 10.778836250305176,
"learning_rate": 2.1974924774322973e-05,
"loss": 0.7529,
"step": 7550
},
{
"epoch": 0.6860444123487994,
"grad_norm": 12.894726753234863,
"learning_rate": 2.1899699097291875e-05,
"loss": 0.8783,
"step": 7600
},
{
"epoch": 0.6905578624300415,
"grad_norm": 7.819123268127441,
"learning_rate": 2.182447342026078e-05,
"loss": 0.9095,
"step": 7650
},
{
"epoch": 0.6950713125112836,
"grad_norm": 24.68296241760254,
"learning_rate": 2.174924774322969e-05,
"loss": 0.8804,
"step": 7700
},
{
"epoch": 0.6995847625925258,
"grad_norm": 9.52649974822998,
"learning_rate": 2.1674022066198596e-05,
"loss": 0.8028,
"step": 7750
},
{
"epoch": 0.6995847625925258,
"eval_exact_match": 85.59129612109744,
"eval_f1": 91.93623152881347,
"eval_runtime": 143.4073,
"eval_samples_per_second": 75.296,
"eval_steps_per_second": 18.827,
"step": 7750
},
{
"epoch": 0.7040982126737678,
"grad_norm": 10.898487091064453,
"learning_rate": 2.1598796389167502e-05,
"loss": 0.8282,
"step": 7800
},
{
"epoch": 0.70861166275501,
"grad_norm": 6.693902969360352,
"learning_rate": 2.152357071213641e-05,
"loss": 0.775,
"step": 7850
},
{
"epoch": 0.713125112836252,
"grad_norm": 10.00558090209961,
"learning_rate": 2.1448345035105317e-05,
"loss": 0.6894,
"step": 7900
},
{
"epoch": 0.7176385629174942,
"grad_norm": 2.776298761367798,
"learning_rate": 2.1373119358074223e-05,
"loss": 0.8409,
"step": 7950
},
{
"epoch": 0.7221520129987362,
"grad_norm": 13.581101417541504,
"learning_rate": 2.1297893681043132e-05,
"loss": 0.8222,
"step": 8000
},
{
"epoch": 0.7221520129987362,
"eval_exact_match": 84.57899716177862,
"eval_f1": 91.38107826122027,
"eval_runtime": 143.4879,
"eval_samples_per_second": 75.254,
"eval_steps_per_second": 18.817,
"step": 8000
},
{
"epoch": 0.7266654630799784,
"grad_norm": 6.5702223777771,
"learning_rate": 2.1222668004012038e-05,
"loss": 0.6735,
"step": 8050
},
{
"epoch": 0.7311789131612204,
"grad_norm": 18.275623321533203,
"learning_rate": 2.1147442326980944e-05,
"loss": 0.8389,
"step": 8100
},
{
"epoch": 0.7356923632424626,
"grad_norm": 15.205418586730957,
"learning_rate": 2.1072216649949853e-05,
"loss": 0.7803,
"step": 8150
},
{
"epoch": 0.7402058133237046,
"grad_norm": 8.31666088104248,
"learning_rate": 2.099699097291876e-05,
"loss": 0.7081,
"step": 8200
},
{
"epoch": 0.7447192634049468,
"grad_norm": 9.174483299255371,
"learning_rate": 2.092176529588766e-05,
"loss": 0.826,
"step": 8250
},
{
"epoch": 0.7447192634049468,
"eval_exact_match": 84.76821192052981,
"eval_f1": 91.56620229706857,
"eval_runtime": 143.4859,
"eval_samples_per_second": 75.255,
"eval_steps_per_second": 18.817,
"step": 8250
},
{
"epoch": 0.7492327134861888,
"grad_norm": 5.849365234375,
"learning_rate": 2.084653961885657e-05,
"loss": 0.7826,
"step": 8300
},
{
"epoch": 0.753746163567431,
"grad_norm": 8.80666446685791,
"learning_rate": 2.0771313941825476e-05,
"loss": 0.8931,
"step": 8350
},
{
"epoch": 0.758259613648673,
"grad_norm": 7.301697731018066,
"learning_rate": 2.0696088264794382e-05,
"loss": 0.6788,
"step": 8400
},
{
"epoch": 0.7627730637299152,
"grad_norm": 9.519810676574707,
"learning_rate": 2.062086258776329e-05,
"loss": 0.7928,
"step": 8450
},
{
"epoch": 0.7672865138111572,
"grad_norm": 8.138936996459961,
"learning_rate": 2.0545636910732197e-05,
"loss": 0.8625,
"step": 8500
},
{
"epoch": 0.7672865138111572,
"eval_exact_match": 85.19394512771996,
"eval_f1": 91.95119129750337,
"eval_runtime": 143.5272,
"eval_samples_per_second": 75.233,
"eval_steps_per_second": 18.812,
"step": 8500
},
{
"epoch": 0.7717999638923994,
"grad_norm": 3.691103935241699,
"learning_rate": 2.0470411233701103e-05,
"loss": 0.7947,
"step": 8550
},
{
"epoch": 0.7763134139736414,
"grad_norm": 14.496338844299316,
"learning_rate": 2.0395185556670012e-05,
"loss": 0.8135,
"step": 8600
},
{
"epoch": 0.7808268640548836,
"grad_norm": 6.248403072357178,
"learning_rate": 2.0319959879638918e-05,
"loss": 0.8594,
"step": 8650
},
{
"epoch": 0.7853403141361257,
"grad_norm": 16.819801330566406,
"learning_rate": 2.0244734202607824e-05,
"loss": 0.8171,
"step": 8700
},
{
"epoch": 0.7898537642173677,
"grad_norm": 7.9061079025268555,
"learning_rate": 2.0169508525576733e-05,
"loss": 0.6517,
"step": 8750
},
{
"epoch": 0.7898537642173677,
"eval_exact_match": 85.6480605487228,
"eval_f1": 91.95890910573651,
"eval_runtime": 143.4989,
"eval_samples_per_second": 75.248,
"eval_steps_per_second": 18.815,
"step": 8750
},
{
"epoch": 0.7943672142986099,
"grad_norm": 12.129390716552734,
"learning_rate": 2.009428284854564e-05,
"loss": 0.7358,
"step": 8800
},
{
"epoch": 0.798880664379852,
"grad_norm": 7.113585472106934,
"learning_rate": 2.0019057171514545e-05,
"loss": 0.8517,
"step": 8850
},
{
"epoch": 0.8033941144610941,
"grad_norm": 10.407898902893066,
"learning_rate": 1.994383149448345e-05,
"loss": 0.733,
"step": 8900
},
{
"epoch": 0.8079075645423361,
"grad_norm": 15.745281219482422,
"learning_rate": 1.9868605817452356e-05,
"loss": 0.693,
"step": 8950
},
{
"epoch": 0.8124210146235783,
"grad_norm": 6.876597881317139,
"learning_rate": 1.9793380140421262e-05,
"loss": 0.817,
"step": 9000
},
{
"epoch": 0.8124210146235783,
"eval_exact_match": 84.87228003784296,
"eval_f1": 91.5110749584356,
"eval_runtime": 143.5544,
"eval_samples_per_second": 75.219,
"eval_steps_per_second": 18.808,
"step": 9000
},
{
"epoch": 0.8169344647048203,
"grad_norm": 7.4037065505981445,
"learning_rate": 1.971815446339017e-05,
"loss": 0.8677,
"step": 9050
},
{
"epoch": 0.8214479147860625,
"grad_norm": 4.559969902038574,
"learning_rate": 1.9642928786359077e-05,
"loss": 0.7798,
"step": 9100
},
{
"epoch": 0.8259613648673045,
"grad_norm": 7.184974670410156,
"learning_rate": 1.9567703109327983e-05,
"loss": 0.7705,
"step": 9150
},
{
"epoch": 0.8304748149485467,
"grad_norm": 8.206283569335938,
"learning_rate": 1.9492477432296892e-05,
"loss": 0.8398,
"step": 9200
},
{
"epoch": 0.8349882650297887,
"grad_norm": 7.29602575302124,
"learning_rate": 1.9417251755265798e-05,
"loss": 0.7574,
"step": 9250
},
{
"epoch": 0.8349882650297887,
"eval_exact_match": 85.06149479659413,
"eval_f1": 91.53106503540634,
"eval_runtime": 143.5779,
"eval_samples_per_second": 75.207,
"eval_steps_per_second": 18.805,
"step": 9250
},
{
"epoch": 0.8395017151110309,
"grad_norm": 10.167183876037598,
"learning_rate": 1.9342026078234704e-05,
"loss": 0.7103,
"step": 9300
},
{
"epoch": 0.8440151651922729,
"grad_norm": 6.271793365478516,
"learning_rate": 1.9266800401203613e-05,
"loss": 0.8364,
"step": 9350
},
{
"epoch": 0.8485286152735151,
"grad_norm": 11.07026481628418,
"learning_rate": 1.919157472417252e-05,
"loss": 0.726,
"step": 9400
},
{
"epoch": 0.8530420653547571,
"grad_norm": 5.571475028991699,
"learning_rate": 1.9116349047141425e-05,
"loss": 0.7205,
"step": 9450
},
{
"epoch": 0.8575555154359993,
"grad_norm": 3.4866223335266113,
"learning_rate": 1.9041123370110334e-05,
"loss": 0.7832,
"step": 9500
},
{
"epoch": 0.8575555154359993,
"eval_exact_match": 85.44938505203406,
"eval_f1": 91.84507576310226,
"eval_runtime": 143.5379,
"eval_samples_per_second": 75.228,
"eval_steps_per_second": 18.81,
"step": 9500
},
{
"epoch": 0.8620689655172413,
"grad_norm": 2.2408883571624756,
"learning_rate": 1.896589769307924e-05,
"loss": 0.7533,
"step": 9550
},
{
"epoch": 0.8665824155984835,
"grad_norm": 13.415377616882324,
"learning_rate": 1.8890672016048142e-05,
"loss": 0.7669,
"step": 9600
},
{
"epoch": 0.8710958656797256,
"grad_norm": 4.730581760406494,
"learning_rate": 1.881544633901705e-05,
"loss": 0.7468,
"step": 9650
},
{
"epoch": 0.8756093157609677,
"grad_norm": 6.725691318511963,
"learning_rate": 1.8740220661985957e-05,
"loss": 0.7426,
"step": 9700
},
{
"epoch": 0.8801227658422098,
"grad_norm": 8.169360160827637,
"learning_rate": 1.8664994984954863e-05,
"loss": 0.8436,
"step": 9750
},
{
"epoch": 0.8801227658422098,
"eval_exact_match": 84.88174077578051,
"eval_f1": 91.83275837323971,
"eval_runtime": 143.4694,
"eval_samples_per_second": 75.263,
"eval_steps_per_second": 18.819,
"step": 9750
},
{
"epoch": 0.8846362159234519,
"grad_norm": 1.717469334602356,
"learning_rate": 1.8589769307923772e-05,
"loss": 0.7889,
"step": 9800
},
{
"epoch": 0.889149666004694,
"grad_norm": 20.31835174560547,
"learning_rate": 1.8514543630892678e-05,
"loss": 0.7648,
"step": 9850
},
{
"epoch": 0.8936631160859361,
"grad_norm": 15.77481746673584,
"learning_rate": 1.8439317953861584e-05,
"loss": 0.7259,
"step": 9900
},
{
"epoch": 0.8981765661671782,
"grad_norm": 3.87709641456604,
"learning_rate": 1.8364092276830493e-05,
"loss": 0.7866,
"step": 9950
},
{
"epoch": 0.9026900162484203,
"grad_norm": 8.835536003112793,
"learning_rate": 1.82888665997994e-05,
"loss": 0.6775,
"step": 10000
},
{
"epoch": 0.9026900162484203,
"eval_exact_match": 85.99810785241249,
"eval_f1": 91.99639894905705,
"eval_runtime": 143.5348,
"eval_samples_per_second": 75.229,
"eval_steps_per_second": 18.811,
"step": 10000
},
{
"epoch": 0.9072034663296624,
"grad_norm": 12.733137130737305,
"learning_rate": 1.8213640922768305e-05,
"loss": 0.7703,
"step": 10050
},
{
"epoch": 0.9117169164109045,
"grad_norm": 12.40443229675293,
"learning_rate": 1.8138415245737214e-05,
"loss": 0.7619,
"step": 10100
},
{
"epoch": 0.9162303664921466,
"grad_norm": 6.346498012542725,
"learning_rate": 1.806318956870612e-05,
"loss": 0.7605,
"step": 10150
},
{
"epoch": 0.9207438165733887,
"grad_norm": 5.372687816619873,
"learning_rate": 1.7987963891675026e-05,
"loss": 0.6591,
"step": 10200
},
{
"epoch": 0.9252572666546308,
"grad_norm": 4.377304553985596,
"learning_rate": 1.7912738214643932e-05,
"loss": 0.8404,
"step": 10250
},
{
"epoch": 0.9252572666546308,
"eval_exact_match": 85.89403973509934,
"eval_f1": 92.11360231029698,
"eval_runtime": 144.2084,
"eval_samples_per_second": 74.878,
"eval_steps_per_second": 18.723,
"step": 10250
},
{
"epoch": 0.9297707167358729,
"grad_norm": 11.62856388092041,
"learning_rate": 1.7837512537612838e-05,
"loss": 0.7712,
"step": 10300
},
{
"epoch": 0.934284166817115,
"grad_norm": 5.723257541656494,
"learning_rate": 1.7762286860581743e-05,
"loss": 0.7171,
"step": 10350
},
{
"epoch": 0.9387976168983571,
"grad_norm": 6.060873031616211,
"learning_rate": 1.7687061183550653e-05,
"loss": 0.7324,
"step": 10400
},
{
"epoch": 0.9433110669795992,
"grad_norm": 4.51533842086792,
"learning_rate": 1.761183550651956e-05,
"loss": 0.7633,
"step": 10450
},
{
"epoch": 0.9478245170608413,
"grad_norm": 11.809548377990723,
"learning_rate": 1.7536609829488464e-05,
"loss": 0.8111,
"step": 10500
},
{
"epoch": 0.9478245170608413,
"eval_exact_match": 85.09933774834437,
"eval_f1": 91.93045017438146,
"eval_runtime": 143.4235,
"eval_samples_per_second": 75.288,
"eval_steps_per_second": 18.825,
"step": 10500
},
{
"epoch": 0.9523379671420834,
"grad_norm": 15.76356029510498,
"learning_rate": 1.7461384152457374e-05,
"loss": 0.7013,
"step": 10550
},
{
"epoch": 0.9568514172233256,
"grad_norm": 23.272687911987305,
"learning_rate": 1.738615847542628e-05,
"loss": 0.8229,
"step": 10600
},
{
"epoch": 0.9613648673045676,
"grad_norm": 16.758358001708984,
"learning_rate": 1.7310932798395185e-05,
"loss": 0.7479,
"step": 10650
},
{
"epoch": 0.9658783173858098,
"grad_norm": 14.670035362243652,
"learning_rate": 1.7235707121364094e-05,
"loss": 0.8,
"step": 10700
},
{
"epoch": 0.9703917674670518,
"grad_norm": 5.1286821365356445,
"learning_rate": 1.7160481444333e-05,
"loss": 0.8522,
"step": 10750
},
{
"epoch": 0.9703917674670518,
"eval_exact_match": 85.07095553453169,
"eval_f1": 92.09316544794538,
"eval_runtime": 143.6698,
"eval_samples_per_second": 75.158,
"eval_steps_per_second": 18.793,
"step": 10750
},
{
"epoch": 0.974905217548294,
"grad_norm": 8.131464004516602,
"learning_rate": 1.7085255767301906e-05,
"loss": 0.7949,
"step": 10800
},
{
"epoch": 0.979418667629536,
"grad_norm": 22.16661834716797,
"learning_rate": 1.7010030090270815e-05,
"loss": 0.7486,
"step": 10850
},
{
"epoch": 0.9839321177107782,
"grad_norm": 5.554388046264648,
"learning_rate": 1.693480441323972e-05,
"loss": 0.7604,
"step": 10900
},
{
"epoch": 0.9884455677920202,
"grad_norm": 8.525761604309082,
"learning_rate": 1.6859578736208624e-05,
"loss": 0.766,
"step": 10950
},
{
"epoch": 0.9929590178732624,
"grad_norm": 10.504690170288086,
"learning_rate": 1.6784353059177533e-05,
"loss": 0.7166,
"step": 11000
},
{
"epoch": 0.9929590178732624,
"eval_exact_match": 85.04257332071901,
"eval_f1": 92.02241474371678,
"eval_runtime": 143.4783,
"eval_samples_per_second": 75.259,
"eval_steps_per_second": 18.818,
"step": 11000
},
{
"epoch": 0.9974724679545044,
"grad_norm": 7.378440856933594,
"learning_rate": 1.670912738214644e-05,
"loss": 0.7187,
"step": 11050
},
{
"epoch": 1.0019859180357464,
"grad_norm": 3.172842502593994,
"learning_rate": 1.6633901705115345e-05,
"loss": 0.7532,
"step": 11100
},
{
"epoch": 1.0064993681169887,
"grad_norm": 2.3299856185913086,
"learning_rate": 1.6558676028084254e-05,
"loss": 0.497,
"step": 11150
},
{
"epoch": 1.0110128181982307,
"grad_norm": 8.6509428024292,
"learning_rate": 1.648345035105316e-05,
"loss": 0.4497,
"step": 11200
},
{
"epoch": 1.0155262682794728,
"grad_norm": 9.68758773803711,
"learning_rate": 1.6408224674022065e-05,
"loss": 0.6154,
"step": 11250
},
{
"epoch": 1.0155262682794728,
"eval_exact_match": 86.08325449385052,
"eval_f1": 92.38528194318762,
"eval_runtime": 143.3576,
"eval_samples_per_second": 75.322,
"eval_steps_per_second": 18.834,
"step": 11250
},
{
"epoch": 1.0200397183607148,
"grad_norm": 3.34212064743042,
"learning_rate": 1.6332998996990975e-05,
"loss": 0.5372,
"step": 11300
},
{
"epoch": 1.024553168441957,
"grad_norm": 5.384337425231934,
"learning_rate": 1.625777331995988e-05,
"loss": 0.5464,
"step": 11350
},
{
"epoch": 1.0290666185231991,
"grad_norm": 19.279573440551758,
"learning_rate": 1.6182547642928786e-05,
"loss": 0.5558,
"step": 11400
},
{
"epoch": 1.0335800686044412,
"grad_norm": 5.5248308181762695,
"learning_rate": 1.6107321965897696e-05,
"loss": 0.4981,
"step": 11450
},
{
"epoch": 1.0380935186856832,
"grad_norm": 5.657703399658203,
"learning_rate": 1.60320962888666e-05,
"loss": 0.6565,
"step": 11500
},
{
"epoch": 1.0380935186856832,
"eval_exact_match": 85.58183538315988,
"eval_f1": 92.11318103014378,
"eval_runtime": 143.5023,
"eval_samples_per_second": 75.246,
"eval_steps_per_second": 18.815,
"step": 11500
},
{
"epoch": 1.0426069687669255,
"grad_norm": 6.387887954711914,
"learning_rate": 1.5956870611835507e-05,
"loss": 0.588,
"step": 11550
},
{
"epoch": 1.0471204188481675,
"grad_norm": 1.7305879592895508,
"learning_rate": 1.5881644934804413e-05,
"loss": 0.5747,
"step": 11600
},
{
"epoch": 1.0516338689294096,
"grad_norm": 14.716680526733398,
"learning_rate": 1.580641925777332e-05,
"loss": 0.5632,
"step": 11650
},
{
"epoch": 1.0561473190106518,
"grad_norm": 9.127685546875,
"learning_rate": 1.5731193580742225e-05,
"loss": 0.4897,
"step": 11700
},
{
"epoch": 1.0606607690918939,
"grad_norm": 8.541461944580078,
"learning_rate": 1.5655967903711134e-05,
"loss": 0.544,
"step": 11750
},
{
"epoch": 1.0606607690918939,
"eval_exact_match": 86.16840113528855,
"eval_f1": 92.442978713336,
"eval_runtime": 143.2851,
"eval_samples_per_second": 75.36,
"eval_steps_per_second": 18.844,
"step": 11750
},
{
"epoch": 1.065174219173136,
"grad_norm": 6.538851737976074,
"learning_rate": 1.558074222668004e-05,
"loss": 0.5202,
"step": 11800
},
{
"epoch": 1.069687669254378,
"grad_norm": 7.314679145812988,
"learning_rate": 1.5505516549648946e-05,
"loss": 0.54,
"step": 11850
},
{
"epoch": 1.0742011193356202,
"grad_norm": 2.3385446071624756,
"learning_rate": 1.5430290872617855e-05,
"loss": 0.5192,
"step": 11900
},
{
"epoch": 1.0787145694168623,
"grad_norm": 26.518877029418945,
"learning_rate": 1.535506519558676e-05,
"loss": 0.5435,
"step": 11950
},
{
"epoch": 1.0832280194981043,
"grad_norm": 39.6591682434082,
"learning_rate": 1.5279839518555667e-05,
"loss": 0.5149,
"step": 12000
},
{
"epoch": 1.0832280194981043,
"eval_exact_match": 85.80889309366131,
"eval_f1": 92.20769990556119,
"eval_runtime": 143.6087,
"eval_samples_per_second": 75.19,
"eval_steps_per_second": 18.801,
"step": 12000
},
{
"epoch": 1.0877414695793464,
"grad_norm": 11.38036823272705,
"learning_rate": 1.5204613841524576e-05,
"loss": 0.526,
"step": 12050
},
{
"epoch": 1.0922549196605886,
"grad_norm": 21.02750587463379,
"learning_rate": 1.512938816449348e-05,
"loss": 0.471,
"step": 12100
},
{
"epoch": 1.0967683697418307,
"grad_norm": 23.10146713256836,
"learning_rate": 1.5054162487462386e-05,
"loss": 0.6643,
"step": 12150
},
{
"epoch": 1.1012818198230727,
"grad_norm": 29.241615295410156,
"learning_rate": 1.4978936810431293e-05,
"loss": 0.5231,
"step": 12200
},
{
"epoch": 1.1057952699043148,
"grad_norm": 3.3990285396575928,
"learning_rate": 1.49037111334002e-05,
"loss": 0.4051,
"step": 12250
},
{
"epoch": 1.1057952699043148,
"eval_exact_match": 85.56291390728477,
"eval_f1": 92.0566633980034,
"eval_runtime": 143.183,
"eval_samples_per_second": 75.414,
"eval_steps_per_second": 18.857,
"step": 12250
},
{
"epoch": 1.110308719985557,
"grad_norm": 3.075737237930298,
"learning_rate": 1.4828485456369108e-05,
"loss": 0.5637,
"step": 12300
},
{
"epoch": 1.114822170066799,
"grad_norm": 2.9517650604248047,
"learning_rate": 1.4753259779338014e-05,
"loss": 0.566,
"step": 12350
},
{
"epoch": 1.119335620148041,
"grad_norm": 23.89853858947754,
"learning_rate": 1.4678034102306922e-05,
"loss": 0.4439,
"step": 12400
},
{
"epoch": 1.1238490702292832,
"grad_norm": 8.486159324645996,
"learning_rate": 1.4602808425275828e-05,
"loss": 0.5538,
"step": 12450
},
{
"epoch": 1.1283625203105254,
"grad_norm": 3.5648648738861084,
"learning_rate": 1.4527582748244733e-05,
"loss": 0.5173,
"step": 12500
},
{
"epoch": 1.1283625203105254,
"eval_exact_match": 85.4872280037843,
"eval_f1": 92.15147631309604,
"eval_runtime": 143.2081,
"eval_samples_per_second": 75.401,
"eval_steps_per_second": 18.854,
"step": 12500
},
{
"epoch": 1.1328759703917675,
"grad_norm": 10.259268760681152,
"learning_rate": 1.4452357071213641e-05,
"loss": 0.5925,
"step": 12550
},
{
"epoch": 1.1373894204730095,
"grad_norm": 6.570536136627197,
"learning_rate": 1.4377131394182548e-05,
"loss": 0.4594,
"step": 12600
},
{
"epoch": 1.1419028705542518,
"grad_norm": 6.687112808227539,
"learning_rate": 1.4301905717151454e-05,
"loss": 0.4994,
"step": 12650
},
{
"epoch": 1.1464163206354938,
"grad_norm": 14.550410270690918,
"learning_rate": 1.4226680040120362e-05,
"loss": 0.5775,
"step": 12700
},
{
"epoch": 1.1509297707167359,
"grad_norm": 12.998605728149414,
"learning_rate": 1.4151454363089268e-05,
"loss": 0.5285,
"step": 12750
},
{
"epoch": 1.1509297707167359,
"eval_exact_match": 85.93188268684958,
"eval_f1": 92.11716297833141,
"eval_runtime": 143.1084,
"eval_samples_per_second": 75.453,
"eval_steps_per_second": 18.867,
"step": 12750
},
{
"epoch": 1.155443220797978,
"grad_norm": 26.985210418701172,
"learning_rate": 1.4076228686058175e-05,
"loss": 0.5092,
"step": 12800
},
{
"epoch": 1.1599566708792202,
"grad_norm": 15.445883750915527,
"learning_rate": 1.4001003009027081e-05,
"loss": 0.5178,
"step": 12850
},
{
"epoch": 1.1644701209604622,
"grad_norm": 8.596466064453125,
"learning_rate": 1.3925777331995989e-05,
"loss": 0.5742,
"step": 12900
},
{
"epoch": 1.1689835710417043,
"grad_norm": 3.9060676097869873,
"learning_rate": 1.3850551654964896e-05,
"loss": 0.5112,
"step": 12950
},
{
"epoch": 1.1734970211229463,
"grad_norm": 2.3090436458587646,
"learning_rate": 1.3775325977933802e-05,
"loss": 0.4802,
"step": 13000
},
{
"epoch": 1.1734970211229463,
"eval_exact_match": 86.3670766319773,
"eval_f1": 92.39148643540621,
"eval_runtime": 143.1331,
"eval_samples_per_second": 75.44,
"eval_steps_per_second": 18.864,
"step": 13000
},
{
"epoch": 1.1780104712041886,
"grad_norm": 4.289682865142822,
"learning_rate": 1.370010030090271e-05,
"loss": 0.4555,
"step": 13050
},
{
"epoch": 1.1825239212854306,
"grad_norm": 23.45159149169922,
"learning_rate": 1.3624874623871615e-05,
"loss": 0.6034,
"step": 13100
},
{
"epoch": 1.1870373713666726,
"grad_norm": 14.170953750610352,
"learning_rate": 1.3549648946840521e-05,
"loss": 0.4946,
"step": 13150
},
{
"epoch": 1.191550821447915,
"grad_norm": 7.408278942108154,
"learning_rate": 1.3474423269809429e-05,
"loss": 0.5625,
"step": 13200
},
{
"epoch": 1.196064271529157,
"grad_norm": 4.187251567840576,
"learning_rate": 1.3399197592778336e-05,
"loss": 0.5344,
"step": 13250
},
{
"epoch": 1.196064271529157,
"eval_exact_match": 85.66698202459791,
"eval_f1": 92.29328625942796,
"eval_runtime": 143.2334,
"eval_samples_per_second": 75.387,
"eval_steps_per_second": 18.85,
"step": 13250
},
{
"epoch": 1.200577721610399,
"grad_norm": 9.739165306091309,
"learning_rate": 1.3323971915747242e-05,
"loss": 0.5319,
"step": 13300
},
{
"epoch": 1.205091171691641,
"grad_norm": 3.0962629318237305,
"learning_rate": 1.324874623871615e-05,
"loss": 0.5455,
"step": 13350
},
{
"epoch": 1.209604621772883,
"grad_norm": 10.260982513427734,
"learning_rate": 1.3173520561685057e-05,
"loss": 0.5922,
"step": 13400
},
{
"epoch": 1.2141180718541253,
"grad_norm": 17.95406150817871,
"learning_rate": 1.3098294884653961e-05,
"loss": 0.6416,
"step": 13450
},
{
"epoch": 1.2186315219353674,
"grad_norm": 9.253098487854004,
"learning_rate": 1.3023069207622869e-05,
"loss": 0.4543,
"step": 13500
},
{
"epoch": 1.2186315219353674,
"eval_exact_match": 86.20624408703878,
"eval_f1": 92.32664235875168,
"eval_runtime": 143.2002,
"eval_samples_per_second": 75.405,
"eval_steps_per_second": 18.855,
"step": 13500
},
{
"epoch": 1.2231449720166094,
"grad_norm": 9.202949523925781,
"learning_rate": 1.2947843530591776e-05,
"loss": 0.6569,
"step": 13550
},
{
"epoch": 1.2276584220978517,
"grad_norm": 16.244760513305664,
"learning_rate": 1.2872617853560682e-05,
"loss": 0.5605,
"step": 13600
},
{
"epoch": 1.2321718721790937,
"grad_norm": 2.6242430210113525,
"learning_rate": 1.279739217652959e-05,
"loss": 0.546,
"step": 13650
},
{
"epoch": 1.2366853222603358,
"grad_norm": 19.960708618164062,
"learning_rate": 1.2722166499498497e-05,
"loss": 0.5916,
"step": 13700
},
{
"epoch": 1.2411987723415778,
"grad_norm": 14.39201545715332,
"learning_rate": 1.2646940822467401e-05,
"loss": 0.4647,
"step": 13750
},
{
"epoch": 1.2411987723415778,
"eval_exact_match": 86.27246925260171,
"eval_f1": 92.45848778749898,
"eval_runtime": 143.112,
"eval_samples_per_second": 75.451,
"eval_steps_per_second": 18.866,
"step": 13750
},
{
"epoch": 1.24571222242282,
"grad_norm": 8.58752155303955,
"learning_rate": 1.2571715145436309e-05,
"loss": 0.5224,
"step": 13800
},
{
"epoch": 1.2502256725040621,
"grad_norm": 5.419035911560059,
"learning_rate": 1.2496489468405216e-05,
"loss": 0.6743,
"step": 13850
},
{
"epoch": 1.2547391225853042,
"grad_norm": 7.52559232711792,
"learning_rate": 1.2421263791374122e-05,
"loss": 0.5955,
"step": 13900
},
{
"epoch": 1.2592525726665462,
"grad_norm": 15.449511528015137,
"learning_rate": 1.234603811434303e-05,
"loss": 0.5824,
"step": 13950
},
{
"epoch": 1.2637660227477885,
"grad_norm": 11.266414642333984,
"learning_rate": 1.2270812437311937e-05,
"loss": 0.5101,
"step": 14000
},
{
"epoch": 1.2637660227477885,
"eval_exact_match": 86.12109744560075,
"eval_f1": 92.2146350604068,
"eval_runtime": 145.9619,
"eval_samples_per_second": 73.978,
"eval_steps_per_second": 18.498,
"step": 14000
},
{
"epoch": 1.2682794728290305,
"grad_norm": 7.292428970336914,
"learning_rate": 1.2195586760280843e-05,
"loss": 0.4962,
"step": 14050
},
{
"epoch": 1.2727929229102726,
"grad_norm": 1.1534169912338257,
"learning_rate": 1.2120361083249749e-05,
"loss": 0.5691,
"step": 14100
},
{
"epoch": 1.2773063729915148,
"grad_norm": 3.5983633995056152,
"learning_rate": 1.2045135406218656e-05,
"loss": 0.5114,
"step": 14150
},
{
"epoch": 1.2818198230727569,
"grad_norm": 5.006545543670654,
"learning_rate": 1.1969909729187562e-05,
"loss": 0.5259,
"step": 14200
},
{
"epoch": 1.286333273153999,
"grad_norm": 24.31420135498047,
"learning_rate": 1.189468405215647e-05,
"loss": 0.4771,
"step": 14250
},
{
"epoch": 1.286333273153999,
"eval_exact_match": 86.59413434247871,
"eval_f1": 92.49677313517446,
"eval_runtime": 145.8932,
"eval_samples_per_second": 74.013,
"eval_steps_per_second": 18.507,
"step": 14250
},
{
"epoch": 1.290846723235241,
"grad_norm": 8.447436332702637,
"learning_rate": 1.1819458375125377e-05,
"loss": 0.5835,
"step": 14300
},
{
"epoch": 1.295360173316483,
"grad_norm": 23.178955078125,
"learning_rate": 1.1744232698094283e-05,
"loss": 0.5554,
"step": 14350
},
{
"epoch": 1.2998736233977253,
"grad_norm": 16.500057220458984,
"learning_rate": 1.166900702106319e-05,
"loss": 0.4928,
"step": 14400
},
{
"epoch": 1.3043870734789673,
"grad_norm": 8.389457702636719,
"learning_rate": 1.1593781344032097e-05,
"loss": 0.6872,
"step": 14450
},
{
"epoch": 1.3089005235602094,
"grad_norm": 5.315954685211182,
"learning_rate": 1.1518555667001002e-05,
"loss": 0.5394,
"step": 14500
},
{
"epoch": 1.3089005235602094,
"eval_exact_match": 86.45222327341533,
"eval_f1": 92.58788732745475,
"eval_runtime": 145.7277,
"eval_samples_per_second": 74.097,
"eval_steps_per_second": 18.528,
"step": 14500
},
{
"epoch": 1.3134139736414516,
"grad_norm": 2.0151515007019043,
"learning_rate": 1.144332998996991e-05,
"loss": 0.4745,
"step": 14550
},
{
"epoch": 1.3179274237226937,
"grad_norm": 6.995370864868164,
"learning_rate": 1.1368104312938817e-05,
"loss": 0.536,
"step": 14600
},
{
"epoch": 1.3224408738039357,
"grad_norm": 4.453261852264404,
"learning_rate": 1.1292878635907723e-05,
"loss": 0.4909,
"step": 14650
},
{
"epoch": 1.3269543238851778,
"grad_norm": 3.472259998321533,
"learning_rate": 1.121765295887663e-05,
"loss": 0.586,
"step": 14700
},
{
"epoch": 1.3314677739664198,
"grad_norm": 15.908103942871094,
"learning_rate": 1.1142427281845537e-05,
"loss": 0.528,
"step": 14750
},
{
"epoch": 1.3314677739664198,
"eval_exact_match": 85.86565752128666,
"eval_f1": 92.39225966105154,
"eval_runtime": 280.4149,
"eval_samples_per_second": 38.507,
"eval_steps_per_second": 9.629,
"step": 14750
},
{
"epoch": 1.335981224047662,
"grad_norm": 12.543098449707031,
"learning_rate": 1.1067201604814443e-05,
"loss": 0.4912,
"step": 14800
},
{
"epoch": 1.340494674128904,
"grad_norm": 24.144222259521484,
"learning_rate": 1.099197592778335e-05,
"loss": 0.5626,
"step": 14850
},
{
"epoch": 1.3450081242101461,
"grad_norm": 25.347875595092773,
"learning_rate": 1.0916750250752258e-05,
"loss": 0.4955,
"step": 14900
},
{
"epoch": 1.3495215742913884,
"grad_norm": 5.940708637237549,
"learning_rate": 1.0841524573721163e-05,
"loss": 0.4859,
"step": 14950
},
{
"epoch": 1.3540350243726305,
"grad_norm": 30.9013671875,
"learning_rate": 1.0766298896690071e-05,
"loss": 0.4788,
"step": 15000
},
{
"epoch": 1.3540350243726305,
"eval_exact_match": 86.6414380321665,
"eval_f1": 92.62032707644155,
"eval_runtime": 145.6846,
"eval_samples_per_second": 74.119,
"eval_steps_per_second": 18.533,
"step": 15000
},
{
"epoch": 1.3585484744538725,
"grad_norm": 12.216713905334473,
"learning_rate": 1.0691073219658978e-05,
"loss": 0.4977,
"step": 15050
},
{
"epoch": 1.3630619245351148,
"grad_norm": 40.2611083984375,
"learning_rate": 1.0615847542627883e-05,
"loss": 0.4972,
"step": 15100
},
{
"epoch": 1.3675753746163568,
"grad_norm": 10.3711519241333,
"learning_rate": 1.054062186559679e-05,
"loss": 0.4955,
"step": 15150
},
{
"epoch": 1.3720888246975989,
"grad_norm": 2.400322914123535,
"learning_rate": 1.0465396188565698e-05,
"loss": 0.4868,
"step": 15200
},
{
"epoch": 1.376602274778841,
"grad_norm": 3.9988925457000732,
"learning_rate": 1.0390170511534603e-05,
"loss": 0.5888,
"step": 15250
},
{
"epoch": 1.376602274778841,
"eval_exact_match": 85.96026490066225,
"eval_f1": 92.33197764854948,
"eval_runtime": 145.4185,
"eval_samples_per_second": 74.255,
"eval_steps_per_second": 18.567,
"step": 15250
},
{
"epoch": 1.381115724860083,
"grad_norm": 1.6575514078140259,
"learning_rate": 1.0314944834503511e-05,
"loss": 0.5143,
"step": 15300
},
{
"epoch": 1.3856291749413252,
"grad_norm": 5.943323612213135,
"learning_rate": 1.0239719157472419e-05,
"loss": 0.4725,
"step": 15350
},
{
"epoch": 1.3901426250225672,
"grad_norm": 21.014570236206055,
"learning_rate": 1.0164493480441324e-05,
"loss": 0.5131,
"step": 15400
},
{
"epoch": 1.3946560751038093,
"grad_norm": 4.148115634918213,
"learning_rate": 1.008926780341023e-05,
"loss": 0.4323,
"step": 15450
},
{
"epoch": 1.3991695251850516,
"grad_norm": 8.95993423461914,
"learning_rate": 1.0014042126379138e-05,
"loss": 0.6072,
"step": 15500
},
{
"epoch": 1.3991695251850516,
"eval_exact_match": 85.76158940397352,
"eval_f1": 92.24988076673156,
"eval_runtime": 145.3259,
"eval_samples_per_second": 74.302,
"eval_steps_per_second": 18.579,
"step": 15500
},
{
"epoch": 1.4036829752662936,
"grad_norm": 5.6876959800720215,
"learning_rate": 9.938816449348044e-06,
"loss": 0.5692,
"step": 15550
},
{
"epoch": 1.4081964253475356,
"grad_norm": 6.91029167175293,
"learning_rate": 9.863590772316951e-06,
"loss": 0.5801,
"step": 15600
},
{
"epoch": 1.4127098754287777,
"grad_norm": 8.116116523742676,
"learning_rate": 9.788365095285859e-06,
"loss": 0.4394,
"step": 15650
},
{
"epoch": 1.4172233255100197,
"grad_norm": 7.001738548278809,
"learning_rate": 9.713139418254764e-06,
"loss": 0.5607,
"step": 15700
},
{
"epoch": 1.421736775591262,
"grad_norm": 21.804443359375,
"learning_rate": 9.637913741223672e-06,
"loss": 0.5191,
"step": 15750
},
{
"epoch": 1.421736775591262,
"eval_exact_match": 85.97918637653737,
"eval_f1": 92.37275066667881,
"eval_runtime": 145.3503,
"eval_samples_per_second": 74.289,
"eval_steps_per_second": 18.576,
"step": 15750
},
{
"epoch": 1.426250225672504,
"grad_norm": 11.133319854736328,
"learning_rate": 9.562688064192578e-06,
"loss": 0.5256,
"step": 15800
},
{
"epoch": 1.430763675753746,
"grad_norm": 5.4904632568359375,
"learning_rate": 9.487462387161484e-06,
"loss": 0.5177,
"step": 15850
},
{
"epoch": 1.4352771258349883,
"grad_norm": 9.791414260864258,
"learning_rate": 9.412236710130391e-06,
"loss": 0.5814,
"step": 15900
},
{
"epoch": 1.4397905759162304,
"grad_norm": 3.3400447368621826,
"learning_rate": 9.337011033099299e-06,
"loss": 0.5374,
"step": 15950
},
{
"epoch": 1.4443040259974724,
"grad_norm": 23.98038673400879,
"learning_rate": 9.261785356068205e-06,
"loss": 0.4973,
"step": 16000
},
{
"epoch": 1.4443040259974724,
"eval_exact_match": 84.85335856196784,
"eval_f1": 92.03377983249271,
"eval_runtime": 145.2676,
"eval_samples_per_second": 74.332,
"eval_steps_per_second": 18.586,
"step": 16000
},
{
"epoch": 1.4488174760787147,
"grad_norm": 9.59720516204834,
"learning_rate": 9.186559679037112e-06,
"loss": 0.534,
"step": 16050
},
{
"epoch": 1.4533309261599567,
"grad_norm": 10.079476356506348,
"learning_rate": 9.111334002006018e-06,
"loss": 0.511,
"step": 16100
},
{
"epoch": 1.4578443762411988,
"grad_norm": 3.377192497253418,
"learning_rate": 9.036108324974924e-06,
"loss": 0.5187,
"step": 16150
},
{
"epoch": 1.4623578263224408,
"grad_norm": 10.79287052154541,
"learning_rate": 8.960882647943831e-06,
"loss": 0.5499,
"step": 16200
},
{
"epoch": 1.4668712764036829,
"grad_norm": 15.751055717468262,
"learning_rate": 8.885656970912739e-06,
"loss": 0.536,
"step": 16250
},
{
"epoch": 1.4668712764036829,
"eval_exact_match": 86.40491958372753,
"eval_f1": 92.48354485469106,
"eval_runtime": 145.2653,
"eval_samples_per_second": 74.333,
"eval_steps_per_second": 18.587,
"step": 16250
},
{
"epoch": 1.4713847264849251,
"grad_norm": 3.9903676509857178,
"learning_rate": 8.810431293881645e-06,
"loss": 0.5385,
"step": 16300
},
{
"epoch": 1.4758981765661672,
"grad_norm": 6.5839080810546875,
"learning_rate": 8.735205616850552e-06,
"loss": 0.5092,
"step": 16350
},
{
"epoch": 1.4804116266474092,
"grad_norm": 13.69189453125,
"learning_rate": 8.65997993981946e-06,
"loss": 0.5999,
"step": 16400
},
{
"epoch": 1.4849250767286515,
"grad_norm": 11.840332984924316,
"learning_rate": 8.584754262788364e-06,
"loss": 0.57,
"step": 16450
},
{
"epoch": 1.4894385268098935,
"grad_norm": 11.86502742767334,
"learning_rate": 8.509528585757271e-06,
"loss": 0.4635,
"step": 16500
},
{
"epoch": 1.4894385268098935,
"eval_exact_match": 85.49668874172185,
"eval_f1": 92.28753097702375,
"eval_runtime": 145.2047,
"eval_samples_per_second": 74.364,
"eval_steps_per_second": 18.594,
"step": 16500
},
{
"epoch": 1.4939519768911356,
"grad_norm": 17.741037368774414,
"learning_rate": 8.434302908726179e-06,
"loss": 0.53,
"step": 16550
},
{
"epoch": 1.4984654269723776,
"grad_norm": 7.774323463439941,
"learning_rate": 8.359077231695085e-06,
"loss": 0.575,
"step": 16600
},
{
"epoch": 1.5029788770536197,
"grad_norm": 4.973544597625732,
"learning_rate": 8.283851554663992e-06,
"loss": 0.6468,
"step": 16650
},
{
"epoch": 1.507492327134862,
"grad_norm": 5.228555202484131,
"learning_rate": 8.2086258776329e-06,
"loss": 0.4901,
"step": 16700
},
{
"epoch": 1.512005777216104,
"grad_norm": 3.4082319736480713,
"learning_rate": 8.133400200601806e-06,
"loss": 0.5492,
"step": 16750
},
{
"epoch": 1.512005777216104,
"eval_exact_match": 86.10217596972564,
"eval_f1": 92.39499925755595,
"eval_runtime": 145.1347,
"eval_samples_per_second": 74.4,
"eval_steps_per_second": 18.603,
"step": 16750
},
{
"epoch": 1.516519227297346,
"grad_norm": 20.74472427368164,
"learning_rate": 8.058174523570712e-06,
"loss": 0.4685,
"step": 16800
},
{
"epoch": 1.5210326773785883,
"grad_norm": 4.805381774902344,
"learning_rate": 7.982948846539619e-06,
"loss": 0.474,
"step": 16850
},
{
"epoch": 1.5255461274598303,
"grad_norm": 1.6686218976974487,
"learning_rate": 7.907723169508525e-06,
"loss": 0.5617,
"step": 16900
},
{
"epoch": 1.5300595775410724,
"grad_norm": 14.378780364990234,
"learning_rate": 7.832497492477432e-06,
"loss": 0.4829,
"step": 16950
},
{
"epoch": 1.5345730276223146,
"grad_norm": 9.25706672668457,
"learning_rate": 7.75727181544634e-06,
"loss": 0.5525,
"step": 17000
},
{
"epoch": 1.5345730276223146,
"eval_exact_match": 85.89403973509934,
"eval_f1": 92.2175102581889,
"eval_runtime": 145.4454,
"eval_samples_per_second": 74.241,
"eval_steps_per_second": 18.564,
"step": 17000
},
{
"epoch": 1.5390864777035564,
"grad_norm": 10.210553169250488,
"learning_rate": 7.682046138415246e-06,
"loss": 0.5284,
"step": 17050
},
{
"epoch": 1.5435999277847987,
"grad_norm": 18.55254364013672,
"learning_rate": 7.6068204613841525e-06,
"loss": 0.4863,
"step": 17100
},
{
"epoch": 1.5481133778660408,
"grad_norm": 9.640850067138672,
"learning_rate": 7.53159478435306e-06,
"loss": 0.6163,
"step": 17150
},
{
"epoch": 1.5526268279472828,
"grad_norm": 7.999804496765137,
"learning_rate": 7.456369107321966e-06,
"loss": 0.518,
"step": 17200
},
{
"epoch": 1.557140278028525,
"grad_norm": 9.162345886230469,
"learning_rate": 7.3811434302908725e-06,
"loss": 0.5001,
"step": 17250
},
{
"epoch": 1.557140278028525,
"eval_exact_match": 85.9035004730369,
"eval_f1": 92.33731500742522,
"eval_runtime": 145.3659,
"eval_samples_per_second": 74.282,
"eval_steps_per_second": 18.574,
"step": 17250
},
{
"epoch": 1.561653728109767,
"grad_norm": 5.462348461151123,
"learning_rate": 7.30591775325978e-06,
"loss": 0.4179,
"step": 17300
},
{
"epoch": 1.5661671781910091,
"grad_norm": 17.67523765563965,
"learning_rate": 7.230692076228686e-06,
"loss": 0.4662,
"step": 17350
},
{
"epoch": 1.5706806282722514,
"grad_norm": 4.397737503051758,
"learning_rate": 7.155466399197593e-06,
"loss": 0.4614,
"step": 17400
},
{
"epoch": 1.5751940783534935,
"grad_norm": 7.665886402130127,
"learning_rate": 7.0802407221665e-06,
"loss": 0.5263,
"step": 17450
},
{
"epoch": 1.5797075284347355,
"grad_norm": 10.627632141113281,
"learning_rate": 7.005015045135407e-06,
"loss": 0.4021,
"step": 17500
},
{
"epoch": 1.5797075284347355,
"eval_exact_match": 85.98864711447493,
"eval_f1": 92.40008411758966,
"eval_runtime": 145.3304,
"eval_samples_per_second": 74.3,
"eval_steps_per_second": 18.578,
"step": 17500
},
{
"epoch": 1.5842209785159778,
"grad_norm": 7.057243347167969,
"learning_rate": 6.929789368104313e-06,
"loss": 0.5326,
"step": 17550
},
{
"epoch": 1.5887344285972196,
"grad_norm": 8.216778755187988,
"learning_rate": 6.85456369107322e-06,
"loss": 0.6029,
"step": 17600
},
{
"epoch": 1.5932478786784618,
"grad_norm": 3.943422794342041,
"learning_rate": 6.779338014042127e-06,
"loss": 0.5518,
"step": 17650
},
{
"epoch": 1.597761328759704,
"grad_norm": 12.350107192993164,
"learning_rate": 6.704112337011033e-06,
"loss": 0.5368,
"step": 17700
},
{
"epoch": 1.602274778840946,
"grad_norm": 6.516546249389648,
"learning_rate": 6.62888665997994e-06,
"loss": 0.504,
"step": 17750
},
{
"epoch": 1.602274778840946,
"eval_exact_match": 86.12109744560075,
"eval_f1": 92.37865780721518,
"eval_runtime": 145.3649,
"eval_samples_per_second": 74.282,
"eval_steps_per_second": 18.574,
"step": 17750
},
{
"epoch": 1.6067882289221882,
"grad_norm": 3.5462801456451416,
"learning_rate": 6.553660982948847e-06,
"loss": 0.4613,
"step": 17800
},
{
"epoch": 1.6113016790034302,
"grad_norm": 9.32242488861084,
"learning_rate": 6.478435305917753e-06,
"loss": 0.5149,
"step": 17850
},
{
"epoch": 1.6158151290846723,
"grad_norm": 4.5879597663879395,
"learning_rate": 6.40320962888666e-06,
"loss": 0.4189,
"step": 17900
},
{
"epoch": 1.6203285791659146,
"grad_norm": 10.474478721618652,
"learning_rate": 6.327983951855567e-06,
"loss": 0.5182,
"step": 17950
},
{
"epoch": 1.6248420292471564,
"grad_norm": 4.693137168884277,
"learning_rate": 6.252758274824474e-06,
"loss": 0.6212,
"step": 18000
},
{
"epoch": 1.6248420292471564,
"eval_exact_match": 85.93188268684958,
"eval_f1": 92.33148266916612,
"eval_runtime": 145.2483,
"eval_samples_per_second": 74.342,
"eval_steps_per_second": 18.589,
"step": 18000
},
{
"epoch": 1.6293554793283986,
"grad_norm": 20.36956214904785,
"learning_rate": 6.17753259779338e-06,
"loss": 0.6194,
"step": 18050
},
{
"epoch": 1.6338689294096407,
"grad_norm": 8.450358390808105,
"learning_rate": 6.102306920762287e-06,
"loss": 0.518,
"step": 18100
},
{
"epoch": 1.6383823794908827,
"grad_norm": 4.681309223175049,
"learning_rate": 6.027081243731194e-06,
"loss": 0.4853,
"step": 18150
},
{
"epoch": 1.642895829572125,
"grad_norm": 6.480415344238281,
"learning_rate": 5.9518555667001e-06,
"loss": 0.5101,
"step": 18200
},
{
"epoch": 1.647409279653367,
"grad_norm": 11.254326820373535,
"learning_rate": 5.876629889669007e-06,
"loss": 0.5095,
"step": 18250
},
{
"epoch": 1.647409279653367,
"eval_exact_match": 86.03595080416272,
"eval_f1": 92.38608711058883,
"eval_runtime": 145.2161,
"eval_samples_per_second": 74.358,
"eval_steps_per_second": 18.593,
"step": 18250
},
{
"epoch": 1.651922729734609,
"grad_norm": 7.776529788970947,
"learning_rate": 5.801404212637914e-06,
"loss": 0.5322,
"step": 18300
},
{
"epoch": 1.6564361798158513,
"grad_norm": 17.068607330322266,
"learning_rate": 5.72617853560682e-06,
"loss": 0.4755,
"step": 18350
},
{
"epoch": 1.6609496298970934,
"grad_norm": 20.472034454345703,
"learning_rate": 5.650952858575727e-06,
"loss": 0.6017,
"step": 18400
},
{
"epoch": 1.6654630799783354,
"grad_norm": 4.904719352722168,
"learning_rate": 5.575727181544634e-06,
"loss": 0.4662,
"step": 18450
},
{
"epoch": 1.6699765300595777,
"grad_norm": 7.219258785247803,
"learning_rate": 5.500501504513541e-06,
"loss": 0.4781,
"step": 18500
},
{
"epoch": 1.6699765300595777,
"eval_exact_match": 85.55345316934721,
"eval_f1": 92.32101574735367,
"eval_runtime": 145.1791,
"eval_samples_per_second": 74.377,
"eval_steps_per_second": 18.598,
"step": 18500
},
{
"epoch": 1.6744899801408195,
"grad_norm": 8.811306953430176,
"learning_rate": 5.425275827482447e-06,
"loss": 0.5161,
"step": 18550
},
{
"epoch": 1.6790034302220618,
"grad_norm": 4.903675079345703,
"learning_rate": 5.350050150451354e-06,
"loss": 0.5215,
"step": 18600
},
{
"epoch": 1.6835168803033038,
"grad_norm": 2.371656894683838,
"learning_rate": 5.274824473420261e-06,
"loss": 0.486,
"step": 18650
},
{
"epoch": 1.6880303303845459,
"grad_norm": 8.991338729858398,
"learning_rate": 5.199598796389167e-06,
"loss": 0.5043,
"step": 18700
},
{
"epoch": 1.6925437804657881,
"grad_norm": 5.460509777069092,
"learning_rate": 5.124373119358074e-06,
"loss": 0.5344,
"step": 18750
},
{
"epoch": 1.6925437804657881,
"eval_exact_match": 86.02649006622516,
"eval_f1": 92.44635703301584,
"eval_runtime": 145.2855,
"eval_samples_per_second": 74.323,
"eval_steps_per_second": 18.584,
"step": 18750
},
{
"epoch": 1.6970572305470302,
"grad_norm": 6.287936210632324,
"learning_rate": 5.049147442326981e-06,
"loss": 0.4446,
"step": 18800
},
{
"epoch": 1.7015706806282722,
"grad_norm": 2.3766534328460693,
"learning_rate": 4.973921765295887e-06,
"loss": 0.4618,
"step": 18850
},
{
"epoch": 1.7060841307095145,
"grad_norm": 6.606088161468506,
"learning_rate": 4.898696088264794e-06,
"loss": 0.498,
"step": 18900
},
{
"epoch": 1.7105975807907563,
"grad_norm": 7.917613506317139,
"learning_rate": 4.8234704112337015e-06,
"loss": 0.473,
"step": 18950
},
{
"epoch": 1.7151110308719986,
"grad_norm": 13.437002182006836,
"learning_rate": 4.748244734202608e-06,
"loss": 0.5217,
"step": 19000
},
{
"epoch": 1.7151110308719986,
"eval_exact_match": 86.2251655629139,
"eval_f1": 92.51045927542914,
"eval_runtime": 145.2525,
"eval_samples_per_second": 74.34,
"eval_steps_per_second": 18.588,
"step": 19000
},
{
"epoch": 1.7196244809532406,
"grad_norm": 21.756275177001953,
"learning_rate": 4.673019057171515e-06,
"loss": 0.5129,
"step": 19050
},
{
"epoch": 1.7241379310344827,
"grad_norm": 4.2581377029418945,
"learning_rate": 4.5977933801404215e-06,
"loss": 0.5202,
"step": 19100
},
{
"epoch": 1.728651381115725,
"grad_norm": 9.396230697631836,
"learning_rate": 4.522567703109328e-06,
"loss": 0.5118,
"step": 19150
},
{
"epoch": 1.733164831196967,
"grad_norm": 9.545235633850098,
"learning_rate": 4.447342026078235e-06,
"loss": 0.4611,
"step": 19200
},
{
"epoch": 1.737678281278209,
"grad_norm": 4.104794502258301,
"learning_rate": 4.3721163490471416e-06,
"loss": 0.5737,
"step": 19250
},
{
"epoch": 1.737678281278209,
"eval_exact_match": 86.23462630085146,
"eval_f1": 92.57135940815057,
"eval_runtime": 145.5739,
"eval_samples_per_second": 74.175,
"eval_steps_per_second": 18.547,
"step": 19250
},
{
"epoch": 1.7421917313594513,
"grad_norm": 8.624117851257324,
"learning_rate": 4.296890672016048e-06,
"loss": 0.5349,
"step": 19300
},
{
"epoch": 1.7467051814406933,
"grad_norm": 4.802499771118164,
"learning_rate": 4.221664994984955e-06,
"loss": 0.5332,
"step": 19350
},
{
"epoch": 1.7512186315219354,
"grad_norm": 4.347715854644775,
"learning_rate": 4.146439317953862e-06,
"loss": 0.474,
"step": 19400
},
{
"epoch": 1.7557320816031776,
"grad_norm": 21.51348114013672,
"learning_rate": 4.071213640922768e-06,
"loss": 0.5182,
"step": 19450
},
{
"epoch": 1.7602455316844194,
"grad_norm": 3.002976655960083,
"learning_rate": 3.995987963891676e-06,
"loss": 0.4243,
"step": 19500
},
{
"epoch": 1.7602455316844194,
"eval_exact_match": 86.44276253547777,
"eval_f1": 92.67282094003843,
"eval_runtime": 145.268,
"eval_samples_per_second": 74.332,
"eval_steps_per_second": 18.586,
"step": 19500
},
{
"epoch": 1.7647589817656617,
"grad_norm": 6.619145393371582,
"learning_rate": 3.920762286860582e-06,
"loss": 0.5754,
"step": 19550
},
{
"epoch": 1.7692724318469037,
"grad_norm": 8.654962539672852,
"learning_rate": 3.845536609829488e-06,
"loss": 0.4927,
"step": 19600
},
{
"epoch": 1.7737858819281458,
"grad_norm": 2.102865695953369,
"learning_rate": 3.7703109327983955e-06,
"loss": 0.417,
"step": 19650
},
{
"epoch": 1.778299332009388,
"grad_norm": 9.824490547180176,
"learning_rate": 3.695085255767302e-06,
"loss": 0.5777,
"step": 19700
},
{
"epoch": 1.78281278209063,
"grad_norm": 22.286598205566406,
"learning_rate": 3.619859578736209e-06,
"loss": 0.4338,
"step": 19750
},
{
"epoch": 1.78281278209063,
"eval_exact_match": 86.39545884578997,
"eval_f1": 92.63887659164942,
"eval_runtime": 145.3006,
"eval_samples_per_second": 74.315,
"eval_steps_per_second": 18.582,
"step": 19750
},
{
"epoch": 1.7873262321718721,
"grad_norm": 0.6572410464286804,
"learning_rate": 3.5446339017051155e-06,
"loss": 0.4549,
"step": 19800
},
{
"epoch": 1.7918396822531144,
"grad_norm": 15.171038627624512,
"learning_rate": 3.469408224674022e-06,
"loss": 0.452,
"step": 19850
},
{
"epoch": 1.7963531323343562,
"grad_norm": 13.550349235534668,
"learning_rate": 3.394182547642929e-06,
"loss": 0.4901,
"step": 19900
},
{
"epoch": 1.8008665824155985,
"grad_norm": 9.970057487487793,
"learning_rate": 3.3189568706118356e-06,
"loss": 0.4424,
"step": 19950
},
{
"epoch": 1.8053800324968405,
"grad_norm": 3.251477003097534,
"learning_rate": 3.2437311935807422e-06,
"loss": 0.5214,
"step": 20000
},
{
"epoch": 1.8053800324968405,
"eval_exact_match": 86.10217596972564,
"eval_f1": 92.55287274359681,
"eval_runtime": 145.2611,
"eval_samples_per_second": 74.335,
"eval_steps_per_second": 18.587,
"step": 20000
},
{
"epoch": 1.8098934825780826,
"grad_norm": 3.656310558319092,
"learning_rate": 3.1685055165496493e-06,
"loss": 0.4794,
"step": 20050
},
{
"epoch": 1.8144069326593248,
"grad_norm": 6.139503479003906,
"learning_rate": 3.0932798395185556e-06,
"loss": 0.4485,
"step": 20100
},
{
"epoch": 1.8189203827405669,
"grad_norm": 6.566440582275391,
"learning_rate": 3.0180541624874623e-06,
"loss": 0.456,
"step": 20150
},
{
"epoch": 1.823433832821809,
"grad_norm": 6.406381130218506,
"learning_rate": 2.9428284854563694e-06,
"loss": 0.4988,
"step": 20200
},
{
"epoch": 1.8279472829030512,
"grad_norm": 4.750673770904541,
"learning_rate": 2.8676028084252757e-06,
"loss": 0.5102,
"step": 20250
},
{
"epoch": 1.8279472829030512,
"eval_exact_match": 86.0643330179754,
"eval_f1": 92.48790625031062,
"eval_runtime": 145.6532,
"eval_samples_per_second": 74.135,
"eval_steps_per_second": 18.537,
"step": 20250
},
{
"epoch": 1.8324607329842932,
"grad_norm": 2.9004476070404053,
"learning_rate": 2.7923771313941828e-06,
"loss": 0.4172,
"step": 20300
},
{
"epoch": 1.8369741830655353,
"grad_norm": 5.453982353210449,
"learning_rate": 2.7171514543630894e-06,
"loss": 0.4328,
"step": 20350
},
{
"epoch": 1.8414876331467775,
"grad_norm": 6.562243461608887,
"learning_rate": 2.6419257773319957e-06,
"loss": 0.5946,
"step": 20400
},
{
"epoch": 1.8460010832280194,
"grad_norm": 2.186967134475708,
"learning_rate": 2.566700100300903e-06,
"loss": 0.5174,
"step": 20450
},
{
"epoch": 1.8505145333092616,
"grad_norm": 18.740962982177734,
"learning_rate": 2.4914744232698095e-06,
"loss": 0.5614,
"step": 20500
},
{
"epoch": 1.8505145333092616,
"eval_exact_match": 86.24408703878902,
"eval_f1": 92.6215979315234,
"eval_runtime": 145.5466,
"eval_samples_per_second": 74.189,
"eval_steps_per_second": 18.551,
"step": 20500
},
{
"epoch": 1.8550279833905037,
"grad_norm": 8.845901489257812,
"learning_rate": 2.416248746238716e-06,
"loss": 0.5174,
"step": 20550
},
{
"epoch": 1.8595414334717457,
"grad_norm": 2.0935425758361816,
"learning_rate": 2.341023069207623e-06,
"loss": 0.4191,
"step": 20600
},
{
"epoch": 1.864054883552988,
"grad_norm": 5.225878715515137,
"learning_rate": 2.2657973921765295e-06,
"loss": 0.5168,
"step": 20650
},
{
"epoch": 1.86856833363423,
"grad_norm": 3.618779182434082,
"learning_rate": 2.1905717151454362e-06,
"loss": 0.5116,
"step": 20700
},
{
"epoch": 1.873081783715472,
"grad_norm": 14.580885887145996,
"learning_rate": 2.115346038114343e-06,
"loss": 0.5247,
"step": 20750
},
{
"epoch": 1.873081783715472,
"eval_exact_match": 86.39545884578997,
"eval_f1": 92.68082620123108,
"eval_runtime": 144.0826,
"eval_samples_per_second": 74.943,
"eval_steps_per_second": 18.739,
"step": 20750
},
{
"epoch": 1.8775952337967143,
"grad_norm": 6.323169708251953,
"learning_rate": 2.04012036108325e-06,
"loss": 0.5007,
"step": 20800
},
{
"epoch": 1.8821086838779562,
"grad_norm": 7.055742263793945,
"learning_rate": 1.9648946840521567e-06,
"loss": 0.5387,
"step": 20850
},
{
"epoch": 1.8866221339591984,
"grad_norm": 6.097321033477783,
"learning_rate": 1.8896690070210632e-06,
"loss": 0.4795,
"step": 20900
},
{
"epoch": 1.8911355840404405,
"grad_norm": 20.577049255371094,
"learning_rate": 1.81444332998997e-06,
"loss": 0.5636,
"step": 20950
},
{
"epoch": 1.8956490341216825,
"grad_norm": 11.891510009765625,
"learning_rate": 1.7392176529588768e-06,
"loss": 0.5768,
"step": 21000
},
{
"epoch": 1.8956490341216825,
"eval_exact_match": 86.28192999053927,
"eval_f1": 92.59481897569101,
"eval_runtime": 143.5237,
"eval_samples_per_second": 75.235,
"eval_steps_per_second": 18.812,
"step": 21000
},
{
"epoch": 1.9001624842029248,
"grad_norm": 28.808475494384766,
"learning_rate": 1.6639919759277832e-06,
"loss": 0.5267,
"step": 21050
},
{
"epoch": 1.9046759342841668,
"grad_norm": 11.045042991638184,
"learning_rate": 1.5887662988966901e-06,
"loss": 0.5508,
"step": 21100
},
{
"epoch": 1.9091893843654089,
"grad_norm": 4.862325191497803,
"learning_rate": 1.5135406218655968e-06,
"loss": 0.4697,
"step": 21150
},
{
"epoch": 1.9137028344466511,
"grad_norm": 13.387544631958008,
"learning_rate": 1.4383149448345037e-06,
"loss": 0.4416,
"step": 21200
},
{
"epoch": 1.9182162845278932,
"grad_norm": 5.2306342124938965,
"learning_rate": 1.3630892678034104e-06,
"loss": 0.4124,
"step": 21250
},
{
"epoch": 1.9182162845278932,
"eval_exact_match": 86.26300851466415,
"eval_f1": 92.60255176769716,
"eval_runtime": 143.5894,
"eval_samples_per_second": 75.201,
"eval_steps_per_second": 18.804,
"step": 21250
},
{
"epoch": 1.9227297346091352,
"grad_norm": 29.35004234313965,
"learning_rate": 1.2878635907723169e-06,
"loss": 0.5344,
"step": 21300
},
{
"epoch": 1.9272431846903775,
"grad_norm": 8.634255409240723,
"learning_rate": 1.2126379137412237e-06,
"loss": 0.4815,
"step": 21350
},
{
"epoch": 1.9317566347716193,
"grad_norm": 8.262895584106445,
"learning_rate": 1.1374122367101304e-06,
"loss": 0.4939,
"step": 21400
},
{
"epoch": 1.9362700848528616,
"grad_norm": 12.539655685424805,
"learning_rate": 1.0621865596790371e-06,
"loss": 0.4957,
"step": 21450
},
{
"epoch": 1.9407835349341036,
"grad_norm": 9.728516578674316,
"learning_rate": 9.86960882647944e-07,
"loss": 0.4587,
"step": 21500
},
{
"epoch": 1.9407835349341036,
"eval_exact_match": 86.14001892147587,
"eval_f1": 92.57583651293868,
"eval_runtime": 143.5952,
"eval_samples_per_second": 75.197,
"eval_steps_per_second": 18.803,
"step": 21500
},
{
"epoch": 1.9452969850153456,
"grad_norm": 9.588170051574707,
"learning_rate": 9.117352056168506e-07,
"loss": 0.5395,
"step": 21550
},
{
"epoch": 1.949810435096588,
"grad_norm": 14.394529342651367,
"learning_rate": 8.365095285857573e-07,
"loss": 0.5462,
"step": 21600
},
{
"epoch": 1.95432388517783,
"grad_norm": 17.767173767089844,
"learning_rate": 7.612838515546641e-07,
"loss": 0.4525,
"step": 21650
},
{
"epoch": 1.958837335259072,
"grad_norm": 15.962186813354492,
"learning_rate": 6.860581745235707e-07,
"loss": 0.6349,
"step": 21700
},
{
"epoch": 1.9633507853403143,
"grad_norm": 3.5042107105255127,
"learning_rate": 6.108324974924774e-07,
"loss": 0.4903,
"step": 21750
},
{
"epoch": 1.9633507853403143,
"eval_exact_match": 86.30085146641439,
"eval_f1": 92.62412689139829,
"eval_runtime": 143.4548,
"eval_samples_per_second": 75.271,
"eval_steps_per_second": 18.821,
"step": 21750
},
{
"epoch": 1.967864235421556,
"grad_norm": 3.967465877532959,
"learning_rate": 5.356068204613842e-07,
"loss": 0.4477,
"step": 21800
},
{
"epoch": 1.9723776855027984,
"grad_norm": 13.418035507202148,
"learning_rate": 4.603811434302909e-07,
"loss": 0.4418,
"step": 21850
},
{
"epoch": 1.9768911355840404,
"grad_norm": 8.183111190795898,
"learning_rate": 3.851554663991976e-07,
"loss": 0.5417,
"step": 21900
},
{
"epoch": 1.9814045856652824,
"grad_norm": 5.646338939666748,
"learning_rate": 3.099297893681043e-07,
"loss": 0.5065,
"step": 21950
},
{
"epoch": 1.9859180357465247,
"grad_norm": 14.187732696533203,
"learning_rate": 2.3470411233701103e-07,
"loss": 0.4325,
"step": 22000
},
{
"epoch": 1.9859180357465247,
"eval_exact_match": 86.35761589403974,
"eval_f1": 92.66264597808306,
"eval_runtime": 143.145,
"eval_samples_per_second": 75.434,
"eval_steps_per_second": 18.862,
"step": 22000
},
{
"epoch": 1.9904314858277667,
"grad_norm": 24.069639205932617,
"learning_rate": 1.5947843530591774e-07,
"loss": 0.4079,
"step": 22050
},
{
"epoch": 1.9949449359090088,
"grad_norm": 9.552345275878906,
"learning_rate": 8.425275827482447e-08,
"loss": 0.4418,
"step": 22100
},
{
"epoch": 1.999458385990251,
"grad_norm": 11.15715503692627,
"learning_rate": 9.027081243731194e-09,
"loss": 0.519,
"step": 22150
},
{
"epoch": 2.0,
"step": 22156,
"total_flos": 4.529540706059981e+16,
"train_loss": 0.8456309766066937,
"train_runtime": 23960.3876,
"train_samples_per_second": 7.397,
"train_steps_per_second": 0.925
}
],
"logging_steps": 50,
"max_steps": 22156,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.529540706059981e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}