|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 250, |
|
"global_step": 22156, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004513450081242101, |
|
"grad_norm": 7.729167461395264, |
|
"learning_rate": 6.768953068592058e-07, |
|
"loss": 5.9501, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.009026900162484202, |
|
"grad_norm": 13.082584381103516, |
|
"learning_rate": 1.3537906137184116e-06, |
|
"loss": 5.9447, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.013540350243726304, |
|
"grad_norm": 18.259796142578125, |
|
"learning_rate": 2.0306859205776177e-06, |
|
"loss": 5.9165, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.018053800324968405, |
|
"grad_norm": 22.386247634887695, |
|
"learning_rate": 2.7075812274368233e-06, |
|
"loss": 5.8394, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.022567250406210507, |
|
"grad_norm": 34.2584228515625, |
|
"learning_rate": 3.384476534296029e-06, |
|
"loss": 5.6527, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.022567250406210507, |
|
"eval_exact_match": 2.0056764427625353, |
|
"eval_f1": 8.992036775611602, |
|
"eval_runtime": 152.2365, |
|
"eval_samples_per_second": 70.929, |
|
"eval_steps_per_second": 17.736, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.02708070048745261, |
|
"grad_norm": 35.096343994140625, |
|
"learning_rate": 4.061371841155235e-06, |
|
"loss": 5.4208, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03159415056869471, |
|
"grad_norm": 32.929325103759766, |
|
"learning_rate": 4.73826714801444e-06, |
|
"loss": 5.1947, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.03610760064993681, |
|
"grad_norm": 24.735565185546875, |
|
"learning_rate": 5.4151624548736465e-06, |
|
"loss": 4.9405, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.040621050731178915, |
|
"grad_norm": 22.857040405273438, |
|
"learning_rate": 6.092057761732852e-06, |
|
"loss": 4.7322, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.04513450081242101, |
|
"grad_norm": 24.466981887817383, |
|
"learning_rate": 6.768953068592058e-06, |
|
"loss": 4.4225, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.04513450081242101, |
|
"eval_exact_match": 5.771050141911069, |
|
"eval_f1": 13.440407627304385, |
|
"eval_runtime": 143.7394, |
|
"eval_samples_per_second": 75.122, |
|
"eval_steps_per_second": 18.784, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.04964795089366312, |
|
"grad_norm": 21.847347259521484, |
|
"learning_rate": 7.445848375451264e-06, |
|
"loss": 4.077, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.05416140097490522, |
|
"grad_norm": 27.222633361816406, |
|
"learning_rate": 8.12274368231047e-06, |
|
"loss": 3.5837, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.058674851056147316, |
|
"grad_norm": 29.45089340209961, |
|
"learning_rate": 8.799638989169675e-06, |
|
"loss": 3.0981, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.06318830113738942, |
|
"grad_norm": 29.15781593322754, |
|
"learning_rate": 9.47653429602888e-06, |
|
"loss": 2.823, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.06770175121863152, |
|
"grad_norm": 20.824087142944336, |
|
"learning_rate": 1.0153429602888087e-05, |
|
"loss": 2.4615, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.06770175121863152, |
|
"eval_exact_match": 52.82876064333018, |
|
"eval_f1": 63.363202801168775, |
|
"eval_runtime": 143.757, |
|
"eval_samples_per_second": 75.113, |
|
"eval_steps_per_second": 18.782, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.07221520129987362, |
|
"grad_norm": 33.184410095214844, |
|
"learning_rate": 1.0830324909747293e-05, |
|
"loss": 2.2565, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.07672865138111573, |
|
"grad_norm": 27.845844268798828, |
|
"learning_rate": 1.15072202166065e-05, |
|
"loss": 1.8158, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.08124210146235783, |
|
"grad_norm": 18.29555320739746, |
|
"learning_rate": 1.2184115523465704e-05, |
|
"loss": 1.7871, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.08575555154359993, |
|
"grad_norm": 23.45345687866211, |
|
"learning_rate": 1.2861010830324909e-05, |
|
"loss": 1.6184, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.09026900162484203, |
|
"grad_norm": 23.513124465942383, |
|
"learning_rate": 1.3537906137184115e-05, |
|
"loss": 1.577, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.09026900162484203, |
|
"eval_exact_match": 65.42100283822138, |
|
"eval_f1": 76.31429412241118, |
|
"eval_runtime": 143.6001, |
|
"eval_samples_per_second": 75.195, |
|
"eval_steps_per_second": 18.802, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.09478245170608413, |
|
"grad_norm": 13.316262245178223, |
|
"learning_rate": 1.4214801444043322e-05, |
|
"loss": 1.5352, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.09929590178732624, |
|
"grad_norm": 34.4163932800293, |
|
"learning_rate": 1.4891696750902528e-05, |
|
"loss": 1.6355, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.10380935186856834, |
|
"grad_norm": 33.572750091552734, |
|
"learning_rate": 1.5568592057761735e-05, |
|
"loss": 1.5064, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.10832280194981043, |
|
"grad_norm": 13.494843482971191, |
|
"learning_rate": 1.624548736462094e-05, |
|
"loss": 1.4047, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.11283625203105253, |
|
"grad_norm": 21.778579711914062, |
|
"learning_rate": 1.6922382671480144e-05, |
|
"loss": 1.2552, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.11283625203105253, |
|
"eval_exact_match": 69.5364238410596, |
|
"eval_f1": 80.48975233211664, |
|
"eval_runtime": 143.6325, |
|
"eval_samples_per_second": 75.178, |
|
"eval_steps_per_second": 18.798, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.11734970211229463, |
|
"grad_norm": 17.293298721313477, |
|
"learning_rate": 1.759927797833935e-05, |
|
"loss": 1.2779, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.12186315219353674, |
|
"grad_norm": 14.029269218444824, |
|
"learning_rate": 1.8276173285198557e-05, |
|
"loss": 1.2871, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.12637660227477884, |
|
"grad_norm": 12.971822738647461, |
|
"learning_rate": 1.895306859205776e-05, |
|
"loss": 1.1974, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.13089005235602094, |
|
"grad_norm": 30.38484001159668, |
|
"learning_rate": 1.9629963898916967e-05, |
|
"loss": 1.355, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.13540350243726304, |
|
"grad_norm": 29.467548370361328, |
|
"learning_rate": 2.0306859205776173e-05, |
|
"loss": 1.1713, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.13540350243726304, |
|
"eval_exact_match": 72.57332071901608, |
|
"eval_f1": 82.65844387552723, |
|
"eval_runtime": 143.752, |
|
"eval_samples_per_second": 75.115, |
|
"eval_steps_per_second": 18.782, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.13991695251850514, |
|
"grad_norm": 12.46554183959961, |
|
"learning_rate": 2.098375451263538e-05, |
|
"loss": 1.1677, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.14443040259974724, |
|
"grad_norm": 25.593875885009766, |
|
"learning_rate": 2.1660649819494586e-05, |
|
"loss": 1.3478, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.14894385268098934, |
|
"grad_norm": 16.136869430541992, |
|
"learning_rate": 2.2337545126353793e-05, |
|
"loss": 1.1676, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.15345730276223146, |
|
"grad_norm": 16.83846664428711, |
|
"learning_rate": 2.3014440433213e-05, |
|
"loss": 1.1741, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.15797075284347356, |
|
"grad_norm": 17.464096069335938, |
|
"learning_rate": 2.3691335740072202e-05, |
|
"loss": 1.2104, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.15797075284347356, |
|
"eval_exact_match": 74.64522232734153, |
|
"eval_f1": 84.72687223622708, |
|
"eval_runtime": 143.5946, |
|
"eval_samples_per_second": 75.198, |
|
"eval_steps_per_second": 18.803, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.16248420292471566, |
|
"grad_norm": 13.083732604980469, |
|
"learning_rate": 2.436823104693141e-05, |
|
"loss": 1.1279, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.16699765300595776, |
|
"grad_norm": 15.166335105895996, |
|
"learning_rate": 2.5045126353790615e-05, |
|
"loss": 1.1395, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.17151110308719986, |
|
"grad_norm": 8.829039573669434, |
|
"learning_rate": 2.5722021660649818e-05, |
|
"loss": 1.1374, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.17602455316844196, |
|
"grad_norm": 28.089391708374023, |
|
"learning_rate": 2.6398916967509024e-05, |
|
"loss": 1.2106, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.18053800324968405, |
|
"grad_norm": 13.704926490783691, |
|
"learning_rate": 2.707581227436823e-05, |
|
"loss": 1.2369, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.18053800324968405, |
|
"eval_exact_match": 77.96594134342479, |
|
"eval_f1": 86.52116394116426, |
|
"eval_runtime": 143.6954, |
|
"eval_samples_per_second": 75.145, |
|
"eval_steps_per_second": 18.79, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.18505145333092615, |
|
"grad_norm": 13.062108993530273, |
|
"learning_rate": 2.7752707581227437e-05, |
|
"loss": 1.0064, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.18956490341216825, |
|
"grad_norm": 25.21763801574707, |
|
"learning_rate": 2.8429602888086644e-05, |
|
"loss": 1.1127, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.19407835349341035, |
|
"grad_norm": 7.10919189453125, |
|
"learning_rate": 2.910649819494585e-05, |
|
"loss": 1.054, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.19859180357465248, |
|
"grad_norm": 14.997174263000488, |
|
"learning_rate": 2.9783393501805057e-05, |
|
"loss": 1.1187, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.20310525365589457, |
|
"grad_norm": 9.683287620544434, |
|
"learning_rate": 2.9948846539618856e-05, |
|
"loss": 0.9916, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.20310525365589457, |
|
"eval_exact_match": 77.21854304635761, |
|
"eval_f1": 85.59348158373206, |
|
"eval_runtime": 143.7109, |
|
"eval_samples_per_second": 75.137, |
|
"eval_steps_per_second": 18.788, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.20761870373713667, |
|
"grad_norm": 18.712541580200195, |
|
"learning_rate": 2.987362086258776e-05, |
|
"loss": 1.1756, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.21213215381837877, |
|
"grad_norm": 8.502235412597656, |
|
"learning_rate": 2.979839518555667e-05, |
|
"loss": 1.0872, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.21664560389962087, |
|
"grad_norm": 16.61508560180664, |
|
"learning_rate": 2.9723169508525577e-05, |
|
"loss": 1.1148, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.22115905398086297, |
|
"grad_norm": 9.676267623901367, |
|
"learning_rate": 2.9647943831494482e-05, |
|
"loss": 1.0559, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.22567250406210507, |
|
"grad_norm": 11.562779426574707, |
|
"learning_rate": 2.957271815446339e-05, |
|
"loss": 1.0628, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.22567250406210507, |
|
"eval_exact_match": 79.3755912961211, |
|
"eval_f1": 87.42406194378296, |
|
"eval_runtime": 143.6631, |
|
"eval_samples_per_second": 75.162, |
|
"eval_steps_per_second": 18.794, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.23018595414334717, |
|
"grad_norm": 20.24138832092285, |
|
"learning_rate": 2.9497492477432297e-05, |
|
"loss": 1.0549, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.23469940422458926, |
|
"grad_norm": 24.723041534423828, |
|
"learning_rate": 2.9422266800401203e-05, |
|
"loss": 1.137, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.2392128543058314, |
|
"grad_norm": 14.101241111755371, |
|
"learning_rate": 2.9347041123370113e-05, |
|
"loss": 1.0199, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.2437263043870735, |
|
"grad_norm": 7.032845497131348, |
|
"learning_rate": 2.927181544633902e-05, |
|
"loss": 1.0601, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.2482397544683156, |
|
"grad_norm": 13.543634414672852, |
|
"learning_rate": 2.9196589769307924e-05, |
|
"loss": 1.0534, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.2482397544683156, |
|
"eval_exact_match": 79.57426679280984, |
|
"eval_f1": 87.74281924363757, |
|
"eval_runtime": 143.6905, |
|
"eval_samples_per_second": 75.148, |
|
"eval_steps_per_second": 18.79, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.2527532045495577, |
|
"grad_norm": 7.890726566314697, |
|
"learning_rate": 2.9121364092276833e-05, |
|
"loss": 1.0196, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.2572666546307998, |
|
"grad_norm": 12.943625450134277, |
|
"learning_rate": 2.904613841524574e-05, |
|
"loss": 1.0178, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.2617801047120419, |
|
"grad_norm": 9.828871726989746, |
|
"learning_rate": 2.897091273821464e-05, |
|
"loss": 1.0482, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.266293554793284, |
|
"grad_norm": 28.693660736083984, |
|
"learning_rate": 2.889568706118355e-05, |
|
"loss": 0.9897, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.2708070048745261, |
|
"grad_norm": 10.408865928649902, |
|
"learning_rate": 2.8820461384152457e-05, |
|
"loss": 0.9407, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.2708070048745261, |
|
"eval_exact_match": 80.37842951750237, |
|
"eval_f1": 88.60122498039404, |
|
"eval_runtime": 143.7217, |
|
"eval_samples_per_second": 75.131, |
|
"eval_steps_per_second": 18.786, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.2753204549557682, |
|
"grad_norm": 11.135859489440918, |
|
"learning_rate": 2.8745235707121363e-05, |
|
"loss": 0.9192, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.2798339050370103, |
|
"grad_norm": 10.159820556640625, |
|
"learning_rate": 2.8670010030090272e-05, |
|
"loss": 0.9232, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.2843473551182524, |
|
"grad_norm": 6.99199914932251, |
|
"learning_rate": 2.8594784353059178e-05, |
|
"loss": 0.9188, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.2888608051994945, |
|
"grad_norm": 11.692395210266113, |
|
"learning_rate": 2.8519558676028083e-05, |
|
"loss": 0.979, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.2933742552807366, |
|
"grad_norm": 12.289103507995605, |
|
"learning_rate": 2.8444332998996993e-05, |
|
"loss": 0.8573, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.2933742552807366, |
|
"eval_exact_match": 81.44749290444655, |
|
"eval_f1": 88.89880962072144, |
|
"eval_runtime": 143.7737, |
|
"eval_samples_per_second": 75.104, |
|
"eval_steps_per_second": 18.78, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.29788770536197867, |
|
"grad_norm": 23.986677169799805, |
|
"learning_rate": 2.83691073219659e-05, |
|
"loss": 0.879, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.30240115544322077, |
|
"grad_norm": 10.214922904968262, |
|
"learning_rate": 2.8293881644934804e-05, |
|
"loss": 0.9526, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.3069146055244629, |
|
"grad_norm": 11.930830955505371, |
|
"learning_rate": 2.8218655967903714e-05, |
|
"loss": 1.0308, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.311428055605705, |
|
"grad_norm": 7.23566198348999, |
|
"learning_rate": 2.814343029087262e-05, |
|
"loss": 0.9849, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.3159415056869471, |
|
"grad_norm": 10.037336349487305, |
|
"learning_rate": 2.8068204613841525e-05, |
|
"loss": 0.8577, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.3159415056869471, |
|
"eval_exact_match": 82.28949858088932, |
|
"eval_f1": 89.36851469763961, |
|
"eval_runtime": 143.8541, |
|
"eval_samples_per_second": 75.062, |
|
"eval_steps_per_second": 18.769, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.3204549557681892, |
|
"grad_norm": 19.438182830810547, |
|
"learning_rate": 2.799297893681043e-05, |
|
"loss": 0.9586, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.3249684058494313, |
|
"grad_norm": 11.259856224060059, |
|
"learning_rate": 2.7917753259779337e-05, |
|
"loss": 0.923, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.3294818559306734, |
|
"grad_norm": 16.151851654052734, |
|
"learning_rate": 2.7842527582748243e-05, |
|
"loss": 0.9551, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.3339953060119155, |
|
"grad_norm": 12.570643424987793, |
|
"learning_rate": 2.7767301905717152e-05, |
|
"loss": 0.9377, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.3385087560931576, |
|
"grad_norm": 13.616052627563477, |
|
"learning_rate": 2.7692076228686058e-05, |
|
"loss": 0.7839, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.3385087560931576, |
|
"eval_exact_match": 81.69347209082308, |
|
"eval_f1": 89.48678563794635, |
|
"eval_runtime": 143.7031, |
|
"eval_samples_per_second": 75.141, |
|
"eval_steps_per_second": 18.789, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.3430222061743997, |
|
"grad_norm": 20.124753952026367, |
|
"learning_rate": 2.7616850551654964e-05, |
|
"loss": 0.9229, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.3475356562556418, |
|
"grad_norm": 7.816183090209961, |
|
"learning_rate": 2.7541624874623873e-05, |
|
"loss": 1.0717, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.3520491063368839, |
|
"grad_norm": 5.988482475280762, |
|
"learning_rate": 2.746639919759278e-05, |
|
"loss": 0.9611, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.356562556418126, |
|
"grad_norm": 13.177979469299316, |
|
"learning_rate": 2.7391173520561685e-05, |
|
"loss": 0.9207, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.3610760064993681, |
|
"grad_norm": 11.034092903137207, |
|
"learning_rate": 2.7315947843530594e-05, |
|
"loss": 0.9395, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.3610760064993681, |
|
"eval_exact_match": 82.74361400189214, |
|
"eval_f1": 90.06638594360132, |
|
"eval_runtime": 143.7901, |
|
"eval_samples_per_second": 75.096, |
|
"eval_steps_per_second": 18.777, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.3655894565806102, |
|
"grad_norm": 5.799317359924316, |
|
"learning_rate": 2.72407221664995e-05, |
|
"loss": 0.9385, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.3701029066618523, |
|
"grad_norm": 13.385774612426758, |
|
"learning_rate": 2.7165496489468405e-05, |
|
"loss": 0.9356, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.3746163567430944, |
|
"grad_norm": 22.11754608154297, |
|
"learning_rate": 2.7090270812437315e-05, |
|
"loss": 0.8532, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.3791298068243365, |
|
"grad_norm": 4.648535251617432, |
|
"learning_rate": 2.701504513540622e-05, |
|
"loss": 1.0032, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.3836432569055786, |
|
"grad_norm": 24.29154396057129, |
|
"learning_rate": 2.6939819458375123e-05, |
|
"loss": 0.8606, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.3836432569055786, |
|
"eval_exact_match": 82.58278145695364, |
|
"eval_f1": 89.65557078580815, |
|
"eval_runtime": 143.6035, |
|
"eval_samples_per_second": 75.193, |
|
"eval_steps_per_second": 18.802, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.3881567069868207, |
|
"grad_norm": 21.021242141723633, |
|
"learning_rate": 2.6864593781344032e-05, |
|
"loss": 0.9006, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.39267015706806285, |
|
"grad_norm": 26.466794967651367, |
|
"learning_rate": 2.6789368104312938e-05, |
|
"loss": 0.9426, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.39718360714930495, |
|
"grad_norm": 6.325038433074951, |
|
"learning_rate": 2.6714142427281844e-05, |
|
"loss": 0.8097, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.40169705723054705, |
|
"grad_norm": 8.038667678833008, |
|
"learning_rate": 2.6638916750250753e-05, |
|
"loss": 0.9053, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.40621050731178915, |
|
"grad_norm": 10.573040008544922, |
|
"learning_rate": 2.656369107321966e-05, |
|
"loss": 0.8459, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.40621050731178915, |
|
"eval_exact_match": 82.60170293282876, |
|
"eval_f1": 89.80136126079411, |
|
"eval_runtime": 143.5994, |
|
"eval_samples_per_second": 75.195, |
|
"eval_steps_per_second": 18.802, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.41072395739303125, |
|
"grad_norm": 7.336009979248047, |
|
"learning_rate": 2.6488465396188565e-05, |
|
"loss": 0.8691, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.41523740747427335, |
|
"grad_norm": 13.7125825881958, |
|
"learning_rate": 2.6413239719157474e-05, |
|
"loss": 0.8486, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.41975085755551544, |
|
"grad_norm": 12.19320011138916, |
|
"learning_rate": 2.633801404212638e-05, |
|
"loss": 0.9189, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.42426430763675754, |
|
"grad_norm": 11.599879264831543, |
|
"learning_rate": 2.6262788365095286e-05, |
|
"loss": 0.942, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.42877775771799964, |
|
"grad_norm": 9.139724731445312, |
|
"learning_rate": 2.6187562688064195e-05, |
|
"loss": 0.9565, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.42877775771799964, |
|
"eval_exact_match": 83.66130558183538, |
|
"eval_f1": 90.80495165338898, |
|
"eval_runtime": 143.5723, |
|
"eval_samples_per_second": 75.209, |
|
"eval_steps_per_second": 18.806, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.43329120779924174, |
|
"grad_norm": 10.977174758911133, |
|
"learning_rate": 2.61123370110331e-05, |
|
"loss": 0.8751, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.43780465788048384, |
|
"grad_norm": 13.7095947265625, |
|
"learning_rate": 2.6037111334002007e-05, |
|
"loss": 0.7547, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.44231810796172594, |
|
"grad_norm": 26.184358596801758, |
|
"learning_rate": 2.5961885656970912e-05, |
|
"loss": 0.9376, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.44683155804296804, |
|
"grad_norm": 21.620555877685547, |
|
"learning_rate": 2.5886659979939818e-05, |
|
"loss": 0.8013, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.45134500812421013, |
|
"grad_norm": 12.163994789123535, |
|
"learning_rate": 2.5811434302908724e-05, |
|
"loss": 0.8731, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.45134500812421013, |
|
"eval_exact_match": 82.37464522232735, |
|
"eval_f1": 89.97068346136126, |
|
"eval_runtime": 143.7072, |
|
"eval_samples_per_second": 75.139, |
|
"eval_steps_per_second": 18.788, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.45585845820545223, |
|
"grad_norm": 5.3110175132751465, |
|
"learning_rate": 2.5736208625877633e-05, |
|
"loss": 0.9062, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.46037190828669433, |
|
"grad_norm": 17.91867446899414, |
|
"learning_rate": 2.566098294884654e-05, |
|
"loss": 0.8749, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.46488535836793643, |
|
"grad_norm": 34.21914291381836, |
|
"learning_rate": 2.5585757271815445e-05, |
|
"loss": 0.8915, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.46939880844917853, |
|
"grad_norm": 8.76441478729248, |
|
"learning_rate": 2.5510531594784354e-05, |
|
"loss": 0.8023, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.4739122585304206, |
|
"grad_norm": 20.71419334411621, |
|
"learning_rate": 2.543530591775326e-05, |
|
"loss": 0.8114, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.4739122585304206, |
|
"eval_exact_match": 82.36518448438979, |
|
"eval_f1": 90.09893335252144, |
|
"eval_runtime": 143.9381, |
|
"eval_samples_per_second": 75.018, |
|
"eval_steps_per_second": 18.758, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.4784257086116628, |
|
"grad_norm": 14.544415473937988, |
|
"learning_rate": 2.536008024072217e-05, |
|
"loss": 0.8045, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.4829391586929049, |
|
"grad_norm": 40.066375732421875, |
|
"learning_rate": 2.5284854563691075e-05, |
|
"loss": 0.8651, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.487452608774147, |
|
"grad_norm": 16.154937744140625, |
|
"learning_rate": 2.520962888665998e-05, |
|
"loss": 0.9995, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.4919660588553891, |
|
"grad_norm": 6.674190044403076, |
|
"learning_rate": 2.513440320962889e-05, |
|
"loss": 0.8231, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.4964795089366312, |
|
"grad_norm": 6.037493705749512, |
|
"learning_rate": 2.5059177532597796e-05, |
|
"loss": 0.8369, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.4964795089366312, |
|
"eval_exact_match": 83.20719016083254, |
|
"eval_f1": 90.18831406264282, |
|
"eval_runtime": 143.9261, |
|
"eval_samples_per_second": 75.025, |
|
"eval_steps_per_second": 18.76, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.5009929590178732, |
|
"grad_norm": 8.621197700500488, |
|
"learning_rate": 2.4983951855566702e-05, |
|
"loss": 0.8002, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.5055064090991154, |
|
"grad_norm": 19.25075340270996, |
|
"learning_rate": 2.4908726178535608e-05, |
|
"loss": 0.8039, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.5100198591803574, |
|
"grad_norm": 14.201600074768066, |
|
"learning_rate": 2.4833500501504514e-05, |
|
"loss": 0.7525, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.5145333092615996, |
|
"grad_norm": 30.636154174804688, |
|
"learning_rate": 2.475827482447342e-05, |
|
"loss": 0.7711, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.5190467593428417, |
|
"grad_norm": 8.79736042022705, |
|
"learning_rate": 2.468304914744233e-05, |
|
"loss": 0.8997, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.5190467593428417, |
|
"eval_exact_match": 83.66130558183538, |
|
"eval_f1": 90.84316221305555, |
|
"eval_runtime": 143.8756, |
|
"eval_samples_per_second": 75.051, |
|
"eval_steps_per_second": 18.766, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.5235602094240838, |
|
"grad_norm": 14.871445655822754, |
|
"learning_rate": 2.4607823470411234e-05, |
|
"loss": 0.7639, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.5280736595053259, |
|
"grad_norm": 6.112968444824219, |
|
"learning_rate": 2.453259779338014e-05, |
|
"loss": 0.8643, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.532587109586568, |
|
"grad_norm": 6.213535785675049, |
|
"learning_rate": 2.445737211634905e-05, |
|
"loss": 1.0582, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.5371005596678101, |
|
"grad_norm": 4.76146936416626, |
|
"learning_rate": 2.4382146439317955e-05, |
|
"loss": 0.818, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.5416140097490522, |
|
"grad_norm": 8.690106391906738, |
|
"learning_rate": 2.430692076228686e-05, |
|
"loss": 0.8871, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.5416140097490522, |
|
"eval_exact_match": 84.03027436140019, |
|
"eval_f1": 90.88429950527104, |
|
"eval_runtime": 143.8461, |
|
"eval_samples_per_second": 75.066, |
|
"eval_steps_per_second": 18.77, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.5461274598302943, |
|
"grad_norm": 18.575305938720703, |
|
"learning_rate": 2.423169508525577e-05, |
|
"loss": 0.9143, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.5506409099115364, |
|
"grad_norm": 2.229325294494629, |
|
"learning_rate": 2.4156469408224676e-05, |
|
"loss": 0.9387, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.5551543599927785, |
|
"grad_norm": 9.413180351257324, |
|
"learning_rate": 2.4081243731193582e-05, |
|
"loss": 0.8657, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.5596678100740206, |
|
"grad_norm": 5.644939422607422, |
|
"learning_rate": 2.400601805416249e-05, |
|
"loss": 0.8445, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.5641812601552627, |
|
"grad_norm": 23.247257232666016, |
|
"learning_rate": 2.3930792377131394e-05, |
|
"loss": 0.8342, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.5641812601552627, |
|
"eval_exact_match": 84.12488174077578, |
|
"eval_f1": 91.10403462345704, |
|
"eval_runtime": 144.0358, |
|
"eval_samples_per_second": 74.967, |
|
"eval_steps_per_second": 18.745, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.5686947102365048, |
|
"grad_norm": 19.933300018310547, |
|
"learning_rate": 2.38555667001003e-05, |
|
"loss": 0.8348, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.5732081603177469, |
|
"grad_norm": 7.6812872886657715, |
|
"learning_rate": 2.378034102306921e-05, |
|
"loss": 0.7582, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.577721610398989, |
|
"grad_norm": 13.01408863067627, |
|
"learning_rate": 2.3705115346038115e-05, |
|
"loss": 0.7245, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.5822350604802311, |
|
"grad_norm": 23.909793853759766, |
|
"learning_rate": 2.362988966900702e-05, |
|
"loss": 0.7503, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.5867485105614731, |
|
"grad_norm": 5.6074323654174805, |
|
"learning_rate": 2.355466399197593e-05, |
|
"loss": 0.8251, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.5867485105614731, |
|
"eval_exact_match": 84.31409649952697, |
|
"eval_f1": 91.10669916586389, |
|
"eval_runtime": 143.8718, |
|
"eval_samples_per_second": 75.053, |
|
"eval_steps_per_second": 18.767, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.5912619606427153, |
|
"grad_norm": 13.117137908935547, |
|
"learning_rate": 2.3479438314944836e-05, |
|
"loss": 0.7903, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.5957754107239573, |
|
"grad_norm": 4.99781608581543, |
|
"learning_rate": 2.340421263791374e-05, |
|
"loss": 0.8538, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.6002888608051995, |
|
"grad_norm": 7.639380931854248, |
|
"learning_rate": 2.332898696088265e-05, |
|
"loss": 0.8154, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.6048023108864415, |
|
"grad_norm": 30.98665428161621, |
|
"learning_rate": 2.3253761283851556e-05, |
|
"loss": 0.723, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.6093157609676837, |
|
"grad_norm": 30.613746643066406, |
|
"learning_rate": 2.3178535606820462e-05, |
|
"loss": 0.8682, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.6093157609676837, |
|
"eval_exact_match": 83.72753074739829, |
|
"eval_f1": 90.83640909549077, |
|
"eval_runtime": 143.634, |
|
"eval_samples_per_second": 75.177, |
|
"eval_steps_per_second": 18.798, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.6138292110489258, |
|
"grad_norm": 8.440532684326172, |
|
"learning_rate": 2.310330992978937e-05, |
|
"loss": 0.8729, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.6183426611301679, |
|
"grad_norm": 5.947940826416016, |
|
"learning_rate": 2.3028084252758277e-05, |
|
"loss": 0.747, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 0.62285611121141, |
|
"grad_norm": 16.59714698791504, |
|
"learning_rate": 2.295285857572718e-05, |
|
"loss": 0.8015, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.6273695612926521, |
|
"grad_norm": 5.211153507232666, |
|
"learning_rate": 2.287763289869609e-05, |
|
"loss": 0.8957, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 0.6318830113738942, |
|
"grad_norm": 4.547276496887207, |
|
"learning_rate": 2.2802407221664995e-05, |
|
"loss": 0.9019, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.6318830113738942, |
|
"eval_exact_match": 84.82497634815516, |
|
"eval_f1": 91.52669904904272, |
|
"eval_runtime": 145.3332, |
|
"eval_samples_per_second": 74.298, |
|
"eval_steps_per_second": 18.578, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.6363964614551363, |
|
"grad_norm": 11.642155647277832, |
|
"learning_rate": 2.27271815446339e-05, |
|
"loss": 0.7963, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.6409099115363784, |
|
"grad_norm": 7.39171028137207, |
|
"learning_rate": 2.265195586760281e-05, |
|
"loss": 0.7953, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.6454233616176205, |
|
"grad_norm": 13.905296325683594, |
|
"learning_rate": 2.2576730190571716e-05, |
|
"loss": 0.7865, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.6499368116988626, |
|
"grad_norm": 5.167139530181885, |
|
"learning_rate": 2.250150451354062e-05, |
|
"loss": 0.8087, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.6544502617801047, |
|
"grad_norm": 27.534217834472656, |
|
"learning_rate": 2.242627883650953e-05, |
|
"loss": 0.9436, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.6544502617801047, |
|
"eval_exact_match": 84.06811731315042, |
|
"eval_f1": 91.07524033930977, |
|
"eval_runtime": 143.7651, |
|
"eval_samples_per_second": 75.109, |
|
"eval_steps_per_second": 18.781, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.6589637118613468, |
|
"grad_norm": 15.742715835571289, |
|
"learning_rate": 2.2351053159478437e-05, |
|
"loss": 0.8499, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.6634771619425889, |
|
"grad_norm": 16.15327262878418, |
|
"learning_rate": 2.2275827482447342e-05, |
|
"loss": 0.846, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.667990612023831, |
|
"grad_norm": 17.383888244628906, |
|
"learning_rate": 2.220060180541625e-05, |
|
"loss": 0.7903, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.6725040621050731, |
|
"grad_norm": 7.484638214111328, |
|
"learning_rate": 2.2125376128385157e-05, |
|
"loss": 0.7664, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.6770175121863152, |
|
"grad_norm": 10.082265853881836, |
|
"learning_rate": 2.2050150451354063e-05, |
|
"loss": 0.9177, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.6770175121863152, |
|
"eval_exact_match": 84.49385052034059, |
|
"eval_f1": 91.10452090726004, |
|
"eval_runtime": 143.5023, |
|
"eval_samples_per_second": 75.246, |
|
"eval_steps_per_second": 18.815, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.6815309622675573, |
|
"grad_norm": 10.778836250305176, |
|
"learning_rate": 2.1974924774322973e-05, |
|
"loss": 0.7529, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.6860444123487994, |
|
"grad_norm": 12.894726753234863, |
|
"learning_rate": 2.1899699097291875e-05, |
|
"loss": 0.8783, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.6905578624300415, |
|
"grad_norm": 7.819123268127441, |
|
"learning_rate": 2.182447342026078e-05, |
|
"loss": 0.9095, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.6950713125112836, |
|
"grad_norm": 24.68296241760254, |
|
"learning_rate": 2.174924774322969e-05, |
|
"loss": 0.8804, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.6995847625925258, |
|
"grad_norm": 9.52649974822998, |
|
"learning_rate": 2.1674022066198596e-05, |
|
"loss": 0.8028, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.6995847625925258, |
|
"eval_exact_match": 85.59129612109744, |
|
"eval_f1": 91.93623152881347, |
|
"eval_runtime": 143.4073, |
|
"eval_samples_per_second": 75.296, |
|
"eval_steps_per_second": 18.827, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.7040982126737678, |
|
"grad_norm": 10.898487091064453, |
|
"learning_rate": 2.1598796389167502e-05, |
|
"loss": 0.8282, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.70861166275501, |
|
"grad_norm": 6.693902969360352, |
|
"learning_rate": 2.152357071213641e-05, |
|
"loss": 0.775, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.713125112836252, |
|
"grad_norm": 10.00558090209961, |
|
"learning_rate": 2.1448345035105317e-05, |
|
"loss": 0.6894, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.7176385629174942, |
|
"grad_norm": 2.776298761367798, |
|
"learning_rate": 2.1373119358074223e-05, |
|
"loss": 0.8409, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.7221520129987362, |
|
"grad_norm": 13.581101417541504, |
|
"learning_rate": 2.1297893681043132e-05, |
|
"loss": 0.8222, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.7221520129987362, |
|
"eval_exact_match": 84.57899716177862, |
|
"eval_f1": 91.38107826122027, |
|
"eval_runtime": 143.4879, |
|
"eval_samples_per_second": 75.254, |
|
"eval_steps_per_second": 18.817, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.7266654630799784, |
|
"grad_norm": 6.5702223777771, |
|
"learning_rate": 2.1222668004012038e-05, |
|
"loss": 0.6735, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 0.7311789131612204, |
|
"grad_norm": 18.275623321533203, |
|
"learning_rate": 2.1147442326980944e-05, |
|
"loss": 0.8389, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.7356923632424626, |
|
"grad_norm": 15.205418586730957, |
|
"learning_rate": 2.1072216649949853e-05, |
|
"loss": 0.7803, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 0.7402058133237046, |
|
"grad_norm": 8.31666088104248, |
|
"learning_rate": 2.099699097291876e-05, |
|
"loss": 0.7081, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.7447192634049468, |
|
"grad_norm": 9.174483299255371, |
|
"learning_rate": 2.092176529588766e-05, |
|
"loss": 0.826, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.7447192634049468, |
|
"eval_exact_match": 84.76821192052981, |
|
"eval_f1": 91.56620229706857, |
|
"eval_runtime": 143.4859, |
|
"eval_samples_per_second": 75.255, |
|
"eval_steps_per_second": 18.817, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.7492327134861888, |
|
"grad_norm": 5.849365234375, |
|
"learning_rate": 2.084653961885657e-05, |
|
"loss": 0.7826, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.753746163567431, |
|
"grad_norm": 8.80666446685791, |
|
"learning_rate": 2.0771313941825476e-05, |
|
"loss": 0.8931, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 0.758259613648673, |
|
"grad_norm": 7.301697731018066, |
|
"learning_rate": 2.0696088264794382e-05, |
|
"loss": 0.6788, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.7627730637299152, |
|
"grad_norm": 9.519810676574707, |
|
"learning_rate": 2.062086258776329e-05, |
|
"loss": 0.7928, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 0.7672865138111572, |
|
"grad_norm": 8.138936996459961, |
|
"learning_rate": 2.0545636910732197e-05, |
|
"loss": 0.8625, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.7672865138111572, |
|
"eval_exact_match": 85.19394512771996, |
|
"eval_f1": 91.95119129750337, |
|
"eval_runtime": 143.5272, |
|
"eval_samples_per_second": 75.233, |
|
"eval_steps_per_second": 18.812, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.7717999638923994, |
|
"grad_norm": 3.691103935241699, |
|
"learning_rate": 2.0470411233701103e-05, |
|
"loss": 0.7947, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.7763134139736414, |
|
"grad_norm": 14.496338844299316, |
|
"learning_rate": 2.0395185556670012e-05, |
|
"loss": 0.8135, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.7808268640548836, |
|
"grad_norm": 6.248403072357178, |
|
"learning_rate": 2.0319959879638918e-05, |
|
"loss": 0.8594, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 0.7853403141361257, |
|
"grad_norm": 16.819801330566406, |
|
"learning_rate": 2.0244734202607824e-05, |
|
"loss": 0.8171, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.7898537642173677, |
|
"grad_norm": 7.9061079025268555, |
|
"learning_rate": 2.0169508525576733e-05, |
|
"loss": 0.6517, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.7898537642173677, |
|
"eval_exact_match": 85.6480605487228, |
|
"eval_f1": 91.95890910573651, |
|
"eval_runtime": 143.4989, |
|
"eval_samples_per_second": 75.248, |
|
"eval_steps_per_second": 18.815, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.7943672142986099, |
|
"grad_norm": 12.129390716552734, |
|
"learning_rate": 2.009428284854564e-05, |
|
"loss": 0.7358, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.798880664379852, |
|
"grad_norm": 7.113585472106934, |
|
"learning_rate": 2.0019057171514545e-05, |
|
"loss": 0.8517, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 0.8033941144610941, |
|
"grad_norm": 10.407898902893066, |
|
"learning_rate": 1.994383149448345e-05, |
|
"loss": 0.733, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.8079075645423361, |
|
"grad_norm": 15.745281219482422, |
|
"learning_rate": 1.9868605817452356e-05, |
|
"loss": 0.693, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 0.8124210146235783, |
|
"grad_norm": 6.876597881317139, |
|
"learning_rate": 1.9793380140421262e-05, |
|
"loss": 0.817, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.8124210146235783, |
|
"eval_exact_match": 84.87228003784296, |
|
"eval_f1": 91.5110749584356, |
|
"eval_runtime": 143.5544, |
|
"eval_samples_per_second": 75.219, |
|
"eval_steps_per_second": 18.808, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.8169344647048203, |
|
"grad_norm": 7.4037065505981445, |
|
"learning_rate": 1.971815446339017e-05, |
|
"loss": 0.8677, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 0.8214479147860625, |
|
"grad_norm": 4.559969902038574, |
|
"learning_rate": 1.9642928786359077e-05, |
|
"loss": 0.7798, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.8259613648673045, |
|
"grad_norm": 7.184974670410156, |
|
"learning_rate": 1.9567703109327983e-05, |
|
"loss": 0.7705, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 0.8304748149485467, |
|
"grad_norm": 8.206283569335938, |
|
"learning_rate": 1.9492477432296892e-05, |
|
"loss": 0.8398, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.8349882650297887, |
|
"grad_norm": 7.29602575302124, |
|
"learning_rate": 1.9417251755265798e-05, |
|
"loss": 0.7574, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.8349882650297887, |
|
"eval_exact_match": 85.06149479659413, |
|
"eval_f1": 91.53106503540634, |
|
"eval_runtime": 143.5779, |
|
"eval_samples_per_second": 75.207, |
|
"eval_steps_per_second": 18.805, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.8395017151110309, |
|
"grad_norm": 10.167183876037598, |
|
"learning_rate": 1.9342026078234704e-05, |
|
"loss": 0.7103, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.8440151651922729, |
|
"grad_norm": 6.271793365478516, |
|
"learning_rate": 1.9266800401203613e-05, |
|
"loss": 0.8364, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 0.8485286152735151, |
|
"grad_norm": 11.07026481628418, |
|
"learning_rate": 1.919157472417252e-05, |
|
"loss": 0.726, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.8530420653547571, |
|
"grad_norm": 5.571475028991699, |
|
"learning_rate": 1.9116349047141425e-05, |
|
"loss": 0.7205, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 0.8575555154359993, |
|
"grad_norm": 3.4866223335266113, |
|
"learning_rate": 1.9041123370110334e-05, |
|
"loss": 0.7832, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.8575555154359993, |
|
"eval_exact_match": 85.44938505203406, |
|
"eval_f1": 91.84507576310226, |
|
"eval_runtime": 143.5379, |
|
"eval_samples_per_second": 75.228, |
|
"eval_steps_per_second": 18.81, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.8620689655172413, |
|
"grad_norm": 2.2408883571624756, |
|
"learning_rate": 1.896589769307924e-05, |
|
"loss": 0.7533, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 0.8665824155984835, |
|
"grad_norm": 13.415377616882324, |
|
"learning_rate": 1.8890672016048142e-05, |
|
"loss": 0.7669, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.8710958656797256, |
|
"grad_norm": 4.730581760406494, |
|
"learning_rate": 1.881544633901705e-05, |
|
"loss": 0.7468, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 0.8756093157609677, |
|
"grad_norm": 6.725691318511963, |
|
"learning_rate": 1.8740220661985957e-05, |
|
"loss": 0.7426, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.8801227658422098, |
|
"grad_norm": 8.169360160827637, |
|
"learning_rate": 1.8664994984954863e-05, |
|
"loss": 0.8436, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.8801227658422098, |
|
"eval_exact_match": 84.88174077578051, |
|
"eval_f1": 91.83275837323971, |
|
"eval_runtime": 143.4694, |
|
"eval_samples_per_second": 75.263, |
|
"eval_steps_per_second": 18.819, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.8846362159234519, |
|
"grad_norm": 1.717469334602356, |
|
"learning_rate": 1.8589769307923772e-05, |
|
"loss": 0.7889, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.889149666004694, |
|
"grad_norm": 20.31835174560547, |
|
"learning_rate": 1.8514543630892678e-05, |
|
"loss": 0.7648, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 0.8936631160859361, |
|
"grad_norm": 15.77481746673584, |
|
"learning_rate": 1.8439317953861584e-05, |
|
"loss": 0.7259, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.8981765661671782, |
|
"grad_norm": 3.87709641456604, |
|
"learning_rate": 1.8364092276830493e-05, |
|
"loss": 0.7866, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 0.9026900162484203, |
|
"grad_norm": 8.835536003112793, |
|
"learning_rate": 1.82888665997994e-05, |
|
"loss": 0.6775, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.9026900162484203, |
|
"eval_exact_match": 85.99810785241249, |
|
"eval_f1": 91.99639894905705, |
|
"eval_runtime": 143.5348, |
|
"eval_samples_per_second": 75.229, |
|
"eval_steps_per_second": 18.811, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.9072034663296624, |
|
"grad_norm": 12.733137130737305, |
|
"learning_rate": 1.8213640922768305e-05, |
|
"loss": 0.7703, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 0.9117169164109045, |
|
"grad_norm": 12.40443229675293, |
|
"learning_rate": 1.8138415245737214e-05, |
|
"loss": 0.7619, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.9162303664921466, |
|
"grad_norm": 6.346498012542725, |
|
"learning_rate": 1.806318956870612e-05, |
|
"loss": 0.7605, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 0.9207438165733887, |
|
"grad_norm": 5.372687816619873, |
|
"learning_rate": 1.7987963891675026e-05, |
|
"loss": 0.6591, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.9252572666546308, |
|
"grad_norm": 4.377304553985596, |
|
"learning_rate": 1.7912738214643932e-05, |
|
"loss": 0.8404, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 0.9252572666546308, |
|
"eval_exact_match": 85.89403973509934, |
|
"eval_f1": 92.11360231029698, |
|
"eval_runtime": 144.2084, |
|
"eval_samples_per_second": 74.878, |
|
"eval_steps_per_second": 18.723, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 0.9297707167358729, |
|
"grad_norm": 11.62856388092041, |
|
"learning_rate": 1.7837512537612838e-05, |
|
"loss": 0.7712, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.934284166817115, |
|
"grad_norm": 5.723257541656494, |
|
"learning_rate": 1.7762286860581743e-05, |
|
"loss": 0.7171, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 0.9387976168983571, |
|
"grad_norm": 6.060873031616211, |
|
"learning_rate": 1.7687061183550653e-05, |
|
"loss": 0.7324, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.9433110669795992, |
|
"grad_norm": 4.51533842086792, |
|
"learning_rate": 1.761183550651956e-05, |
|
"loss": 0.7633, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 0.9478245170608413, |
|
"grad_norm": 11.809548377990723, |
|
"learning_rate": 1.7536609829488464e-05, |
|
"loss": 0.8111, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.9478245170608413, |
|
"eval_exact_match": 85.09933774834437, |
|
"eval_f1": 91.93045017438146, |
|
"eval_runtime": 143.4235, |
|
"eval_samples_per_second": 75.288, |
|
"eval_steps_per_second": 18.825, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.9523379671420834, |
|
"grad_norm": 15.76356029510498, |
|
"learning_rate": 1.7461384152457374e-05, |
|
"loss": 0.7013, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 0.9568514172233256, |
|
"grad_norm": 23.272687911987305, |
|
"learning_rate": 1.738615847542628e-05, |
|
"loss": 0.8229, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.9613648673045676, |
|
"grad_norm": 16.758358001708984, |
|
"learning_rate": 1.7310932798395185e-05, |
|
"loss": 0.7479, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 0.9658783173858098, |
|
"grad_norm": 14.670035362243652, |
|
"learning_rate": 1.7235707121364094e-05, |
|
"loss": 0.8, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.9703917674670518, |
|
"grad_norm": 5.1286821365356445, |
|
"learning_rate": 1.7160481444333e-05, |
|
"loss": 0.8522, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 0.9703917674670518, |
|
"eval_exact_match": 85.07095553453169, |
|
"eval_f1": 92.09316544794538, |
|
"eval_runtime": 143.6698, |
|
"eval_samples_per_second": 75.158, |
|
"eval_steps_per_second": 18.793, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 0.974905217548294, |
|
"grad_norm": 8.131464004516602, |
|
"learning_rate": 1.7085255767301906e-05, |
|
"loss": 0.7949, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.979418667629536, |
|
"grad_norm": 22.16661834716797, |
|
"learning_rate": 1.7010030090270815e-05, |
|
"loss": 0.7486, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 0.9839321177107782, |
|
"grad_norm": 5.554388046264648, |
|
"learning_rate": 1.693480441323972e-05, |
|
"loss": 0.7604, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.9884455677920202, |
|
"grad_norm": 8.525761604309082, |
|
"learning_rate": 1.6859578736208624e-05, |
|
"loss": 0.766, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 0.9929590178732624, |
|
"grad_norm": 10.504690170288086, |
|
"learning_rate": 1.6784353059177533e-05, |
|
"loss": 0.7166, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.9929590178732624, |
|
"eval_exact_match": 85.04257332071901, |
|
"eval_f1": 92.02241474371678, |
|
"eval_runtime": 143.4783, |
|
"eval_samples_per_second": 75.259, |
|
"eval_steps_per_second": 18.818, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.9974724679545044, |
|
"grad_norm": 7.378440856933594, |
|
"learning_rate": 1.670912738214644e-05, |
|
"loss": 0.7187, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 1.0019859180357464, |
|
"grad_norm": 3.172842502593994, |
|
"learning_rate": 1.6633901705115345e-05, |
|
"loss": 0.7532, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 1.0064993681169887, |
|
"grad_norm": 2.3299856185913086, |
|
"learning_rate": 1.6558676028084254e-05, |
|
"loss": 0.497, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 1.0110128181982307, |
|
"grad_norm": 8.6509428024292, |
|
"learning_rate": 1.648345035105316e-05, |
|
"loss": 0.4497, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 1.0155262682794728, |
|
"grad_norm": 9.68758773803711, |
|
"learning_rate": 1.6408224674022065e-05, |
|
"loss": 0.6154, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 1.0155262682794728, |
|
"eval_exact_match": 86.08325449385052, |
|
"eval_f1": 92.38528194318762, |
|
"eval_runtime": 143.3576, |
|
"eval_samples_per_second": 75.322, |
|
"eval_steps_per_second": 18.834, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 1.0200397183607148, |
|
"grad_norm": 3.34212064743042, |
|
"learning_rate": 1.6332998996990975e-05, |
|
"loss": 0.5372, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 1.024553168441957, |
|
"grad_norm": 5.384337425231934, |
|
"learning_rate": 1.625777331995988e-05, |
|
"loss": 0.5464, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 1.0290666185231991, |
|
"grad_norm": 19.279573440551758, |
|
"learning_rate": 1.6182547642928786e-05, |
|
"loss": 0.5558, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 1.0335800686044412, |
|
"grad_norm": 5.5248308181762695, |
|
"learning_rate": 1.6107321965897696e-05, |
|
"loss": 0.4981, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 1.0380935186856832, |
|
"grad_norm": 5.657703399658203, |
|
"learning_rate": 1.60320962888666e-05, |
|
"loss": 0.6565, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.0380935186856832, |
|
"eval_exact_match": 85.58183538315988, |
|
"eval_f1": 92.11318103014378, |
|
"eval_runtime": 143.5023, |
|
"eval_samples_per_second": 75.246, |
|
"eval_steps_per_second": 18.815, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.0426069687669255, |
|
"grad_norm": 6.387887954711914, |
|
"learning_rate": 1.5956870611835507e-05, |
|
"loss": 0.588, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 1.0471204188481675, |
|
"grad_norm": 1.7305879592895508, |
|
"learning_rate": 1.5881644934804413e-05, |
|
"loss": 0.5747, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 1.0516338689294096, |
|
"grad_norm": 14.716680526733398, |
|
"learning_rate": 1.580641925777332e-05, |
|
"loss": 0.5632, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 1.0561473190106518, |
|
"grad_norm": 9.127685546875, |
|
"learning_rate": 1.5731193580742225e-05, |
|
"loss": 0.4897, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 1.0606607690918939, |
|
"grad_norm": 8.541461944580078, |
|
"learning_rate": 1.5655967903711134e-05, |
|
"loss": 0.544, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 1.0606607690918939, |
|
"eval_exact_match": 86.16840113528855, |
|
"eval_f1": 92.442978713336, |
|
"eval_runtime": 143.2851, |
|
"eval_samples_per_second": 75.36, |
|
"eval_steps_per_second": 18.844, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 1.065174219173136, |
|
"grad_norm": 6.538851737976074, |
|
"learning_rate": 1.558074222668004e-05, |
|
"loss": 0.5202, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 1.069687669254378, |
|
"grad_norm": 7.314679145812988, |
|
"learning_rate": 1.5505516549648946e-05, |
|
"loss": 0.54, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 1.0742011193356202, |
|
"grad_norm": 2.3385446071624756, |
|
"learning_rate": 1.5430290872617855e-05, |
|
"loss": 0.5192, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 1.0787145694168623, |
|
"grad_norm": 26.518877029418945, |
|
"learning_rate": 1.535506519558676e-05, |
|
"loss": 0.5435, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 1.0832280194981043, |
|
"grad_norm": 39.6591682434082, |
|
"learning_rate": 1.5279839518555667e-05, |
|
"loss": 0.5149, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.0832280194981043, |
|
"eval_exact_match": 85.80889309366131, |
|
"eval_f1": 92.20769990556119, |
|
"eval_runtime": 143.6087, |
|
"eval_samples_per_second": 75.19, |
|
"eval_steps_per_second": 18.801, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.0877414695793464, |
|
"grad_norm": 11.38036823272705, |
|
"learning_rate": 1.5204613841524576e-05, |
|
"loss": 0.526, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 1.0922549196605886, |
|
"grad_norm": 21.02750587463379, |
|
"learning_rate": 1.512938816449348e-05, |
|
"loss": 0.471, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 1.0967683697418307, |
|
"grad_norm": 23.10146713256836, |
|
"learning_rate": 1.5054162487462386e-05, |
|
"loss": 0.6643, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 1.1012818198230727, |
|
"grad_norm": 29.241615295410156, |
|
"learning_rate": 1.4978936810431293e-05, |
|
"loss": 0.5231, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 1.1057952699043148, |
|
"grad_norm": 3.3990285396575928, |
|
"learning_rate": 1.49037111334002e-05, |
|
"loss": 0.4051, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 1.1057952699043148, |
|
"eval_exact_match": 85.56291390728477, |
|
"eval_f1": 92.0566633980034, |
|
"eval_runtime": 143.183, |
|
"eval_samples_per_second": 75.414, |
|
"eval_steps_per_second": 18.857, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 1.110308719985557, |
|
"grad_norm": 3.075737237930298, |
|
"learning_rate": 1.4828485456369108e-05, |
|
"loss": 0.5637, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 1.114822170066799, |
|
"grad_norm": 2.9517650604248047, |
|
"learning_rate": 1.4753259779338014e-05, |
|
"loss": 0.566, |
|
"step": 12350 |
|
}, |
|
{ |
|
"epoch": 1.119335620148041, |
|
"grad_norm": 23.89853858947754, |
|
"learning_rate": 1.4678034102306922e-05, |
|
"loss": 0.4439, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 1.1238490702292832, |
|
"grad_norm": 8.486159324645996, |
|
"learning_rate": 1.4602808425275828e-05, |
|
"loss": 0.5538, |
|
"step": 12450 |
|
}, |
|
{ |
|
"epoch": 1.1283625203105254, |
|
"grad_norm": 3.5648648738861084, |
|
"learning_rate": 1.4527582748244733e-05, |
|
"loss": 0.5173, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.1283625203105254, |
|
"eval_exact_match": 85.4872280037843, |
|
"eval_f1": 92.15147631309604, |
|
"eval_runtime": 143.2081, |
|
"eval_samples_per_second": 75.401, |
|
"eval_steps_per_second": 18.854, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.1328759703917675, |
|
"grad_norm": 10.259268760681152, |
|
"learning_rate": 1.4452357071213641e-05, |
|
"loss": 0.5925, |
|
"step": 12550 |
|
}, |
|
{ |
|
"epoch": 1.1373894204730095, |
|
"grad_norm": 6.570536136627197, |
|
"learning_rate": 1.4377131394182548e-05, |
|
"loss": 0.4594, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 1.1419028705542518, |
|
"grad_norm": 6.687112808227539, |
|
"learning_rate": 1.4301905717151454e-05, |
|
"loss": 0.4994, |
|
"step": 12650 |
|
}, |
|
{ |
|
"epoch": 1.1464163206354938, |
|
"grad_norm": 14.550410270690918, |
|
"learning_rate": 1.4226680040120362e-05, |
|
"loss": 0.5775, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 1.1509297707167359, |
|
"grad_norm": 12.998605728149414, |
|
"learning_rate": 1.4151454363089268e-05, |
|
"loss": 0.5285, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 1.1509297707167359, |
|
"eval_exact_match": 85.93188268684958, |
|
"eval_f1": 92.11716297833141, |
|
"eval_runtime": 143.1084, |
|
"eval_samples_per_second": 75.453, |
|
"eval_steps_per_second": 18.867, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 1.155443220797978, |
|
"grad_norm": 26.985210418701172, |
|
"learning_rate": 1.4076228686058175e-05, |
|
"loss": 0.5092, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 1.1599566708792202, |
|
"grad_norm": 15.445883750915527, |
|
"learning_rate": 1.4001003009027081e-05, |
|
"loss": 0.5178, |
|
"step": 12850 |
|
}, |
|
{ |
|
"epoch": 1.1644701209604622, |
|
"grad_norm": 8.596466064453125, |
|
"learning_rate": 1.3925777331995989e-05, |
|
"loss": 0.5742, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 1.1689835710417043, |
|
"grad_norm": 3.9060676097869873, |
|
"learning_rate": 1.3850551654964896e-05, |
|
"loss": 0.5112, |
|
"step": 12950 |
|
}, |
|
{ |
|
"epoch": 1.1734970211229463, |
|
"grad_norm": 2.3090436458587646, |
|
"learning_rate": 1.3775325977933802e-05, |
|
"loss": 0.4802, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.1734970211229463, |
|
"eval_exact_match": 86.3670766319773, |
|
"eval_f1": 92.39148643540621, |
|
"eval_runtime": 143.1331, |
|
"eval_samples_per_second": 75.44, |
|
"eval_steps_per_second": 18.864, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.1780104712041886, |
|
"grad_norm": 4.289682865142822, |
|
"learning_rate": 1.370010030090271e-05, |
|
"loss": 0.4555, |
|
"step": 13050 |
|
}, |
|
{ |
|
"epoch": 1.1825239212854306, |
|
"grad_norm": 23.45159149169922, |
|
"learning_rate": 1.3624874623871615e-05, |
|
"loss": 0.6034, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 1.1870373713666726, |
|
"grad_norm": 14.170953750610352, |
|
"learning_rate": 1.3549648946840521e-05, |
|
"loss": 0.4946, |
|
"step": 13150 |
|
}, |
|
{ |
|
"epoch": 1.191550821447915, |
|
"grad_norm": 7.408278942108154, |
|
"learning_rate": 1.3474423269809429e-05, |
|
"loss": 0.5625, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 1.196064271529157, |
|
"grad_norm": 4.187251567840576, |
|
"learning_rate": 1.3399197592778336e-05, |
|
"loss": 0.5344, |
|
"step": 13250 |
|
}, |
|
{ |
|
"epoch": 1.196064271529157, |
|
"eval_exact_match": 85.66698202459791, |
|
"eval_f1": 92.29328625942796, |
|
"eval_runtime": 143.2334, |
|
"eval_samples_per_second": 75.387, |
|
"eval_steps_per_second": 18.85, |
|
"step": 13250 |
|
}, |
|
{ |
|
"epoch": 1.200577721610399, |
|
"grad_norm": 9.739165306091309, |
|
"learning_rate": 1.3323971915747242e-05, |
|
"loss": 0.5319, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 1.205091171691641, |
|
"grad_norm": 3.0962629318237305, |
|
"learning_rate": 1.324874623871615e-05, |
|
"loss": 0.5455, |
|
"step": 13350 |
|
}, |
|
{ |
|
"epoch": 1.209604621772883, |
|
"grad_norm": 10.260982513427734, |
|
"learning_rate": 1.3173520561685057e-05, |
|
"loss": 0.5922, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 1.2141180718541253, |
|
"grad_norm": 17.95406150817871, |
|
"learning_rate": 1.3098294884653961e-05, |
|
"loss": 0.6416, |
|
"step": 13450 |
|
}, |
|
{ |
|
"epoch": 1.2186315219353674, |
|
"grad_norm": 9.253098487854004, |
|
"learning_rate": 1.3023069207622869e-05, |
|
"loss": 0.4543, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.2186315219353674, |
|
"eval_exact_match": 86.20624408703878, |
|
"eval_f1": 92.32664235875168, |
|
"eval_runtime": 143.2002, |
|
"eval_samples_per_second": 75.405, |
|
"eval_steps_per_second": 18.855, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.2231449720166094, |
|
"grad_norm": 9.202949523925781, |
|
"learning_rate": 1.2947843530591776e-05, |
|
"loss": 0.6569, |
|
"step": 13550 |
|
}, |
|
{ |
|
"epoch": 1.2276584220978517, |
|
"grad_norm": 16.244760513305664, |
|
"learning_rate": 1.2872617853560682e-05, |
|
"loss": 0.5605, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 1.2321718721790937, |
|
"grad_norm": 2.6242430210113525, |
|
"learning_rate": 1.279739217652959e-05, |
|
"loss": 0.546, |
|
"step": 13650 |
|
}, |
|
{ |
|
"epoch": 1.2366853222603358, |
|
"grad_norm": 19.960708618164062, |
|
"learning_rate": 1.2722166499498497e-05, |
|
"loss": 0.5916, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 1.2411987723415778, |
|
"grad_norm": 14.39201545715332, |
|
"learning_rate": 1.2646940822467401e-05, |
|
"loss": 0.4647, |
|
"step": 13750 |
|
}, |
|
{ |
|
"epoch": 1.2411987723415778, |
|
"eval_exact_match": 86.27246925260171, |
|
"eval_f1": 92.45848778749898, |
|
"eval_runtime": 143.112, |
|
"eval_samples_per_second": 75.451, |
|
"eval_steps_per_second": 18.866, |
|
"step": 13750 |
|
}, |
|
{ |
|
"epoch": 1.24571222242282, |
|
"grad_norm": 8.58752155303955, |
|
"learning_rate": 1.2571715145436309e-05, |
|
"loss": 0.5224, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 1.2502256725040621, |
|
"grad_norm": 5.419035911560059, |
|
"learning_rate": 1.2496489468405216e-05, |
|
"loss": 0.6743, |
|
"step": 13850 |
|
}, |
|
{ |
|
"epoch": 1.2547391225853042, |
|
"grad_norm": 7.52559232711792, |
|
"learning_rate": 1.2421263791374122e-05, |
|
"loss": 0.5955, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 1.2592525726665462, |
|
"grad_norm": 15.449511528015137, |
|
"learning_rate": 1.234603811434303e-05, |
|
"loss": 0.5824, |
|
"step": 13950 |
|
}, |
|
{ |
|
"epoch": 1.2637660227477885, |
|
"grad_norm": 11.266414642333984, |
|
"learning_rate": 1.2270812437311937e-05, |
|
"loss": 0.5101, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.2637660227477885, |
|
"eval_exact_match": 86.12109744560075, |
|
"eval_f1": 92.2146350604068, |
|
"eval_runtime": 145.9619, |
|
"eval_samples_per_second": 73.978, |
|
"eval_steps_per_second": 18.498, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.2682794728290305, |
|
"grad_norm": 7.292428970336914, |
|
"learning_rate": 1.2195586760280843e-05, |
|
"loss": 0.4962, |
|
"step": 14050 |
|
}, |
|
{ |
|
"epoch": 1.2727929229102726, |
|
"grad_norm": 1.1534169912338257, |
|
"learning_rate": 1.2120361083249749e-05, |
|
"loss": 0.5691, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 1.2773063729915148, |
|
"grad_norm": 3.5983633995056152, |
|
"learning_rate": 1.2045135406218656e-05, |
|
"loss": 0.5114, |
|
"step": 14150 |
|
}, |
|
{ |
|
"epoch": 1.2818198230727569, |
|
"grad_norm": 5.006545543670654, |
|
"learning_rate": 1.1969909729187562e-05, |
|
"loss": 0.5259, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 1.286333273153999, |
|
"grad_norm": 24.31420135498047, |
|
"learning_rate": 1.189468405215647e-05, |
|
"loss": 0.4771, |
|
"step": 14250 |
|
}, |
|
{ |
|
"epoch": 1.286333273153999, |
|
"eval_exact_match": 86.59413434247871, |
|
"eval_f1": 92.49677313517446, |
|
"eval_runtime": 145.8932, |
|
"eval_samples_per_second": 74.013, |
|
"eval_steps_per_second": 18.507, |
|
"step": 14250 |
|
}, |
|
{ |
|
"epoch": 1.290846723235241, |
|
"grad_norm": 8.447436332702637, |
|
"learning_rate": 1.1819458375125377e-05, |
|
"loss": 0.5835, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 1.295360173316483, |
|
"grad_norm": 23.178955078125, |
|
"learning_rate": 1.1744232698094283e-05, |
|
"loss": 0.5554, |
|
"step": 14350 |
|
}, |
|
{ |
|
"epoch": 1.2998736233977253, |
|
"grad_norm": 16.500057220458984, |
|
"learning_rate": 1.166900702106319e-05, |
|
"loss": 0.4928, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 1.3043870734789673, |
|
"grad_norm": 8.389457702636719, |
|
"learning_rate": 1.1593781344032097e-05, |
|
"loss": 0.6872, |
|
"step": 14450 |
|
}, |
|
{ |
|
"epoch": 1.3089005235602094, |
|
"grad_norm": 5.315954685211182, |
|
"learning_rate": 1.1518555667001002e-05, |
|
"loss": 0.5394, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.3089005235602094, |
|
"eval_exact_match": 86.45222327341533, |
|
"eval_f1": 92.58788732745475, |
|
"eval_runtime": 145.7277, |
|
"eval_samples_per_second": 74.097, |
|
"eval_steps_per_second": 18.528, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.3134139736414516, |
|
"grad_norm": 2.0151515007019043, |
|
"learning_rate": 1.144332998996991e-05, |
|
"loss": 0.4745, |
|
"step": 14550 |
|
}, |
|
{ |
|
"epoch": 1.3179274237226937, |
|
"grad_norm": 6.995370864868164, |
|
"learning_rate": 1.1368104312938817e-05, |
|
"loss": 0.536, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 1.3224408738039357, |
|
"grad_norm": 4.453261852264404, |
|
"learning_rate": 1.1292878635907723e-05, |
|
"loss": 0.4909, |
|
"step": 14650 |
|
}, |
|
{ |
|
"epoch": 1.3269543238851778, |
|
"grad_norm": 3.472259998321533, |
|
"learning_rate": 1.121765295887663e-05, |
|
"loss": 0.586, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 1.3314677739664198, |
|
"grad_norm": 15.908103942871094, |
|
"learning_rate": 1.1142427281845537e-05, |
|
"loss": 0.528, |
|
"step": 14750 |
|
}, |
|
{ |
|
"epoch": 1.3314677739664198, |
|
"eval_exact_match": 85.86565752128666, |
|
"eval_f1": 92.39225966105154, |
|
"eval_runtime": 280.4149, |
|
"eval_samples_per_second": 38.507, |
|
"eval_steps_per_second": 9.629, |
|
"step": 14750 |
|
}, |
|
{ |
|
"epoch": 1.335981224047662, |
|
"grad_norm": 12.543098449707031, |
|
"learning_rate": 1.1067201604814443e-05, |
|
"loss": 0.4912, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 1.340494674128904, |
|
"grad_norm": 24.144222259521484, |
|
"learning_rate": 1.099197592778335e-05, |
|
"loss": 0.5626, |
|
"step": 14850 |
|
}, |
|
{ |
|
"epoch": 1.3450081242101461, |
|
"grad_norm": 25.347875595092773, |
|
"learning_rate": 1.0916750250752258e-05, |
|
"loss": 0.4955, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 1.3495215742913884, |
|
"grad_norm": 5.940708637237549, |
|
"learning_rate": 1.0841524573721163e-05, |
|
"loss": 0.4859, |
|
"step": 14950 |
|
}, |
|
{ |
|
"epoch": 1.3540350243726305, |
|
"grad_norm": 30.9013671875, |
|
"learning_rate": 1.0766298896690071e-05, |
|
"loss": 0.4788, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.3540350243726305, |
|
"eval_exact_match": 86.6414380321665, |
|
"eval_f1": 92.62032707644155, |
|
"eval_runtime": 145.6846, |
|
"eval_samples_per_second": 74.119, |
|
"eval_steps_per_second": 18.533, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.3585484744538725, |
|
"grad_norm": 12.216713905334473, |
|
"learning_rate": 1.0691073219658978e-05, |
|
"loss": 0.4977, |
|
"step": 15050 |
|
}, |
|
{ |
|
"epoch": 1.3630619245351148, |
|
"grad_norm": 40.2611083984375, |
|
"learning_rate": 1.0615847542627883e-05, |
|
"loss": 0.4972, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 1.3675753746163568, |
|
"grad_norm": 10.3711519241333, |
|
"learning_rate": 1.054062186559679e-05, |
|
"loss": 0.4955, |
|
"step": 15150 |
|
}, |
|
{ |
|
"epoch": 1.3720888246975989, |
|
"grad_norm": 2.400322914123535, |
|
"learning_rate": 1.0465396188565698e-05, |
|
"loss": 0.4868, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 1.376602274778841, |
|
"grad_norm": 3.9988925457000732, |
|
"learning_rate": 1.0390170511534603e-05, |
|
"loss": 0.5888, |
|
"step": 15250 |
|
}, |
|
{ |
|
"epoch": 1.376602274778841, |
|
"eval_exact_match": 85.96026490066225, |
|
"eval_f1": 92.33197764854948, |
|
"eval_runtime": 145.4185, |
|
"eval_samples_per_second": 74.255, |
|
"eval_steps_per_second": 18.567, |
|
"step": 15250 |
|
}, |
|
{ |
|
"epoch": 1.381115724860083, |
|
"grad_norm": 1.6575514078140259, |
|
"learning_rate": 1.0314944834503511e-05, |
|
"loss": 0.5143, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 1.3856291749413252, |
|
"grad_norm": 5.943323612213135, |
|
"learning_rate": 1.0239719157472419e-05, |
|
"loss": 0.4725, |
|
"step": 15350 |
|
}, |
|
{ |
|
"epoch": 1.3901426250225672, |
|
"grad_norm": 21.014570236206055, |
|
"learning_rate": 1.0164493480441324e-05, |
|
"loss": 0.5131, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 1.3946560751038093, |
|
"grad_norm": 4.148115634918213, |
|
"learning_rate": 1.008926780341023e-05, |
|
"loss": 0.4323, |
|
"step": 15450 |
|
}, |
|
{ |
|
"epoch": 1.3991695251850516, |
|
"grad_norm": 8.95993423461914, |
|
"learning_rate": 1.0014042126379138e-05, |
|
"loss": 0.6072, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.3991695251850516, |
|
"eval_exact_match": 85.76158940397352, |
|
"eval_f1": 92.24988076673156, |
|
"eval_runtime": 145.3259, |
|
"eval_samples_per_second": 74.302, |
|
"eval_steps_per_second": 18.579, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.4036829752662936, |
|
"grad_norm": 5.6876959800720215, |
|
"learning_rate": 9.938816449348044e-06, |
|
"loss": 0.5692, |
|
"step": 15550 |
|
}, |
|
{ |
|
"epoch": 1.4081964253475356, |
|
"grad_norm": 6.91029167175293, |
|
"learning_rate": 9.863590772316951e-06, |
|
"loss": 0.5801, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 1.4127098754287777, |
|
"grad_norm": 8.116116523742676, |
|
"learning_rate": 9.788365095285859e-06, |
|
"loss": 0.4394, |
|
"step": 15650 |
|
}, |
|
{ |
|
"epoch": 1.4172233255100197, |
|
"grad_norm": 7.001738548278809, |
|
"learning_rate": 9.713139418254764e-06, |
|
"loss": 0.5607, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 1.421736775591262, |
|
"grad_norm": 21.804443359375, |
|
"learning_rate": 9.637913741223672e-06, |
|
"loss": 0.5191, |
|
"step": 15750 |
|
}, |
|
{ |
|
"epoch": 1.421736775591262, |
|
"eval_exact_match": 85.97918637653737, |
|
"eval_f1": 92.37275066667881, |
|
"eval_runtime": 145.3503, |
|
"eval_samples_per_second": 74.289, |
|
"eval_steps_per_second": 18.576, |
|
"step": 15750 |
|
}, |
|
{ |
|
"epoch": 1.426250225672504, |
|
"grad_norm": 11.133319854736328, |
|
"learning_rate": 9.562688064192578e-06, |
|
"loss": 0.5256, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 1.430763675753746, |
|
"grad_norm": 5.4904632568359375, |
|
"learning_rate": 9.487462387161484e-06, |
|
"loss": 0.5177, |
|
"step": 15850 |
|
}, |
|
{ |
|
"epoch": 1.4352771258349883, |
|
"grad_norm": 9.791414260864258, |
|
"learning_rate": 9.412236710130391e-06, |
|
"loss": 0.5814, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 1.4397905759162304, |
|
"grad_norm": 3.3400447368621826, |
|
"learning_rate": 9.337011033099299e-06, |
|
"loss": 0.5374, |
|
"step": 15950 |
|
}, |
|
{ |
|
"epoch": 1.4443040259974724, |
|
"grad_norm": 23.98038673400879, |
|
"learning_rate": 9.261785356068205e-06, |
|
"loss": 0.4973, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.4443040259974724, |
|
"eval_exact_match": 84.85335856196784, |
|
"eval_f1": 92.03377983249271, |
|
"eval_runtime": 145.2676, |
|
"eval_samples_per_second": 74.332, |
|
"eval_steps_per_second": 18.586, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.4488174760787147, |
|
"grad_norm": 9.59720516204834, |
|
"learning_rate": 9.186559679037112e-06, |
|
"loss": 0.534, |
|
"step": 16050 |
|
}, |
|
{ |
|
"epoch": 1.4533309261599567, |
|
"grad_norm": 10.079476356506348, |
|
"learning_rate": 9.111334002006018e-06, |
|
"loss": 0.511, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 1.4578443762411988, |
|
"grad_norm": 3.377192497253418, |
|
"learning_rate": 9.036108324974924e-06, |
|
"loss": 0.5187, |
|
"step": 16150 |
|
}, |
|
{ |
|
"epoch": 1.4623578263224408, |
|
"grad_norm": 10.79287052154541, |
|
"learning_rate": 8.960882647943831e-06, |
|
"loss": 0.5499, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 1.4668712764036829, |
|
"grad_norm": 15.751055717468262, |
|
"learning_rate": 8.885656970912739e-06, |
|
"loss": 0.536, |
|
"step": 16250 |
|
}, |
|
{ |
|
"epoch": 1.4668712764036829, |
|
"eval_exact_match": 86.40491958372753, |
|
"eval_f1": 92.48354485469106, |
|
"eval_runtime": 145.2653, |
|
"eval_samples_per_second": 74.333, |
|
"eval_steps_per_second": 18.587, |
|
"step": 16250 |
|
}, |
|
{ |
|
"epoch": 1.4713847264849251, |
|
"grad_norm": 3.9903676509857178, |
|
"learning_rate": 8.810431293881645e-06, |
|
"loss": 0.5385, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 1.4758981765661672, |
|
"grad_norm": 6.5839080810546875, |
|
"learning_rate": 8.735205616850552e-06, |
|
"loss": 0.5092, |
|
"step": 16350 |
|
}, |
|
{ |
|
"epoch": 1.4804116266474092, |
|
"grad_norm": 13.69189453125, |
|
"learning_rate": 8.65997993981946e-06, |
|
"loss": 0.5999, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 1.4849250767286515, |
|
"grad_norm": 11.840332984924316, |
|
"learning_rate": 8.584754262788364e-06, |
|
"loss": 0.57, |
|
"step": 16450 |
|
}, |
|
{ |
|
"epoch": 1.4894385268098935, |
|
"grad_norm": 11.86502742767334, |
|
"learning_rate": 8.509528585757271e-06, |
|
"loss": 0.4635, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.4894385268098935, |
|
"eval_exact_match": 85.49668874172185, |
|
"eval_f1": 92.28753097702375, |
|
"eval_runtime": 145.2047, |
|
"eval_samples_per_second": 74.364, |
|
"eval_steps_per_second": 18.594, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.4939519768911356, |
|
"grad_norm": 17.741037368774414, |
|
"learning_rate": 8.434302908726179e-06, |
|
"loss": 0.53, |
|
"step": 16550 |
|
}, |
|
{ |
|
"epoch": 1.4984654269723776, |
|
"grad_norm": 7.774323463439941, |
|
"learning_rate": 8.359077231695085e-06, |
|
"loss": 0.575, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 1.5029788770536197, |
|
"grad_norm": 4.973544597625732, |
|
"learning_rate": 8.283851554663992e-06, |
|
"loss": 0.6468, |
|
"step": 16650 |
|
}, |
|
{ |
|
"epoch": 1.507492327134862, |
|
"grad_norm": 5.228555202484131, |
|
"learning_rate": 8.2086258776329e-06, |
|
"loss": 0.4901, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 1.512005777216104, |
|
"grad_norm": 3.4082319736480713, |
|
"learning_rate": 8.133400200601806e-06, |
|
"loss": 0.5492, |
|
"step": 16750 |
|
}, |
|
{ |
|
"epoch": 1.512005777216104, |
|
"eval_exact_match": 86.10217596972564, |
|
"eval_f1": 92.39499925755595, |
|
"eval_runtime": 145.1347, |
|
"eval_samples_per_second": 74.4, |
|
"eval_steps_per_second": 18.603, |
|
"step": 16750 |
|
}, |
|
{ |
|
"epoch": 1.516519227297346, |
|
"grad_norm": 20.74472427368164, |
|
"learning_rate": 8.058174523570712e-06, |
|
"loss": 0.4685, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 1.5210326773785883, |
|
"grad_norm": 4.805381774902344, |
|
"learning_rate": 7.982948846539619e-06, |
|
"loss": 0.474, |
|
"step": 16850 |
|
}, |
|
{ |
|
"epoch": 1.5255461274598303, |
|
"grad_norm": 1.6686218976974487, |
|
"learning_rate": 7.907723169508525e-06, |
|
"loss": 0.5617, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 1.5300595775410724, |
|
"grad_norm": 14.378780364990234, |
|
"learning_rate": 7.832497492477432e-06, |
|
"loss": 0.4829, |
|
"step": 16950 |
|
}, |
|
{ |
|
"epoch": 1.5345730276223146, |
|
"grad_norm": 9.25706672668457, |
|
"learning_rate": 7.75727181544634e-06, |
|
"loss": 0.5525, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.5345730276223146, |
|
"eval_exact_match": 85.89403973509934, |
|
"eval_f1": 92.2175102581889, |
|
"eval_runtime": 145.4454, |
|
"eval_samples_per_second": 74.241, |
|
"eval_steps_per_second": 18.564, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.5390864777035564, |
|
"grad_norm": 10.210553169250488, |
|
"learning_rate": 7.682046138415246e-06, |
|
"loss": 0.5284, |
|
"step": 17050 |
|
}, |
|
{ |
|
"epoch": 1.5435999277847987, |
|
"grad_norm": 18.55254364013672, |
|
"learning_rate": 7.6068204613841525e-06, |
|
"loss": 0.4863, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 1.5481133778660408, |
|
"grad_norm": 9.640850067138672, |
|
"learning_rate": 7.53159478435306e-06, |
|
"loss": 0.6163, |
|
"step": 17150 |
|
}, |
|
{ |
|
"epoch": 1.5526268279472828, |
|
"grad_norm": 7.999804496765137, |
|
"learning_rate": 7.456369107321966e-06, |
|
"loss": 0.518, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 1.557140278028525, |
|
"grad_norm": 9.162345886230469, |
|
"learning_rate": 7.3811434302908725e-06, |
|
"loss": 0.5001, |
|
"step": 17250 |
|
}, |
|
{ |
|
"epoch": 1.557140278028525, |
|
"eval_exact_match": 85.9035004730369, |
|
"eval_f1": 92.33731500742522, |
|
"eval_runtime": 145.3659, |
|
"eval_samples_per_second": 74.282, |
|
"eval_steps_per_second": 18.574, |
|
"step": 17250 |
|
}, |
|
{ |
|
"epoch": 1.561653728109767, |
|
"grad_norm": 5.462348461151123, |
|
"learning_rate": 7.30591775325978e-06, |
|
"loss": 0.4179, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 1.5661671781910091, |
|
"grad_norm": 17.67523765563965, |
|
"learning_rate": 7.230692076228686e-06, |
|
"loss": 0.4662, |
|
"step": 17350 |
|
}, |
|
{ |
|
"epoch": 1.5706806282722514, |
|
"grad_norm": 4.397737503051758, |
|
"learning_rate": 7.155466399197593e-06, |
|
"loss": 0.4614, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 1.5751940783534935, |
|
"grad_norm": 7.665886402130127, |
|
"learning_rate": 7.0802407221665e-06, |
|
"loss": 0.5263, |
|
"step": 17450 |
|
}, |
|
{ |
|
"epoch": 1.5797075284347355, |
|
"grad_norm": 10.627632141113281, |
|
"learning_rate": 7.005015045135407e-06, |
|
"loss": 0.4021, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.5797075284347355, |
|
"eval_exact_match": 85.98864711447493, |
|
"eval_f1": 92.40008411758966, |
|
"eval_runtime": 145.3304, |
|
"eval_samples_per_second": 74.3, |
|
"eval_steps_per_second": 18.578, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.5842209785159778, |
|
"grad_norm": 7.057243347167969, |
|
"learning_rate": 6.929789368104313e-06, |
|
"loss": 0.5326, |
|
"step": 17550 |
|
}, |
|
{ |
|
"epoch": 1.5887344285972196, |
|
"grad_norm": 8.216778755187988, |
|
"learning_rate": 6.85456369107322e-06, |
|
"loss": 0.6029, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 1.5932478786784618, |
|
"grad_norm": 3.943422794342041, |
|
"learning_rate": 6.779338014042127e-06, |
|
"loss": 0.5518, |
|
"step": 17650 |
|
}, |
|
{ |
|
"epoch": 1.597761328759704, |
|
"grad_norm": 12.350107192993164, |
|
"learning_rate": 6.704112337011033e-06, |
|
"loss": 0.5368, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 1.602274778840946, |
|
"grad_norm": 6.516546249389648, |
|
"learning_rate": 6.62888665997994e-06, |
|
"loss": 0.504, |
|
"step": 17750 |
|
}, |
|
{ |
|
"epoch": 1.602274778840946, |
|
"eval_exact_match": 86.12109744560075, |
|
"eval_f1": 92.37865780721518, |
|
"eval_runtime": 145.3649, |
|
"eval_samples_per_second": 74.282, |
|
"eval_steps_per_second": 18.574, |
|
"step": 17750 |
|
}, |
|
{ |
|
"epoch": 1.6067882289221882, |
|
"grad_norm": 3.5462801456451416, |
|
"learning_rate": 6.553660982948847e-06, |
|
"loss": 0.4613, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 1.6113016790034302, |
|
"grad_norm": 9.32242488861084, |
|
"learning_rate": 6.478435305917753e-06, |
|
"loss": 0.5149, |
|
"step": 17850 |
|
}, |
|
{ |
|
"epoch": 1.6158151290846723, |
|
"grad_norm": 4.5879597663879395, |
|
"learning_rate": 6.40320962888666e-06, |
|
"loss": 0.4189, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 1.6203285791659146, |
|
"grad_norm": 10.474478721618652, |
|
"learning_rate": 6.327983951855567e-06, |
|
"loss": 0.5182, |
|
"step": 17950 |
|
}, |
|
{ |
|
"epoch": 1.6248420292471564, |
|
"grad_norm": 4.693137168884277, |
|
"learning_rate": 6.252758274824474e-06, |
|
"loss": 0.6212, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.6248420292471564, |
|
"eval_exact_match": 85.93188268684958, |
|
"eval_f1": 92.33148266916612, |
|
"eval_runtime": 145.2483, |
|
"eval_samples_per_second": 74.342, |
|
"eval_steps_per_second": 18.589, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.6293554793283986, |
|
"grad_norm": 20.36956214904785, |
|
"learning_rate": 6.17753259779338e-06, |
|
"loss": 0.6194, |
|
"step": 18050 |
|
}, |
|
{ |
|
"epoch": 1.6338689294096407, |
|
"grad_norm": 8.450358390808105, |
|
"learning_rate": 6.102306920762287e-06, |
|
"loss": 0.518, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 1.6383823794908827, |
|
"grad_norm": 4.681309223175049, |
|
"learning_rate": 6.027081243731194e-06, |
|
"loss": 0.4853, |
|
"step": 18150 |
|
}, |
|
{ |
|
"epoch": 1.642895829572125, |
|
"grad_norm": 6.480415344238281, |
|
"learning_rate": 5.9518555667001e-06, |
|
"loss": 0.5101, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 1.647409279653367, |
|
"grad_norm": 11.254326820373535, |
|
"learning_rate": 5.876629889669007e-06, |
|
"loss": 0.5095, |
|
"step": 18250 |
|
}, |
|
{ |
|
"epoch": 1.647409279653367, |
|
"eval_exact_match": 86.03595080416272, |
|
"eval_f1": 92.38608711058883, |
|
"eval_runtime": 145.2161, |
|
"eval_samples_per_second": 74.358, |
|
"eval_steps_per_second": 18.593, |
|
"step": 18250 |
|
}, |
|
{ |
|
"epoch": 1.651922729734609, |
|
"grad_norm": 7.776529788970947, |
|
"learning_rate": 5.801404212637914e-06, |
|
"loss": 0.5322, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 1.6564361798158513, |
|
"grad_norm": 17.068607330322266, |
|
"learning_rate": 5.72617853560682e-06, |
|
"loss": 0.4755, |
|
"step": 18350 |
|
}, |
|
{ |
|
"epoch": 1.6609496298970934, |
|
"grad_norm": 20.472034454345703, |
|
"learning_rate": 5.650952858575727e-06, |
|
"loss": 0.6017, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 1.6654630799783354, |
|
"grad_norm": 4.904719352722168, |
|
"learning_rate": 5.575727181544634e-06, |
|
"loss": 0.4662, |
|
"step": 18450 |
|
}, |
|
{ |
|
"epoch": 1.6699765300595777, |
|
"grad_norm": 7.219258785247803, |
|
"learning_rate": 5.500501504513541e-06, |
|
"loss": 0.4781, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.6699765300595777, |
|
"eval_exact_match": 85.55345316934721, |
|
"eval_f1": 92.32101574735367, |
|
"eval_runtime": 145.1791, |
|
"eval_samples_per_second": 74.377, |
|
"eval_steps_per_second": 18.598, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.6744899801408195, |
|
"grad_norm": 8.811306953430176, |
|
"learning_rate": 5.425275827482447e-06, |
|
"loss": 0.5161, |
|
"step": 18550 |
|
}, |
|
{ |
|
"epoch": 1.6790034302220618, |
|
"grad_norm": 4.903675079345703, |
|
"learning_rate": 5.350050150451354e-06, |
|
"loss": 0.5215, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 1.6835168803033038, |
|
"grad_norm": 2.371656894683838, |
|
"learning_rate": 5.274824473420261e-06, |
|
"loss": 0.486, |
|
"step": 18650 |
|
}, |
|
{ |
|
"epoch": 1.6880303303845459, |
|
"grad_norm": 8.991338729858398, |
|
"learning_rate": 5.199598796389167e-06, |
|
"loss": 0.5043, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 1.6925437804657881, |
|
"grad_norm": 5.460509777069092, |
|
"learning_rate": 5.124373119358074e-06, |
|
"loss": 0.5344, |
|
"step": 18750 |
|
}, |
|
{ |
|
"epoch": 1.6925437804657881, |
|
"eval_exact_match": 86.02649006622516, |
|
"eval_f1": 92.44635703301584, |
|
"eval_runtime": 145.2855, |
|
"eval_samples_per_second": 74.323, |
|
"eval_steps_per_second": 18.584, |
|
"step": 18750 |
|
}, |
|
{ |
|
"epoch": 1.6970572305470302, |
|
"grad_norm": 6.287936210632324, |
|
"learning_rate": 5.049147442326981e-06, |
|
"loss": 0.4446, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 1.7015706806282722, |
|
"grad_norm": 2.3766534328460693, |
|
"learning_rate": 4.973921765295887e-06, |
|
"loss": 0.4618, |
|
"step": 18850 |
|
}, |
|
{ |
|
"epoch": 1.7060841307095145, |
|
"grad_norm": 6.606088161468506, |
|
"learning_rate": 4.898696088264794e-06, |
|
"loss": 0.498, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 1.7105975807907563, |
|
"grad_norm": 7.917613506317139, |
|
"learning_rate": 4.8234704112337015e-06, |
|
"loss": 0.473, |
|
"step": 18950 |
|
}, |
|
{ |
|
"epoch": 1.7151110308719986, |
|
"grad_norm": 13.437002182006836, |
|
"learning_rate": 4.748244734202608e-06, |
|
"loss": 0.5217, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.7151110308719986, |
|
"eval_exact_match": 86.2251655629139, |
|
"eval_f1": 92.51045927542914, |
|
"eval_runtime": 145.2525, |
|
"eval_samples_per_second": 74.34, |
|
"eval_steps_per_second": 18.588, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.7196244809532406, |
|
"grad_norm": 21.756275177001953, |
|
"learning_rate": 4.673019057171515e-06, |
|
"loss": 0.5129, |
|
"step": 19050 |
|
}, |
|
{ |
|
"epoch": 1.7241379310344827, |
|
"grad_norm": 4.2581377029418945, |
|
"learning_rate": 4.5977933801404215e-06, |
|
"loss": 0.5202, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 1.728651381115725, |
|
"grad_norm": 9.396230697631836, |
|
"learning_rate": 4.522567703109328e-06, |
|
"loss": 0.5118, |
|
"step": 19150 |
|
}, |
|
{ |
|
"epoch": 1.733164831196967, |
|
"grad_norm": 9.545235633850098, |
|
"learning_rate": 4.447342026078235e-06, |
|
"loss": 0.4611, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 1.737678281278209, |
|
"grad_norm": 4.104794502258301, |
|
"learning_rate": 4.3721163490471416e-06, |
|
"loss": 0.5737, |
|
"step": 19250 |
|
}, |
|
{ |
|
"epoch": 1.737678281278209, |
|
"eval_exact_match": 86.23462630085146, |
|
"eval_f1": 92.57135940815057, |
|
"eval_runtime": 145.5739, |
|
"eval_samples_per_second": 74.175, |
|
"eval_steps_per_second": 18.547, |
|
"step": 19250 |
|
}, |
|
{ |
|
"epoch": 1.7421917313594513, |
|
"grad_norm": 8.624117851257324, |
|
"learning_rate": 4.296890672016048e-06, |
|
"loss": 0.5349, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 1.7467051814406933, |
|
"grad_norm": 4.802499771118164, |
|
"learning_rate": 4.221664994984955e-06, |
|
"loss": 0.5332, |
|
"step": 19350 |
|
}, |
|
{ |
|
"epoch": 1.7512186315219354, |
|
"grad_norm": 4.347715854644775, |
|
"learning_rate": 4.146439317953862e-06, |
|
"loss": 0.474, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 1.7557320816031776, |
|
"grad_norm": 21.51348114013672, |
|
"learning_rate": 4.071213640922768e-06, |
|
"loss": 0.5182, |
|
"step": 19450 |
|
}, |
|
{ |
|
"epoch": 1.7602455316844194, |
|
"grad_norm": 3.002976655960083, |
|
"learning_rate": 3.995987963891676e-06, |
|
"loss": 0.4243, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.7602455316844194, |
|
"eval_exact_match": 86.44276253547777, |
|
"eval_f1": 92.67282094003843, |
|
"eval_runtime": 145.268, |
|
"eval_samples_per_second": 74.332, |
|
"eval_steps_per_second": 18.586, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.7647589817656617, |
|
"grad_norm": 6.619145393371582, |
|
"learning_rate": 3.920762286860582e-06, |
|
"loss": 0.5754, |
|
"step": 19550 |
|
}, |
|
{ |
|
"epoch": 1.7692724318469037, |
|
"grad_norm": 8.654962539672852, |
|
"learning_rate": 3.845536609829488e-06, |
|
"loss": 0.4927, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 1.7737858819281458, |
|
"grad_norm": 2.102865695953369, |
|
"learning_rate": 3.7703109327983955e-06, |
|
"loss": 0.417, |
|
"step": 19650 |
|
}, |
|
{ |
|
"epoch": 1.778299332009388, |
|
"grad_norm": 9.824490547180176, |
|
"learning_rate": 3.695085255767302e-06, |
|
"loss": 0.5777, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 1.78281278209063, |
|
"grad_norm": 22.286598205566406, |
|
"learning_rate": 3.619859578736209e-06, |
|
"loss": 0.4338, |
|
"step": 19750 |
|
}, |
|
{ |
|
"epoch": 1.78281278209063, |
|
"eval_exact_match": 86.39545884578997, |
|
"eval_f1": 92.63887659164942, |
|
"eval_runtime": 145.3006, |
|
"eval_samples_per_second": 74.315, |
|
"eval_steps_per_second": 18.582, |
|
"step": 19750 |
|
}, |
|
{ |
|
"epoch": 1.7873262321718721, |
|
"grad_norm": 0.6572410464286804, |
|
"learning_rate": 3.5446339017051155e-06, |
|
"loss": 0.4549, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 1.7918396822531144, |
|
"grad_norm": 15.171038627624512, |
|
"learning_rate": 3.469408224674022e-06, |
|
"loss": 0.452, |
|
"step": 19850 |
|
}, |
|
{ |
|
"epoch": 1.7963531323343562, |
|
"grad_norm": 13.550349235534668, |
|
"learning_rate": 3.394182547642929e-06, |
|
"loss": 0.4901, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 1.8008665824155985, |
|
"grad_norm": 9.970057487487793, |
|
"learning_rate": 3.3189568706118356e-06, |
|
"loss": 0.4424, |
|
"step": 19950 |
|
}, |
|
{ |
|
"epoch": 1.8053800324968405, |
|
"grad_norm": 3.251477003097534, |
|
"learning_rate": 3.2437311935807422e-06, |
|
"loss": 0.5214, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.8053800324968405, |
|
"eval_exact_match": 86.10217596972564, |
|
"eval_f1": 92.55287274359681, |
|
"eval_runtime": 145.2611, |
|
"eval_samples_per_second": 74.335, |
|
"eval_steps_per_second": 18.587, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.8098934825780826, |
|
"grad_norm": 3.656310558319092, |
|
"learning_rate": 3.1685055165496493e-06, |
|
"loss": 0.4794, |
|
"step": 20050 |
|
}, |
|
{ |
|
"epoch": 1.8144069326593248, |
|
"grad_norm": 6.139503479003906, |
|
"learning_rate": 3.0932798395185556e-06, |
|
"loss": 0.4485, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 1.8189203827405669, |
|
"grad_norm": 6.566440582275391, |
|
"learning_rate": 3.0180541624874623e-06, |
|
"loss": 0.456, |
|
"step": 20150 |
|
}, |
|
{ |
|
"epoch": 1.823433832821809, |
|
"grad_norm": 6.406381130218506, |
|
"learning_rate": 2.9428284854563694e-06, |
|
"loss": 0.4988, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 1.8279472829030512, |
|
"grad_norm": 4.750673770904541, |
|
"learning_rate": 2.8676028084252757e-06, |
|
"loss": 0.5102, |
|
"step": 20250 |
|
}, |
|
{ |
|
"epoch": 1.8279472829030512, |
|
"eval_exact_match": 86.0643330179754, |
|
"eval_f1": 92.48790625031062, |
|
"eval_runtime": 145.6532, |
|
"eval_samples_per_second": 74.135, |
|
"eval_steps_per_second": 18.537, |
|
"step": 20250 |
|
}, |
|
{ |
|
"epoch": 1.8324607329842932, |
|
"grad_norm": 2.9004476070404053, |
|
"learning_rate": 2.7923771313941828e-06, |
|
"loss": 0.4172, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 1.8369741830655353, |
|
"grad_norm": 5.453982353210449, |
|
"learning_rate": 2.7171514543630894e-06, |
|
"loss": 0.4328, |
|
"step": 20350 |
|
}, |
|
{ |
|
"epoch": 1.8414876331467775, |
|
"grad_norm": 6.562243461608887, |
|
"learning_rate": 2.6419257773319957e-06, |
|
"loss": 0.5946, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 1.8460010832280194, |
|
"grad_norm": 2.186967134475708, |
|
"learning_rate": 2.566700100300903e-06, |
|
"loss": 0.5174, |
|
"step": 20450 |
|
}, |
|
{ |
|
"epoch": 1.8505145333092616, |
|
"grad_norm": 18.740962982177734, |
|
"learning_rate": 2.4914744232698095e-06, |
|
"loss": 0.5614, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.8505145333092616, |
|
"eval_exact_match": 86.24408703878902, |
|
"eval_f1": 92.6215979315234, |
|
"eval_runtime": 145.5466, |
|
"eval_samples_per_second": 74.189, |
|
"eval_steps_per_second": 18.551, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.8550279833905037, |
|
"grad_norm": 8.845901489257812, |
|
"learning_rate": 2.416248746238716e-06, |
|
"loss": 0.5174, |
|
"step": 20550 |
|
}, |
|
{ |
|
"epoch": 1.8595414334717457, |
|
"grad_norm": 2.0935425758361816, |
|
"learning_rate": 2.341023069207623e-06, |
|
"loss": 0.4191, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 1.864054883552988, |
|
"grad_norm": 5.225878715515137, |
|
"learning_rate": 2.2657973921765295e-06, |
|
"loss": 0.5168, |
|
"step": 20650 |
|
}, |
|
{ |
|
"epoch": 1.86856833363423, |
|
"grad_norm": 3.618779182434082, |
|
"learning_rate": 2.1905717151454362e-06, |
|
"loss": 0.5116, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 1.873081783715472, |
|
"grad_norm": 14.580885887145996, |
|
"learning_rate": 2.115346038114343e-06, |
|
"loss": 0.5247, |
|
"step": 20750 |
|
}, |
|
{ |
|
"epoch": 1.873081783715472, |
|
"eval_exact_match": 86.39545884578997, |
|
"eval_f1": 92.68082620123108, |
|
"eval_runtime": 144.0826, |
|
"eval_samples_per_second": 74.943, |
|
"eval_steps_per_second": 18.739, |
|
"step": 20750 |
|
}, |
|
{ |
|
"epoch": 1.8775952337967143, |
|
"grad_norm": 6.323169708251953, |
|
"learning_rate": 2.04012036108325e-06, |
|
"loss": 0.5007, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 1.8821086838779562, |
|
"grad_norm": 7.055742263793945, |
|
"learning_rate": 1.9648946840521567e-06, |
|
"loss": 0.5387, |
|
"step": 20850 |
|
}, |
|
{ |
|
"epoch": 1.8866221339591984, |
|
"grad_norm": 6.097321033477783, |
|
"learning_rate": 1.8896690070210632e-06, |
|
"loss": 0.4795, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 1.8911355840404405, |
|
"grad_norm": 20.577049255371094, |
|
"learning_rate": 1.81444332998997e-06, |
|
"loss": 0.5636, |
|
"step": 20950 |
|
}, |
|
{ |
|
"epoch": 1.8956490341216825, |
|
"grad_norm": 11.891510009765625, |
|
"learning_rate": 1.7392176529588768e-06, |
|
"loss": 0.5768, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.8956490341216825, |
|
"eval_exact_match": 86.28192999053927, |
|
"eval_f1": 92.59481897569101, |
|
"eval_runtime": 143.5237, |
|
"eval_samples_per_second": 75.235, |
|
"eval_steps_per_second": 18.812, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.9001624842029248, |
|
"grad_norm": 28.808475494384766, |
|
"learning_rate": 1.6639919759277832e-06, |
|
"loss": 0.5267, |
|
"step": 21050 |
|
}, |
|
{ |
|
"epoch": 1.9046759342841668, |
|
"grad_norm": 11.045042991638184, |
|
"learning_rate": 1.5887662988966901e-06, |
|
"loss": 0.5508, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 1.9091893843654089, |
|
"grad_norm": 4.862325191497803, |
|
"learning_rate": 1.5135406218655968e-06, |
|
"loss": 0.4697, |
|
"step": 21150 |
|
}, |
|
{ |
|
"epoch": 1.9137028344466511, |
|
"grad_norm": 13.387544631958008, |
|
"learning_rate": 1.4383149448345037e-06, |
|
"loss": 0.4416, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 1.9182162845278932, |
|
"grad_norm": 5.2306342124938965, |
|
"learning_rate": 1.3630892678034104e-06, |
|
"loss": 0.4124, |
|
"step": 21250 |
|
}, |
|
{ |
|
"epoch": 1.9182162845278932, |
|
"eval_exact_match": 86.26300851466415, |
|
"eval_f1": 92.60255176769716, |
|
"eval_runtime": 143.5894, |
|
"eval_samples_per_second": 75.201, |
|
"eval_steps_per_second": 18.804, |
|
"step": 21250 |
|
}, |
|
{ |
|
"epoch": 1.9227297346091352, |
|
"grad_norm": 29.35004234313965, |
|
"learning_rate": 1.2878635907723169e-06, |
|
"loss": 0.5344, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 1.9272431846903775, |
|
"grad_norm": 8.634255409240723, |
|
"learning_rate": 1.2126379137412237e-06, |
|
"loss": 0.4815, |
|
"step": 21350 |
|
}, |
|
{ |
|
"epoch": 1.9317566347716193, |
|
"grad_norm": 8.262895584106445, |
|
"learning_rate": 1.1374122367101304e-06, |
|
"loss": 0.4939, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 1.9362700848528616, |
|
"grad_norm": 12.539655685424805, |
|
"learning_rate": 1.0621865596790371e-06, |
|
"loss": 0.4957, |
|
"step": 21450 |
|
}, |
|
{ |
|
"epoch": 1.9407835349341036, |
|
"grad_norm": 9.728516578674316, |
|
"learning_rate": 9.86960882647944e-07, |
|
"loss": 0.4587, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.9407835349341036, |
|
"eval_exact_match": 86.14001892147587, |
|
"eval_f1": 92.57583651293868, |
|
"eval_runtime": 143.5952, |
|
"eval_samples_per_second": 75.197, |
|
"eval_steps_per_second": 18.803, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.9452969850153456, |
|
"grad_norm": 9.588170051574707, |
|
"learning_rate": 9.117352056168506e-07, |
|
"loss": 0.5395, |
|
"step": 21550 |
|
}, |
|
{ |
|
"epoch": 1.949810435096588, |
|
"grad_norm": 14.394529342651367, |
|
"learning_rate": 8.365095285857573e-07, |
|
"loss": 0.5462, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 1.95432388517783, |
|
"grad_norm": 17.767173767089844, |
|
"learning_rate": 7.612838515546641e-07, |
|
"loss": 0.4525, |
|
"step": 21650 |
|
}, |
|
{ |
|
"epoch": 1.958837335259072, |
|
"grad_norm": 15.962186813354492, |
|
"learning_rate": 6.860581745235707e-07, |
|
"loss": 0.6349, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 1.9633507853403143, |
|
"grad_norm": 3.5042107105255127, |
|
"learning_rate": 6.108324974924774e-07, |
|
"loss": 0.4903, |
|
"step": 21750 |
|
}, |
|
{ |
|
"epoch": 1.9633507853403143, |
|
"eval_exact_match": 86.30085146641439, |
|
"eval_f1": 92.62412689139829, |
|
"eval_runtime": 143.4548, |
|
"eval_samples_per_second": 75.271, |
|
"eval_steps_per_second": 18.821, |
|
"step": 21750 |
|
}, |
|
{ |
|
"epoch": 1.967864235421556, |
|
"grad_norm": 3.967465877532959, |
|
"learning_rate": 5.356068204613842e-07, |
|
"loss": 0.4477, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 1.9723776855027984, |
|
"grad_norm": 13.418035507202148, |
|
"learning_rate": 4.603811434302909e-07, |
|
"loss": 0.4418, |
|
"step": 21850 |
|
}, |
|
{ |
|
"epoch": 1.9768911355840404, |
|
"grad_norm": 8.183111190795898, |
|
"learning_rate": 3.851554663991976e-07, |
|
"loss": 0.5417, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 1.9814045856652824, |
|
"grad_norm": 5.646338939666748, |
|
"learning_rate": 3.099297893681043e-07, |
|
"loss": 0.5065, |
|
"step": 21950 |
|
}, |
|
{ |
|
"epoch": 1.9859180357465247, |
|
"grad_norm": 14.187732696533203, |
|
"learning_rate": 2.3470411233701103e-07, |
|
"loss": 0.4325, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.9859180357465247, |
|
"eval_exact_match": 86.35761589403974, |
|
"eval_f1": 92.66264597808306, |
|
"eval_runtime": 143.145, |
|
"eval_samples_per_second": 75.434, |
|
"eval_steps_per_second": 18.862, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.9904314858277667, |
|
"grad_norm": 24.069639205932617, |
|
"learning_rate": 1.5947843530591774e-07, |
|
"loss": 0.4079, |
|
"step": 22050 |
|
}, |
|
{ |
|
"epoch": 1.9949449359090088, |
|
"grad_norm": 9.552345275878906, |
|
"learning_rate": 8.425275827482447e-08, |
|
"loss": 0.4418, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 1.999458385990251, |
|
"grad_norm": 11.15715503692627, |
|
"learning_rate": 9.027081243731194e-09, |
|
"loss": 0.519, |
|
"step": 22150 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 22156, |
|
"total_flos": 4.529540706059981e+16, |
|
"train_loss": 0.8456309766066937, |
|
"train_runtime": 23960.3876, |
|
"train_samples_per_second": 7.397, |
|
"train_steps_per_second": 0.925 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 22156, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.529540706059981e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|