diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,6 +1,6 @@ { - "best_metric": 0.0070460401475429535, - "best_model_checkpoint": "./all-observation-type/checkpoint-1300", + "best_metric": 0.007334939204156399, + "best_model_checkpoint": "./all-observation-type/checkpoint-400", "epoch": 40.0, "eval_steps": 100, "global_step": 6880, @@ -10,5440 +10,5440 @@ "log_history": [ { "epoch": 0.05813953488372093, - "grad_norm": 0.0041476087644696236, + "grad_norm": 0.017299937084317207, "learning_rate": 0.0001997093023255814, - "loss": 0.0128, + "loss": 0.0049, "step": 10 }, { "epoch": 0.11627906976744186, - "grad_norm": 0.0033479102421551943, + "grad_norm": 0.015242324210703373, "learning_rate": 0.0001994186046511628, - "loss": 0.0107, + "loss": 0.0069, "step": 20 }, { "epoch": 0.1744186046511628, - "grad_norm": 0.0034534952137619257, + "grad_norm": 0.010356844402849674, "learning_rate": 0.0001991279069767442, - "loss": 0.0113, + "loss": 0.0087, "step": 30 }, { "epoch": 0.23255813953488372, - "grad_norm": 0.003540867706760764, + "grad_norm": 0.009658879600465298, "learning_rate": 0.0001988372093023256, - "loss": 0.0099, + "loss": 0.0075, "step": 40 }, { "epoch": 0.29069767441860467, - "grad_norm": 0.0035517301876097918, + "grad_norm": 0.010567049495875835, "learning_rate": 0.00019854651162790697, - "loss": 0.0098, + "loss": 0.0071, "step": 50 }, { "epoch": 0.3488372093023256, - "grad_norm": 0.0030546970665454865, + "grad_norm": 0.007140299770981073, "learning_rate": 0.00019825581395348837, - "loss": 0.0105, + "loss": 0.0078, "step": 60 }, { "epoch": 0.4069767441860465, - "grad_norm": 0.0026506497524678707, + "grad_norm": 0.006361471954733133, "learning_rate": 0.00019796511627906977, - "loss": 0.0097, + "loss": 0.0067, "step": 70 }, { "epoch": 0.46511627906976744, - "grad_norm": 0.003407118609175086, + "grad_norm": 0.00814775750041008, "learning_rate": 0.00019767441860465116, - "loss": 0.0099, + "loss": 0.0076, "step": 80 }, { "epoch": 0.5232558139534884, - "grad_norm": 0.0033446382731199265, + "grad_norm": 0.012026441283524036, "learning_rate": 0.00019738372093023256, - "loss": 0.0099, + "loss": 0.0075, "step": 90 }, { "epoch": 0.5813953488372093, - "grad_norm": 0.003635579254478216, + "grad_norm": 0.013495495542883873, "learning_rate": 0.00019709302325581396, - "loss": 0.0095, + "loss": 0.007, "step": 100 }, { "epoch": 0.5813953488372093, - "eval_f1": 0.0, - "eval_loss": 0.010107280686497688, - "eval_runtime": 3.2981, - "eval_samples_per_second": 43.965, - "eval_steps_per_second": 5.761, + "eval_f1": 0.04482758620689655, + "eval_loss": 0.008567324839532375, + "eval_runtime": 3.4629, + "eval_samples_per_second": 41.872, + "eval_steps_per_second": 5.487, "step": 100 }, { "epoch": 0.6395348837209303, - "grad_norm": 0.004401995334774256, + "grad_norm": 0.011290419846773148, "learning_rate": 0.00019680232558139536, - "loss": 0.0092, + "loss": 0.0069, "step": 110 }, { "epoch": 0.6976744186046512, - "grad_norm": 0.0034081449266523123, + "grad_norm": 0.007751344237476587, "learning_rate": 0.00019651162790697676, - "loss": 0.0092, + "loss": 0.0066, "step": 120 }, { "epoch": 0.7558139534883721, - "grad_norm": 0.0033064705785363913, + "grad_norm": 0.00781613402068615, "learning_rate": 0.00019622093023255815, - "loss": 0.0089, + "loss": 0.0059, "step": 130 }, { "epoch": 0.813953488372093, - "grad_norm": 0.003100926522165537, + "grad_norm": 0.007235790602862835, "learning_rate": 0.00019593023255813952, - "loss": 0.0092, + "loss": 0.0061, "step": 140 }, { "epoch": 0.872093023255814, - "grad_norm": 0.0038867127150297165, + "grad_norm": 0.012400178238749504, "learning_rate": 0.00019563953488372092, - "loss": 0.0091, + "loss": 0.0061, "step": 150 }, { "epoch": 0.9302325581395349, - "grad_norm": 0.002387130632996559, + "grad_norm": 0.007190878503024578, "learning_rate": 0.00019534883720930232, - "loss": 0.0099, + "loss": 0.0065, "step": 160 }, { "epoch": 0.9883720930232558, - "grad_norm": 0.0035301295574754477, + "grad_norm": 0.007438919506967068, "learning_rate": 0.00019505813953488372, - "loss": 0.0093, + "loss": 0.0058, "step": 170 }, { "epoch": 1.0465116279069768, - "grad_norm": 0.0028093892615288496, + "grad_norm": 0.0056468648836016655, "learning_rate": 0.00019476744186046511, - "loss": 0.0089, + "loss": 0.0053, "step": 180 }, { "epoch": 1.1046511627906976, - "grad_norm": 0.003535624360665679, + "grad_norm": 0.006276940926909447, "learning_rate": 0.0001944767441860465, - "loss": 0.0096, + "loss": 0.0056, "step": 190 }, { "epoch": 1.1627906976744187, - "grad_norm": 0.003256190102547407, + "grad_norm": 0.009605356492102146, "learning_rate": 0.0001941860465116279, - "loss": 0.009, + "loss": 0.0046, "step": 200 }, { "epoch": 1.1627906976744187, - "eval_f1": 0.0, - "eval_loss": 0.009706242009997368, - "eval_runtime": 3.4352, - "eval_samples_per_second": 42.211, - "eval_steps_per_second": 5.531, + "eval_f1": 0.1396551724137931, + "eval_loss": 0.007882812060415745, + "eval_runtime": 3.4027, + "eval_samples_per_second": 42.613, + "eval_steps_per_second": 5.584, "step": 200 }, { "epoch": 1.2209302325581395, - "grad_norm": 0.0036612502299249172, + "grad_norm": 0.01098948810249567, "learning_rate": 0.0001938953488372093, - "loss": 0.0093, + "loss": 0.0053, "step": 210 }, { "epoch": 1.2790697674418605, - "grad_norm": 0.0037002083845436573, + "grad_norm": 0.006418940145522356, "learning_rate": 0.0001936046511627907, - "loss": 0.0085, + "loss": 0.0045, "step": 220 }, { "epoch": 1.3372093023255813, - "grad_norm": 0.0040439036674797535, + "grad_norm": 0.01644039899110794, "learning_rate": 0.0001933139534883721, - "loss": 0.0085, + "loss": 0.0051, "step": 230 }, { "epoch": 1.3953488372093024, - "grad_norm": 0.0030805468559265137, + "grad_norm": 0.0071088275872170925, "learning_rate": 0.0001930232558139535, - "loss": 0.0087, + "loss": 0.0055, "step": 240 }, { "epoch": 1.4534883720930232, - "grad_norm": 0.0032109718304127455, + "grad_norm": 0.009615642949938774, "learning_rate": 0.0001927325581395349, - "loss": 0.0094, + "loss": 0.0054, "step": 250 }, { "epoch": 1.5116279069767442, - "grad_norm": 0.004008762538433075, + "grad_norm": 0.008527752943336964, "learning_rate": 0.0001924418604651163, - "loss": 0.0085, + "loss": 0.0052, "step": 260 }, { "epoch": 1.5697674418604652, - "grad_norm": 0.0038238398265093565, + "grad_norm": 0.006476707756519318, "learning_rate": 0.0001921511627906977, - "loss": 0.0086, + "loss": 0.0048, "step": 270 }, { "epoch": 1.627906976744186, - "grad_norm": 0.0034297690726816654, + "grad_norm": 0.010913817211985588, "learning_rate": 0.0001918604651162791, - "loss": 0.0086, + "loss": 0.0044, "step": 280 }, { "epoch": 1.6860465116279069, - "grad_norm": 0.004734665155410767, + "grad_norm": 0.006241935305297375, "learning_rate": 0.0001915697674418605, - "loss": 0.0074, + "loss": 0.0046, "step": 290 }, { "epoch": 1.744186046511628, - "grad_norm": 0.004433305002748966, + "grad_norm": 0.009475222788751125, "learning_rate": 0.0001912790697674419, - "loss": 0.0081, + "loss": 0.0047, "step": 300 }, { "epoch": 1.744186046511628, - "eval_f1": 0.0, - "eval_loss": 0.009246854111552238, - "eval_runtime": 3.3654, - "eval_samples_per_second": 43.085, - "eval_steps_per_second": 5.646, + "eval_f1": 0.10344827586206896, + "eval_loss": 0.007474210113286972, + "eval_runtime": 3.3899, + "eval_samples_per_second": 42.775, + "eval_steps_per_second": 5.605, "step": 300 }, { "epoch": 1.802325581395349, - "grad_norm": 0.003982495050877333, + "grad_norm": 0.0077681937254965305, "learning_rate": 0.00019098837209302326, - "loss": 0.0086, + "loss": 0.0051, "step": 310 }, { "epoch": 1.8604651162790697, - "grad_norm": 0.005557681433856487, + "grad_norm": 0.008732211776077747, "learning_rate": 0.00019069767441860466, - "loss": 0.0092, + "loss": 0.0054, "step": 320 }, { "epoch": 1.9186046511627906, - "grad_norm": 0.003640982089564204, + "grad_norm": 0.004988688975572586, "learning_rate": 0.00019040697674418605, - "loss": 0.0081, + "loss": 0.0044, "step": 330 }, { "epoch": 1.9767441860465116, - "grad_norm": 0.004217673558741808, + "grad_norm": 0.012183120474219322, "learning_rate": 0.00019011627906976745, - "loss": 0.0088, + "loss": 0.0058, "step": 340 }, { "epoch": 2.0348837209302326, - "grad_norm": 0.00363196455873549, + "grad_norm": 0.006278990302234888, "learning_rate": 0.00018982558139534885, - "loss": 0.008, + "loss": 0.0041, "step": 350 }, { "epoch": 2.0930232558139537, - "grad_norm": 0.004549409728497267, + "grad_norm": 0.005961701739579439, "learning_rate": 0.00018953488372093025, - "loss": 0.0081, + "loss": 0.0042, "step": 360 }, { "epoch": 2.1511627906976742, - "grad_norm": 0.00565100833773613, + "grad_norm": 0.005051076877862215, "learning_rate": 0.00018924418604651164, - "loss": 0.0078, + "loss": 0.004, "step": 370 }, { "epoch": 2.2093023255813953, - "grad_norm": 0.003489867551252246, + "grad_norm": 0.008163918741047382, "learning_rate": 0.00018895348837209304, - "loss": 0.0081, + "loss": 0.0043, "step": 380 }, { "epoch": 2.2674418604651163, - "grad_norm": 0.006364505272358656, + "grad_norm": 0.007578638382256031, "learning_rate": 0.00018866279069767444, - "loss": 0.0087, + "loss": 0.0041, "step": 390 }, { "epoch": 2.3255813953488373, - "grad_norm": 0.003716991748660803, + "grad_norm": 0.004077777732163668, "learning_rate": 0.00018837209302325584, - "loss": 0.0078, + "loss": 0.0032, "step": 400 }, { "epoch": 2.3255813953488373, - "eval_f1": 0.0, - "eval_loss": 0.008736548013985157, - "eval_runtime": 3.3469, - "eval_samples_per_second": 43.324, - "eval_steps_per_second": 5.677, + "eval_f1": 0.18192665571975916, + "eval_loss": 0.007334939204156399, + "eval_runtime": 3.5574, + "eval_samples_per_second": 40.76, + "eval_steps_per_second": 5.341, "step": 400 }, { "epoch": 2.383720930232558, - "grad_norm": 0.003629864426329732, + "grad_norm": 0.007896770723164082, "learning_rate": 0.0001880813953488372, - "loss": 0.0077, + "loss": 0.0037, "step": 410 }, { "epoch": 2.441860465116279, - "grad_norm": 0.01104139443486929, + "grad_norm": 0.004801823291927576, "learning_rate": 0.0001877906976744186, - "loss": 0.0079, + "loss": 0.0036, "step": 420 }, { "epoch": 2.5, - "grad_norm": 0.010308067314326763, + "grad_norm": 0.006228045094758272, "learning_rate": 0.0001875, - "loss": 0.008, + "loss": 0.0042, "step": 430 }, { "epoch": 2.558139534883721, - "grad_norm": 0.005923152901232243, + "grad_norm": 0.005940658040344715, "learning_rate": 0.0001872093023255814, - "loss": 0.0081, + "loss": 0.0042, "step": 440 }, { "epoch": 2.616279069767442, - "grad_norm": 0.003238389501348138, + "grad_norm": 0.004676891956478357, "learning_rate": 0.0001869186046511628, - "loss": 0.0073, + "loss": 0.0037, "step": 450 }, { "epoch": 2.6744186046511627, - "grad_norm": 0.006099153310060501, + "grad_norm": 0.007892576046288013, "learning_rate": 0.0001866279069767442, - "loss": 0.0078, + "loss": 0.0039, "step": 460 }, { "epoch": 2.7325581395348837, - "grad_norm": 0.0038867320399731398, + "grad_norm": 0.006186337675899267, "learning_rate": 0.0001863372093023256, - "loss": 0.0071, + "loss": 0.0035, "step": 470 }, { "epoch": 2.7906976744186047, - "grad_norm": 0.004797995090484619, + "grad_norm": 0.00394851341843605, "learning_rate": 0.000186046511627907, - "loss": 0.0074, + "loss": 0.0033, "step": 480 }, { "epoch": 2.8488372093023253, - "grad_norm": 0.0057796696200966835, + "grad_norm": 0.011999442242085934, "learning_rate": 0.0001857558139534884, - "loss": 0.0082, + "loss": 0.0042, "step": 490 }, { "epoch": 2.9069767441860463, - "grad_norm": 0.0043983496725559235, + "grad_norm": 0.009115585125982761, "learning_rate": 0.00018546511627906976, - "loss": 0.0076, + "loss": 0.0038, "step": 500 }, { "epoch": 2.9069767441860463, - "eval_f1": 0.0, - "eval_loss": 0.00865822285413742, - "eval_runtime": 3.4294, - "eval_samples_per_second": 42.281, - "eval_steps_per_second": 5.54, + "eval_f1": 0.16376573617952928, + "eval_loss": 0.007854057475924492, + "eval_runtime": 3.4136, + "eval_samples_per_second": 42.477, + "eval_steps_per_second": 5.566, "step": 500 }, { "epoch": 2.9651162790697674, - "grad_norm": 0.004557132720947266, + "grad_norm": 0.007015453651547432, "learning_rate": 0.00018517441860465116, - "loss": 0.0075, + "loss": 0.0035, "step": 510 }, { "epoch": 3.0232558139534884, - "grad_norm": 0.01413981057703495, + "grad_norm": 0.01785508170723915, "learning_rate": 0.00018488372093023256, - "loss": 0.0085, + "loss": 0.0037, "step": 520 }, { "epoch": 3.0813953488372094, - "grad_norm": 0.005286430940032005, + "grad_norm": 0.00528199365362525, "learning_rate": 0.00018459302325581395, - "loss": 0.0075, + "loss": 0.0031, "step": 530 }, { "epoch": 3.13953488372093, - "grad_norm": 0.003948388621211052, + "grad_norm": 0.005166070070117712, "learning_rate": 0.00018430232558139535, - "loss": 0.0067, + "loss": 0.0029, "step": 540 }, { "epoch": 3.197674418604651, - "grad_norm": 0.003928500227630138, + "grad_norm": 0.007344888988882303, "learning_rate": 0.00018401162790697675, - "loss": 0.0068, + "loss": 0.0032, "step": 550 }, { "epoch": 3.255813953488372, - "grad_norm": 0.004227056168019772, + "grad_norm": 0.005858028307557106, "learning_rate": 0.00018372093023255815, - "loss": 0.0067, + "loss": 0.0034, "step": 560 }, { "epoch": 3.313953488372093, - "grad_norm": 0.004199563525617123, + "grad_norm": 0.004915835801512003, "learning_rate": 0.00018343023255813955, - "loss": 0.0069, + "loss": 0.003, "step": 570 }, { "epoch": 3.3720930232558137, - "grad_norm": 0.0067247967235744, + "grad_norm": 0.0058327000588178635, "learning_rate": 0.00018313953488372094, - "loss": 0.007, + "loss": 0.003, "step": 580 }, { "epoch": 3.4302325581395348, - "grad_norm": 0.0035736667923629284, + "grad_norm": 0.010060491971671581, "learning_rate": 0.0001828488372093023, - "loss": 0.0068, + "loss": 0.0031, "step": 590 }, { "epoch": 3.488372093023256, - "grad_norm": 0.0050314306281507015, + "grad_norm": 0.005528128705918789, "learning_rate": 0.0001825581395348837, - "loss": 0.007, + "loss": 0.0036, "step": 600 }, { "epoch": 3.488372093023256, - "eval_f1": 0.0, - "eval_loss": 0.007984375581145287, - "eval_runtime": 3.3821, - "eval_samples_per_second": 42.873, - "eval_steps_per_second": 5.618, + "eval_f1": 0.1789272030651341, + "eval_loss": 0.0075110774487257, + "eval_runtime": 3.4484, + "eval_samples_per_second": 42.049, + "eval_steps_per_second": 5.51, "step": 600 }, { "epoch": 3.546511627906977, - "grad_norm": 0.0058385697193443775, + "grad_norm": 0.008280656300485134, "learning_rate": 0.0001822674418604651, - "loss": 0.0072, + "loss": 0.0033, "step": 610 }, { "epoch": 3.604651162790698, - "grad_norm": 0.004391437862068415, + "grad_norm": 0.004866399802267551, "learning_rate": 0.0001819767441860465, - "loss": 0.0065, + "loss": 0.0031, "step": 620 }, { "epoch": 3.6627906976744184, - "grad_norm": 0.0044134631752967834, + "grad_norm": 0.005611503962427378, "learning_rate": 0.0001816860465116279, - "loss": 0.0071, + "loss": 0.0032, "step": 630 }, { "epoch": 3.7209302325581395, - "grad_norm": 0.00653222668915987, + "grad_norm": 0.010018990375101566, "learning_rate": 0.0001813953488372093, - "loss": 0.0067, + "loss": 0.0033, "step": 640 }, { "epoch": 3.7790697674418605, - "grad_norm": 0.004163091536611319, + "grad_norm": 0.0037976368330419064, "learning_rate": 0.00018110465116279073, - "loss": 0.0066, + "loss": 0.0031, "step": 650 }, { "epoch": 3.8372093023255816, - "grad_norm": 0.005467016715556383, + "grad_norm": 0.009306875988841057, "learning_rate": 0.00018081395348837212, - "loss": 0.0072, + "loss": 0.0034, "step": 660 }, { "epoch": 3.895348837209302, - "grad_norm": 0.00349302776157856, + "grad_norm": 0.0037058594170957804, "learning_rate": 0.0001805232558139535, - "loss": 0.0073, + "loss": 0.0035, "step": 670 }, { "epoch": 3.953488372093023, - "grad_norm": 0.004969348665326834, + "grad_norm": 0.0047531514428555965, "learning_rate": 0.0001802325581395349, - "loss": 0.0073, + "loss": 0.0035, "step": 680 }, { "epoch": 4.011627906976744, - "grad_norm": 0.004739751107990742, + "grad_norm": 0.006104028318077326, "learning_rate": 0.0001799418604651163, - "loss": 0.0067, + "loss": 0.0031, "step": 690 }, { "epoch": 4.069767441860465, - "grad_norm": 0.004799176007509232, + "grad_norm": 0.004128449596464634, "learning_rate": 0.0001796511627906977, - "loss": 0.0059, + "loss": 0.0024, "step": 700 }, { "epoch": 4.069767441860465, - "eval_f1": 0.0034482758620689655, - "eval_loss": 0.007861158810555935, - "eval_runtime": 3.5733, - "eval_samples_per_second": 40.579, - "eval_steps_per_second": 5.317, + "eval_f1": 0.2093486590038314, + "eval_loss": 0.0075767748057842255, + "eval_runtime": 3.4263, + "eval_samples_per_second": 42.319, + "eval_steps_per_second": 5.545, "step": 700 }, { "epoch": 4.127906976744186, - "grad_norm": 0.0049268933944404125, + "grad_norm": 0.003599094692617655, "learning_rate": 0.00017936046511627909, - "loss": 0.0063, + "loss": 0.0028, "step": 710 }, { "epoch": 4.186046511627907, - "grad_norm": 0.004747786559164524, + "grad_norm": 0.004095184151083231, "learning_rate": 0.00017906976744186048, - "loss": 0.0063, + "loss": 0.003, "step": 720 }, { "epoch": 4.2441860465116275, - "grad_norm": 0.0038926389534026384, + "grad_norm": 0.0033232627902179956, "learning_rate": 0.00017877906976744188, - "loss": 0.006, + "loss": 0.0027, "step": 730 }, { "epoch": 4.3023255813953485, - "grad_norm": 0.006649950984865427, + "grad_norm": 0.0075360932387411594, "learning_rate": 0.00017848837209302328, - "loss": 0.0068, + "loss": 0.0029, "step": 740 }, { "epoch": 4.3604651162790695, - "grad_norm": 0.005653685890138149, + "grad_norm": 0.004825787618756294, "learning_rate": 0.00017819767441860468, - "loss": 0.0065, + "loss": 0.0027, "step": 750 }, { "epoch": 4.4186046511627906, - "grad_norm": 0.004142285790294409, + "grad_norm": 0.004639455582946539, "learning_rate": 0.00017790697674418605, - "loss": 0.0057, + "loss": 0.0022, "step": 760 }, { "epoch": 4.476744186046512, - "grad_norm": 0.00737773347645998, + "grad_norm": 0.0050194947980344296, "learning_rate": 0.00017761627906976745, - "loss": 0.0061, + "loss": 0.0027, "step": 770 }, { "epoch": 4.534883720930233, - "grad_norm": 0.003984431270509958, + "grad_norm": 0.004107888787984848, "learning_rate": 0.00017732558139534884, - "loss": 0.0059, + "loss": 0.0024, "step": 780 }, { "epoch": 4.593023255813954, - "grad_norm": 0.004819260910153389, + "grad_norm": 0.0041518546640872955, "learning_rate": 0.00017703488372093024, - "loss": 0.0059, + "loss": 0.0026, "step": 790 }, { "epoch": 4.651162790697675, - "grad_norm": 0.006082874722778797, + "grad_norm": 0.0043709296733140945, "learning_rate": 0.00017674418604651164, - "loss": 0.0058, + "loss": 0.0025, "step": 800 }, { "epoch": 4.651162790697675, - "eval_f1": 0.04597701149425287, - "eval_loss": 0.007556948345154524, - "eval_runtime": 3.3888, - "eval_samples_per_second": 42.788, - "eval_steps_per_second": 5.607, + "eval_f1": 0.20008757525998905, + "eval_loss": 0.007771243806928396, + "eval_runtime": 3.4804, + "eval_samples_per_second": 41.662, + "eval_steps_per_second": 5.459, "step": 800 }, { "epoch": 4.709302325581396, - "grad_norm": 0.005633768625557423, + "grad_norm": 0.003510788083076477, "learning_rate": 0.00017645348837209304, - "loss": 0.0061, + "loss": 0.0026, "step": 810 }, { "epoch": 4.767441860465116, - "grad_norm": 0.005370887462049723, + "grad_norm": 0.008073955774307251, "learning_rate": 0.00017616279069767443, - "loss": 0.006, + "loss": 0.0026, "step": 820 }, { "epoch": 4.825581395348837, - "grad_norm": 0.006756180431693792, + "grad_norm": 0.005666918121278286, "learning_rate": 0.00017587209302325583, - "loss": 0.0063, + "loss": 0.0026, "step": 830 }, { "epoch": 4.883720930232558, - "grad_norm": 0.010633228346705437, + "grad_norm": 0.02320161648094654, "learning_rate": 0.00017558139534883723, - "loss": 0.0065, + "loss": 0.0027, "step": 840 }, { "epoch": 4.941860465116279, - "grad_norm": 0.008287597447633743, + "grad_norm": 0.005332824774086475, "learning_rate": 0.0001752906976744186, - "loss": 0.0068, + "loss": 0.0029, "step": 850 }, { "epoch": 5.0, - "grad_norm": 0.005023573059588671, + "grad_norm": 0.005581226199865341, "learning_rate": 0.000175, - "loss": 0.0066, + "loss": 0.0026, "step": 860 }, { "epoch": 5.058139534883721, - "grad_norm": 0.002978923264890909, + "grad_norm": 0.005100961308926344, "learning_rate": 0.0001747093023255814, - "loss": 0.0053, + "loss": 0.0021, "step": 870 }, { "epoch": 5.116279069767442, - "grad_norm": 0.003913262393325567, + "grad_norm": 0.004550059791654348, "learning_rate": 0.0001744186046511628, - "loss": 0.0056, + "loss": 0.0022, "step": 880 }, { "epoch": 5.174418604651163, - "grad_norm": 0.004473670851439238, + "grad_norm": 0.005096370819956064, "learning_rate": 0.0001741279069767442, - "loss": 0.006, + "loss": 0.0026, "step": 890 }, { "epoch": 5.232558139534884, - "grad_norm": 0.005170521326363087, + "grad_norm": 0.004697335418313742, "learning_rate": 0.0001738372093023256, - "loss": 0.0058, + "loss": 0.0021, "step": 900 }, { "epoch": 5.232558139534884, - "eval_f1": 0.06371100164203612, - "eval_loss": 0.007500209379941225, - "eval_runtime": 3.3654, - "eval_samples_per_second": 43.085, - "eval_steps_per_second": 5.646, + "eval_f1": 0.22375478927203063, + "eval_loss": 0.008321760222315788, + "eval_runtime": 3.5066, + "eval_samples_per_second": 41.35, + "eval_steps_per_second": 5.418, "step": 900 }, { "epoch": 5.290697674418604, - "grad_norm": 0.008084526285529137, + "grad_norm": 0.006415514275431633, "learning_rate": 0.000173546511627907, - "loss": 0.0061, + "loss": 0.0026, "step": 910 }, { "epoch": 5.348837209302325, - "grad_norm": 0.008898613974452019, + "grad_norm": 0.005164206027984619, "learning_rate": 0.00017325581395348838, - "loss": 0.0056, + "loss": 0.0023, "step": 920 }, { "epoch": 5.406976744186046, - "grad_norm": 0.004377087112516165, + "grad_norm": 0.007651182822883129, "learning_rate": 0.00017296511627906978, - "loss": 0.0052, + "loss": 0.0021, "step": 930 }, { "epoch": 5.465116279069767, - "grad_norm": 0.008384795859456062, + "grad_norm": 0.006317194551229477, "learning_rate": 0.00017267441860465118, - "loss": 0.0055, + "loss": 0.0021, "step": 940 }, { "epoch": 5.523255813953488, - "grad_norm": 0.008445755578577518, + "grad_norm": 0.00388012221083045, "learning_rate": 0.00017238372093023255, - "loss": 0.0056, + "loss": 0.0023, "step": 950 }, { "epoch": 5.5813953488372094, - "grad_norm": 0.009697931818664074, + "grad_norm": 0.003768169553950429, "learning_rate": 0.00017209302325581395, - "loss": 0.0056, + "loss": 0.0021, "step": 960 }, { "epoch": 5.6395348837209305, - "grad_norm": 0.0035875188186764717, + "grad_norm": 0.007084627170115709, "learning_rate": 0.00017180232558139535, - "loss": 0.0052, + "loss": 0.0022, "step": 970 }, { "epoch": 5.6976744186046515, - "grad_norm": 0.00736456923186779, + "grad_norm": 0.011306801810860634, "learning_rate": 0.00017151162790697674, - "loss": 0.0058, + "loss": 0.0025, "step": 980 }, { "epoch": 5.7558139534883725, - "grad_norm": 0.00497861485928297, + "grad_norm": 0.005429191514849663, "learning_rate": 0.00017122093023255814, - "loss": 0.0053, + "loss": 0.002, "step": 990 }, { "epoch": 5.813953488372093, - "grad_norm": 0.008679182268679142, + "grad_norm": 0.005304534453898668, "learning_rate": 0.00017093023255813954, - "loss": 0.0051, + "loss": 0.0019, "step": 1000 }, { "epoch": 5.813953488372093, - "eval_f1": 0.10275862068965516, - "eval_loss": 0.007194500882178545, - "eval_runtime": 3.2949, - "eval_samples_per_second": 44.007, - "eval_steps_per_second": 5.766, + "eval_f1": 0.21429118773946357, + "eval_loss": 0.008307804353535175, + "eval_runtime": 3.5331, + "eval_samples_per_second": 41.041, + "eval_steps_per_second": 5.378, "step": 1000 }, { "epoch": 5.872093023255814, - "grad_norm": 0.006493718363344669, + "grad_norm": 0.004971199668943882, "learning_rate": 0.00017063953488372094, - "loss": 0.0054, + "loss": 0.0022, "step": 1010 }, { "epoch": 5.930232558139535, - "grad_norm": 0.011412269435822964, + "grad_norm": 0.0047471108846366405, "learning_rate": 0.00017034883720930233, - "loss": 0.0048, + "loss": 0.0018, "step": 1020 }, { "epoch": 5.988372093023256, - "grad_norm": 0.0045639583840966225, + "grad_norm": 0.004365094937384129, "learning_rate": 0.00017005813953488373, - "loss": 0.0051, + "loss": 0.0022, "step": 1030 }, { "epoch": 6.046511627906977, - "grad_norm": 0.007110298611223698, + "grad_norm": 0.0043588015250861645, "learning_rate": 0.0001697674418604651, - "loss": 0.0059, + "loss": 0.0023, "step": 1040 }, { "epoch": 6.104651162790698, - "grad_norm": 0.006933867000043392, + "grad_norm": 0.002772640436887741, "learning_rate": 0.0001694767441860465, - "loss": 0.0048, + "loss": 0.0018, "step": 1050 }, { "epoch": 6.162790697674419, - "grad_norm": 0.0038328988011926413, + "grad_norm": 0.002988204127177596, "learning_rate": 0.0001691860465116279, - "loss": 0.0049, + "loss": 0.0018, "step": 1060 }, { "epoch": 6.22093023255814, - "grad_norm": 0.006384945474565029, + "grad_norm": 0.0037049332167953253, "learning_rate": 0.0001688953488372093, - "loss": 0.0045, + "loss": 0.0014, "step": 1070 }, { "epoch": 6.27906976744186, - "grad_norm": 0.004555262625217438, + "grad_norm": 0.005849141161888838, "learning_rate": 0.00016860465116279072, - "loss": 0.0053, + "loss": 0.0021, "step": 1080 }, { "epoch": 6.337209302325581, - "grad_norm": 0.0052214753814041615, + "grad_norm": 0.003919659648090601, "learning_rate": 0.00016831395348837212, - "loss": 0.0047, + "loss": 0.0015, "step": 1090 }, { "epoch": 6.395348837209302, - "grad_norm": 0.005426972638815641, + "grad_norm": 0.005172960460186005, "learning_rate": 0.00016802325581395352, - "loss": 0.0046, + "loss": 0.0017, "step": 1100 }, { "epoch": 6.395348837209302, - "eval_f1": 0.10804597701149424, - "eval_loss": 0.007310420274734497, - "eval_runtime": 3.351, - "eval_samples_per_second": 43.271, - "eval_steps_per_second": 5.67, + "eval_f1": 0.21129720853858786, + "eval_loss": 0.008289740420877934, + "eval_runtime": 3.3779, + "eval_samples_per_second": 42.927, + "eval_steps_per_second": 5.625, "step": 1100 }, { "epoch": 6.453488372093023, - "grad_norm": 0.00663839653134346, + "grad_norm": 0.0016555693000555038, "learning_rate": 0.00016773255813953491, - "loss": 0.0045, + "loss": 0.0016, "step": 1110 }, { "epoch": 6.511627906976744, - "grad_norm": 0.012470350600779057, + "grad_norm": 0.004070278722792864, "learning_rate": 0.00016744186046511629, - "loss": 0.0042, + "loss": 0.0015, "step": 1120 }, { "epoch": 6.569767441860465, - "grad_norm": 0.008353759534657001, + "grad_norm": 0.005148930475115776, "learning_rate": 0.00016715116279069768, - "loss": 0.0049, + "loss": 0.0021, "step": 1130 }, { "epoch": 6.627906976744186, - "grad_norm": 0.017994899302721024, + "grad_norm": 0.003465486690402031, "learning_rate": 0.00016686046511627908, - "loss": 0.0048, + "loss": 0.0016, "step": 1140 }, { "epoch": 6.686046511627907, - "grad_norm": 0.01437366846948862, + "grad_norm": 0.0043140980415046215, "learning_rate": 0.00016656976744186048, - "loss": 0.0047, + "loss": 0.0016, "step": 1150 }, { "epoch": 6.7441860465116275, - "grad_norm": 0.010588089935481548, + "grad_norm": 0.002341193612664938, "learning_rate": 0.00016627906976744188, - "loss": 0.0045, + "loss": 0.0018, "step": 1160 }, { "epoch": 6.8023255813953485, - "grad_norm": 0.006751187611371279, + "grad_norm": 0.007294884882867336, "learning_rate": 0.00016598837209302327, - "loss": 0.0048, + "loss": 0.0017, "step": 1170 }, { "epoch": 6.8604651162790695, - "grad_norm": 0.007425054907798767, + "grad_norm": 0.005381755530834198, "learning_rate": 0.00016569767441860467, - "loss": 0.0049, + "loss": 0.0016, "step": 1180 }, { "epoch": 6.9186046511627906, - "grad_norm": 0.0072457860223948956, + "grad_norm": 0.0034906489308923483, "learning_rate": 0.00016540697674418607, - "loss": 0.0045, + "loss": 0.0016, "step": 1190 }, { "epoch": 6.976744186046512, - "grad_norm": 0.006747200153768063, + "grad_norm": 0.005938508547842503, "learning_rate": 0.00016511627906976747, - "loss": 0.0051, + "loss": 0.002, "step": 1200 }, { "epoch": 6.976744186046512, - "eval_f1": 0.14172413793103447, - "eval_loss": 0.007313856389373541, - "eval_runtime": 3.4475, - "eval_samples_per_second": 42.06, - "eval_steps_per_second": 5.511, + "eval_f1": 0.23157088122605363, + "eval_loss": 0.008556404151022434, + "eval_runtime": 3.5089, + "eval_samples_per_second": 41.323, + "eval_steps_per_second": 5.415, "step": 1200 }, { "epoch": 7.034883720930233, - "grad_norm": 0.004629623144865036, + "grad_norm": 0.0029680596198886633, "learning_rate": 0.00016482558139534884, - "loss": 0.0048, + "loss": 0.0016, "step": 1210 }, { "epoch": 7.093023255813954, - "grad_norm": 0.006834134925156832, + "grad_norm": 0.005607557017356157, "learning_rate": 0.00016453488372093024, - "loss": 0.0039, + "loss": 0.0013, "step": 1220 }, { "epoch": 7.151162790697675, - "grad_norm": 0.004890755284577608, + "grad_norm": 0.003573081223294139, "learning_rate": 0.00016424418604651163, - "loss": 0.0041, + "loss": 0.0014, "step": 1230 }, { "epoch": 7.209302325581396, - "grad_norm": 0.0034632522147148848, + "grad_norm": 0.0018453372176736593, "learning_rate": 0.00016395348837209303, - "loss": 0.0035, + "loss": 0.0012, "step": 1240 }, { "epoch": 7.267441860465116, - "grad_norm": 0.004078785423189402, + "grad_norm": 0.0038145456928759813, "learning_rate": 0.00016366279069767443, - "loss": 0.0039, + "loss": 0.0013, "step": 1250 }, { "epoch": 7.325581395348837, - "grad_norm": 0.011058216914534569, + "grad_norm": 0.004139317665249109, "learning_rate": 0.00016337209302325583, - "loss": 0.0042, + "loss": 0.0017, "step": 1260 }, { "epoch": 7.383720930232558, - "grad_norm": 0.006676188204437494, + "grad_norm": 0.0027741054072976112, "learning_rate": 0.00016308139534883722, - "loss": 0.0041, + "loss": 0.0013, "step": 1270 }, { "epoch": 7.441860465116279, - "grad_norm": 0.008615364320576191, + "grad_norm": 0.00188296043779701, "learning_rate": 0.00016279069767441862, - "loss": 0.004, + "loss": 0.0012, "step": 1280 }, { "epoch": 7.5, - "grad_norm": 0.008562974631786346, + "grad_norm": 0.003546525491401553, "learning_rate": 0.00016250000000000002, - "loss": 0.0044, + "loss": 0.0015, "step": 1290 }, { "epoch": 7.558139534883721, - "grad_norm": 0.005274553783237934, + "grad_norm": 0.00296179112046957, "learning_rate": 0.0001622093023255814, - "loss": 0.004, + "loss": 0.0013, "step": 1300 }, { "epoch": 7.558139534883721, - "eval_f1": 0.1798029556650246, - "eval_loss": 0.0070460401475429535, - "eval_runtime": 3.3407, - "eval_samples_per_second": 43.403, - "eval_steps_per_second": 5.687, + "eval_f1": 0.22314176245210726, + "eval_loss": 0.008710747584700584, + "eval_runtime": 3.4684, + "eval_samples_per_second": 41.806, + "eval_steps_per_second": 5.478, "step": 1300 }, { "epoch": 7.616279069767442, - "grad_norm": 0.005747280549257994, + "grad_norm": 0.003891675267368555, "learning_rate": 0.0001619186046511628, - "loss": 0.0044, + "loss": 0.0013, "step": 1310 }, { "epoch": 7.674418604651163, - "grad_norm": 0.0057726092636585236, + "grad_norm": 0.006289920769631863, "learning_rate": 0.00016162790697674419, - "loss": 0.0039, + "loss": 0.0013, "step": 1320 }, { "epoch": 7.732558139534884, - "grad_norm": 0.0055718193762004375, + "grad_norm": 0.004885310307145119, "learning_rate": 0.00016133720930232558, - "loss": 0.0042, + "loss": 0.0014, "step": 1330 }, { "epoch": 7.790697674418604, - "grad_norm": 0.006158602889627218, + "grad_norm": 0.003097211243584752, "learning_rate": 0.00016104651162790698, - "loss": 0.0041, + "loss": 0.0013, "step": 1340 }, { "epoch": 7.848837209302325, - "grad_norm": 0.006620342377573252, + "grad_norm": 0.004283824935555458, "learning_rate": 0.00016075581395348838, - "loss": 0.0042, + "loss": 0.0014, "step": 1350 }, { "epoch": 7.906976744186046, - "grad_norm": 0.007158090826123953, + "grad_norm": 0.005171797703951597, "learning_rate": 0.00016046511627906978, - "loss": 0.0042, + "loss": 0.0014, "step": 1360 }, { "epoch": 7.965116279069767, - "grad_norm": 0.004612094257026911, + "grad_norm": 0.03580261394381523, "learning_rate": 0.00016017441860465117, - "loss": 0.0042, + "loss": 0.0015, "step": 1370 }, { "epoch": 8.023255813953488, - "grad_norm": 0.008365660905838013, + "grad_norm": 0.020157570019364357, "learning_rate": 0.00015988372093023257, - "loss": 0.004, + "loss": 0.0014, "step": 1380 }, { "epoch": 8.081395348837209, - "grad_norm": 0.004503598902374506, + "grad_norm": 0.0027610217221081257, "learning_rate": 0.00015959302325581394, - "loss": 0.0033, + "loss": 0.0009, "step": 1390 }, { "epoch": 8.13953488372093, - "grad_norm": 0.005002808757126331, + "grad_norm": 0.002770358696579933, "learning_rate": 0.00015930232558139534, - "loss": 0.0032, + "loss": 0.001, "step": 1400 }, { "epoch": 8.13953488372093, - "eval_f1": 0.17356321839080457, - "eval_loss": 0.007230895105749369, - "eval_runtime": 3.3549, - "eval_samples_per_second": 43.22, - "eval_steps_per_second": 5.663, + "eval_f1": 0.26995073891625615, + "eval_loss": 0.00847978051751852, + "eval_runtime": 3.5635, + "eval_samples_per_second": 40.691, + "eval_steps_per_second": 5.332, "step": 1400 }, { "epoch": 8.19767441860465, - "grad_norm": 0.006834129802882671, + "grad_norm": 0.007025621831417084, "learning_rate": 0.00015901162790697674, - "loss": 0.0035, + "loss": 0.0011, "step": 1410 }, { "epoch": 8.255813953488373, - "grad_norm": 0.009472133591771126, + "grad_norm": 0.004289526492357254, "learning_rate": 0.00015872093023255814, - "loss": 0.0037, + "loss": 0.001, "step": 1420 }, { "epoch": 8.313953488372093, - "grad_norm": 0.003925286699086428, + "grad_norm": 0.0031822659075260162, "learning_rate": 0.00015843023255813953, - "loss": 0.0035, + "loss": 0.0012, "step": 1430 }, { "epoch": 8.372093023255815, - "grad_norm": 0.0058881971053779125, + "grad_norm": 0.002740811090916395, "learning_rate": 0.00015813953488372093, - "loss": 0.0034, + "loss": 0.0013, "step": 1440 }, { "epoch": 8.430232558139535, - "grad_norm": 0.01131221279501915, + "grad_norm": 0.009544509463012218, "learning_rate": 0.00015784883720930233, - "loss": 0.0034, + "loss": 0.0014, "step": 1450 }, { "epoch": 8.488372093023255, - "grad_norm": 0.00741715170443058, + "grad_norm": 0.006155288778245449, "learning_rate": 0.00015755813953488373, - "loss": 0.0037, + "loss": 0.0013, "step": 1460 }, { "epoch": 8.546511627906977, - "grad_norm": 0.003511174814775586, + "grad_norm": 0.0027884591836482286, "learning_rate": 0.00015726744186046512, - "loss": 0.0034, + "loss": 0.0012, "step": 1470 }, { "epoch": 8.604651162790697, - "grad_norm": 0.004778098315000534, + "grad_norm": 0.003985070623457432, "learning_rate": 0.00015697674418604652, - "loss": 0.0038, + "loss": 0.0016, "step": 1480 }, { "epoch": 8.662790697674419, - "grad_norm": 0.006466461345553398, + "grad_norm": 0.0022715777158737183, "learning_rate": 0.0001566860465116279, - "loss": 0.0036, + "loss": 0.0011, "step": 1490 }, { "epoch": 8.720930232558139, - "grad_norm": 0.006319102365523577, + "grad_norm": 0.002351171337068081, "learning_rate": 0.0001563953488372093, - "loss": 0.0038, + "loss": 0.0013, "step": 1500 }, { "epoch": 8.720930232558139, - "eval_f1": 0.1764367816091954, - "eval_loss": 0.007273267954587936, - "eval_runtime": 3.6099, - "eval_samples_per_second": 40.168, - "eval_steps_per_second": 5.263, + "eval_f1": 0.28823207443897103, + "eval_loss": 0.00830762181431055, + "eval_runtime": 3.5305, + "eval_samples_per_second": 41.07, + "eval_steps_per_second": 5.382, "step": 1500 }, { "epoch": 8.779069767441861, - "grad_norm": 0.006623339373618364, + "grad_norm": 0.0052019525319337845, "learning_rate": 0.00015610465116279072, - "loss": 0.0036, + "loss": 0.0013, "step": 1510 }, { "epoch": 8.837209302325581, - "grad_norm": 0.005630009341984987, + "grad_norm": 0.003164341440424323, "learning_rate": 0.0001558139534883721, - "loss": 0.0033, + "loss": 0.0011, "step": 1520 }, { "epoch": 8.895348837209303, - "grad_norm": 0.012286137789487839, + "grad_norm": 0.0030019793193787336, "learning_rate": 0.0001555232558139535, - "loss": 0.0037, + "loss": 0.0012, "step": 1530 }, { "epoch": 8.953488372093023, - "grad_norm": 0.007844207808375359, + "grad_norm": 0.006242596078664064, "learning_rate": 0.0001552325581395349, - "loss": 0.0033, + "loss": 0.0012, "step": 1540 }, { "epoch": 9.011627906976743, - "grad_norm": 0.007706441916525364, + "grad_norm": 0.003362039802595973, "learning_rate": 0.0001549418604651163, - "loss": 0.0038, + "loss": 0.0012, "step": 1550 }, { "epoch": 9.069767441860465, - "grad_norm": 0.009034098125994205, + "grad_norm": 0.002083594212308526, "learning_rate": 0.00015465116279069768, - "loss": 0.0032, + "loss": 0.0008, "step": 1560 }, { "epoch": 9.127906976744185, - "grad_norm": 0.018183371052145958, + "grad_norm": 0.003623665776103735, "learning_rate": 0.00015436046511627907, - "loss": 0.0034, + "loss": 0.001, "step": 1570 }, { "epoch": 9.186046511627907, - "grad_norm": 0.004302157089114189, + "grad_norm": 0.0032485022675246, "learning_rate": 0.00015406976744186047, - "loss": 0.003, + "loss": 0.0009, "step": 1580 }, { "epoch": 9.244186046511627, - "grad_norm": 0.00826517678797245, + "grad_norm": 0.0038227837067097425, "learning_rate": 0.00015377906976744187, - "loss": 0.003, + "loss": 0.0008, "step": 1590 }, { "epoch": 9.30232558139535, - "grad_norm": 0.004036781378090382, + "grad_norm": 0.005091373808681965, "learning_rate": 0.00015348837209302327, - "loss": 0.0033, + "loss": 0.0008, "step": 1600 }, { "epoch": 9.30232558139535, - "eval_f1": 0.178544061302682, - "eval_loss": 0.007520783226937056, - "eval_runtime": 3.3612, - "eval_samples_per_second": 43.139, - "eval_steps_per_second": 5.653, + "eval_f1": 0.2671264367816092, + "eval_loss": 0.009402135387063026, + "eval_runtime": 3.6491, + "eval_samples_per_second": 39.736, + "eval_steps_per_second": 5.207, "step": 1600 }, { "epoch": 9.36046511627907, - "grad_norm": 0.005061028990894556, + "grad_norm": 0.0021304183173924685, "learning_rate": 0.00015319767441860467, - "loss": 0.0035, + "loss": 0.001, "step": 1610 }, { "epoch": 9.418604651162791, - "grad_norm": 0.01016306784003973, + "grad_norm": 0.003027849830687046, "learning_rate": 0.00015290697674418606, - "loss": 0.0034, + "loss": 0.0009, "step": 1620 }, { "epoch": 9.476744186046512, - "grad_norm": 0.0059317899867892265, + "grad_norm": 0.003086281009018421, "learning_rate": 0.00015261627906976746, - "loss": 0.0031, + "loss": 0.0007, "step": 1630 }, { "epoch": 9.534883720930232, - "grad_norm": 0.004909018520265818, + "grad_norm": 0.0032607070170342922, "learning_rate": 0.00015232558139534886, - "loss": 0.0031, + "loss": 0.0008, "step": 1640 }, { "epoch": 9.593023255813954, - "grad_norm": 0.0044730366207659245, + "grad_norm": 0.002567051211372018, "learning_rate": 0.00015203488372093026, - "loss": 0.0031, + "loss": 0.0008, "step": 1650 }, { "epoch": 9.651162790697674, - "grad_norm": 0.009714599698781967, + "grad_norm": 0.0025655811186879873, "learning_rate": 0.00015174418604651163, - "loss": 0.0036, + "loss": 0.0009, "step": 1660 }, { "epoch": 9.709302325581396, - "grad_norm": 0.004445505794137716, + "grad_norm": 0.006435499060899019, "learning_rate": 0.00015145348837209303, - "loss": 0.0028, + "loss": 0.0007, "step": 1670 }, { "epoch": 9.767441860465116, - "grad_norm": 0.0036919168196618557, + "grad_norm": 0.007524511776864529, "learning_rate": 0.00015116279069767442, - "loss": 0.0029, + "loss": 0.0007, "step": 1680 }, { "epoch": 9.825581395348838, - "grad_norm": 0.00488548306748271, + "grad_norm": 0.0045714788138866425, "learning_rate": 0.00015087209302325582, - "loss": 0.0029, + "loss": 0.0008, "step": 1690 }, { "epoch": 9.883720930232558, - "grad_norm": 0.01639348641037941, + "grad_norm": 0.0030976375564932823, "learning_rate": 0.00015058139534883722, - "loss": 0.0035, + "loss": 0.0011, "step": 1700 }, { "epoch": 9.883720930232558, - "eval_f1": 0.18669950738916255, - "eval_loss": 0.007291770074516535, - "eval_runtime": 3.3554, - "eval_samples_per_second": 43.213, - "eval_steps_per_second": 5.662, + "eval_f1": 0.25834701696770657, + "eval_loss": 0.008911101147532463, + "eval_runtime": 3.4579, + "eval_samples_per_second": 41.933, + "eval_steps_per_second": 5.495, "step": 1700 }, { "epoch": 9.94186046511628, - "grad_norm": 0.005756668280810118, + "grad_norm": 0.0028768714983016253, "learning_rate": 0.00015029069767441862, - "loss": 0.0033, + "loss": 0.0008, "step": 1710 }, { "epoch": 10.0, - "grad_norm": 0.0072600990533828735, + "grad_norm": 0.0014783083461225033, "learning_rate": 0.00015000000000000001, - "loss": 0.0033, + "loss": 0.0007, "step": 1720 }, { "epoch": 10.05813953488372, - "grad_norm": 0.00476442463696003, + "grad_norm": 0.0026261017192155123, "learning_rate": 0.0001497093023255814, - "loss": 0.0031, + "loss": 0.0007, "step": 1730 }, { "epoch": 10.116279069767442, - "grad_norm": 0.004853737540543079, + "grad_norm": 0.0027360799722373486, "learning_rate": 0.0001494186046511628, - "loss": 0.003, + "loss": 0.0006, "step": 1740 }, { "epoch": 10.174418604651162, - "grad_norm": 0.016644136980175972, + "grad_norm": 0.0025223286356776953, "learning_rate": 0.00014912790697674418, - "loss": 0.0026, + "loss": 0.0007, "step": 1750 }, { "epoch": 10.232558139534884, - "grad_norm": 0.007938366383314133, + "grad_norm": 0.004347564652562141, "learning_rate": 0.00014883720930232558, - "loss": 0.0028, + "loss": 0.0006, "step": 1760 }, { "epoch": 10.290697674418604, - "grad_norm": 0.00343290320597589, + "grad_norm": 0.0030164269264787436, "learning_rate": 0.00014854651162790698, - "loss": 0.0026, + "loss": 0.0005, "step": 1770 }, { "epoch": 10.348837209302326, - "grad_norm": 0.004399769939482212, + "grad_norm": 0.0031331840436905622, "learning_rate": 0.00014825581395348837, - "loss": 0.0028, + "loss": 0.0007, "step": 1780 }, { "epoch": 10.406976744186046, - "grad_norm": 0.0065431734547019005, + "grad_norm": 0.0013913946459069848, "learning_rate": 0.00014796511627906977, - "loss": 0.0026, + "loss": 0.0005, "step": 1790 }, { "epoch": 10.465116279069768, - "grad_norm": 0.004511106293648481, + "grad_norm": 0.0014110967749729753, "learning_rate": 0.00014767441860465117, - "loss": 0.0025, + "loss": 0.0006, "step": 1800 }, { "epoch": 10.465116279069768, - "eval_f1": 0.22796934865900378, - "eval_loss": 0.007098213769495487, - "eval_runtime": 3.3525, - "eval_samples_per_second": 43.252, - "eval_steps_per_second": 5.667, + "eval_f1": 0.2261247947454844, + "eval_loss": 0.009887471795082092, + "eval_runtime": 3.4557, + "eval_samples_per_second": 41.96, + "eval_steps_per_second": 5.498, "step": 1800 }, { "epoch": 10.523255813953488, - "grad_norm": 0.004023385234177113, + "grad_norm": 0.010545547120273113, "learning_rate": 0.00014738372093023257, - "loss": 0.0026, + "loss": 0.0008, "step": 1810 }, { "epoch": 10.581395348837209, - "grad_norm": 0.0062681203708052635, + "grad_norm": 0.009523374028503895, "learning_rate": 0.00014709302325581396, - "loss": 0.0024, + "loss": 0.0007, "step": 1820 }, { "epoch": 10.63953488372093, - "grad_norm": 0.0065085552632808685, + "grad_norm": 0.0016696532256901264, "learning_rate": 0.00014680232558139536, - "loss": 0.0027, + "loss": 0.0008, "step": 1830 }, { "epoch": 10.69767441860465, - "grad_norm": 0.005249280948191881, + "grad_norm": 0.0035481429658830166, "learning_rate": 0.00014651162790697673, - "loss": 0.0024, + "loss": 0.0006, "step": 1840 }, { "epoch": 10.755813953488373, - "grad_norm": 0.005527133122086525, + "grad_norm": 0.002992045832797885, "learning_rate": 0.00014622093023255813, - "loss": 0.0027, + "loss": 0.0006, "step": 1850 }, { "epoch": 10.813953488372093, - "grad_norm": 0.008362425491213799, + "grad_norm": 0.003827236359938979, "learning_rate": 0.00014593023255813953, - "loss": 0.0025, + "loss": 0.0006, "step": 1860 }, { "epoch": 10.872093023255815, - "grad_norm": 0.004236938431859016, + "grad_norm": 0.01295691542327404, "learning_rate": 0.00014563953488372093, - "loss": 0.003, + "loss": 0.001, "step": 1870 }, { "epoch": 10.930232558139535, - "grad_norm": 0.004265348892658949, + "grad_norm": 0.004208057653158903, "learning_rate": 0.00014534883720930232, - "loss": 0.0029, + "loss": 0.0007, "step": 1880 }, { "epoch": 10.988372093023255, - "grad_norm": 0.005769385490566492, + "grad_norm": 0.0019303156295791268, "learning_rate": 0.00014505813953488372, - "loss": 0.0025, + "loss": 0.0006, "step": 1890 }, { "epoch": 11.046511627906977, - "grad_norm": 0.0019634233321994543, + "grad_norm": 0.001584715093486011, "learning_rate": 0.00014476744186046512, - "loss": 0.0021, + "loss": 0.0005, "step": 1900 }, { "epoch": 11.046511627906977, - "eval_f1": 0.23026819923371644, - "eval_loss": 0.007248056121170521, - "eval_runtime": 3.4926, - "eval_samples_per_second": 41.517, - "eval_steps_per_second": 5.44, + "eval_f1": 0.2654022988505747, + "eval_loss": 0.009239580482244492, + "eval_runtime": 3.6287, + "eval_samples_per_second": 39.96, + "eval_steps_per_second": 5.236, "step": 1900 }, { "epoch": 11.104651162790697, - "grad_norm": 0.003944189753383398, + "grad_norm": 0.002834419487044215, "learning_rate": 0.00014447674418604652, - "loss": 0.0022, + "loss": 0.0005, "step": 1910 }, { "epoch": 11.162790697674419, - "grad_norm": 0.006719578057527542, + "grad_norm": 0.0023569874465465546, "learning_rate": 0.00014418604651162791, - "loss": 0.0024, + "loss": 0.0005, "step": 1920 }, { "epoch": 11.220930232558139, - "grad_norm": 0.004878057166934013, + "grad_norm": 0.0019151725573465228, "learning_rate": 0.00014389534883720929, - "loss": 0.0021, + "loss": 0.0004, "step": 1930 }, { "epoch": 11.279069767441861, - "grad_norm": 0.0037629061844199896, + "grad_norm": 0.003607311053201556, "learning_rate": 0.0001436046511627907, - "loss": 0.0019, + "loss": 0.0004, "step": 1940 }, { "epoch": 11.337209302325581, - "grad_norm": 0.006484214682132006, + "grad_norm": 0.00216736551374197, "learning_rate": 0.0001433139534883721, - "loss": 0.0023, + "loss": 0.0006, "step": 1950 }, { "epoch": 11.395348837209303, - "grad_norm": 0.0033203568309545517, + "grad_norm": 0.002466777805238962, "learning_rate": 0.0001430232558139535, - "loss": 0.0022, + "loss": 0.0005, "step": 1960 }, { "epoch": 11.453488372093023, - "grad_norm": 0.006716153584420681, + "grad_norm": 0.003370426595211029, "learning_rate": 0.0001427325581395349, - "loss": 0.0023, + "loss": 0.0005, "step": 1970 }, { "epoch": 11.511627906976745, - "grad_norm": 0.005748982075601816, + "grad_norm": 0.0033452839124947786, "learning_rate": 0.0001424418604651163, - "loss": 0.0021, + "loss": 0.0006, "step": 1980 }, { "epoch": 11.569767441860465, - "grad_norm": 0.0051764799281954765, + "grad_norm": 0.002133035333827138, "learning_rate": 0.0001421511627906977, - "loss": 0.0023, + "loss": 0.0005, "step": 1990 }, { "epoch": 11.627906976744185, - "grad_norm": 0.0049621122889220715, + "grad_norm": 0.0015113772824406624, "learning_rate": 0.0001418604651162791, - "loss": 0.0024, + "loss": 0.0005, "step": 2000 }, { "epoch": 11.627906976744185, - "eval_f1": 0.21279693486590037, - "eval_loss": 0.007344240788370371, - "eval_runtime": 3.385, - "eval_samples_per_second": 42.836, - "eval_steps_per_second": 5.613, + "eval_f1": 0.20137931034482762, + "eval_loss": 0.00990789383649826, + "eval_runtime": 3.5081, + "eval_samples_per_second": 41.333, + "eval_steps_per_second": 5.416, "step": 2000 }, { "epoch": 11.686046511627907, - "grad_norm": 0.004597889259457588, + "grad_norm": 0.0024171818513423204, "learning_rate": 0.00014156976744186047, - "loss": 0.0021, + "loss": 0.0005, "step": 2010 }, { "epoch": 11.744186046511627, - "grad_norm": 0.0028125131502747536, + "grad_norm": 0.005505066830664873, "learning_rate": 0.00014127906976744186, - "loss": 0.0019, + "loss": 0.0004, "step": 2020 }, { "epoch": 11.80232558139535, - "grad_norm": 0.005528640002012253, + "grad_norm": 0.0015683730598539114, "learning_rate": 0.00014098837209302326, - "loss": 0.0023, + "loss": 0.0005, "step": 2030 }, { "epoch": 11.86046511627907, - "grad_norm": 0.006331980228424072, + "grad_norm": 0.003685836913064122, "learning_rate": 0.00014069767441860466, - "loss": 0.0023, + "loss": 0.0004, "step": 2040 }, { "epoch": 11.918604651162791, - "grad_norm": 0.00469759339466691, + "grad_norm": 0.005244854371994734, "learning_rate": 0.00014040697674418606, - "loss": 0.0024, + "loss": 0.0006, "step": 2050 }, { "epoch": 11.976744186046512, - "grad_norm": 0.0038904016837477684, + "grad_norm": 0.0012378114042803645, "learning_rate": 0.00014011627906976746, - "loss": 0.0023, + "loss": 0.0005, "step": 2060 }, { "epoch": 12.034883720930232, - "grad_norm": 0.0019254146609455347, + "grad_norm": 0.0007717824191786349, "learning_rate": 0.00013982558139534885, - "loss": 0.0022, + "loss": 0.0005, "step": 2070 }, { "epoch": 12.093023255813954, - "grad_norm": 0.004151986446231604, + "grad_norm": 0.001109365839511156, "learning_rate": 0.00013953488372093025, - "loss": 0.002, + "loss": 0.0005, "step": 2080 }, { "epoch": 12.151162790697674, - "grad_norm": 0.0016184344422072172, + "grad_norm": 0.0007751621888019145, "learning_rate": 0.00013924418604651165, - "loss": 0.0017, + "loss": 0.0004, "step": 2090 }, { "epoch": 12.209302325581396, - "grad_norm": 0.002807161770761013, + "grad_norm": 0.0032476542983204126, "learning_rate": 0.00013895348837209302, - "loss": 0.0018, + "loss": 0.0004, "step": 2100 }, { "epoch": 12.209302325581396, - "eval_f1": 0.22808429118773946, - "eval_loss": 0.007525748107582331, - "eval_runtime": 3.468, - "eval_samples_per_second": 41.811, - "eval_steps_per_second": 5.479, + "eval_f1": 0.23834701696770666, + "eval_loss": 0.009933044202625751, + "eval_runtime": 3.5451, + "eval_samples_per_second": 40.901, + "eval_steps_per_second": 5.359, "step": 2100 }, { "epoch": 12.267441860465116, - "grad_norm": 0.003710753982886672, + "grad_norm": 0.0012783126439899206, "learning_rate": 0.00013866279069767442, - "loss": 0.0019, + "loss": 0.0003, "step": 2110 }, { "epoch": 12.325581395348838, - "grad_norm": 0.002232793951407075, + "grad_norm": 0.000302033411571756, "learning_rate": 0.00013837209302325582, - "loss": 0.0019, + "loss": 0.0004, "step": 2120 }, { "epoch": 12.383720930232558, - "grad_norm": 0.004755348898470402, + "grad_norm": 0.002487839898094535, "learning_rate": 0.0001380813953488372, - "loss": 0.0015, + "loss": 0.0003, "step": 2130 }, { "epoch": 12.44186046511628, - "grad_norm": 0.0033219349570572376, + "grad_norm": 0.0004567207070067525, "learning_rate": 0.0001377906976744186, - "loss": 0.0017, + "loss": 0.0004, "step": 2140 }, { "epoch": 12.5, - "grad_norm": 0.007486560381948948, + "grad_norm": 0.001022443757392466, "learning_rate": 0.0001375, - "loss": 0.0016, + "loss": 0.0003, "step": 2150 }, { "epoch": 12.55813953488372, - "grad_norm": 0.0035984499845653772, + "grad_norm": 0.001065536867827177, "learning_rate": 0.0001372093023255814, - "loss": 0.0014, + "loss": 0.0002, "step": 2160 }, { "epoch": 12.616279069767442, - "grad_norm": 0.0038998990785330534, + "grad_norm": 0.001972356578335166, "learning_rate": 0.0001369186046511628, - "loss": 0.0014, + "loss": 0.0003, "step": 2170 }, { "epoch": 12.674418604651162, - "grad_norm": 0.004726073704659939, + "grad_norm": 0.0007090851431712508, "learning_rate": 0.0001366279069767442, - "loss": 0.0019, + "loss": 0.0003, "step": 2180 }, { "epoch": 12.732558139534884, - "grad_norm": 0.0023618771228939295, + "grad_norm": 0.0004720042343251407, "learning_rate": 0.0001363372093023256, - "loss": 0.0018, + "loss": 0.0004, "step": 2190 }, { "epoch": 12.790697674418604, - "grad_norm": 0.004634870681911707, + "grad_norm": 0.0026569641195237637, "learning_rate": 0.00013604651162790697, - "loss": 0.0018, + "loss": 0.0003, "step": 2200 }, { "epoch": 12.790697674418604, - "eval_f1": 0.2606458675424193, - "eval_loss": 0.007699997629970312, - "eval_runtime": 3.3981, - "eval_samples_per_second": 42.671, - "eval_steps_per_second": 5.591, + "eval_f1": 0.2586973180076628, + "eval_loss": 0.010139710269868374, + "eval_runtime": 3.3807, + "eval_samples_per_second": 42.89, + "eval_steps_per_second": 5.62, "step": 2200 }, { "epoch": 12.848837209302326, - "grad_norm": 0.006798637565225363, + "grad_norm": 0.0018606955418363214, "learning_rate": 0.00013575581395348837, - "loss": 0.0017, + "loss": 0.0003, "step": 2210 }, { "epoch": 12.906976744186046, - "grad_norm": 0.0030679753981530666, + "grad_norm": 0.0012309341691434383, "learning_rate": 0.00013546511627906977, - "loss": 0.0018, + "loss": 0.0003, "step": 2220 }, { "epoch": 12.965116279069768, - "grad_norm": 0.004236382897943258, + "grad_norm": 0.0020257702562958, "learning_rate": 0.00013517441860465116, - "loss": 0.0017, + "loss": 0.0002, "step": 2230 }, { "epoch": 13.023255813953488, - "grad_norm": 0.005479669664055109, + "grad_norm": 0.0015356411458924413, "learning_rate": 0.00013488372093023256, - "loss": 0.0013, + "loss": 0.0003, "step": 2240 }, { "epoch": 13.081395348837209, - "grad_norm": 0.005079224240034819, + "grad_norm": 0.0010870143305510283, "learning_rate": 0.00013459302325581396, - "loss": 0.0016, + "loss": 0.0003, "step": 2250 }, { "epoch": 13.13953488372093, - "grad_norm": 0.004375936463475227, + "grad_norm": 0.0006066053174436092, "learning_rate": 0.00013430232558139536, - "loss": 0.0016, + "loss": 0.0002, "step": 2260 }, { "epoch": 13.19767441860465, - "grad_norm": 0.0030040640849620104, + "grad_norm": 0.0010405691573396325, "learning_rate": 0.00013401162790697675, - "loss": 0.0014, + "loss": 0.0002, "step": 2270 }, { "epoch": 13.255813953488373, - "grad_norm": 0.0019917995668947697, + "grad_norm": 0.0003021611482836306, "learning_rate": 0.00013372093023255815, - "loss": 0.0014, + "loss": 0.0003, "step": 2280 }, { "epoch": 13.313953488372093, - "grad_norm": 0.006326055154204369, + "grad_norm": 0.001801244798116386, "learning_rate": 0.00013343023255813952, - "loss": 0.0012, + "loss": 0.0002, "step": 2290 }, { "epoch": 13.372093023255815, - "grad_norm": 0.0036998435389250517, + "grad_norm": 0.001750931260176003, "learning_rate": 0.00013313953488372092, - "loss": 0.0013, + "loss": 0.0003, "step": 2300 }, { "epoch": 13.372093023255815, - "eval_f1": 0.27019157088122603, - "eval_loss": 0.007727098651230335, - "eval_runtime": 3.3859, - "eval_samples_per_second": 42.825, - "eval_steps_per_second": 5.612, + "eval_f1": 0.25413793103448273, + "eval_loss": 0.01030605100095272, + "eval_runtime": 3.7055, + "eval_samples_per_second": 39.131, + "eval_steps_per_second": 5.127, "step": 2300 }, { "epoch": 13.430232558139535, - "grad_norm": 0.0057464600540697575, + "grad_norm": 0.0007868983666412532, "learning_rate": 0.00013284883720930232, - "loss": 0.0015, + "loss": 0.0002, "step": 2310 }, { "epoch": 13.488372093023255, - "grad_norm": 0.0031235923524945974, + "grad_norm": 0.00037955614970996976, "learning_rate": 0.00013255813953488372, - "loss": 0.0011, + "loss": 0.0001, "step": 2320 }, { "epoch": 13.546511627906977, - "grad_norm": 0.001779690501280129, + "grad_norm": 0.0005226065404713154, "learning_rate": 0.0001322674418604651, - "loss": 0.0015, + "loss": 0.0002, "step": 2330 }, { "epoch": 13.604651162790697, - "grad_norm": 0.0027253900188952684, + "grad_norm": 0.000628421432338655, "learning_rate": 0.0001319767441860465, - "loss": 0.0015, + "loss": 0.0003, "step": 2340 }, { "epoch": 13.662790697674419, - "grad_norm": 0.003966566640883684, + "grad_norm": 0.00035077467327937484, "learning_rate": 0.0001316860465116279, - "loss": 0.0017, + "loss": 0.0003, "step": 2350 }, { "epoch": 13.720930232558139, - "grad_norm": 0.004900631029158831, + "grad_norm": 0.0008253551204688847, "learning_rate": 0.0001313953488372093, - "loss": 0.0015, + "loss": 0.0003, "step": 2360 }, { "epoch": 13.779069767441861, - "grad_norm": 0.002880127402022481, + "grad_norm": 0.001465794863179326, "learning_rate": 0.0001311046511627907, - "loss": 0.0012, + "loss": 0.0002, "step": 2370 }, { "epoch": 13.837209302325581, - "grad_norm": 0.0024573553819209337, + "grad_norm": 0.0007311553927138448, "learning_rate": 0.0001308139534883721, - "loss": 0.0013, + "loss": 0.0003, "step": 2380 }, { "epoch": 13.895348837209303, - "grad_norm": 0.005345191806554794, + "grad_norm": 0.0013177372748032212, "learning_rate": 0.0001305232558139535, - "loss": 0.0014, + "loss": 0.0003, "step": 2390 }, { "epoch": 13.953488372093023, - "grad_norm": 0.0035257439594715834, + "grad_norm": 0.0007739219581708312, "learning_rate": 0.0001302325581395349, - "loss": 0.0015, + "loss": 0.0002, "step": 2400 }, { "epoch": 13.953488372093023, - "eval_f1": 0.22084291187739466, - "eval_loss": 0.007676718290895224, - "eval_runtime": 3.4598, - "eval_samples_per_second": 41.91, - "eval_steps_per_second": 5.492, + "eval_f1": 0.24739463601532566, + "eval_loss": 0.010597319342195988, + "eval_runtime": 3.4991, + "eval_samples_per_second": 41.44, + "eval_steps_per_second": 5.43, "step": 2400 }, { "epoch": 14.011627906976743, - "grad_norm": 0.004318791441619396, + "grad_norm": 0.0015530871460214257, "learning_rate": 0.0001299418604651163, - "loss": 0.0014, + "loss": 0.0002, "step": 2410 }, { "epoch": 14.069767441860465, - "grad_norm": 0.021991753950715065, + "grad_norm": 0.0019514122977852821, "learning_rate": 0.0001296511627906977, - "loss": 0.0014, + "loss": 0.0002, "step": 2420 }, { "epoch": 14.127906976744185, - "grad_norm": 0.0019976613111793995, + "grad_norm": 0.000271965836873278, "learning_rate": 0.0001293604651162791, - "loss": 0.0012, + "loss": 0.0002, "step": 2430 }, { "epoch": 14.186046511627907, - "grad_norm": 0.0067547089420259, + "grad_norm": 0.0002978089905809611, "learning_rate": 0.0001290697674418605, - "loss": 0.0021, + "loss": 0.0002, "step": 2440 }, { "epoch": 14.244186046511627, - "grad_norm": 0.004755343776196241, + "grad_norm": 0.00023556972155347466, "learning_rate": 0.00012877906976744189, - "loss": 0.0015, + "loss": 0.0001, "step": 2450 }, { "epoch": 14.30232558139535, - "grad_norm": 0.004111699294298887, + "grad_norm": 0.0016617613146081567, "learning_rate": 0.00012848837209302326, - "loss": 0.0017, + "loss": 0.0002, "step": 2460 }, { "epoch": 14.36046511627907, - "grad_norm": 0.0054778833873569965, + "grad_norm": 0.0014797139447182417, "learning_rate": 0.00012819767441860465, - "loss": 0.0017, + "loss": 0.0002, "step": 2470 }, { "epoch": 14.418604651162791, - "grad_norm": 0.003433394245803356, + "grad_norm": 0.000434195069829002, "learning_rate": 0.00012790697674418605, - "loss": 0.0012, + "loss": 0.0002, "step": 2480 }, { "epoch": 14.476744186046512, - "grad_norm": 0.0027168726082891226, + "grad_norm": 0.0002562550362199545, "learning_rate": 0.00012761627906976745, - "loss": 0.0013, + "loss": 0.0002, "step": 2490 }, { "epoch": 14.534883720930232, - "grad_norm": 0.023376334458589554, + "grad_norm": 0.000852324883453548, "learning_rate": 0.00012732558139534885, - "loss": 0.002, + "loss": 0.0002, "step": 2500 }, { "epoch": 14.534883720930232, - "eval_f1": 0.1724137931034483, - "eval_loss": 0.008439584635198116, - "eval_runtime": 3.3328, - "eval_samples_per_second": 43.507, - "eval_steps_per_second": 5.701, + "eval_f1": 0.25348111658456485, + "eval_loss": 0.010765406303107738, + "eval_runtime": 3.6538, + "eval_samples_per_second": 39.684, + "eval_steps_per_second": 5.2, "step": 2500 }, { "epoch": 14.593023255813954, - "grad_norm": 0.002839108230546117, + "grad_norm": 0.0002972542424686253, "learning_rate": 0.00012703488372093025, - "loss": 0.0017, + "loss": 0.0002, "step": 2510 }, { "epoch": 14.651162790697674, - "grad_norm": 0.005331306718289852, + "grad_norm": 0.000232176054851152, "learning_rate": 0.00012674418604651164, - "loss": 0.0015, + "loss": 0.0002, "step": 2520 }, { "epoch": 14.709302325581396, - "grad_norm": 0.00544377276673913, + "grad_norm": 0.000487919372972101, "learning_rate": 0.00012645348837209304, - "loss": 0.0015, + "loss": 0.0002, "step": 2530 }, { "epoch": 14.767441860465116, - "grad_norm": 0.005771029274910688, + "grad_norm": 0.0008283961797133088, "learning_rate": 0.00012616279069767444, - "loss": 0.0014, + "loss": 0.0002, "step": 2540 }, { "epoch": 14.825581395348838, - "grad_norm": 0.004663311410695314, + "grad_norm": 0.000571161974221468, "learning_rate": 0.0001258720930232558, - "loss": 0.0016, + "loss": 0.0002, "step": 2550 }, { "epoch": 14.883720930232558, - "grad_norm": 0.007239652331918478, + "grad_norm": 0.0005665848148055375, "learning_rate": 0.0001255813953488372, - "loss": 0.0012, + "loss": 0.0001, "step": 2560 }, { "epoch": 14.94186046511628, - "grad_norm": 0.006081250496208668, + "grad_norm": 0.0003588281979318708, "learning_rate": 0.0001252906976744186, - "loss": 0.0014, + "loss": 0.0003, "step": 2570 }, { "epoch": 15.0, - "grad_norm": 0.0048074545338749886, + "grad_norm": 0.0005309730768203735, "learning_rate": 0.000125, - "loss": 0.0013, + "loss": 0.0002, "step": 2580 }, { "epoch": 15.05813953488372, - "grad_norm": 0.003552735550329089, + "grad_norm": 0.0014692209661006927, "learning_rate": 0.0001247093023255814, - "loss": 0.0011, + "loss": 0.0002, "step": 2590 }, { "epoch": 15.116279069767442, - "grad_norm": 0.0026673264801502228, + "grad_norm": 0.0009636764298193157, "learning_rate": 0.0001244186046511628, - "loss": 0.001, + "loss": 0.0001, "step": 2600 }, { "epoch": 15.116279069767442, - "eval_f1": 0.25363984674329504, - "eval_loss": 0.00820987019687891, - "eval_runtime": 3.4551, - "eval_samples_per_second": 41.967, - "eval_steps_per_second": 5.499, + "eval_f1": 0.2530596606458675, + "eval_loss": 0.010869406163692474, + "eval_runtime": 3.4235, + "eval_samples_per_second": 42.355, + "eval_steps_per_second": 5.55, "step": 2600 }, { "epoch": 15.174418604651162, - "grad_norm": 0.0035540435928851366, + "grad_norm": 0.0011290176771581173, "learning_rate": 0.0001241279069767442, - "loss": 0.0012, + "loss": 0.0002, "step": 2610 }, { "epoch": 15.232558139534884, - "grad_norm": 0.005464059300720692, + "grad_norm": 0.0004133663896936923, "learning_rate": 0.0001238372093023256, - "loss": 0.0012, + "loss": 0.0002, "step": 2620 }, { "epoch": 15.290697674418604, - "grad_norm": 0.00443275785073638, + "grad_norm": 0.00029262196039780974, "learning_rate": 0.000123546511627907, - "loss": 0.0011, + "loss": 0.0002, "step": 2630 }, { "epoch": 15.348837209302326, - "grad_norm": 0.003320839488878846, + "grad_norm": 0.0008913601050153375, "learning_rate": 0.00012325581395348836, - "loss": 0.0011, + "loss": 0.0001, "step": 2640 }, { "epoch": 15.406976744186046, - "grad_norm": 0.0023912449833005667, + "grad_norm": 0.00043799684499390423, "learning_rate": 0.00012296511627906976, - "loss": 0.0009, + "loss": 0.0001, "step": 2650 }, { "epoch": 15.465116279069768, - "grad_norm": 0.005080386996269226, + "grad_norm": 0.00045140215661376715, "learning_rate": 0.00012267441860465116, - "loss": 0.0011, + "loss": 0.0001, "step": 2660 }, { "epoch": 15.523255813953488, - "grad_norm": 0.0031936930026859045, + "grad_norm": 0.0006623863009735942, "learning_rate": 0.00012238372093023256, - "loss": 0.0009, + "loss": 0.0002, "step": 2670 }, { "epoch": 15.581395348837209, - "grad_norm": 0.005393624771386385, + "grad_norm": 0.0004738447314593941, "learning_rate": 0.00012209302325581395, - "loss": 0.0008, + "loss": 0.0001, "step": 2680 }, { "epoch": 15.63953488372093, - "grad_norm": 0.006540833972394466, + "grad_norm": 0.0009200413478538394, "learning_rate": 0.00012180232558139535, - "loss": 0.001, + "loss": 0.0001, "step": 2690 }, { "epoch": 15.69767441860465, - "grad_norm": 0.003084400901570916, + "grad_norm": 0.0014583002775907516, "learning_rate": 0.00012151162790697675, - "loss": 0.0008, + "loss": 0.0002, "step": 2700 }, { "epoch": 15.69767441860465, - "eval_f1": 0.27501915708812263, - "eval_loss": 0.008198206312954426, - "eval_runtime": 3.3003, - "eval_samples_per_second": 43.936, - "eval_steps_per_second": 5.757, + "eval_f1": 0.25222222222222224, + "eval_loss": 0.011049561202526093, + "eval_runtime": 3.6042, + "eval_samples_per_second": 40.231, + "eval_steps_per_second": 5.272, "step": 2700 }, { "epoch": 15.755813953488373, - "grad_norm": 0.0018062421586364508, + "grad_norm": 0.0003152574354317039, "learning_rate": 0.00012122093023255813, - "loss": 0.001, + "loss": 0.0002, "step": 2710 }, { "epoch": 15.813953488372093, - "grad_norm": 0.0032957058865576982, + "grad_norm": 0.0008657924481667578, "learning_rate": 0.00012093023255813953, - "loss": 0.0011, + "loss": 0.0002, "step": 2720 }, { "epoch": 15.872093023255815, - "grad_norm": 0.0035143059212714434, + "grad_norm": 0.0002135358372470364, "learning_rate": 0.00012063953488372093, - "loss": 0.0008, + "loss": 0.0001, "step": 2730 }, { "epoch": 15.930232558139535, - "grad_norm": 0.0031228724401444197, + "grad_norm": 0.0013386629289016128, "learning_rate": 0.00012034883720930233, - "loss": 0.0011, + "loss": 0.0002, "step": 2740 }, { "epoch": 15.988372093023255, - "grad_norm": 0.0021080966107547283, + "grad_norm": 0.00024844781728461385, "learning_rate": 0.00012005813953488372, - "loss": 0.0008, + "loss": 0.0001, "step": 2750 }, { "epoch": 16.046511627906977, - "grad_norm": 0.0019063223153352737, + "grad_norm": 0.00031204556580632925, "learning_rate": 0.00011976744186046511, - "loss": 0.001, + "loss": 0.0001, "step": 2760 }, { "epoch": 16.1046511627907, - "grad_norm": 0.003136126324534416, + "grad_norm": 0.0009940903401002288, "learning_rate": 0.0001194767441860465, - "loss": 0.0006, + "loss": 0.0001, "step": 2770 }, { "epoch": 16.162790697674417, - "grad_norm": 0.0023068387527018785, + "grad_norm": 0.0006196293397806585, "learning_rate": 0.0001191860465116279, - "loss": 0.0007, + "loss": 0.0001, "step": 2780 }, { "epoch": 16.22093023255814, - "grad_norm": 0.0025313214864581823, + "grad_norm": 0.00012485009210649878, "learning_rate": 0.0001188953488372093, - "loss": 0.0006, + "loss": 0.0001, "step": 2790 }, { "epoch": 16.27906976744186, - "grad_norm": 0.0018450116040185094, + "grad_norm": 0.0003121390473097563, "learning_rate": 0.00011860465116279071, - "loss": 0.0007, + "loss": 0.0001, "step": 2800 }, { "epoch": 16.27906976744186, - "eval_f1": 0.2639846743295019, - "eval_loss": 0.00866973027586937, - "eval_runtime": 3.4675, - "eval_samples_per_second": 41.817, - "eval_steps_per_second": 5.479, + "eval_f1": 0.2647454844006568, + "eval_loss": 0.01124858669936657, + "eval_runtime": 3.5199, + "eval_samples_per_second": 41.195, + "eval_steps_per_second": 5.398, "step": 2800 }, { "epoch": 16.337209302325583, - "grad_norm": 0.0012873446103185415, + "grad_norm": 0.0002632283139973879, "learning_rate": 0.00011831395348837211, - "loss": 0.0009, + "loss": 0.0002, "step": 2810 }, { "epoch": 16.3953488372093, - "grad_norm": 0.0019211816834285855, + "grad_norm": 0.00032865104731172323, "learning_rate": 0.00011802325581395351, - "loss": 0.0009, + "loss": 0.0002, "step": 2820 }, { "epoch": 16.453488372093023, - "grad_norm": 0.001212621689774096, + "grad_norm": 0.00011938096577068791, "learning_rate": 0.0001177325581395349, - "loss": 0.0007, + "loss": 0.0001, "step": 2830 }, { "epoch": 16.511627906976745, - "grad_norm": 0.0016034708824008703, + "grad_norm": 0.00024540178128518164, "learning_rate": 0.00011744186046511629, - "loss": 0.0005, + "loss": 0.0001, "step": 2840 }, { "epoch": 16.569767441860463, - "grad_norm": 0.00214293971657753, + "grad_norm": 0.00032731363899074495, "learning_rate": 0.00011715116279069769, - "loss": 0.0007, + "loss": 0.0001, "step": 2850 }, { "epoch": 16.627906976744185, - "grad_norm": 0.00336723611690104, + "grad_norm": 0.001534398295916617, "learning_rate": 0.00011686046511627909, - "loss": 0.0008, + "loss": 0.0002, "step": 2860 }, { "epoch": 16.686046511627907, - "grad_norm": 0.00877336971461773, + "grad_norm": 0.00020153452351223677, "learning_rate": 0.00011656976744186048, - "loss": 0.0007, + "loss": 0.0001, "step": 2870 }, { "epoch": 16.74418604651163, - "grad_norm": 0.0013621302787214518, + "grad_norm": 0.00044350489042699337, "learning_rate": 0.00011627906976744187, - "loss": 0.0007, + "loss": 0.0001, "step": 2880 }, { "epoch": 16.802325581395348, - "grad_norm": 0.0024161990731954575, + "grad_norm": 0.00052054034313187, "learning_rate": 0.00011598837209302326, - "loss": 0.0007, + "loss": 0.0002, "step": 2890 }, { "epoch": 16.86046511627907, - "grad_norm": 0.008009699173271656, + "grad_norm": 0.0003187255060765892, "learning_rate": 0.00011569767441860466, - "loss": 0.0008, + "loss": 0.0001, "step": 2900 }, { "epoch": 16.86046511627907, - "eval_f1": 0.2604378762999453, - "eval_loss": 0.00883494596928358, - "eval_runtime": 3.3336, - "eval_samples_per_second": 43.496, - "eval_steps_per_second": 5.7, + "eval_f1": 0.2603776683087028, + "eval_loss": 0.011500461027026176, + "eval_runtime": 3.3966, + "eval_samples_per_second": 42.689, + "eval_steps_per_second": 5.594, "step": 2900 }, { "epoch": 16.91860465116279, - "grad_norm": 0.009839486330747604, + "grad_norm": 0.00039312278386205435, "learning_rate": 0.00011540697674418606, - "loss": 0.0009, + "loss": 0.0001, "step": 2910 }, { "epoch": 16.97674418604651, - "grad_norm": 0.002450864529237151, + "grad_norm": 0.00026511470787227154, "learning_rate": 0.00011511627906976746, - "loss": 0.0006, + "loss": 0.0001, "step": 2920 }, { "epoch": 17.03488372093023, - "grad_norm": 0.0017150610219687223, + "grad_norm": 0.00033822376281023026, "learning_rate": 0.00011482558139534884, - "loss": 0.0006, + "loss": 0.0001, "step": 2930 }, { "epoch": 17.093023255813954, - "grad_norm": 0.0004813602427020669, + "grad_norm": 6.437127740355209e-05, "learning_rate": 0.00011453488372093024, - "loss": 0.0008, + "loss": 0.0001, "step": 2940 }, { "epoch": 17.151162790697676, - "grad_norm": 0.0030171223916113377, + "grad_norm": 0.00016166905697900802, "learning_rate": 0.00011424418604651164, - "loss": 0.0006, + "loss": 0.0001, "step": 2950 }, { "epoch": 17.209302325581394, - "grad_norm": 0.0035950797609984875, + "grad_norm": 0.0005291549605317414, "learning_rate": 0.00011395348837209304, - "loss": 0.0006, + "loss": 0.0001, "step": 2960 }, { "epoch": 17.267441860465116, - "grad_norm": 0.0031929980032145977, + "grad_norm": 0.0013425592333078384, "learning_rate": 0.00011366279069767442, - "loss": 0.0006, + "loss": 0.0001, "step": 2970 }, { "epoch": 17.325581395348838, - "grad_norm": 0.0007339220610447228, + "grad_norm": 9.590380068402737e-05, "learning_rate": 0.00011337209302325582, - "loss": 0.0005, + "loss": 0.0001, "step": 2980 }, { "epoch": 17.38372093023256, - "grad_norm": 0.001164542161859572, + "grad_norm": 0.0002535681996960193, "learning_rate": 0.00011308139534883721, - "loss": 0.0006, + "loss": 0.0001, "step": 2990 }, { "epoch": 17.441860465116278, - "grad_norm": 0.002089454559609294, + "grad_norm": 0.00036225697840563953, "learning_rate": 0.00011279069767441861, - "loss": 0.0006, + "loss": 0.0001, "step": 3000 }, { "epoch": 17.441860465116278, - "eval_f1": 0.2560919540229885, - "eval_loss": 0.009051520377397537, - "eval_runtime": 3.3412, - "eval_samples_per_second": 43.397, - "eval_steps_per_second": 5.687, + "eval_f1": 0.2642145593869732, + "eval_loss": 0.011556784622371197, + "eval_runtime": 3.7908, + "eval_samples_per_second": 38.251, + "eval_steps_per_second": 5.012, "step": 3000 }, { "epoch": 17.5, - "grad_norm": 0.002525537507608533, + "grad_norm": 0.0002578358689788729, "learning_rate": 0.00011250000000000001, - "loss": 0.0005, + "loss": 0.0001, "step": 3010 }, { "epoch": 17.558139534883722, - "grad_norm": 0.0011018068762496114, + "grad_norm": 9.179613698506728e-05, "learning_rate": 0.0001122093023255814, - "loss": 0.0005, + "loss": 0.0001, "step": 3020 }, { "epoch": 17.61627906976744, - "grad_norm": 0.002401923295110464, + "grad_norm": 0.0003637939807958901, "learning_rate": 0.00011191860465116279, - "loss": 0.0006, + "loss": 0.0002, "step": 3030 }, { "epoch": 17.674418604651162, - "grad_norm": 0.0011603782186284661, + "grad_norm": 0.00022838416043668985, "learning_rate": 0.00011162790697674419, - "loss": 0.0004, + "loss": 0.0001, "step": 3040 }, { "epoch": 17.732558139534884, - "grad_norm": 0.0023699665907770395, + "grad_norm": 0.00047861726488918066, "learning_rate": 0.00011133720930232559, - "loss": 0.0006, + "loss": 0.0002, "step": 3050 }, { "epoch": 17.790697674418606, - "grad_norm": 0.0024471229407936335, + "grad_norm": 0.0006203922675922513, "learning_rate": 0.00011104651162790699, - "loss": 0.0005, + "loss": 0.0001, "step": 3060 }, { "epoch": 17.848837209302324, - "grad_norm": 0.0009754971251823008, + "grad_norm": 0.00020492461044341326, "learning_rate": 0.00011075581395348837, - "loss": 0.0005, + "loss": 0.0001, "step": 3070 }, { "epoch": 17.906976744186046, - "grad_norm": 0.0014759496552869678, + "grad_norm": 0.0001858137547969818, "learning_rate": 0.00011046511627906977, - "loss": 0.0005, + "loss": 0.0001, "step": 3080 }, { "epoch": 17.96511627906977, - "grad_norm": 0.0020384867675602436, + "grad_norm": 0.0016664748545736074, "learning_rate": 0.00011017441860465117, - "loss": 0.0004, + "loss": 0.0001, "step": 3090 }, { "epoch": 18.023255813953487, - "grad_norm": 0.0012286253040656447, + "grad_norm": 0.000151352200191468, "learning_rate": 0.00010988372093023256, - "loss": 0.0004, + "loss": 0.0001, "step": 3100 }, { "epoch": 18.023255813953487, - "eval_f1": 0.2303831417624521, - "eval_loss": 0.009536550380289555, - "eval_runtime": 3.6693, - "eval_samples_per_second": 39.517, - "eval_steps_per_second": 5.178, + "eval_f1": 0.26248494800218936, + "eval_loss": 0.011899447068572044, + "eval_runtime": 3.5707, + "eval_samples_per_second": 40.608, + "eval_steps_per_second": 5.321, "step": 3100 }, { "epoch": 18.08139534883721, - "grad_norm": 0.0013027406530454755, + "grad_norm": 0.00014296278823167086, "learning_rate": 0.00010959302325581395, - "loss": 0.0004, + "loss": 0.0001, "step": 3110 }, { "epoch": 18.13953488372093, - "grad_norm": 0.0033979248255491257, + "grad_norm": 0.00022280642588157207, "learning_rate": 0.00010930232558139534, - "loss": 0.0005, + "loss": 0.0001, "step": 3120 }, { "epoch": 18.197674418604652, - "grad_norm": 0.0007470336277037859, + "grad_norm": 0.00013984073302708566, "learning_rate": 0.00010901162790697674, - "loss": 0.0004, + "loss": 0.0001, "step": 3130 }, { "epoch": 18.25581395348837, - "grad_norm": 0.0006385065498761833, + "grad_norm": 9.586274245521054e-05, "learning_rate": 0.00010872093023255814, - "loss": 0.0005, + "loss": 0.0001, "step": 3140 }, { "epoch": 18.313953488372093, - "grad_norm": 0.0015766954747959971, + "grad_norm": 0.0002341025392524898, "learning_rate": 0.00010843023255813954, - "loss": 0.0004, + "loss": 0.0001, "step": 3150 }, { "epoch": 18.372093023255815, - "grad_norm": 0.001259420532733202, + "grad_norm": 0.0002666398650035262, "learning_rate": 0.00010813953488372092, - "loss": 0.0005, + "loss": 0.0001, "step": 3160 }, { "epoch": 18.430232558139537, - "grad_norm": 0.001094960025511682, + "grad_norm": 0.0002524552692193538, "learning_rate": 0.00010784883720930232, - "loss": 0.0005, + "loss": 0.0001, "step": 3170 }, { "epoch": 18.488372093023255, - "grad_norm": 0.0031288727186620235, + "grad_norm": 0.0019044369691982865, "learning_rate": 0.00010755813953488372, - "loss": 0.0003, + "loss": 0.0001, "step": 3180 }, { "epoch": 18.546511627906977, - "grad_norm": 0.0017002202803269029, + "grad_norm": 0.00025484306388534606, "learning_rate": 0.00010726744186046512, - "loss": 0.0004, + "loss": 0.0001, "step": 3190 }, { "epoch": 18.6046511627907, - "grad_norm": 0.002515538828447461, + "grad_norm": 0.00027556129498407245, "learning_rate": 0.00010697674418604651, - "loss": 0.0004, + "loss": 0.0001, "step": 3200 }, { "epoch": 18.6046511627907, - "eval_f1": 0.24908045977011495, - "eval_loss": 0.009597817435860634, - "eval_runtime": 3.3234, - "eval_samples_per_second": 43.63, - "eval_steps_per_second": 5.717, + "eval_f1": 0.2714504652435687, + "eval_loss": 0.011928555555641651, + "eval_runtime": 3.7341, + "eval_samples_per_second": 38.831, + "eval_steps_per_second": 5.088, "step": 3200 }, { "epoch": 18.662790697674417, - "grad_norm": 0.0011364705860614777, + "grad_norm": 0.00024047697661444545, "learning_rate": 0.0001066860465116279, - "loss": 0.0005, + "loss": 0.0001, "step": 3210 }, { "epoch": 18.72093023255814, - "grad_norm": 0.0011592620285227895, + "grad_norm": 0.0002139677235390991, "learning_rate": 0.0001063953488372093, - "loss": 0.0005, + "loss": 0.0001, "step": 3220 }, { "epoch": 18.77906976744186, - "grad_norm": 0.0012696533231064677, + "grad_norm": 0.0002612411917652935, "learning_rate": 0.00010610465116279072, - "loss": 0.0005, + "loss": 0.0002, "step": 3230 }, { "epoch": 18.837209302325583, - "grad_norm": 0.0008735220180824399, + "grad_norm": 0.000182965217391029, "learning_rate": 0.0001058139534883721, - "loss": 0.0005, + "loss": 0.0001, "step": 3240 }, { "epoch": 18.8953488372093, - "grad_norm": 0.001498409896157682, + "grad_norm": 0.00022537916083820164, "learning_rate": 0.0001055232558139535, - "loss": 0.0005, + "loss": 0.0002, "step": 3250 }, { "epoch": 18.953488372093023, - "grad_norm": 0.005490511190146208, + "grad_norm": 0.00047821583575569093, "learning_rate": 0.0001052325581395349, - "loss": 0.0005, + "loss": 0.0001, "step": 3260 }, { "epoch": 19.011627906976745, - "grad_norm": 0.0016580373048782349, + "grad_norm": 0.0003564992221072316, "learning_rate": 0.0001049418604651163, - "loss": 0.0004, + "loss": 0.0001, "step": 3270 }, { "epoch": 19.069767441860463, - "grad_norm": 0.0007307511405088007, + "grad_norm": 0.00014158693375065923, "learning_rate": 0.00010465116279069768, - "loss": 0.0003, + "loss": 0.0001, "step": 3280 }, { "epoch": 19.127906976744185, - "grad_norm": 0.0025577195920050144, + "grad_norm": 0.00045683086500503123, "learning_rate": 0.00010436046511627908, - "loss": 0.0004, + "loss": 0.0001, "step": 3290 }, { "epoch": 19.186046511627907, - "grad_norm": 0.0014425559202209115, + "grad_norm": 0.00021988105436321348, "learning_rate": 0.00010406976744186048, - "loss": 0.0004, + "loss": 0.0001, "step": 3300 }, { "epoch": 19.186046511627907, - "eval_f1": 0.26417624521072797, - "eval_loss": 0.00973073486238718, - "eval_runtime": 3.5511, - "eval_samples_per_second": 40.833, - "eval_steps_per_second": 5.35, + "eval_f1": 0.2774274767378216, + "eval_loss": 0.012026965618133545, + "eval_runtime": 3.5203, + "eval_samples_per_second": 41.19, + "eval_steps_per_second": 5.397, "step": 3300 }, { "epoch": 19.24418604651163, - "grad_norm": 0.002455376088619232, + "grad_norm": 0.00031487777596339583, "learning_rate": 0.00010377906976744187, - "loss": 0.0006, + "loss": 0.0001, "step": 3310 }, { "epoch": 19.302325581395348, - "grad_norm": 0.0010040978668257594, + "grad_norm": 0.0002731177373789251, "learning_rate": 0.00010348837209302327, - "loss": 0.0003, + "loss": 0.0001, "step": 3320 }, { "epoch": 19.36046511627907, - "grad_norm": 0.0005100468406453729, + "grad_norm": 0.00014353702135849744, "learning_rate": 0.00010319767441860466, - "loss": 0.0003, + "loss": 0.0001, "step": 3330 }, { "epoch": 19.41860465116279, - "grad_norm": 0.0010542541276663542, + "grad_norm": 0.0009115894790738821, "learning_rate": 0.00010290697674418605, - "loss": 0.0003, + "loss": 0.0001, "step": 3340 }, { "epoch": 19.476744186046513, - "grad_norm": 0.00017168428166769445, + "grad_norm": 4.79189729958307e-05, "learning_rate": 0.00010261627906976745, - "loss": 0.0003, + "loss": 0.0001, "step": 3350 }, { "epoch": 19.53488372093023, - "grad_norm": 0.00021940135047771037, + "grad_norm": 6.471537199104205e-05, "learning_rate": 0.00010232558139534885, - "loss": 0.0003, + "loss": 0.0001, "step": 3360 }, { "epoch": 19.593023255813954, - "grad_norm": 0.000929496600292623, + "grad_norm": 0.00013734422100242227, "learning_rate": 0.00010203488372093025, - "loss": 0.0004, + "loss": 0.0001, "step": 3370 }, { "epoch": 19.651162790697676, - "grad_norm": 0.0022539959754794836, + "grad_norm": 0.0016522855730727315, "learning_rate": 0.00010174418604651163, - "loss": 0.0003, + "loss": 0.0001, "step": 3380 }, { "epoch": 19.709302325581394, - "grad_norm": 0.001599296578206122, + "grad_norm": 0.00024157518055289984, "learning_rate": 0.00010145348837209303, - "loss": 0.0004, + "loss": 0.0001, "step": 3390 }, { "epoch": 19.767441860465116, - "grad_norm": 0.0008732157293707132, + "grad_norm": 0.0001968201104318723, "learning_rate": 0.00010116279069767443, - "loss": 0.0004, + "loss": 0.0001, "step": 3400 }, { "epoch": 19.767441860465116, - "eval_f1": 0.2755719759168035, - "eval_loss": 0.009651989676058292, - "eval_runtime": 3.3686, - "eval_samples_per_second": 43.045, - "eval_steps_per_second": 5.64, + "eval_f1": 0.2886918445539135, + "eval_loss": 0.012111752294003963, + "eval_runtime": 3.5554, + "eval_samples_per_second": 40.783, + "eval_steps_per_second": 5.344, "step": 3400 }, { "epoch": 19.825581395348838, - "grad_norm": 0.0005286884261295199, + "grad_norm": 0.00010204633872490376, "learning_rate": 0.00010087209302325583, - "loss": 0.0003, + "loss": 0.0001, "step": 3410 }, { "epoch": 19.88372093023256, - "grad_norm": 0.00036527440533973277, + "grad_norm": 9.004466119222343e-05, "learning_rate": 0.00010058139534883721, - "loss": 0.0004, + "loss": 0.0001, "step": 3420 }, { "epoch": 19.941860465116278, - "grad_norm": 0.0006188285769894719, + "grad_norm": 0.0001104567272705026, "learning_rate": 0.00010029069767441861, - "loss": 0.0003, + "loss": 0.0001, "step": 3430 }, { "epoch": 20.0, - "grad_norm": 0.002268497832119465, + "grad_norm": 0.00030834207427687943, "learning_rate": 0.0001, - "loss": 0.0003, + "loss": 0.0001, "step": 3440 }, { "epoch": 20.058139534883722, - "grad_norm": 0.0005240673199295998, + "grad_norm": 0.00012030407378915697, "learning_rate": 9.97093023255814e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3450 }, { "epoch": 20.11627906976744, - "grad_norm": 0.001875634421594441, + "grad_norm": 0.0001983350666705519, "learning_rate": 9.94186046511628e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3460 }, { "epoch": 20.174418604651162, - "grad_norm": 0.0009196652099490166, + "grad_norm": 0.00022101357171777636, "learning_rate": 9.912790697674418e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3470 }, { "epoch": 20.232558139534884, - "grad_norm": 0.0006180737400427461, + "grad_norm": 0.00016476133896503597, "learning_rate": 9.883720930232558e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3480 }, { "epoch": 20.290697674418606, - "grad_norm": 0.0007387311779893935, + "grad_norm": 0.00016228470485657454, "learning_rate": 9.854651162790698e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3490 }, { "epoch": 20.348837209302324, - "grad_norm": 0.0013516974868252873, + "grad_norm": 0.00015402429562527686, "learning_rate": 9.825581395348838e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3500 }, { "epoch": 20.348837209302324, - "eval_f1": 0.2661466885604817, - "eval_loss": 0.009747232310473919, - "eval_runtime": 3.4047, - "eval_samples_per_second": 42.589, - "eval_steps_per_second": 5.581, + "eval_f1": 0.2831746031746032, + "eval_loss": 0.012293835170567036, + "eval_runtime": 3.5826, + "eval_samples_per_second": 40.474, + "eval_steps_per_second": 5.303, "step": 3500 }, { "epoch": 20.406976744186046, - "grad_norm": 0.0006762651028111577, + "grad_norm": 0.00015309838636312634, "learning_rate": 9.796511627906976e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3510 }, { "epoch": 20.46511627906977, - "grad_norm": 0.00046472542453557253, + "grad_norm": 0.00039073656080290675, "learning_rate": 9.767441860465116e-05, - "loss": 0.0002, + "loss": 0.0, "step": 3520 }, { "epoch": 20.52325581395349, - "grad_norm": 0.0018592874985188246, + "grad_norm": 0.0015916860429570079, "learning_rate": 9.738372093023256e-05, - "loss": 0.0004, + "loss": 0.0001, "step": 3530 }, { "epoch": 20.58139534883721, - "grad_norm": 0.0012767938897013664, + "grad_norm": 0.000295623583951965, "learning_rate": 9.709302325581396e-05, - "loss": 0.0004, + "loss": 0.0001, "step": 3540 }, { "epoch": 20.63953488372093, - "grad_norm": 0.0018365670694038272, + "grad_norm": 0.0002569795469753444, "learning_rate": 9.680232558139535e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3550 }, { "epoch": 20.697674418604652, - "grad_norm": 0.0003081525210291147, + "grad_norm": 9.455447434447706e-05, "learning_rate": 9.651162790697675e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3560 }, { "epoch": 20.75581395348837, - "grad_norm": 0.0013045326340943575, + "grad_norm": 0.00047062692465260625, "learning_rate": 9.622093023255815e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3570 }, { "epoch": 20.813953488372093, - "grad_norm": 0.0005050916224718094, + "grad_norm": 0.00013552811287809163, "learning_rate": 9.593023255813955e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3580 }, { "epoch": 20.872093023255815, - "grad_norm": 0.0004142239340581, + "grad_norm": 0.00011756927415262908, "learning_rate": 9.563953488372094e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3590 }, { "epoch": 20.930232558139537, - "grad_norm": 0.0017503146082162857, + "grad_norm": 0.0014577241381630301, "learning_rate": 9.534883720930233e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3600 }, { "epoch": 20.930232558139537, - "eval_f1": 0.2664750957854406, - "eval_loss": 0.01000593975186348, - "eval_runtime": 3.3798, - "eval_samples_per_second": 42.901, - "eval_steps_per_second": 5.622, + "eval_f1": 0.26915161466885607, + "eval_loss": 0.01241295225918293, + "eval_runtime": 3.6158, + "eval_samples_per_second": 40.102, + "eval_steps_per_second": 5.255, "step": 3600 }, { "epoch": 20.988372093023255, - "grad_norm": 0.000552505545783788, + "grad_norm": 0.00012101033644285053, "learning_rate": 9.505813953488373e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3610 }, { "epoch": 21.046511627906977, - "grad_norm": 0.0024412046186625957, + "grad_norm": 0.00022719320259056985, "learning_rate": 9.476744186046512e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3620 }, { "epoch": 21.1046511627907, - "grad_norm": 0.00036216064472682774, + "grad_norm": 0.00010675326484488323, "learning_rate": 9.447674418604652e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3630 }, { "epoch": 21.162790697674417, - "grad_norm": 0.001324295997619629, + "grad_norm": 0.00024110461527016014, "learning_rate": 9.418604651162792e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3640 }, { "epoch": 21.22093023255814, - "grad_norm": 0.0006499919691123068, + "grad_norm": 0.00012969542876817286, "learning_rate": 9.38953488372093e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3650 }, { "epoch": 21.27906976744186, - "grad_norm": 0.0006810969207435846, + "grad_norm": 0.00018064751930069178, "learning_rate": 9.36046511627907e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3660 }, { "epoch": 21.337209302325583, - "grad_norm": 0.0007499000639654696, + "grad_norm": 0.0002021090331254527, "learning_rate": 9.33139534883721e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3670 }, { "epoch": 21.3953488372093, - "grad_norm": 0.0006736308569088578, + "grad_norm": 0.00024396360095124692, "learning_rate": 9.30232558139535e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3680 }, { "epoch": 21.453488372093023, - "grad_norm": 0.0003619794442784041, + "grad_norm": 8.181745215551928e-05, "learning_rate": 9.273255813953488e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3690 }, { "epoch": 21.511627906976745, - "grad_norm": 0.0010090672876685858, + "grad_norm": 0.00021163842757232487, "learning_rate": 9.244186046511628e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3700 }, { "epoch": 21.511627906976745, - "eval_f1": 0.26578544061302684, - "eval_loss": 0.010013419203460217, - "eval_runtime": 3.4061, - "eval_samples_per_second": 42.571, - "eval_steps_per_second": 5.578, + "eval_f1": 0.276048166392994, + "eval_loss": 0.012478708289563656, + "eval_runtime": 3.4797, + "eval_samples_per_second": 41.67, + "eval_steps_per_second": 5.46, "step": 3700 }, { "epoch": 21.569767441860463, - "grad_norm": 0.0005667731165885925, + "grad_norm": 0.0001295630499953404, "learning_rate": 9.215116279069768e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3710 }, { "epoch": 21.627906976744185, - "grad_norm": 0.0005890413303859532, + "grad_norm": 0.00010119118087459356, "learning_rate": 9.186046511627907e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3720 }, { "epoch": 21.686046511627907, - "grad_norm": 0.0005061826086603105, + "grad_norm": 0.00012852686631958932, "learning_rate": 9.156976744186047e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3730 }, { "epoch": 21.74418604651163, - "grad_norm": 0.0006897013518027961, + "grad_norm": 0.00018867886683437973, "learning_rate": 9.127906976744186e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3740 }, { "epoch": 21.802325581395348, - "grad_norm": 0.0003011000226251781, + "grad_norm": 0.0001046601973939687, "learning_rate": 9.098837209302325e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3750 }, { "epoch": 21.86046511627907, - "grad_norm": 0.0015246364055201411, + "grad_norm": 0.0014569408958777785, "learning_rate": 9.069767441860465e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3760 }, { "epoch": 21.91860465116279, - "grad_norm": 0.0008636050042696297, + "grad_norm": 6.906556518515572e-05, "learning_rate": 9.040697674418606e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3770 }, { "epoch": 21.97674418604651, - "grad_norm": 0.0008527491590939462, + "grad_norm": 0.0003369758778717369, "learning_rate": 9.011627906976745e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3780 }, { "epoch": 22.03488372093023, - "grad_norm": 0.0007409742684103549, + "grad_norm": 0.00020307957311160862, "learning_rate": 8.982558139534884e-05, - "loss": 0.0002, + "loss": 0.0, "step": 3790 }, { "epoch": 22.093023255813954, - "grad_norm": 0.0004994486225768924, + "grad_norm": 7.845911022741348e-05, "learning_rate": 8.953488372093024e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3800 }, { "epoch": 22.093023255813954, - "eval_f1": 0.2687739463601532, - "eval_loss": 0.010326522402465343, - "eval_runtime": 3.5415, - "eval_samples_per_second": 40.943, - "eval_steps_per_second": 5.365, + "eval_f1": 0.2693814997263273, + "eval_loss": 0.012578567489981651, + "eval_runtime": 3.539, + "eval_samples_per_second": 40.972, + "eval_steps_per_second": 5.369, "step": 3800 }, { "epoch": 22.151162790697676, - "grad_norm": 0.0016253129579126835, + "grad_norm": 0.000257166480878368, "learning_rate": 8.924418604651164e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3810 }, { "epoch": 22.209302325581394, - "grad_norm": 0.001818611053749919, + "grad_norm": 0.0014932604972273111, "learning_rate": 8.895348837209302e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3820 }, { "epoch": 22.267441860465116, - "grad_norm": 0.0006514011765830219, + "grad_norm": 0.00015260590589605272, "learning_rate": 8.866279069767442e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3830 }, { "epoch": 22.325581395348838, - "grad_norm": 0.0004353756958153099, + "grad_norm": 0.00015176714805420488, "learning_rate": 8.837209302325582e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3840 }, { "epoch": 22.38372093023256, - "grad_norm": 0.0003122685884591192, + "grad_norm": 0.00010105521505465731, "learning_rate": 8.808139534883722e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3850 }, { "epoch": 22.441860465116278, - "grad_norm": 0.0006704569677822292, + "grad_norm": 0.00016976063488982618, "learning_rate": 8.779069767441861e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3860 }, { "epoch": 22.5, - "grad_norm": 0.0002993652888108045, + "grad_norm": 0.00014910796016920358, "learning_rate": 8.75e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3870 }, { "epoch": 22.558139534883722, - "grad_norm": 0.0010453565046191216, + "grad_norm": 0.0002738297334872186, "learning_rate": 8.72093023255814e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3880 }, { "epoch": 22.61627906976744, - "grad_norm": 0.00024619605392217636, + "grad_norm": 8.589241770096123e-05, "learning_rate": 8.69186046511628e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3890 }, { "epoch": 22.674418604651162, - "grad_norm": 0.00045620821765623987, + "grad_norm": 0.0001273662637686357, "learning_rate": 8.662790697674419e-05, - "loss": 0.0002, + "loss": 0.0, "step": 3900 }, { "epoch": 22.674418604651162, - "eval_f1": 0.2768199233716475, - "eval_loss": 0.010496433824300766, - "eval_runtime": 3.4145, - "eval_samples_per_second": 42.466, - "eval_steps_per_second": 5.565, + "eval_f1": 0.27168035030103993, + "eval_loss": 0.012721100822091103, + "eval_runtime": 4.0027, + "eval_samples_per_second": 36.226, + "eval_steps_per_second": 4.747, "step": 3900 }, { "epoch": 22.732558139534884, - "grad_norm": 0.0002549287164583802, + "grad_norm": 8.572284423280507e-05, "learning_rate": 8.633720930232559e-05, - "loss": 0.0002, + "loss": 0.0, "step": 3910 }, { "epoch": 22.790697674418606, - "grad_norm": 0.00029945597634650767, + "grad_norm": 7.511546573368832e-05, "learning_rate": 8.604651162790697e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3920 }, { "epoch": 22.848837209302324, - "grad_norm": 0.0004562933463603258, + "grad_norm": 0.00011926183651667088, "learning_rate": 8.575581395348837e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3930 }, { "epoch": 22.906976744186046, - "grad_norm": 0.0006230974686332047, + "grad_norm": 0.00020073799532838166, "learning_rate": 8.546511627906977e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3940 }, { "epoch": 22.96511627906977, - "grad_norm": 0.0004207508754916489, + "grad_norm": 0.00010419807949801907, "learning_rate": 8.517441860465117e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 3950 }, { "epoch": 23.023255813953487, - "grad_norm": 0.0014452919131144881, + "grad_norm": 0.0011627307394519448, "learning_rate": 8.488372093023255e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 3960 }, { "epoch": 23.08139534883721, - "grad_norm": 0.000655371870379895, + "grad_norm": 0.00019352821982465684, "learning_rate": 8.459302325581395e-05, - "loss": 0.0001, + "loss": 0.0, "step": 3970 }, { "epoch": 23.13953488372093, - "grad_norm": 0.0008183548925444484, + "grad_norm": 0.0007782498141750693, "learning_rate": 8.430232558139536e-05, "loss": 0.0001, "step": 3980 }, { "epoch": 23.197674418604652, - "grad_norm": 0.000937555159907788, + "grad_norm": 0.0002046496229013428, "learning_rate": 8.401162790697676e-05, - "loss": 0.0002, + "loss": 0.0, "step": 3990 }, { "epoch": 23.25581395348837, - "grad_norm": 0.0018412956269457936, + "grad_norm": 0.0015353452181443572, "learning_rate": 8.372093023255814e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4000 }, { "epoch": 23.25581395348837, - "eval_f1": 0.2664750957854406, - "eval_loss": 0.010493006557226181, - "eval_runtime": 3.4525, - "eval_samples_per_second": 41.999, - "eval_steps_per_second": 5.503, + "eval_f1": 0.2657033388067871, + "eval_loss": 0.012782420963048935, + "eval_runtime": 3.5689, + "eval_samples_per_second": 40.629, + "eval_steps_per_second": 5.324, "step": 4000 }, { "epoch": 23.313953488372093, - "grad_norm": 0.0006483742035925388, + "grad_norm": 0.00027609398239292204, "learning_rate": 8.343023255813954e-05, - "loss": 0.0003, + "loss": 0.0001, "step": 4010 }, { "epoch": 23.372093023255815, - "grad_norm": 0.000587189628276974, + "grad_norm": 0.00021070419461466372, "learning_rate": 8.313953488372094e-05, "loss": 0.0001, "step": 4020 }, { "epoch": 23.430232558139537, - "grad_norm": 0.0002649370289873332, + "grad_norm": 9.023152233567089e-05, "learning_rate": 8.284883720930234e-05, - "loss": 0.0002, + "loss": 0.0, "step": 4030 }, { "epoch": 23.488372093023255, - "grad_norm": 0.0020568666514009237, + "grad_norm": 0.0018169673858210444, "learning_rate": 8.255813953488373e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4040 }, { "epoch": 23.546511627906977, - "grad_norm": 0.0002564095484558493, + "grad_norm": 7.264404121087864e-05, "learning_rate": 8.226744186046512e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4050 }, { "epoch": 23.6046511627907, - "grad_norm": 0.0011838797945529222, + "grad_norm": 0.00016047897224780172, "learning_rate": 8.197674418604652e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4060 }, { "epoch": 23.662790697674417, - "grad_norm": 0.0003872521629091352, + "grad_norm": 0.0001079871944966726, "learning_rate": 8.168604651162791e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4070 }, { "epoch": 23.72093023255814, - "grad_norm": 0.0005001472891308367, + "grad_norm": 0.0001797958684619516, "learning_rate": 8.139534883720931e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4080 }, { "epoch": 23.77906976744186, - "grad_norm": 0.0005937547539360821, + "grad_norm": 0.0001625343138584867, "learning_rate": 8.11046511627907e-05, - "loss": 0.0002, + "loss": 0.0, "step": 4090 }, { "epoch": 23.837209302325583, - "grad_norm": 0.0002876650250982493, + "grad_norm": 0.0001472539152018726, "learning_rate": 8.081395348837209e-05, - "loss": 0.0002, + "loss": 0.0, "step": 4100 }, { "epoch": 23.837209302325583, - "eval_f1": 0.27337164750957854, - "eval_loss": 0.010660259984433651, - "eval_runtime": 3.3902, - "eval_samples_per_second": 42.771, - "eval_steps_per_second": 5.604, + "eval_f1": 0.2852435686918446, + "eval_loss": 0.012909590266644955, + "eval_runtime": 3.7094, + "eval_samples_per_second": 39.09, + "eval_steps_per_second": 5.122, "step": 4100 }, { "epoch": 23.8953488372093, - "grad_norm": 0.0004182937555015087, + "grad_norm": 9.219853382091969e-05, "learning_rate": 8.052325581395349e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4110 }, { "epoch": 23.953488372093023, - "grad_norm": 0.0002867088478524238, + "grad_norm": 5.5808886827435344e-05, "learning_rate": 8.023255813953489e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4120 }, { "epoch": 24.011627906976745, - "grad_norm": 0.0002801069349516183, + "grad_norm": 9.610240522306412e-05, "learning_rate": 7.994186046511629e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4130 }, { "epoch": 24.069767441860463, - "grad_norm": 0.00038150488398969173, + "grad_norm": 0.00012580392649397254, "learning_rate": 7.965116279069767e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4140 }, { "epoch": 24.127906976744185, - "grad_norm": 0.00017597964324522763, + "grad_norm": 7.413386629195884e-05, "learning_rate": 7.936046511627907e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4150 }, { "epoch": 24.186046511627907, - "grad_norm": 0.001368246623314917, + "grad_norm": 0.0012770950561389327, "learning_rate": 7.906976744186047e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4160 }, { "epoch": 24.24418604651163, - "grad_norm": 0.00013013397983741015, + "grad_norm": 4.0335857192985713e-05, "learning_rate": 7.877906976744186e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4170 }, { "epoch": 24.302325581395348, - "grad_norm": 0.00026110088219866157, + "grad_norm": 8.064441499300301e-05, "learning_rate": 7.848837209302326e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4180 }, { "epoch": 24.36046511627907, - "grad_norm": 0.00030333283939398825, + "grad_norm": 0.00010314686369383708, "learning_rate": 7.819767441860465e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4190 }, { "epoch": 24.41860465116279, - "grad_norm": 0.00038482973468489945, + "grad_norm": 0.00018079935398418456, "learning_rate": 7.790697674418606e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4200 }, { "epoch": 24.41860465116279, - "eval_f1": 0.276360153256705, - "eval_loss": 0.01084720715880394, - "eval_runtime": 3.4671, - "eval_samples_per_second": 41.822, - "eval_steps_per_second": 5.48, + "eval_f1": 0.27834701696770664, + "eval_loss": 0.013003015890717506, + "eval_runtime": 3.4732, + "eval_samples_per_second": 41.749, + "eval_steps_per_second": 5.47, "step": 4200 }, { "epoch": 24.476744186046513, - "grad_norm": 0.00028552854200825095, + "grad_norm": 8.888553566066548e-05, "learning_rate": 7.761627906976745e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4210 }, { "epoch": 24.53488372093023, - "grad_norm": 0.00015695330512244254, + "grad_norm": 5.777809565188363e-05, "learning_rate": 7.732558139534884e-05, "loss": 0.0001, "step": 4220 }, { "epoch": 24.593023255813954, - "grad_norm": 0.0002479999966453761, + "grad_norm": 7.774583355057985e-05, "learning_rate": 7.703488372093024e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4230 }, { "epoch": 24.651162790697676, - "grad_norm": 0.0023114762734621763, + "grad_norm": 0.002446948317810893, "learning_rate": 7.674418604651163e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4240 }, { "epoch": 24.709302325581394, - "grad_norm": 0.00013815987040288746, + "grad_norm": 4.1072496969718486e-05, "learning_rate": 7.645348837209303e-05, "loss": 0.0001, "step": 4250 }, { "epoch": 24.767441860465116, - "grad_norm": 0.0004795648856088519, + "grad_norm": 0.00013745592150371522, "learning_rate": 7.616279069767443e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4260 }, { "epoch": 24.825581395348838, - "grad_norm": 0.0016045981319621205, + "grad_norm": 0.00012999869068153203, "learning_rate": 7.587209302325581e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4270 }, { "epoch": 24.88372093023256, - "grad_norm": 0.0002838395012076944, + "grad_norm": 9.823262371355668e-05, "learning_rate": 7.558139534883721e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4280 }, { "epoch": 24.941860465116278, - "grad_norm": 0.00034605650580488145, + "grad_norm": 0.00010892364662140608, "learning_rate": 7.529069767441861e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4290 }, { "epoch": 25.0, - "grad_norm": 0.0027031004428863525, + "grad_norm": 0.0023911795578897, "learning_rate": 7.500000000000001e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4300 }, { "epoch": 25.0, - "eval_f1": 0.27314176245210725, - "eval_loss": 0.010902789421379566, - "eval_runtime": 3.4699, - "eval_samples_per_second": 41.788, - "eval_steps_per_second": 5.476, + "eval_f1": 0.28984126984126984, + "eval_loss": 0.013028619810938835, + "eval_runtime": 3.7188, + "eval_samples_per_second": 38.991, + "eval_steps_per_second": 5.109, "step": 4300 }, { "epoch": 25.058139534883722, - "grad_norm": 0.00046942729386501014, + "grad_norm": 0.00017118205141741782, "learning_rate": 7.47093023255814e-05, "loss": 0.0001, "step": 4310 }, { "epoch": 25.11627906976744, - "grad_norm": 0.00040583638474345207, + "grad_norm": 0.00012128344678785652, "learning_rate": 7.441860465116279e-05, "loss": 0.0001, "step": 4320 }, { "epoch": 25.174418604651162, - "grad_norm": 0.00019362822058610618, + "grad_norm": 7.591600297018886e-05, "learning_rate": 7.412790697674419e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4330 }, { "epoch": 25.232558139534884, - "grad_norm": 0.0007580669480375946, + "grad_norm": 0.0001722633169265464, "learning_rate": 7.383720930232558e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4340 }, { "epoch": 25.290697674418606, - "grad_norm": 0.00015401588461827487, + "grad_norm": 5.6696524552535266e-05, "learning_rate": 7.354651162790698e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4350 }, { "epoch": 25.348837209302324, - "grad_norm": 0.00041559862438589334, + "grad_norm": 0.00011566980538191274, "learning_rate": 7.325581395348837e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4360 }, { "epoch": 25.406976744186046, - "grad_norm": 0.00041440289351157844, + "grad_norm": 0.00012127365334890783, "learning_rate": 7.296511627906976e-05, "loss": 0.0001, "step": 4370 }, { "epoch": 25.46511627906977, - "grad_norm": 0.00021292344899848104, + "grad_norm": 6.409850175259635e-05, "learning_rate": 7.267441860465116e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4380 }, { "epoch": 25.52325581395349, - "grad_norm": 0.00023520820832345635, + "grad_norm": 7.475136226275936e-05, "learning_rate": 7.238372093023256e-05, "loss": 0.0001, "step": 4390 }, { "epoch": 25.58139534883721, - "grad_norm": 0.00023186506587080657, + "grad_norm": 8.108514157356694e-05, "learning_rate": 7.209302325581396e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4400 }, { "epoch": 25.58139534883721, - "eval_f1": 0.2745210727969349, - "eval_loss": 0.010938283056020737, - "eval_runtime": 3.5549, - "eval_samples_per_second": 40.789, - "eval_steps_per_second": 5.345, + "eval_f1": 0.2863929939792009, + "eval_loss": 0.01313122920691967, + "eval_runtime": 3.5504, + "eval_samples_per_second": 40.84, + "eval_steps_per_second": 5.351, "step": 4400 }, { "epoch": 25.63953488372093, - "grad_norm": 0.00018147336959373206, + "grad_norm": 6.366455636452883e-05, "learning_rate": 7.180232558139535e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4410 }, { "epoch": 25.697674418604652, - "grad_norm": 0.00047183968126773834, + "grad_norm": 0.00016111049626488239, "learning_rate": 7.151162790697675e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4420 }, { "epoch": 25.75581395348837, - "grad_norm": 0.0002894249919336289, + "grad_norm": 0.00010021917114499956, "learning_rate": 7.122093023255815e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4430 }, { "epoch": 25.813953488372093, - "grad_norm": 0.00034714362118393183, + "grad_norm": 0.00013560970546677709, "learning_rate": 7.093023255813955e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4440 }, { "epoch": 25.872093023255815, - "grad_norm": 0.00033856811933219433, + "grad_norm": 0.00011311213893350214, "learning_rate": 7.063953488372093e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4450 }, { "epoch": 25.930232558139537, - "grad_norm": 0.0001298709976254031, + "grad_norm": 4.385941429063678e-05, "learning_rate": 7.034883720930233e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4460 }, { "epoch": 25.988372093023255, - "grad_norm": 0.0003549143730197102, + "grad_norm": 0.00011054795322706923, "learning_rate": 7.005813953488373e-05, "loss": 0.0001, "step": 4470 }, { "epoch": 26.046511627906977, - "grad_norm": 0.00040327481110580266, + "grad_norm": 0.0001339893351541832, "learning_rate": 6.976744186046513e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4480 }, { "epoch": 26.1046511627907, - "grad_norm": 0.00033235581940971315, + "grad_norm": 0.00012086863716831431, "learning_rate": 6.947674418604651e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4490 }, { "epoch": 26.162790697674417, - "grad_norm": 0.0005312968278303742, + "grad_norm": 0.00015321826504077762, "learning_rate": 6.918604651162791e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4500 }, { "epoch": 26.162790697674417, - "eval_f1": 0.27498084291187735, - "eval_loss": 0.011064889840781689, - "eval_runtime": 3.5705, - "eval_samples_per_second": 40.611, - "eval_steps_per_second": 5.321, + "eval_f1": 0.28616310892172964, + "eval_loss": 0.013169930316507816, + "eval_runtime": 3.6384, + "eval_samples_per_second": 39.853, + "eval_steps_per_second": 5.222, "step": 4500 }, { "epoch": 26.22093023255814, - "grad_norm": 0.00014046816795598716, + "grad_norm": 5.1164453907404095e-05, "learning_rate": 6.88953488372093e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4510 }, { "epoch": 26.27906976744186, - "grad_norm": 0.00020187173504382372, + "grad_norm": 8.028713637031615e-05, "learning_rate": 6.86046511627907e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4520 }, { "epoch": 26.337209302325583, - "grad_norm": 0.0005311329150572419, + "grad_norm": 0.00014986025053076446, "learning_rate": 6.83139534883721e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4530 }, { "epoch": 26.3953488372093, - "grad_norm": 0.0016504917293787003, + "grad_norm": 0.0015762551920488477, "learning_rate": 6.802325581395348e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4540 }, { "epoch": 26.453488372093023, - "grad_norm": 0.0002207654615631327, + "grad_norm": 7.128499419195578e-05, "learning_rate": 6.773255813953488e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4550 }, { "epoch": 26.511627906976745, - "grad_norm": 0.00021439451666083187, + "grad_norm": 7.312518573598936e-05, "learning_rate": 6.744186046511628e-05, "loss": 0.0001, "step": 4560 }, { "epoch": 26.569767441860463, - "grad_norm": 0.00018846942111849785, + "grad_norm": 7.20562384231016e-05, "learning_rate": 6.715116279069768e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4570 }, { "epoch": 26.627906976744185, - "grad_norm": 0.00017869797011371702, + "grad_norm": 6.213824963197112e-05, "learning_rate": 6.686046511627908e-05, "loss": 0.0001, "step": 4580 }, { "epoch": 26.686046511627907, - "grad_norm": 0.0001802190236048773, + "grad_norm": 7.234533404698595e-05, "learning_rate": 6.656976744186046e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4590 }, { "epoch": 26.74418604651163, - "grad_norm": 0.00015876081306487322, + "grad_norm": 5.657358997268602e-05, "learning_rate": 6.627906976744186e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4600 }, { "epoch": 26.74418604651163, - "eval_f1": 0.2839463601532567, - "eval_loss": 0.011096668429672718, - "eval_runtime": 3.7437, - "eval_samples_per_second": 38.732, - "eval_steps_per_second": 5.075, + "eval_f1": 0.2965079365079365, + "eval_loss": 0.013265942223370075, + "eval_runtime": 3.6331, + "eval_samples_per_second": 39.911, + "eval_steps_per_second": 5.23, "step": 4600 }, { "epoch": 26.802325581395348, - "grad_norm": 0.0013521353248506784, + "grad_norm": 0.0011341801146045327, "learning_rate": 6.598837209302326e-05, "loss": 0.0001, "step": 4610 }, { "epoch": 26.86046511627907, - "grad_norm": 0.0003753627242986113, + "grad_norm": 0.00013208380551077425, "learning_rate": 6.569767441860465e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4620 }, { "epoch": 26.91860465116279, - "grad_norm": 0.00030890744528733194, + "grad_norm": 0.00013013571151532233, "learning_rate": 6.540697674418605e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4630 }, { "epoch": 26.97674418604651, - "grad_norm": 0.0020816426258534193, + "grad_norm": 0.001869815867394209, "learning_rate": 6.511627906976745e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4640 }, { "epoch": 27.03488372093023, - "grad_norm": 0.00025410964735783637, + "grad_norm": 0.0001128123258240521, "learning_rate": 6.482558139534885e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4650 }, { "epoch": 27.093023255813954, - "grad_norm": 7.904416997916996e-05, + "grad_norm": 2.7133073672303e-05, "learning_rate": 6.453488372093024e-05, "loss": 0.0001, "step": 4660 }, { "epoch": 27.151162790697676, - "grad_norm": 0.0003131863777525723, + "grad_norm": 0.00010893656872212887, "learning_rate": 6.424418604651163e-05, "loss": 0.0001, "step": 4670 }, { "epoch": 27.209302325581394, - "grad_norm": 0.00033080356661230326, + "grad_norm": 0.00013369403313845396, "learning_rate": 6.395348837209303e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4680 }, { "epoch": 27.267441860465116, - "grad_norm": 0.0003155465819872916, + "grad_norm": 9.277948265662417e-05, "learning_rate": 6.366279069767442e-05, "loss": 0.0001, "step": 4690 }, { "epoch": 27.325581395348838, - "grad_norm": 0.00019863164925482124, + "grad_norm": 7.702650327701122e-05, "learning_rate": 6.337209302325582e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4700 }, { "epoch": 27.325581395348838, - "eval_f1": 0.276360153256705, - "eval_loss": 0.011220867745578289, - "eval_runtime": 3.7066, - "eval_samples_per_second": 39.12, - "eval_steps_per_second": 5.126, + "eval_f1": 0.2921401204159825, + "eval_loss": 0.013389009051024914, + "eval_runtime": 3.4914, + "eval_samples_per_second": 41.53, + "eval_steps_per_second": 5.442, "step": 4700 }, { "epoch": 27.38372093023256, - "grad_norm": 0.0010860268957912922, + "grad_norm": 0.00017988786567002535, "learning_rate": 6.308139534883722e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4710 }, { "epoch": 27.441860465116278, - "grad_norm": 0.0003841882571578026, + "grad_norm": 0.00014185108011588454, "learning_rate": 6.27906976744186e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4720 }, { "epoch": 27.5, - "grad_norm": 0.00014041626127436757, + "grad_norm": 4.6865457989042625e-05, "learning_rate": 6.25e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4730 }, { "epoch": 27.558139534883722, - "grad_norm": 0.00016446378140244633, + "grad_norm": 7.037931209197268e-05, "learning_rate": 6.22093023255814e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4740 }, { "epoch": 27.61627906976744, - "grad_norm": 0.00015792602789588273, + "grad_norm": 6.061540261725895e-05, "learning_rate": 6.19186046511628e-05, "loss": 0.0001, "step": 4750 }, { "epoch": 27.674418604651162, - "grad_norm": 0.00044839963084086776, + "grad_norm": 0.00017687020590528846, "learning_rate": 6.162790697674418e-05, "loss": 0.0001, "step": 4760 }, { "epoch": 27.732558139534884, - "grad_norm": 0.00031541698263026774, + "grad_norm": 0.00010895613377215341, "learning_rate": 6.133720930232558e-05, "loss": 0.0001, "step": 4770 }, { "epoch": 27.790697674418606, - "grad_norm": 0.00018006614118348807, + "grad_norm": 6.3329964177683e-05, "learning_rate": 6.104651162790698e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4780 }, { "epoch": 27.848837209302324, - "grad_norm": 0.0002560662687756121, + "grad_norm": 8.614605758339167e-05, "learning_rate": 6.0755813953488374e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4790 }, { "epoch": 27.906976744186046, - "grad_norm": 0.0001889288832899183, + "grad_norm": 6.203014345373958e-05, "learning_rate": 6.0465116279069765e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4800 }, { "epoch": 27.906976744186046, - "eval_f1": 0.27544061302681994, - "eval_loss": 0.011338108219206333, - "eval_runtime": 3.718, - "eval_samples_per_second": 38.999, - "eval_steps_per_second": 5.11, + "eval_f1": 0.2840941434044882, + "eval_loss": 0.013416824862360954, + "eval_runtime": 3.6126, + "eval_samples_per_second": 40.138, + "eval_steps_per_second": 5.259, "step": 4800 }, { "epoch": 27.96511627906977, - "grad_norm": 0.00040978859760798514, + "grad_norm": 0.00010886503878282383, "learning_rate": 6.017441860465116e-05, "loss": 0.0001, "step": 4810 }, { "epoch": 28.023255813953487, - "grad_norm": 0.00017055953503586352, + "grad_norm": 7.100569928297773e-05, "learning_rate": 5.9883720930232554e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4820 }, { "epoch": 28.08139534883721, - "grad_norm": 0.0002564933674875647, + "grad_norm": 9.883446909952909e-05, "learning_rate": 5.959302325581395e-05, "loss": 0.0001, "step": 4830 }, { "epoch": 28.13953488372093, - "grad_norm": 0.00017605409084353596, + "grad_norm": 6.230935832718387e-05, "learning_rate": 5.9302325581395356e-05, "loss": 0.0001, "step": 4840 }, { "epoch": 28.197674418604652, - "grad_norm": 0.0008650438976474106, + "grad_norm": 0.00013418818707577884, "learning_rate": 5.9011627906976754e-05, "loss": 0.0001, "step": 4850 }, { "epoch": 28.25581395348837, - "grad_norm": 0.00019766090554185212, + "grad_norm": 8.271815750049427e-05, "learning_rate": 5.8720930232558145e-05, "loss": 0.0001, "step": 4860 }, { "epoch": 28.313953488372093, - "grad_norm": 0.00017726205987855792, + "grad_norm": 7.025552622508258e-05, "learning_rate": 5.843023255813954e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4870 }, { "epoch": 28.372093023255815, - "grad_norm": 0.00017238754662685096, + "grad_norm": 5.5301345128100365e-05, "learning_rate": 5.8139534883720933e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4880 }, { "epoch": 28.430232558139537, - "grad_norm": 0.0003975199651904404, + "grad_norm": 0.00013681758719030768, "learning_rate": 5.784883720930233e-05, "loss": 0.0001, "step": 4890 }, { "epoch": 28.488372093023255, - "grad_norm": 0.00012141514162067324, + "grad_norm": 4.886011083726771e-05, "learning_rate": 5.755813953488373e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4900 }, { "epoch": 28.488372093023255, - "eval_f1": 0.28164750957854406, - "eval_loss": 0.011380935087800026, - "eval_runtime": 3.4367, - "eval_samples_per_second": 42.192, - "eval_steps_per_second": 5.529, + "eval_f1": 0.28961138478379855, + "eval_loss": 0.013477078638970852, + "eval_runtime": 3.4779, + "eval_samples_per_second": 41.692, + "eval_steps_per_second": 5.463, "step": 4900 }, { "epoch": 28.546511627906977, - "grad_norm": 0.000261383451288566, + "grad_norm": 9.383865108247846e-05, "learning_rate": 5.726744186046512e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4910 }, { "epoch": 28.6046511627907, - "grad_norm": 0.00027831582701765, + "grad_norm": 0.00010642106644809246, "learning_rate": 5.697674418604652e-05, "loss": 0.0001, "step": 4920 }, { "epoch": 28.662790697674417, - "grad_norm": 0.00019040559709537774, + "grad_norm": 6.404238956747577e-05, "learning_rate": 5.668604651162791e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4930 }, { "epoch": 28.72093023255814, - "grad_norm": 9.32104085222818e-05, + "grad_norm": 3.364186341059394e-05, "learning_rate": 5.6395348837209306e-05, "loss": 0.0001, "step": 4940 }, { "epoch": 28.77906976744186, - "grad_norm": 0.0001667519100010395, + "grad_norm": 6.766904698451981e-05, "learning_rate": 5.61046511627907e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 4950 }, { "epoch": 28.837209302325583, - "grad_norm": 0.0001531678281025961, + "grad_norm": 5.329645500751212e-05, "learning_rate": 5.5813953488372095e-05, "loss": 0.0001, "step": 4960 }, { "epoch": 28.8953488372093, - "grad_norm": 0.00017712761473376304, + "grad_norm": 5.700432666344568e-05, "learning_rate": 5.552325581395349e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4970 }, { "epoch": 28.953488372093023, - "grad_norm": 0.0004892111173830926, + "grad_norm": 0.00016557041089981794, "learning_rate": 5.5232558139534884e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4980 }, { "epoch": 29.011627906976745, - "grad_norm": 8.356086618732661e-05, + "grad_norm": 3.3288117265328765e-05, "learning_rate": 5.494186046511628e-05, - "loss": 0.0001, + "loss": 0.0, "step": 4990 }, { "epoch": 29.069767441860463, - "grad_norm": 0.0002602541644591838, + "grad_norm": 0.00010683518485166132, "learning_rate": 5.465116279069767e-05, "loss": 0.0001, "step": 5000 }, { "epoch": 29.069767441860463, - "eval_f1": 0.28164750957854406, - "eval_loss": 0.01150429341942072, - "eval_runtime": 3.6516, - "eval_samples_per_second": 39.709, - "eval_steps_per_second": 5.203, + "eval_f1": 0.28294471811713195, + "eval_loss": 0.013584722764790058, + "eval_runtime": 3.629, + "eval_samples_per_second": 39.956, + "eval_steps_per_second": 5.236, "step": 5000 }, { "epoch": 29.127906976744185, - "grad_norm": 0.00018598516180645674, + "grad_norm": 7.567764259874821e-05, "learning_rate": 5.436046511627907e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5010 }, { "epoch": 29.186046511627907, - "grad_norm": 9.54332499532029e-05, + "grad_norm": 3.903659671777859e-05, "learning_rate": 5.406976744186046e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5020 }, { "epoch": 29.24418604651163, - "grad_norm": 8.306506060762331e-05, + "grad_norm": 3.379717963980511e-05, "learning_rate": 5.377906976744186e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5030 }, { "epoch": 29.302325581395348, - "grad_norm": 0.0002054049982689321, + "grad_norm": 7.912916771601886e-05, "learning_rate": 5.348837209302326e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5040 }, { "epoch": 29.36046511627907, - "grad_norm": 0.00017273312550969422, + "grad_norm": 7.148746954044327e-05, "learning_rate": 5.319767441860465e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5050 }, { "epoch": 29.41860465116279, - "grad_norm": 0.001203768653795123, + "grad_norm": 0.0011785589158535004, "learning_rate": 5.290697674418605e-05, "loss": 0.0001, "step": 5060 }, { "epoch": 29.476744186046513, - "grad_norm": 0.0013083290541544557, + "grad_norm": 0.0013802028261125088, "learning_rate": 5.261627906976745e-05, "loss": 0.0001, "step": 5070 }, { "epoch": 29.53488372093023, - "grad_norm": 0.00020865008991677314, + "grad_norm": 8.99569786270149e-05, "learning_rate": 5.232558139534884e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5080 }, { "epoch": 29.593023255813954, - "grad_norm": 0.00016767044144216925, + "grad_norm": 5.34497266926337e-05, "learning_rate": 5.203488372093024e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5090 }, { "epoch": 29.651162790697676, - "grad_norm": 0.00039240618934854865, + "grad_norm": 0.00012180486373836175, "learning_rate": 5.1744186046511636e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 5100 }, { "epoch": 29.651162790697676, - "eval_f1": 0.2793486590038314, - "eval_loss": 0.011618364602327347, - "eval_runtime": 3.413, - "eval_samples_per_second": 42.484, - "eval_steps_per_second": 5.567, + "eval_f1": 0.2863929939792009, + "eval_loss": 0.013599464669823647, + "eval_runtime": 3.5135, + "eval_samples_per_second": 41.269, + "eval_steps_per_second": 5.408, "step": 5100 }, { "epoch": 29.709302325581394, - "grad_norm": 0.0001932568702613935, + "grad_norm": 5.467371374834329e-05, "learning_rate": 5.145348837209303e-05, "loss": 0.0001, "step": 5110 }, { "epoch": 29.767441860465116, - "grad_norm": 0.0002690663968678564, + "grad_norm": 9.718058572616428e-05, "learning_rate": 5.1162790697674425e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5120 }, { "epoch": 29.825581395348838, - "grad_norm": 0.00014419872604776174, + "grad_norm": 6.610102718695998e-05, "learning_rate": 5.0872093023255816e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5130 }, { "epoch": 29.88372093023256, - "grad_norm": 0.00028917836607433856, + "grad_norm": 0.0001138786828960292, "learning_rate": 5.0581395348837214e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5140 }, { "epoch": 29.941860465116278, - "grad_norm": 0.00013393754488788545, + "grad_norm": 5.7462475524516776e-05, "learning_rate": 5.0290697674418605e-05, "loss": 0.0001, "step": 5150 }, { "epoch": 30.0, - "grad_norm": 0.00014104918227531016, + "grad_norm": 5.7312758144689724e-05, "learning_rate": 5e-05, "loss": 0.0001, "step": 5160 }, { "epoch": 30.058139534883722, - "grad_norm": 0.00020505864813458174, + "grad_norm": 8.433328184764832e-05, "learning_rate": 4.97093023255814e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5170 }, { "epoch": 30.11627906976744, - "grad_norm": 0.00019484035146888345, + "grad_norm": 7.180812099250033e-05, "learning_rate": 4.941860465116279e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5180 }, { "epoch": 30.174418604651162, - "grad_norm": 0.0001535069168312475, + "grad_norm": 6.091676914365962e-05, "learning_rate": 4.912790697674419e-05, "loss": 0.0001, "step": 5190 }, { "epoch": 30.232558139534884, - "grad_norm": 0.00022290536435320973, + "grad_norm": 8.669839735375717e-05, "learning_rate": 4.883720930232558e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5200 }, { "epoch": 30.232558139534884, - "eval_f1": 0.28325670498084293, - "eval_loss": 0.011721865274012089, - "eval_runtime": 3.4824, - "eval_samples_per_second": 41.638, - "eval_steps_per_second": 5.456, + "eval_f1": 0.28984126984126984, + "eval_loss": 0.013676689006388187, + "eval_runtime": 3.4049, + "eval_samples_per_second": 42.586, + "eval_steps_per_second": 5.58, "step": 5200 }, { "epoch": 30.290697674418606, - "grad_norm": 0.00027753174072131515, + "grad_norm": 0.0001129989541368559, "learning_rate": 4.854651162790698e-05, "loss": 0.0001, "step": 5210 }, { "epoch": 30.348837209302324, - "grad_norm": 0.0002098442637361586, + "grad_norm": 8.099635306280106e-05, "learning_rate": 4.8255813953488375e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5220 }, { "epoch": 30.406976744186046, - "grad_norm": 0.0001334796252194792, + "grad_norm": 5.446938666864298e-05, "learning_rate": 4.796511627906977e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5230 }, { "epoch": 30.46511627906977, - "grad_norm": 0.00013724550080951303, + "grad_norm": 5.5554151913383976e-05, "learning_rate": 4.7674418604651164e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5240 }, { "epoch": 30.52325581395349, - "grad_norm": 0.00012516268179751933, + "grad_norm": 5.075265289633535e-05, "learning_rate": 4.738372093023256e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5250 }, { "epoch": 30.58139534883721, - "grad_norm": 0.0002204925549449399, + "grad_norm": 8.383076055906713e-05, "learning_rate": 4.709302325581396e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5260 }, { "epoch": 30.63953488372093, - "grad_norm": 0.00021479894348885864, + "grad_norm": 8.669024100527167e-05, "learning_rate": 4.680232558139535e-05, "loss": 0.0001, "step": 5270 }, { "epoch": 30.697674418604652, - "grad_norm": 6.720636156387627e-05, + "grad_norm": 2.637769102875609e-05, "learning_rate": 4.651162790697675e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5280 }, { "epoch": 30.75581395348837, - "grad_norm": 0.0001380900212097913, + "grad_norm": 5.6105236581061035e-05, "learning_rate": 4.622093023255814e-05, "loss": 0.0001, "step": 5290 }, { "epoch": 30.813953488372093, - "grad_norm": 0.00017198365821968764, + "grad_norm": 6.223523087101057e-05, "learning_rate": 4.593023255813954e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5300 }, { "epoch": 30.813953488372093, - "eval_f1": 0.28095785440613025, - "eval_loss": 0.011782627552747726, - "eval_runtime": 3.4974, - "eval_samples_per_second": 41.46, - "eval_steps_per_second": 5.433, + "eval_f1": 0.28777230432402845, + "eval_loss": 0.013757361099123955, + "eval_runtime": 3.5582, + "eval_samples_per_second": 40.751, + "eval_steps_per_second": 5.34, "step": 5300 }, { "epoch": 30.872093023255815, - "grad_norm": 0.0001518342614872381, + "grad_norm": 6.383532308973372e-05, "learning_rate": 4.563953488372093e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5310 }, { "epoch": 30.930232558139537, - "grad_norm": 9.010894427774474e-05, + "grad_norm": 3.963845665566623e-05, "learning_rate": 4.5348837209302326e-05, - "loss": 0.0002, + "loss": 0.0001, "step": 5320 }, { "epoch": 30.988372093023255, - "grad_norm": 0.0003909343504346907, + "grad_norm": 0.0001494886673754081, "learning_rate": 4.505813953488372e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5330 }, { "epoch": 31.046511627906977, - "grad_norm": 0.00024297893105540425, + "grad_norm": 0.00010179109813179821, "learning_rate": 4.476744186046512e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5340 }, { "epoch": 31.1046511627907, - "grad_norm": 0.00013888234389014542, + "grad_norm": 5.474482532008551e-05, "learning_rate": 4.447674418604651e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5350 }, { "epoch": 31.162790697674417, - "grad_norm": 0.0002786846016533673, + "grad_norm": 0.0001141530810855329, "learning_rate": 4.418604651162791e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5360 }, { "epoch": 31.22093023255814, - "grad_norm": 0.00017326476518064737, + "grad_norm": 7.843540515750647e-05, "learning_rate": 4.389534883720931e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5370 }, { "epoch": 31.27906976744186, - "grad_norm": 0.0010548157151788473, + "grad_norm": 0.001031916937790811, "learning_rate": 4.36046511627907e-05, "loss": 0.0001, "step": 5380 }, { "epoch": 31.337209302325583, - "grad_norm": 0.00022908163373358548, + "grad_norm": 9.010145004140213e-05, "learning_rate": 4.3313953488372096e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5390 }, { "epoch": 31.3953488372093, - "grad_norm": 0.00017552966892253608, + "grad_norm": 7.715393439866602e-05, "learning_rate": 4.302325581395349e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5400 }, { "epoch": 31.3953488372093, - "eval_f1": 0.2765900383141762, - "eval_loss": 0.011876632459461689, - "eval_runtime": 3.733, - "eval_samples_per_second": 38.843, - "eval_steps_per_second": 5.09, + "eval_f1": 0.2829447181171319, + "eval_loss": 0.01381576806306839, + "eval_runtime": 3.5816, + "eval_samples_per_second": 40.484, + "eval_steps_per_second": 5.305, "step": 5400 }, { "epoch": 31.453488372093023, - "grad_norm": 0.0019674422219395638, + "grad_norm": 0.0016606730641797185, "learning_rate": 4.2732558139534885e-05, "loss": 0.0001, "step": 5410 }, { "epoch": 31.511627906976745, - "grad_norm": 0.00012373637582641095, + "grad_norm": 5.4263909987639636e-05, "learning_rate": 4.2441860465116276e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5420 }, { "epoch": 31.569767441860463, - "grad_norm": 9.777515515452251e-05, + "grad_norm": 5.0394799472996965e-05, "learning_rate": 4.215116279069768e-05, "loss": 0.0001, "step": 5430 }, { "epoch": 31.627906976744185, - "grad_norm": 9.450603101868182e-05, + "grad_norm": 3.8536370993824676e-05, "learning_rate": 4.186046511627907e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5440 }, { "epoch": 31.686046511627907, - "grad_norm": 0.00028439078596420586, + "grad_norm": 0.00011184679169673473, "learning_rate": 4.156976744186047e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5450 }, { "epoch": 31.74418604651163, - "grad_norm": 0.00019921896455343813, + "grad_norm": 8.460821118205786e-05, "learning_rate": 4.127906976744187e-05, "loss": 0.0001, "step": 5460 }, { "epoch": 31.802325581395348, - "grad_norm": 0.00025190593441948295, + "grad_norm": 0.00010111901065101847, "learning_rate": 4.098837209302326e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5470 }, { "epoch": 31.86046511627907, - "grad_norm": 9.046045306604356e-05, + "grad_norm": 3.6637979064835235e-05, "learning_rate": 4.0697674418604655e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5480 }, { "epoch": 31.91860465116279, - "grad_norm": 0.00012191152927698568, + "grad_norm": 4.688263652496971e-05, "learning_rate": 4.0406976744186046e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5490 }, { "epoch": 31.97674418604651, - "grad_norm": 0.00012126816000090912, + "grad_norm": 5.571435031015426e-05, "learning_rate": 4.0116279069767444e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5500 }, { "epoch": 31.97674418604651, - "eval_f1": 0.2876245210727969, - "eval_loss": 0.011887027882039547, - "eval_runtime": 3.4056, - "eval_samples_per_second": 42.576, - "eval_steps_per_second": 5.579, + "eval_f1": 0.29099069512862613, + "eval_loss": 0.01386214978992939, + "eval_runtime": 3.4456, + "eval_samples_per_second": 42.083, + "eval_steps_per_second": 5.514, "step": 5500 }, { "epoch": 32.03488372093023, - "grad_norm": 0.00011582663137232885, + "grad_norm": 4.559468288789503e-05, "learning_rate": 3.9825581395348835e-05, "loss": 0.0001, "step": 5510 }, { "epoch": 32.093023255813954, - "grad_norm": 0.00015023123705759645, + "grad_norm": 6.466393097070977e-05, "learning_rate": 3.953488372093023e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5520 }, { "epoch": 32.151162790697676, - "grad_norm": 0.00012203873484395444, + "grad_norm": 4.4632710341829807e-05, "learning_rate": 3.924418604651163e-05, "loss": 0.0001, "step": 5530 }, { "epoch": 32.2093023255814, - "grad_norm": 0.0002246979856863618, + "grad_norm": 0.0001007476239465177, "learning_rate": 3.895348837209303e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5540 }, { "epoch": 32.26744186046512, - "grad_norm": 5.295784285408445e-05, + "grad_norm": 2.3614309611730278e-05, "learning_rate": 3.866279069767442e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5550 }, { "epoch": 32.325581395348834, - "grad_norm": 8.684042404638603e-05, + "grad_norm": 3.585747617762536e-05, "learning_rate": 3.837209302325582e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5560 }, { "epoch": 32.383720930232556, - "grad_norm": 0.0017948716413229704, + "grad_norm": 0.0016852164408192039, "learning_rate": 3.8081395348837215e-05, "loss": 0.0001, "step": 5570 }, { "epoch": 32.44186046511628, - "grad_norm": 0.00010387677320977673, + "grad_norm": 4.243815419613384e-05, "learning_rate": 3.7790697674418606e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5580 }, { "epoch": 32.5, - "grad_norm": 0.00011506995360832661, + "grad_norm": 4.3279163946863264e-05, "learning_rate": 3.7500000000000003e-05, "loss": 0.0001, "step": 5590 }, { "epoch": 32.55813953488372, - "grad_norm": 8.274852734757587e-05, + "grad_norm": 3.228824061807245e-05, "learning_rate": 3.7209302325581394e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5600 }, { "epoch": 32.55813953488372, - "eval_f1": 0.27980842911877396, - "eval_loss": 0.012018047273159027, - "eval_runtime": 3.692, - "eval_samples_per_second": 39.274, - "eval_steps_per_second": 5.146, + "eval_f1": 0.29099069512862613, + "eval_loss": 0.013911720365285873, + "eval_runtime": 3.6285, + "eval_samples_per_second": 39.961, + "eval_steps_per_second": 5.236, "step": 5600 }, { "epoch": 32.616279069767444, - "grad_norm": 0.0001311128871748224, + "grad_norm": 4.971786984242499e-05, "learning_rate": 3.691860465116279e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5610 }, { "epoch": 32.674418604651166, - "grad_norm": 0.0001304544712183997, + "grad_norm": 5.36576917511411e-05, "learning_rate": 3.662790697674418e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5620 }, { "epoch": 32.73255813953488, - "grad_norm": 0.00018204055959358811, + "grad_norm": 7.435170846292749e-05, "learning_rate": 3.633720930232558e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5630 }, { "epoch": 32.7906976744186, - "grad_norm": 0.00017878651851788163, + "grad_norm": 8.162882295437157e-05, "learning_rate": 3.604651162790698e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5640 }, { "epoch": 32.848837209302324, - "grad_norm": 0.0002789355639833957, + "grad_norm": 0.00011551292845979333, "learning_rate": 3.5755813953488376e-05, "loss": 0.0001, "step": 5650 }, { "epoch": 32.906976744186046, - "grad_norm": 0.0001054438398568891, + "grad_norm": 3.900201045325957e-05, "learning_rate": 3.5465116279069774e-05, "loss": 0.0001, "step": 5660 }, { "epoch": 32.96511627906977, - "grad_norm": 0.0001857210008893162, + "grad_norm": 8.802942465990782e-05, "learning_rate": 3.5174418604651165e-05, "loss": 0.0001, "step": 5670 }, { "epoch": 33.02325581395349, - "grad_norm": 0.00014737743185833097, + "grad_norm": 6.427193147828802e-05, "learning_rate": 3.488372093023256e-05, "loss": 0.0001, "step": 5680 }, { "epoch": 33.08139534883721, - "grad_norm": 0.00010852559353224933, + "grad_norm": 4.4448115659179166e-05, "learning_rate": 3.4593023255813954e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5690 }, { "epoch": 33.13953488372093, - "grad_norm": 7.817021833034232e-05, + "grad_norm": 3.592113353079185e-05, "learning_rate": 3.430232558139535e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5700 }, { "epoch": 33.13953488372093, - "eval_f1": 0.2890366721401204, - "eval_loss": 0.012036466971039772, - "eval_runtime": 3.4702, - "eval_samples_per_second": 41.784, - "eval_steps_per_second": 5.475, + "eval_f1": 0.2863929939792009, + "eval_loss": 0.013954274356365204, + "eval_runtime": 3.4335, + "eval_samples_per_second": 42.231, + "eval_steps_per_second": 5.534, "step": 5700 }, { "epoch": 33.19767441860465, - "grad_norm": 0.000135956855956465, + "grad_norm": 5.941819108556956e-05, "learning_rate": 3.401162790697674e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5710 }, { "epoch": 33.25581395348837, - "grad_norm": 9.314154158346355e-05, + "grad_norm": 4.0591501601738855e-05, "learning_rate": 3.372093023255814e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5720 }, { "epoch": 33.31395348837209, - "grad_norm": 0.00014684416237287223, + "grad_norm": 7.046113023534417e-05, "learning_rate": 3.343023255813954e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5730 }, { "epoch": 33.372093023255815, - "grad_norm": 9.907364437822253e-05, + "grad_norm": 4.407744199852459e-05, "learning_rate": 3.313953488372093e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5740 }, { "epoch": 33.43023255813954, - "grad_norm": 0.00013944462989456952, + "grad_norm": 5.7378078054171056e-05, "learning_rate": 3.284883720930233e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5750 }, { "epoch": 33.48837209302326, - "grad_norm": 0.00012148257519584149, + "grad_norm": 5.3646341257262975e-05, "learning_rate": 3.2558139534883724e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5760 }, { "epoch": 33.54651162790697, - "grad_norm": 8.549185440642759e-05, + "grad_norm": 3.057984577026218e-05, "learning_rate": 3.226744186046512e-05, "loss": 0.0001, "step": 5770 }, { "epoch": 33.604651162790695, - "grad_norm": 0.00018653196457307786, + "grad_norm": 8.206104394048452e-05, "learning_rate": 3.197674418604651e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5780 }, { "epoch": 33.66279069767442, - "grad_norm": 0.001464176457375288, + "grad_norm": 0.001278466428630054, "learning_rate": 3.168604651162791e-05, "loss": 0.0001, "step": 5790 }, { "epoch": 33.72093023255814, - "grad_norm": 0.00013092403241898865, + "grad_norm": 5.615848203888163e-05, "learning_rate": 3.13953488372093e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5800 }, { "epoch": 33.72093023255814, - "eval_f1": 0.28302681992337164, - "eval_loss": 0.01213167142122984, - "eval_runtime": 3.6168, - "eval_samples_per_second": 40.091, - "eval_steps_per_second": 5.253, + "eval_f1": 0.29099069512862613, + "eval_loss": 0.014040759764611721, + "eval_runtime": 3.5965, + "eval_samples_per_second": 40.317, + "eval_steps_per_second": 5.283, "step": 5800 }, { "epoch": 33.77906976744186, - "grad_norm": 4.064566019224003e-05, + "grad_norm": 1.5725838238722645e-05, "learning_rate": 3.11046511627907e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5810 }, { "epoch": 33.83720930232558, - "grad_norm": 0.0002751752035692334, + "grad_norm": 0.00011022594117093831, "learning_rate": 3.081395348837209e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5820 }, { "epoch": 33.895348837209305, - "grad_norm": 0.0015582791529595852, + "grad_norm": 0.0013819674495607615, "learning_rate": 3.052325581395349e-05, "loss": 0.0001, "step": 5830 }, { "epoch": 33.95348837209303, - "grad_norm": 0.0010291668586432934, + "grad_norm": 0.0009882468730211258, "learning_rate": 3.0232558139534883e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5840 }, { "epoch": 34.01162790697674, - "grad_norm": 0.00012285509728826582, + "grad_norm": 6.084492270019837e-05, "learning_rate": 2.9941860465116277e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5850 }, { "epoch": 34.06976744186046, - "grad_norm": 0.00013763338210992515, + "grad_norm": 6.144608778413385e-05, "learning_rate": 2.9651162790697678e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5860 }, { "epoch": 34.127906976744185, - "grad_norm": 0.0002774275781121105, + "grad_norm": 0.00011088972678408027, "learning_rate": 2.9360465116279072e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5870 }, { "epoch": 34.18604651162791, - "grad_norm": 0.000168486891197972, + "grad_norm": 7.059588824631646e-05, "learning_rate": 2.9069767441860467e-05, "loss": 0.0001, "step": 5880 }, { "epoch": 34.24418604651163, - "grad_norm": 0.0001923375966725871, + "grad_norm": 7.196161459432915e-05, "learning_rate": 2.8779069767441864e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5890 }, { "epoch": 34.30232558139535, - "grad_norm": 0.00018539116717875004, + "grad_norm": 8.242028707172722e-05, "learning_rate": 2.848837209302326e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5900 }, { "epoch": 34.30232558139535, - "eval_f1": 0.2862452107279693, - "eval_loss": 0.012162204831838608, - "eval_runtime": 3.4002, - "eval_samples_per_second": 42.645, - "eval_steps_per_second": 5.588, + "eval_f1": 0.29099069512862613, + "eval_loss": 0.014029977843165398, + "eval_runtime": 3.5257, + "eval_samples_per_second": 41.127, + "eval_steps_per_second": 5.389, "step": 5900 }, { "epoch": 34.36046511627907, - "grad_norm": 0.00014681600441690534, + "grad_norm": 5.9451271226862445e-05, "learning_rate": 2.8197674418604653e-05, "loss": 0.0001, "step": 5910 }, { "epoch": 34.41860465116279, - "grad_norm": 0.00013214924547355622, + "grad_norm": 5.25450159329921e-05, "learning_rate": 2.7906976744186048e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5920 }, { "epoch": 34.47674418604651, - "grad_norm": 0.0015631905989721417, + "grad_norm": 0.0014084222493693233, "learning_rate": 2.7616279069767442e-05, "loss": 0.0001, "step": 5930 }, { "epoch": 34.53488372093023, - "grad_norm": 0.00011128299229312688, + "grad_norm": 4.850577170145698e-05, "learning_rate": 2.7325581395348836e-05, "loss": 0.0, "step": 5940 }, { "epoch": 34.593023255813954, - "grad_norm": 8.814867032924667e-05, + "grad_norm": 3.650556755019352e-05, "learning_rate": 2.703488372093023e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5950 }, { "epoch": 34.651162790697676, - "grad_norm": 0.0001612030464457348, + "grad_norm": 6.439626304199919e-05, "learning_rate": 2.674418604651163e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5960 }, { "epoch": 34.7093023255814, - "grad_norm": 0.00011930564505746588, + "grad_norm": 5.1403050747467205e-05, "learning_rate": 2.6453488372093026e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5970 }, { "epoch": 34.76744186046512, - "grad_norm": 0.00014217174611985683, + "grad_norm": 6.013525853632018e-05, "learning_rate": 2.616279069767442e-05, "loss": 0.0, "step": 5980 }, { "epoch": 34.825581395348834, - "grad_norm": 7.942403317429125e-05, + "grad_norm": 3.223196472390555e-05, "learning_rate": 2.5872093023255818e-05, - "loss": 0.0001, + "loss": 0.0, "step": 5990 }, { "epoch": 34.883720930232556, - "grad_norm": 0.0002957758551929146, + "grad_norm": 0.0001265082391910255, "learning_rate": 2.5581395348837212e-05, "loss": 0.0001, "step": 6000 }, { "epoch": 34.883720930232556, - "eval_f1": 0.2938314176245211, - "eval_loss": 0.012194461189210415, - "eval_runtime": 3.5706, - "eval_samples_per_second": 40.61, - "eval_steps_per_second": 5.321, + "eval_f1": 0.29099069512862613, + "eval_loss": 0.014094020240008831, + "eval_runtime": 3.6626, + "eval_samples_per_second": 39.59, + "eval_steps_per_second": 5.188, "step": 6000 }, { "epoch": 34.94186046511628, - "grad_norm": 0.00012033889652229846, + "grad_norm": 4.7167286538751796e-05, "learning_rate": 2.5290697674418607e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6010 }, { "epoch": 35.0, - "grad_norm": 8.605780749348924e-05, + "grad_norm": 4.0338782127946615e-05, "learning_rate": 2.5e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6020 }, { "epoch": 35.05813953488372, - "grad_norm": 3.7674937630072236e-05, + "grad_norm": 1.820520446926821e-05, "learning_rate": 2.4709302325581396e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6030 }, { "epoch": 35.116279069767444, - "grad_norm": 6.766508158762008e-05, + "grad_norm": 2.6706376957008615e-05, "learning_rate": 2.441860465116279e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6040 }, { "epoch": 35.174418604651166, - "grad_norm": 7.438640022883192e-05, + "grad_norm": 3.248331631766632e-05, "learning_rate": 2.4127906976744188e-05, "loss": 0.0001, "step": 6050 }, { "epoch": 35.23255813953488, - "grad_norm": 8.941497071646154e-05, + "grad_norm": 3.7379890272859484e-05, "learning_rate": 2.3837209302325582e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6060 }, { "epoch": 35.2906976744186, - "grad_norm": 0.000191706873010844, + "grad_norm": 7.409269892377779e-05, "learning_rate": 2.354651162790698e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6070 }, { "epoch": 35.348837209302324, - "grad_norm": 0.00023106786829885095, + "grad_norm": 9.567120287101716e-05, "learning_rate": 2.3255813953488374e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6080 }, { "epoch": 35.406976744186046, - "grad_norm": 7.814213313395157e-05, + "grad_norm": 3.7521804188145325e-05, "learning_rate": 2.296511627906977e-05, "loss": 0.0001, "step": 6090 }, { "epoch": 35.46511627906977, - "grad_norm": 0.00019387643260415643, + "grad_norm": 7.803832704667002e-05, "learning_rate": 2.2674418604651163e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6100 }, { "epoch": 35.46511627906977, - "eval_f1": 0.29314176245210727, - "eval_loss": 0.012266311794519424, - "eval_runtime": 3.4225, - "eval_samples_per_second": 42.367, - "eval_steps_per_second": 5.552, + "eval_f1": 0.29099069512862613, + "eval_loss": 0.014105994254350662, + "eval_runtime": 3.6532, + "eval_samples_per_second": 39.691, + "eval_steps_per_second": 5.201, "step": 6100 }, { "epoch": 35.52325581395349, - "grad_norm": 0.00013076080358587205, + "grad_norm": 5.455021164380014e-05, "learning_rate": 2.238372093023256e-05, "loss": 0.0001, "step": 6110 }, { "epoch": 35.58139534883721, - "grad_norm": 0.0001595883077243343, + "grad_norm": 6.907759961904958e-05, "learning_rate": 2.2093023255813955e-05, "loss": 0.0001, "step": 6120 }, { "epoch": 35.63953488372093, - "grad_norm": 0.00011887087021023035, + "grad_norm": 5.429649172583595e-05, "learning_rate": 2.180232558139535e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6130 }, { "epoch": 35.69767441860465, - "grad_norm": 0.00013006749213673174, + "grad_norm": 7.051627471810207e-05, "learning_rate": 2.1511627906976744e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6140 }, { "epoch": 35.75581395348837, - "grad_norm": 8.787312253843993e-05, + "grad_norm": 3.3518244890728965e-05, "learning_rate": 2.1220930232558138e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6150 }, { "epoch": 35.81395348837209, - "grad_norm": 0.0001312882814090699, + "grad_norm": 5.913059430895373e-05, "learning_rate": 2.0930232558139536e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6160 }, { "epoch": 35.872093023255815, - "grad_norm": 0.00014745873340871185, + "grad_norm": 6.510165258077905e-05, "learning_rate": 2.0639534883720933e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6170 }, { "epoch": 35.93023255813954, - "grad_norm": 0.0001226527092512697, + "grad_norm": 5.56660997972358e-05, "learning_rate": 2.0348837209302328e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6180 }, { "epoch": 35.98837209302326, - "grad_norm": 0.00016083895752672106, + "grad_norm": 6.964689237065613e-05, "learning_rate": 2.0058139534883722e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6190 }, { "epoch": 36.04651162790697, - "grad_norm": 0.00015023113519418985, + "grad_norm": 6.217962072696537e-05, "learning_rate": 1.9767441860465116e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6200 }, { "epoch": 36.04651162790697, - "eval_f1": 0.2945210727969348, - "eval_loss": 0.012289067730307579, - "eval_runtime": 3.5505, - "eval_samples_per_second": 40.839, - "eval_steps_per_second": 5.351, + "eval_f1": 0.29099069512862613, + "eval_loss": 0.014182238839566708, + "eval_runtime": 3.6867, + "eval_samples_per_second": 39.33, + "eval_steps_per_second": 5.154, "step": 6200 }, { "epoch": 36.104651162790695, - "grad_norm": 0.00011414434993639588, + "grad_norm": 5.075850276625715e-05, "learning_rate": 1.9476744186046514e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6210 }, { "epoch": 36.16279069767442, - "grad_norm": 9.501133899902925e-05, + "grad_norm": 4.40786279796157e-05, "learning_rate": 1.918604651162791e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6220 }, { "epoch": 36.22093023255814, - "grad_norm": 8.573233935749158e-05, + "grad_norm": 3.8874815800227225e-05, "learning_rate": 1.8895348837209303e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6230 }, { "epoch": 36.27906976744186, - "grad_norm": 0.00016559124924242496, + "grad_norm": 6.489389488706365e-05, "learning_rate": 1.8604651162790697e-05, "loss": 0.0001, "step": 6240 }, { "epoch": 36.33720930232558, - "grad_norm": 0.00012183932994958013, + "grad_norm": 6.208844570210204e-05, "learning_rate": 1.831395348837209e-05, "loss": 0.0001, "step": 6250 }, { "epoch": 36.395348837209305, - "grad_norm": 9.987889643525705e-05, + "grad_norm": 4.0742837882135063e-05, "learning_rate": 1.802325581395349e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6260 }, { "epoch": 36.45348837209303, - "grad_norm": 0.0009484774782322347, + "grad_norm": 0.0009149734978564084, "learning_rate": 1.7732558139534887e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6270 }, { "epoch": 36.51162790697674, - "grad_norm": 0.00021977306460030377, + "grad_norm": 0.00010139483492821455, "learning_rate": 1.744186046511628e-05, "loss": 0.0001, "step": 6280 }, { "epoch": 36.56976744186046, - "grad_norm": 5.9906513342866674e-05, + "grad_norm": 2.6992856874130666e-05, "learning_rate": 1.7151162790697676e-05, "loss": 0.0, "step": 6290 }, { "epoch": 36.627906976744185, - "grad_norm": 0.0002617529535200447, + "grad_norm": 0.0001187586021842435, "learning_rate": 1.686046511627907e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6300 }, { "epoch": 36.627906976744185, - "eval_f1": 0.29314176245210727, - "eval_loss": 0.012351792305707932, - "eval_runtime": 3.5163, - "eval_samples_per_second": 41.236, - "eval_steps_per_second": 5.403, + "eval_f1": 0.29099069512862613, + "eval_loss": 0.014201737940311432, + "eval_runtime": 3.4709, + "eval_samples_per_second": 41.776, + "eval_steps_per_second": 5.474, "step": 6300 }, { "epoch": 36.68604651162791, - "grad_norm": 0.0003131375997327268, + "grad_norm": 0.00012753716146107763, "learning_rate": 1.6569767441860464e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6310 }, { "epoch": 36.74418604651163, - "grad_norm": 0.0001495686883572489, + "grad_norm": 6.54375835438259e-05, "learning_rate": 1.6279069767441862e-05, "loss": 0.0001, "step": 6320 }, { "epoch": 36.80232558139535, - "grad_norm": 5.631699605146423e-05, + "grad_norm": 2.518395740480628e-05, "learning_rate": 1.5988372093023257e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6330 }, { "epoch": 36.86046511627907, - "grad_norm": 0.00014844325778540224, + "grad_norm": 6.046351700206287e-05, "learning_rate": 1.569767441860465e-05, "loss": 0.0, "step": 6340 }, { "epoch": 36.91860465116279, - "grad_norm": 0.00011371670552762225, + "grad_norm": 5.144343958818354e-05, "learning_rate": 1.5406976744186045e-05, "loss": 0.0, "step": 6350 }, { "epoch": 36.97674418604651, - "grad_norm": 0.0013855810975655913, + "grad_norm": 0.0013279829872772098, "learning_rate": 1.5116279069767441e-05, "loss": 0.0001, "step": 6360 }, { "epoch": 37.03488372093023, - "grad_norm": 0.00018020320567302406, + "grad_norm": 9.111184772336856e-05, "learning_rate": 1.4825581395348839e-05, "loss": 0.0001, "step": 6370 }, { "epoch": 37.093023255813954, - "grad_norm": 8.168007479980588e-05, + "grad_norm": 3.807275425060652e-05, "learning_rate": 1.4534883720930233e-05, "loss": 0.0, "step": 6380 }, { "epoch": 37.151162790697676, - "grad_norm": 0.00013036660675425082, + "grad_norm": 5.674179919878952e-05, "learning_rate": 1.424418604651163e-05, "loss": 0.0001, "step": 6390 }, { "epoch": 37.2093023255814, - "grad_norm": 3.612674117903225e-05, + "grad_norm": 1.8400254703010432e-05, "learning_rate": 1.3953488372093024e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6400 }, { "epoch": 37.2093023255814, - "eval_f1": 0.28946360153256706, - "eval_loss": 0.012344308197498322, - "eval_runtime": 3.4693, - "eval_samples_per_second": 41.795, - "eval_steps_per_second": 5.477, + "eval_f1": 0.29099069512862613, + "eval_loss": 0.01421139482408762, + "eval_runtime": 3.4773, + "eval_samples_per_second": 41.699, + "eval_steps_per_second": 5.464, "step": 6400 }, { "epoch": 37.26744186046512, - "grad_norm": 3.0226166927604936e-05, + "grad_norm": 1.3907278116676025e-05, "learning_rate": 1.3662790697674418e-05, "loss": 0.0, "step": 6410 }, { "epoch": 37.325581395348834, - "grad_norm": 0.0011081281118094921, + "grad_norm": 0.0010881676571443677, "learning_rate": 1.3372093023255814e-05, "loss": 0.0001, "step": 6420 }, { "epoch": 37.383720930232556, - "grad_norm": 0.00015511033416260034, + "grad_norm": 6.577212479896843e-05, "learning_rate": 1.308139534883721e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6430 }, { "epoch": 37.44186046511628, - "grad_norm": 0.0009556662989780307, + "grad_norm": 0.0009500395390205085, "learning_rate": 1.2790697674418606e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6440 }, { "epoch": 37.5, - "grad_norm": 6.703213148284703e-05, + "grad_norm": 3.0178140150383115e-05, "learning_rate": 1.25e-05, "loss": 0.0, "step": 6450 }, { "epoch": 37.55813953488372, - "grad_norm": 0.00015013941447250545, + "grad_norm": 6.296801439020783e-05, "learning_rate": 1.2209302325581395e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6460 }, { "epoch": 37.616279069767444, - "grad_norm": 8.934471406973898e-05, + "grad_norm": 3.9534879761049524e-05, "learning_rate": 1.1918604651162791e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6470 }, { "epoch": 37.674418604651166, - "grad_norm": 0.0010155312484130263, + "grad_norm": 0.0009818760445341468, "learning_rate": 1.1627906976744187e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6480 }, { "epoch": 37.73255813953488, - "grad_norm": 0.00017622910672798753, + "grad_norm": 7.201674452517182e-05, "learning_rate": 1.1337209302325581e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6490 }, { "epoch": 37.7906976744186, - "grad_norm": 0.00010929736163234338, + "grad_norm": 4.7669211198808625e-05, "learning_rate": 1.1046511627906977e-05, - "loss": 0.0001, + "loss": 0.0, "step": 6500 }, { "epoch": 37.7906976744186, - "eval_f1": 0.29314176245210727, - "eval_loss": 0.01237593311816454, - "eval_runtime": 3.4823, - "eval_samples_per_second": 41.639, - "eval_steps_per_second": 5.456, + "eval_f1": 0.29099069512862613, + "eval_loss": 0.01422305591404438, + "eval_runtime": 3.8279, + "eval_samples_per_second": 37.88, + "eval_steps_per_second": 4.964, "step": 6500 }, { "epoch": 37.848837209302324, - "grad_norm": 8.053899364313111e-05, + "grad_norm": 3.94410926674027e-05, "learning_rate": 1.0755813953488372e-05, "loss": 0.0001, "step": 6510 }, { "epoch": 37.906976744186046, - "grad_norm": 5.195455378270708e-05, + "grad_norm": 2.4280170691781677e-05, "learning_rate": 1.0465116279069768e-05, "loss": 0.0, "step": 6520 }, { "epoch": 37.96511627906977, - "grad_norm": 0.001343599520623684, + "grad_norm": 0.001271724351681769, "learning_rate": 1.0174418604651164e-05, "loss": 0.0001, "step": 6530 }, { "epoch": 38.02325581395349, - "grad_norm": 0.00010656165250111371, + "grad_norm": 4.451238419278525e-05, "learning_rate": 9.883720930232558e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6540 }, { "epoch": 38.08139534883721, - "grad_norm": 0.00016731688810978085, + "grad_norm": 7.402494520647451e-05, "learning_rate": 9.593023255813954e-06, "loss": 0.0001, "step": 6550 }, { "epoch": 38.13953488372093, - "grad_norm": 0.00010719741840148345, + "grad_norm": 4.294473183108494e-05, "learning_rate": 9.302325581395349e-06, "loss": 0.0, "step": 6560 }, { "epoch": 38.19767441860465, - "grad_norm": 8.668331429362297e-05, + "grad_norm": 3.757912418222986e-05, "learning_rate": 9.011627906976745e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6570 }, { "epoch": 38.25581395348837, - "grad_norm": 0.00015398432151414454, + "grad_norm": 6.937901343917474e-05, "learning_rate": 8.72093023255814e-06, "loss": 0.0001, "step": 6580 }, { "epoch": 38.31395348837209, - "grad_norm": 0.00020456039055716246, + "grad_norm": 9.59595272433944e-05, "learning_rate": 8.430232558139535e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6590 }, { "epoch": 38.372093023255815, - "grad_norm": 6.38675264781341e-05, + "grad_norm": 2.7079699066234753e-05, "learning_rate": 8.139534883720931e-06, "loss": 0.0001, "step": 6600 }, { "epoch": 38.372093023255815, - "eval_f1": 0.2954406130268199, - "eval_loss": 0.012406516820192337, - "eval_runtime": 3.7428, - "eval_samples_per_second": 38.741, - "eval_steps_per_second": 5.076, + "eval_f1": 0.29099069512862613, + "eval_loss": 0.014242968522012234, + "eval_runtime": 3.4547, + "eval_samples_per_second": 41.972, + "eval_steps_per_second": 5.5, "step": 6600 }, { "epoch": 38.43023255813954, - "grad_norm": 8.79619037732482e-05, + "grad_norm": 3.895884583471343e-05, "learning_rate": 7.848837209302325e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6610 }, { "epoch": 38.48837209302326, - "grad_norm": 0.00012503766629379243, + "grad_norm": 5.40660330443643e-05, "learning_rate": 7.558139534883721e-06, "loss": 0.0, "step": 6620 }, { "epoch": 38.54651162790697, - "grad_norm": 0.0011176352854818106, + "grad_norm": 0.0011285449145361781, "learning_rate": 7.267441860465117e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6630 }, { "epoch": 38.604651162790695, - "grad_norm": 0.00017961690900847316, + "grad_norm": 8.187559433281422e-05, "learning_rate": 6.976744186046512e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6640 }, { "epoch": 38.66279069767442, - "grad_norm": 0.000830134900752455, + "grad_norm": 0.0008101025014184415, "learning_rate": 6.686046511627907e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6650 }, { "epoch": 38.72093023255814, - "grad_norm": 0.00017510255565866828, + "grad_norm": 7.593370537506416e-05, "learning_rate": 6.395348837209303e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6660 }, { "epoch": 38.77906976744186, - "grad_norm": 8.669484668644145e-05, + "grad_norm": 3.832248694379814e-05, "learning_rate": 6.1046511627906975e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6670 }, { "epoch": 38.83720930232558, - "grad_norm": 6.750423926860094e-05, + "grad_norm": 3.35754593834281e-05, "learning_rate": 5.8139534883720935e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6680 }, { "epoch": 38.895348837209305, - "grad_norm": 0.00015165552031248808, + "grad_norm": 6.160214979900047e-05, "learning_rate": 5.523255813953489e-06, "loss": 0.0, "step": 6690 }, { "epoch": 38.95348837209303, - "grad_norm": 0.00013152658357284963, + "grad_norm": 5.2824787417193875e-05, "learning_rate": 5.232558139534884e-06, "loss": 0.0, "step": 6700 }, { "epoch": 38.95348837209303, - "eval_f1": 0.29314176245210727, - "eval_loss": 0.01242696214467287, - "eval_runtime": 3.4693, - "eval_samples_per_second": 41.795, - "eval_steps_per_second": 5.477, + "eval_f1": 0.29099069512862613, + "eval_loss": 0.014258792623877525, + "eval_runtime": 3.69, + "eval_samples_per_second": 39.295, + "eval_steps_per_second": 5.149, "step": 6700 }, { "epoch": 39.01162790697674, - "grad_norm": 0.00021946868218947202, + "grad_norm": 9.248552669305354e-05, "learning_rate": 4.941860465116279e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6710 }, { "epoch": 39.06976744186046, - "grad_norm": 8.51030126796104e-05, + "grad_norm": 3.754795034183189e-05, "learning_rate": 4.651162790697674e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6720 }, { "epoch": 39.127906976744185, - "grad_norm": 8.695604628883302e-05, + "grad_norm": 3.82878388336394e-05, "learning_rate": 4.36046511627907e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6730 }, { "epoch": 39.18604651162791, - "grad_norm": 8.164348400896415e-05, + "grad_norm": 3.539842873578891e-05, "learning_rate": 4.0697674418604655e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6740 }, { "epoch": 39.24418604651163, - "grad_norm": 0.00012590661935973912, + "grad_norm": 5.360898649087176e-05, "learning_rate": 3.7790697674418603e-06, "loss": 0.0001, "step": 6750 }, { "epoch": 39.30232558139535, - "grad_norm": 9.477663115831092e-05, + "grad_norm": 4.1736613638931885e-05, "learning_rate": 3.488372093023256e-06, "loss": 0.0001, "step": 6760 }, { "epoch": 39.36046511627907, - "grad_norm": 0.00015061408339533955, + "grad_norm": 7.096412446117029e-05, "learning_rate": 3.1976744186046516e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6770 }, { "epoch": 39.41860465116279, - "grad_norm": 7.140241359593347e-05, + "grad_norm": 3.134623693767935e-05, "learning_rate": 2.9069767441860468e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6780 }, { "epoch": 39.47674418604651, - "grad_norm": 0.00012239372881595045, + "grad_norm": 5.227196015766822e-05, "learning_rate": 2.616279069767442e-06, "loss": 0.0, "step": 6790 }, { "epoch": 39.53488372093023, - "grad_norm": 8.01909263827838e-05, + "grad_norm": 3.6389799788594246e-05, "learning_rate": 2.325581395348837e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6800 }, { "epoch": 39.53488372093023, - "eval_f1": 0.2954406130268199, - "eval_loss": 0.012424755841493607, - "eval_runtime": 3.6525, - "eval_samples_per_second": 39.699, - "eval_steps_per_second": 5.202, + "eval_f1": 0.29099069512862613, + "eval_loss": 0.014251462183892727, + "eval_runtime": 3.5064, + "eval_samples_per_second": 41.353, + "eval_steps_per_second": 5.419, "step": 6800 }, { "epoch": 39.593023255813954, - "grad_norm": 0.0017040801467373967, + "grad_norm": 0.0016915270825847983, "learning_rate": 2.0348837209302328e-06, "loss": 0.0001, "step": 6810 }, { "epoch": 39.651162790697676, - "grad_norm": 8.113701915135607e-05, + "grad_norm": 3.793878204305656e-05, "learning_rate": 1.744186046511628e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6820 }, { "epoch": 39.7093023255814, - "grad_norm": 0.0001180045583168976, + "grad_norm": 4.6207122068153694e-05, "learning_rate": 1.4534883720930234e-06, "loss": 0.0, "step": 6830 }, { "epoch": 39.76744186046512, - "grad_norm": 0.0002191288658650592, + "grad_norm": 8.933740900829434e-05, "learning_rate": 1.1627906976744186e-06, - "loss": 0.0001, + "loss": 0.0, "step": 6840 }, { "epoch": 39.825581395348834, - "grad_norm": 9.5214563771151e-05, + "grad_norm": 4.582426117849536e-05, "learning_rate": 8.72093023255814e-07, - "loss": 0.0001, + "loss": 0.0, "step": 6850 }, { "epoch": 39.883720930232556, - "grad_norm": 6.935905548743904e-05, + "grad_norm": 3.368450779817067e-05, "learning_rate": 5.813953488372093e-07, - "loss": 0.0001, + "loss": 0.0, "step": 6860 }, { "epoch": 39.94186046511628, - "grad_norm": 7.418568566208705e-05, + "grad_norm": 3.472556272754446e-05, "learning_rate": 2.9069767441860464e-07, - "loss": 0.0001, + "loss": 0.0, "step": 6870 }, { "epoch": 40.0, - "grad_norm": 8.563321898691356e-05, + "grad_norm": 3.56947275577113e-05, "learning_rate": 0.0, - "loss": 0.0001, + "loss": 0.0, "step": 6880 }, { "epoch": 40.0, "step": 6880, "total_flos": 8.58517804989186e+18, - "train_loss": 0.0018945207089970165, - "train_runtime": 3857.5132, - "train_samples_per_second": 28.495, - "train_steps_per_second": 1.784 + "train_loss": 0.0008123670003886556, + "train_runtime": 3936.5073, + "train_samples_per_second": 27.923, + "train_steps_per_second": 1.748 } ], "logging_steps": 10,