|
{ |
|
"best_metric": 0.007714101579040289, |
|
"best_model_checkpoint": "./all-observation-type/checkpoint-2500", |
|
"epoch": 40.0, |
|
"eval_steps": 100, |
|
"global_step": 3440, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11627906976744186, |
|
"grad_norm": 0.09591829776763916, |
|
"learning_rate": 0.0001994186046511628, |
|
"loss": 0.6339, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.23255813953488372, |
|
"grad_norm": 0.09048111736774445, |
|
"learning_rate": 0.0001988372093023256, |
|
"loss": 0.5034, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3488372093023256, |
|
"grad_norm": 0.08638294041156769, |
|
"learning_rate": 0.00019825581395348837, |
|
"loss": 0.3894, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.46511627906976744, |
|
"grad_norm": 0.07995904237031937, |
|
"learning_rate": 0.00019767441860465116, |
|
"loss": 0.2951, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.5813953488372093, |
|
"grad_norm": 0.07135872542858124, |
|
"learning_rate": 0.00019709302325581396, |
|
"loss": 0.223, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.6976744186046512, |
|
"grad_norm": 0.0624060332775116, |
|
"learning_rate": 0.00019651162790697676, |
|
"loss": 0.1703, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.813953488372093, |
|
"grad_norm": 0.05441118776798248, |
|
"learning_rate": 0.00019593023255813952, |
|
"loss": 0.133, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.9302325581395349, |
|
"grad_norm": 0.04721858724951744, |
|
"learning_rate": 0.00019534883720930232, |
|
"loss": 0.1057, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.0465116279069768, |
|
"grad_norm": 0.041243478655815125, |
|
"learning_rate": 0.00019476744186046511, |
|
"loss": 0.0866, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.1627906976744187, |
|
"grad_norm": 0.036053091287612915, |
|
"learning_rate": 0.0001941860465116279, |
|
"loss": 0.0726, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.1627906976744187, |
|
"eval_f1": 0.0, |
|
"eval_loss": 0.06596538424491882, |
|
"eval_runtime": 1.8015, |
|
"eval_samples_per_second": 40.522, |
|
"eval_steps_per_second": 5.551, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.2790697674418605, |
|
"grad_norm": 0.03166024759411812, |
|
"learning_rate": 0.0001936046511627907, |
|
"loss": 0.0617, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.3953488372093024, |
|
"grad_norm": 0.0280374176800251, |
|
"learning_rate": 0.0001930232558139535, |
|
"loss": 0.0537, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.5116279069767442, |
|
"grad_norm": 0.025035331025719643, |
|
"learning_rate": 0.0001924418604651163, |
|
"loss": 0.0469, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.627906976744186, |
|
"grad_norm": 0.02311284840106964, |
|
"learning_rate": 0.0001918604651162791, |
|
"loss": 0.042, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.744186046511628, |
|
"grad_norm": 0.020456839352846146, |
|
"learning_rate": 0.0001912790697674419, |
|
"loss": 0.0383, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.8604651162790697, |
|
"grad_norm": 0.01917686127126217, |
|
"learning_rate": 0.00019069767441860466, |
|
"loss": 0.034, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.9767441860465116, |
|
"grad_norm": 0.017376938834786415, |
|
"learning_rate": 0.00019011627906976745, |
|
"loss": 0.0314, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.0930232558139537, |
|
"grad_norm": 0.01589462347328663, |
|
"learning_rate": 0.00018953488372093025, |
|
"loss": 0.0292, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.2093023255813953, |
|
"grad_norm": 0.014865094795823097, |
|
"learning_rate": 0.00018895348837209304, |
|
"loss": 0.0274, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.3255813953488373, |
|
"grad_norm": 0.013694318942725658, |
|
"learning_rate": 0.00018837209302325584, |
|
"loss": 0.0264, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.3255813953488373, |
|
"eval_f1": 0.0, |
|
"eval_loss": 0.02468988113105297, |
|
"eval_runtime": 1.6978, |
|
"eval_samples_per_second": 42.996, |
|
"eval_steps_per_second": 5.89, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.441860465116279, |
|
"grad_norm": 0.012708564288914204, |
|
"learning_rate": 0.0001877906976744186, |
|
"loss": 0.0244, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.558139534883721, |
|
"grad_norm": 0.011777268722653389, |
|
"learning_rate": 0.0001872093023255814, |
|
"loss": 0.023, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.6744186046511627, |
|
"grad_norm": 0.011043795384466648, |
|
"learning_rate": 0.0001866279069767442, |
|
"loss": 0.0212, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.7906976744186047, |
|
"grad_norm": 0.010540899820625782, |
|
"learning_rate": 0.000186046511627907, |
|
"loss": 0.0206, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.9069767441860463, |
|
"grad_norm": 0.0098367715254426, |
|
"learning_rate": 0.00018546511627906976, |
|
"loss": 0.0192, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.0232558139534884, |
|
"grad_norm": 0.009457371197640896, |
|
"learning_rate": 0.00018488372093023256, |
|
"loss": 0.0186, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.13953488372093, |
|
"grad_norm": 0.0089762257412076, |
|
"learning_rate": 0.00018430232558139535, |
|
"loss": 0.0182, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.255813953488372, |
|
"grad_norm": 0.00830694381147623, |
|
"learning_rate": 0.00018372093023255815, |
|
"loss": 0.0177, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.3720930232558137, |
|
"grad_norm": 0.008157115429639816, |
|
"learning_rate": 0.00018313953488372094, |
|
"loss": 0.0171, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 3.488372093023256, |
|
"grad_norm": 0.007526726461946964, |
|
"learning_rate": 0.0001825581395348837, |
|
"loss": 0.0161, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.488372093023256, |
|
"eval_f1": 0.0, |
|
"eval_loss": 0.016456665471196175, |
|
"eval_runtime": 1.81, |
|
"eval_samples_per_second": 40.33, |
|
"eval_steps_per_second": 5.525, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.604651162790698, |
|
"grad_norm": 0.006970840971916914, |
|
"learning_rate": 0.0001819767441860465, |
|
"loss": 0.0171, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 3.7209302325581395, |
|
"grad_norm": 0.006966202985495329, |
|
"learning_rate": 0.0001813953488372093, |
|
"loss": 0.0157, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.8372093023255816, |
|
"grad_norm": 0.006569746416062117, |
|
"learning_rate": 0.00018081395348837212, |
|
"loss": 0.0155, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 3.953488372093023, |
|
"grad_norm": 0.006560744717717171, |
|
"learning_rate": 0.0001802325581395349, |
|
"loss": 0.0154, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 4.069767441860465, |
|
"grad_norm": 0.006836111657321453, |
|
"learning_rate": 0.0001796511627906977, |
|
"loss": 0.0144, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 4.186046511627907, |
|
"grad_norm": 0.005944866221398115, |
|
"learning_rate": 0.00017906976744186048, |
|
"loss": 0.0144, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 4.3023255813953485, |
|
"grad_norm": 0.00594041682779789, |
|
"learning_rate": 0.00017848837209302328, |
|
"loss": 0.0142, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 4.4186046511627906, |
|
"grad_norm": 0.005508134141564369, |
|
"learning_rate": 0.00017790697674418605, |
|
"loss": 0.0136, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 4.534883720930233, |
|
"grad_norm": 0.005419102031737566, |
|
"learning_rate": 0.00017732558139534884, |
|
"loss": 0.0135, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 4.651162790697675, |
|
"grad_norm": 0.005490223411470652, |
|
"learning_rate": 0.00017674418604651164, |
|
"loss": 0.0133, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 4.651162790697675, |
|
"eval_f1": 0.0, |
|
"eval_loss": 0.013475539162755013, |
|
"eval_runtime": 1.7409, |
|
"eval_samples_per_second": 41.933, |
|
"eval_steps_per_second": 5.744, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 4.767441860465116, |
|
"grad_norm": 0.005142460577189922, |
|
"learning_rate": 0.00017616279069767443, |
|
"loss": 0.0132, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 4.883720930232558, |
|
"grad_norm": 0.00514647364616394, |
|
"learning_rate": 0.00017558139534883723, |
|
"loss": 0.0137, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.005101104732602835, |
|
"learning_rate": 0.000175, |
|
"loss": 0.0135, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 5.116279069767442, |
|
"grad_norm": 0.005293596535921097, |
|
"learning_rate": 0.0001744186046511628, |
|
"loss": 0.0122, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 5.232558139534884, |
|
"grad_norm": 0.004600506741553545, |
|
"learning_rate": 0.0001738372093023256, |
|
"loss": 0.0123, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 5.348837209302325, |
|
"grad_norm": 0.005292165093123913, |
|
"learning_rate": 0.00017325581395348838, |
|
"loss": 0.0125, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 5.465116279069767, |
|
"grad_norm": 0.004837548825889826, |
|
"learning_rate": 0.00017267441860465118, |
|
"loss": 0.012, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 5.5813953488372094, |
|
"grad_norm": 0.004367106128484011, |
|
"learning_rate": 0.00017209302325581395, |
|
"loss": 0.0127, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 5.6976744186046515, |
|
"grad_norm": 0.004551732446998358, |
|
"learning_rate": 0.00017151162790697674, |
|
"loss": 0.0123, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 5.813953488372093, |
|
"grad_norm": 0.004141143057495356, |
|
"learning_rate": 0.00017093023255813954, |
|
"loss": 0.0124, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 5.813953488372093, |
|
"eval_f1": 0.0, |
|
"eval_loss": 0.012049774639308453, |
|
"eval_runtime": 1.8113, |
|
"eval_samples_per_second": 40.303, |
|
"eval_steps_per_second": 5.521, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 5.930232558139535, |
|
"grad_norm": 0.0041281660087406635, |
|
"learning_rate": 0.00017034883720930233, |
|
"loss": 0.0123, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 6.046511627906977, |
|
"grad_norm": 0.004327772185206413, |
|
"learning_rate": 0.0001697674418604651, |
|
"loss": 0.0114, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 6.162790697674419, |
|
"grad_norm": 0.00425134040415287, |
|
"learning_rate": 0.0001691860465116279, |
|
"loss": 0.0108, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 6.27906976744186, |
|
"grad_norm": 0.004197305999696255, |
|
"learning_rate": 0.00016860465116279072, |
|
"loss": 0.0119, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 6.395348837209302, |
|
"grad_norm": 0.004064254928380251, |
|
"learning_rate": 0.00016802325581395352, |
|
"loss": 0.0117, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 6.511627906976744, |
|
"grad_norm": 0.004467037972062826, |
|
"learning_rate": 0.00016744186046511629, |
|
"loss": 0.012, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 6.627906976744186, |
|
"grad_norm": 0.004210630431771278, |
|
"learning_rate": 0.00016686046511627908, |
|
"loss": 0.0116, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 6.7441860465116275, |
|
"grad_norm": 0.003937269560992718, |
|
"learning_rate": 0.00016627906976744188, |
|
"loss": 0.0116, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 6.8604651162790695, |
|
"grad_norm": 0.003829133929684758, |
|
"learning_rate": 0.00016569767441860467, |
|
"loss": 0.0114, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 6.976744186046512, |
|
"grad_norm": 0.0037258623633533716, |
|
"learning_rate": 0.00016511627906976747, |
|
"loss": 0.011, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 6.976744186046512, |
|
"eval_f1": 0.0, |
|
"eval_loss": 0.011244768276810646, |
|
"eval_runtime": 1.6726, |
|
"eval_samples_per_second": 43.646, |
|
"eval_steps_per_second": 5.979, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 7.093023255813954, |
|
"grad_norm": 0.0038892878219485283, |
|
"learning_rate": 0.00016453488372093024, |
|
"loss": 0.0108, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 7.209302325581396, |
|
"grad_norm": 0.003730091731995344, |
|
"learning_rate": 0.00016395348837209303, |
|
"loss": 0.0108, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 7.325581395348837, |
|
"grad_norm": 0.0035813930444419384, |
|
"learning_rate": 0.00016337209302325583, |
|
"loss": 0.0113, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 7.441860465116279, |
|
"grad_norm": 0.004092409275472164, |
|
"learning_rate": 0.00016279069767441862, |
|
"loss": 0.0105, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 7.558139534883721, |
|
"grad_norm": 0.004152482841163874, |
|
"learning_rate": 0.0001622093023255814, |
|
"loss": 0.0107, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 7.674418604651163, |
|
"grad_norm": 0.0035718800500035286, |
|
"learning_rate": 0.00016162790697674419, |
|
"loss": 0.0106, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 7.790697674418604, |
|
"grad_norm": 0.004181632772088051, |
|
"learning_rate": 0.00016104651162790698, |
|
"loss": 0.0112, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 7.906976744186046, |
|
"grad_norm": 0.003530286019667983, |
|
"learning_rate": 0.00016046511627906978, |
|
"loss": 0.0112, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 8.023255813953488, |
|
"grad_norm": 0.003953536041080952, |
|
"learning_rate": 0.00015988372093023257, |
|
"loss": 0.0107, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 8.13953488372093, |
|
"grad_norm": 0.00364445592276752, |
|
"learning_rate": 0.00015930232558139534, |
|
"loss": 0.0114, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 8.13953488372093, |
|
"eval_f1": 0.0, |
|
"eval_loss": 0.010723751038312912, |
|
"eval_runtime": 1.705, |
|
"eval_samples_per_second": 42.815, |
|
"eval_steps_per_second": 5.865, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 8.255813953488373, |
|
"grad_norm": 0.004072926007211208, |
|
"learning_rate": 0.00015872093023255814, |
|
"loss": 0.0105, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 8.372093023255815, |
|
"grad_norm": 0.003972381353378296, |
|
"learning_rate": 0.00015813953488372093, |
|
"loss": 0.0103, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 8.488372093023255, |
|
"grad_norm": 0.003728296374902129, |
|
"learning_rate": 0.00015755813953488373, |
|
"loss": 0.0107, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 8.604651162790697, |
|
"grad_norm": 0.0036759586073458195, |
|
"learning_rate": 0.00015697674418604652, |
|
"loss": 0.01, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 8.720930232558139, |
|
"grad_norm": 0.0035114786587655544, |
|
"learning_rate": 0.0001563953488372093, |
|
"loss": 0.0105, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 8.837209302325581, |
|
"grad_norm": 0.0033980573061853647, |
|
"learning_rate": 0.0001558139534883721, |
|
"loss": 0.0102, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 8.953488372093023, |
|
"grad_norm": 0.0035892720334231853, |
|
"learning_rate": 0.0001552325581395349, |
|
"loss": 0.0099, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 9.069767441860465, |
|
"grad_norm": 0.0031818547286093235, |
|
"learning_rate": 0.00015465116279069768, |
|
"loss": 0.0098, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 9.186046511627907, |
|
"grad_norm": 0.003511944320052862, |
|
"learning_rate": 0.00015406976744186047, |
|
"loss": 0.0101, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 9.30232558139535, |
|
"grad_norm": 0.0032857146579772234, |
|
"learning_rate": 0.00015348837209302327, |
|
"loss": 0.0109, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 9.30232558139535, |
|
"eval_f1": 0.0, |
|
"eval_loss": 0.010318118147552013, |
|
"eval_runtime": 1.6682, |
|
"eval_samples_per_second": 43.76, |
|
"eval_steps_per_second": 5.995, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 9.418604651162791, |
|
"grad_norm": 0.004180931951850653, |
|
"learning_rate": 0.00015290697674418606, |
|
"loss": 0.0104, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 9.534883720930232, |
|
"grad_norm": 0.0035553001798689365, |
|
"learning_rate": 0.00015232558139534886, |
|
"loss": 0.0107, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 9.651162790697674, |
|
"grad_norm": 0.004481980111449957, |
|
"learning_rate": 0.00015174418604651163, |
|
"loss": 0.0098, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 9.767441860465116, |
|
"grad_norm": 0.004224750213325024, |
|
"learning_rate": 0.00015116279069767442, |
|
"loss": 0.0102, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 9.883720930232558, |
|
"grad_norm": 0.0038431661669164896, |
|
"learning_rate": 0.00015058139534883722, |
|
"loss": 0.0095, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.004212440922856331, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.0099, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 10.116279069767442, |
|
"grad_norm": 0.003633901011198759, |
|
"learning_rate": 0.0001494186046511628, |
|
"loss": 0.0095, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 10.232558139534884, |
|
"grad_norm": 0.003657746594399214, |
|
"learning_rate": 0.00014883720930232558, |
|
"loss": 0.0096, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 10.348837209302326, |
|
"grad_norm": 0.003825038904324174, |
|
"learning_rate": 0.00014825581395348837, |
|
"loss": 0.0102, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 10.465116279069768, |
|
"grad_norm": 0.0041350796818733215, |
|
"learning_rate": 0.00014767441860465117, |
|
"loss": 0.0096, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 10.465116279069768, |
|
"eval_f1": 0.0, |
|
"eval_loss": 0.010230067186057568, |
|
"eval_runtime": 1.6982, |
|
"eval_samples_per_second": 42.987, |
|
"eval_steps_per_second": 5.889, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 10.581395348837209, |
|
"grad_norm": 0.003846728475764394, |
|
"learning_rate": 0.00014709302325581396, |
|
"loss": 0.0106, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 10.69767441860465, |
|
"grad_norm": 0.0042284755036234856, |
|
"learning_rate": 0.00014651162790697673, |
|
"loss": 0.0102, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 10.813953488372093, |
|
"grad_norm": 0.003174531040713191, |
|
"learning_rate": 0.00014593023255813953, |
|
"loss": 0.009, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 10.930232558139535, |
|
"grad_norm": 0.0035903833340853453, |
|
"learning_rate": 0.00014534883720930232, |
|
"loss": 0.0101, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 11.046511627906977, |
|
"grad_norm": 0.0043946849182248116, |
|
"learning_rate": 0.00014476744186046512, |
|
"loss": 0.0093, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 11.162790697674419, |
|
"grad_norm": 0.0035909838043153286, |
|
"learning_rate": 0.00014418604651162791, |
|
"loss": 0.0093, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 11.279069767441861, |
|
"grad_norm": 0.0036394952330738306, |
|
"learning_rate": 0.0001436046511627907, |
|
"loss": 0.0092, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 11.395348837209303, |
|
"grad_norm": 0.0038749193772673607, |
|
"learning_rate": 0.0001430232558139535, |
|
"loss": 0.009, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 11.511627906976745, |
|
"grad_norm": 0.0044557624496519566, |
|
"learning_rate": 0.0001424418604651163, |
|
"loss": 0.0097, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 11.627906976744185, |
|
"grad_norm": 0.0037609776481986046, |
|
"learning_rate": 0.0001418604651162791, |
|
"loss": 0.0099, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 11.627906976744185, |
|
"eval_f1": 0.0, |
|
"eval_loss": 0.00977805070579052, |
|
"eval_runtime": 1.7092, |
|
"eval_samples_per_second": 42.71, |
|
"eval_steps_per_second": 5.851, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 11.744186046511627, |
|
"grad_norm": 0.003938957117497921, |
|
"learning_rate": 0.00014127906976744186, |
|
"loss": 0.0093, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 11.86046511627907, |
|
"grad_norm": 0.006434954237192869, |
|
"learning_rate": 0.00014069767441860466, |
|
"loss": 0.0095, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 11.976744186046512, |
|
"grad_norm": 0.007113784551620483, |
|
"learning_rate": 0.00014011627906976746, |
|
"loss": 0.0105, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 12.093023255813954, |
|
"grad_norm": 0.0040367101319134235, |
|
"learning_rate": 0.00013953488372093025, |
|
"loss": 0.0093, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 12.209302325581396, |
|
"grad_norm": 0.0035031838342547417, |
|
"learning_rate": 0.00013895348837209302, |
|
"loss": 0.0096, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 12.325581395348838, |
|
"grad_norm": 0.0033588423393666744, |
|
"learning_rate": 0.00013837209302325582, |
|
"loss": 0.0091, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 12.44186046511628, |
|
"grad_norm": 0.0037452862598001957, |
|
"learning_rate": 0.0001377906976744186, |
|
"loss": 0.0092, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 12.55813953488372, |
|
"grad_norm": 0.004363079089671373, |
|
"learning_rate": 0.0001372093023255814, |
|
"loss": 0.0082, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 12.674418604651162, |
|
"grad_norm": 0.006496877875179052, |
|
"learning_rate": 0.0001366279069767442, |
|
"loss": 0.0092, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 12.790697674418604, |
|
"grad_norm": 0.00442136637866497, |
|
"learning_rate": 0.00013604651162790697, |
|
"loss": 0.0089, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 12.790697674418604, |
|
"eval_f1": 0.0, |
|
"eval_loss": 0.009442531503736973, |
|
"eval_runtime": 1.7071, |
|
"eval_samples_per_second": 42.762, |
|
"eval_steps_per_second": 5.858, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 12.906976744186046, |
|
"grad_norm": 0.004154372029006481, |
|
"learning_rate": 0.00013546511627906977, |
|
"loss": 0.0097, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 13.023255813953488, |
|
"grad_norm": 0.004947633482515812, |
|
"learning_rate": 0.00013488372093023256, |
|
"loss": 0.0094, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 13.13953488372093, |
|
"grad_norm": 0.004092765972018242, |
|
"learning_rate": 0.00013430232558139536, |
|
"loss": 0.0087, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 13.255813953488373, |
|
"grad_norm": 0.003843548009172082, |
|
"learning_rate": 0.00013372093023255815, |
|
"loss": 0.0084, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 13.372093023255815, |
|
"grad_norm": 0.0034843245521187782, |
|
"learning_rate": 0.00013313953488372092, |
|
"loss": 0.0084, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 13.488372093023255, |
|
"grad_norm": 0.0045846025459468365, |
|
"learning_rate": 0.00013255813953488372, |
|
"loss": 0.0094, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 13.604651162790697, |
|
"grad_norm": 0.0040568090043962, |
|
"learning_rate": 0.0001319767441860465, |
|
"loss": 0.0092, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 13.720930232558139, |
|
"grad_norm": 0.0035901013761758804, |
|
"learning_rate": 0.0001313953488372093, |
|
"loss": 0.0086, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 13.837209302325581, |
|
"grad_norm": 0.003683890914544463, |
|
"learning_rate": 0.0001308139534883721, |
|
"loss": 0.0092, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 13.953488372093023, |
|
"grad_norm": 0.003873488400131464, |
|
"learning_rate": 0.0001302325581395349, |
|
"loss": 0.0091, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 13.953488372093023, |
|
"eval_f1": 0.0, |
|
"eval_loss": 0.009279163554310799, |
|
"eval_runtime": 1.6753, |
|
"eval_samples_per_second": 43.574, |
|
"eval_steps_per_second": 5.969, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 14.069767441860465, |
|
"grad_norm": 0.0047092861495912075, |
|
"learning_rate": 0.0001296511627906977, |
|
"loss": 0.0078, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 14.186046511627907, |
|
"grad_norm": 0.005257419776171446, |
|
"learning_rate": 0.0001290697674418605, |
|
"loss": 0.0083, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 14.30232558139535, |
|
"grad_norm": 0.005552452523261309, |
|
"learning_rate": 0.00012848837209302326, |
|
"loss": 0.0087, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 14.418604651162791, |
|
"grad_norm": 0.0051083252765238285, |
|
"learning_rate": 0.00012790697674418605, |
|
"loss": 0.0092, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 14.534883720930232, |
|
"grad_norm": 0.0037808313500136137, |
|
"learning_rate": 0.00012732558139534885, |
|
"loss": 0.0082, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 14.651162790697674, |
|
"grad_norm": 0.004120196681469679, |
|
"learning_rate": 0.00012674418604651164, |
|
"loss": 0.0082, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 14.767441860465116, |
|
"grad_norm": 0.004169652238488197, |
|
"learning_rate": 0.00012616279069767444, |
|
"loss": 0.0085, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 14.883720930232558, |
|
"grad_norm": 0.004740726202726364, |
|
"learning_rate": 0.0001255813953488372, |
|
"loss": 0.0082, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.00697332015261054, |
|
"learning_rate": 0.000125, |
|
"loss": 0.0087, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 15.116279069767442, |
|
"grad_norm": 0.004342319909483194, |
|
"learning_rate": 0.0001244186046511628, |
|
"loss": 0.0081, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 15.116279069767442, |
|
"eval_f1": 0.0, |
|
"eval_loss": 0.008884104900062084, |
|
"eval_runtime": 1.7593, |
|
"eval_samples_per_second": 41.494, |
|
"eval_steps_per_second": 5.684, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 15.232558139534884, |
|
"grad_norm": 0.00661982037127018, |
|
"learning_rate": 0.0001238372093023256, |
|
"loss": 0.008, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 15.348837209302326, |
|
"grad_norm": 0.004550145473331213, |
|
"learning_rate": 0.00012325581395348836, |
|
"loss": 0.0082, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 15.465116279069768, |
|
"grad_norm": 0.00338382157497108, |
|
"learning_rate": 0.00012267441860465116, |
|
"loss": 0.0084, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 15.581395348837209, |
|
"grad_norm": 0.005567767191678286, |
|
"learning_rate": 0.00012209302325581395, |
|
"loss": 0.008, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 15.69767441860465, |
|
"grad_norm": 0.0051926677115261555, |
|
"learning_rate": 0.00012151162790697675, |
|
"loss": 0.0071, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 15.813953488372093, |
|
"grad_norm": 0.004935986362397671, |
|
"learning_rate": 0.00012093023255813953, |
|
"loss": 0.0085, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 15.930232558139535, |
|
"grad_norm": 0.0048750354908406734, |
|
"learning_rate": 0.00012034883720930233, |
|
"loss": 0.0078, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 16.046511627906977, |
|
"grad_norm": 0.0055388594046235085, |
|
"learning_rate": 0.00011976744186046511, |
|
"loss": 0.0082, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 16.162790697674417, |
|
"grad_norm": 0.004739618394523859, |
|
"learning_rate": 0.0001191860465116279, |
|
"loss": 0.0077, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 16.27906976744186, |
|
"grad_norm": 0.004103164654225111, |
|
"learning_rate": 0.00011860465116279071, |
|
"loss": 0.0073, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 16.27906976744186, |
|
"eval_f1": 0.0, |
|
"eval_loss": 0.008862593211233616, |
|
"eval_runtime": 1.7176, |
|
"eval_samples_per_second": 42.5, |
|
"eval_steps_per_second": 5.822, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 16.3953488372093, |
|
"grad_norm": 0.0057744914665818214, |
|
"learning_rate": 0.00011802325581395351, |
|
"loss": 0.0078, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 16.511627906976745, |
|
"grad_norm": 0.003924832213670015, |
|
"learning_rate": 0.00011744186046511629, |
|
"loss": 0.0076, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 16.627906976744185, |
|
"grad_norm": 0.004951529670506716, |
|
"learning_rate": 0.00011686046511627909, |
|
"loss": 0.0077, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 16.74418604651163, |
|
"grad_norm": 0.004139590077102184, |
|
"learning_rate": 0.00011627906976744187, |
|
"loss": 0.0073, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 16.86046511627907, |
|
"grad_norm": 0.004691829439252615, |
|
"learning_rate": 0.00011569767441860466, |
|
"loss": 0.0078, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 16.97674418604651, |
|
"grad_norm": 0.011304708197712898, |
|
"learning_rate": 0.00011511627906976746, |
|
"loss": 0.0073, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 17.093023255813954, |
|
"grad_norm": 0.004497618414461613, |
|
"learning_rate": 0.00011453488372093024, |
|
"loss": 0.0075, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 17.209302325581394, |
|
"grad_norm": 0.004640204831957817, |
|
"learning_rate": 0.00011395348837209304, |
|
"loss": 0.0067, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 17.325581395348838, |
|
"grad_norm": 0.008850700221955776, |
|
"learning_rate": 0.00011337209302325582, |
|
"loss": 0.0074, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 17.441860465116278, |
|
"grad_norm": 0.0063476236537098885, |
|
"learning_rate": 0.00011279069767441861, |
|
"loss": 0.0071, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 17.441860465116278, |
|
"eval_f1": 0.0, |
|
"eval_loss": 0.008479787036776543, |
|
"eval_runtime": 1.7053, |
|
"eval_samples_per_second": 42.808, |
|
"eval_steps_per_second": 5.864, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 17.558139534883722, |
|
"grad_norm": 0.005328552797436714, |
|
"learning_rate": 0.0001122093023255814, |
|
"loss": 0.0071, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 17.674418604651162, |
|
"grad_norm": 0.006423116661608219, |
|
"learning_rate": 0.00011162790697674419, |
|
"loss": 0.0075, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 17.790697674418606, |
|
"grad_norm": 0.006245277356356382, |
|
"learning_rate": 0.00011104651162790699, |
|
"loss": 0.0074, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 17.906976744186046, |
|
"grad_norm": 0.006818824913352728, |
|
"learning_rate": 0.00011046511627906977, |
|
"loss": 0.0065, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 18.023255813953487, |
|
"grad_norm": 0.005169364158064127, |
|
"learning_rate": 0.00010988372093023256, |
|
"loss": 0.0076, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 18.13953488372093, |
|
"grad_norm": 0.0035659593995660543, |
|
"learning_rate": 0.00010930232558139534, |
|
"loss": 0.0067, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 18.25581395348837, |
|
"grad_norm": 0.004306168295443058, |
|
"learning_rate": 0.00010872093023255814, |
|
"loss": 0.0066, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 18.372093023255815, |
|
"grad_norm": 0.004275166429579258, |
|
"learning_rate": 0.00010813953488372092, |
|
"loss": 0.0064, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 18.488372093023255, |
|
"grad_norm": 0.003919180482625961, |
|
"learning_rate": 0.00010755813953488372, |
|
"loss": 0.0071, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 18.6046511627907, |
|
"grad_norm": 0.008834786713123322, |
|
"learning_rate": 0.00010697674418604651, |
|
"loss": 0.0068, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 18.6046511627907, |
|
"eval_f1": 0.0182648401826484, |
|
"eval_loss": 0.008239569142460823, |
|
"eval_runtime": 1.7236, |
|
"eval_samples_per_second": 42.353, |
|
"eval_steps_per_second": 5.802, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 18.72093023255814, |
|
"grad_norm": 0.004857208579778671, |
|
"learning_rate": 0.0001063953488372093, |
|
"loss": 0.0062, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 18.837209302325583, |
|
"grad_norm": 0.005715236067771912, |
|
"learning_rate": 0.0001058139534883721, |
|
"loss": 0.0074, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 18.953488372093023, |
|
"grad_norm": 0.005261226557195187, |
|
"learning_rate": 0.0001052325581395349, |
|
"loss": 0.0065, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 19.069767441860463, |
|
"grad_norm": 0.005216473713517189, |
|
"learning_rate": 0.00010465116279069768, |
|
"loss": 0.0066, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 19.186046511627907, |
|
"grad_norm": 0.004990010056644678, |
|
"learning_rate": 0.00010406976744186048, |
|
"loss": 0.0065, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 19.302325581395348, |
|
"grad_norm": 0.005021219607442617, |
|
"learning_rate": 0.00010348837209302327, |
|
"loss": 0.0064, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 19.41860465116279, |
|
"grad_norm": 0.006166994571685791, |
|
"learning_rate": 0.00010290697674418605, |
|
"loss": 0.0069, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 19.53488372093023, |
|
"grad_norm": 0.0043314131908118725, |
|
"learning_rate": 0.00010232558139534885, |
|
"loss": 0.0064, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 19.651162790697676, |
|
"grad_norm": 0.006082617212086916, |
|
"learning_rate": 0.00010174418604651163, |
|
"loss": 0.006, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 19.767441860465116, |
|
"grad_norm": 0.008033761754631996, |
|
"learning_rate": 0.00010116279069767443, |
|
"loss": 0.0064, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 19.767441860465116, |
|
"eval_f1": 0.0365296803652968, |
|
"eval_loss": 0.008163763210177422, |
|
"eval_runtime": 1.7458, |
|
"eval_samples_per_second": 41.815, |
|
"eval_steps_per_second": 5.728, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 19.88372093023256, |
|
"grad_norm": 0.006994906347244978, |
|
"learning_rate": 0.00010058139534883721, |
|
"loss": 0.0061, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.006535364780575037, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0061, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 20.11627906976744, |
|
"grad_norm": 0.004905609879642725, |
|
"learning_rate": 9.94186046511628e-05, |
|
"loss": 0.006, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 20.232558139534884, |
|
"grad_norm": 0.004114439245313406, |
|
"learning_rate": 9.883720930232558e-05, |
|
"loss": 0.0064, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 20.348837209302324, |
|
"grad_norm": 0.003827931359410286, |
|
"learning_rate": 9.825581395348838e-05, |
|
"loss": 0.0052, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 20.46511627906977, |
|
"grad_norm": 0.0036556690465658903, |
|
"learning_rate": 9.767441860465116e-05, |
|
"loss": 0.0059, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 20.58139534883721, |
|
"grad_norm": 0.00459684245288372, |
|
"learning_rate": 9.709302325581396e-05, |
|
"loss": 0.0053, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 20.697674418604652, |
|
"grad_norm": 0.0047168247401714325, |
|
"learning_rate": 9.651162790697675e-05, |
|
"loss": 0.0059, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 20.813953488372093, |
|
"grad_norm": 0.005462776403874159, |
|
"learning_rate": 9.593023255813955e-05, |
|
"loss": 0.006, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 20.930232558139537, |
|
"grad_norm": 0.004429865162819624, |
|
"learning_rate": 9.534883720930233e-05, |
|
"loss": 0.0061, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 20.930232558139537, |
|
"eval_f1": 0.0091324200913242, |
|
"eval_loss": 0.008642657659947872, |
|
"eval_runtime": 1.6675, |
|
"eval_samples_per_second": 43.777, |
|
"eval_steps_per_second": 5.997, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 21.046511627906977, |
|
"grad_norm": 0.008058182895183563, |
|
"learning_rate": 9.476744186046512e-05, |
|
"loss": 0.0062, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 21.162790697674417, |
|
"grad_norm": 0.0037425195332616568, |
|
"learning_rate": 9.418604651162792e-05, |
|
"loss": 0.0053, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 21.27906976744186, |
|
"grad_norm": 0.004587921779602766, |
|
"learning_rate": 9.36046511627907e-05, |
|
"loss": 0.006, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 21.3953488372093, |
|
"grad_norm": 0.006532969884574413, |
|
"learning_rate": 9.30232558139535e-05, |
|
"loss": 0.0056, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 21.511627906976745, |
|
"grad_norm": 0.008152597583830357, |
|
"learning_rate": 9.244186046511628e-05, |
|
"loss": 0.0061, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 21.627906976744185, |
|
"grad_norm": 0.005781789310276508, |
|
"learning_rate": 9.186046511627907e-05, |
|
"loss": 0.0059, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 21.74418604651163, |
|
"grad_norm": 0.004202969837933779, |
|
"learning_rate": 9.127906976744186e-05, |
|
"loss": 0.0055, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 21.86046511627907, |
|
"grad_norm": 0.0047506955452263355, |
|
"learning_rate": 9.069767441860465e-05, |
|
"loss": 0.0058, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 21.97674418604651, |
|
"grad_norm": 0.005872243549674749, |
|
"learning_rate": 9.011627906976745e-05, |
|
"loss": 0.0056, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 22.093023255813954, |
|
"grad_norm": 0.005416091997176409, |
|
"learning_rate": 8.953488372093024e-05, |
|
"loss": 0.0054, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 22.093023255813954, |
|
"eval_f1": 0.0593607305936073, |
|
"eval_loss": 0.008248222060501575, |
|
"eval_runtime": 1.7364, |
|
"eval_samples_per_second": 42.04, |
|
"eval_steps_per_second": 5.759, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 22.209302325581394, |
|
"grad_norm": 0.0039680940099060535, |
|
"learning_rate": 8.895348837209302e-05, |
|
"loss": 0.005, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 22.325581395348838, |
|
"grad_norm": 0.0041343714110553265, |
|
"learning_rate": 8.837209302325582e-05, |
|
"loss": 0.006, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 22.441860465116278, |
|
"grad_norm": 0.0031660939566791058, |
|
"learning_rate": 8.779069767441861e-05, |
|
"loss": 0.0053, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 22.558139534883722, |
|
"grad_norm": 0.005090238060802221, |
|
"learning_rate": 8.72093023255814e-05, |
|
"loss": 0.0053, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 22.674418604651162, |
|
"grad_norm": 0.0037465649656951427, |
|
"learning_rate": 8.662790697674419e-05, |
|
"loss": 0.0048, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 22.790697674418606, |
|
"grad_norm": 0.0040254793129861355, |
|
"learning_rate": 8.604651162790697e-05, |
|
"loss": 0.0051, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 22.906976744186046, |
|
"grad_norm": 0.007103138603270054, |
|
"learning_rate": 8.546511627906977e-05, |
|
"loss": 0.0054, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 23.023255813953487, |
|
"grad_norm": 0.00407650088891387, |
|
"learning_rate": 8.488372093023255e-05, |
|
"loss": 0.0055, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 23.13953488372093, |
|
"grad_norm": 0.004379500634968281, |
|
"learning_rate": 8.430232558139536e-05, |
|
"loss": 0.0054, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 23.25581395348837, |
|
"grad_norm": 0.004481241572648287, |
|
"learning_rate": 8.372093023255814e-05, |
|
"loss": 0.0051, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 23.25581395348837, |
|
"eval_f1": 0.0502283105022831, |
|
"eval_loss": 0.008041327819228172, |
|
"eval_runtime": 1.6921, |
|
"eval_samples_per_second": 43.14, |
|
"eval_steps_per_second": 5.91, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 23.372093023255815, |
|
"grad_norm": 0.005388026591390371, |
|
"learning_rate": 8.313953488372094e-05, |
|
"loss": 0.0053, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 23.488372093023255, |
|
"grad_norm": 0.004522989969700575, |
|
"learning_rate": 8.255813953488373e-05, |
|
"loss": 0.0051, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 23.6046511627907, |
|
"grad_norm": 0.0037742829881608486, |
|
"learning_rate": 8.197674418604652e-05, |
|
"loss": 0.0054, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 23.72093023255814, |
|
"grad_norm": 0.0024201772175729275, |
|
"learning_rate": 8.139534883720931e-05, |
|
"loss": 0.0047, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 23.837209302325583, |
|
"grad_norm": 0.010637535713613033, |
|
"learning_rate": 8.081395348837209e-05, |
|
"loss": 0.0053, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 23.953488372093023, |
|
"grad_norm": 0.0036848525051027536, |
|
"learning_rate": 8.023255813953489e-05, |
|
"loss": 0.0045, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 24.069767441860463, |
|
"grad_norm": 0.0036426165606826544, |
|
"learning_rate": 7.965116279069767e-05, |
|
"loss": 0.0051, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 24.186046511627907, |
|
"grad_norm": 0.004131542984396219, |
|
"learning_rate": 7.906976744186047e-05, |
|
"loss": 0.005, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 24.302325581395348, |
|
"grad_norm": 0.005381637252867222, |
|
"learning_rate": 7.848837209302326e-05, |
|
"loss": 0.0049, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 24.41860465116279, |
|
"grad_norm": 0.00364381424151361, |
|
"learning_rate": 7.790697674418606e-05, |
|
"loss": 0.0048, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 24.41860465116279, |
|
"eval_f1": 0.0639269406392694, |
|
"eval_loss": 0.007898622192442417, |
|
"eval_runtime": 1.74, |
|
"eval_samples_per_second": 41.953, |
|
"eval_steps_per_second": 5.747, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 24.53488372093023, |
|
"grad_norm": 0.0032041827216744423, |
|
"learning_rate": 7.732558139534884e-05, |
|
"loss": 0.0047, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 24.651162790697676, |
|
"grad_norm": 0.004305603448301554, |
|
"learning_rate": 7.674418604651163e-05, |
|
"loss": 0.0048, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 24.767441860465116, |
|
"grad_norm": 0.003094649873673916, |
|
"learning_rate": 7.616279069767443e-05, |
|
"loss": 0.0042, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 24.88372093023256, |
|
"grad_norm": 0.0032246061600744724, |
|
"learning_rate": 7.558139534883721e-05, |
|
"loss": 0.0045, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"grad_norm": 0.00515501806512475, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.0047, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 25.11627906976744, |
|
"grad_norm": 0.003303398611024022, |
|
"learning_rate": 7.441860465116279e-05, |
|
"loss": 0.0039, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 25.232558139534884, |
|
"grad_norm": 0.00300194276496768, |
|
"learning_rate": 7.383720930232558e-05, |
|
"loss": 0.0042, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 25.348837209302324, |
|
"grad_norm": 0.007243013009428978, |
|
"learning_rate": 7.325581395348837e-05, |
|
"loss": 0.0048, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 25.46511627906977, |
|
"grad_norm": 0.00349716329947114, |
|
"learning_rate": 7.267441860465116e-05, |
|
"loss": 0.005, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 25.58139534883721, |
|
"grad_norm": 0.003004447091370821, |
|
"learning_rate": 7.209302325581396e-05, |
|
"loss": 0.0045, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 25.58139534883721, |
|
"eval_f1": 0.0639269406392694, |
|
"eval_loss": 0.008040810003876686, |
|
"eval_runtime": 1.6946, |
|
"eval_samples_per_second": 43.078, |
|
"eval_steps_per_second": 5.901, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 25.697674418604652, |
|
"grad_norm": 0.0032493805047124624, |
|
"learning_rate": 7.151162790697675e-05, |
|
"loss": 0.0042, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 25.813953488372093, |
|
"grad_norm": 0.005652161315083504, |
|
"learning_rate": 7.093023255813955e-05, |
|
"loss": 0.0043, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 25.930232558139537, |
|
"grad_norm": 0.003228959860280156, |
|
"learning_rate": 7.034883720930233e-05, |
|
"loss": 0.004, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 26.046511627906977, |
|
"grad_norm": 0.0029076021164655685, |
|
"learning_rate": 6.976744186046513e-05, |
|
"loss": 0.0046, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 26.162790697674417, |
|
"grad_norm": 0.004093965515494347, |
|
"learning_rate": 6.918604651162791e-05, |
|
"loss": 0.0046, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 26.27906976744186, |
|
"grad_norm": 0.0039006653241813183, |
|
"learning_rate": 6.86046511627907e-05, |
|
"loss": 0.0044, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 26.3953488372093, |
|
"grad_norm": 0.0035654634702950716, |
|
"learning_rate": 6.802325581395348e-05, |
|
"loss": 0.0043, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 26.511627906976745, |
|
"grad_norm": 0.0034853783436119556, |
|
"learning_rate": 6.744186046511628e-05, |
|
"loss": 0.0035, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 26.627906976744185, |
|
"grad_norm": 0.0032556080259382725, |
|
"learning_rate": 6.686046511627908e-05, |
|
"loss": 0.0039, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 26.74418604651163, |
|
"grad_norm": 0.004529784433543682, |
|
"learning_rate": 6.627906976744186e-05, |
|
"loss": 0.0036, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 26.74418604651163, |
|
"eval_f1": 0.10273972602739725, |
|
"eval_loss": 0.007886539213359356, |
|
"eval_runtime": 1.7923, |
|
"eval_samples_per_second": 40.73, |
|
"eval_steps_per_second": 5.579, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 26.86046511627907, |
|
"grad_norm": 0.0036729234270751476, |
|
"learning_rate": 6.569767441860465e-05, |
|
"loss": 0.0041, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 26.97674418604651, |
|
"grad_norm": 0.003093762556090951, |
|
"learning_rate": 6.511627906976745e-05, |
|
"loss": 0.0043, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 27.093023255813954, |
|
"grad_norm": 0.0035506237763911486, |
|
"learning_rate": 6.453488372093024e-05, |
|
"loss": 0.0038, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 27.209302325581394, |
|
"grad_norm": 0.004144872073084116, |
|
"learning_rate": 6.395348837209303e-05, |
|
"loss": 0.0041, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 27.325581395348838, |
|
"grad_norm": 0.0029570453334599733, |
|
"learning_rate": 6.337209302325582e-05, |
|
"loss": 0.0039, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 27.441860465116278, |
|
"grad_norm": 0.0029905049595981836, |
|
"learning_rate": 6.27906976744186e-05, |
|
"loss": 0.0038, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 27.558139534883722, |
|
"grad_norm": 0.0037850120570510626, |
|
"learning_rate": 6.22093023255814e-05, |
|
"loss": 0.0041, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 27.674418604651162, |
|
"grad_norm": 0.0027285381220281124, |
|
"learning_rate": 6.162790697674418e-05, |
|
"loss": 0.004, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 27.790697674418606, |
|
"grad_norm": 0.0033172164112329483, |
|
"learning_rate": 6.104651162790698e-05, |
|
"loss": 0.0036, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 27.906976744186046, |
|
"grad_norm": 0.0038614473305642605, |
|
"learning_rate": 6.0465116279069765e-05, |
|
"loss": 0.0038, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 27.906976744186046, |
|
"eval_f1": 0.10273972602739725, |
|
"eval_loss": 0.007930786348879337, |
|
"eval_runtime": 1.6954, |
|
"eval_samples_per_second": 43.058, |
|
"eval_steps_per_second": 5.898, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 28.023255813953487, |
|
"grad_norm": 0.004366340581327677, |
|
"learning_rate": 5.9883720930232554e-05, |
|
"loss": 0.004, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 28.13953488372093, |
|
"grad_norm": 0.0038022997323423624, |
|
"learning_rate": 5.9302325581395356e-05, |
|
"loss": 0.0038, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 28.25581395348837, |
|
"grad_norm": 0.0036421448457986116, |
|
"learning_rate": 5.8720930232558145e-05, |
|
"loss": 0.0038, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 28.372093023255815, |
|
"grad_norm": 0.003124956740066409, |
|
"learning_rate": 5.8139534883720933e-05, |
|
"loss": 0.0038, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 28.488372093023255, |
|
"grad_norm": 0.0033948852214962244, |
|
"learning_rate": 5.755813953488373e-05, |
|
"loss": 0.0036, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 28.6046511627907, |
|
"grad_norm": 0.002520653186365962, |
|
"learning_rate": 5.697674418604652e-05, |
|
"loss": 0.0035, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 28.72093023255814, |
|
"grad_norm": 0.003338318085297942, |
|
"learning_rate": 5.6395348837209306e-05, |
|
"loss": 0.0037, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 28.837209302325583, |
|
"grad_norm": 0.0030738948844373226, |
|
"learning_rate": 5.5813953488372095e-05, |
|
"loss": 0.0035, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 28.953488372093023, |
|
"grad_norm": 0.0024965908378362656, |
|
"learning_rate": 5.5232558139534884e-05, |
|
"loss": 0.0037, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 29.069767441860463, |
|
"grad_norm": 0.0026568674948066473, |
|
"learning_rate": 5.465116279069767e-05, |
|
"loss": 0.0032, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 29.069767441860463, |
|
"eval_f1": 0.091324200913242, |
|
"eval_loss": 0.007714101579040289, |
|
"eval_runtime": 1.768, |
|
"eval_samples_per_second": 41.289, |
|
"eval_steps_per_second": 5.656, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 29.186046511627907, |
|
"grad_norm": 0.004428944084793329, |
|
"learning_rate": 5.406976744186046e-05, |
|
"loss": 0.0036, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 29.302325581395348, |
|
"grad_norm": 0.0083547318354249, |
|
"learning_rate": 5.348837209302326e-05, |
|
"loss": 0.0033, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 29.41860465116279, |
|
"grad_norm": 0.003402271308004856, |
|
"learning_rate": 5.290697674418605e-05, |
|
"loss": 0.0039, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 29.53488372093023, |
|
"grad_norm": 0.0033303038217127323, |
|
"learning_rate": 5.232558139534884e-05, |
|
"loss": 0.0035, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 29.651162790697676, |
|
"grad_norm": 0.03485463559627533, |
|
"learning_rate": 5.1744186046511636e-05, |
|
"loss": 0.0038, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 29.767441860465116, |
|
"grad_norm": 0.004661730024963617, |
|
"learning_rate": 5.1162790697674425e-05, |
|
"loss": 0.0036, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 29.88372093023256, |
|
"grad_norm": 0.0038003467489033937, |
|
"learning_rate": 5.0581395348837214e-05, |
|
"loss": 0.0035, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"grad_norm": 0.004514409229159355, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0033, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 30.11627906976744, |
|
"grad_norm": 0.004792849998921156, |
|
"learning_rate": 4.941860465116279e-05, |
|
"loss": 0.0035, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 30.232558139534884, |
|
"grad_norm": 0.004430180415511131, |
|
"learning_rate": 4.883720930232558e-05, |
|
"loss": 0.004, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 30.232558139534884, |
|
"eval_f1": 0.10273972602739725, |
|
"eval_loss": 0.007851834408938885, |
|
"eval_runtime": 1.6578, |
|
"eval_samples_per_second": 44.034, |
|
"eval_steps_per_second": 6.032, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 30.348837209302324, |
|
"grad_norm": 0.0030448336619883776, |
|
"learning_rate": 4.8255813953488375e-05, |
|
"loss": 0.0034, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 30.46511627906977, |
|
"grad_norm": 0.0026765645015984774, |
|
"learning_rate": 4.7674418604651164e-05, |
|
"loss": 0.0033, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 30.58139534883721, |
|
"grad_norm": 0.002969229593873024, |
|
"learning_rate": 4.709302325581396e-05, |
|
"loss": 0.0033, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 30.697674418604652, |
|
"grad_norm": 0.005750367883592844, |
|
"learning_rate": 4.651162790697675e-05, |
|
"loss": 0.0029, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 30.813953488372093, |
|
"grad_norm": 0.0028870333917438984, |
|
"learning_rate": 4.593023255813954e-05, |
|
"loss": 0.0033, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 30.930232558139537, |
|
"grad_norm": 0.003318176371976733, |
|
"learning_rate": 4.5348837209302326e-05, |
|
"loss": 0.0034, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 31.046511627906977, |
|
"grad_norm": 0.0030346305575221777, |
|
"learning_rate": 4.476744186046512e-05, |
|
"loss": 0.003, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 31.162790697674417, |
|
"grad_norm": 0.0031568079721182585, |
|
"learning_rate": 4.418604651162791e-05, |
|
"loss": 0.0033, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 31.27906976744186, |
|
"grad_norm": 0.0028284024447202682, |
|
"learning_rate": 4.36046511627907e-05, |
|
"loss": 0.0031, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 31.3953488372093, |
|
"grad_norm": 0.003977891989052296, |
|
"learning_rate": 4.302325581395349e-05, |
|
"loss": 0.003, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 31.3953488372093, |
|
"eval_f1": 0.09360730593607304, |
|
"eval_loss": 0.008055122569203377, |
|
"eval_runtime": 1.6937, |
|
"eval_samples_per_second": 43.101, |
|
"eval_steps_per_second": 5.904, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 31.511627906976745, |
|
"grad_norm": 0.0026259180158376694, |
|
"learning_rate": 4.2441860465116276e-05, |
|
"loss": 0.0036, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 31.627906976744185, |
|
"grad_norm": 0.003417164785787463, |
|
"learning_rate": 4.186046511627907e-05, |
|
"loss": 0.0032, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 31.74418604651163, |
|
"grad_norm": 0.0038994085043668747, |
|
"learning_rate": 4.127906976744187e-05, |
|
"loss": 0.0032, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 31.86046511627907, |
|
"grad_norm": 0.0034232349134981632, |
|
"learning_rate": 4.0697674418604655e-05, |
|
"loss": 0.0027, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 31.97674418604651, |
|
"grad_norm": 0.004144703969359398, |
|
"learning_rate": 4.0116279069767444e-05, |
|
"loss": 0.0035, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 32.093023255813954, |
|
"grad_norm": 0.0026302810292690992, |
|
"learning_rate": 3.953488372093023e-05, |
|
"loss": 0.0032, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 32.2093023255814, |
|
"grad_norm": 0.002205599332228303, |
|
"learning_rate": 3.895348837209303e-05, |
|
"loss": 0.0033, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 32.325581395348834, |
|
"grad_norm": 0.003439367515966296, |
|
"learning_rate": 3.837209302325582e-05, |
|
"loss": 0.0032, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 32.44186046511628, |
|
"grad_norm": 0.0022263077553361654, |
|
"learning_rate": 3.7790697674418606e-05, |
|
"loss": 0.0029, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 32.55813953488372, |
|
"grad_norm": 0.003249467583373189, |
|
"learning_rate": 3.7209302325581394e-05, |
|
"loss": 0.0029, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 32.55813953488372, |
|
"eval_f1": 0.08904109589041094, |
|
"eval_loss": 0.008000507019460201, |
|
"eval_runtime": 1.6639, |
|
"eval_samples_per_second": 43.874, |
|
"eval_steps_per_second": 6.01, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 32.674418604651166, |
|
"grad_norm": 0.003246536012738943, |
|
"learning_rate": 3.662790697674418e-05, |
|
"loss": 0.0027, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 32.7906976744186, |
|
"grad_norm": 0.0032464447431266308, |
|
"learning_rate": 3.604651162790698e-05, |
|
"loss": 0.0033, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 32.906976744186046, |
|
"grad_norm": 0.002286577830091119, |
|
"learning_rate": 3.5465116279069774e-05, |
|
"loss": 0.0029, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 33.02325581395349, |
|
"grad_norm": 0.0027776581700891256, |
|
"learning_rate": 3.488372093023256e-05, |
|
"loss": 0.003, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 33.13953488372093, |
|
"grad_norm": 0.0025106158573180437, |
|
"learning_rate": 3.430232558139535e-05, |
|
"loss": 0.0027, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 33.25581395348837, |
|
"grad_norm": 0.0026736510917544365, |
|
"learning_rate": 3.372093023255814e-05, |
|
"loss": 0.0029, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 33.372093023255815, |
|
"grad_norm": 0.002507059136405587, |
|
"learning_rate": 3.313953488372093e-05, |
|
"loss": 0.0025, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 33.48837209302326, |
|
"grad_norm": 0.0028928928077220917, |
|
"learning_rate": 3.2558139534883724e-05, |
|
"loss": 0.0031, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 33.604651162790695, |
|
"grad_norm": 0.0032647838816046715, |
|
"learning_rate": 3.197674418604651e-05, |
|
"loss": 0.0028, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 33.72093023255814, |
|
"grad_norm": 0.0033728063572198153, |
|
"learning_rate": 3.13953488372093e-05, |
|
"loss": 0.0033, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 33.72093023255814, |
|
"eval_f1": 0.08447488584474885, |
|
"eval_loss": 0.008075451478362083, |
|
"eval_runtime": 1.6844, |
|
"eval_samples_per_second": 43.339, |
|
"eval_steps_per_second": 5.937, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 33.83720930232558, |
|
"grad_norm": 0.0026662382297217846, |
|
"learning_rate": 3.081395348837209e-05, |
|
"loss": 0.0031, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 33.95348837209303, |
|
"grad_norm": 0.0025568390265107155, |
|
"learning_rate": 3.0232558139534883e-05, |
|
"loss": 0.0028, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 34.06976744186046, |
|
"grad_norm": 0.002340954029932618, |
|
"learning_rate": 2.9651162790697678e-05, |
|
"loss": 0.0026, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 34.18604651162791, |
|
"grad_norm": 0.0033266523387283087, |
|
"learning_rate": 2.9069767441860467e-05, |
|
"loss": 0.0031, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 34.30232558139535, |
|
"grad_norm": 0.0023587134201079607, |
|
"learning_rate": 2.848837209302326e-05, |
|
"loss": 0.0027, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 34.41860465116279, |
|
"grad_norm": 0.002229692181572318, |
|
"learning_rate": 2.7906976744186048e-05, |
|
"loss": 0.0029, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 34.53488372093023, |
|
"grad_norm": 0.0029290826059877872, |
|
"learning_rate": 2.7325581395348836e-05, |
|
"loss": 0.0026, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 34.651162790697676, |
|
"grad_norm": 0.0030265513341873884, |
|
"learning_rate": 2.674418604651163e-05, |
|
"loss": 0.0026, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 34.76744186046512, |
|
"grad_norm": 0.0028545409440994263, |
|
"learning_rate": 2.616279069767442e-05, |
|
"loss": 0.0032, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 34.883720930232556, |
|
"grad_norm": 0.002398886950686574, |
|
"learning_rate": 2.5581395348837212e-05, |
|
"loss": 0.0029, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 34.883720930232556, |
|
"eval_f1": 0.12557077625570776, |
|
"eval_loss": 0.008088447153568268, |
|
"eval_runtime": 1.6856, |
|
"eval_samples_per_second": 43.309, |
|
"eval_steps_per_second": 5.933, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"grad_norm": 0.00287942448630929, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.0026, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 35.116279069767444, |
|
"grad_norm": 0.0025614567566663027, |
|
"learning_rate": 2.441860465116279e-05, |
|
"loss": 0.0024, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 35.23255813953488, |
|
"grad_norm": 0.002899142215028405, |
|
"learning_rate": 2.3837209302325582e-05, |
|
"loss": 0.0028, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 35.348837209302324, |
|
"grad_norm": 0.003214885015040636, |
|
"learning_rate": 2.3255813953488374e-05, |
|
"loss": 0.0032, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 35.46511627906977, |
|
"grad_norm": 0.003339228220283985, |
|
"learning_rate": 2.2674418604651163e-05, |
|
"loss": 0.0027, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 35.58139534883721, |
|
"grad_norm": 0.0030185177456587553, |
|
"learning_rate": 2.2093023255813955e-05, |
|
"loss": 0.0028, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 35.69767441860465, |
|
"grad_norm": 0.00310333538800478, |
|
"learning_rate": 2.1511627906976744e-05, |
|
"loss": 0.0026, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 35.81395348837209, |
|
"grad_norm": 0.0028332043439149857, |
|
"learning_rate": 2.0930232558139536e-05, |
|
"loss": 0.0026, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 35.93023255813954, |
|
"grad_norm": 0.0021240401547402143, |
|
"learning_rate": 2.0348837209302328e-05, |
|
"loss": 0.0028, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 36.04651162790697, |
|
"grad_norm": 0.0028554806485772133, |
|
"learning_rate": 1.9767441860465116e-05, |
|
"loss": 0.0025, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 36.04651162790697, |
|
"eval_f1": 0.13470319634703196, |
|
"eval_loss": 0.008070297539234161, |
|
"eval_runtime": 1.8154, |
|
"eval_samples_per_second": 40.211, |
|
"eval_steps_per_second": 5.508, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 36.16279069767442, |
|
"grad_norm": 0.0031274755019694567, |
|
"learning_rate": 1.918604651162791e-05, |
|
"loss": 0.0029, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 36.27906976744186, |
|
"grad_norm": 0.0016608175355941057, |
|
"learning_rate": 1.8604651162790697e-05, |
|
"loss": 0.0026, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 36.395348837209305, |
|
"grad_norm": 0.0023403004743158817, |
|
"learning_rate": 1.802325581395349e-05, |
|
"loss": 0.0025, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 36.51162790697674, |
|
"grad_norm": 0.0022207223810255527, |
|
"learning_rate": 1.744186046511628e-05, |
|
"loss": 0.0028, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 36.627906976744185, |
|
"grad_norm": 0.0019355164840817451, |
|
"learning_rate": 1.686046511627907e-05, |
|
"loss": 0.0025, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 36.74418604651163, |
|
"grad_norm": 0.00238374387845397, |
|
"learning_rate": 1.6279069767441862e-05, |
|
"loss": 0.0026, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 36.86046511627907, |
|
"grad_norm": 0.002908985363319516, |
|
"learning_rate": 1.569767441860465e-05, |
|
"loss": 0.0028, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 36.97674418604651, |
|
"grad_norm": 0.002507247030735016, |
|
"learning_rate": 1.5116279069767441e-05, |
|
"loss": 0.0025, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 37.093023255813954, |
|
"grad_norm": 0.0024845688603818417, |
|
"learning_rate": 1.4534883720930233e-05, |
|
"loss": 0.0028, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 37.2093023255814, |
|
"grad_norm": 0.0021229905541986227, |
|
"learning_rate": 1.3953488372093024e-05, |
|
"loss": 0.0027, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 37.2093023255814, |
|
"eval_f1": 0.1324200913242009, |
|
"eval_loss": 0.008123889565467834, |
|
"eval_runtime": 1.6983, |
|
"eval_samples_per_second": 42.985, |
|
"eval_steps_per_second": 5.888, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 37.325581395348834, |
|
"grad_norm": 0.0020165506284683943, |
|
"learning_rate": 1.3372093023255814e-05, |
|
"loss": 0.0024, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 37.44186046511628, |
|
"grad_norm": 0.00225025019608438, |
|
"learning_rate": 1.2790697674418606e-05, |
|
"loss": 0.0024, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 37.55813953488372, |
|
"grad_norm": 0.002410660730674863, |
|
"learning_rate": 1.2209302325581395e-05, |
|
"loss": 0.0025, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 37.674418604651166, |
|
"grad_norm": 0.0022259822580963373, |
|
"learning_rate": 1.1627906976744187e-05, |
|
"loss": 0.0026, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 37.7906976744186, |
|
"grad_norm": 0.002472145715728402, |
|
"learning_rate": 1.1046511627906977e-05, |
|
"loss": 0.0028, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 37.906976744186046, |
|
"grad_norm": 0.002238726709038019, |
|
"learning_rate": 1.0465116279069768e-05, |
|
"loss": 0.0024, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 38.02325581395349, |
|
"grad_norm": 0.0025766075123101473, |
|
"learning_rate": 9.883720930232558e-06, |
|
"loss": 0.0027, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 38.13953488372093, |
|
"grad_norm": 0.002669785637408495, |
|
"learning_rate": 9.302325581395349e-06, |
|
"loss": 0.0027, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 38.25581395348837, |
|
"grad_norm": 0.0023648098576813936, |
|
"learning_rate": 8.72093023255814e-06, |
|
"loss": 0.0028, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 38.372093023255815, |
|
"grad_norm": 0.0019695968367159367, |
|
"learning_rate": 8.139534883720931e-06, |
|
"loss": 0.0028, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 38.372093023255815, |
|
"eval_f1": 0.1324200913242009, |
|
"eval_loss": 0.008158449083566666, |
|
"eval_runtime": 1.8249, |
|
"eval_samples_per_second": 40.002, |
|
"eval_steps_per_second": 5.48, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 38.48837209302326, |
|
"grad_norm": 0.0023280028253793716, |
|
"learning_rate": 7.558139534883721e-06, |
|
"loss": 0.0024, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 38.604651162790695, |
|
"grad_norm": 0.0030140073504298925, |
|
"learning_rate": 6.976744186046512e-06, |
|
"loss": 0.0026, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 38.72093023255814, |
|
"grad_norm": 0.0024626152589917183, |
|
"learning_rate": 6.395348837209303e-06, |
|
"loss": 0.0024, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 38.83720930232558, |
|
"grad_norm": 0.0027184640057384968, |
|
"learning_rate": 5.8139534883720935e-06, |
|
"loss": 0.0022, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 38.95348837209303, |
|
"grad_norm": 0.002258691005408764, |
|
"learning_rate": 5.232558139534884e-06, |
|
"loss": 0.0027, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 39.06976744186046, |
|
"grad_norm": 0.0015635826857760549, |
|
"learning_rate": 4.651162790697674e-06, |
|
"loss": 0.0025, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 39.18604651162791, |
|
"grad_norm": 0.0014281744370236993, |
|
"learning_rate": 4.0697674418604655e-06, |
|
"loss": 0.0025, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 39.30232558139535, |
|
"grad_norm": 0.0019795901607722044, |
|
"learning_rate": 3.488372093023256e-06, |
|
"loss": 0.0024, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 39.41860465116279, |
|
"grad_norm": 0.00274649984203279, |
|
"learning_rate": 2.9069767441860468e-06, |
|
"loss": 0.0028, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 39.53488372093023, |
|
"grad_norm": 0.0021044183522462845, |
|
"learning_rate": 2.325581395348837e-06, |
|
"loss": 0.0023, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 39.53488372093023, |
|
"eval_f1": 0.1324200913242009, |
|
"eval_loss": 0.008160348981618881, |
|
"eval_runtime": 1.7392, |
|
"eval_samples_per_second": 41.972, |
|
"eval_steps_per_second": 5.75, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 39.651162790697676, |
|
"grad_norm": 0.0022693907376378775, |
|
"learning_rate": 1.744186046511628e-06, |
|
"loss": 0.0025, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 39.76744186046512, |
|
"grad_norm": 0.0020165799651294947, |
|
"learning_rate": 1.1627906976744186e-06, |
|
"loss": 0.0027, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 39.883720930232556, |
|
"grad_norm": 0.0023625025060027838, |
|
"learning_rate": 5.813953488372093e-07, |
|
"loss": 0.0022, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 0.0031698495149612427, |
|
"learning_rate": 0.0, |
|
"loss": 0.0029, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"step": 3440, |
|
"total_flos": 4.2863407148659507e+18, |
|
"train_loss": 0.015139733321509908, |
|
"train_runtime": 1940.273, |
|
"train_samples_per_second": 28.285, |
|
"train_steps_per_second": 1.773 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3440, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 40, |
|
"save_steps": 100, |
|
"total_flos": 4.2863407148659507e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|