|
{
|
|
"best_metric": 0.910958904109589,
|
|
"best_model_checkpoint": "swinv2-tiny-patch4-window8-256-finetuned-5emotions\\checkpoint-5281",
|
|
"epoch": 24.99881656804734,
|
|
"eval_steps": 500,
|
|
"global_step": 5281,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.047337278106508875,
|
|
"grad_norm": 7.075885772705078,
|
|
"learning_rate": 6.765899864682003e-07,
|
|
"loss": 1.691,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.09467455621301775,
|
|
"grad_norm": 9.890098571777344,
|
|
"learning_rate": 1.3531799729364006e-06,
|
|
"loss": 1.6712,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.14201183431952663,
|
|
"grad_norm": 7.415971755981445,
|
|
"learning_rate": 2.029769959404601e-06,
|
|
"loss": 1.6787,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.1893491124260355,
|
|
"grad_norm": 6.3063764572143555,
|
|
"learning_rate": 2.7063599458728013e-06,
|
|
"loss": 1.6685,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.23668639053254437,
|
|
"grad_norm": 8.763900756835938,
|
|
"learning_rate": 3.3829499323410016e-06,
|
|
"loss": 1.6143,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.28402366863905326,
|
|
"grad_norm": 6.661700248718262,
|
|
"learning_rate": 4.059539918809202e-06,
|
|
"loss": 1.5849,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.33136094674556216,
|
|
"grad_norm": 7.178672790527344,
|
|
"learning_rate": 4.736129905277402e-06,
|
|
"loss": 1.5502,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.378698224852071,
|
|
"grad_norm": 5.857969284057617,
|
|
"learning_rate": 5.4127198917456026e-06,
|
|
"loss": 1.5274,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.4260355029585799,
|
|
"grad_norm": 6.652136325836182,
|
|
"learning_rate": 6.089309878213803e-06,
|
|
"loss": 1.4915,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.47337278106508873,
|
|
"grad_norm": 6.222568035125732,
|
|
"learning_rate": 6.765899864682003e-06,
|
|
"loss": 1.4063,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.5207100591715976,
|
|
"grad_norm": 6.365822792053223,
|
|
"learning_rate": 7.442489851150203e-06,
|
|
"loss": 1.3811,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.5680473372781065,
|
|
"grad_norm": 7.9343414306640625,
|
|
"learning_rate": 8.119079837618404e-06,
|
|
"loss": 1.3026,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.6153846153846154,
|
|
"grad_norm": 9.204723358154297,
|
|
"learning_rate": 8.795669824086604e-06,
|
|
"loss": 1.2516,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.6627218934911243,
|
|
"grad_norm": 7.836040496826172,
|
|
"learning_rate": 9.472259810554804e-06,
|
|
"loss": 1.1664,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.7100591715976331,
|
|
"grad_norm": 10.82960319519043,
|
|
"learning_rate": 1.0148849797023005e-05,
|
|
"loss": 1.182,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.757396449704142,
|
|
"grad_norm": 13.981669425964355,
|
|
"learning_rate": 1.0825439783491205e-05,
|
|
"loss": 1.0992,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.8047337278106509,
|
|
"grad_norm": 17.63448715209961,
|
|
"learning_rate": 1.1502029769959405e-05,
|
|
"loss": 1.036,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.8520710059171598,
|
|
"grad_norm": 13.67409610748291,
|
|
"learning_rate": 1.2178619756427606e-05,
|
|
"loss": 1.0372,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.8994082840236687,
|
|
"grad_norm": 10.486079216003418,
|
|
"learning_rate": 1.2855209742895804e-05,
|
|
"loss": 0.9888,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.9467455621301775,
|
|
"grad_norm": 10.388420104980469,
|
|
"learning_rate": 1.3531799729364006e-05,
|
|
"loss": 0.98,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.9940828402366864,
|
|
"grad_norm": 11.530645370483398,
|
|
"learning_rate": 1.4208389715832207e-05,
|
|
"loss": 0.9665,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.9988165680473373,
|
|
"eval_accuracy": 0.6835616438356165,
|
|
"eval_loss": 0.8002648949623108,
|
|
"eval_runtime": 6.3464,
|
|
"eval_samples_per_second": 230.051,
|
|
"eval_steps_per_second": 28.835,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 1.0414201183431953,
|
|
"grad_norm": 14.492610931396484,
|
|
"learning_rate": 1.4884979702300405e-05,
|
|
"loss": 0.9431,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 1.0887573964497042,
|
|
"grad_norm": 11.852544784545898,
|
|
"learning_rate": 1.5561569688768607e-05,
|
|
"loss": 0.8959,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 1.136094674556213,
|
|
"grad_norm": 11.708285331726074,
|
|
"learning_rate": 1.6238159675236808e-05,
|
|
"loss": 0.9688,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.183431952662722,
|
|
"grad_norm": 14.45132827758789,
|
|
"learning_rate": 1.6914749661705008e-05,
|
|
"loss": 0.8888,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 1.2307692307692308,
|
|
"grad_norm": 14.281059265136719,
|
|
"learning_rate": 1.759133964817321e-05,
|
|
"loss": 0.846,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.2781065088757395,
|
|
"grad_norm": 14.860888481140137,
|
|
"learning_rate": 1.826792963464141e-05,
|
|
"loss": 0.8906,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 1.3254437869822486,
|
|
"grad_norm": 10.605212211608887,
|
|
"learning_rate": 1.894451962110961e-05,
|
|
"loss": 0.8169,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.3727810650887573,
|
|
"grad_norm": 12.668191909790039,
|
|
"learning_rate": 1.962110960757781e-05,
|
|
"loss": 0.836,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 1.4201183431952662,
|
|
"grad_norm": 10.248248100280762,
|
|
"learning_rate": 2.029769959404601e-05,
|
|
"loss": 0.7644,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.467455621301775,
|
|
"grad_norm": 9.778542518615723,
|
|
"learning_rate": 2.097428958051421e-05,
|
|
"loss": 0.7755,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 1.514792899408284,
|
|
"grad_norm": 9.64427661895752,
|
|
"learning_rate": 2.165087956698241e-05,
|
|
"loss": 0.7387,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 1.5621301775147929,
|
|
"grad_norm": 10.04445743560791,
|
|
"learning_rate": 2.232746955345061e-05,
|
|
"loss": 0.7605,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 1.6094674556213018,
|
|
"grad_norm": 13.125927925109863,
|
|
"learning_rate": 2.300405953991881e-05,
|
|
"loss": 0.6781,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 1.6568047337278107,
|
|
"grad_norm": 13.797953605651855,
|
|
"learning_rate": 2.368064952638701e-05,
|
|
"loss": 0.6551,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 1.7041420118343196,
|
|
"grad_norm": 14.754645347595215,
|
|
"learning_rate": 2.435723951285521e-05,
|
|
"loss": 0.7542,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 1.7514792899408285,
|
|
"grad_norm": 13.914559364318848,
|
|
"learning_rate": 2.5033829499323412e-05,
|
|
"loss": 0.8104,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 1.7988165680473371,
|
|
"grad_norm": 11.46696662902832,
|
|
"learning_rate": 2.571041948579161e-05,
|
|
"loss": 0.6945,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 1.8461538461538463,
|
|
"grad_norm": 10.812294960021973,
|
|
"learning_rate": 2.638700947225981e-05,
|
|
"loss": 0.6711,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 1.893491124260355,
|
|
"grad_norm": 15.02450180053711,
|
|
"learning_rate": 2.7063599458728013e-05,
|
|
"loss": 0.7345,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 1.940828402366864,
|
|
"grad_norm": 11.53946590423584,
|
|
"learning_rate": 2.7740189445196213e-05,
|
|
"loss": 0.723,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 1.9881656804733727,
|
|
"grad_norm": 8.337069511413574,
|
|
"learning_rate": 2.8416779431664413e-05,
|
|
"loss": 0.6443,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 1.9976331360946746,
|
|
"eval_accuracy": 0.8246575342465754,
|
|
"eval_loss": 0.4562951624393463,
|
|
"eval_runtime": 6.3186,
|
|
"eval_samples_per_second": 231.065,
|
|
"eval_steps_per_second": 28.962,
|
|
"step": 422
|
|
},
|
|
{
|
|
"epoch": 2.035502958579882,
|
|
"grad_norm": 9.038360595703125,
|
|
"learning_rate": 2.9093369418132617e-05,
|
|
"loss": 0.6256,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 2.0828402366863905,
|
|
"grad_norm": 12.379063606262207,
|
|
"learning_rate": 2.976995940460081e-05,
|
|
"loss": 0.5998,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 2.1301775147928996,
|
|
"grad_norm": 12.626445770263672,
|
|
"learning_rate": 3.044654939106901e-05,
|
|
"loss": 0.6456,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 2.1775147928994083,
|
|
"grad_norm": 10.665410995483398,
|
|
"learning_rate": 3.1123139377537215e-05,
|
|
"loss": 0.6145,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 2.224852071005917,
|
|
"grad_norm": 11.917645454406738,
|
|
"learning_rate": 3.1799729364005415e-05,
|
|
"loss": 0.6494,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 2.272189349112426,
|
|
"grad_norm": 14.427268981933594,
|
|
"learning_rate": 3.2476319350473615e-05,
|
|
"loss": 0.5967,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 2.3195266272189348,
|
|
"grad_norm": 14.22167682647705,
|
|
"learning_rate": 3.3152909336941816e-05,
|
|
"loss": 0.6356,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 2.366863905325444,
|
|
"grad_norm": 15.034667015075684,
|
|
"learning_rate": 3.3829499323410016e-05,
|
|
"loss": 0.6583,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 2.4142011834319526,
|
|
"grad_norm": 11.716626167297363,
|
|
"learning_rate": 3.4506089309878216e-05,
|
|
"loss": 0.5703,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 2.4615384615384617,
|
|
"grad_norm": 8.812618255615234,
|
|
"learning_rate": 3.518267929634642e-05,
|
|
"loss": 0.5495,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 2.5088757396449703,
|
|
"grad_norm": 19.886188507080078,
|
|
"learning_rate": 3.585926928281462e-05,
|
|
"loss": 0.7156,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 2.556213017751479,
|
|
"grad_norm": 10.014534950256348,
|
|
"learning_rate": 3.653585926928282e-05,
|
|
"loss": 0.7279,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 2.603550295857988,
|
|
"grad_norm": 8.2186861038208,
|
|
"learning_rate": 3.721244925575101e-05,
|
|
"loss": 0.6629,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 2.6508875739644973,
|
|
"grad_norm": 11.415748596191406,
|
|
"learning_rate": 3.788903924221922e-05,
|
|
"loss": 0.6031,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 2.698224852071006,
|
|
"grad_norm": 6.490344047546387,
|
|
"learning_rate": 3.856562922868742e-05,
|
|
"loss": 0.5158,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 2.7455621301775146,
|
|
"grad_norm": 10.63316822052002,
|
|
"learning_rate": 3.924221921515562e-05,
|
|
"loss": 0.6533,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 2.7928994082840237,
|
|
"grad_norm": 9.291253089904785,
|
|
"learning_rate": 3.991880920162382e-05,
|
|
"loss": 0.5501,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 2.8402366863905324,
|
|
"grad_norm": 10.60273551940918,
|
|
"learning_rate": 4.059539918809202e-05,
|
|
"loss": 0.5719,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 2.8875739644970415,
|
|
"grad_norm": 10.603645324707031,
|
|
"learning_rate": 4.127198917456021e-05,
|
|
"loss": 0.4905,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 2.93491124260355,
|
|
"grad_norm": 17.47416877746582,
|
|
"learning_rate": 4.194857916102842e-05,
|
|
"loss": 0.7037,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 2.9822485207100593,
|
|
"grad_norm": 9.434072494506836,
|
|
"learning_rate": 4.262516914749662e-05,
|
|
"loss": 0.5815,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 2.996449704142012,
|
|
"eval_accuracy": 0.8568493150684932,
|
|
"eval_loss": 0.3556749224662781,
|
|
"eval_runtime": 6.2074,
|
|
"eval_samples_per_second": 235.204,
|
|
"eval_steps_per_second": 29.481,
|
|
"step": 633
|
|
},
|
|
{
|
|
"epoch": 3.029585798816568,
|
|
"grad_norm": 13.81190299987793,
|
|
"learning_rate": 4.330175913396482e-05,
|
|
"loss": 0.5877,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 3.076923076923077,
|
|
"grad_norm": 8.872483253479004,
|
|
"learning_rate": 4.397834912043302e-05,
|
|
"loss": 0.55,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 3.1242603550295858,
|
|
"grad_norm": 11.748785972595215,
|
|
"learning_rate": 4.465493910690122e-05,
|
|
"loss": 0.6155,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 3.171597633136095,
|
|
"grad_norm": 13.621400833129883,
|
|
"learning_rate": 4.5331529093369415e-05,
|
|
"loss": 0.5907,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 3.2189349112426036,
|
|
"grad_norm": 10.422270774841309,
|
|
"learning_rate": 4.600811907983762e-05,
|
|
"loss": 0.6022,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 3.2662721893491122,
|
|
"grad_norm": 12.192015647888184,
|
|
"learning_rate": 4.668470906630582e-05,
|
|
"loss": 0.558,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 3.3136094674556213,
|
|
"grad_norm": 5.769958972930908,
|
|
"learning_rate": 4.736129905277402e-05,
|
|
"loss": 0.5257,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 3.36094674556213,
|
|
"grad_norm": 11.664800643920898,
|
|
"learning_rate": 4.803788903924222e-05,
|
|
"loss": 0.6242,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 3.408284023668639,
|
|
"grad_norm": 10.007041931152344,
|
|
"learning_rate": 4.871447902571042e-05,
|
|
"loss": 0.5789,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 3.455621301775148,
|
|
"grad_norm": 18.98644256591797,
|
|
"learning_rate": 4.9391069012178623e-05,
|
|
"loss": 0.4632,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 3.502958579881657,
|
|
"grad_norm": 9.949424743652344,
|
|
"learning_rate": 4.999247667770087e-05,
|
|
"loss": 0.5657,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 3.5502958579881656,
|
|
"grad_norm": 7.471621513366699,
|
|
"learning_rate": 4.99172434547096e-05,
|
|
"loss": 0.4076,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 3.5976331360946747,
|
|
"grad_norm": 9.102510452270508,
|
|
"learning_rate": 4.9842010231718327e-05,
|
|
"loss": 0.532,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 3.6449704142011834,
|
|
"grad_norm": 9.587445259094238,
|
|
"learning_rate": 4.976677700872706e-05,
|
|
"loss": 0.5685,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 3.6923076923076925,
|
|
"grad_norm": 10.277064323425293,
|
|
"learning_rate": 4.969154378573578e-05,
|
|
"loss": 0.5004,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 3.739644970414201,
|
|
"grad_norm": 15.665764808654785,
|
|
"learning_rate": 4.9616310562744514e-05,
|
|
"loss": 0.5571,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 3.78698224852071,
|
|
"grad_norm": 9.643716812133789,
|
|
"learning_rate": 4.954107733975324e-05,
|
|
"loss": 0.5235,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 3.834319526627219,
|
|
"grad_norm": 12.600419044494629,
|
|
"learning_rate": 4.9465844116761964e-05,
|
|
"loss": 0.5579,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 3.8816568047337277,
|
|
"grad_norm": 9.6210298538208,
|
|
"learning_rate": 4.939061089377069e-05,
|
|
"loss": 0.4711,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 3.9289940828402368,
|
|
"grad_norm": 10.485040664672852,
|
|
"learning_rate": 4.9315377670779414e-05,
|
|
"loss": 0.4848,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 3.9763313609467454,
|
|
"grad_norm": 7.453371524810791,
|
|
"learning_rate": 4.9240144447788145e-05,
|
|
"loss": 0.474,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 4.0,
|
|
"eval_accuracy": 0.8726027397260274,
|
|
"eval_loss": 0.35826006531715393,
|
|
"eval_runtime": 6.1388,
|
|
"eval_samples_per_second": 237.833,
|
|
"eval_steps_per_second": 29.811,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 4.023668639053255,
|
|
"grad_norm": 8.34096908569336,
|
|
"learning_rate": 4.916491122479687e-05,
|
|
"loss": 0.5093,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 4.071005917159764,
|
|
"grad_norm": 7.713958263397217,
|
|
"learning_rate": 4.90896780018056e-05,
|
|
"loss": 0.4608,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 4.118343195266272,
|
|
"grad_norm": 9.734159469604492,
|
|
"learning_rate": 4.9014444778814326e-05,
|
|
"loss": 0.4247,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 4.165680473372781,
|
|
"grad_norm": 7.637202739715576,
|
|
"learning_rate": 4.893921155582306e-05,
|
|
"loss": 0.554,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 4.21301775147929,
|
|
"grad_norm": 12.172405242919922,
|
|
"learning_rate": 4.886397833283178e-05,
|
|
"loss": 0.4859,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 4.260355029585799,
|
|
"grad_norm": 9.40637493133545,
|
|
"learning_rate": 4.878874510984051e-05,
|
|
"loss": 0.5068,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 4.3076923076923075,
|
|
"grad_norm": 5.2307209968566895,
|
|
"learning_rate": 4.871351188684923e-05,
|
|
"loss": 0.4318,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 4.355029585798817,
|
|
"grad_norm": 13.809428215026855,
|
|
"learning_rate": 4.8638278663857964e-05,
|
|
"loss": 0.5231,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 4.402366863905326,
|
|
"grad_norm": 9.841399192810059,
|
|
"learning_rate": 4.856304544086669e-05,
|
|
"loss": 0.4441,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 4.449704142011834,
|
|
"grad_norm": 7.034471035003662,
|
|
"learning_rate": 4.8487812217875414e-05,
|
|
"loss": 0.5421,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 4.497041420118343,
|
|
"grad_norm": 6.35905122756958,
|
|
"learning_rate": 4.8412578994884145e-05,
|
|
"loss": 0.5084,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 4.544378698224852,
|
|
"grad_norm": 8.407711029052734,
|
|
"learning_rate": 4.833734577189287e-05,
|
|
"loss": 0.4067,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 4.591715976331361,
|
|
"grad_norm": 7.5561113357543945,
|
|
"learning_rate": 4.8262112548901595e-05,
|
|
"loss": 0.4881,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 4.6390532544378695,
|
|
"grad_norm": 7.843471050262451,
|
|
"learning_rate": 4.818687932591032e-05,
|
|
"loss": 0.4175,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 4.686390532544379,
|
|
"grad_norm": 11.301685333251953,
|
|
"learning_rate": 4.811164610291905e-05,
|
|
"loss": 0.4423,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 4.733727810650888,
|
|
"grad_norm": 7.472105503082275,
|
|
"learning_rate": 4.8036412879927776e-05,
|
|
"loss": 0.4525,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 4.781065088757396,
|
|
"grad_norm": 9.092314720153809,
|
|
"learning_rate": 4.796117965693651e-05,
|
|
"loss": 0.5699,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 4.828402366863905,
|
|
"grad_norm": 12.238302230834961,
|
|
"learning_rate": 4.788594643394523e-05,
|
|
"loss": 0.4524,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 4.875739644970414,
|
|
"grad_norm": 5.100959777832031,
|
|
"learning_rate": 4.7810713210953964e-05,
|
|
"loss": 0.3866,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 4.923076923076923,
|
|
"grad_norm": 9.616569519042969,
|
|
"learning_rate": 4.773547998796269e-05,
|
|
"loss": 0.3577,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 4.970414201183432,
|
|
"grad_norm": 9.995213508605957,
|
|
"learning_rate": 4.7660246764971413e-05,
|
|
"loss": 0.5819,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 4.998816568047337,
|
|
"eval_accuracy": 0.8671232876712329,
|
|
"eval_loss": 0.34042322635650635,
|
|
"eval_runtime": 6.4475,
|
|
"eval_samples_per_second": 226.444,
|
|
"eval_steps_per_second": 28.383,
|
|
"step": 1056
|
|
},
|
|
{
|
|
"epoch": 5.017751479289941,
|
|
"grad_norm": 6.531469345092773,
|
|
"learning_rate": 4.758501354198014e-05,
|
|
"loss": 0.4182,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 5.06508875739645,
|
|
"grad_norm": 11.092623710632324,
|
|
"learning_rate": 4.750978031898887e-05,
|
|
"loss": 0.4458,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 5.112426035502959,
|
|
"grad_norm": 12.276275634765625,
|
|
"learning_rate": 4.7434547095997595e-05,
|
|
"loss": 0.5101,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 5.159763313609467,
|
|
"grad_norm": 10.82636833190918,
|
|
"learning_rate": 4.735931387300632e-05,
|
|
"loss": 0.4708,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 5.207100591715976,
|
|
"grad_norm": 9.973958015441895,
|
|
"learning_rate": 4.728408065001505e-05,
|
|
"loss": 0.5191,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 5.254437869822485,
|
|
"grad_norm": 9.460865020751953,
|
|
"learning_rate": 4.7208847427023776e-05,
|
|
"loss": 0.4285,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 5.3017751479289945,
|
|
"grad_norm": 15.347735404968262,
|
|
"learning_rate": 4.713361420403251e-05,
|
|
"loss": 0.4579,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 5.349112426035503,
|
|
"grad_norm": 14.214599609375,
|
|
"learning_rate": 4.7058380981041225e-05,
|
|
"loss": 0.4787,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 5.396449704142012,
|
|
"grad_norm": 9.042417526245117,
|
|
"learning_rate": 4.698314775804996e-05,
|
|
"loss": 0.4146,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 5.443786982248521,
|
|
"grad_norm": 8.627814292907715,
|
|
"learning_rate": 4.690791453505868e-05,
|
|
"loss": 0.394,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 5.491124260355029,
|
|
"grad_norm": 8.060114860534668,
|
|
"learning_rate": 4.683268131206741e-05,
|
|
"loss": 0.412,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 5.538461538461538,
|
|
"grad_norm": 8.569971084594727,
|
|
"learning_rate": 4.675744808907614e-05,
|
|
"loss": 0.443,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 5.585798816568047,
|
|
"grad_norm": 31.7719669342041,
|
|
"learning_rate": 4.668221486608487e-05,
|
|
"loss": 0.4424,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 5.633136094674557,
|
|
"grad_norm": 10.994864463806152,
|
|
"learning_rate": 4.6606981643093595e-05,
|
|
"loss": 0.4072,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 5.680473372781065,
|
|
"grad_norm": 12.489917755126953,
|
|
"learning_rate": 4.653174842010232e-05,
|
|
"loss": 0.4193,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 5.727810650887574,
|
|
"grad_norm": 6.0672760009765625,
|
|
"learning_rate": 4.6456515197111044e-05,
|
|
"loss": 0.463,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 5.775147928994083,
|
|
"grad_norm": 9.66230297088623,
|
|
"learning_rate": 4.6381281974119776e-05,
|
|
"loss": 0.3863,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 5.822485207100591,
|
|
"grad_norm": 12.802431106567383,
|
|
"learning_rate": 4.63060487511285e-05,
|
|
"loss": 0.4471,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 5.8698224852071,
|
|
"grad_norm": 10.842957496643066,
|
|
"learning_rate": 4.6230815528137225e-05,
|
|
"loss": 0.5186,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 5.9171597633136095,
|
|
"grad_norm": 8.612702369689941,
|
|
"learning_rate": 4.615558230514596e-05,
|
|
"loss": 0.4908,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 5.964497041420119,
|
|
"grad_norm": 8.768792152404785,
|
|
"learning_rate": 4.608034908215468e-05,
|
|
"loss": 0.4557,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 5.997633136094675,
|
|
"eval_accuracy": 0.8993150684931507,
|
|
"eval_loss": 0.2699526846408844,
|
|
"eval_runtime": 6.396,
|
|
"eval_samples_per_second": 228.266,
|
|
"eval_steps_per_second": 28.611,
|
|
"step": 1267
|
|
},
|
|
{
|
|
"epoch": 6.011834319526627,
|
|
"grad_norm": 6.778576374053955,
|
|
"learning_rate": 4.600511585916341e-05,
|
|
"loss": 0.4647,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 6.059171597633136,
|
|
"grad_norm": 5.115172863006592,
|
|
"learning_rate": 4.592988263617213e-05,
|
|
"loss": 0.4053,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 6.106508875739645,
|
|
"grad_norm": 7.163010120391846,
|
|
"learning_rate": 4.585464941318086e-05,
|
|
"loss": 0.4136,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 6.153846153846154,
|
|
"grad_norm": 5.242615699768066,
|
|
"learning_rate": 4.577941619018959e-05,
|
|
"loss": 0.4233,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 6.201183431952662,
|
|
"grad_norm": 7.148778915405273,
|
|
"learning_rate": 4.570418296719832e-05,
|
|
"loss": 0.3791,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 6.2485207100591715,
|
|
"grad_norm": 6.911210060119629,
|
|
"learning_rate": 4.5628949744207044e-05,
|
|
"loss": 0.3933,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 6.295857988165681,
|
|
"grad_norm": 7.753135681152344,
|
|
"learning_rate": 4.5553716521215776e-05,
|
|
"loss": 0.428,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 6.34319526627219,
|
|
"grad_norm": 5.933778762817383,
|
|
"learning_rate": 4.54784832982245e-05,
|
|
"loss": 0.4668,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 6.390532544378698,
|
|
"grad_norm": 7.8352556228637695,
|
|
"learning_rate": 4.5403250075233225e-05,
|
|
"loss": 0.3272,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 6.437869822485207,
|
|
"grad_norm": 11.419840812683105,
|
|
"learning_rate": 4.532801685224195e-05,
|
|
"loss": 0.3954,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 6.485207100591716,
|
|
"grad_norm": 9.681208610534668,
|
|
"learning_rate": 4.5252783629250675e-05,
|
|
"loss": 0.5153,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 6.5325443786982245,
|
|
"grad_norm": 6.971587657928467,
|
|
"learning_rate": 4.5177550406259406e-05,
|
|
"loss": 0.4247,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 6.579881656804734,
|
|
"grad_norm": 6.286644458770752,
|
|
"learning_rate": 4.510231718326813e-05,
|
|
"loss": 0.4618,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 6.627218934911243,
|
|
"grad_norm": 11.171966552734375,
|
|
"learning_rate": 4.502708396027686e-05,
|
|
"loss": 0.4352,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 6.674556213017752,
|
|
"grad_norm": 10.539188385009766,
|
|
"learning_rate": 4.495185073728559e-05,
|
|
"loss": 0.3841,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 6.72189349112426,
|
|
"grad_norm": 5.127812385559082,
|
|
"learning_rate": 4.487661751429432e-05,
|
|
"loss": 0.3388,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 6.769230769230769,
|
|
"grad_norm": 10.178089141845703,
|
|
"learning_rate": 4.480138429130304e-05,
|
|
"loss": 0.4024,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 6.816568047337278,
|
|
"grad_norm": 5.93577766418457,
|
|
"learning_rate": 4.472615106831177e-05,
|
|
"loss": 0.4173,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 6.8639053254437865,
|
|
"grad_norm": 5.2099609375,
|
|
"learning_rate": 4.4650917845320493e-05,
|
|
"loss": 0.3462,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 6.911242603550296,
|
|
"grad_norm": 7.551539897918701,
|
|
"learning_rate": 4.4575684622329225e-05,
|
|
"loss": 0.4034,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 6.958579881656805,
|
|
"grad_norm": 10.478506088256836,
|
|
"learning_rate": 4.450045139933795e-05,
|
|
"loss": 0.4021,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 6.9964497041420115,
|
|
"eval_accuracy": 0.8917808219178082,
|
|
"eval_loss": 0.3158508837223053,
|
|
"eval_runtime": 6.1877,
|
|
"eval_samples_per_second": 235.95,
|
|
"eval_steps_per_second": 29.575,
|
|
"step": 1478
|
|
},
|
|
{
|
|
"epoch": 7.005917159763314,
|
|
"grad_norm": 10.892561912536621,
|
|
"learning_rate": 4.4425218176346675e-05,
|
|
"loss": 0.3283,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 7.053254437869822,
|
|
"grad_norm": 8.013442993164062,
|
|
"learning_rate": 4.4349984953355406e-05,
|
|
"loss": 0.4517,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 7.100591715976331,
|
|
"grad_norm": 6.160177230834961,
|
|
"learning_rate": 4.427475173036413e-05,
|
|
"loss": 0.4099,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 7.14792899408284,
|
|
"grad_norm": 8.48135757446289,
|
|
"learning_rate": 4.4199518507372856e-05,
|
|
"loss": 0.4019,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 7.195266272189349,
|
|
"grad_norm": 10.302865982055664,
|
|
"learning_rate": 4.412428528438158e-05,
|
|
"loss": 0.3329,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 7.242603550295858,
|
|
"grad_norm": 10.503307342529297,
|
|
"learning_rate": 4.404905206139031e-05,
|
|
"loss": 0.394,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 7.289940828402367,
|
|
"grad_norm": 7.577216148376465,
|
|
"learning_rate": 4.397381883839904e-05,
|
|
"loss": 0.4075,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 7.337278106508876,
|
|
"grad_norm": 12.196857452392578,
|
|
"learning_rate": 4.389858561540777e-05,
|
|
"loss": 0.3919,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 7.384615384615385,
|
|
"grad_norm": 6.480340003967285,
|
|
"learning_rate": 4.382335239241649e-05,
|
|
"loss": 0.3562,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 7.431952662721893,
|
|
"grad_norm": 4.814269542694092,
|
|
"learning_rate": 4.3748119169425225e-05,
|
|
"loss": 0.3232,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 7.479289940828402,
|
|
"grad_norm": 8.813551902770996,
|
|
"learning_rate": 4.367288594643394e-05,
|
|
"loss": 0.3947,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 7.5266272189349115,
|
|
"grad_norm": 10.225379943847656,
|
|
"learning_rate": 4.3597652723442675e-05,
|
|
"loss": 0.4059,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 7.57396449704142,
|
|
"grad_norm": 9.415613174438477,
|
|
"learning_rate": 4.35224195004514e-05,
|
|
"loss": 0.3371,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 7.621301775147929,
|
|
"grad_norm": 6.129647731781006,
|
|
"learning_rate": 4.344718627746013e-05,
|
|
"loss": 0.3652,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 7.668639053254438,
|
|
"grad_norm": 9.19030475616455,
|
|
"learning_rate": 4.3371953054468856e-05,
|
|
"loss": 0.3562,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 7.715976331360947,
|
|
"grad_norm": 12.973560333251953,
|
|
"learning_rate": 4.329671983147758e-05,
|
|
"loss": 0.3804,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 7.763313609467455,
|
|
"grad_norm": 7.263617515563965,
|
|
"learning_rate": 4.322148660848631e-05,
|
|
"loss": 0.3808,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 7.810650887573964,
|
|
"grad_norm": 6.532052516937256,
|
|
"learning_rate": 4.314625338549504e-05,
|
|
"loss": 0.468,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 7.8579881656804735,
|
|
"grad_norm": 8.766283988952637,
|
|
"learning_rate": 4.307102016250376e-05,
|
|
"loss": 0.4145,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 7.905325443786982,
|
|
"grad_norm": 5.956889629364014,
|
|
"learning_rate": 4.2995786939512487e-05,
|
|
"loss": 0.4047,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 7.952662721893491,
|
|
"grad_norm": 6.531178951263428,
|
|
"learning_rate": 4.292055371652122e-05,
|
|
"loss": 0.3396,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 8.0,
|
|
"grad_norm": 8.662644386291504,
|
|
"learning_rate": 4.284532049352994e-05,
|
|
"loss": 0.3209,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 8.0,
|
|
"eval_accuracy": 0.8972602739726028,
|
|
"eval_loss": 0.3082219660282135,
|
|
"eval_runtime": 6.2922,
|
|
"eval_samples_per_second": 232.034,
|
|
"eval_steps_per_second": 29.084,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 8.04733727810651,
|
|
"grad_norm": 12.477700233459473,
|
|
"learning_rate": 4.2770087270538674e-05,
|
|
"loss": 0.3262,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 8.094674556213018,
|
|
"grad_norm": 6.367954730987549,
|
|
"learning_rate": 4.26948540475474e-05,
|
|
"loss": 0.3579,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 8.142011834319527,
|
|
"grad_norm": 7.339391708374023,
|
|
"learning_rate": 4.261962082455613e-05,
|
|
"loss": 0.3993,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 8.189349112426035,
|
|
"grad_norm": 7.060799598693848,
|
|
"learning_rate": 4.2544387601564856e-05,
|
|
"loss": 0.3702,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 8.236686390532544,
|
|
"grad_norm": 7.423877239227295,
|
|
"learning_rate": 4.246915437857358e-05,
|
|
"loss": 0.4548,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 8.284023668639053,
|
|
"grad_norm": 7.742123603820801,
|
|
"learning_rate": 4.2393921155582305e-05,
|
|
"loss": 0.3914,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 8.331360946745562,
|
|
"grad_norm": 3.941162109375,
|
|
"learning_rate": 4.231868793259104e-05,
|
|
"loss": 0.3953,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 8.378698224852071,
|
|
"grad_norm": 7.15812349319458,
|
|
"learning_rate": 4.224345470959976e-05,
|
|
"loss": 0.3871,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 8.42603550295858,
|
|
"grad_norm": 11.954395294189453,
|
|
"learning_rate": 4.2168221486608486e-05,
|
|
"loss": 0.3919,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 8.47337278106509,
|
|
"grad_norm": 7.049565315246582,
|
|
"learning_rate": 4.209298826361722e-05,
|
|
"loss": 0.3539,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 8.520710059171599,
|
|
"grad_norm": 8.527347564697266,
|
|
"learning_rate": 4.201775504062594e-05,
|
|
"loss": 0.3883,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 8.568047337278106,
|
|
"grad_norm": 9.178783416748047,
|
|
"learning_rate": 4.194252181763467e-05,
|
|
"loss": 0.4226,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 8.615384615384615,
|
|
"grad_norm": 10.065650939941406,
|
|
"learning_rate": 4.186728859464339e-05,
|
|
"loss": 0.3773,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 8.662721893491124,
|
|
"grad_norm": 5.588104724884033,
|
|
"learning_rate": 4.1792055371652124e-05,
|
|
"loss": 0.3921,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 8.710059171597633,
|
|
"grad_norm": 4.505855083465576,
|
|
"learning_rate": 4.171682214866085e-05,
|
|
"loss": 0.3483,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 8.757396449704142,
|
|
"grad_norm": 10.081398963928223,
|
|
"learning_rate": 4.164158892566958e-05,
|
|
"loss": 0.3312,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 8.804733727810651,
|
|
"grad_norm": 7.667760848999023,
|
|
"learning_rate": 4.1566355702678305e-05,
|
|
"loss": 0.2838,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 8.85207100591716,
|
|
"grad_norm": 11.876665115356445,
|
|
"learning_rate": 4.149112247968704e-05,
|
|
"loss": 0.4328,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 8.899408284023668,
|
|
"grad_norm": 7.79551887512207,
|
|
"learning_rate": 4.141588925669576e-05,
|
|
"loss": 0.4616,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 8.946745562130177,
|
|
"grad_norm": 6.006857395172119,
|
|
"learning_rate": 4.1340656033704486e-05,
|
|
"loss": 0.3389,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 8.994082840236686,
|
|
"grad_norm": 9.194988250732422,
|
|
"learning_rate": 4.126542281071321e-05,
|
|
"loss": 0.3479,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 8.998816568047337,
|
|
"eval_accuracy": 0.9027397260273973,
|
|
"eval_loss": 0.28129294514656067,
|
|
"eval_runtime": 6.217,
|
|
"eval_samples_per_second": 234.84,
|
|
"eval_steps_per_second": 29.435,
|
|
"step": 1901
|
|
},
|
|
{
|
|
"epoch": 9.041420118343195,
|
|
"grad_norm": 7.2297163009643555,
|
|
"learning_rate": 4.1190189587721936e-05,
|
|
"loss": 0.3223,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 9.088757396449704,
|
|
"grad_norm": 9.67817211151123,
|
|
"learning_rate": 4.111495636473067e-05,
|
|
"loss": 0.3681,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 9.136094674556213,
|
|
"grad_norm": 6.748856544494629,
|
|
"learning_rate": 4.103972314173939e-05,
|
|
"loss": 0.351,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 9.183431952662723,
|
|
"grad_norm": 3.9139935970306396,
|
|
"learning_rate": 4.0964489918748124e-05,
|
|
"loss": 0.39,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 9.23076923076923,
|
|
"grad_norm": 5.222900390625,
|
|
"learning_rate": 4.088925669575685e-05,
|
|
"loss": 0.3132,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 9.278106508875739,
|
|
"grad_norm": 11.637986183166504,
|
|
"learning_rate": 4.081402347276558e-05,
|
|
"loss": 0.3373,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 9.325443786982248,
|
|
"grad_norm": 10.712813377380371,
|
|
"learning_rate": 4.07387902497743e-05,
|
|
"loss": 0.3424,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 9.372781065088757,
|
|
"grad_norm": 7.3563947677612305,
|
|
"learning_rate": 4.066355702678303e-05,
|
|
"loss": 0.3709,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 9.420118343195266,
|
|
"grad_norm": 8.500737190246582,
|
|
"learning_rate": 4.0588323803791755e-05,
|
|
"loss": 0.3398,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 9.467455621301776,
|
|
"grad_norm": 10.802979469299316,
|
|
"learning_rate": 4.0513090580800486e-05,
|
|
"loss": 0.308,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 9.514792899408285,
|
|
"grad_norm": 7.362417697906494,
|
|
"learning_rate": 4.043785735780921e-05,
|
|
"loss": 0.3193,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 9.562130177514792,
|
|
"grad_norm": 5.569155693054199,
|
|
"learning_rate": 4.0362624134817936e-05,
|
|
"loss": 0.3028,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 9.609467455621301,
|
|
"grad_norm": 8.995447158813477,
|
|
"learning_rate": 4.028739091182667e-05,
|
|
"loss": 0.4206,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 9.65680473372781,
|
|
"grad_norm": 5.864706993103027,
|
|
"learning_rate": 4.021215768883539e-05,
|
|
"loss": 0.2987,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 9.70414201183432,
|
|
"grad_norm": 8.34255313873291,
|
|
"learning_rate": 4.013692446584412e-05,
|
|
"loss": 0.4161,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 9.751479289940828,
|
|
"grad_norm": 8.392521858215332,
|
|
"learning_rate": 4.006169124285284e-05,
|
|
"loss": 0.4073,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 9.798816568047338,
|
|
"grad_norm": 6.388725280761719,
|
|
"learning_rate": 3.998645801986157e-05,
|
|
"loss": 0.3513,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 9.846153846153847,
|
|
"grad_norm": 5.696859836578369,
|
|
"learning_rate": 3.99112247968703e-05,
|
|
"loss": 0.3219,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 9.893491124260356,
|
|
"grad_norm": 8.325499534606934,
|
|
"learning_rate": 3.983599157387903e-05,
|
|
"loss": 0.394,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 9.940828402366863,
|
|
"grad_norm": 11.819910049438477,
|
|
"learning_rate": 3.9760758350887755e-05,
|
|
"loss": 0.4085,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 9.988165680473372,
|
|
"grad_norm": 6.419707298278809,
|
|
"learning_rate": 3.9685525127896486e-05,
|
|
"loss": 0.3429,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 9.997633136094674,
|
|
"eval_accuracy": 0.8924657534246575,
|
|
"eval_loss": 0.3318786323070526,
|
|
"eval_runtime": 6.1733,
|
|
"eval_samples_per_second": 236.501,
|
|
"eval_steps_per_second": 29.644,
|
|
"step": 2112
|
|
},
|
|
{
|
|
"epoch": 10.035502958579881,
|
|
"grad_norm": 4.879507064819336,
|
|
"learning_rate": 3.9610291904905204e-05,
|
|
"loss": 0.2607,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 10.08284023668639,
|
|
"grad_norm": 10.089688301086426,
|
|
"learning_rate": 3.9535058681913936e-05,
|
|
"loss": 0.3887,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 10.1301775147929,
|
|
"grad_norm": 6.6358819007873535,
|
|
"learning_rate": 3.945982545892266e-05,
|
|
"loss": 0.3926,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 10.177514792899409,
|
|
"grad_norm": 4.718569755554199,
|
|
"learning_rate": 3.938459223593139e-05,
|
|
"loss": 0.2977,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 10.224852071005918,
|
|
"grad_norm": 4.798628807067871,
|
|
"learning_rate": 3.930935901294012e-05,
|
|
"loss": 0.3167,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 10.272189349112425,
|
|
"grad_norm": 12.527241706848145,
|
|
"learning_rate": 3.923412578994884e-05,
|
|
"loss": 0.3498,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 10.319526627218934,
|
|
"grad_norm": 19.981807708740234,
|
|
"learning_rate": 3.915889256695757e-05,
|
|
"loss": 0.3791,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 10.366863905325443,
|
|
"grad_norm": 5.31036901473999,
|
|
"learning_rate": 3.90836593439663e-05,
|
|
"loss": 0.3635,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 10.414201183431953,
|
|
"grad_norm": 7.329598426818848,
|
|
"learning_rate": 3.900842612097502e-05,
|
|
"loss": 0.2612,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 10.461538461538462,
|
|
"grad_norm": 10.241847038269043,
|
|
"learning_rate": 3.893319289798375e-05,
|
|
"loss": 0.3508,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 10.50887573964497,
|
|
"grad_norm": 9.222640991210938,
|
|
"learning_rate": 3.885795967499248e-05,
|
|
"loss": 0.4113,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 10.55621301775148,
|
|
"grad_norm": 5.4523115158081055,
|
|
"learning_rate": 3.8782726452001204e-05,
|
|
"loss": 0.312,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 10.603550295857989,
|
|
"grad_norm": 25.376020431518555,
|
|
"learning_rate": 3.8707493229009936e-05,
|
|
"loss": 0.382,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 10.650887573964496,
|
|
"grad_norm": 7.494572162628174,
|
|
"learning_rate": 3.863226000601866e-05,
|
|
"loss": 0.3078,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 10.698224852071005,
|
|
"grad_norm": 9.24726390838623,
|
|
"learning_rate": 3.855702678302739e-05,
|
|
"loss": 0.3368,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 10.745562130177515,
|
|
"grad_norm": 7.74558162689209,
|
|
"learning_rate": 3.848179356003611e-05,
|
|
"loss": 0.2912,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 10.792899408284024,
|
|
"grad_norm": 7.557544708251953,
|
|
"learning_rate": 3.840656033704484e-05,
|
|
"loss": 0.3268,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 10.840236686390533,
|
|
"grad_norm": 9.215229988098145,
|
|
"learning_rate": 3.8331327114053566e-05,
|
|
"loss": 0.4372,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 10.887573964497042,
|
|
"grad_norm": 9.268451690673828,
|
|
"learning_rate": 3.82560938910623e-05,
|
|
"loss": 0.3564,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 10.934911242603551,
|
|
"grad_norm": 4.07456111907959,
|
|
"learning_rate": 3.818086066807102e-05,
|
|
"loss": 0.3003,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 10.982248520710058,
|
|
"grad_norm": 8.930679321289062,
|
|
"learning_rate": 3.810562744507975e-05,
|
|
"loss": 0.3341,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 10.996449704142012,
|
|
"eval_accuracy": 0.8972602739726028,
|
|
"eval_loss": 0.2900165021419525,
|
|
"eval_runtime": 6.2027,
|
|
"eval_samples_per_second": 235.379,
|
|
"eval_steps_per_second": 29.503,
|
|
"step": 2323
|
|
},
|
|
{
|
|
"epoch": 11.029585798816568,
|
|
"grad_norm": 7.869425296783447,
|
|
"learning_rate": 3.803039422208848e-05,
|
|
"loss": 0.3134,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 11.076923076923077,
|
|
"grad_norm": 8.941612243652344,
|
|
"learning_rate": 3.7955160999097204e-05,
|
|
"loss": 0.3465,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 11.124260355029586,
|
|
"grad_norm": 8.30190372467041,
|
|
"learning_rate": 3.787992777610593e-05,
|
|
"loss": 0.2489,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 11.171597633136095,
|
|
"grad_norm": 8.490402221679688,
|
|
"learning_rate": 3.7804694553114653e-05,
|
|
"loss": 0.326,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 11.218934911242604,
|
|
"grad_norm": 19.662193298339844,
|
|
"learning_rate": 3.7729461330123385e-05,
|
|
"loss": 0.3444,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 11.266272189349113,
|
|
"grad_norm": 9.445649147033691,
|
|
"learning_rate": 3.765422810713211e-05,
|
|
"loss": 0.3185,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 11.31360946745562,
|
|
"grad_norm": 4.701760292053223,
|
|
"learning_rate": 3.757899488414084e-05,
|
|
"loss": 0.3665,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 11.36094674556213,
|
|
"grad_norm": 5.095606327056885,
|
|
"learning_rate": 3.7503761661149566e-05,
|
|
"loss": 0.2736,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 11.408284023668639,
|
|
"grad_norm": 10.870713233947754,
|
|
"learning_rate": 3.74285284381583e-05,
|
|
"loss": 0.2966,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 11.455621301775148,
|
|
"grad_norm": 6.850511074066162,
|
|
"learning_rate": 3.7353295215167016e-05,
|
|
"loss": 0.2624,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 11.502958579881657,
|
|
"grad_norm": 10.627695083618164,
|
|
"learning_rate": 3.727806199217575e-05,
|
|
"loss": 0.3767,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 11.550295857988166,
|
|
"grad_norm": 8.704399108886719,
|
|
"learning_rate": 3.720282876918447e-05,
|
|
"loss": 0.3127,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 11.597633136094675,
|
|
"grad_norm": 7.4766716957092285,
|
|
"learning_rate": 3.71275955461932e-05,
|
|
"loss": 0.3015,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 11.644970414201183,
|
|
"grad_norm": 8.510762214660645,
|
|
"learning_rate": 3.705236232320193e-05,
|
|
"loss": 0.3406,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 11.692307692307692,
|
|
"grad_norm": 9.42719841003418,
|
|
"learning_rate": 3.697712910021065e-05,
|
|
"loss": 0.3085,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 11.7396449704142,
|
|
"grad_norm": 6.386455535888672,
|
|
"learning_rate": 3.6901895877219385e-05,
|
|
"loss": 0.3426,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 11.78698224852071,
|
|
"grad_norm": 7.612992286682129,
|
|
"learning_rate": 3.682666265422811e-05,
|
|
"loss": 0.3567,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 11.834319526627219,
|
|
"grad_norm": 8.440069198608398,
|
|
"learning_rate": 3.6751429431236835e-05,
|
|
"loss": 0.3288,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 11.881656804733728,
|
|
"grad_norm": 7.730615615844727,
|
|
"learning_rate": 3.667619620824556e-05,
|
|
"loss": 0.3253,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 11.928994082840237,
|
|
"grad_norm": 7.29069185256958,
|
|
"learning_rate": 3.660096298525429e-05,
|
|
"loss": 0.306,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 11.976331360946746,
|
|
"grad_norm": 8.983368873596191,
|
|
"learning_rate": 3.6525729762263016e-05,
|
|
"loss": 0.2937,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 12.0,
|
|
"eval_accuracy": 0.8993150684931507,
|
|
"eval_loss": 0.3500230312347412,
|
|
"eval_runtime": 6.3178,
|
|
"eval_samples_per_second": 231.093,
|
|
"eval_steps_per_second": 28.966,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"epoch": 12.023668639053254,
|
|
"grad_norm": 6.368637561798096,
|
|
"learning_rate": 3.645049653927175e-05,
|
|
"loss": 0.3998,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 12.071005917159763,
|
|
"grad_norm": 14.744524955749512,
|
|
"learning_rate": 3.637526331628047e-05,
|
|
"loss": 0.324,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 12.118343195266272,
|
|
"grad_norm": 4.304303169250488,
|
|
"learning_rate": 3.63000300932892e-05,
|
|
"loss": 0.3538,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 12.165680473372781,
|
|
"grad_norm": 11.705492973327637,
|
|
"learning_rate": 3.622479687029793e-05,
|
|
"loss": 0.3422,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 12.21301775147929,
|
|
"grad_norm": 9.357977867126465,
|
|
"learning_rate": 3.614956364730665e-05,
|
|
"loss": 0.2732,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 12.2603550295858,
|
|
"grad_norm": 12.46599006652832,
|
|
"learning_rate": 3.607433042431538e-05,
|
|
"loss": 0.4473,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 12.307692307692308,
|
|
"grad_norm": 20.074487686157227,
|
|
"learning_rate": 3.59990972013241e-05,
|
|
"loss": 0.2837,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 12.355029585798816,
|
|
"grad_norm": 4.281162738800049,
|
|
"learning_rate": 3.5923863978332834e-05,
|
|
"loss": 0.3035,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 12.402366863905325,
|
|
"grad_norm": 10.390352249145508,
|
|
"learning_rate": 3.584863075534156e-05,
|
|
"loss": 0.2636,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 12.449704142011834,
|
|
"grad_norm": 3.76784348487854,
|
|
"learning_rate": 3.577339753235029e-05,
|
|
"loss": 0.3388,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 12.497041420118343,
|
|
"grad_norm": 9.673295021057129,
|
|
"learning_rate": 3.5698164309359016e-05,
|
|
"loss": 0.2947,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 12.544378698224852,
|
|
"grad_norm": 6.6694722175598145,
|
|
"learning_rate": 3.562293108636774e-05,
|
|
"loss": 0.3453,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 12.591715976331361,
|
|
"grad_norm": 7.178610324859619,
|
|
"learning_rate": 3.5547697863376465e-05,
|
|
"loss": 0.3383,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 12.63905325443787,
|
|
"grad_norm": 10.715120315551758,
|
|
"learning_rate": 3.54724646403852e-05,
|
|
"loss": 0.3222,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"epoch": 12.68639053254438,
|
|
"grad_norm": 6.3047285079956055,
|
|
"learning_rate": 3.539723141739392e-05,
|
|
"loss": 0.3521,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 12.733727810650887,
|
|
"grad_norm": 6.073225021362305,
|
|
"learning_rate": 3.532199819440265e-05,
|
|
"loss": 0.2904,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"epoch": 12.781065088757396,
|
|
"grad_norm": 9.05847454071045,
|
|
"learning_rate": 3.524676497141138e-05,
|
|
"loss": 0.3764,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 12.828402366863905,
|
|
"grad_norm": 6.264795303344727,
|
|
"learning_rate": 3.51715317484201e-05,
|
|
"loss": 0.3159,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"epoch": 12.875739644970414,
|
|
"grad_norm": 7.125365257263184,
|
|
"learning_rate": 3.5096298525428834e-05,
|
|
"loss": 0.2996,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 12.923076923076923,
|
|
"grad_norm": 9.880492210388184,
|
|
"learning_rate": 3.502106530243755e-05,
|
|
"loss": 0.3283,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"epoch": 12.970414201183432,
|
|
"grad_norm": 14.802063941955566,
|
|
"learning_rate": 3.4945832079446284e-05,
|
|
"loss": 0.3478,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 12.998816568047337,
|
|
"eval_accuracy": 0.9013698630136986,
|
|
"eval_loss": 0.3168272078037262,
|
|
"eval_runtime": 6.174,
|
|
"eval_samples_per_second": 236.475,
|
|
"eval_steps_per_second": 29.64,
|
|
"step": 2746
|
|
},
|
|
{
|
|
"epoch": 13.017751479289942,
|
|
"grad_norm": 9.61425495147705,
|
|
"learning_rate": 3.487059885645501e-05,
|
|
"loss": 0.2884,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"epoch": 13.065088757396449,
|
|
"grad_norm": 7.737671375274658,
|
|
"learning_rate": 3.479536563346374e-05,
|
|
"loss": 0.3732,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 13.112426035502958,
|
|
"grad_norm": 7.558273792266846,
|
|
"learning_rate": 3.4720132410472465e-05,
|
|
"loss": 0.2859,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"epoch": 13.159763313609467,
|
|
"grad_norm": 7.560544013977051,
|
|
"learning_rate": 3.46448991874812e-05,
|
|
"loss": 0.2986,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 13.207100591715976,
|
|
"grad_norm": 7.7973480224609375,
|
|
"learning_rate": 3.456966596448992e-05,
|
|
"loss": 0.291,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"epoch": 13.254437869822485,
|
|
"grad_norm": 9.302266120910645,
|
|
"learning_rate": 3.449443274149865e-05,
|
|
"loss": 0.3669,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 13.301775147928995,
|
|
"grad_norm": 5.183737277984619,
|
|
"learning_rate": 3.441919951850737e-05,
|
|
"loss": 0.293,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"epoch": 13.349112426035504,
|
|
"grad_norm": 6.064436912536621,
|
|
"learning_rate": 3.43439662955161e-05,
|
|
"loss": 0.2944,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"epoch": 13.396449704142011,
|
|
"grad_norm": 9.409137725830078,
|
|
"learning_rate": 3.426873307252483e-05,
|
|
"loss": 0.3103,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"epoch": 13.44378698224852,
|
|
"grad_norm": 20.371089935302734,
|
|
"learning_rate": 3.419349984953356e-05,
|
|
"loss": 0.2879,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"epoch": 13.49112426035503,
|
|
"grad_norm": 9.97218132019043,
|
|
"learning_rate": 3.4118266626542284e-05,
|
|
"loss": 0.2959,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"epoch": 13.538461538461538,
|
|
"grad_norm": 7.915639400482178,
|
|
"learning_rate": 3.404303340355101e-05,
|
|
"loss": 0.2929,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"epoch": 13.585798816568047,
|
|
"grad_norm": 6.3162641525268555,
|
|
"learning_rate": 3.396780018055974e-05,
|
|
"loss": 0.2773,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"epoch": 13.633136094674557,
|
|
"grad_norm": 7.813812732696533,
|
|
"learning_rate": 3.389256695756846e-05,
|
|
"loss": 0.2795,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"epoch": 13.680473372781066,
|
|
"grad_norm": 13.80722427368164,
|
|
"learning_rate": 3.381733373457719e-05,
|
|
"loss": 0.3648,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"epoch": 13.727810650887575,
|
|
"grad_norm": 9.83273696899414,
|
|
"learning_rate": 3.3742100511585915e-05,
|
|
"loss": 0.2952,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 13.775147928994082,
|
|
"grad_norm": 10.903112411499023,
|
|
"learning_rate": 3.3666867288594646e-05,
|
|
"loss": 0.269,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"epoch": 13.822485207100591,
|
|
"grad_norm": 4.992847919464111,
|
|
"learning_rate": 3.359163406560337e-05,
|
|
"loss": 0.3689,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"epoch": 13.8698224852071,
|
|
"grad_norm": 7.029762268066406,
|
|
"learning_rate": 3.35164008426121e-05,
|
|
"loss": 0.3296,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"epoch": 13.91715976331361,
|
|
"grad_norm": 15.533370018005371,
|
|
"learning_rate": 3.344116761962083e-05,
|
|
"loss": 0.2764,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"epoch": 13.964497041420119,
|
|
"grad_norm": 14.553123474121094,
|
|
"learning_rate": 3.336593439662956e-05,
|
|
"loss": 0.3148,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"epoch": 13.997633136094674,
|
|
"eval_accuracy": 0.9054794520547945,
|
|
"eval_loss": 0.3071611225605011,
|
|
"eval_runtime": 6.0563,
|
|
"eval_samples_per_second": 241.071,
|
|
"eval_steps_per_second": 30.216,
|
|
"step": 2957
|
|
},
|
|
{
|
|
"epoch": 14.011834319526628,
|
|
"grad_norm": 6.9089035987854,
|
|
"learning_rate": 3.329070117363828e-05,
|
|
"loss": 0.3318,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"epoch": 14.059171597633137,
|
|
"grad_norm": 7.897435665130615,
|
|
"learning_rate": 3.321546795064701e-05,
|
|
"loss": 0.2537,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"epoch": 14.106508875739644,
|
|
"grad_norm": 12.082826614379883,
|
|
"learning_rate": 3.314023472765573e-05,
|
|
"loss": 0.2685,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"epoch": 14.153846153846153,
|
|
"grad_norm": 8.465901374816895,
|
|
"learning_rate": 3.306500150466446e-05,
|
|
"loss": 0.3849,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"epoch": 14.201183431952662,
|
|
"grad_norm": 9.606731414794922,
|
|
"learning_rate": 3.298976828167319e-05,
|
|
"loss": 0.3219,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 14.248520710059172,
|
|
"grad_norm": 5.763510704040527,
|
|
"learning_rate": 3.2914535058681914e-05,
|
|
"loss": 0.2798,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"epoch": 14.29585798816568,
|
|
"grad_norm": 7.898010730743408,
|
|
"learning_rate": 3.2839301835690646e-05,
|
|
"loss": 0.353,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"epoch": 14.34319526627219,
|
|
"grad_norm": 4.139184951782227,
|
|
"learning_rate": 3.276406861269937e-05,
|
|
"loss": 0.3145,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"epoch": 14.390532544378699,
|
|
"grad_norm": 10.472068786621094,
|
|
"learning_rate": 3.2688835389708096e-05,
|
|
"loss": 0.2997,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"epoch": 14.437869822485208,
|
|
"grad_norm": 6.952048301696777,
|
|
"learning_rate": 3.261360216671682e-05,
|
|
"loss": 0.2931,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"epoch": 14.485207100591715,
|
|
"grad_norm": 11.008207321166992,
|
|
"learning_rate": 3.253836894372555e-05,
|
|
"loss": 0.2891,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"epoch": 14.532544378698224,
|
|
"grad_norm": 4.314377784729004,
|
|
"learning_rate": 3.246313572073428e-05,
|
|
"loss": 0.2922,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"epoch": 14.579881656804734,
|
|
"grad_norm": 6.738071441650391,
|
|
"learning_rate": 3.238790249774301e-05,
|
|
"loss": 0.2226,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"epoch": 14.627218934911243,
|
|
"grad_norm": 5.609333038330078,
|
|
"learning_rate": 3.231266927475173e-05,
|
|
"loss": 0.2366,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"epoch": 14.674556213017752,
|
|
"grad_norm": 5.399454116821289,
|
|
"learning_rate": 3.223743605176046e-05,
|
|
"loss": 0.32,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 14.721893491124261,
|
|
"grad_norm": 13.962152481079102,
|
|
"learning_rate": 3.216220282876918e-05,
|
|
"loss": 0.3652,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"epoch": 14.76923076923077,
|
|
"grad_norm": 8.14931869506836,
|
|
"learning_rate": 3.2086969605777914e-05,
|
|
"loss": 0.2513,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"epoch": 14.816568047337277,
|
|
"grad_norm": 6.72014045715332,
|
|
"learning_rate": 3.201173638278664e-05,
|
|
"loss": 0.3068,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"epoch": 14.863905325443787,
|
|
"grad_norm": 9.025717735290527,
|
|
"learning_rate": 3.1936503159795364e-05,
|
|
"loss": 0.2845,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"epoch": 14.911242603550296,
|
|
"grad_norm": 3.6108787059783936,
|
|
"learning_rate": 3.1861269936804096e-05,
|
|
"loss": 0.2868,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"epoch": 14.958579881656805,
|
|
"grad_norm": 12.648404121398926,
|
|
"learning_rate": 3.178603671381282e-05,
|
|
"loss": 0.2896,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"epoch": 14.996449704142012,
|
|
"eval_accuracy": 0.9061643835616439,
|
|
"eval_loss": 0.30652791261672974,
|
|
"eval_runtime": 6.136,
|
|
"eval_samples_per_second": 237.938,
|
|
"eval_steps_per_second": 29.824,
|
|
"step": 3168
|
|
},
|
|
{
|
|
"epoch": 15.005917159763314,
|
|
"grad_norm": 5.476109027862549,
|
|
"learning_rate": 3.171080349082155e-05,
|
|
"loss": 0.3452,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"epoch": 15.053254437869823,
|
|
"grad_norm": 8.330878257751465,
|
|
"learning_rate": 3.163557026783028e-05,
|
|
"loss": 0.2456,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"epoch": 15.100591715976332,
|
|
"grad_norm": 8.56313705444336,
|
|
"learning_rate": 3.1560337044839e-05,
|
|
"loss": 0.2296,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"epoch": 15.14792899408284,
|
|
"grad_norm": 10.402885437011719,
|
|
"learning_rate": 3.1485103821847726e-05,
|
|
"loss": 0.2862,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 15.195266272189349,
|
|
"grad_norm": 7.497808933258057,
|
|
"learning_rate": 3.140987059885646e-05,
|
|
"loss": 0.3389,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"epoch": 15.242603550295858,
|
|
"grad_norm": 7.207127094268799,
|
|
"learning_rate": 3.133463737586518e-05,
|
|
"loss": 0.2575,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"epoch": 15.289940828402367,
|
|
"grad_norm": 4.729502201080322,
|
|
"learning_rate": 3.1259404152873914e-05,
|
|
"loss": 0.2308,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"epoch": 15.337278106508876,
|
|
"grad_norm": 10.251791954040527,
|
|
"learning_rate": 3.118417092988264e-05,
|
|
"loss": 0.2549,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"epoch": 15.384615384615385,
|
|
"grad_norm": 4.962519645690918,
|
|
"learning_rate": 3.1108937706891364e-05,
|
|
"loss": 0.2448,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"epoch": 15.431952662721894,
|
|
"grad_norm": 8.956313133239746,
|
|
"learning_rate": 3.103370448390009e-05,
|
|
"loss": 0.2278,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"epoch": 15.479289940828401,
|
|
"grad_norm": 5.445577144622803,
|
|
"learning_rate": 3.0958471260908813e-05,
|
|
"loss": 0.3195,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"epoch": 15.52662721893491,
|
|
"grad_norm": 8.691884994506836,
|
|
"learning_rate": 3.0883238037917545e-05,
|
|
"loss": 0.2816,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"epoch": 15.57396449704142,
|
|
"grad_norm": 4.890760898590088,
|
|
"learning_rate": 3.080800481492627e-05,
|
|
"loss": 0.2479,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"epoch": 15.621301775147929,
|
|
"grad_norm": 10.502642631530762,
|
|
"learning_rate": 3.0732771591935e-05,
|
|
"loss": 0.2368,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 15.668639053254438,
|
|
"grad_norm": 11.197770118713379,
|
|
"learning_rate": 3.0657538368943726e-05,
|
|
"loss": 0.396,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"epoch": 15.715976331360947,
|
|
"grad_norm": 7.301953315734863,
|
|
"learning_rate": 3.058230514595246e-05,
|
|
"loss": 0.2605,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"epoch": 15.763313609467456,
|
|
"grad_norm": 9.391778945922852,
|
|
"learning_rate": 3.0507071922961183e-05,
|
|
"loss": 0.2318,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"epoch": 15.810650887573965,
|
|
"grad_norm": 11.96308708190918,
|
|
"learning_rate": 3.0431838699969904e-05,
|
|
"loss": 0.3574,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"epoch": 15.857988165680473,
|
|
"grad_norm": 6.631661415100098,
|
|
"learning_rate": 3.0356605476978632e-05,
|
|
"loss": 0.2773,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"epoch": 15.905325443786982,
|
|
"grad_norm": 7.179072380065918,
|
|
"learning_rate": 3.028137225398736e-05,
|
|
"loss": 0.3573,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"epoch": 15.95266272189349,
|
|
"grad_norm": 9.855470657348633,
|
|
"learning_rate": 3.020613903099609e-05,
|
|
"loss": 0.3077,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"epoch": 16.0,
|
|
"grad_norm": 4.808469772338867,
|
|
"learning_rate": 3.0130905808004817e-05,
|
|
"loss": 0.3149,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"epoch": 16.0,
|
|
"eval_accuracy": 0.9082191780821918,
|
|
"eval_loss": 0.2928474545478821,
|
|
"eval_runtime": 6.1031,
|
|
"eval_samples_per_second": 239.221,
|
|
"eval_steps_per_second": 29.985,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"epoch": 16.047337278106507,
|
|
"grad_norm": 5.927903175354004,
|
|
"learning_rate": 3.0055672585013545e-05,
|
|
"loss": 0.2194,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"epoch": 16.09467455621302,
|
|
"grad_norm": 9.440893173217773,
|
|
"learning_rate": 2.9980439362022273e-05,
|
|
"loss": 0.2311,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 16.142011834319526,
|
|
"grad_norm": 10.132343292236328,
|
|
"learning_rate": 2.9905206139031e-05,
|
|
"loss": 0.2608,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"epoch": 16.189349112426036,
|
|
"grad_norm": 9.294024467468262,
|
|
"learning_rate": 2.9829972916039723e-05,
|
|
"loss": 0.3056,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"epoch": 16.236686390532544,
|
|
"grad_norm": 6.507917404174805,
|
|
"learning_rate": 2.975473969304845e-05,
|
|
"loss": 0.1905,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"epoch": 16.284023668639055,
|
|
"grad_norm": 8.411003112792969,
|
|
"learning_rate": 2.967950647005718e-05,
|
|
"loss": 0.3232,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"epoch": 16.331360946745562,
|
|
"grad_norm": 5.495641708374023,
|
|
"learning_rate": 2.9604273247065907e-05,
|
|
"loss": 0.2718,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"epoch": 16.37869822485207,
|
|
"grad_norm": 9.734967231750488,
|
|
"learning_rate": 2.9529040024074632e-05,
|
|
"loss": 0.2966,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"epoch": 16.42603550295858,
|
|
"grad_norm": 3.004697799682617,
|
|
"learning_rate": 2.945380680108336e-05,
|
|
"loss": 0.2234,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"epoch": 16.473372781065088,
|
|
"grad_norm": 13.730050086975098,
|
|
"learning_rate": 2.937857357809209e-05,
|
|
"loss": 0.3209,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"epoch": 16.5207100591716,
|
|
"grad_norm": 5.133395195007324,
|
|
"learning_rate": 2.930334035510081e-05,
|
|
"loss": 0.2561,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"epoch": 16.568047337278106,
|
|
"grad_norm": 5.885538101196289,
|
|
"learning_rate": 2.9228107132109538e-05,
|
|
"loss": 0.3166,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 16.615384615384617,
|
|
"grad_norm": 8.295323371887207,
|
|
"learning_rate": 2.9152873909118266e-05,
|
|
"loss": 0.2634,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"epoch": 16.662721893491124,
|
|
"grad_norm": 8.664441108703613,
|
|
"learning_rate": 2.9077640686126994e-05,
|
|
"loss": 0.2488,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"epoch": 16.71005917159763,
|
|
"grad_norm": 13.536978721618652,
|
|
"learning_rate": 2.9002407463135723e-05,
|
|
"loss": 0.2616,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"epoch": 16.757396449704142,
|
|
"grad_norm": 8.778542518615723,
|
|
"learning_rate": 2.892717424014445e-05,
|
|
"loss": 0.3111,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"epoch": 16.80473372781065,
|
|
"grad_norm": 4.70704460144043,
|
|
"learning_rate": 2.885194101715318e-05,
|
|
"loss": 0.2381,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"epoch": 16.85207100591716,
|
|
"grad_norm": 13.269988059997559,
|
|
"learning_rate": 2.8776707794161907e-05,
|
|
"loss": 0.2824,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"epoch": 16.899408284023668,
|
|
"grad_norm": 3.4718408584594727,
|
|
"learning_rate": 2.870147457117063e-05,
|
|
"loss": 0.2517,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"epoch": 16.94674556213018,
|
|
"grad_norm": 4.911701679229736,
|
|
"learning_rate": 2.8626241348179357e-05,
|
|
"loss": 0.2842,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"epoch": 16.994082840236686,
|
|
"grad_norm": 14.3350248336792,
|
|
"learning_rate": 2.8551008125188085e-05,
|
|
"loss": 0.2734,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"epoch": 16.99881656804734,
|
|
"eval_accuracy": 0.9095890410958904,
|
|
"eval_loss": 0.2769572138786316,
|
|
"eval_runtime": 6.3128,
|
|
"eval_samples_per_second": 231.275,
|
|
"eval_steps_per_second": 28.989,
|
|
"step": 3591
|
|
},
|
|
{
|
|
"epoch": 17.041420118343197,
|
|
"grad_norm": 9.578266143798828,
|
|
"learning_rate": 2.847577490219681e-05,
|
|
"loss": 0.2216,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 17.088757396449704,
|
|
"grad_norm": 10.65328311920166,
|
|
"learning_rate": 2.8400541679205538e-05,
|
|
"loss": 0.2713,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"epoch": 17.13609467455621,
|
|
"grad_norm": 13.547807693481445,
|
|
"learning_rate": 2.8325308456214266e-05,
|
|
"loss": 0.2578,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"epoch": 17.183431952662723,
|
|
"grad_norm": 5.553393363952637,
|
|
"learning_rate": 2.8250075233222994e-05,
|
|
"loss": 0.3016,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"epoch": 17.23076923076923,
|
|
"grad_norm": 9.82513427734375,
|
|
"learning_rate": 2.8174842010231723e-05,
|
|
"loss": 0.281,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"epoch": 17.27810650887574,
|
|
"grad_norm": 3.8038620948791504,
|
|
"learning_rate": 2.8099608787240444e-05,
|
|
"loss": 0.2876,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"epoch": 17.325443786982248,
|
|
"grad_norm": 4.463418006896973,
|
|
"learning_rate": 2.8024375564249172e-05,
|
|
"loss": 0.2434,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"epoch": 17.37278106508876,
|
|
"grad_norm": 4.446181297302246,
|
|
"learning_rate": 2.79491423412579e-05,
|
|
"loss": 0.2434,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"epoch": 17.420118343195266,
|
|
"grad_norm": 12.428364753723145,
|
|
"learning_rate": 2.787390911826663e-05,
|
|
"loss": 0.2706,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"epoch": 17.467455621301774,
|
|
"grad_norm": 9.818281173706055,
|
|
"learning_rate": 2.7798675895275357e-05,
|
|
"loss": 0.232,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"epoch": 17.514792899408285,
|
|
"grad_norm": 19.56150245666504,
|
|
"learning_rate": 2.7723442672284085e-05,
|
|
"loss": 0.2981,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"epoch": 17.562130177514792,
|
|
"grad_norm": 8.730667114257812,
|
|
"learning_rate": 2.764820944929281e-05,
|
|
"loss": 0.2427,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"epoch": 17.609467455621303,
|
|
"grad_norm": 11.973594665527344,
|
|
"learning_rate": 2.7572976226301534e-05,
|
|
"loss": 0.2359,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"epoch": 17.65680473372781,
|
|
"grad_norm": 2.578996419906616,
|
|
"learning_rate": 2.7497743003310263e-05,
|
|
"loss": 0.2783,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"epoch": 17.70414201183432,
|
|
"grad_norm": 9.876580238342285,
|
|
"learning_rate": 2.7422509780318987e-05,
|
|
"loss": 0.2268,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"epoch": 17.75147928994083,
|
|
"grad_norm": 5.562457084655762,
|
|
"learning_rate": 2.7347276557327716e-05,
|
|
"loss": 0.2296,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"epoch": 17.798816568047336,
|
|
"grad_norm": 6.533483505249023,
|
|
"learning_rate": 2.7272043334336444e-05,
|
|
"loss": 0.2818,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"epoch": 17.846153846153847,
|
|
"grad_norm": 7.880773544311523,
|
|
"learning_rate": 2.7196810111345172e-05,
|
|
"loss": 0.2865,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"epoch": 17.893491124260354,
|
|
"grad_norm": 13.510115623474121,
|
|
"learning_rate": 2.71215768883539e-05,
|
|
"loss": 0.3133,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"epoch": 17.940828402366865,
|
|
"grad_norm": 6.314772605895996,
|
|
"learning_rate": 2.704634366536263e-05,
|
|
"loss": 0.2102,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"epoch": 17.988165680473372,
|
|
"grad_norm": 4.932859420776367,
|
|
"learning_rate": 2.697111044237135e-05,
|
|
"loss": 0.2344,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 17.997633136094674,
|
|
"eval_accuracy": 0.8952054794520548,
|
|
"eval_loss": 0.3737930953502655,
|
|
"eval_runtime": 6.2965,
|
|
"eval_samples_per_second": 231.875,
|
|
"eval_steps_per_second": 29.064,
|
|
"step": 3802
|
|
},
|
|
{
|
|
"epoch": 18.035502958579883,
|
|
"grad_norm": 8.163798332214355,
|
|
"learning_rate": 2.6895877219380078e-05,
|
|
"loss": 0.349,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"epoch": 18.08284023668639,
|
|
"grad_norm": 8.841765403747559,
|
|
"learning_rate": 2.6820643996388806e-05,
|
|
"loss": 0.2864,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"epoch": 18.130177514792898,
|
|
"grad_norm": 5.997651100158691,
|
|
"learning_rate": 2.6745410773397534e-05,
|
|
"loss": 0.2941,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"epoch": 18.17751479289941,
|
|
"grad_norm": 5.4760332107543945,
|
|
"learning_rate": 2.6670177550406263e-05,
|
|
"loss": 0.2216,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"epoch": 18.224852071005916,
|
|
"grad_norm": 6.478240489959717,
|
|
"learning_rate": 2.6594944327414987e-05,
|
|
"loss": 0.2874,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"epoch": 18.272189349112427,
|
|
"grad_norm": 12.63205623626709,
|
|
"learning_rate": 2.6519711104423716e-05,
|
|
"loss": 0.2338,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"epoch": 18.319526627218934,
|
|
"grad_norm": 9.010831832885742,
|
|
"learning_rate": 2.6444477881432444e-05,
|
|
"loss": 0.3293,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"epoch": 18.366863905325445,
|
|
"grad_norm": 6.102337837219238,
|
|
"learning_rate": 2.6369244658441165e-05,
|
|
"loss": 0.3229,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"epoch": 18.414201183431953,
|
|
"grad_norm": 9.948938369750977,
|
|
"learning_rate": 2.6294011435449893e-05,
|
|
"loss": 0.2604,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"epoch": 18.46153846153846,
|
|
"grad_norm": 8.575167655944824,
|
|
"learning_rate": 2.621877821245862e-05,
|
|
"loss": 0.2205,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"epoch": 18.50887573964497,
|
|
"grad_norm": 7.808337688446045,
|
|
"learning_rate": 2.614354498946735e-05,
|
|
"loss": 0.1802,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"epoch": 18.556213017751478,
|
|
"grad_norm": 11.38652515411377,
|
|
"learning_rate": 2.6068311766476078e-05,
|
|
"loss": 0.2161,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"epoch": 18.60355029585799,
|
|
"grad_norm": 7.173455715179443,
|
|
"learning_rate": 2.5993078543484806e-05,
|
|
"loss": 0.2973,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"epoch": 18.650887573964496,
|
|
"grad_norm": 10.973929405212402,
|
|
"learning_rate": 2.5917845320493534e-05,
|
|
"loss": 0.2557,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"epoch": 18.698224852071007,
|
|
"grad_norm": 6.697062015533447,
|
|
"learning_rate": 2.5842612097502256e-05,
|
|
"loss": 0.2371,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"epoch": 18.745562130177515,
|
|
"grad_norm": 11.82797908782959,
|
|
"learning_rate": 2.5767378874510984e-05,
|
|
"loss": 0.2639,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"epoch": 18.792899408284022,
|
|
"grad_norm": 4.322720050811768,
|
|
"learning_rate": 2.5692145651519712e-05,
|
|
"loss": 0.2212,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"epoch": 18.840236686390533,
|
|
"grad_norm": 5.201810836791992,
|
|
"learning_rate": 2.561691242852844e-05,
|
|
"loss": 0.2003,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"epoch": 18.88757396449704,
|
|
"grad_norm": 7.236006736755371,
|
|
"learning_rate": 2.554167920553717e-05,
|
|
"loss": 0.3897,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"epoch": 18.93491124260355,
|
|
"grad_norm": 7.327210426330566,
|
|
"learning_rate": 2.5466445982545893e-05,
|
|
"loss": 0.1939,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 18.98224852071006,
|
|
"grad_norm": 16.192811965942383,
|
|
"learning_rate": 2.539121275955462e-05,
|
|
"loss": 0.2872,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"epoch": 18.996449704142012,
|
|
"eval_accuracy": 0.9061643835616439,
|
|
"eval_loss": 0.3222917914390564,
|
|
"eval_runtime": 6.2252,
|
|
"eval_samples_per_second": 234.532,
|
|
"eval_steps_per_second": 29.397,
|
|
"step": 4013
|
|
},
|
|
{
|
|
"epoch": 19.02958579881657,
|
|
"grad_norm": 14.001523971557617,
|
|
"learning_rate": 2.531597953656335e-05,
|
|
"loss": 0.2899,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"epoch": 19.076923076923077,
|
|
"grad_norm": 12.866436004638672,
|
|
"learning_rate": 2.524074631357207e-05,
|
|
"loss": 0.2249,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"epoch": 19.124260355029588,
|
|
"grad_norm": 12.653215408325195,
|
|
"learning_rate": 2.51655130905808e-05,
|
|
"loss": 0.2008,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"epoch": 19.171597633136095,
|
|
"grad_norm": 6.0526604652404785,
|
|
"learning_rate": 2.5090279867589527e-05,
|
|
"loss": 0.2264,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"epoch": 19.218934911242602,
|
|
"grad_norm": 7.189617156982422,
|
|
"learning_rate": 2.5015046644598256e-05,
|
|
"loss": 0.2785,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"epoch": 19.266272189349113,
|
|
"grad_norm": 6.08707332611084,
|
|
"learning_rate": 2.4939813421606984e-05,
|
|
"loss": 0.2666,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"epoch": 19.31360946745562,
|
|
"grad_norm": 8.810041427612305,
|
|
"learning_rate": 2.4864580198615712e-05,
|
|
"loss": 0.2561,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"epoch": 19.36094674556213,
|
|
"grad_norm": 5.877760410308838,
|
|
"learning_rate": 2.4789346975624437e-05,
|
|
"loss": 0.1829,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"epoch": 19.40828402366864,
|
|
"grad_norm": 4.540722846984863,
|
|
"learning_rate": 2.4714113752633165e-05,
|
|
"loss": 0.2082,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 19.45562130177515,
|
|
"grad_norm": 10.91895866394043,
|
|
"learning_rate": 2.4638880529641893e-05,
|
|
"loss": 0.2264,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"epoch": 19.502958579881657,
|
|
"grad_norm": 18.722084045410156,
|
|
"learning_rate": 2.4563647306650618e-05,
|
|
"loss": 0.2649,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"epoch": 19.550295857988164,
|
|
"grad_norm": 5.907430648803711,
|
|
"learning_rate": 2.4488414083659346e-05,
|
|
"loss": 0.1769,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"epoch": 19.597633136094675,
|
|
"grad_norm": 12.51977825164795,
|
|
"learning_rate": 2.441318086066807e-05,
|
|
"loss": 0.2895,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"epoch": 19.644970414201183,
|
|
"grad_norm": 9.822182655334473,
|
|
"learning_rate": 2.43379476376768e-05,
|
|
"loss": 0.2349,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"epoch": 19.692307692307693,
|
|
"grad_norm": 6.536006450653076,
|
|
"learning_rate": 2.4262714414685524e-05,
|
|
"loss": 0.2949,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"epoch": 19.7396449704142,
|
|
"grad_norm": 6.116447448730469,
|
|
"learning_rate": 2.4187481191694252e-05,
|
|
"loss": 0.2438,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"epoch": 19.78698224852071,
|
|
"grad_norm": 8.528430938720703,
|
|
"learning_rate": 2.411224796870298e-05,
|
|
"loss": 0.274,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"epoch": 19.83431952662722,
|
|
"grad_norm": 9.427675247192383,
|
|
"learning_rate": 2.403701474571171e-05,
|
|
"loss": 0.2848,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"epoch": 19.881656804733726,
|
|
"grad_norm": 5.054657459259033,
|
|
"learning_rate": 2.3961781522720433e-05,
|
|
"loss": 0.24,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 19.928994082840237,
|
|
"grad_norm": 12.677891731262207,
|
|
"learning_rate": 2.388654829972916e-05,
|
|
"loss": 0.2593,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"epoch": 19.976331360946745,
|
|
"grad_norm": 5.84495735168457,
|
|
"learning_rate": 2.381131507673789e-05,
|
|
"loss": 0.2486,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"epoch": 20.0,
|
|
"eval_accuracy": 0.9068493150684932,
|
|
"eval_loss": 0.32860177755355835,
|
|
"eval_runtime": 6.1011,
|
|
"eval_samples_per_second": 239.302,
|
|
"eval_steps_per_second": 29.995,
|
|
"step": 4225
|
|
},
|
|
{
|
|
"epoch": 20.023668639053255,
|
|
"grad_norm": 6.881824970245361,
|
|
"learning_rate": 2.3736081853746618e-05,
|
|
"loss": 0.2215,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"epoch": 20.071005917159763,
|
|
"grad_norm": 10.07770824432373,
|
|
"learning_rate": 2.3660848630755343e-05,
|
|
"loss": 0.244,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"epoch": 20.118343195266274,
|
|
"grad_norm": 4.6197919845581055,
|
|
"learning_rate": 2.358561540776407e-05,
|
|
"loss": 0.2289,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"epoch": 20.16568047337278,
|
|
"grad_norm": 8.33582592010498,
|
|
"learning_rate": 2.35103821847728e-05,
|
|
"loss": 0.1889,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"epoch": 20.21301775147929,
|
|
"grad_norm": 8.195116996765137,
|
|
"learning_rate": 2.3435148961781524e-05,
|
|
"loss": 0.2015,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"epoch": 20.2603550295858,
|
|
"grad_norm": 6.473872661590576,
|
|
"learning_rate": 2.335991573879025e-05,
|
|
"loss": 0.2306,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"epoch": 20.307692307692307,
|
|
"grad_norm": 4.936031341552734,
|
|
"learning_rate": 2.3284682515798977e-05,
|
|
"loss": 0.2311,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"epoch": 20.355029585798817,
|
|
"grad_norm": 16.449352264404297,
|
|
"learning_rate": 2.3209449292807705e-05,
|
|
"loss": 0.2129,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"epoch": 20.402366863905325,
|
|
"grad_norm": 7.029664516448975,
|
|
"learning_rate": 2.3134216069816433e-05,
|
|
"loss": 0.2211,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"epoch": 20.449704142011836,
|
|
"grad_norm": 7.797490119934082,
|
|
"learning_rate": 2.3058982846825158e-05,
|
|
"loss": 0.2305,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"epoch": 20.497041420118343,
|
|
"grad_norm": 13.063493728637695,
|
|
"learning_rate": 2.2983749623833886e-05,
|
|
"loss": 0.2916,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"epoch": 20.54437869822485,
|
|
"grad_norm": 9.06458568572998,
|
|
"learning_rate": 2.2908516400842614e-05,
|
|
"loss": 0.2342,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"epoch": 20.59171597633136,
|
|
"grad_norm": 7.881487846374512,
|
|
"learning_rate": 2.283328317785134e-05,
|
|
"loss": 0.2041,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"epoch": 20.63905325443787,
|
|
"grad_norm": 10.349453926086426,
|
|
"learning_rate": 2.2758049954860067e-05,
|
|
"loss": 0.2949,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"epoch": 20.68639053254438,
|
|
"grad_norm": 12.278468132019043,
|
|
"learning_rate": 2.2682816731868795e-05,
|
|
"loss": 0.2607,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"epoch": 20.733727810650887,
|
|
"grad_norm": 11.949197769165039,
|
|
"learning_rate": 2.2607583508877524e-05,
|
|
"loss": 0.2741,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"epoch": 20.781065088757398,
|
|
"grad_norm": 13.006739616394043,
|
|
"learning_rate": 2.253235028588625e-05,
|
|
"loss": 0.2845,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"epoch": 20.828402366863905,
|
|
"grad_norm": 6.179040908813477,
|
|
"learning_rate": 2.2457117062894977e-05,
|
|
"loss": 0.2518,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"epoch": 20.875739644970416,
|
|
"grad_norm": 8.708568572998047,
|
|
"learning_rate": 2.23818838399037e-05,
|
|
"loss": 0.254,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"epoch": 20.923076923076923,
|
|
"grad_norm": 8.595051765441895,
|
|
"learning_rate": 2.230665061691243e-05,
|
|
"loss": 0.2462,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"epoch": 20.97041420118343,
|
|
"grad_norm": 8.650654792785645,
|
|
"learning_rate": 2.2231417393921154e-05,
|
|
"loss": 0.2818,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"epoch": 20.99881656804734,
|
|
"eval_accuracy": 0.8938356164383562,
|
|
"eval_loss": 0.3853361904621124,
|
|
"eval_runtime": 6.105,
|
|
"eval_samples_per_second": 239.147,
|
|
"eval_steps_per_second": 29.975,
|
|
"step": 4436
|
|
},
|
|
{
|
|
"epoch": 21.01775147928994,
|
|
"grad_norm": 7.857712268829346,
|
|
"learning_rate": 2.2156184170929883e-05,
|
|
"loss": 0.2664,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"epoch": 21.06508875739645,
|
|
"grad_norm": 7.22745943069458,
|
|
"learning_rate": 2.208095094793861e-05,
|
|
"loss": 0.2062,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"epoch": 21.11242603550296,
|
|
"grad_norm": 2.6673853397369385,
|
|
"learning_rate": 2.200571772494734e-05,
|
|
"loss": 0.2239,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"epoch": 21.159763313609467,
|
|
"grad_norm": 4.8849005699157715,
|
|
"learning_rate": 2.1930484501956064e-05,
|
|
"loss": 0.1985,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"epoch": 21.207100591715978,
|
|
"grad_norm": 22.471643447875977,
|
|
"learning_rate": 2.1855251278964792e-05,
|
|
"loss": 0.2331,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"epoch": 21.254437869822485,
|
|
"grad_norm": 12.047694206237793,
|
|
"learning_rate": 2.178001805597352e-05,
|
|
"loss": 0.193,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"epoch": 21.301775147928993,
|
|
"grad_norm": 8.459744453430176,
|
|
"learning_rate": 2.170478483298225e-05,
|
|
"loss": 0.2698,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 21.349112426035504,
|
|
"grad_norm": 5.106344699859619,
|
|
"learning_rate": 2.1629551609990973e-05,
|
|
"loss": 0.2626,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"epoch": 21.39644970414201,
|
|
"grad_norm": 8.469663619995117,
|
|
"learning_rate": 2.15543183869997e-05,
|
|
"loss": 0.208,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"epoch": 21.443786982248522,
|
|
"grad_norm": 4.838006496429443,
|
|
"learning_rate": 2.147908516400843e-05,
|
|
"loss": 0.2903,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"epoch": 21.49112426035503,
|
|
"grad_norm": 5.432097911834717,
|
|
"learning_rate": 2.1403851941017154e-05,
|
|
"loss": 0.2337,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"epoch": 21.53846153846154,
|
|
"grad_norm": 6.889484882354736,
|
|
"learning_rate": 2.132861871802588e-05,
|
|
"loss": 0.2269,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"epoch": 21.585798816568047,
|
|
"grad_norm": 8.73716926574707,
|
|
"learning_rate": 2.1253385495034607e-05,
|
|
"loss": 0.2631,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"epoch": 21.633136094674555,
|
|
"grad_norm": 2.4893133640289307,
|
|
"learning_rate": 2.1178152272043336e-05,
|
|
"loss": 0.2186,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"epoch": 21.680473372781066,
|
|
"grad_norm": 7.44368839263916,
|
|
"learning_rate": 2.110291904905206e-05,
|
|
"loss": 0.2052,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"epoch": 21.727810650887573,
|
|
"grad_norm": 12.204940795898438,
|
|
"learning_rate": 2.102768582606079e-05,
|
|
"loss": 0.2862,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"epoch": 21.775147928994084,
|
|
"grad_norm": 7.419914722442627,
|
|
"learning_rate": 2.0952452603069517e-05,
|
|
"loss": 0.2568,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 21.82248520710059,
|
|
"grad_norm": 7.833005905151367,
|
|
"learning_rate": 2.0877219380078245e-05,
|
|
"loss": 0.2727,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"epoch": 21.869822485207102,
|
|
"grad_norm": 3.8460819721221924,
|
|
"learning_rate": 2.080198615708697e-05,
|
|
"loss": 0.2306,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"epoch": 21.91715976331361,
|
|
"grad_norm": 12.018167495727539,
|
|
"learning_rate": 2.0726752934095698e-05,
|
|
"loss": 0.2729,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"epoch": 21.964497041420117,
|
|
"grad_norm": 7.023700714111328,
|
|
"learning_rate": 2.0651519711104426e-05,
|
|
"loss": 0.2845,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"epoch": 21.997633136094674,
|
|
"eval_accuracy": 0.8938356164383562,
|
|
"eval_loss": 0.3902602195739746,
|
|
"eval_runtime": 6.3485,
|
|
"eval_samples_per_second": 229.975,
|
|
"eval_steps_per_second": 28.826,
|
|
"step": 4647
|
|
},
|
|
{
|
|
"epoch": 22.011834319526628,
|
|
"grad_norm": 11.811697006225586,
|
|
"learning_rate": 2.0576286488113154e-05,
|
|
"loss": 0.3063,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"epoch": 22.059171597633135,
|
|
"grad_norm": 4.944943428039551,
|
|
"learning_rate": 2.050105326512188e-05,
|
|
"loss": 0.2157,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"epoch": 22.106508875739646,
|
|
"grad_norm": 5.4949517250061035,
|
|
"learning_rate": 2.0425820042130607e-05,
|
|
"loss": 0.2373,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"epoch": 22.153846153846153,
|
|
"grad_norm": 6.9762163162231445,
|
|
"learning_rate": 2.0350586819139332e-05,
|
|
"loss": 0.2378,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"epoch": 22.201183431952664,
|
|
"grad_norm": 6.753002643585205,
|
|
"learning_rate": 2.027535359614806e-05,
|
|
"loss": 0.1939,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"epoch": 22.24852071005917,
|
|
"grad_norm": 4.161319732666016,
|
|
"learning_rate": 2.0200120373156785e-05,
|
|
"loss": 0.242,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"epoch": 22.29585798816568,
|
|
"grad_norm": 5.062042713165283,
|
|
"learning_rate": 2.0124887150165513e-05,
|
|
"loss": 0.2782,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"epoch": 22.34319526627219,
|
|
"grad_norm": 9.755287170410156,
|
|
"learning_rate": 2.004965392717424e-05,
|
|
"loss": 0.2009,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"epoch": 22.390532544378697,
|
|
"grad_norm": 6.668210506439209,
|
|
"learning_rate": 1.997442070418297e-05,
|
|
"loss": 0.1617,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"epoch": 22.437869822485208,
|
|
"grad_norm": 3.9158642292022705,
|
|
"learning_rate": 1.9899187481191694e-05,
|
|
"loss": 0.2013,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"epoch": 22.485207100591715,
|
|
"grad_norm": 7.47080659866333,
|
|
"learning_rate": 1.9823954258200423e-05,
|
|
"loss": 0.2057,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"epoch": 22.532544378698226,
|
|
"grad_norm": 17.479690551757812,
|
|
"learning_rate": 1.974872103520915e-05,
|
|
"loss": 0.2964,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"epoch": 22.579881656804734,
|
|
"grad_norm": 9.807324409484863,
|
|
"learning_rate": 1.9673487812217876e-05,
|
|
"loss": 0.2519,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"epoch": 22.62721893491124,
|
|
"grad_norm": 8.961894035339355,
|
|
"learning_rate": 1.9598254589226604e-05,
|
|
"loss": 0.2724,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"epoch": 22.674556213017752,
|
|
"grad_norm": 3.2384064197540283,
|
|
"learning_rate": 1.9523021366235332e-05,
|
|
"loss": 0.187,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"epoch": 22.72189349112426,
|
|
"grad_norm": 5.056863307952881,
|
|
"learning_rate": 1.944778814324406e-05,
|
|
"loss": 0.2512,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"epoch": 22.76923076923077,
|
|
"grad_norm": 9.88666820526123,
|
|
"learning_rate": 1.9372554920252785e-05,
|
|
"loss": 0.2003,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"epoch": 22.816568047337277,
|
|
"grad_norm": 15.032508850097656,
|
|
"learning_rate": 1.929732169726151e-05,
|
|
"loss": 0.2665,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"epoch": 22.86390532544379,
|
|
"grad_norm": 6.520040035247803,
|
|
"learning_rate": 1.9222088474270238e-05,
|
|
"loss": 0.2592,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"epoch": 22.911242603550296,
|
|
"grad_norm": 5.046426296234131,
|
|
"learning_rate": 1.9146855251278966e-05,
|
|
"loss": 0.2219,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"epoch": 22.958579881656803,
|
|
"grad_norm": 11.43876838684082,
|
|
"learning_rate": 1.907162202828769e-05,
|
|
"loss": 0.227,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"epoch": 22.996449704142012,
|
|
"eval_accuracy": 0.9041095890410958,
|
|
"eval_loss": 0.3559742569923401,
|
|
"eval_runtime": 6.1037,
|
|
"eval_samples_per_second": 239.198,
|
|
"eval_steps_per_second": 29.982,
|
|
"step": 4858
|
|
},
|
|
{
|
|
"epoch": 23.005917159763314,
|
|
"grad_norm": 5.062148571014404,
|
|
"learning_rate": 1.899638880529642e-05,
|
|
"loss": 0.1716,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"epoch": 23.05325443786982,
|
|
"grad_norm": 13.177910804748535,
|
|
"learning_rate": 1.8921155582305147e-05,
|
|
"loss": 0.2274,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"epoch": 23.100591715976332,
|
|
"grad_norm": 10.63724136352539,
|
|
"learning_rate": 1.8845922359313875e-05,
|
|
"loss": 0.2376,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"epoch": 23.14792899408284,
|
|
"grad_norm": 11.315512657165527,
|
|
"learning_rate": 1.87706891363226e-05,
|
|
"loss": 0.2451,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"epoch": 23.19526627218935,
|
|
"grad_norm": 9.915947914123535,
|
|
"learning_rate": 1.869545591333133e-05,
|
|
"loss": 0.265,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"epoch": 23.242603550295858,
|
|
"grad_norm": 7.371302604675293,
|
|
"learning_rate": 1.8620222690340057e-05,
|
|
"loss": 0.203,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"epoch": 23.28994082840237,
|
|
"grad_norm": 10.347346305847168,
|
|
"learning_rate": 1.8544989467348785e-05,
|
|
"loss": 0.2058,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"epoch": 23.337278106508876,
|
|
"grad_norm": 7.930377006530762,
|
|
"learning_rate": 1.846975624435751e-05,
|
|
"loss": 0.1881,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"epoch": 23.384615384615383,
|
|
"grad_norm": 7.690789699554443,
|
|
"learning_rate": 1.8394523021366238e-05,
|
|
"loss": 0.2058,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"epoch": 23.431952662721894,
|
|
"grad_norm": 9.262539863586426,
|
|
"learning_rate": 1.8319289798374963e-05,
|
|
"loss": 0.2432,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"epoch": 23.4792899408284,
|
|
"grad_norm": 6.507819652557373,
|
|
"learning_rate": 1.8244056575383687e-05,
|
|
"loss": 0.1951,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"epoch": 23.526627218934912,
|
|
"grad_norm": 5.187134742736816,
|
|
"learning_rate": 1.8168823352392416e-05,
|
|
"loss": 0.1948,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"epoch": 23.57396449704142,
|
|
"grad_norm": 5.986237525939941,
|
|
"learning_rate": 1.8093590129401144e-05,
|
|
"loss": 0.1896,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"epoch": 23.62130177514793,
|
|
"grad_norm": 3.2465999126434326,
|
|
"learning_rate": 1.8018356906409872e-05,
|
|
"loss": 0.2521,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"epoch": 23.668639053254438,
|
|
"grad_norm": 6.972270488739014,
|
|
"learning_rate": 1.7943123683418597e-05,
|
|
"loss": 0.2162,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 23.715976331360945,
|
|
"grad_norm": 10.68996524810791,
|
|
"learning_rate": 1.7867890460427325e-05,
|
|
"loss": 0.228,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"epoch": 23.763313609467456,
|
|
"grad_norm": 13.406333923339844,
|
|
"learning_rate": 1.7792657237436053e-05,
|
|
"loss": 0.2184,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"epoch": 23.810650887573964,
|
|
"grad_norm": 10.20108699798584,
|
|
"learning_rate": 1.771742401444478e-05,
|
|
"loss": 0.2286,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"epoch": 23.857988165680474,
|
|
"grad_norm": 4.646299839019775,
|
|
"learning_rate": 1.7642190791453506e-05,
|
|
"loss": 0.177,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"epoch": 23.90532544378698,
|
|
"grad_norm": 11.070876121520996,
|
|
"learning_rate": 1.7566957568462234e-05,
|
|
"loss": 0.232,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"epoch": 23.952662721893493,
|
|
"grad_norm": 9.572555541992188,
|
|
"learning_rate": 1.7491724345470962e-05,
|
|
"loss": 0.2007,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"epoch": 24.0,
|
|
"grad_norm": 5.681567668914795,
|
|
"learning_rate": 1.7416491122479687e-05,
|
|
"loss": 0.1909,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"epoch": 24.0,
|
|
"eval_accuracy": 0.9054794520547945,
|
|
"eval_loss": 0.3623672127723694,
|
|
"eval_runtime": 6.2394,
|
|
"eval_samples_per_second": 233.996,
|
|
"eval_steps_per_second": 29.33,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"epoch": 24.047337278106507,
|
|
"grad_norm": 9.697016716003418,
|
|
"learning_rate": 1.7341257899488415e-05,
|
|
"loss": 0.2439,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"epoch": 24.09467455621302,
|
|
"grad_norm": 10.163914680480957,
|
|
"learning_rate": 1.726602467649714e-05,
|
|
"loss": 0.1802,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"epoch": 24.142011834319526,
|
|
"grad_norm": 11.584846496582031,
|
|
"learning_rate": 1.719079145350587e-05,
|
|
"loss": 0.2823,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"epoch": 24.189349112426036,
|
|
"grad_norm": 4.525638103485107,
|
|
"learning_rate": 1.7115558230514597e-05,
|
|
"loss": 0.1716,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"epoch": 24.236686390532544,
|
|
"grad_norm": 7.053022861480713,
|
|
"learning_rate": 1.704032500752332e-05,
|
|
"loss": 0.218,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"epoch": 24.284023668639055,
|
|
"grad_norm": 2.7923426628112793,
|
|
"learning_rate": 1.696509178453205e-05,
|
|
"loss": 0.185,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"epoch": 24.331360946745562,
|
|
"grad_norm": 3.2291653156280518,
|
|
"learning_rate": 1.6889858561540778e-05,
|
|
"loss": 0.2196,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"epoch": 24.37869822485207,
|
|
"grad_norm": 11.007999420166016,
|
|
"learning_rate": 1.6814625338549506e-05,
|
|
"loss": 0.2367,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"epoch": 24.42603550295858,
|
|
"grad_norm": 10.4671049118042,
|
|
"learning_rate": 1.673939211555823e-05,
|
|
"loss": 0.2754,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"epoch": 24.473372781065088,
|
|
"grad_norm": 11.023184776306152,
|
|
"learning_rate": 1.666415889256696e-05,
|
|
"loss": 0.2092,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"epoch": 24.5207100591716,
|
|
"grad_norm": 7.405954360961914,
|
|
"learning_rate": 1.6588925669575687e-05,
|
|
"loss": 0.2387,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"epoch": 24.568047337278106,
|
|
"grad_norm": 2.6797077655792236,
|
|
"learning_rate": 1.6513692446584412e-05,
|
|
"loss": 0.2842,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"epoch": 24.615384615384617,
|
|
"grad_norm": 2.8351101875305176,
|
|
"learning_rate": 1.643845922359314e-05,
|
|
"loss": 0.2395,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"epoch": 24.662721893491124,
|
|
"grad_norm": 5.248380661010742,
|
|
"learning_rate": 1.636322600060187e-05,
|
|
"loss": 0.2047,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"epoch": 24.71005917159763,
|
|
"grad_norm": 11.657218933105469,
|
|
"learning_rate": 1.6287992777610593e-05,
|
|
"loss": 0.2442,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"epoch": 24.757396449704142,
|
|
"grad_norm": 8.078208923339844,
|
|
"learning_rate": 1.6212759554619318e-05,
|
|
"loss": 0.1842,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"epoch": 24.80473372781065,
|
|
"grad_norm": 7.111977577209473,
|
|
"learning_rate": 1.6137526331628046e-05,
|
|
"loss": 0.2838,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"epoch": 24.85207100591716,
|
|
"grad_norm": 5.9829535484313965,
|
|
"learning_rate": 1.6062293108636774e-05,
|
|
"loss": 0.2379,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"epoch": 24.899408284023668,
|
|
"grad_norm": 7.217136859893799,
|
|
"learning_rate": 1.5987059885645502e-05,
|
|
"loss": 0.2162,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"epoch": 24.94674556213018,
|
|
"grad_norm": 3.860224485397339,
|
|
"learning_rate": 1.5911826662654227e-05,
|
|
"loss": 0.2123,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"epoch": 24.994082840236686,
|
|
"grad_norm": 7.911783695220947,
|
|
"learning_rate": 1.5836593439662955e-05,
|
|
"loss": 0.1972,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"epoch": 24.99881656804734,
|
|
"eval_accuracy": 0.910958904109589,
|
|
"eval_loss": 0.3805873990058899,
|
|
"eval_runtime": 6.2415,
|
|
"eval_samples_per_second": 233.919,
|
|
"eval_steps_per_second": 29.32,
|
|
"step": 5281
|
|
}
  ],
  "logging_steps": 10,
  "max_steps": 7385,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 35,
  "save_steps": 500,
  "total_flos": 5.493880885130035e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}