beamaia's picture
Training in progress, epoch 0
6b637ec
raw
history blame
42.6 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.24560541735377706,
"global_step": 3500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 1.1428571428571429e-05,
"loss": 1.2994,
"step": 10
},
{
"epoch": 0.0,
"learning_rate": 2.2857142857142858e-05,
"loss": 1.2625,
"step": 20
},
{
"epoch": 0.0,
"learning_rate": 3.428571428571429e-05,
"loss": 1.1582,
"step": 30
},
{
"epoch": 0.0,
"learning_rate": 4.5714285714285716e-05,
"loss": 0.9934,
"step": 40
},
{
"epoch": 0.0,
"learning_rate": 5.714285714285714e-05,
"loss": 0.8862,
"step": 50
},
{
"epoch": 0.0,
"learning_rate": 6.857142857142858e-05,
"loss": 0.7613,
"step": 60
},
{
"epoch": 0.0,
"learning_rate": 8e-05,
"loss": 0.6832,
"step": 70
},
{
"epoch": 0.01,
"learning_rate": 9.142857142857143e-05,
"loss": 0.5934,
"step": 80
},
{
"epoch": 0.01,
"learning_rate": 0.00010285714285714286,
"loss": 0.5305,
"step": 90
},
{
"epoch": 0.01,
"learning_rate": 0.00011428571428571428,
"loss": 0.5195,
"step": 100
},
{
"epoch": 0.01,
"learning_rate": 0.00012571428571428572,
"loss": 0.5001,
"step": 110
},
{
"epoch": 0.01,
"learning_rate": 0.00013714285714285716,
"loss": 0.4812,
"step": 120
},
{
"epoch": 0.01,
"learning_rate": 0.00014857142857142857,
"loss": 0.4624,
"step": 130
},
{
"epoch": 0.01,
"learning_rate": 0.00016,
"loss": 0.4699,
"step": 140
},
{
"epoch": 0.01,
"learning_rate": 0.00017142857142857143,
"loss": 0.4355,
"step": 150
},
{
"epoch": 0.01,
"learning_rate": 0.00018285714285714286,
"loss": 0.4341,
"step": 160
},
{
"epoch": 0.01,
"learning_rate": 0.00019314285714285717,
"loss": 0.4243,
"step": 170
},
{
"epoch": 0.01,
"learning_rate": 0.00020457142857142858,
"loss": 0.4255,
"step": 180
},
{
"epoch": 0.01,
"learning_rate": 0.00021600000000000002,
"loss": 0.4116,
"step": 190
},
{
"epoch": 0.01,
"learning_rate": 0.00022742857142857146,
"loss": 0.425,
"step": 200
},
{
"epoch": 0.01,
"learning_rate": 0.0002388571428571429,
"loss": 0.4121,
"step": 210
},
{
"epoch": 0.02,
"learning_rate": 0.0002502857142857143,
"loss": 0.4066,
"step": 220
},
{
"epoch": 0.02,
"learning_rate": 0.0002617142857142857,
"loss": 0.4019,
"step": 230
},
{
"epoch": 0.02,
"learning_rate": 0.00027314285714285716,
"loss": 0.3961,
"step": 240
},
{
"epoch": 0.02,
"learning_rate": 0.00028457142857142857,
"loss": 0.3947,
"step": 250
},
{
"epoch": 0.02,
"learning_rate": 0.000296,
"loss": 0.4015,
"step": 260
},
{
"epoch": 0.02,
"learning_rate": 0.00030742857142857145,
"loss": 0.3862,
"step": 270
},
{
"epoch": 0.02,
"learning_rate": 0.00031885714285714286,
"loss": 0.4013,
"step": 280
},
{
"epoch": 0.02,
"learning_rate": 0.00033028571428571433,
"loss": 0.3913,
"step": 290
},
{
"epoch": 0.02,
"learning_rate": 0.00034171428571428574,
"loss": 0.4038,
"step": 300
},
{
"epoch": 0.02,
"learning_rate": 0.00035314285714285715,
"loss": 0.3962,
"step": 310
},
{
"epoch": 0.02,
"learning_rate": 0.0003645714285714286,
"loss": 0.3897,
"step": 320
},
{
"epoch": 0.02,
"learning_rate": 0.000376,
"loss": 0.3875,
"step": 330
},
{
"epoch": 0.02,
"learning_rate": 0.00038742857142857144,
"loss": 0.378,
"step": 340
},
{
"epoch": 0.02,
"learning_rate": 0.00039885714285714286,
"loss": 0.3843,
"step": 350
},
{
"epoch": 0.03,
"learning_rate": 0.00039999194323417424,
"loss": 0.3827,
"step": 360
},
{
"epoch": 0.03,
"learning_rate": 0.000399964093519005,
"loss": 0.3828,
"step": 370
},
{
"epoch": 0.03,
"learning_rate": 0.0003999163541576902,
"loss": 0.3905,
"step": 380
},
{
"epoch": 0.03,
"learning_rate": 0.00039984872989867726,
"loss": 0.3938,
"step": 390
},
{
"epoch": 0.03,
"learning_rate": 0.0003997612274682868,
"loss": 0.3811,
"step": 400
},
{
"epoch": 0.03,
"learning_rate": 0.0003996538555700433,
"loss": 0.3942,
"step": 410
},
{
"epoch": 0.03,
"learning_rate": 0.0003995266248838093,
"loss": 0.3746,
"step": 420
},
{
"epoch": 0.03,
"learning_rate": 0.00039937954806472354,
"loss": 0.3785,
"step": 430
},
{
"epoch": 0.03,
"learning_rate": 0.0003992126397419419,
"loss": 0.3807,
"step": 440
},
{
"epoch": 0.03,
"learning_rate": 0.00039902591651718225,
"loss": 0.3765,
"step": 450
},
{
"epoch": 0.03,
"learning_rate": 0.0003988193969630734,
"loss": 0.375,
"step": 460
},
{
"epoch": 0.03,
"learning_rate": 0.0003985931016213074,
"loss": 0.3886,
"step": 470
},
{
"epoch": 0.03,
"learning_rate": 0.00039834705300059676,
"loss": 0.3796,
"step": 480
},
{
"epoch": 0.03,
"learning_rate": 0.0003980812755744352,
"loss": 0.3778,
"step": 490
},
{
"epoch": 0.04,
"learning_rate": 0.00039779579577866366,
"loss": 0.3801,
"step": 500
},
{
"epoch": 0.04,
"learning_rate": 0.0003974906420088406,
"loss": 0.382,
"step": 510
},
{
"epoch": 0.04,
"learning_rate": 0.00039716584461741783,
"loss": 0.3779,
"step": 520
},
{
"epoch": 0.04,
"learning_rate": 0.0003968214359107213,
"loss": 0.3811,
"step": 530
},
{
"epoch": 0.04,
"learning_rate": 0.0003964574501457378,
"loss": 0.3837,
"step": 540
},
{
"epoch": 0.04,
"learning_rate": 0.0003960739235267074,
"loss": 0.3798,
"step": 550
},
{
"epoch": 0.04,
"learning_rate": 0.00039567089420152265,
"loss": 0.3801,
"step": 560
},
{
"epoch": 0.04,
"learning_rate": 0.00039524840225793395,
"loss": 0.3761,
"step": 570
},
{
"epoch": 0.04,
"learning_rate": 0.00039480648971956214,
"loss": 0.3702,
"step": 580
},
{
"epoch": 0.04,
"learning_rate": 0.00039434520054171875,
"loss": 0.3787,
"step": 590
},
{
"epoch": 0.04,
"learning_rate": 0.0003938645806070337,
"loss": 0.364,
"step": 600
},
{
"epoch": 0.04,
"learning_rate": 0.00039336467772089194,
"loss": 0.3669,
"step": 610
},
{
"epoch": 0.04,
"learning_rate": 0.0003928455416066777,
"loss": 0.3713,
"step": 620
},
{
"epoch": 0.04,
"learning_rate": 0.00039230722390082984,
"loss": 0.3676,
"step": 630
},
{
"epoch": 0.04,
"learning_rate": 0.0003917497781477043,
"loss": 0.3715,
"step": 640
},
{
"epoch": 0.05,
"learning_rate": 0.00039117325979424984,
"loss": 0.3626,
"step": 650
},
{
"epoch": 0.05,
"learning_rate": 0.00039057772618449163,
"loss": 0.3744,
"step": 660
},
{
"epoch": 0.05,
"learning_rate": 0.0003899632365538282,
"loss": 0.3686,
"step": 670
},
{
"epoch": 0.05,
"learning_rate": 0.0003893298520231393,
"loss": 0.3683,
"step": 680
},
{
"epoch": 0.05,
"learning_rate": 0.0003886776355927065,
"loss": 0.3745,
"step": 690
},
{
"epoch": 0.05,
"learning_rate": 0.00038800665213594663,
"loss": 0.3701,
"step": 700
},
{
"epoch": 0.05,
"learning_rate": 0.0003873169683929594,
"loss": 0.3685,
"step": 710
},
{
"epoch": 0.05,
"learning_rate": 0.0003866086529638889,
"loss": 0.3649,
"step": 720
},
{
"epoch": 0.05,
"learning_rate": 0.0003858817763020997,
"loss": 0.3621,
"step": 730
},
{
"epoch": 0.05,
"learning_rate": 0.00038513641070717016,
"loss": 0.3678,
"step": 740
},
{
"epoch": 0.05,
"learning_rate": 0.0003843726303177001,
"loss": 0.362,
"step": 750
},
{
"epoch": 0.05,
"learning_rate": 0.0003835905111039371,
"loss": 0.3551,
"step": 760
},
{
"epoch": 0.05,
"learning_rate": 0.00038279013086021973,
"loss": 0.3675,
"step": 770
},
{
"epoch": 0.05,
"learning_rate": 0.00038197156919723984,
"loss": 0.3666,
"step": 780
},
{
"epoch": 0.06,
"learning_rate": 0.000381134907534124,
"loss": 0.3719,
"step": 790
},
{
"epoch": 0.06,
"learning_rate": 0.0003802802290903348,
"loss": 0.3675,
"step": 800
},
{
"epoch": 0.06,
"learning_rate": 0.00037940761887739373,
"loss": 0.3746,
"step": 810
},
{
"epoch": 0.06,
"learning_rate": 0.00037851716369042513,
"loss": 0.3677,
"step": 820
},
{
"epoch": 0.06,
"learning_rate": 0.000377608952099523,
"loss": 0.3689,
"step": 830
},
{
"epoch": 0.06,
"learning_rate": 0.00037668307444094153,
"loss": 0.3606,
"step": 840
},
{
"epoch": 0.06,
"learning_rate": 0.00037573962280810944,
"loss": 0.3657,
"step": 850
},
{
"epoch": 0.06,
"learning_rate": 0.0003747786910424701,
"loss": 0.3601,
"step": 860
},
{
"epoch": 0.06,
"learning_rate": 0.00037380037472414704,
"loss": 0.3732,
"step": 870
},
{
"epoch": 0.06,
"learning_rate": 0.0003728047711624374,
"loss": 0.3765,
"step": 880
},
{
"epoch": 0.06,
"learning_rate": 0.0003717919793861329,
"loss": 0.3641,
"step": 890
},
{
"epoch": 0.06,
"learning_rate": 0.00037076210013366943,
"loss": 0.3621,
"step": 900
},
{
"epoch": 0.06,
"learning_rate": 0.00036971523584310764,
"loss": 0.3635,
"step": 910
},
{
"epoch": 0.06,
"learning_rate": 0.0003686514906419431,
"loss": 0.3716,
"step": 920
},
{
"epoch": 0.07,
"learning_rate": 0.0003675709703367499,
"loss": 0.3613,
"step": 930
},
{
"epoch": 0.07,
"learning_rate": 0.0003664737824026558,
"loss": 0.3728,
"step": 940
},
{
"epoch": 0.07,
"learning_rate": 0.0003653600359726526,
"loss": 0.362,
"step": 950
},
{
"epoch": 0.07,
"learning_rate": 0.00036422984182674084,
"loss": 0.3518,
"step": 960
},
{
"epoch": 0.07,
"learning_rate": 0.00036308331238091095,
"loss": 0.3672,
"step": 970
},
{
"epoch": 0.07,
"learning_rate": 0.00036192056167596193,
"loss": 0.3563,
"step": 980
},
{
"epoch": 0.07,
"learning_rate": 0.00036074170536615777,
"loss": 0.358,
"step": 990
},
{
"epoch": 0.07,
"learning_rate": 0.00035954686070772404,
"loss": 0.3594,
"step": 1000
},
{
"epoch": 0.07,
"learning_rate": 0.00035833614654718493,
"loss": 0.3635,
"step": 1010
},
{
"epoch": 0.07,
"learning_rate": 0.00035710968330954164,
"loss": 0.354,
"step": 1020
},
{
"epoch": 0.07,
"learning_rate": 0.0003558675929862949,
"loss": 0.3721,
"step": 1030
},
{
"epoch": 0.07,
"learning_rate": 0.00035460999912330994,
"loss": 0.3588,
"step": 1040
},
{
"epoch": 0.07,
"learning_rate": 0.0003533370268085289,
"loss": 0.3608,
"step": 1050
},
{
"epoch": 0.07,
"learning_rate": 0.0003520488026595282,
"loss": 0.3539,
"step": 1060
},
{
"epoch": 0.08,
"learning_rate": 0.00035074545481092456,
"loss": 0.3597,
"step": 1070
},
{
"epoch": 0.08,
"learning_rate": 0.00034942711290162997,
"loss": 0.3493,
"step": 1080
},
{
"epoch": 0.08,
"learning_rate": 0.00034809390806195685,
"loss": 0.3648,
"step": 1090
},
{
"epoch": 0.08,
"learning_rate": 0.0003467459729005753,
"loss": 0.3553,
"step": 1100
},
{
"epoch": 0.08,
"learning_rate": 0.0003453834414913229,
"loss": 0.3497,
"step": 1110
},
{
"epoch": 0.08,
"learning_rate": 0.00034400644935986876,
"loss": 0.3496,
"step": 1120
},
{
"epoch": 0.08,
"learning_rate": 0.0003426151334702336,
"loss": 0.3612,
"step": 1130
},
{
"epoch": 0.08,
"learning_rate": 0.00034120963221116637,
"loss": 0.3578,
"step": 1140
},
{
"epoch": 0.08,
"learning_rate": 0.00033979008538237914,
"loss": 0.3502,
"step": 1150
},
{
"epoch": 0.08,
"learning_rate": 0.0003383566341806417,
"loss": 0.3556,
"step": 1160
},
{
"epoch": 0.08,
"learning_rate": 0.00033690942118573774,
"loss": 0.3525,
"step": 1170
},
{
"epoch": 0.08,
"learning_rate": 0.0003354485903462825,
"loss": 0.3524,
"step": 1180
},
{
"epoch": 0.08,
"learning_rate": 0.000333974286965405,
"loss": 0.353,
"step": 1190
},
{
"epoch": 0.08,
"learning_rate": 0.0003324866576862952,
"loss": 0.362,
"step": 1200
},
{
"epoch": 0.08,
"learning_rate": 0.00033098585047761797,
"loss": 0.3613,
"step": 1210
},
{
"epoch": 0.09,
"learning_rate": 0.0003294720146187955,
"loss": 0.3514,
"step": 1220
},
{
"epoch": 0.09,
"learning_rate": 0.00032794530068515874,
"loss": 0.3638,
"step": 1230
},
{
"epoch": 0.09,
"learning_rate": 0.0003264058605329702,
"loss": 0.3511,
"step": 1240
},
{
"epoch": 0.09,
"learning_rate": 0.0003248538472843198,
"loss": 0.3619,
"step": 1250
},
{
"epoch": 0.09,
"learning_rate": 0.00032328941531189397,
"loss": 0.3535,
"step": 1260
},
{
"epoch": 0.09,
"learning_rate": 0.00032171272022362134,
"loss": 0.3523,
"step": 1270
},
{
"epoch": 0.09,
"learning_rate": 0.0003201239188471943,
"loss": 0.3581,
"step": 1280
},
{
"epoch": 0.09,
"learning_rate": 0.0003185231692144706,
"loss": 0.3555,
"step": 1290
},
{
"epoch": 0.09,
"learning_rate": 0.00031691063054575434,
"loss": 0.3472,
"step": 1300
},
{
"epoch": 0.09,
"learning_rate": 0.0003152864632339587,
"loss": 0.3538,
"step": 1310
},
{
"epoch": 0.09,
"learning_rate": 0.0003136508288286527,
"loss": 0.3477,
"step": 1320
},
{
"epoch": 0.09,
"learning_rate": 0.0003120038900199922,
"loss": 0.3475,
"step": 1330
},
{
"epoch": 0.09,
"learning_rate": 0.00031034581062253786,
"loss": 0.3521,
"step": 1340
},
{
"epoch": 0.09,
"learning_rate": 0.0003086767555589612,
"loss": 0.3531,
"step": 1350
},
{
"epoch": 0.1,
"learning_rate": 0.0003069968908436401,
"loss": 0.3448,
"step": 1360
},
{
"epoch": 0.1,
"learning_rate": 0.00030530638356614644,
"loss": 0.3564,
"step": 1370
},
{
"epoch": 0.1,
"learning_rate": 0.00030360540187462606,
"loss": 0.3473,
"step": 1380
},
{
"epoch": 0.1,
"learning_rate": 0.00030189411495907376,
"loss": 0.3575,
"step": 1390
},
{
"epoch": 0.1,
"learning_rate": 0.0003001726930345046,
"loss": 0.3478,
"step": 1400
},
{
"epoch": 0.1,
"learning_rate": 0.00029844130732402347,
"loss": 0.3499,
"step": 1410
},
{
"epoch": 0.1,
"learning_rate": 0.000296700130041794,
"loss": 0.3638,
"step": 1420
},
{
"epoch": 0.1,
"learning_rate": 0.00029494933437590926,
"loss": 0.3494,
"step": 1430
},
{
"epoch": 0.1,
"learning_rate": 0.00029318909447116496,
"loss": 0.3497,
"step": 1440
},
{
"epoch": 0.1,
"learning_rate": 0.0002914195854117389,
"loss": 0.3555,
"step": 1450
},
{
"epoch": 0.1,
"learning_rate": 0.0002896409832037748,
"loss": 0.352,
"step": 1460
},
{
"epoch": 0.1,
"learning_rate": 0.0002878534647578768,
"loss": 0.3486,
"step": 1470
},
{
"epoch": 0.1,
"learning_rate": 0.0002860572078715121,
"loss": 0.3528,
"step": 1480
},
{
"epoch": 0.1,
"learning_rate": 0.0002842523912113264,
"loss": 0.3412,
"step": 1490
},
{
"epoch": 0.11,
"learning_rate": 0.00028243919429537265,
"loss": 0.3384,
"step": 1500
},
{
"epoch": 0.11,
"learning_rate": 0.00028061779747525504,
"loss": 0.3493,
"step": 1510
},
{
"epoch": 0.11,
"learning_rate": 0.00027878838191819024,
"loss": 0.3416,
"step": 1520
},
{
"epoch": 0.11,
"learning_rate": 0.0002769511295889872,
"loss": 0.3485,
"step": 1530
},
{
"epoch": 0.11,
"learning_rate": 0.000275106223231948,
"loss": 0.3436,
"step": 1540
},
{
"epoch": 0.11,
"learning_rate": 0.00027325384635269094,
"loss": 0.3405,
"step": 1550
},
{
"epoch": 0.11,
"learning_rate": 0.00027139418319989785,
"loss": 0.349,
"step": 1560
},
{
"epoch": 0.11,
"learning_rate": 0.0002695274187469878,
"loss": 0.3366,
"step": 1570
},
{
"epoch": 0.11,
"learning_rate": 0.0002676537386737183,
"loss": 0.3524,
"step": 1580
},
{
"epoch": 0.11,
"learning_rate": 0.00026577332934771665,
"loss": 0.3405,
"step": 1590
},
{
"epoch": 0.11,
"learning_rate": 0.0002638863778059426,
"loss": 0.3437,
"step": 1600
},
{
"epoch": 0.11,
"learning_rate": 0.00026199307173608433,
"loss": 0.3375,
"step": 1610
},
{
"epoch": 0.11,
"learning_rate": 0.0002600935994578904,
"loss": 0.3447,
"step": 1620
},
{
"epoch": 0.11,
"learning_rate": 0.0002581881499044377,
"loss": 0.3464,
"step": 1630
},
{
"epoch": 0.12,
"learning_rate": 0.0002562769126033394,
"loss": 0.3369,
"step": 1640
},
{
"epoch": 0.12,
"learning_rate": 0.00025436007765789327,
"loss": 0.3423,
"step": 1650
},
{
"epoch": 0.12,
"learning_rate": 0.00025243783572817297,
"loss": 0.3433,
"step": 1660
},
{
"epoch": 0.12,
"learning_rate": 0.0002505103780120636,
"loss": 0.3412,
"step": 1670
},
{
"epoch": 0.12,
"learning_rate": 0.0002485778962262443,
"loss": 0.3424,
"step": 1680
},
{
"epoch": 0.12,
"learning_rate": 0.00024664058258711853,
"loss": 0.3452,
"step": 1690
},
{
"epoch": 0.12,
"learning_rate": 0.0002446986297916954,
"loss": 0.3493,
"step": 1700
},
{
"epoch": 0.12,
"learning_rate": 0.00024275223099842291,
"loss": 0.3387,
"step": 1710
},
{
"epoch": 0.12,
"learning_rate": 0.00024080157980797484,
"loss": 0.3421,
"step": 1720
},
{
"epoch": 0.12,
"learning_rate": 0.0002388468702439944,
"loss": 0.3365,
"step": 1730
},
{
"epoch": 0.12,
"learning_rate": 0.00023688829673379534,
"loss": 0.3382,
"step": 1740
},
{
"epoch": 0.12,
"learning_rate": 0.00023492605408902297,
"loss": 0.3443,
"step": 1750
},
{
"epoch": 0.12,
"learning_rate": 0.00023296033748627712,
"loss": 0.3441,
"step": 1760
},
{
"epoch": 0.12,
"learning_rate": 0.0002309913424476986,
"loss": 0.3425,
"step": 1770
},
{
"epoch": 0.12,
"learning_rate": 0.00022901926482152138,
"loss": 0.3469,
"step": 1780
},
{
"epoch": 0.13,
"learning_rate": 0.00022704430076259246,
"loss": 0.3409,
"step": 1790
},
{
"epoch": 0.13,
"learning_rate": 0.00022506664671286087,
"loss": 0.3585,
"step": 1800
},
{
"epoch": 0.13,
"learning_rate": 0.00022308649938183864,
"loss": 0.3327,
"step": 1810
},
{
"epoch": 0.13,
"learning_rate": 0.00022110405572703466,
"loss": 0.3354,
"step": 1820
},
{
"epoch": 0.13,
"learning_rate": 0.00021911951293436416,
"loss": 0.3424,
"step": 1830
},
{
"epoch": 0.13,
"learning_rate": 0.00021713306839853545,
"loss": 0.3369,
"step": 1840
},
{
"epoch": 0.13,
"learning_rate": 0.0002151449197034157,
"loss": 0.3407,
"step": 1850
},
{
"epoch": 0.13,
"learning_rate": 0.0002131552646023783,
"loss": 0.3391,
"step": 1860
},
{
"epoch": 0.13,
"learning_rate": 0.00021116430099863277,
"loss": 0.3358,
"step": 1870
},
{
"epoch": 0.13,
"learning_rate": 0.0002091722269255404,
"loss": 0.3364,
"step": 1880
},
{
"epoch": 0.13,
"learning_rate": 0.0002071792405269165,
"loss": 0.341,
"step": 1890
},
{
"epoch": 0.13,
"learning_rate": 0.00020518554003732167,
"loss": 0.3378,
"step": 1900
},
{
"epoch": 0.13,
"learning_rate": 0.00020319132376234462,
"loss": 0.3305,
"step": 1910
},
{
"epoch": 0.13,
"learning_rate": 0.00020119679005887702,
"loss": 0.3352,
"step": 1920
},
{
"epoch": 0.14,
"learning_rate": 0.00019920213731538394,
"loss": 0.3376,
"step": 1930
},
{
"epoch": 0.14,
"learning_rate": 0.00019720756393217098,
"loss": 0.3283,
"step": 1940
},
{
"epoch": 0.14,
"learning_rate": 0.00019521326830164998,
"loss": 0.3354,
"step": 1950
},
{
"epoch": 0.14,
"learning_rate": 0.00019321944878860587,
"loss": 0.3445,
"step": 1960
},
{
"epoch": 0.14,
"learning_rate": 0.000191226303710466,
"loss": 0.3401,
"step": 1970
},
{
"epoch": 0.14,
"learning_rate": 0.00018923403131757439,
"loss": 0.3438,
"step": 1980
},
{
"epoch": 0.14,
"learning_rate": 0.00018724282977347235,
"loss": 0.3341,
"step": 1990
},
{
"epoch": 0.14,
"learning_rate": 0.00018525289713518817,
"loss": 0.3272,
"step": 2000
},
{
"epoch": 0.14,
"learning_rate": 0.00018326443133353693,
"loss": 0.3488,
"step": 2010
},
{
"epoch": 0.14,
"learning_rate": 0.00018127763015343332,
"loss": 0.3242,
"step": 2020
},
{
"epoch": 0.14,
"learning_rate": 0.00017929269121421857,
"loss": 0.337,
"step": 2030
},
{
"epoch": 0.14,
"learning_rate": 0.00017730981195000406,
"loss": 0.3386,
"step": 2040
},
{
"epoch": 0.14,
"learning_rate": 0.00017532918959003353,
"loss": 0.3312,
"step": 2050
},
{
"epoch": 0.14,
"learning_rate": 0.00017335102113906505,
"loss": 0.3382,
"step": 2060
},
{
"epoch": 0.15,
"learning_rate": 0.00017137550335777612,
"loss": 0.3319,
"step": 2070
},
{
"epoch": 0.15,
"learning_rate": 0.0001694028327431924,
"loss": 0.3294,
"step": 2080
},
{
"epoch": 0.15,
"learning_rate": 0.0001674332055091431,
"loss": 0.3313,
"step": 2090
},
{
"epoch": 0.15,
"learning_rate": 0.0001654668175667442,
"loss": 0.3347,
"step": 2100
},
{
"epoch": 0.15,
"learning_rate": 0.00016350386450491208,
"loss": 0.324,
"step": 2110
},
{
"epoch": 0.15,
"learning_rate": 0.00016154454157090884,
"loss": 0.3302,
"step": 2120
},
{
"epoch": 0.15,
"learning_rate": 0.00015958904365092225,
"loss": 0.334,
"step": 2130
},
{
"epoch": 0.15,
"learning_rate": 0.00015763756525068065,
"loss": 0.3375,
"step": 2140
},
{
"epoch": 0.15,
"learning_rate": 0.00015569030047610656,
"loss": 0.337,
"step": 2150
},
{
"epoch": 0.15,
"learning_rate": 0.0001537474430140096,
"loss": 0.3346,
"step": 2160
},
{
"epoch": 0.15,
"learning_rate": 0.0001518091861128213,
"loss": 0.3377,
"step": 2170
},
{
"epoch": 0.15,
"learning_rate": 0.00014987572256337336,
"loss": 0.3247,
"step": 2180
},
{
"epoch": 0.15,
"learning_rate": 0.0001479472446797216,
"loss": 0.3194,
"step": 2190
},
{
"epoch": 0.15,
"learning_rate": 0.00014602394428001712,
"loss": 0.3299,
"step": 2200
},
{
"epoch": 0.16,
"learning_rate": 0.00014410601266742691,
"loss": 0.3202,
"step": 2210
},
{
"epoch": 0.16,
"learning_rate": 0.00014219364061110565,
"loss": 0.3316,
"step": 2220
},
{
"epoch": 0.16,
"learning_rate": 0.00014028701832722104,
"loss": 0.3304,
"step": 2230
},
{
"epoch": 0.16,
"learning_rate": 0.00013838633546003302,
"loss": 0.3286,
"step": 2240
},
{
"epoch": 0.16,
"learning_rate": 0.00013649178106303115,
"loss": 0.3267,
"step": 2250
},
{
"epoch": 0.16,
"learning_rate": 0.00013460354358013,
"loss": 0.3295,
"step": 2260
},
{
"epoch": 0.16,
"learning_rate": 0.0001327218108269255,
"loss": 0.3321,
"step": 2270
},
{
"epoch": 0.16,
"learning_rate": 0.00013084676997201342,
"loss": 0.3302,
"step": 2280
},
{
"epoch": 0.16,
"learning_rate": 0.00012897860751837263,
"loss": 0.3246,
"step": 2290
},
{
"epoch": 0.16,
"learning_rate": 0.00012711750928481443,
"loss": 0.3305,
"step": 2300
},
{
"epoch": 0.16,
"learning_rate": 0.00012526366038749956,
"loss": 0.3274,
"step": 2310
},
{
"epoch": 0.16,
"learning_rate": 0.000123417245221526,
"loss": 0.3303,
"step": 2320
},
{
"epoch": 0.16,
"learning_rate": 0.00012157844744258722,
"loss": 0.3265,
"step": 2330
},
{
"epoch": 0.16,
"learning_rate": 0.00011974744994870517,
"loss": 0.3237,
"step": 2340
},
{
"epoch": 0.16,
"learning_rate": 0.00011792443486203788,
"loss": 0.3288,
"step": 2350
},
{
"epoch": 0.17,
"learning_rate": 0.00011610958351076458,
"loss": 0.3372,
"step": 2360
},
{
"epoch": 0.17,
"learning_rate": 0.00011430307641104971,
"loss": 0.3257,
"step": 2370
},
{
"epoch": 0.17,
"learning_rate": 0.00011250509324908767,
"loss": 0.3294,
"step": 2380
},
{
"epoch": 0.17,
"learning_rate": 0.00011071581286323,
"loss": 0.3304,
"step": 2390
},
{
"epoch": 0.17,
"learning_rate": 0.00010893541322619732,
"loss": 0.33,
"step": 2400
},
{
"epoch": 0.17,
"learning_rate": 0.00010716407142737659,
"loss": 0.3251,
"step": 2410
},
{
"epoch": 0.17,
"learning_rate": 0.00010540196365520754,
"loss": 0.3293,
"step": 2420
},
{
"epoch": 0.17,
"learning_rate": 0.00010364926517965692,
"loss": 0.3269,
"step": 2430
},
{
"epoch": 0.17,
"learning_rate": 0.0001019061503347858,
"loss": 0.3187,
"step": 2440
},
{
"epoch": 0.17,
"learning_rate": 0.00010017279250140891,
"loss": 0.3204,
"step": 2450
},
{
"epoch": 0.17,
"learning_rate": 9.844936408984924e-05,
"loss": 0.3151,
"step": 2460
},
{
"epoch": 0.17,
"learning_rate": 9.673603652278904e-05,
"loss": 0.3237,
"step": 2470
},
{
"epoch": 0.17,
"learning_rate": 9.503298021821905e-05,
"loss": 0.3224,
"step": 2480
},
{
"epoch": 0.17,
"learning_rate": 9.334036457248774e-05,
"loss": 0.3228,
"step": 2490
},
{
"epoch": 0.18,
"learning_rate": 9.165835794345205e-05,
"loss": 0.3186,
"step": 2500
},
{
"epoch": 0.18,
"learning_rate": 8.99871276337315e-05,
"loss": 0.3181,
"step": 2510
},
{
"epoch": 0.18,
"learning_rate": 8.832683987406746e-05,
"loss": 0.3233,
"step": 2520
},
{
"epoch": 0.18,
"learning_rate": 8.667765980678851e-05,
"loss": 0.3302,
"step": 2530
},
{
"epoch": 0.18,
"learning_rate": 8.503975146938444e-05,
"loss": 0.3314,
"step": 2540
},
{
"epoch": 0.18,
"learning_rate": 8.341327777819035e-05,
"loss": 0.3309,
"step": 2550
},
{
"epoch": 0.18,
"learning_rate": 8.179840051218167e-05,
"loss": 0.331,
"step": 2560
},
{
"epoch": 0.18,
"learning_rate": 8.019528029688286e-05,
"loss": 0.318,
"step": 2570
},
{
"epoch": 0.18,
"learning_rate": 7.860407658839049e-05,
"loss": 0.3273,
"step": 2580
},
{
"epoch": 0.18,
"learning_rate": 7.702494765751285e-05,
"loss": 0.3182,
"step": 2590
},
{
"epoch": 0.18,
"learning_rate": 7.545805057402733e-05,
"loss": 0.3237,
"step": 2600
},
{
"epoch": 0.18,
"learning_rate": 7.390354119105722e-05,
"loss": 0.3229,
"step": 2610
},
{
"epoch": 0.18,
"learning_rate": 7.236157412956994e-05,
"loss": 0.3304,
"step": 2620
},
{
"epoch": 0.18,
"learning_rate": 7.0832302762997e-05,
"loss": 0.3284,
"step": 2630
},
{
"epoch": 0.19,
"learning_rate": 6.93158792019789e-05,
"loss": 0.3251,
"step": 2640
},
{
"epoch": 0.19,
"learning_rate": 6.781245427923522e-05,
"loss": 0.3201,
"step": 2650
},
{
"epoch": 0.19,
"learning_rate": 6.632217753456174e-05,
"loss": 0.322,
"step": 2660
},
{
"epoch": 0.19,
"learning_rate": 6.484519719995647e-05,
"loss": 0.3265,
"step": 2670
},
{
"epoch": 0.19,
"learning_rate": 6.338166018487555e-05,
"loss": 0.3273,
"step": 2680
},
{
"epoch": 0.19,
"learning_rate": 6.193171206162065e-05,
"loss": 0.3217,
"step": 2690
},
{
"epoch": 0.19,
"learning_rate": 6.0495497050859574e-05,
"loss": 0.3247,
"step": 2700
},
{
"epoch": 0.19,
"learning_rate": 5.907315800728106e-05,
"loss": 0.3179,
"step": 2710
},
{
"epoch": 0.19,
"learning_rate": 5.766483640538587e-05,
"loss": 0.3116,
"step": 2720
},
{
"epoch": 0.19,
"learning_rate": 5.62706723254145e-05,
"loss": 0.315,
"step": 2730
},
{
"epoch": 0.19,
"learning_rate": 5.489080443941415e-05,
"loss": 0.3197,
"step": 2740
},
{
"epoch": 0.19,
"learning_rate": 5.352536999744557e-05,
"loss": 0.32,
"step": 2750
},
{
"epoch": 0.19,
"learning_rate": 5.217450481393129e-05,
"loss": 0.3195,
"step": 2760
},
{
"epoch": 0.19,
"learning_rate": 5.083834325414667e-05,
"loss": 0.3221,
"step": 2770
},
{
"epoch": 0.2,
"learning_rate": 4.951701822085515e-05,
"loss": 0.3165,
"step": 2780
},
{
"epoch": 0.2,
"learning_rate": 4.821066114108892e-05,
"loss": 0.3216,
"step": 2790
},
{
"epoch": 0.2,
"learning_rate": 4.69194019530764e-05,
"loss": 0.3122,
"step": 2800
},
{
"epoch": 0.2,
"learning_rate": 4.564336909331768e-05,
"loss": 0.3237,
"step": 2810
},
{
"epoch": 0.2,
"learning_rate": 4.438268948380972e-05,
"loss": 0.3169,
"step": 2820
},
{
"epoch": 0.2,
"learning_rate": 4.3137488519421656e-05,
"loss": 0.3146,
"step": 2830
},
{
"epoch": 0.2,
"learning_rate": 4.1907890055422286e-05,
"loss": 0.3256,
"step": 2840
},
{
"epoch": 0.2,
"learning_rate": 4.069401639516075e-05,
"loss": 0.3178,
"step": 2850
},
{
"epoch": 0.2,
"learning_rate": 3.949598827790155e-05,
"loss": 0.3151,
"step": 2860
},
{
"epoch": 0.2,
"learning_rate": 3.831392486681495e-05,
"loss": 0.3121,
"step": 2870
},
{
"epoch": 0.2,
"learning_rate": 3.714794373712431e-05,
"loss": 0.3199,
"step": 2880
},
{
"epoch": 0.2,
"learning_rate": 3.5998160864411476e-05,
"loss": 0.3226,
"step": 2890
},
{
"epoch": 0.2,
"learning_rate": 3.486469061308093e-05,
"loss": 0.3182,
"step": 2900
},
{
"epoch": 0.2,
"learning_rate": 3.3747645724984544e-05,
"loss": 0.3162,
"step": 2910
},
{
"epoch": 0.2,
"learning_rate": 3.264713730820768e-05,
"loss": 0.3133,
"step": 2920
},
{
"epoch": 0.21,
"learning_rate": 3.156327482601742e-05,
"loss": 0.318,
"step": 2930
},
{
"epoch": 0.21,
"learning_rate": 3.0496166085974943e-05,
"loss": 0.3107,
"step": 2940
},
{
"epoch": 0.21,
"learning_rate": 2.9445917229212193e-05,
"loss": 0.3188,
"step": 2950
},
{
"epoch": 0.21,
"learning_rate": 2.8412632719874532e-05,
"loss": 0.3278,
"step": 2960
},
{
"epoch": 0.21,
"learning_rate": 2.7396415334729964e-05,
"loss": 0.3161,
"step": 2970
},
{
"epoch": 0.21,
"learning_rate": 2.6397366152946523e-05,
"loss": 0.3151,
"step": 2980
},
{
"epoch": 0.21,
"learning_rate": 2.5415584546038096e-05,
"loss": 0.316,
"step": 2990
},
{
"epoch": 0.21,
"learning_rate": 2.4451168167980497e-05,
"loss": 0.3191,
"step": 3000
},
{
"epoch": 0.21,
"learning_rate": 2.350421294549825e-05,
"loss": 0.3152,
"step": 3010
},
{
"epoch": 0.21,
"learning_rate": 2.2574813068522894e-05,
"loss": 0.3195,
"step": 3020
},
{
"epoch": 0.21,
"learning_rate": 2.166306098082451e-05,
"loss": 0.3139,
"step": 3030
},
{
"epoch": 0.21,
"learning_rate": 2.0769047370816553e-05,
"loss": 0.3214,
"step": 3040
},
{
"epoch": 0.21,
"learning_rate": 1.989286116253557e-05,
"loss": 0.3114,
"step": 3050
},
{
"epoch": 0.21,
"learning_rate": 1.903458950679613e-05,
"loss": 0.3203,
"step": 3060
},
{
"epoch": 0.22,
"learning_rate": 1.8194317772522362e-05,
"loss": 0.3132,
"step": 3070
},
{
"epoch": 0.22,
"learning_rate": 1.7372129538256667e-05,
"loss": 0.3091,
"step": 3080
},
{
"epoch": 0.22,
"learning_rate": 1.6568106583846378e-05,
"loss": 0.3111,
"step": 3090
},
{
"epoch": 0.22,
"learning_rate": 1.5782328882309484e-05,
"loss": 0.3126,
"step": 3100
},
{
"epoch": 0.22,
"learning_rate": 1.5014874591880157e-05,
"loss": 0.3173,
"step": 3110
},
{
"epoch": 0.22,
"learning_rate": 1.4265820048234447e-05,
"loss": 0.3084,
"step": 3120
},
{
"epoch": 0.22,
"learning_rate": 1.3535239756897566e-05,
"loss": 0.3133,
"step": 3130
},
{
"epoch": 0.22,
"learning_rate": 1.2823206385833187e-05,
"loss": 0.3182,
"step": 3140
},
{
"epoch": 0.22,
"learning_rate": 1.212979075821532e-05,
"loss": 0.3109,
"step": 3150
},
{
"epoch": 0.22,
"learning_rate": 1.1455061845383852e-05,
"loss": 0.3169,
"step": 3160
},
{
"epoch": 0.22,
"learning_rate": 1.0799086759984333e-05,
"loss": 0.3185,
"step": 3170
},
{
"epoch": 0.22,
"learning_rate": 1.016193074929237e-05,
"loss": 0.3035,
"step": 3180
},
{
"epoch": 0.22,
"learning_rate": 9.543657188723876e-06,
"loss": 0.3117,
"step": 3190
},
{
"epoch": 0.22,
"learning_rate": 8.944327575531275e-06,
"loss": 0.3204,
"step": 3200
},
{
"epoch": 0.23,
"learning_rate": 8.364001522686726e-06,
"loss": 0.3144,
"step": 3210
},
{
"epoch": 0.23,
"learning_rate": 7.802736752952533e-06,
"loss": 0.321,
"step": 3220
},
{
"epoch": 0.23,
"learning_rate": 7.260589093139736e-06,
"loss": 0.316,
"step": 3230
},
{
"epoch": 0.23,
"learning_rate": 6.737612468555221e-06,
"loss": 0.3133,
"step": 3240
},
{
"epoch": 0.23,
"learning_rate": 6.2338588976380115e-06,
"loss": 0.3107,
"step": 3250
},
{
"epoch": 0.23,
"learning_rate": 5.749378486785162e-06,
"loss": 0.3076,
"step": 3260
},
{
"epoch": 0.23,
"learning_rate": 5.2842194253679424e-06,
"loss": 0.3137,
"step": 3270
},
{
"epoch": 0.23,
"learning_rate": 4.8384279809385426e-06,
"loss": 0.3121,
"step": 3280
},
{
"epoch": 0.23,
"learning_rate": 4.41204849462804e-06,
"loss": 0.3065,
"step": 3290
},
{
"epoch": 0.23,
"learning_rate": 4.005123376735997e-06,
"loss": 0.3172,
"step": 3300
},
{
"epoch": 0.23,
"learning_rate": 3.617693102512032e-06,
"loss": 0.3183,
"step": 3310
},
{
"epoch": 0.23,
"learning_rate": 3.2497962081299514e-06,
"loss": 0.3141,
"step": 3320
},
{
"epoch": 0.23,
"learning_rate": 2.9014692868546633e-06,
"loss": 0.3094,
"step": 3330
},
{
"epoch": 0.23,
"learning_rate": 2.572746985402419e-06,
"loss": 0.3139,
"step": 3340
},
{
"epoch": 0.24,
"learning_rate": 2.2636620004946154e-06,
"loss": 0.3091,
"step": 3350
},
{
"epoch": 0.24,
"learning_rate": 1.974245075605574e-06,
"loss": 0.3092,
"step": 3360
},
{
"epoch": 0.24,
"learning_rate": 1.7045249979046995e-06,
"loss": 0.3108,
"step": 3370
},
{
"epoch": 0.24,
"learning_rate": 1.4545285953929677e-06,
"loss": 0.3098,
"step": 3380
},
{
"epoch": 0.24,
"learning_rate": 1.224280734234573e-06,
"loss": 0.3059,
"step": 3390
},
{
"epoch": 0.24,
"learning_rate": 1.013804316283573e-06,
"loss": 0.3073,
"step": 3400
},
{
"epoch": 0.24,
"learning_rate": 8.231202768059332e-07,
"loss": 0.306,
"step": 3410
},
{
"epoch": 0.24,
"learning_rate": 6.522475823970808e-07,
"loss": 0.3071,
"step": 3420
},
{
"epoch": 0.24,
"learning_rate": 5.012032290955037e-07,
"loss": 0.3033,
"step": 3430
},
{
"epoch": 0.24,
"learning_rate": 3.7000224069216883e-07,
"loss": 0.3109,
"step": 3440
},
{
"epoch": 0.24,
"learning_rate": 2.586576672361396e-07,
"loss": 0.3127,
"step": 3450
},
{
"epoch": 0.24,
"learning_rate": 1.671805837365703e-07,
"loss": 0.312,
"step": 3460
},
{
"epoch": 0.24,
"learning_rate": 9.558008906112026e-08,
"loss": 0.317,
"step": 3470
},
{
"epoch": 0.24,
"learning_rate": 4.386330503090008e-08,
"loss": 0.3092,
"step": 3480
},
{
"epoch": 0.24,
"learning_rate": 1.2035375712105001e-08,
"loss": 0.3142,
"step": 3490
},
{
"epoch": 0.25,
"learning_rate": 9.946690433526584e-11,
"loss": 0.3105,
"step": 3500
}
],
"max_steps": 3500,
"num_train_epochs": 1,
"total_flos": 1.12189130145792e+18,
"trial_name": null,
"trial_params": null
}