gemma2b-classification-gpt4o-100k / trainer_state.json
chansung's picture
Model save
b854a85 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 14.952978056426332,
"eval_steps": 500,
"global_step": 2385,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006269592476489028,
"grad_norm": 5.8125,
"learning_rate": 8.368200836820084e-07,
"loss": 3.8384,
"step": 1
},
{
"epoch": 0.03134796238244514,
"grad_norm": 4.96875,
"learning_rate": 4.184100418410042e-06,
"loss": 3.8397,
"step": 5
},
{
"epoch": 0.06269592476489028,
"grad_norm": 5.6875,
"learning_rate": 8.368200836820084e-06,
"loss": 3.739,
"step": 10
},
{
"epoch": 0.09404388714733543,
"grad_norm": 4.15625,
"learning_rate": 1.2552301255230125e-05,
"loss": 3.6737,
"step": 15
},
{
"epoch": 0.12539184952978055,
"grad_norm": 3.375,
"learning_rate": 1.6736401673640167e-05,
"loss": 3.4019,
"step": 20
},
{
"epoch": 0.15673981191222572,
"grad_norm": 3.1875,
"learning_rate": 2.092050209205021e-05,
"loss": 3.1898,
"step": 25
},
{
"epoch": 0.18808777429467086,
"grad_norm": 2.0,
"learning_rate": 2.510460251046025e-05,
"loss": 2.9745,
"step": 30
},
{
"epoch": 0.219435736677116,
"grad_norm": 1.7265625,
"learning_rate": 2.9288702928870294e-05,
"loss": 2.7919,
"step": 35
},
{
"epoch": 0.2507836990595611,
"grad_norm": 7.0625,
"learning_rate": 3.3472803347280334e-05,
"loss": 2.6087,
"step": 40
},
{
"epoch": 0.28213166144200624,
"grad_norm": 2.4375,
"learning_rate": 3.765690376569038e-05,
"loss": 2.4252,
"step": 45
},
{
"epoch": 0.31347962382445144,
"grad_norm": 0.89453125,
"learning_rate": 4.184100418410042e-05,
"loss": 2.2774,
"step": 50
},
{
"epoch": 0.3448275862068966,
"grad_norm": 0.8046875,
"learning_rate": 4.602510460251046e-05,
"loss": 2.1221,
"step": 55
},
{
"epoch": 0.3761755485893417,
"grad_norm": 0.890625,
"learning_rate": 5.02092050209205e-05,
"loss": 2.0078,
"step": 60
},
{
"epoch": 0.40752351097178685,
"grad_norm": 0.57421875,
"learning_rate": 5.4393305439330545e-05,
"loss": 1.9116,
"step": 65
},
{
"epoch": 0.438871473354232,
"grad_norm": 0.453125,
"learning_rate": 5.857740585774059e-05,
"loss": 1.8231,
"step": 70
},
{
"epoch": 0.4702194357366771,
"grad_norm": 1.2890625,
"learning_rate": 6.276150627615063e-05,
"loss": 1.7418,
"step": 75
},
{
"epoch": 0.5015673981191222,
"grad_norm": 0.67578125,
"learning_rate": 6.694560669456067e-05,
"loss": 1.6812,
"step": 80
},
{
"epoch": 0.5329153605015674,
"grad_norm": 0.41015625,
"learning_rate": 7.11297071129707e-05,
"loss": 1.6494,
"step": 85
},
{
"epoch": 0.5642633228840125,
"grad_norm": 1.046875,
"learning_rate": 7.531380753138076e-05,
"loss": 1.6192,
"step": 90
},
{
"epoch": 0.5956112852664577,
"grad_norm": 0.578125,
"learning_rate": 7.949790794979079e-05,
"loss": 1.6012,
"step": 95
},
{
"epoch": 0.6269592476489029,
"grad_norm": 0.7109375,
"learning_rate": 8.368200836820084e-05,
"loss": 1.5678,
"step": 100
},
{
"epoch": 0.658307210031348,
"grad_norm": 0.9765625,
"learning_rate": 8.786610878661088e-05,
"loss": 1.5555,
"step": 105
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.65234375,
"learning_rate": 9.205020920502092e-05,
"loss": 1.5386,
"step": 110
},
{
"epoch": 0.7210031347962382,
"grad_norm": 0.67578125,
"learning_rate": 9.623430962343097e-05,
"loss": 1.5285,
"step": 115
},
{
"epoch": 0.7523510971786834,
"grad_norm": 0.318359375,
"learning_rate": 0.000100418410041841,
"loss": 1.5008,
"step": 120
},
{
"epoch": 0.7836990595611285,
"grad_norm": 0.482421875,
"learning_rate": 0.00010460251046025104,
"loss": 1.485,
"step": 125
},
{
"epoch": 0.8150470219435737,
"grad_norm": 4.09375,
"learning_rate": 0.00010878661087866109,
"loss": 1.4749,
"step": 130
},
{
"epoch": 0.8463949843260188,
"grad_norm": 2.359375,
"learning_rate": 0.00011297071129707113,
"loss": 1.4928,
"step": 135
},
{
"epoch": 0.877742946708464,
"grad_norm": 1.640625,
"learning_rate": 0.00011715481171548118,
"loss": 1.4692,
"step": 140
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.359375,
"learning_rate": 0.00012133891213389121,
"loss": 1.4803,
"step": 145
},
{
"epoch": 0.9404388714733543,
"grad_norm": 0.349609375,
"learning_rate": 0.00012552301255230126,
"loss": 1.4609,
"step": 150
},
{
"epoch": 0.9717868338557993,
"grad_norm": 1.2265625,
"learning_rate": 0.0001297071129707113,
"loss": 1.4266,
"step": 155
},
{
"epoch": 0.9968652037617555,
"eval_loss": 1.9879965782165527,
"eval_runtime": 0.5558,
"eval_samples_per_second": 3.599,
"eval_steps_per_second": 1.799,
"step": 159
},
{
"epoch": 1.0031347962382444,
"grad_norm": 0.71875,
"learning_rate": 0.00013389121338912134,
"loss": 1.4278,
"step": 160
},
{
"epoch": 1.0344827586206897,
"grad_norm": 0.81640625,
"learning_rate": 0.00013807531380753137,
"loss": 1.4211,
"step": 165
},
{
"epoch": 1.0658307210031348,
"grad_norm": 0.5078125,
"learning_rate": 0.0001422594142259414,
"loss": 1.4012,
"step": 170
},
{
"epoch": 1.09717868338558,
"grad_norm": 2.828125,
"learning_rate": 0.00014644351464435147,
"loss": 1.4179,
"step": 175
},
{
"epoch": 1.1285266457680252,
"grad_norm": 4.40625,
"learning_rate": 0.0001506276150627615,
"loss": 1.4237,
"step": 180
},
{
"epoch": 1.1598746081504703,
"grad_norm": 1.4296875,
"learning_rate": 0.00015481171548117155,
"loss": 1.4272,
"step": 185
},
{
"epoch": 1.1912225705329154,
"grad_norm": 0.88671875,
"learning_rate": 0.00015899581589958158,
"loss": 1.4069,
"step": 190
},
{
"epoch": 1.2225705329153604,
"grad_norm": 0.6171875,
"learning_rate": 0.00016317991631799162,
"loss": 1.3882,
"step": 195
},
{
"epoch": 1.2539184952978055,
"grad_norm": 0.412109375,
"learning_rate": 0.00016736401673640169,
"loss": 1.376,
"step": 200
},
{
"epoch": 1.2852664576802508,
"grad_norm": 0.421875,
"learning_rate": 0.00017154811715481172,
"loss": 1.3793,
"step": 205
},
{
"epoch": 1.316614420062696,
"grad_norm": 0.58984375,
"learning_rate": 0.00017573221757322176,
"loss": 1.3698,
"step": 210
},
{
"epoch": 1.347962382445141,
"grad_norm": 1.3046875,
"learning_rate": 0.0001799163179916318,
"loss": 1.3695,
"step": 215
},
{
"epoch": 1.3793103448275863,
"grad_norm": 0.546875,
"learning_rate": 0.00018410041841004183,
"loss": 1.3761,
"step": 220
},
{
"epoch": 1.4106583072100314,
"grad_norm": 0.7265625,
"learning_rate": 0.0001882845188284519,
"loss": 1.3639,
"step": 225
},
{
"epoch": 1.4420062695924765,
"grad_norm": 1.2109375,
"learning_rate": 0.00019246861924686193,
"loss": 1.3572,
"step": 230
},
{
"epoch": 1.4733542319749215,
"grad_norm": 0.875,
"learning_rate": 0.00019665271966527197,
"loss": 1.3505,
"step": 235
},
{
"epoch": 1.5047021943573666,
"grad_norm": 0.5078125,
"learning_rate": 0.00019999989284554375,
"loss": 1.367,
"step": 240
},
{
"epoch": 1.536050156739812,
"grad_norm": 0.60546875,
"learning_rate": 0.00019999614246368665,
"loss": 1.3631,
"step": 245
},
{
"epoch": 1.567398119122257,
"grad_norm": 1.5546875,
"learning_rate": 0.0001999870345886555,
"loss": 1.3484,
"step": 250
},
{
"epoch": 1.5987460815047023,
"grad_norm": 1.28125,
"learning_rate": 0.00019997256970842288,
"loss": 1.335,
"step": 255
},
{
"epoch": 1.6300940438871474,
"grad_norm": 0.90625,
"learning_rate": 0.00019995274859797366,
"loss": 1.3461,
"step": 260
},
{
"epoch": 1.6614420062695925,
"grad_norm": 0.796875,
"learning_rate": 0.00019992757231926343,
"loss": 1.3332,
"step": 265
},
{
"epoch": 1.6927899686520376,
"grad_norm": 0.9296875,
"learning_rate": 0.00019989704222116167,
"loss": 1.3424,
"step": 270
},
{
"epoch": 1.7241379310344827,
"grad_norm": 0.54296875,
"learning_rate": 0.00019986115993937938,
"loss": 1.3278,
"step": 275
},
{
"epoch": 1.7554858934169277,
"grad_norm": 0.56640625,
"learning_rate": 0.00019981992739638148,
"loss": 1.3329,
"step": 280
},
{
"epoch": 1.786833855799373,
"grad_norm": 0.6953125,
"learning_rate": 0.00019977334680128394,
"loss": 1.3246,
"step": 285
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.4375,
"learning_rate": 0.00019972142064973519,
"loss": 1.3346,
"step": 290
},
{
"epoch": 1.8495297805642634,
"grad_norm": 0.61328125,
"learning_rate": 0.00019966415172378255,
"loss": 1.3236,
"step": 295
},
{
"epoch": 1.8808777429467085,
"grad_norm": 0.412109375,
"learning_rate": 0.00019960154309172322,
"loss": 1.3059,
"step": 300
},
{
"epoch": 1.9122257053291536,
"grad_norm": 0.546875,
"learning_rate": 0.00019953359810793978,
"loss": 1.2962,
"step": 305
},
{
"epoch": 1.9435736677115987,
"grad_norm": 0.41796875,
"learning_rate": 0.00019946032041272052,
"loss": 1.3079,
"step": 310
},
{
"epoch": 1.9749216300940438,
"grad_norm": 0.71875,
"learning_rate": 0.0001993817139320644,
"loss": 1.3029,
"step": 315
},
{
"epoch": 2.0,
"eval_loss": 1.9709800481796265,
"eval_runtime": 0.5506,
"eval_samples_per_second": 3.632,
"eval_steps_per_second": 1.816,
"step": 319
},
{
"epoch": 2.006269592476489,
"grad_norm": 0.58984375,
"learning_rate": 0.00019929778287747072,
"loss": 1.3202,
"step": 320
},
{
"epoch": 2.0376175548589344,
"grad_norm": 0.87109375,
"learning_rate": 0.00019920853174571347,
"loss": 1.2698,
"step": 325
},
{
"epoch": 2.0689655172413794,
"grad_norm": 0.361328125,
"learning_rate": 0.00019911396531860037,
"loss": 1.2581,
"step": 330
},
{
"epoch": 2.1003134796238245,
"grad_norm": 0.35546875,
"learning_rate": 0.00019901408866271678,
"loss": 1.2547,
"step": 335
},
{
"epoch": 2.1316614420062696,
"grad_norm": 0.54296875,
"learning_rate": 0.00019890890712915416,
"loss": 1.275,
"step": 340
},
{
"epoch": 2.1630094043887147,
"grad_norm": 0.388671875,
"learning_rate": 0.0001987984263532233,
"loss": 1.2573,
"step": 345
},
{
"epoch": 2.19435736677116,
"grad_norm": 0.369140625,
"learning_rate": 0.00019868265225415265,
"loss": 1.2681,
"step": 350
},
{
"epoch": 2.225705329153605,
"grad_norm": 0.5,
"learning_rate": 0.00019856159103477086,
"loss": 1.2735,
"step": 355
},
{
"epoch": 2.2570532915360504,
"grad_norm": 0.482421875,
"learning_rate": 0.00019843524918117475,
"loss": 1.2757,
"step": 360
},
{
"epoch": 2.2884012539184955,
"grad_norm": 0.41015625,
"learning_rate": 0.00019830363346238163,
"loss": 1.2594,
"step": 365
},
{
"epoch": 2.3197492163009406,
"grad_norm": 0.38671875,
"learning_rate": 0.00019816675092996665,
"loss": 1.248,
"step": 370
},
{
"epoch": 2.3510971786833856,
"grad_norm": 0.42578125,
"learning_rate": 0.000198024608917685,
"loss": 1.2508,
"step": 375
},
{
"epoch": 2.3824451410658307,
"grad_norm": 0.69140625,
"learning_rate": 0.00019787721504107916,
"loss": 1.2488,
"step": 380
},
{
"epoch": 2.413793103448276,
"grad_norm": 2.6875,
"learning_rate": 0.00019772457719707053,
"loss": 1.2454,
"step": 385
},
{
"epoch": 2.445141065830721,
"grad_norm": 1.1953125,
"learning_rate": 0.0001975667035635367,
"loss": 1.2667,
"step": 390
},
{
"epoch": 2.476489028213166,
"grad_norm": 0.34375,
"learning_rate": 0.00019740360259887308,
"loss": 1.2558,
"step": 395
},
{
"epoch": 2.507836990595611,
"grad_norm": 0.322265625,
"learning_rate": 0.00019723528304153984,
"loss": 1.2674,
"step": 400
},
{
"epoch": 2.5391849529780566,
"grad_norm": 0.435546875,
"learning_rate": 0.00019706175390959364,
"loss": 1.2715,
"step": 405
},
{
"epoch": 2.5705329153605017,
"grad_norm": 0.546875,
"learning_rate": 0.00019688302450020446,
"loss": 1.2679,
"step": 410
},
{
"epoch": 2.6018808777429467,
"grad_norm": 0.48046875,
"learning_rate": 0.00019669910438915763,
"loss": 1.2521,
"step": 415
},
{
"epoch": 2.633228840125392,
"grad_norm": 0.72265625,
"learning_rate": 0.00019651000343034073,
"loss": 1.2567,
"step": 420
},
{
"epoch": 2.664576802507837,
"grad_norm": 0.498046875,
"learning_rate": 0.00019631573175521547,
"loss": 1.2437,
"step": 425
},
{
"epoch": 2.695924764890282,
"grad_norm": 0.5390625,
"learning_rate": 0.0001961162997722751,
"loss": 1.242,
"step": 430
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.423828125,
"learning_rate": 0.0001959117181664867,
"loss": 1.2463,
"step": 435
},
{
"epoch": 2.7586206896551726,
"grad_norm": 0.73828125,
"learning_rate": 0.00019570199789871863,
"loss": 1.2465,
"step": 440
},
{
"epoch": 2.7899686520376177,
"grad_norm": 0.5625,
"learning_rate": 0.0001954871502051534,
"loss": 1.24,
"step": 445
},
{
"epoch": 2.8213166144200628,
"grad_norm": 0.353515625,
"learning_rate": 0.00019526718659668553,
"loss": 1.2382,
"step": 450
},
{
"epoch": 2.852664576802508,
"grad_norm": 0.34765625,
"learning_rate": 0.00019504211885830493,
"loss": 1.2405,
"step": 455
},
{
"epoch": 2.884012539184953,
"grad_norm": 0.384765625,
"learning_rate": 0.00019481195904846548,
"loss": 1.2526,
"step": 460
},
{
"epoch": 2.915360501567398,
"grad_norm": 0.359375,
"learning_rate": 0.000194576719498439,
"loss": 1.2287,
"step": 465
},
{
"epoch": 2.946708463949843,
"grad_norm": 0.373046875,
"learning_rate": 0.0001943364128116545,
"loss": 1.2388,
"step": 470
},
{
"epoch": 2.978056426332288,
"grad_norm": 0.287109375,
"learning_rate": 0.00019409105186302293,
"loss": 1.2414,
"step": 475
},
{
"epoch": 2.9968652037617556,
"eval_loss": 1.9793964624404907,
"eval_runtime": 0.5545,
"eval_samples_per_second": 3.607,
"eval_steps_per_second": 1.804,
"step": 478
},
{
"epoch": 3.0094043887147337,
"grad_norm": 0.3125,
"learning_rate": 0.00019384064979824752,
"loss": 1.2176,
"step": 480
},
{
"epoch": 3.040752351097179,
"grad_norm": 0.30859375,
"learning_rate": 0.00019358522003311927,
"loss": 1.187,
"step": 485
},
{
"epoch": 3.072100313479624,
"grad_norm": 0.318359375,
"learning_rate": 0.0001933247762527984,
"loss": 1.1861,
"step": 490
},
{
"epoch": 3.103448275862069,
"grad_norm": 0.361328125,
"learning_rate": 0.00019305933241108085,
"loss": 1.1895,
"step": 495
},
{
"epoch": 3.134796238244514,
"grad_norm": 0.353515625,
"learning_rate": 0.00019278890272965096,
"loss": 1.1912,
"step": 500
},
{
"epoch": 3.166144200626959,
"grad_norm": 0.5859375,
"learning_rate": 0.00019251350169731935,
"loss": 1.1844,
"step": 505
},
{
"epoch": 3.197492163009404,
"grad_norm": 0.48046875,
"learning_rate": 0.00019223314406924673,
"loss": 1.1933,
"step": 510
},
{
"epoch": 3.2288401253918497,
"grad_norm": 0.369140625,
"learning_rate": 0.0001919478448661533,
"loss": 1.1837,
"step": 515
},
{
"epoch": 3.260188087774295,
"grad_norm": 0.34765625,
"learning_rate": 0.0001916576193735141,
"loss": 1.1817,
"step": 520
},
{
"epoch": 3.29153605015674,
"grad_norm": 0.5,
"learning_rate": 0.00019136248314073983,
"loss": 1.1935,
"step": 525
},
{
"epoch": 3.322884012539185,
"grad_norm": 0.41015625,
"learning_rate": 0.00019106245198034403,
"loss": 1.1726,
"step": 530
},
{
"epoch": 3.35423197492163,
"grad_norm": 0.349609375,
"learning_rate": 0.00019075754196709572,
"loss": 1.1995,
"step": 535
},
{
"epoch": 3.385579937304075,
"grad_norm": 0.33984375,
"learning_rate": 0.0001904477694371582,
"loss": 1.1782,
"step": 540
},
{
"epoch": 3.41692789968652,
"grad_norm": 0.4921875,
"learning_rate": 0.00019013315098721388,
"loss": 1.2003,
"step": 545
},
{
"epoch": 3.4482758620689653,
"grad_norm": 0.337890625,
"learning_rate": 0.00018981370347357493,
"loss": 1.1869,
"step": 550
},
{
"epoch": 3.479623824451411,
"grad_norm": 0.5546875,
"learning_rate": 0.00018948944401128034,
"loss": 1.1821,
"step": 555
},
{
"epoch": 3.510971786833856,
"grad_norm": 0.33984375,
"learning_rate": 0.00018916038997317887,
"loss": 1.1851,
"step": 560
},
{
"epoch": 3.542319749216301,
"grad_norm": 0.310546875,
"learning_rate": 0.0001888265589889981,
"loss": 1.1873,
"step": 565
},
{
"epoch": 3.573667711598746,
"grad_norm": 0.349609375,
"learning_rate": 0.00018848796894440031,
"loss": 1.1952,
"step": 570
},
{
"epoch": 3.605015673981191,
"grad_norm": 0.322265625,
"learning_rate": 0.00018814463798002372,
"loss": 1.1829,
"step": 575
},
{
"epoch": 3.6363636363636362,
"grad_norm": 0.328125,
"learning_rate": 0.00018779658449051092,
"loss": 1.1979,
"step": 580
},
{
"epoch": 3.6677115987460818,
"grad_norm": 0.427734375,
"learning_rate": 0.00018744382712352318,
"loss": 1.1867,
"step": 585
},
{
"epoch": 3.699059561128527,
"grad_norm": 0.5234375,
"learning_rate": 0.00018708638477874144,
"loss": 1.1933,
"step": 590
},
{
"epoch": 3.730407523510972,
"grad_norm": 0.83984375,
"learning_rate": 0.00018672427660685364,
"loss": 1.1699,
"step": 595
},
{
"epoch": 3.761755485893417,
"grad_norm": 0.5625,
"learning_rate": 0.00018635752200852877,
"loss": 1.1757,
"step": 600
},
{
"epoch": 3.793103448275862,
"grad_norm": 0.412109375,
"learning_rate": 0.00018598614063337744,
"loss": 1.1991,
"step": 605
},
{
"epoch": 3.824451410658307,
"grad_norm": 0.48046875,
"learning_rate": 0.00018561015237889895,
"loss": 1.1871,
"step": 610
},
{
"epoch": 3.8557993730407523,
"grad_norm": 0.3203125,
"learning_rate": 0.0001852295773894155,
"loss": 1.1968,
"step": 615
},
{
"epoch": 3.8871473354231973,
"grad_norm": 0.4296875,
"learning_rate": 0.00018484443605499266,
"loss": 1.1792,
"step": 620
},
{
"epoch": 3.9184952978056424,
"grad_norm": 0.330078125,
"learning_rate": 0.0001844547490103472,
"loss": 1.2007,
"step": 625
},
{
"epoch": 3.9498432601880875,
"grad_norm": 0.31640625,
"learning_rate": 0.0001840605371337413,
"loss": 1.1966,
"step": 630
},
{
"epoch": 3.981191222570533,
"grad_norm": 0.34765625,
"learning_rate": 0.00018366182154586406,
"loss": 1.2012,
"step": 635
},
{
"epoch": 4.0,
"eval_loss": 2.0134387016296387,
"eval_runtime": 0.5449,
"eval_samples_per_second": 3.67,
"eval_steps_per_second": 1.835,
"step": 638
},
{
"epoch": 4.012539184952978,
"grad_norm": 0.34375,
"learning_rate": 0.00018325862360869994,
"loss": 1.1633,
"step": 640
},
{
"epoch": 4.043887147335423,
"grad_norm": 0.357421875,
"learning_rate": 0.00018285096492438424,
"loss": 1.1407,
"step": 645
},
{
"epoch": 4.075235109717869,
"grad_norm": 0.365234375,
"learning_rate": 0.00018243886733404564,
"loss": 1.1271,
"step": 650
},
{
"epoch": 4.106583072100314,
"grad_norm": 0.373046875,
"learning_rate": 0.0001820223529166361,
"loss": 1.1199,
"step": 655
},
{
"epoch": 4.137931034482759,
"grad_norm": 0.52734375,
"learning_rate": 0.00018160144398774797,
"loss": 1.1245,
"step": 660
},
{
"epoch": 4.169278996865204,
"grad_norm": 0.44140625,
"learning_rate": 0.0001811761630984183,
"loss": 1.1182,
"step": 665
},
{
"epoch": 4.200626959247649,
"grad_norm": 0.357421875,
"learning_rate": 0.00018074653303392063,
"loss": 1.1331,
"step": 670
},
{
"epoch": 4.231974921630094,
"grad_norm": 0.412109375,
"learning_rate": 0.0001803125768125443,
"loss": 1.1308,
"step": 675
},
{
"epoch": 4.263322884012539,
"grad_norm": 0.337890625,
"learning_rate": 0.0001798743176843611,
"loss": 1.1312,
"step": 680
},
{
"epoch": 4.294670846394984,
"grad_norm": 0.35546875,
"learning_rate": 0.00017943177912997971,
"loss": 1.1162,
"step": 685
},
{
"epoch": 4.326018808777429,
"grad_norm": 0.36328125,
"learning_rate": 0.00017898498485928763,
"loss": 1.1379,
"step": 690
},
{
"epoch": 4.3573667711598745,
"grad_norm": 0.34375,
"learning_rate": 0.00017853395881018073,
"loss": 1.1399,
"step": 695
},
{
"epoch": 4.38871473354232,
"grad_norm": 0.421875,
"learning_rate": 0.00017807872514728106,
"loss": 1.1272,
"step": 700
},
{
"epoch": 4.420062695924765,
"grad_norm": 0.40234375,
"learning_rate": 0.00017761930826064182,
"loss": 1.1293,
"step": 705
},
{
"epoch": 4.45141065830721,
"grad_norm": 0.48046875,
"learning_rate": 0.00017715573276444086,
"loss": 1.1315,
"step": 710
},
{
"epoch": 4.482758620689655,
"grad_norm": 0.353515625,
"learning_rate": 0.0001766880234956619,
"loss": 1.1355,
"step": 715
},
{
"epoch": 4.514106583072101,
"grad_norm": 0.333984375,
"learning_rate": 0.00017621620551276366,
"loss": 1.1434,
"step": 720
},
{
"epoch": 4.545454545454545,
"grad_norm": 0.38671875,
"learning_rate": 0.00017574030409433751,
"loss": 1.1433,
"step": 725
},
{
"epoch": 4.576802507836991,
"grad_norm": 0.4765625,
"learning_rate": 0.00017526034473775307,
"loss": 1.1341,
"step": 730
},
{
"epoch": 4.608150470219436,
"grad_norm": 0.380859375,
"learning_rate": 0.00017477635315779204,
"loss": 1.1352,
"step": 735
},
{
"epoch": 4.639498432601881,
"grad_norm": 0.421875,
"learning_rate": 0.0001742883552852706,
"loss": 1.1428,
"step": 740
},
{
"epoch": 4.670846394984326,
"grad_norm": 0.439453125,
"learning_rate": 0.00017379637726564994,
"loss": 1.1337,
"step": 745
},
{
"epoch": 4.702194357366771,
"grad_norm": 0.42578125,
"learning_rate": 0.00017330044545763574,
"loss": 1.1469,
"step": 750
},
{
"epoch": 4.733542319749216,
"grad_norm": 0.412109375,
"learning_rate": 0.00017280058643176578,
"loss": 1.1318,
"step": 755
},
{
"epoch": 4.764890282131661,
"grad_norm": 0.322265625,
"learning_rate": 0.00017229682696898624,
"loss": 1.1402,
"step": 760
},
{
"epoch": 4.7962382445141065,
"grad_norm": 0.314453125,
"learning_rate": 0.00017178919405921717,
"loss": 1.1288,
"step": 765
},
{
"epoch": 4.827586206896552,
"grad_norm": 0.419921875,
"learning_rate": 0.00017127771489990613,
"loss": 1.1298,
"step": 770
},
{
"epoch": 4.858934169278997,
"grad_norm": 0.35546875,
"learning_rate": 0.00017076241689457136,
"loss": 1.1386,
"step": 775
},
{
"epoch": 4.890282131661442,
"grad_norm": 0.333984375,
"learning_rate": 0.00017024332765133325,
"loss": 1.14,
"step": 780
},
{
"epoch": 4.921630094043887,
"grad_norm": 0.6171875,
"learning_rate": 0.00016972047498143544,
"loss": 1.1444,
"step": 785
},
{
"epoch": 4.952978056426332,
"grad_norm": 0.333984375,
"learning_rate": 0.00016919388689775464,
"loss": 1.1466,
"step": 790
},
{
"epoch": 4.984326018808778,
"grad_norm": 0.396484375,
"learning_rate": 0.0001686635916132998,
"loss": 1.1513,
"step": 795
},
{
"epoch": 4.996865203761756,
"eval_loss": 2.0582528114318848,
"eval_runtime": 0.5554,
"eval_samples_per_second": 3.601,
"eval_steps_per_second": 1.801,
"step": 797
},
{
"epoch": 5.015673981191223,
"grad_norm": 0.326171875,
"learning_rate": 0.00016812961753970054,
"loss": 1.1118,
"step": 800
},
{
"epoch": 5.047021943573668,
"grad_norm": 0.345703125,
"learning_rate": 0.00016759199328568504,
"loss": 1.0654,
"step": 805
},
{
"epoch": 5.078369905956113,
"grad_norm": 0.34375,
"learning_rate": 0.00016705074765554717,
"loss": 1.0557,
"step": 810
},
{
"epoch": 5.109717868338558,
"grad_norm": 0.369140625,
"learning_rate": 0.0001665059096476032,
"loss": 1.0685,
"step": 815
},
{
"epoch": 5.141065830721003,
"grad_norm": 0.369140625,
"learning_rate": 0.00016595750845263825,
"loss": 1.073,
"step": 820
},
{
"epoch": 5.172413793103448,
"grad_norm": 0.4296875,
"learning_rate": 0.00016540557345234237,
"loss": 1.0784,
"step": 825
},
{
"epoch": 5.2037617554858935,
"grad_norm": 0.4765625,
"learning_rate": 0.00016485013421773615,
"loss": 1.0628,
"step": 830
},
{
"epoch": 5.235109717868339,
"grad_norm": 0.396484375,
"learning_rate": 0.00016429122050758672,
"loss": 1.0822,
"step": 835
},
{
"epoch": 5.266457680250784,
"grad_norm": 0.52734375,
"learning_rate": 0.00016372886226681302,
"loss": 1.0748,
"step": 840
},
{
"epoch": 5.297805642633229,
"grad_norm": 0.359375,
"learning_rate": 0.00016316308962488173,
"loss": 1.0867,
"step": 845
},
{
"epoch": 5.329153605015674,
"grad_norm": 0.5234375,
"learning_rate": 0.00016259393289419277,
"loss": 1.0796,
"step": 850
},
{
"epoch": 5.360501567398119,
"grad_norm": 0.38671875,
"learning_rate": 0.00016202142256845553,
"loss": 1.0896,
"step": 855
},
{
"epoch": 5.391849529780564,
"grad_norm": 0.59765625,
"learning_rate": 0.00016144558932105473,
"loss": 1.0802,
"step": 860
},
{
"epoch": 5.423197492163009,
"grad_norm": 0.357421875,
"learning_rate": 0.00016086646400340757,
"loss": 1.0688,
"step": 865
},
{
"epoch": 5.454545454545454,
"grad_norm": 0.466796875,
"learning_rate": 0.00016028407764331014,
"loss": 1.0836,
"step": 870
},
{
"epoch": 5.485893416927899,
"grad_norm": 0.5,
"learning_rate": 0.00015969846144327574,
"loss": 1.0807,
"step": 875
},
{
"epoch": 5.517241379310345,
"grad_norm": 0.361328125,
"learning_rate": 0.0001591096467788625,
"loss": 1.0957,
"step": 880
},
{
"epoch": 5.54858934169279,
"grad_norm": 0.453125,
"learning_rate": 0.00015851766519699295,
"loss": 1.0724,
"step": 885
},
{
"epoch": 5.579937304075235,
"grad_norm": 0.40234375,
"learning_rate": 0.00015792254841426328,
"loss": 1.0989,
"step": 890
},
{
"epoch": 5.61128526645768,
"grad_norm": 0.4140625,
"learning_rate": 0.00015732432831524448,
"loss": 1.0886,
"step": 895
},
{
"epoch": 5.6426332288401255,
"grad_norm": 0.33203125,
"learning_rate": 0.00015672303695077398,
"loss": 1.0961,
"step": 900
},
{
"epoch": 5.673981191222571,
"grad_norm": 0.34765625,
"learning_rate": 0.00015611870653623825,
"loss": 1.0964,
"step": 905
},
{
"epoch": 5.705329153605016,
"grad_norm": 0.373046875,
"learning_rate": 0.00015551136944984699,
"loss": 1.0895,
"step": 910
},
{
"epoch": 5.736677115987461,
"grad_norm": 0.453125,
"learning_rate": 0.0001549010582308984,
"loss": 1.0814,
"step": 915
},
{
"epoch": 5.768025078369906,
"grad_norm": 0.388671875,
"learning_rate": 0.00015428780557803567,
"loss": 1.0926,
"step": 920
},
{
"epoch": 5.799373040752351,
"grad_norm": 0.515625,
"learning_rate": 0.00015367164434749534,
"loss": 1.0849,
"step": 925
},
{
"epoch": 5.830721003134796,
"grad_norm": 0.33203125,
"learning_rate": 0.00015305260755134667,
"loss": 1.0934,
"step": 930
},
{
"epoch": 5.862068965517241,
"grad_norm": 0.36328125,
"learning_rate": 0.00015243072835572318,
"loss": 1.0942,
"step": 935
},
{
"epoch": 5.893416927899686,
"grad_norm": 0.447265625,
"learning_rate": 0.0001518060400790456,
"loss": 1.0832,
"step": 940
},
{
"epoch": 5.924764890282132,
"grad_norm": 0.365234375,
"learning_rate": 0.00015117857619023677,
"loss": 1.0944,
"step": 945
},
{
"epoch": 5.956112852664576,
"grad_norm": 0.392578125,
"learning_rate": 0.00015054837030692854,
"loss": 1.0972,
"step": 950
},
{
"epoch": 5.987460815047022,
"grad_norm": 0.412109375,
"learning_rate": 0.00014991545619366054,
"loss": 1.0951,
"step": 955
},
{
"epoch": 6.0,
"eval_loss": 2.108396291732788,
"eval_runtime": 0.554,
"eval_samples_per_second": 3.61,
"eval_steps_per_second": 1.805,
"step": 957
},
{
"epoch": 6.018808777429467,
"grad_norm": 0.427734375,
"learning_rate": 0.00014927986776007128,
"loss": 1.054,
"step": 960
},
{
"epoch": 6.0501567398119125,
"grad_norm": 0.45703125,
"learning_rate": 0.00014864163905908132,
"loss": 1.0222,
"step": 965
},
{
"epoch": 6.081504702194358,
"grad_norm": 0.76953125,
"learning_rate": 0.00014800080428506882,
"loss": 1.0209,
"step": 970
},
{
"epoch": 6.112852664576803,
"grad_norm": 0.384765625,
"learning_rate": 0.00014735739777203745,
"loss": 1.0167,
"step": 975
},
{
"epoch": 6.144200626959248,
"grad_norm": 0.51171875,
"learning_rate": 0.000146711453991777,
"loss": 1.0113,
"step": 980
},
{
"epoch": 6.175548589341693,
"grad_norm": 0.455078125,
"learning_rate": 0.00014606300755201645,
"loss": 1.019,
"step": 985
},
{
"epoch": 6.206896551724138,
"grad_norm": 0.41796875,
"learning_rate": 0.00014541209319456972,
"loss": 1.0317,
"step": 990
},
{
"epoch": 6.238244514106583,
"grad_norm": 0.51953125,
"learning_rate": 0.00014475874579347435,
"loss": 1.0342,
"step": 995
},
{
"epoch": 6.269592476489028,
"grad_norm": 0.51953125,
"learning_rate": 0.00014410300035312302,
"loss": 1.0258,
"step": 1000
},
{
"epoch": 6.300940438871473,
"grad_norm": 0.65625,
"learning_rate": 0.00014344489200638827,
"loss": 1.0393,
"step": 1005
},
{
"epoch": 6.332288401253918,
"grad_norm": 0.65625,
"learning_rate": 0.00014278445601274,
"loss": 1.038,
"step": 1010
},
{
"epoch": 6.363636363636363,
"grad_norm": 0.53515625,
"learning_rate": 0.00014212172775635633,
"loss": 1.0334,
"step": 1015
},
{
"epoch": 6.394984326018808,
"grad_norm": 0.376953125,
"learning_rate": 0.0001414567427442282,
"loss": 1.0272,
"step": 1020
},
{
"epoch": 6.4263322884012535,
"grad_norm": 0.5390625,
"learning_rate": 0.00014078953660425652,
"loss": 1.0298,
"step": 1025
},
{
"epoch": 6.4576802507836994,
"grad_norm": 0.443359375,
"learning_rate": 0.00014012014508334365,
"loss": 1.0337,
"step": 1030
},
{
"epoch": 6.4890282131661445,
"grad_norm": 0.416015625,
"learning_rate": 0.00013944860404547816,
"loss": 1.0285,
"step": 1035
},
{
"epoch": 6.52037617554859,
"grad_norm": 0.38671875,
"learning_rate": 0.00013877494946981314,
"loss": 1.041,
"step": 1040
},
{
"epoch": 6.551724137931035,
"grad_norm": 0.42578125,
"learning_rate": 0.00013809921744873885,
"loss": 1.0319,
"step": 1045
},
{
"epoch": 6.58307210031348,
"grad_norm": 0.416015625,
"learning_rate": 0.0001374214441859487,
"loss": 1.0311,
"step": 1050
},
{
"epoch": 6.614420062695925,
"grad_norm": 0.40625,
"learning_rate": 0.00013674166599449977,
"loss": 1.0299,
"step": 1055
},
{
"epoch": 6.64576802507837,
"grad_norm": 0.353515625,
"learning_rate": 0.0001360599192948673,
"loss": 1.0435,
"step": 1060
},
{
"epoch": 6.677115987460815,
"grad_norm": 0.380859375,
"learning_rate": 0.00013537624061299303,
"loss": 1.0342,
"step": 1065
},
{
"epoch": 6.70846394984326,
"grad_norm": 0.365234375,
"learning_rate": 0.0001346906665783288,
"loss": 1.0426,
"step": 1070
},
{
"epoch": 6.739811912225705,
"grad_norm": 0.369140625,
"learning_rate": 0.00013400323392187357,
"loss": 1.0424,
"step": 1075
},
{
"epoch": 6.77115987460815,
"grad_norm": 0.380859375,
"learning_rate": 0.00013331397947420576,
"loss": 1.0644,
"step": 1080
},
{
"epoch": 6.802507836990595,
"grad_norm": 0.435546875,
"learning_rate": 0.00013262294016350986,
"loss": 1.0373,
"step": 1085
},
{
"epoch": 6.83385579937304,
"grad_norm": 0.443359375,
"learning_rate": 0.000131930153013598,
"loss": 1.0423,
"step": 1090
},
{
"epoch": 6.8652037617554855,
"grad_norm": 0.46875,
"learning_rate": 0.00013123565514192625,
"loss": 1.0421,
"step": 1095
},
{
"epoch": 6.896551724137931,
"grad_norm": 0.380859375,
"learning_rate": 0.00013053948375760604,
"loss": 1.04,
"step": 1100
},
{
"epoch": 6.927899686520377,
"grad_norm": 0.37890625,
"learning_rate": 0.00012984167615941056,
"loss": 1.0378,
"step": 1105
},
{
"epoch": 6.959247648902822,
"grad_norm": 0.451171875,
"learning_rate": 0.00012914226973377644,
"loss": 1.0383,
"step": 1110
},
{
"epoch": 6.990595611285267,
"grad_norm": 0.390625,
"learning_rate": 0.00012844130195280076,
"loss": 1.0414,
"step": 1115
},
{
"epoch": 6.996865203761756,
"eval_loss": 2.2094361782073975,
"eval_runtime": 0.5549,
"eval_samples_per_second": 3.604,
"eval_steps_per_second": 1.802,
"step": 1116
},
{
"epoch": 7.021943573667712,
"grad_norm": 0.416015625,
"learning_rate": 0.0001277388103722332,
"loss": 0.9864,
"step": 1120
},
{
"epoch": 7.053291536050157,
"grad_norm": 0.447265625,
"learning_rate": 0.00012703483262946415,
"loss": 0.9734,
"step": 1125
},
{
"epoch": 7.084639498432602,
"grad_norm": 0.4921875,
"learning_rate": 0.000126329406441508,
"loss": 0.9689,
"step": 1130
},
{
"epoch": 7.115987460815047,
"grad_norm": 0.43359375,
"learning_rate": 0.00012562256960298266,
"loss": 0.9804,
"step": 1135
},
{
"epoch": 7.147335423197492,
"grad_norm": 0.4453125,
"learning_rate": 0.0001249143599840843,
"loss": 0.9741,
"step": 1140
},
{
"epoch": 7.178683385579937,
"grad_norm": 0.5,
"learning_rate": 0.00012420481552855863,
"loss": 0.9766,
"step": 1145
},
{
"epoch": 7.210031347962382,
"grad_norm": 0.458984375,
"learning_rate": 0.00012349397425166786,
"loss": 0.9763,
"step": 1150
},
{
"epoch": 7.241379310344827,
"grad_norm": 0.44140625,
"learning_rate": 0.000122781874238154,
"loss": 0.9791,
"step": 1155
},
{
"epoch": 7.2727272727272725,
"grad_norm": 0.400390625,
"learning_rate": 0.00012206855364019845,
"loss": 0.9773,
"step": 1160
},
{
"epoch": 7.304075235109718,
"grad_norm": 0.40625,
"learning_rate": 0.00012135405067537777,
"loss": 0.9873,
"step": 1165
},
{
"epoch": 7.335423197492163,
"grad_norm": 0.470703125,
"learning_rate": 0.0001206384036246162,
"loss": 0.9888,
"step": 1170
},
{
"epoch": 7.366771159874608,
"grad_norm": 0.404296875,
"learning_rate": 0.0001199216508301348,
"loss": 0.9731,
"step": 1175
},
{
"epoch": 7.398119122257054,
"grad_norm": 0.41015625,
"learning_rate": 0.00011920383069339684,
"loss": 0.9975,
"step": 1180
},
{
"epoch": 7.429467084639499,
"grad_norm": 0.421875,
"learning_rate": 0.00011848498167305078,
"loss": 0.9835,
"step": 1185
},
{
"epoch": 7.460815047021944,
"grad_norm": 0.447265625,
"learning_rate": 0.0001177651422828695,
"loss": 0.9779,
"step": 1190
},
{
"epoch": 7.492163009404389,
"grad_norm": 0.46875,
"learning_rate": 0.00011704435108968688,
"loss": 0.9782,
"step": 1195
},
{
"epoch": 7.523510971786834,
"grad_norm": 0.447265625,
"learning_rate": 0.00011632264671133162,
"loss": 0.9797,
"step": 1200
},
{
"epoch": 7.554858934169279,
"grad_norm": 0.408203125,
"learning_rate": 0.00011560006781455812,
"loss": 0.9956,
"step": 1205
},
{
"epoch": 7.586206896551724,
"grad_norm": 0.419921875,
"learning_rate": 0.00011487665311297484,
"loss": 0.9923,
"step": 1210
},
{
"epoch": 7.617554858934169,
"grad_norm": 0.4296875,
"learning_rate": 0.00011415244136497013,
"loss": 0.9866,
"step": 1215
},
{
"epoch": 7.648902821316614,
"grad_norm": 0.4375,
"learning_rate": 0.00011342747137163572,
"loss": 0.9932,
"step": 1220
},
{
"epoch": 7.6802507836990594,
"grad_norm": 0.4140625,
"learning_rate": 0.00011270178197468789,
"loss": 0.9841,
"step": 1225
},
{
"epoch": 7.7115987460815045,
"grad_norm": 0.41796875,
"learning_rate": 0.00011197541205438634,
"loss": 0.9863,
"step": 1230
},
{
"epoch": 7.74294670846395,
"grad_norm": 0.400390625,
"learning_rate": 0.0001112484005274512,
"loss": 0.9951,
"step": 1235
},
{
"epoch": 7.774294670846395,
"grad_norm": 0.4140625,
"learning_rate": 0.00011052078634497796,
"loss": 0.9847,
"step": 1240
},
{
"epoch": 7.80564263322884,
"grad_norm": 0.427734375,
"learning_rate": 0.00010979260849035054,
"loss": 0.9868,
"step": 1245
},
{
"epoch": 7.836990595611285,
"grad_norm": 0.3984375,
"learning_rate": 0.00010906390597715282,
"loss": 0.9874,
"step": 1250
},
{
"epoch": 7.868338557993731,
"grad_norm": 0.453125,
"learning_rate": 0.00010833471784707824,
"loss": 0.9928,
"step": 1255
},
{
"epoch": 7.899686520376176,
"grad_norm": 0.41015625,
"learning_rate": 0.00010760508316783808,
"loss": 1.0034,
"step": 1260
},
{
"epoch": 7.931034482758621,
"grad_norm": 0.416015625,
"learning_rate": 0.00010687504103106854,
"loss": 0.9844,
"step": 1265
},
{
"epoch": 7.962382445141066,
"grad_norm": 0.396484375,
"learning_rate": 0.000106144630550236,
"loss": 0.986,
"step": 1270
},
{
"epoch": 7.993730407523511,
"grad_norm": 0.400390625,
"learning_rate": 0.00010541389085854176,
"loss": 1.0041,
"step": 1275
},
{
"epoch": 8.0,
"eval_loss": 2.304290294647217,
"eval_runtime": 0.554,
"eval_samples_per_second": 3.61,
"eval_steps_per_second": 1.805,
"step": 1276
},
{
"epoch": 8.025078369905955,
"grad_norm": 0.44140625,
"learning_rate": 0.00010468286110682517,
"loss": 0.9349,
"step": 1280
},
{
"epoch": 8.056426332288401,
"grad_norm": 0.4296875,
"learning_rate": 0.00010395158046146606,
"loss": 0.915,
"step": 1285
},
{
"epoch": 8.087774294670846,
"grad_norm": 0.470703125,
"learning_rate": 0.00010322008810228657,
"loss": 0.935,
"step": 1290
},
{
"epoch": 8.119122257053291,
"grad_norm": 0.41015625,
"learning_rate": 0.00010248842322045164,
"loss": 0.9215,
"step": 1295
},
{
"epoch": 8.150470219435737,
"grad_norm": 0.462890625,
"learning_rate": 0.0001017566250163696,
"loss": 0.9316,
"step": 1300
},
{
"epoch": 8.181818181818182,
"grad_norm": 0.453125,
"learning_rate": 0.00010102473269759171,
"loss": 0.9211,
"step": 1305
},
{
"epoch": 8.213166144200628,
"grad_norm": 0.431640625,
"learning_rate": 0.00010029278547671161,
"loss": 0.9244,
"step": 1310
},
{
"epoch": 8.244514106583072,
"grad_norm": 0.4296875,
"learning_rate": 9.956082256926448e-05,
"loss": 0.9338,
"step": 1315
},
{
"epoch": 8.275862068965518,
"grad_norm": 0.44921875,
"learning_rate": 9.88288831916259e-05,
"loss": 0.9279,
"step": 1320
},
{
"epoch": 8.307210031347962,
"grad_norm": 0.44921875,
"learning_rate": 9.80970065589108e-05,
"loss": 0.9312,
"step": 1325
},
{
"epoch": 8.338557993730408,
"grad_norm": 0.44921875,
"learning_rate": 9.73652318828724e-05,
"loss": 0.9378,
"step": 1330
},
{
"epoch": 8.369905956112852,
"grad_norm": 0.4765625,
"learning_rate": 9.663359836980144e-05,
"loss": 0.934,
"step": 1335
},
{
"epoch": 8.401253918495298,
"grad_norm": 0.515625,
"learning_rate": 9.590214521842556e-05,
"loss": 0.9366,
"step": 1340
},
{
"epoch": 8.432601880877742,
"grad_norm": 0.453125,
"learning_rate": 9.517091161780914e-05,
"loss": 0.9317,
"step": 1345
},
{
"epoch": 8.463949843260188,
"grad_norm": 0.474609375,
"learning_rate": 9.443993674525368e-05,
"loss": 0.9535,
"step": 1350
},
{
"epoch": 8.495297805642632,
"grad_norm": 0.4609375,
"learning_rate": 9.370925976419885e-05,
"loss": 0.9418,
"step": 1355
},
{
"epoch": 8.526645768025078,
"grad_norm": 0.451171875,
"learning_rate": 9.297891982212415e-05,
"loss": 0.9457,
"step": 1360
},
{
"epoch": 8.557993730407524,
"grad_norm": 0.5,
"learning_rate": 9.224895604845156e-05,
"loss": 0.9307,
"step": 1365
},
{
"epoch": 8.589341692789969,
"grad_norm": 0.49609375,
"learning_rate": 9.151940755244912e-05,
"loss": 0.9359,
"step": 1370
},
{
"epoch": 8.620689655172415,
"grad_norm": 0.462890625,
"learning_rate": 9.07903134211354e-05,
"loss": 0.9451,
"step": 1375
},
{
"epoch": 8.652037617554859,
"grad_norm": 0.443359375,
"learning_rate": 9.006171271718566e-05,
"loss": 0.9396,
"step": 1380
},
{
"epoch": 8.683385579937305,
"grad_norm": 0.42578125,
"learning_rate": 8.933364447683868e-05,
"loss": 0.9376,
"step": 1385
},
{
"epoch": 8.714733542319749,
"grad_norm": 0.44921875,
"learning_rate": 8.860614770780553e-05,
"loss": 0.9465,
"step": 1390
},
{
"epoch": 8.746081504702195,
"grad_norm": 0.431640625,
"learning_rate": 8.787926138717943e-05,
"loss": 0.9391,
"step": 1395
},
{
"epoch": 8.77742946708464,
"grad_norm": 0.443359375,
"learning_rate": 8.715302445934773e-05,
"loss": 0.9545,
"step": 1400
},
{
"epoch": 8.808777429467085,
"grad_norm": 0.46875,
"learning_rate": 8.642747583390521e-05,
"loss": 0.9418,
"step": 1405
},
{
"epoch": 8.84012539184953,
"grad_norm": 0.44921875,
"learning_rate": 8.570265438356948e-05,
"loss": 0.9383,
"step": 1410
},
{
"epoch": 8.871473354231975,
"grad_norm": 0.447265625,
"learning_rate": 8.497859894209828e-05,
"loss": 0.9524,
"step": 1415
},
{
"epoch": 8.90282131661442,
"grad_norm": 0.470703125,
"learning_rate": 8.425534830220893e-05,
"loss": 0.9504,
"step": 1420
},
{
"epoch": 8.934169278996865,
"grad_norm": 0.435546875,
"learning_rate": 8.353294121349992e-05,
"loss": 0.9448,
"step": 1425
},
{
"epoch": 8.96551724137931,
"grad_norm": 0.431640625,
"learning_rate": 8.281141638037464e-05,
"loss": 0.9385,
"step": 1430
},
{
"epoch": 8.996865203761756,
"grad_norm": 0.439453125,
"learning_rate": 8.209081245996807e-05,
"loss": 0.9481,
"step": 1435
},
{
"epoch": 8.996865203761756,
"eval_loss": 2.398902416229248,
"eval_runtime": 0.5512,
"eval_samples_per_second": 3.628,
"eval_steps_per_second": 1.814,
"step": 1435
},
{
"epoch": 9.0282131661442,
"grad_norm": 0.453125,
"learning_rate": 8.137116806007531e-05,
"loss": 0.8853,
"step": 1440
},
{
"epoch": 9.059561128526646,
"grad_norm": 0.443359375,
"learning_rate": 8.065252173708333e-05,
"loss": 0.8874,
"step": 1445
},
{
"epoch": 9.090909090909092,
"grad_norm": 0.49609375,
"learning_rate": 7.993491199390507e-05,
"loss": 0.8784,
"step": 1450
},
{
"epoch": 9.122257053291536,
"grad_norm": 0.443359375,
"learning_rate": 7.921837727791673e-05,
"loss": 0.8917,
"step": 1455
},
{
"epoch": 9.153605015673982,
"grad_norm": 0.462890625,
"learning_rate": 7.85029559788976e-05,
"loss": 0.8781,
"step": 1460
},
{
"epoch": 9.184952978056426,
"grad_norm": 0.46484375,
"learning_rate": 7.778868642697359e-05,
"loss": 0.8851,
"step": 1465
},
{
"epoch": 9.216300940438872,
"grad_norm": 0.4609375,
"learning_rate": 7.707560689056343e-05,
"loss": 0.8892,
"step": 1470
},
{
"epoch": 9.247648902821316,
"grad_norm": 0.486328125,
"learning_rate": 7.636375557432835e-05,
"loss": 0.8863,
"step": 1475
},
{
"epoch": 9.278996865203762,
"grad_norm": 0.46875,
"learning_rate": 7.565317061712525e-05,
"loss": 0.8907,
"step": 1480
},
{
"epoch": 9.310344827586206,
"grad_norm": 0.52734375,
"learning_rate": 7.494389008996327e-05,
"loss": 0.8906,
"step": 1485
},
{
"epoch": 9.341692789968652,
"grad_norm": 0.5078125,
"learning_rate": 7.423595199396419e-05,
"loss": 0.8987,
"step": 1490
},
{
"epoch": 9.373040752351097,
"grad_norm": 0.490234375,
"learning_rate": 7.35293942583263e-05,
"loss": 0.8996,
"step": 1495
},
{
"epoch": 9.404388714733543,
"grad_norm": 0.474609375,
"learning_rate": 7.282425473829236e-05,
"loss": 0.8985,
"step": 1500
},
{
"epoch": 9.435736677115987,
"grad_norm": 0.46484375,
"learning_rate": 7.212057121312133e-05,
"loss": 0.8923,
"step": 1505
},
{
"epoch": 9.467084639498433,
"grad_norm": 0.470703125,
"learning_rate": 7.141838138406438e-05,
"loss": 0.8873,
"step": 1510
},
{
"epoch": 9.498432601880877,
"grad_norm": 0.462890625,
"learning_rate": 7.071772287234497e-05,
"loss": 0.8872,
"step": 1515
},
{
"epoch": 9.529780564263323,
"grad_norm": 0.46484375,
"learning_rate": 7.001863321714309e-05,
"loss": 0.8988,
"step": 1520
},
{
"epoch": 9.561128526645769,
"grad_norm": 0.482421875,
"learning_rate": 6.932114987358413e-05,
"loss": 0.895,
"step": 1525
},
{
"epoch": 9.592476489028213,
"grad_norm": 0.470703125,
"learning_rate": 6.862531021073222e-05,
"loss": 0.8905,
"step": 1530
},
{
"epoch": 9.623824451410659,
"grad_norm": 0.5234375,
"learning_rate": 6.79311515095878e-05,
"loss": 0.9014,
"step": 1535
},
{
"epoch": 9.655172413793103,
"grad_norm": 0.625,
"learning_rate": 6.723871096109064e-05,
"loss": 0.9016,
"step": 1540
},
{
"epoch": 9.68652037617555,
"grad_norm": 0.5,
"learning_rate": 6.654802566412697e-05,
"loss": 0.9134,
"step": 1545
},
{
"epoch": 9.717868338557993,
"grad_norm": 0.53125,
"learning_rate": 6.585913262354184e-05,
"loss": 0.9018,
"step": 1550
},
{
"epoch": 9.74921630094044,
"grad_norm": 0.515625,
"learning_rate": 6.51720687481567e-05,
"loss": 0.8992,
"step": 1555
},
{
"epoch": 9.780564263322884,
"grad_norm": 0.49609375,
"learning_rate": 6.448687084879175e-05,
"loss": 0.9016,
"step": 1560
},
{
"epoch": 9.81191222570533,
"grad_norm": 0.455078125,
"learning_rate": 6.380357563629381e-05,
"loss": 0.8973,
"step": 1565
},
{
"epoch": 9.843260188087774,
"grad_norm": 0.46484375,
"learning_rate": 6.312221971956944e-05,
"loss": 0.8979,
"step": 1570
},
{
"epoch": 9.87460815047022,
"grad_norm": 0.46484375,
"learning_rate": 6.24428396036236e-05,
"loss": 0.8956,
"step": 1575
},
{
"epoch": 9.905956112852664,
"grad_norm": 0.466796875,
"learning_rate": 6.176547168760373e-05,
"loss": 0.9019,
"step": 1580
},
{
"epoch": 9.93730407523511,
"grad_norm": 0.458984375,
"learning_rate": 6.109015226284961e-05,
"loss": 0.9004,
"step": 1585
},
{
"epoch": 9.968652037617554,
"grad_norm": 0.45703125,
"learning_rate": 6.041691751094908e-05,
"loss": 0.8983,
"step": 1590
},
{
"epoch": 10.0,
"grad_norm": 0.4921875,
"learning_rate": 5.974580350179938e-05,
"loss": 0.9006,
"step": 1595
},
{
"epoch": 10.0,
"eval_loss": 2.5172829627990723,
"eval_runtime": 0.5456,
"eval_samples_per_second": 3.666,
"eval_steps_per_second": 1.833,
"step": 1595
},
{
"epoch": 10.031347962382446,
"grad_norm": 0.482421875,
"learning_rate": 5.9076846191674803e-05,
"loss": 0.8494,
"step": 1600
},
{
"epoch": 10.06269592476489,
"grad_norm": 0.48828125,
"learning_rate": 5.8410081421300154e-05,
"loss": 0.8491,
"step": 1605
},
{
"epoch": 10.094043887147336,
"grad_norm": 0.46875,
"learning_rate": 5.7745544913930496e-05,
"loss": 0.8479,
"step": 1610
},
{
"epoch": 10.12539184952978,
"grad_norm": 0.474609375,
"learning_rate": 5.7083272273437346e-05,
"loss": 0.8561,
"step": 1615
},
{
"epoch": 10.156739811912226,
"grad_norm": 0.47265625,
"learning_rate": 5.642329898240089e-05,
"loss": 0.8459,
"step": 1620
},
{
"epoch": 10.18808777429467,
"grad_norm": 0.52734375,
"learning_rate": 5.5765660400209174e-05,
"loss": 0.8513,
"step": 1625
},
{
"epoch": 10.219435736677116,
"grad_norm": 0.462890625,
"learning_rate": 5.511039176116357e-05,
"loss": 0.8604,
"step": 1630
},
{
"epoch": 10.25078369905956,
"grad_norm": 0.4765625,
"learning_rate": 5.44575281725909e-05,
"loss": 0.8602,
"step": 1635
},
{
"epoch": 10.282131661442007,
"grad_norm": 0.4765625,
"learning_rate": 5.3807104612962676e-05,
"loss": 0.8559,
"step": 1640
},
{
"epoch": 10.31347962382445,
"grad_norm": 0.4765625,
"learning_rate": 5.3159155930021e-05,
"loss": 0.8642,
"step": 1645
},
{
"epoch": 10.344827586206897,
"grad_norm": 0.50390625,
"learning_rate": 5.251371683891146e-05,
"loss": 0.8565,
"step": 1650
},
{
"epoch": 10.376175548589341,
"grad_norm": 0.484375,
"learning_rate": 5.1870821920323275e-05,
"loss": 0.8513,
"step": 1655
},
{
"epoch": 10.407523510971787,
"grad_norm": 0.51171875,
"learning_rate": 5.123050561863657e-05,
"loss": 0.8552,
"step": 1660
},
{
"epoch": 10.438871473354231,
"grad_norm": 0.5,
"learning_rate": 5.05928022400768e-05,
"loss": 0.8521,
"step": 1665
},
{
"epoch": 10.470219435736677,
"grad_norm": 0.490234375,
"learning_rate": 4.9957745950876945e-05,
"loss": 0.8661,
"step": 1670
},
{
"epoch": 10.501567398119121,
"grad_norm": 0.494140625,
"learning_rate": 4.9325370775446864e-05,
"loss": 0.8551,
"step": 1675
},
{
"epoch": 10.532915360501567,
"grad_norm": 0.474609375,
"learning_rate": 4.869571059455039e-05,
"loss": 0.864,
"step": 1680
},
{
"epoch": 10.564263322884013,
"grad_norm": 0.470703125,
"learning_rate": 4.806879914349009e-05,
"loss": 0.8631,
"step": 1685
},
{
"epoch": 10.595611285266457,
"grad_norm": 0.47265625,
"learning_rate": 4.74446700102998e-05,
"loss": 0.8589,
"step": 1690
},
{
"epoch": 10.626959247648903,
"grad_norm": 0.48046875,
"learning_rate": 4.6823356633945136e-05,
"loss": 0.8682,
"step": 1695
},
{
"epoch": 10.658307210031348,
"grad_norm": 0.478515625,
"learning_rate": 4.620489230253198e-05,
"loss": 0.8628,
"step": 1700
},
{
"epoch": 10.689655172413794,
"grad_norm": 0.5,
"learning_rate": 4.558931015152288e-05,
"loss": 0.868,
"step": 1705
},
{
"epoch": 10.721003134796238,
"grad_norm": 0.51953125,
"learning_rate": 4.497664316196175e-05,
"loss": 0.8608,
"step": 1710
},
{
"epoch": 10.752351097178684,
"grad_norm": 0.50390625,
"learning_rate": 4.4366924158707014e-05,
"loss": 0.8676,
"step": 1715
},
{
"epoch": 10.783699059561128,
"grad_norm": 0.474609375,
"learning_rate": 4.3760185808672784e-05,
"loss": 0.8652,
"step": 1720
},
{
"epoch": 10.815047021943574,
"grad_norm": 0.482421875,
"learning_rate": 4.315646061907872e-05,
"loss": 0.8578,
"step": 1725
},
{
"epoch": 10.846394984326018,
"grad_norm": 0.48046875,
"learning_rate": 4.25557809357084e-05,
"loss": 0.856,
"step": 1730
},
{
"epoch": 10.877742946708464,
"grad_norm": 0.49609375,
"learning_rate": 4.195817894117635e-05,
"loss": 0.862,
"step": 1735
},
{
"epoch": 10.909090909090908,
"grad_norm": 0.48828125,
"learning_rate": 4.136368665320366e-05,
"loss": 0.8602,
"step": 1740
},
{
"epoch": 10.940438871473354,
"grad_norm": 0.48046875,
"learning_rate": 4.0772335922902784e-05,
"loss": 0.8572,
"step": 1745
},
{
"epoch": 10.971786833855798,
"grad_norm": 0.47265625,
"learning_rate": 4.0184158433070937e-05,
"loss": 0.8626,
"step": 1750
},
{
"epoch": 10.996865203761756,
"eval_loss": 2.6419336795806885,
"eval_runtime": 0.557,
"eval_samples_per_second": 3.591,
"eval_steps_per_second": 1.795,
"step": 1754
},
{
"epoch": 11.003134796238244,
"grad_norm": 0.4765625,
"learning_rate": 3.9599185696492544e-05,
"loss": 0.8655,
"step": 1755
},
{
"epoch": 11.03448275862069,
"grad_norm": 0.4765625,
"learning_rate": 3.9017449054251055e-05,
"loss": 0.8346,
"step": 1760
},
{
"epoch": 11.065830721003135,
"grad_norm": 0.494140625,
"learning_rate": 3.843897967404968e-05,
"loss": 0.8387,
"step": 1765
},
{
"epoch": 11.09717868338558,
"grad_norm": 0.498046875,
"learning_rate": 3.7863808548541535e-05,
"loss": 0.8205,
"step": 1770
},
{
"epoch": 11.128526645768025,
"grad_norm": 0.470703125,
"learning_rate": 3.729196649366914e-05,
"loss": 0.8316,
"step": 1775
},
{
"epoch": 11.15987460815047,
"grad_norm": 0.4921875,
"learning_rate": 3.672348414701341e-05,
"loss": 0.8391,
"step": 1780
},
{
"epoch": 11.191222570532915,
"grad_norm": 0.484375,
"learning_rate": 3.615839196615217e-05,
"loss": 0.8264,
"step": 1785
},
{
"epoch": 11.22257053291536,
"grad_norm": 0.4765625,
"learning_rate": 3.5596720227028376e-05,
"loss": 0.831,
"step": 1790
},
{
"epoch": 11.253918495297805,
"grad_norm": 0.484375,
"learning_rate": 3.503849902232792e-05,
"loss": 0.8312,
"step": 1795
},
{
"epoch": 11.285266457680251,
"grad_norm": 0.48046875,
"learning_rate": 3.448375825986741e-05,
"loss": 0.8382,
"step": 1800
},
{
"epoch": 11.316614420062695,
"grad_norm": 0.4921875,
"learning_rate": 3.393252766099187e-05,
"loss": 0.8166,
"step": 1805
},
{
"epoch": 11.347962382445141,
"grad_norm": 0.482421875,
"learning_rate": 3.338483675898227e-05,
"loss": 0.8285,
"step": 1810
},
{
"epoch": 11.379310344827585,
"grad_norm": 0.48046875,
"learning_rate": 3.284071489747325e-05,
"loss": 0.8384,
"step": 1815
},
{
"epoch": 11.410658307210031,
"grad_norm": 0.48046875,
"learning_rate": 3.230019122888094e-05,
"loss": 0.8332,
"step": 1820
},
{
"epoch": 11.442006269592476,
"grad_norm": 0.51171875,
"learning_rate": 3.176329471284113e-05,
"loss": 0.8301,
"step": 1825
},
{
"epoch": 11.473354231974922,
"grad_norm": 0.49609375,
"learning_rate": 3.123005411465766e-05,
"loss": 0.8411,
"step": 1830
},
{
"epoch": 11.504702194357368,
"grad_norm": 0.4765625,
"learning_rate": 3.070049800376127e-05,
"loss": 0.8308,
"step": 1835
},
{
"epoch": 11.536050156739812,
"grad_norm": 0.498046875,
"learning_rate": 3.01746547521789e-05,
"loss": 0.8285,
"step": 1840
},
{
"epoch": 11.567398119122258,
"grad_norm": 0.486328125,
"learning_rate": 2.96525525330136e-05,
"loss": 0.835,
"step": 1845
},
{
"epoch": 11.598746081504702,
"grad_norm": 0.5,
"learning_rate": 2.9134219318935228e-05,
"loss": 0.8454,
"step": 1850
},
{
"epoch": 11.630094043887148,
"grad_norm": 0.5,
"learning_rate": 2.8619682880681596e-05,
"loss": 0.8331,
"step": 1855
},
{
"epoch": 11.661442006269592,
"grad_norm": 0.486328125,
"learning_rate": 2.8108970785570698e-05,
"loss": 0.8363,
"step": 1860
},
{
"epoch": 11.692789968652038,
"grad_norm": 0.482421875,
"learning_rate": 2.7602110396023673e-05,
"loss": 0.8324,
"step": 1865
},
{
"epoch": 11.724137931034482,
"grad_norm": 0.486328125,
"learning_rate": 2.7099128868098846e-05,
"loss": 0.8368,
"step": 1870
},
{
"epoch": 11.755485893416928,
"grad_norm": 0.48828125,
"learning_rate": 2.6600053150036797e-05,
"loss": 0.834,
"step": 1875
},
{
"epoch": 11.786833855799372,
"grad_norm": 0.48046875,
"learning_rate": 2.610490998081653e-05,
"loss": 0.8374,
"step": 1880
},
{
"epoch": 11.818181818181818,
"grad_norm": 0.49609375,
"learning_rate": 2.5613725888722828e-05,
"loss": 0.8436,
"step": 1885
},
{
"epoch": 11.849529780564263,
"grad_norm": 0.482421875,
"learning_rate": 2.5126527189925076e-05,
"loss": 0.8318,
"step": 1890
},
{
"epoch": 11.880877742946709,
"grad_norm": 0.482421875,
"learning_rate": 2.464333998706726e-05,
"loss": 0.8339,
"step": 1895
},
{
"epoch": 11.912225705329153,
"grad_norm": 0.48828125,
"learning_rate": 2.416419016786936e-05,
"loss": 0.844,
"step": 1900
},
{
"epoch": 11.943573667711599,
"grad_norm": 0.490234375,
"learning_rate": 2.3689103403740543e-05,
"loss": 0.8424,
"step": 1905
},
{
"epoch": 11.974921630094045,
"grad_norm": 0.478515625,
"learning_rate": 2.3218105148403656e-05,
"loss": 0.8351,
"step": 1910
},
{
"epoch": 12.0,
"eval_loss": 2.7330658435821533,
"eval_runtime": 0.544,
"eval_samples_per_second": 3.676,
"eval_steps_per_second": 1.838,
"step": 1914
},
{
"epoch": 12.006269592476489,
"grad_norm": 0.47265625,
"learning_rate": 2.2751220636531522e-05,
"loss": 0.8286,
"step": 1915
},
{
"epoch": 12.037617554858935,
"grad_norm": 0.48046875,
"learning_rate": 2.2288474882394917e-05,
"loss": 0.8207,
"step": 1920
},
{
"epoch": 12.068965517241379,
"grad_norm": 0.482421875,
"learning_rate": 2.1829892678522458e-05,
"loss": 0.8146,
"step": 1925
},
{
"epoch": 12.100313479623825,
"grad_norm": 0.48828125,
"learning_rate": 2.1375498594372113e-05,
"loss": 0.8151,
"step": 1930
},
{
"epoch": 12.13166144200627,
"grad_norm": 0.498046875,
"learning_rate": 2.0925316975015087e-05,
"loss": 0.8178,
"step": 1935
},
{
"epoch": 12.163009404388715,
"grad_norm": 0.478515625,
"learning_rate": 2.0479371939831325e-05,
"loss": 0.8197,
"step": 1940
},
{
"epoch": 12.19435736677116,
"grad_norm": 0.4765625,
"learning_rate": 2.003768738121732e-05,
"loss": 0.8224,
"step": 1945
},
{
"epoch": 12.225705329153605,
"grad_norm": 0.494140625,
"learning_rate": 1.9600286963305957e-05,
"loss": 0.8195,
"step": 1950
},
{
"epoch": 12.25705329153605,
"grad_norm": 0.486328125,
"learning_rate": 1.9167194120698795e-05,
"loss": 0.8232,
"step": 1955
},
{
"epoch": 12.288401253918495,
"grad_norm": 0.478515625,
"learning_rate": 1.87384320572104e-05,
"loss": 0.8164,
"step": 1960
},
{
"epoch": 12.31974921630094,
"grad_norm": 0.4921875,
"learning_rate": 1.8314023744625208e-05,
"loss": 0.8123,
"step": 1965
},
{
"epoch": 12.351097178683386,
"grad_norm": 0.478515625,
"learning_rate": 1.789399192146678e-05,
"loss": 0.824,
"step": 1970
},
{
"epoch": 12.38244514106583,
"grad_norm": 0.498046875,
"learning_rate": 1.7478359091779394e-05,
"loss": 0.8155,
"step": 1975
},
{
"epoch": 12.413793103448276,
"grad_norm": 0.484375,
"learning_rate": 1.706714752392259e-05,
"loss": 0.8314,
"step": 1980
},
{
"epoch": 12.445141065830722,
"grad_norm": 0.490234375,
"learning_rate": 1.666037924937791e-05,
"loss": 0.8257,
"step": 1985
},
{
"epoch": 12.476489028213166,
"grad_norm": 0.48046875,
"learning_rate": 1.6258076061568582e-05,
"loss": 0.8244,
"step": 1990
},
{
"epoch": 12.507836990595612,
"grad_norm": 0.490234375,
"learning_rate": 1.5860259514691933e-05,
"loss": 0.8147,
"step": 1995
},
{
"epoch": 12.539184952978056,
"grad_norm": 0.478515625,
"learning_rate": 1.5466950922564426e-05,
"loss": 0.8277,
"step": 2000
},
{
"epoch": 12.570532915360502,
"grad_norm": 0.48828125,
"learning_rate": 1.5078171357479942e-05,
"loss": 0.8243,
"step": 2005
},
{
"epoch": 12.601880877742946,
"grad_norm": 0.48828125,
"learning_rate": 1.4693941649080655e-05,
"loss": 0.8269,
"step": 2010
},
{
"epoch": 12.633228840125392,
"grad_norm": 0.498046875,
"learning_rate": 1.4314282383241096e-05,
"loss": 0.8155,
"step": 2015
},
{
"epoch": 12.664576802507836,
"grad_norm": 0.490234375,
"learning_rate": 1.3939213900965132e-05,
"loss": 0.8249,
"step": 2020
},
{
"epoch": 12.695924764890282,
"grad_norm": 0.48046875,
"learning_rate": 1.3568756297296292e-05,
"loss": 0.8218,
"step": 2025
},
{
"epoch": 12.727272727272727,
"grad_norm": 0.486328125,
"learning_rate": 1.3202929420241051e-05,
"loss": 0.8158,
"step": 2030
},
{
"epoch": 12.758620689655173,
"grad_norm": 0.4921875,
"learning_rate": 1.284175286970546e-05,
"loss": 0.8216,
"step": 2035
},
{
"epoch": 12.789968652037617,
"grad_norm": 0.486328125,
"learning_rate": 1.2485245996445006e-05,
"loss": 0.8241,
"step": 2040
},
{
"epoch": 12.821316614420063,
"grad_norm": 0.494140625,
"learning_rate": 1.2133427901027917e-05,
"loss": 0.8241,
"step": 2045
},
{
"epoch": 12.852664576802507,
"grad_norm": 0.498046875,
"learning_rate": 1.1786317432811767e-05,
"loss": 0.8234,
"step": 2050
},
{
"epoch": 12.884012539184953,
"grad_norm": 0.490234375,
"learning_rate": 1.1443933188933553e-05,
"loss": 0.8206,
"step": 2055
},
{
"epoch": 12.915360501567399,
"grad_norm": 0.486328125,
"learning_rate": 1.1106293513313436e-05,
"loss": 0.8188,
"step": 2060
},
{
"epoch": 12.946708463949843,
"grad_norm": 0.478515625,
"learning_rate": 1.0773416495671773e-05,
"loss": 0.8234,
"step": 2065
},
{
"epoch": 12.978056426332289,
"grad_norm": 0.484375,
"learning_rate": 1.0445319970560041e-05,
"loss": 0.8265,
"step": 2070
},
{
"epoch": 12.996865203761756,
"eval_loss": 2.783811569213867,
"eval_runtime": 0.5489,
"eval_samples_per_second": 3.643,
"eval_steps_per_second": 1.822,
"step": 2073
},
{
"epoch": 13.009404388714733,
"grad_norm": 0.470703125,
"learning_rate": 1.0122021516405278e-05,
"loss": 0.8204,
"step": 2075
},
{
"epoch": 13.04075235109718,
"grad_norm": 0.4921875,
"learning_rate": 9.803538454568284e-06,
"loss": 0.8004,
"step": 2080
},
{
"epoch": 13.072100313479623,
"grad_norm": 0.498046875,
"learning_rate": 9.489887848415569e-06,
"loss": 0.8145,
"step": 2085
},
{
"epoch": 13.10344827586207,
"grad_norm": 0.48828125,
"learning_rate": 9.1810865024052e-06,
"loss": 0.8177,
"step": 2090
},
{
"epoch": 13.134796238244514,
"grad_norm": 0.48046875,
"learning_rate": 8.87715096118642e-06,
"loss": 0.8189,
"step": 2095
},
{
"epoch": 13.16614420062696,
"grad_norm": 0.48046875,
"learning_rate": 8.578097508713279e-06,
"loss": 0.8142,
"step": 2100
},
{
"epoch": 13.197492163009404,
"grad_norm": 0.47265625,
"learning_rate": 8.283942167372127e-06,
"loss": 0.8273,
"step": 2105
},
{
"epoch": 13.22884012539185,
"grad_norm": 0.4765625,
"learning_rate": 7.994700697123247e-06,
"loss": 0.8079,
"step": 2110
},
{
"epoch": 13.260188087774294,
"grad_norm": 0.482421875,
"learning_rate": 7.710388594656449e-06,
"loss": 0.8126,
"step": 2115
},
{
"epoch": 13.29153605015674,
"grad_norm": 0.48828125,
"learning_rate": 7.431021092560819e-06,
"loss": 0.813,
"step": 2120
},
{
"epoch": 13.322884012539184,
"grad_norm": 0.4921875,
"learning_rate": 7.156613158508619e-06,
"loss": 0.8156,
"step": 2125
},
{
"epoch": 13.35423197492163,
"grad_norm": 0.482421875,
"learning_rate": 6.887179494453288e-06,
"loss": 0.8058,
"step": 2130
},
{
"epoch": 13.385579937304076,
"grad_norm": 0.48046875,
"learning_rate": 6.622734535841868e-06,
"loss": 0.8222,
"step": 2135
},
{
"epoch": 13.41692789968652,
"grad_norm": 0.4765625,
"learning_rate": 6.363292450841485e-06,
"loss": 0.8177,
"step": 2140
},
{
"epoch": 13.448275862068966,
"grad_norm": 0.482421875,
"learning_rate": 6.108867139580365e-06,
"loss": 0.8204,
"step": 2145
},
{
"epoch": 13.47962382445141,
"grad_norm": 0.48828125,
"learning_rate": 5.859472233402985e-06,
"loss": 0.8132,
"step": 2150
},
{
"epoch": 13.510971786833856,
"grad_norm": 0.490234375,
"learning_rate": 5.615121094139897e-06,
"loss": 0.8177,
"step": 2155
},
{
"epoch": 13.5423197492163,
"grad_norm": 0.494140625,
"learning_rate": 5.3758268133916825e-06,
"loss": 0.8137,
"step": 2160
},
{
"epoch": 13.573667711598747,
"grad_norm": 0.48828125,
"learning_rate": 5.14160221182769e-06,
"loss": 0.8241,
"step": 2165
},
{
"epoch": 13.60501567398119,
"grad_norm": 0.474609375,
"learning_rate": 4.912459838499028e-06,
"loss": 0.8184,
"step": 2170
},
{
"epoch": 13.636363636363637,
"grad_norm": 0.48046875,
"learning_rate": 4.688411970166295e-06,
"loss": 0.8203,
"step": 2175
},
{
"epoch": 13.66771159874608,
"grad_norm": 0.490234375,
"learning_rate": 4.469470610641802e-06,
"loss": 0.8107,
"step": 2180
},
{
"epoch": 13.699059561128527,
"grad_norm": 0.486328125,
"learning_rate": 4.2556474901464195e-06,
"loss": 0.8115,
"step": 2185
},
{
"epoch": 13.730407523510971,
"grad_norm": 0.494140625,
"learning_rate": 4.046954064681185e-06,
"loss": 0.8156,
"step": 2190
},
{
"epoch": 13.761755485893417,
"grad_norm": 0.48828125,
"learning_rate": 3.843401515413392e-06,
"loss": 0.8246,
"step": 2195
},
{
"epoch": 13.793103448275861,
"grad_norm": 0.47265625,
"learning_rate": 3.6450007480777093e-06,
"loss": 0.8191,
"step": 2200
},
{
"epoch": 13.824451410658307,
"grad_norm": 0.490234375,
"learning_rate": 3.451762392391733e-06,
"loss": 0.824,
"step": 2205
},
{
"epoch": 13.855799373040753,
"grad_norm": 0.482421875,
"learning_rate": 3.2636968014865378e-06,
"loss": 0.8202,
"step": 2210
},
{
"epoch": 13.887147335423197,
"grad_norm": 0.474609375,
"learning_rate": 3.080814051352021e-06,
"loss": 0.8148,
"step": 2215
},
{
"epoch": 13.918495297805643,
"grad_norm": 0.486328125,
"learning_rate": 2.9031239402970144e-06,
"loss": 0.8245,
"step": 2220
},
{
"epoch": 13.949843260188088,
"grad_norm": 0.486328125,
"learning_rate": 2.730635988424335e-06,
"loss": 0.8265,
"step": 2225
},
{
"epoch": 13.981191222570533,
"grad_norm": 0.46875,
"learning_rate": 2.5633594371206937e-06,
"loss": 0.8167,
"step": 2230
},
{
"epoch": 14.0,
"eval_loss": 2.799032211303711,
"eval_runtime": 0.5421,
"eval_samples_per_second": 3.689,
"eval_steps_per_second": 1.845,
"step": 2233
},
{
"epoch": 14.012539184952978,
"grad_norm": 0.466796875,
"learning_rate": 2.401303248561659e-06,
"loss": 0.8138,
"step": 2235
},
{
"epoch": 14.043887147335424,
"grad_norm": 0.48046875,
"learning_rate": 2.2444761052313856e-06,
"loss": 0.8159,
"step": 2240
},
{
"epoch": 14.075235109717868,
"grad_norm": 0.478515625,
"learning_rate": 2.0928864094574842e-06,
"loss": 0.8174,
"step": 2245
},
{
"epoch": 14.106583072100314,
"grad_norm": 0.49609375,
"learning_rate": 1.9465422829608837e-06,
"loss": 0.8186,
"step": 2250
},
{
"epoch": 14.137931034482758,
"grad_norm": 0.48828125,
"learning_rate": 1.8054515664206128e-06,
"loss": 0.8183,
"step": 2255
},
{
"epoch": 14.169278996865204,
"grad_norm": 0.48046875,
"learning_rate": 1.6696218190537683e-06,
"loss": 0.814,
"step": 2260
},
{
"epoch": 14.200626959247648,
"grad_norm": 0.490234375,
"learning_rate": 1.539060318210539e-06,
"loss": 0.8215,
"step": 2265
},
{
"epoch": 14.231974921630094,
"grad_norm": 0.474609375,
"learning_rate": 1.413774058984252e-06,
"loss": 0.8152,
"step": 2270
},
{
"epoch": 14.263322884012538,
"grad_norm": 0.486328125,
"learning_rate": 1.2937697538366378e-06,
"loss": 0.8136,
"step": 2275
},
{
"epoch": 14.294670846394984,
"grad_norm": 0.4765625,
"learning_rate": 1.1790538322381527e-06,
"loss": 0.8116,
"step": 2280
},
{
"epoch": 14.32601880877743,
"grad_norm": 0.490234375,
"learning_rate": 1.0696324403235757e-06,
"loss": 0.824,
"step": 2285
},
{
"epoch": 14.357366771159874,
"grad_norm": 0.474609375,
"learning_rate": 9.655114405626386e-07,
"loss": 0.8171,
"step": 2290
},
{
"epoch": 14.38871473354232,
"grad_norm": 0.474609375,
"learning_rate": 8.666964114459997e-07,
"loss": 0.8055,
"step": 2295
},
{
"epoch": 14.420062695924765,
"grad_norm": 0.474609375,
"learning_rate": 7.73192647186316e-07,
"loss": 0.8262,
"step": 2300
},
{
"epoch": 14.45141065830721,
"grad_norm": 0.482421875,
"learning_rate": 6.850051574346372e-07,
"loss": 0.8127,
"step": 2305
},
{
"epoch": 14.482758620689655,
"grad_norm": 0.486328125,
"learning_rate": 6.021386670119756e-07,
"loss": 0.8089,
"step": 2310
},
{
"epoch": 14.5141065830721,
"grad_norm": 0.48828125,
"learning_rate": 5.245976156561305e-07,
"loss": 0.8186,
"step": 2315
},
{
"epoch": 14.545454545454545,
"grad_norm": 0.498046875,
"learning_rate": 4.523861577839239e-07,
"loss": 0.8223,
"step": 2320
},
{
"epoch": 14.576802507836991,
"grad_norm": 0.490234375,
"learning_rate": 3.8550816226852196e-07,
"loss": 0.8151,
"step": 2325
},
{
"epoch": 14.608150470219435,
"grad_norm": 0.48046875,
"learning_rate": 3.23967212232168e-07,
"loss": 0.8152,
"step": 2330
},
{
"epoch": 14.639498432601881,
"grad_norm": 0.48046875,
"learning_rate": 2.677666048542693e-07,
"loss": 0.8097,
"step": 2335
},
{
"epoch": 14.670846394984325,
"grad_norm": 0.484375,
"learning_rate": 2.1690935119468293e-07,
"loss": 0.827,
"step": 2340
},
{
"epoch": 14.702194357366771,
"grad_norm": 0.48828125,
"learning_rate": 1.7139817603240016e-07,
"loss": 0.8203,
"step": 2345
},
{
"epoch": 14.733542319749215,
"grad_norm": 0.484375,
"learning_rate": 1.3123551771958564e-07,
"loss": 0.8204,
"step": 2350
},
{
"epoch": 14.764890282131661,
"grad_norm": 0.482421875,
"learning_rate": 9.642352805093734e-08,
"loss": 0.8137,
"step": 2355
},
{
"epoch": 14.796238244514107,
"grad_norm": 0.482421875,
"learning_rate": 6.696407214835664e-08,
"loss": 0.8149,
"step": 2360
},
{
"epoch": 14.827586206896552,
"grad_norm": 0.474609375,
"learning_rate": 4.285872836108373e-08,
"loss": 0.8119,
"step": 2365
},
{
"epoch": 14.858934169278998,
"grad_norm": 0.47265625,
"learning_rate": 2.4108788181076423e-08,
"loss": 0.8128,
"step": 2370
},
{
"epoch": 14.890282131661442,
"grad_norm": 0.474609375,
"learning_rate": 1.071525617384328e-08,
"loss": 0.818,
"step": 2375
},
{
"epoch": 14.921630094043888,
"grad_norm": 0.474609375,
"learning_rate": 2.6788499246421795e-09,
"loss": 0.8068,
"step": 2380
},
{
"epoch": 14.952978056426332,
"grad_norm": 0.47265625,
"learning_rate": 0.0,
"loss": 0.8075,
"step": 2385
},
{
"epoch": 14.952978056426332,
"eval_loss": 2.8001084327697754,
"eval_runtime": 0.5613,
"eval_samples_per_second": 3.563,
"eval_steps_per_second": 1.782,
"step": 2385
},
{
"epoch": 14.952978056426332,
"step": 2385,
"total_flos": 1.4215766364399862e+18,
"train_loss": 1.0738131565117985,
"train_runtime": 14553.4383,
"train_samples_per_second": 7.888,
"train_steps_per_second": 0.164
}
],
"logging_steps": 5,
"max_steps": 2385,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.4215766364399862e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}