avemio-digital's picture
Add files using upload-large-folder tool
a2a407d verified
raw
history blame
82.4 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 1000,
"global_step": 5102,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 6.812536716461182,
"learning_rate": 9.70873786407767e-08,
"loss": 1.4108,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 6.775967597961426,
"learning_rate": 1.941747572815534e-07,
"loss": 1.4065,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 6.67765998840332,
"learning_rate": 2.9126213592233014e-07,
"loss": 1.4145,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 6.1705145835876465,
"learning_rate": 3.883495145631068e-07,
"loss": 1.3955,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 5.644610404968262,
"learning_rate": 4.854368932038835e-07,
"loss": 1.3986,
"step": 50
},
{
"epoch": 0.01,
"grad_norm": 6.067681789398193,
"learning_rate": 5.825242718446603e-07,
"loss": 1.3721,
"step": 60
},
{
"epoch": 0.01,
"grad_norm": 5.449668884277344,
"learning_rate": 6.79611650485437e-07,
"loss": 1.3672,
"step": 70
},
{
"epoch": 0.02,
"grad_norm": 4.1965107917785645,
"learning_rate": 7.766990291262136e-07,
"loss": 1.3229,
"step": 80
},
{
"epoch": 0.02,
"grad_norm": 5.0556440353393555,
"learning_rate": 8.737864077669904e-07,
"loss": 1.2846,
"step": 90
},
{
"epoch": 0.02,
"grad_norm": 5.58712911605835,
"learning_rate": 9.70873786407767e-07,
"loss": 1.2561,
"step": 100
},
{
"epoch": 0.02,
"grad_norm": 4.737099647521973,
"learning_rate": 1.0679611650485437e-06,
"loss": 1.2511,
"step": 110
},
{
"epoch": 0.02,
"grad_norm": 4.916423320770264,
"learning_rate": 1.1650485436893206e-06,
"loss": 1.2378,
"step": 120
},
{
"epoch": 0.03,
"grad_norm": 5.898449420928955,
"learning_rate": 1.2621359223300972e-06,
"loss": 1.2263,
"step": 130
},
{
"epoch": 0.03,
"grad_norm": 4.9001641273498535,
"learning_rate": 1.359223300970874e-06,
"loss": 1.228,
"step": 140
},
{
"epoch": 0.03,
"grad_norm": 4.939080238342285,
"learning_rate": 1.4563106796116506e-06,
"loss": 1.2006,
"step": 150
},
{
"epoch": 0.03,
"grad_norm": 4.023560047149658,
"learning_rate": 1.5533980582524272e-06,
"loss": 1.1934,
"step": 160
},
{
"epoch": 0.03,
"grad_norm": 4.546477317810059,
"learning_rate": 1.650485436893204e-06,
"loss": 1.1905,
"step": 170
},
{
"epoch": 0.04,
"grad_norm": 4.045153617858887,
"learning_rate": 1.7475728155339808e-06,
"loss": 1.1642,
"step": 180
},
{
"epoch": 0.04,
"grad_norm": 4.5985870361328125,
"learning_rate": 1.8446601941747574e-06,
"loss": 1.1674,
"step": 190
},
{
"epoch": 0.04,
"grad_norm": 4.283823490142822,
"learning_rate": 1.941747572815534e-06,
"loss": 1.157,
"step": 200
},
{
"epoch": 0.04,
"grad_norm": 4.753105640411377,
"learning_rate": 2.0388349514563107e-06,
"loss": 1.164,
"step": 210
},
{
"epoch": 0.04,
"grad_norm": 4.330531597137451,
"learning_rate": 2.1359223300970874e-06,
"loss": 1.169,
"step": 220
},
{
"epoch": 0.05,
"grad_norm": 4.404067516326904,
"learning_rate": 2.2330097087378645e-06,
"loss": 1.1493,
"step": 230
},
{
"epoch": 0.05,
"grad_norm": 4.143772602081299,
"learning_rate": 2.330097087378641e-06,
"loss": 1.1388,
"step": 240
},
{
"epoch": 0.05,
"grad_norm": 3.965195417404175,
"learning_rate": 2.427184466019418e-06,
"loss": 1.147,
"step": 250
},
{
"epoch": 0.05,
"grad_norm": 4.0882039070129395,
"learning_rate": 2.5242718446601945e-06,
"loss": 1.1487,
"step": 260
},
{
"epoch": 0.05,
"grad_norm": 4.03162956237793,
"learning_rate": 2.621359223300971e-06,
"loss": 1.136,
"step": 270
},
{
"epoch": 0.05,
"grad_norm": 4.2308855056762695,
"learning_rate": 2.718446601941748e-06,
"loss": 1.1378,
"step": 280
},
{
"epoch": 0.06,
"grad_norm": 4.1606221199035645,
"learning_rate": 2.8155339805825245e-06,
"loss": 1.1471,
"step": 290
},
{
"epoch": 0.06,
"grad_norm": 4.03283166885376,
"learning_rate": 2.912621359223301e-06,
"loss": 1.1185,
"step": 300
},
{
"epoch": 0.06,
"grad_norm": 4.2199859619140625,
"learning_rate": 3.0097087378640778e-06,
"loss": 1.1432,
"step": 310
},
{
"epoch": 0.06,
"grad_norm": 4.150729656219482,
"learning_rate": 3.1067961165048544e-06,
"loss": 1.1155,
"step": 320
},
{
"epoch": 0.06,
"grad_norm": 3.9178309440612793,
"learning_rate": 3.2038834951456315e-06,
"loss": 1.1219,
"step": 330
},
{
"epoch": 0.07,
"grad_norm": 4.3575663566589355,
"learning_rate": 3.300970873786408e-06,
"loss": 1.108,
"step": 340
},
{
"epoch": 0.07,
"grad_norm": 4.251282215118408,
"learning_rate": 3.398058252427185e-06,
"loss": 1.1052,
"step": 350
},
{
"epoch": 0.07,
"grad_norm": 4.297124862670898,
"learning_rate": 3.4951456310679615e-06,
"loss": 1.112,
"step": 360
},
{
"epoch": 0.07,
"grad_norm": 4.4219465255737305,
"learning_rate": 3.592233009708738e-06,
"loss": 1.12,
"step": 370
},
{
"epoch": 0.07,
"grad_norm": 4.0326948165893555,
"learning_rate": 3.689320388349515e-06,
"loss": 1.1286,
"step": 380
},
{
"epoch": 0.08,
"grad_norm": 4.3179030418396,
"learning_rate": 3.7864077669902915e-06,
"loss": 1.1255,
"step": 390
},
{
"epoch": 0.08,
"grad_norm": 4.20508337020874,
"learning_rate": 3.883495145631068e-06,
"loss": 1.1116,
"step": 400
},
{
"epoch": 0.08,
"grad_norm": 4.361403942108154,
"learning_rate": 3.980582524271845e-06,
"loss": 1.1061,
"step": 410
},
{
"epoch": 0.08,
"grad_norm": 4.279186248779297,
"learning_rate": 4.0776699029126215e-06,
"loss": 1.1031,
"step": 420
},
{
"epoch": 0.08,
"grad_norm": 4.163991928100586,
"learning_rate": 4.1747572815533986e-06,
"loss": 1.1097,
"step": 430
},
{
"epoch": 0.09,
"grad_norm": 4.068342685699463,
"learning_rate": 4.271844660194175e-06,
"loss": 1.1048,
"step": 440
},
{
"epoch": 0.09,
"grad_norm": 4.233316421508789,
"learning_rate": 4.368932038834952e-06,
"loss": 1.11,
"step": 450
},
{
"epoch": 0.09,
"grad_norm": 4.018556594848633,
"learning_rate": 4.466019417475729e-06,
"loss": 1.1,
"step": 460
},
{
"epoch": 0.09,
"grad_norm": 4.257734298706055,
"learning_rate": 4.563106796116505e-06,
"loss": 1.1102,
"step": 470
},
{
"epoch": 0.09,
"grad_norm": 4.20249605178833,
"learning_rate": 4.660194174757282e-06,
"loss": 1.0938,
"step": 480
},
{
"epoch": 0.1,
"grad_norm": 4.196409225463867,
"learning_rate": 4.7572815533980585e-06,
"loss": 1.1153,
"step": 490
},
{
"epoch": 0.1,
"grad_norm": 3.7823269367218018,
"learning_rate": 4.854368932038836e-06,
"loss": 1.105,
"step": 500
},
{
"epoch": 0.1,
"grad_norm": 3.8932242393493652,
"learning_rate": 4.951456310679612e-06,
"loss": 1.0977,
"step": 510
},
{
"epoch": 0.1,
"grad_norm": 4.169296741485596,
"learning_rate": 5.048543689320389e-06,
"loss": 1.1022,
"step": 520
},
{
"epoch": 0.1,
"grad_norm": 4.110784530639648,
"learning_rate": 5.145631067961165e-06,
"loss": 1.0954,
"step": 530
},
{
"epoch": 0.11,
"grad_norm": 4.476339340209961,
"learning_rate": 5.242718446601942e-06,
"loss": 1.104,
"step": 540
},
{
"epoch": 0.11,
"grad_norm": 3.7383415699005127,
"learning_rate": 5.3398058252427185e-06,
"loss": 1.106,
"step": 550
},
{
"epoch": 0.11,
"grad_norm": 4.040097236633301,
"learning_rate": 5.436893203883496e-06,
"loss": 1.1006,
"step": 560
},
{
"epoch": 0.11,
"grad_norm": 4.430188179016113,
"learning_rate": 5.533980582524272e-06,
"loss": 1.1087,
"step": 570
},
{
"epoch": 0.11,
"grad_norm": 4.724983215332031,
"learning_rate": 5.631067961165049e-06,
"loss": 1.1004,
"step": 580
},
{
"epoch": 0.12,
"grad_norm": 4.040498733520508,
"learning_rate": 5.728155339805825e-06,
"loss": 1.0944,
"step": 590
},
{
"epoch": 0.12,
"grad_norm": 4.504246234893799,
"learning_rate": 5.825242718446602e-06,
"loss": 1.0915,
"step": 600
},
{
"epoch": 0.12,
"grad_norm": 4.28185510635376,
"learning_rate": 5.9223300970873785e-06,
"loss": 1.0951,
"step": 610
},
{
"epoch": 0.12,
"grad_norm": 3.7988641262054443,
"learning_rate": 6.0194174757281556e-06,
"loss": 1.1052,
"step": 620
},
{
"epoch": 0.12,
"grad_norm": 4.40338134765625,
"learning_rate": 6.116504854368932e-06,
"loss": 1.0931,
"step": 630
},
{
"epoch": 0.13,
"grad_norm": 4.3256659507751465,
"learning_rate": 6.213592233009709e-06,
"loss": 1.0863,
"step": 640
},
{
"epoch": 0.13,
"grad_norm": 4.146902561187744,
"learning_rate": 6.310679611650487e-06,
"loss": 1.0916,
"step": 650
},
{
"epoch": 0.13,
"grad_norm": 4.871020317077637,
"learning_rate": 6.407766990291263e-06,
"loss": 1.0853,
"step": 660
},
{
"epoch": 0.13,
"grad_norm": 3.6339454650878906,
"learning_rate": 6.50485436893204e-06,
"loss": 1.0907,
"step": 670
},
{
"epoch": 0.13,
"grad_norm": 4.507898330688477,
"learning_rate": 6.601941747572816e-06,
"loss": 1.1049,
"step": 680
},
{
"epoch": 0.14,
"grad_norm": 4.15292501449585,
"learning_rate": 6.6990291262135935e-06,
"loss": 1.0928,
"step": 690
},
{
"epoch": 0.14,
"grad_norm": 4.081875324249268,
"learning_rate": 6.79611650485437e-06,
"loss": 1.1104,
"step": 700
},
{
"epoch": 0.14,
"grad_norm": 3.9496753215789795,
"learning_rate": 6.893203883495147e-06,
"loss": 1.0916,
"step": 710
},
{
"epoch": 0.14,
"grad_norm": 4.676784992218018,
"learning_rate": 6.990291262135923e-06,
"loss": 1.087,
"step": 720
},
{
"epoch": 0.14,
"grad_norm": 3.889897584915161,
"learning_rate": 7.0873786407767e-06,
"loss": 1.0889,
"step": 730
},
{
"epoch": 0.15,
"grad_norm": 3.654224157333374,
"learning_rate": 7.184466019417476e-06,
"loss": 1.0929,
"step": 740
},
{
"epoch": 0.15,
"grad_norm": 5.457578182220459,
"learning_rate": 7.2815533980582534e-06,
"loss": 1.0855,
"step": 750
},
{
"epoch": 0.15,
"grad_norm": 4.403715133666992,
"learning_rate": 7.37864077669903e-06,
"loss": 1.0793,
"step": 760
},
{
"epoch": 0.15,
"grad_norm": 4.498962879180908,
"learning_rate": 7.475728155339807e-06,
"loss": 1.0946,
"step": 770
},
{
"epoch": 0.15,
"grad_norm": 4.233016014099121,
"learning_rate": 7.572815533980583e-06,
"loss": 1.0753,
"step": 780
},
{
"epoch": 0.15,
"grad_norm": 4.361231327056885,
"learning_rate": 7.66990291262136e-06,
"loss": 1.0865,
"step": 790
},
{
"epoch": 0.16,
"grad_norm": 4.116058349609375,
"learning_rate": 7.766990291262136e-06,
"loss": 1.0874,
"step": 800
},
{
"epoch": 0.16,
"grad_norm": 3.87738037109375,
"learning_rate": 7.864077669902913e-06,
"loss": 1.0897,
"step": 810
},
{
"epoch": 0.16,
"grad_norm": 4.444067478179932,
"learning_rate": 7.96116504854369e-06,
"loss": 1.0926,
"step": 820
},
{
"epoch": 0.16,
"grad_norm": 3.9490249156951904,
"learning_rate": 8.058252427184466e-06,
"loss": 1.0678,
"step": 830
},
{
"epoch": 0.16,
"grad_norm": 5.674923896789551,
"learning_rate": 8.155339805825243e-06,
"loss": 1.0923,
"step": 840
},
{
"epoch": 0.17,
"grad_norm": 4.707766056060791,
"learning_rate": 8.25242718446602e-06,
"loss": 1.0874,
"step": 850
},
{
"epoch": 0.17,
"grad_norm": 4.693995475769043,
"learning_rate": 8.349514563106797e-06,
"loss": 1.0558,
"step": 860
},
{
"epoch": 0.17,
"grad_norm": 4.561864376068115,
"learning_rate": 8.446601941747573e-06,
"loss": 1.0961,
"step": 870
},
{
"epoch": 0.17,
"grad_norm": 4.7990922927856445,
"learning_rate": 8.54368932038835e-06,
"loss": 1.0877,
"step": 880
},
{
"epoch": 0.17,
"grad_norm": 5.014951705932617,
"learning_rate": 8.640776699029127e-06,
"loss": 1.0866,
"step": 890
},
{
"epoch": 0.18,
"grad_norm": 5.007348537445068,
"learning_rate": 8.737864077669904e-06,
"loss": 1.0795,
"step": 900
},
{
"epoch": 0.18,
"grad_norm": 4.876419544219971,
"learning_rate": 8.834951456310681e-06,
"loss": 1.0826,
"step": 910
},
{
"epoch": 0.18,
"grad_norm": 5.8001556396484375,
"learning_rate": 8.932038834951458e-06,
"loss": 1.0828,
"step": 920
},
{
"epoch": 0.18,
"grad_norm": 4.761294364929199,
"learning_rate": 9.029126213592233e-06,
"loss": 1.0897,
"step": 930
},
{
"epoch": 0.18,
"grad_norm": 4.832348346710205,
"learning_rate": 9.12621359223301e-06,
"loss": 1.0826,
"step": 940
},
{
"epoch": 0.19,
"grad_norm": 5.325027942657471,
"learning_rate": 9.223300970873788e-06,
"loss": 1.0705,
"step": 950
},
{
"epoch": 0.19,
"grad_norm": 3.859924554824829,
"learning_rate": 9.320388349514565e-06,
"loss": 1.0691,
"step": 960
},
{
"epoch": 0.19,
"grad_norm": 3.8470637798309326,
"learning_rate": 9.41747572815534e-06,
"loss": 1.0808,
"step": 970
},
{
"epoch": 0.19,
"grad_norm": 4.887052536010742,
"learning_rate": 9.514563106796117e-06,
"loss": 1.0688,
"step": 980
},
{
"epoch": 0.19,
"grad_norm": 3.891918897628784,
"learning_rate": 9.611650485436894e-06,
"loss": 1.0748,
"step": 990
},
{
"epoch": 0.2,
"grad_norm": 4.252170562744141,
"learning_rate": 9.708737864077671e-06,
"loss": 1.0889,
"step": 1000
},
{
"epoch": 0.2,
"eval_loss": 1.0763322114944458,
"eval_runtime": 12.4536,
"eval_samples_per_second": 52.515,
"eval_steps_per_second": 6.584,
"step": 1000
},
{
"epoch": 0.2,
"grad_norm": 4.76177453994751,
"learning_rate": 9.805825242718447e-06,
"loss": 1.0925,
"step": 1010
},
{
"epoch": 0.2,
"grad_norm": 5.820517063140869,
"learning_rate": 9.902912621359224e-06,
"loss": 1.0879,
"step": 1020
},
{
"epoch": 0.2,
"grad_norm": 4.241539478302002,
"learning_rate": 1e-05,
"loss": 1.0688,
"step": 1030
},
{
"epoch": 0.2,
"grad_norm": 4.468598365783691,
"learning_rate": 9.999971286914108e-06,
"loss": 1.0782,
"step": 1040
},
{
"epoch": 0.21,
"grad_norm": 4.580018043518066,
"learning_rate": 9.999885147986207e-06,
"loss": 1.0793,
"step": 1050
},
{
"epoch": 0.21,
"grad_norm": 5.351871490478516,
"learning_rate": 9.999741584205621e-06,
"loss": 1.0746,
"step": 1060
},
{
"epoch": 0.21,
"grad_norm": 4.939090251922607,
"learning_rate": 9.999540597221217e-06,
"loss": 1.0814,
"step": 1070
},
{
"epoch": 0.21,
"grad_norm": 4.497158050537109,
"learning_rate": 9.999282189341374e-06,
"loss": 1.076,
"step": 1080
},
{
"epoch": 0.21,
"grad_norm": 3.999101161956787,
"learning_rate": 9.998966363533972e-06,
"loss": 1.0826,
"step": 1090
},
{
"epoch": 0.22,
"grad_norm": 5.041301727294922,
"learning_rate": 9.99859312342634e-06,
"loss": 1.0675,
"step": 1100
},
{
"epoch": 0.22,
"grad_norm": 4.890156269073486,
"learning_rate": 9.998162473305229e-06,
"loss": 1.0724,
"step": 1110
},
{
"epoch": 0.22,
"grad_norm": 6.730093955993652,
"learning_rate": 9.997674418116759e-06,
"loss": 1.0679,
"step": 1120
},
{
"epoch": 0.22,
"grad_norm": 5.929831504821777,
"learning_rate": 9.997128963466355e-06,
"loss": 1.064,
"step": 1130
},
{
"epoch": 0.22,
"grad_norm": 6.288021087646484,
"learning_rate": 9.996526115618694e-06,
"loss": 1.0895,
"step": 1140
},
{
"epoch": 0.23,
"grad_norm": 4.647055149078369,
"learning_rate": 9.995865881497621e-06,
"loss": 1.0768,
"step": 1150
},
{
"epoch": 0.23,
"grad_norm": 5.861464977264404,
"learning_rate": 9.995148268686086e-06,
"loss": 1.073,
"step": 1160
},
{
"epoch": 0.23,
"grad_norm": 5.072396278381348,
"learning_rate": 9.994373285426034e-06,
"loss": 1.0679,
"step": 1170
},
{
"epoch": 0.23,
"grad_norm": 4.59302282333374,
"learning_rate": 9.993540940618334e-06,
"loss": 1.0999,
"step": 1180
},
{
"epoch": 0.23,
"grad_norm": 4.47246789932251,
"learning_rate": 9.992651243822658e-06,
"loss": 1.0797,
"step": 1190
},
{
"epoch": 0.24,
"grad_norm": 3.917146682739258,
"learning_rate": 9.991704205257383e-06,
"loss": 1.0786,
"step": 1200
},
{
"epoch": 0.24,
"grad_norm": 5.097021102905273,
"learning_rate": 9.99069983579947e-06,
"loss": 1.0709,
"step": 1210
},
{
"epoch": 0.24,
"grad_norm": 5.385958671569824,
"learning_rate": 9.989638146984337e-06,
"loss": 1.0878,
"step": 1220
},
{
"epoch": 0.24,
"grad_norm": 4.534526824951172,
"learning_rate": 9.988519151005728e-06,
"loss": 1.0753,
"step": 1230
},
{
"epoch": 0.24,
"grad_norm": 5.479306697845459,
"learning_rate": 9.987342860715575e-06,
"loss": 1.0638,
"step": 1240
},
{
"epoch": 0.25,
"grad_norm": 4.557966232299805,
"learning_rate": 9.986109289623848e-06,
"loss": 1.0841,
"step": 1250
},
{
"epoch": 0.25,
"grad_norm": 4.882152080535889,
"learning_rate": 9.984818451898399e-06,
"loss": 1.0678,
"step": 1260
},
{
"epoch": 0.25,
"grad_norm": 5.506185054779053,
"learning_rate": 9.983470362364803e-06,
"loss": 1.0766,
"step": 1270
},
{
"epoch": 0.25,
"grad_norm": 4.027829170227051,
"learning_rate": 9.982065036506183e-06,
"loss": 1.0825,
"step": 1280
},
{
"epoch": 0.25,
"grad_norm": 5.385190010070801,
"learning_rate": 9.980602490463037e-06,
"loss": 1.0709,
"step": 1290
},
{
"epoch": 0.25,
"grad_norm": 6.701911449432373,
"learning_rate": 9.979082741033047e-06,
"loss": 1.0813,
"step": 1300
},
{
"epoch": 0.26,
"grad_norm": 4.571600437164307,
"learning_rate": 9.977505805670895e-06,
"loss": 1.0678,
"step": 1310
},
{
"epoch": 0.26,
"grad_norm": 6.041151523590088,
"learning_rate": 9.97587170248805e-06,
"loss": 1.08,
"step": 1320
},
{
"epoch": 0.26,
"grad_norm": 5.1545729637146,
"learning_rate": 9.97418045025257e-06,
"loss": 1.071,
"step": 1330
},
{
"epoch": 0.26,
"grad_norm": 4.163790702819824,
"learning_rate": 9.972432068388885e-06,
"loss": 1.068,
"step": 1340
},
{
"epoch": 0.26,
"grad_norm": 5.824156284332275,
"learning_rate": 9.97062657697757e-06,
"loss": 1.0718,
"step": 1350
},
{
"epoch": 0.27,
"grad_norm": 6.520030975341797,
"learning_rate": 9.968763996755115e-06,
"loss": 1.0779,
"step": 1360
},
{
"epoch": 0.27,
"grad_norm": 4.418417453765869,
"learning_rate": 9.966844349113695e-06,
"loss": 1.0677,
"step": 1370
},
{
"epoch": 0.27,
"grad_norm": 4.918398380279541,
"learning_rate": 9.96486765610091e-06,
"loss": 1.0942,
"step": 1380
},
{
"epoch": 0.27,
"grad_norm": 5.05225944519043,
"learning_rate": 9.96283394041954e-06,
"loss": 1.0852,
"step": 1390
},
{
"epoch": 0.27,
"grad_norm": 4.492018699645996,
"learning_rate": 9.96074322542729e-06,
"loss": 1.0728,
"step": 1400
},
{
"epoch": 0.28,
"grad_norm": 4.9885711669921875,
"learning_rate": 9.958595535136511e-06,
"loss": 1.0618,
"step": 1410
},
{
"epoch": 0.28,
"grad_norm": 5.547979831695557,
"learning_rate": 9.95639089421393e-06,
"loss": 1.0753,
"step": 1420
},
{
"epoch": 0.28,
"grad_norm": 4.2231621742248535,
"learning_rate": 9.954129327980362e-06,
"loss": 1.0573,
"step": 1430
},
{
"epoch": 0.28,
"grad_norm": 4.681105613708496,
"learning_rate": 9.951810862410426e-06,
"loss": 1.0833,
"step": 1440
},
{
"epoch": 0.28,
"grad_norm": 5.339748382568359,
"learning_rate": 9.949435524132245e-06,
"loss": 1.0712,
"step": 1450
},
{
"epoch": 0.29,
"grad_norm": 4.844823837280273,
"learning_rate": 9.947003340427134e-06,
"loss": 1.0803,
"step": 1460
},
{
"epoch": 0.29,
"grad_norm": 3.774728536605835,
"learning_rate": 9.944514339229292e-06,
"loss": 1.0787,
"step": 1470
},
{
"epoch": 0.29,
"grad_norm": 4.437483310699463,
"learning_rate": 9.941968549125481e-06,
"loss": 1.0757,
"step": 1480
},
{
"epoch": 0.29,
"grad_norm": 5.675827503204346,
"learning_rate": 9.9393659993547e-06,
"loss": 1.0635,
"step": 1490
},
{
"epoch": 0.29,
"grad_norm": 5.193900108337402,
"learning_rate": 9.936706719807839e-06,
"loss": 1.0527,
"step": 1500
},
{
"epoch": 0.3,
"grad_norm": 6.871400833129883,
"learning_rate": 9.93399074102735e-06,
"loss": 1.0683,
"step": 1510
},
{
"epoch": 0.3,
"grad_norm": 6.098533630371094,
"learning_rate": 9.931218094206882e-06,
"loss": 1.0704,
"step": 1520
},
{
"epoch": 0.3,
"grad_norm": 7.03292989730835,
"learning_rate": 9.928388811190938e-06,
"loss": 1.0839,
"step": 1530
},
{
"epoch": 0.3,
"grad_norm": 4.196811199188232,
"learning_rate": 9.925502924474495e-06,
"loss": 1.0749,
"step": 1540
},
{
"epoch": 0.3,
"grad_norm": 5.939242362976074,
"learning_rate": 9.922560467202638e-06,
"loss": 1.0761,
"step": 1550
},
{
"epoch": 0.31,
"grad_norm": 5.359401702880859,
"learning_rate": 9.919561473170178e-06,
"loss": 1.0625,
"step": 1560
},
{
"epoch": 0.31,
"grad_norm": 5.799266338348389,
"learning_rate": 9.916505976821262e-06,
"loss": 1.0691,
"step": 1570
},
{
"epoch": 0.31,
"grad_norm": 4.341587543487549,
"learning_rate": 9.913394013248987e-06,
"loss": 1.0737,
"step": 1580
},
{
"epoch": 0.31,
"grad_norm": 5.106640815734863,
"learning_rate": 9.91022561819498e-06,
"loss": 1.0805,
"step": 1590
},
{
"epoch": 0.31,
"grad_norm": 4.987344741821289,
"learning_rate": 9.907000828049001e-06,
"loss": 1.0569,
"step": 1600
},
{
"epoch": 0.32,
"grad_norm": 5.510980606079102,
"learning_rate": 9.903719679848522e-06,
"loss": 1.078,
"step": 1610
},
{
"epoch": 0.32,
"grad_norm": 4.564126968383789,
"learning_rate": 9.9003822112783e-06,
"loss": 1.0726,
"step": 1620
},
{
"epoch": 0.32,
"grad_norm": 4.666530132293701,
"learning_rate": 9.89698846066994e-06,
"loss": 1.0635,
"step": 1630
},
{
"epoch": 0.32,
"grad_norm": 4.472270488739014,
"learning_rate": 9.893538467001466e-06,
"loss": 1.0529,
"step": 1640
},
{
"epoch": 0.32,
"grad_norm": 4.534265995025635,
"learning_rate": 9.890032269896862e-06,
"loss": 1.08,
"step": 1650
},
{
"epoch": 0.33,
"grad_norm": 4.677114963531494,
"learning_rate": 9.886469909625624e-06,
"loss": 1.0719,
"step": 1660
},
{
"epoch": 0.33,
"grad_norm": 4.84233283996582,
"learning_rate": 9.882851427102299e-06,
"loss": 1.0665,
"step": 1670
},
{
"epoch": 0.33,
"grad_norm": 4.839110374450684,
"learning_rate": 9.879176863885997e-06,
"loss": 1.0635,
"step": 1680
},
{
"epoch": 0.33,
"grad_norm": 4.322112560272217,
"learning_rate": 9.875446262179948e-06,
"loss": 1.0755,
"step": 1690
},
{
"epoch": 0.33,
"grad_norm": 4.2779130935668945,
"learning_rate": 9.87165966483098e-06,
"loss": 1.0745,
"step": 1700
},
{
"epoch": 0.34,
"grad_norm": 5.142566204071045,
"learning_rate": 9.867817115329055e-06,
"loss": 1.0725,
"step": 1710
},
{
"epoch": 0.34,
"grad_norm": 4.487438678741455,
"learning_rate": 9.863918657806752e-06,
"loss": 1.0538,
"step": 1720
},
{
"epoch": 0.34,
"grad_norm": 4.5258588790893555,
"learning_rate": 9.85996433703877e-06,
"loss": 1.0559,
"step": 1730
},
{
"epoch": 0.34,
"grad_norm": 4.927529811859131,
"learning_rate": 9.855954198441411e-06,
"loss": 1.0661,
"step": 1740
},
{
"epoch": 0.34,
"grad_norm": 3.9455795288085938,
"learning_rate": 9.851888288072053e-06,
"loss": 1.0769,
"step": 1750
},
{
"epoch": 0.34,
"grad_norm": 4.094160556793213,
"learning_rate": 9.847766652628635e-06,
"loss": 1.0767,
"step": 1760
},
{
"epoch": 0.35,
"grad_norm": 4.2093424797058105,
"learning_rate": 9.843589339449102e-06,
"loss": 1.0635,
"step": 1770
},
{
"epoch": 0.35,
"grad_norm": 4.292007923126221,
"learning_rate": 9.839356396510875e-06,
"loss": 1.0593,
"step": 1780
},
{
"epoch": 0.35,
"grad_norm": 4.241662502288818,
"learning_rate": 9.835067872430297e-06,
"loss": 1.0627,
"step": 1790
},
{
"epoch": 0.35,
"grad_norm": 3.465583324432373,
"learning_rate": 9.830723816462071e-06,
"loss": 1.0551,
"step": 1800
},
{
"epoch": 0.35,
"grad_norm": 4.395579814910889,
"learning_rate": 9.8263242784987e-06,
"loss": 1.0691,
"step": 1810
},
{
"epoch": 0.36,
"grad_norm": 4.91081428527832,
"learning_rate": 9.821869309069907e-06,
"loss": 1.0632,
"step": 1820
},
{
"epoch": 0.36,
"grad_norm": 3.9647531509399414,
"learning_rate": 9.817358959342057e-06,
"loss": 1.0693,
"step": 1830
},
{
"epoch": 0.36,
"grad_norm": 4.396968364715576,
"learning_rate": 9.81279328111758e-06,
"loss": 1.0869,
"step": 1840
},
{
"epoch": 0.36,
"grad_norm": 4.095502853393555,
"learning_rate": 9.808172326834356e-06,
"loss": 1.0636,
"step": 1850
},
{
"epoch": 0.36,
"grad_norm": 4.527934551239014,
"learning_rate": 9.80349614956513e-06,
"loss": 1.0679,
"step": 1860
},
{
"epoch": 0.37,
"grad_norm": 5.237788200378418,
"learning_rate": 9.798764803016892e-06,
"loss": 1.05,
"step": 1870
},
{
"epoch": 0.37,
"grad_norm": 4.45642614364624,
"learning_rate": 9.793978341530265e-06,
"loss": 1.0697,
"step": 1880
},
{
"epoch": 0.37,
"grad_norm": 3.550489664077759,
"learning_rate": 9.789136820078884e-06,
"loss": 1.079,
"step": 1890
},
{
"epoch": 0.37,
"grad_norm": 3.879551887512207,
"learning_rate": 9.784240294268756e-06,
"loss": 1.0649,
"step": 1900
},
{
"epoch": 0.37,
"grad_norm": 4.333283424377441,
"learning_rate": 9.779288820337628e-06,
"loss": 1.0668,
"step": 1910
},
{
"epoch": 0.38,
"grad_norm": 3.761474370956421,
"learning_rate": 9.774282455154338e-06,
"loss": 1.0738,
"step": 1920
},
{
"epoch": 0.38,
"grad_norm": 3.7375967502593994,
"learning_rate": 9.769221256218165e-06,
"loss": 1.0574,
"step": 1930
},
{
"epoch": 0.38,
"grad_norm": 3.852424144744873,
"learning_rate": 9.764105281658161e-06,
"loss": 1.0536,
"step": 1940
},
{
"epoch": 0.38,
"grad_norm": 4.345045566558838,
"learning_rate": 9.758934590232495e-06,
"loss": 1.0898,
"step": 1950
},
{
"epoch": 0.38,
"grad_norm": 4.355409622192383,
"learning_rate": 9.753709241327773e-06,
"loss": 1.0657,
"step": 1960
},
{
"epoch": 0.39,
"grad_norm": 4.160512924194336,
"learning_rate": 9.748429294958345e-06,
"loss": 1.0699,
"step": 1970
},
{
"epoch": 0.39,
"grad_norm": 4.259660243988037,
"learning_rate": 9.74309481176564e-06,
"loss": 1.0703,
"step": 1980
},
{
"epoch": 0.39,
"grad_norm": 4.260087013244629,
"learning_rate": 9.737705853017442e-06,
"loss": 1.0816,
"step": 1990
},
{
"epoch": 0.39,
"grad_norm": 4.026108264923096,
"learning_rate": 9.732262480607207e-06,
"loss": 1.0556,
"step": 2000
},
{
"epoch": 0.39,
"eval_loss": 1.0583994388580322,
"eval_runtime": 12.5254,
"eval_samples_per_second": 52.214,
"eval_steps_per_second": 6.547,
"step": 2000
},
{
"epoch": 0.39,
"grad_norm": 4.157490253448486,
"learning_rate": 9.726764757053343e-06,
"loss": 1.0735,
"step": 2010
},
{
"epoch": 0.4,
"grad_norm": 4.293674468994141,
"learning_rate": 9.721212745498493e-06,
"loss": 1.0697,
"step": 2020
},
{
"epoch": 0.4,
"grad_norm": 3.8046298027038574,
"learning_rate": 9.715606509708812e-06,
"loss": 1.0635,
"step": 2030
},
{
"epoch": 0.4,
"grad_norm": 3.55698561668396,
"learning_rate": 9.709946114073231e-06,
"loss": 1.0685,
"step": 2040
},
{
"epoch": 0.4,
"grad_norm": 3.9492263793945312,
"learning_rate": 9.704231623602721e-06,
"loss": 1.0692,
"step": 2050
},
{
"epoch": 0.4,
"grad_norm": 2.995659351348877,
"learning_rate": 9.698463103929542e-06,
"loss": 1.069,
"step": 2060
},
{
"epoch": 0.41,
"grad_norm": 4.452322483062744,
"learning_rate": 9.692640621306497e-06,
"loss": 1.0728,
"step": 2070
},
{
"epoch": 0.41,
"grad_norm": 4.006470680236816,
"learning_rate": 9.686764242606164e-06,
"loss": 1.0616,
"step": 2080
},
{
"epoch": 0.41,
"grad_norm": 4.041233539581299,
"learning_rate": 9.680834035320127e-06,
"loss": 1.0712,
"step": 2090
},
{
"epoch": 0.41,
"grad_norm": 3.9690816402435303,
"learning_rate": 9.674850067558209e-06,
"loss": 1.0682,
"step": 2100
},
{
"epoch": 0.41,
"grad_norm": 4.229337215423584,
"learning_rate": 9.66881240804768e-06,
"loss": 1.0734,
"step": 2110
},
{
"epoch": 0.42,
"grad_norm": 3.3140339851379395,
"learning_rate": 9.662721126132473e-06,
"loss": 1.0665,
"step": 2120
},
{
"epoch": 0.42,
"grad_norm": 4.23881721496582,
"learning_rate": 9.656576291772392e-06,
"loss": 1.0535,
"step": 2130
},
{
"epoch": 0.42,
"grad_norm": 3.9377248287200928,
"learning_rate": 9.650377975542298e-06,
"loss": 1.068,
"step": 2140
},
{
"epoch": 0.42,
"grad_norm": 3.9194915294647217,
"learning_rate": 9.644126248631306e-06,
"loss": 1.0803,
"step": 2150
},
{
"epoch": 0.42,
"grad_norm": 3.5396149158477783,
"learning_rate": 9.637821182841965e-06,
"loss": 1.0574,
"step": 2160
},
{
"epoch": 0.43,
"grad_norm": 4.14241361618042,
"learning_rate": 9.631462850589432e-06,
"loss": 1.0643,
"step": 2170
},
{
"epoch": 0.43,
"grad_norm": 3.7068986892700195,
"learning_rate": 9.625051324900645e-06,
"loss": 1.0519,
"step": 2180
},
{
"epoch": 0.43,
"grad_norm": 3.9636712074279785,
"learning_rate": 9.618586679413477e-06,
"loss": 1.054,
"step": 2190
},
{
"epoch": 0.43,
"grad_norm": 3.6051032543182373,
"learning_rate": 9.612068988375898e-06,
"loss": 1.0534,
"step": 2200
},
{
"epoch": 0.43,
"grad_norm": 3.969806671142578,
"learning_rate": 9.605498326645115e-06,
"loss": 1.0851,
"step": 2210
},
{
"epoch": 0.44,
"grad_norm": 4.08690071105957,
"learning_rate": 9.598874769686721e-06,
"loss": 1.0645,
"step": 2220
},
{
"epoch": 0.44,
"grad_norm": 3.7391505241394043,
"learning_rate": 9.592198393573816e-06,
"loss": 1.0693,
"step": 2230
},
{
"epoch": 0.44,
"grad_norm": 4.053625583648682,
"learning_rate": 9.585469274986148e-06,
"loss": 1.0682,
"step": 2240
},
{
"epoch": 0.44,
"grad_norm": 3.923471212387085,
"learning_rate": 9.578687491209219e-06,
"loss": 1.0606,
"step": 2250
},
{
"epoch": 0.44,
"grad_norm": 4.096630573272705,
"learning_rate": 9.571853120133406e-06,
"loss": 1.064,
"step": 2260
},
{
"epoch": 0.44,
"grad_norm": 3.9412453174591064,
"learning_rate": 9.564966240253062e-06,
"loss": 1.0786,
"step": 2270
},
{
"epoch": 0.45,
"grad_norm": 3.6600263118743896,
"learning_rate": 9.558026930665614e-06,
"loss": 1.0622,
"step": 2280
},
{
"epoch": 0.45,
"grad_norm": 4.281722545623779,
"learning_rate": 9.551035271070665e-06,
"loss": 1.0542,
"step": 2290
},
{
"epoch": 0.45,
"grad_norm": 3.6689794063568115,
"learning_rate": 9.543991341769057e-06,
"loss": 1.0496,
"step": 2300
},
{
"epoch": 0.45,
"grad_norm": 4.148682594299316,
"learning_rate": 9.536895223661975e-06,
"loss": 1.0648,
"step": 2310
},
{
"epoch": 0.45,
"grad_norm": 4.112921237945557,
"learning_rate": 9.529746998249994e-06,
"loss": 1.0632,
"step": 2320
},
{
"epoch": 0.46,
"grad_norm": 4.144115447998047,
"learning_rate": 9.52254674763216e-06,
"loss": 1.0555,
"step": 2330
},
{
"epoch": 0.46,
"grad_norm": 4.198709011077881,
"learning_rate": 9.515294554505039e-06,
"loss": 1.049,
"step": 2340
},
{
"epoch": 0.46,
"grad_norm": 3.7727572917938232,
"learning_rate": 9.507990502161769e-06,
"loss": 1.0428,
"step": 2350
},
{
"epoch": 0.46,
"grad_norm": 4.66112756729126,
"learning_rate": 9.500634674491099e-06,
"loss": 1.0666,
"step": 2360
},
{
"epoch": 0.46,
"grad_norm": 4.313396453857422,
"learning_rate": 9.49322715597644e-06,
"loss": 1.0658,
"step": 2370
},
{
"epoch": 0.47,
"grad_norm": 3.8979835510253906,
"learning_rate": 9.485768031694872e-06,
"loss": 1.0516,
"step": 2380
},
{
"epoch": 0.47,
"grad_norm": 4.047280788421631,
"learning_rate": 9.478257387316189e-06,
"loss": 1.0708,
"step": 2390
},
{
"epoch": 0.47,
"grad_norm": 3.9112637042999268,
"learning_rate": 9.470695309101903e-06,
"loss": 1.0576,
"step": 2400
},
{
"epoch": 0.47,
"grad_norm": 3.9057672023773193,
"learning_rate": 9.463081883904251e-06,
"loss": 1.0653,
"step": 2410
},
{
"epoch": 0.47,
"grad_norm": 4.1275105476379395,
"learning_rate": 9.455417199165209e-06,
"loss": 1.0454,
"step": 2420
},
{
"epoch": 0.48,
"grad_norm": 3.834106922149658,
"learning_rate": 9.447701342915473e-06,
"loss": 1.0593,
"step": 2430
},
{
"epoch": 0.48,
"grad_norm": 3.673773765563965,
"learning_rate": 9.439934403773468e-06,
"loss": 1.0543,
"step": 2440
},
{
"epoch": 0.48,
"grad_norm": 4.011989593505859,
"learning_rate": 9.4321164709443e-06,
"loss": 1.0468,
"step": 2450
},
{
"epoch": 0.48,
"grad_norm": 4.221887588500977,
"learning_rate": 9.42424763421877e-06,
"loss": 1.0699,
"step": 2460
},
{
"epoch": 0.48,
"grad_norm": 3.603546142578125,
"learning_rate": 9.416327983972304e-06,
"loss": 1.0525,
"step": 2470
},
{
"epoch": 0.49,
"grad_norm": 4.173734188079834,
"learning_rate": 9.408357611163945e-06,
"loss": 1.0678,
"step": 2480
},
{
"epoch": 0.49,
"grad_norm": 3.5241010189056396,
"learning_rate": 9.400336607335294e-06,
"loss": 1.0536,
"step": 2490
},
{
"epoch": 0.49,
"grad_norm": 3.8831429481506348,
"learning_rate": 9.392265064609455e-06,
"loss": 1.0367,
"step": 2500
},
{
"epoch": 0.49,
"grad_norm": 3.8573215007781982,
"learning_rate": 9.384143075689992e-06,
"loss": 1.0474,
"step": 2510
},
{
"epoch": 0.49,
"grad_norm": 3.8050475120544434,
"learning_rate": 9.375970733859848e-06,
"loss": 1.0508,
"step": 2520
},
{
"epoch": 0.5,
"grad_norm": 4.164061546325684,
"learning_rate": 9.367748132980286e-06,
"loss": 1.0629,
"step": 2530
},
{
"epoch": 0.5,
"grad_norm": 3.8858516216278076,
"learning_rate": 9.359475367489805e-06,
"loss": 1.0616,
"step": 2540
},
{
"epoch": 0.5,
"grad_norm": 4.033576965332031,
"learning_rate": 9.351152532403054e-06,
"loss": 1.0687,
"step": 2550
},
{
"epoch": 0.5,
"grad_norm": 4.06900691986084,
"learning_rate": 9.342779723309746e-06,
"loss": 1.0519,
"step": 2560
},
{
"epoch": 0.5,
"grad_norm": 3.8570592403411865,
"learning_rate": 9.334357036373552e-06,
"loss": 1.0482,
"step": 2570
},
{
"epoch": 0.51,
"grad_norm": 4.30549430847168,
"learning_rate": 9.32588456833101e-06,
"loss": 1.0714,
"step": 2580
},
{
"epoch": 0.51,
"grad_norm": 3.6463634967803955,
"learning_rate": 9.317362416490396e-06,
"loss": 1.055,
"step": 2590
},
{
"epoch": 0.51,
"grad_norm": 4.366295337677002,
"learning_rate": 9.308790678730627e-06,
"loss": 1.0502,
"step": 2600
},
{
"epoch": 0.51,
"grad_norm": 3.8573434352874756,
"learning_rate": 9.300169453500117e-06,
"loss": 1.0597,
"step": 2610
},
{
"epoch": 0.51,
"grad_norm": 3.8393239974975586,
"learning_rate": 9.291498839815658e-06,
"loss": 1.0553,
"step": 2620
},
{
"epoch": 0.52,
"grad_norm": 4.246349334716797,
"learning_rate": 9.282778937261279e-06,
"loss": 1.0734,
"step": 2630
},
{
"epoch": 0.52,
"grad_norm": 4.076491355895996,
"learning_rate": 9.274009845987106e-06,
"loss": 1.0643,
"step": 2640
},
{
"epoch": 0.52,
"grad_norm": 3.7121007442474365,
"learning_rate": 9.26519166670821e-06,
"loss": 1.0491,
"step": 2650
},
{
"epoch": 0.52,
"grad_norm": 3.8223936557769775,
"learning_rate": 9.256324500703439e-06,
"loss": 1.0713,
"step": 2660
},
{
"epoch": 0.52,
"grad_norm": 3.4132473468780518,
"learning_rate": 9.247408449814281e-06,
"loss": 1.0541,
"step": 2670
},
{
"epoch": 0.53,
"grad_norm": 3.846742868423462,
"learning_rate": 9.238443616443666e-06,
"loss": 1.0573,
"step": 2680
},
{
"epoch": 0.53,
"grad_norm": 4.1583123207092285,
"learning_rate": 9.229430103554808e-06,
"loss": 1.038,
"step": 2690
},
{
"epoch": 0.53,
"grad_norm": 4.192993640899658,
"learning_rate": 9.22036801467001e-06,
"loss": 1.0645,
"step": 2700
},
{
"epoch": 0.53,
"grad_norm": 4.137371063232422,
"learning_rate": 9.211257453869495e-06,
"loss": 1.058,
"step": 2710
},
{
"epoch": 0.53,
"grad_norm": 4.091256618499756,
"learning_rate": 9.202098525790182e-06,
"loss": 1.0702,
"step": 2720
},
{
"epoch": 0.54,
"grad_norm": 3.9540719985961914,
"learning_rate": 9.192891335624508e-06,
"loss": 1.0406,
"step": 2730
},
{
"epoch": 0.54,
"grad_norm": 4.149806022644043,
"learning_rate": 9.183635989119211e-06,
"loss": 1.0558,
"step": 2740
},
{
"epoch": 0.54,
"grad_norm": 3.950911045074463,
"learning_rate": 9.174332592574115e-06,
"loss": 1.0446,
"step": 2750
},
{
"epoch": 0.54,
"grad_norm": 3.7937111854553223,
"learning_rate": 9.164981252840908e-06,
"loss": 1.0608,
"step": 2760
},
{
"epoch": 0.54,
"grad_norm": 4.06609582901001,
"learning_rate": 9.155582077321918e-06,
"loss": 1.0653,
"step": 2770
},
{
"epoch": 0.54,
"grad_norm": 4.201600074768066,
"learning_rate": 9.146135173968881e-06,
"loss": 1.0651,
"step": 2780
},
{
"epoch": 0.55,
"grad_norm": 3.8813118934631348,
"learning_rate": 9.136640651281694e-06,
"loss": 1.0567,
"step": 2790
},
{
"epoch": 0.55,
"grad_norm": 3.4754064083099365,
"learning_rate": 9.127098618307177e-06,
"loss": 1.0632,
"step": 2800
},
{
"epoch": 0.55,
"grad_norm": 3.5580170154571533,
"learning_rate": 9.117509184637814e-06,
"loss": 1.057,
"step": 2810
},
{
"epoch": 0.55,
"grad_norm": 3.7533814907073975,
"learning_rate": 9.107872460410496e-06,
"loss": 1.0398,
"step": 2820
},
{
"epoch": 0.55,
"grad_norm": 4.086584091186523,
"learning_rate": 9.098188556305262e-06,
"loss": 1.0633,
"step": 2830
},
{
"epoch": 0.56,
"grad_norm": 4.2910237312316895,
"learning_rate": 9.088457583544022e-06,
"loss": 1.0334,
"step": 2840
},
{
"epoch": 0.56,
"grad_norm": 4.132745742797852,
"learning_rate": 9.078679653889273e-06,
"loss": 1.0595,
"step": 2850
},
{
"epoch": 0.56,
"grad_norm": 3.9243948459625244,
"learning_rate": 9.068854879642833e-06,
"loss": 1.0641,
"step": 2860
},
{
"epoch": 0.56,
"grad_norm": 3.943058490753174,
"learning_rate": 9.058983373644532e-06,
"loss": 1.0493,
"step": 2870
},
{
"epoch": 0.56,
"grad_norm": 3.724886417388916,
"learning_rate": 9.049065249270936e-06,
"loss": 1.0374,
"step": 2880
},
{
"epoch": 0.57,
"grad_norm": 3.8636670112609863,
"learning_rate": 9.039100620434025e-06,
"loss": 1.0634,
"step": 2890
},
{
"epoch": 0.57,
"grad_norm": 3.841193675994873,
"learning_rate": 9.029089601579895e-06,
"loss": 1.0433,
"step": 2900
},
{
"epoch": 0.57,
"grad_norm": 3.6147212982177734,
"learning_rate": 9.019032307687446e-06,
"loss": 1.0416,
"step": 2910
},
{
"epoch": 0.57,
"grad_norm": 3.8550570011138916,
"learning_rate": 9.008928854267054e-06,
"loss": 1.064,
"step": 2920
},
{
"epoch": 0.57,
"grad_norm": 3.5188698768615723,
"learning_rate": 8.99877935735925e-06,
"loss": 1.0472,
"step": 2930
},
{
"epoch": 0.58,
"grad_norm": 4.188703536987305,
"learning_rate": 8.988583933533384e-06,
"loss": 1.0688,
"step": 2940
},
{
"epoch": 0.58,
"grad_norm": 3.8695075511932373,
"learning_rate": 8.978342699886289e-06,
"loss": 1.0391,
"step": 2950
},
{
"epoch": 0.58,
"grad_norm": 3.8634023666381836,
"learning_rate": 8.968055774040932e-06,
"loss": 1.0422,
"step": 2960
},
{
"epoch": 0.58,
"grad_norm": 4.071281909942627,
"learning_rate": 8.95772327414507e-06,
"loss": 1.0442,
"step": 2970
},
{
"epoch": 0.58,
"grad_norm": 4.14091157913208,
"learning_rate": 8.947345318869883e-06,
"loss": 1.0541,
"step": 2980
},
{
"epoch": 0.59,
"grad_norm": 4.287483215332031,
"learning_rate": 8.936922027408618e-06,
"loss": 1.0391,
"step": 2990
},
{
"epoch": 0.59,
"grad_norm": 4.207295894622803,
"learning_rate": 8.926453519475225e-06,
"loss": 1.0455,
"step": 3000
},
{
"epoch": 0.59,
"eval_loss": 1.046447992324829,
"eval_runtime": 12.4637,
"eval_samples_per_second": 52.472,
"eval_steps_per_second": 6.579,
"step": 3000
},
{
"epoch": 0.59,
"grad_norm": 4.189324855804443,
"learning_rate": 8.91593991530297e-06,
"loss": 1.0559,
"step": 3010
},
{
"epoch": 0.59,
"grad_norm": 4.109393119812012,
"learning_rate": 8.905381335643056e-06,
"loss": 1.0524,
"step": 3020
},
{
"epoch": 0.59,
"grad_norm": 3.9047772884368896,
"learning_rate": 8.89477790176325e-06,
"loss": 1.059,
"step": 3030
},
{
"epoch": 0.6,
"grad_norm": 4.3157196044921875,
"learning_rate": 8.884129735446471e-06,
"loss": 1.0494,
"step": 3040
},
{
"epoch": 0.6,
"grad_norm": 4.147355079650879,
"learning_rate": 8.873436958989409e-06,
"loss": 1.0517,
"step": 3050
},
{
"epoch": 0.6,
"grad_norm": 4.453030586242676,
"learning_rate": 8.862699695201107e-06,
"loss": 1.0538,
"step": 3060
},
{
"epoch": 0.6,
"grad_norm": 4.24041223526001,
"learning_rate": 8.851918067401552e-06,
"loss": 1.0425,
"step": 3070
},
{
"epoch": 0.6,
"grad_norm": 3.6763038635253906,
"learning_rate": 8.84109219942027e-06,
"loss": 1.0558,
"step": 3080
},
{
"epoch": 0.61,
"grad_norm": 3.736586093902588,
"learning_rate": 8.83022221559489e-06,
"loss": 1.0589,
"step": 3090
},
{
"epoch": 0.61,
"grad_norm": 3.622150182723999,
"learning_rate": 8.819308240769726e-06,
"loss": 1.0428,
"step": 3100
},
{
"epoch": 0.61,
"grad_norm": 3.96793794631958,
"learning_rate": 8.808350400294332e-06,
"loss": 1.0245,
"step": 3110
},
{
"epoch": 0.61,
"grad_norm": 3.7367801666259766,
"learning_rate": 8.797348820022079e-06,
"loss": 1.0551,
"step": 3120
},
{
"epoch": 0.61,
"grad_norm": 3.7123868465423584,
"learning_rate": 8.78630362630869e-06,
"loss": 1.0381,
"step": 3130
},
{
"epoch": 0.62,
"grad_norm": 3.651548385620117,
"learning_rate": 8.775214946010806e-06,
"loss": 1.0428,
"step": 3140
},
{
"epoch": 0.62,
"grad_norm": 4.124800205230713,
"learning_rate": 8.764082906484518e-06,
"loss": 1.0638,
"step": 3150
},
{
"epoch": 0.62,
"grad_norm": 3.599874496459961,
"learning_rate": 8.752907635583911e-06,
"loss": 1.0441,
"step": 3160
},
{
"epoch": 0.62,
"grad_norm": 3.769707441329956,
"learning_rate": 8.74168926165959e-06,
"loss": 1.0526,
"step": 3170
},
{
"epoch": 0.62,
"grad_norm": 4.018752098083496,
"learning_rate": 8.730427913557205e-06,
"loss": 1.0672,
"step": 3180
},
{
"epoch": 0.63,
"grad_norm": 3.963313341140747,
"learning_rate": 8.71912372061598e-06,
"loss": 1.0606,
"step": 3190
},
{
"epoch": 0.63,
"grad_norm": 4.098948001861572,
"learning_rate": 8.707776812667224e-06,
"loss": 1.0383,
"step": 3200
},
{
"epoch": 0.63,
"grad_norm": 3.441176652908325,
"learning_rate": 8.696387320032827e-06,
"loss": 1.0629,
"step": 3210
},
{
"epoch": 0.63,
"grad_norm": 3.6925058364868164,
"learning_rate": 8.684955373523787e-06,
"loss": 1.0555,
"step": 3220
},
{
"epoch": 0.63,
"grad_norm": 3.5602104663848877,
"learning_rate": 8.673481104438685e-06,
"loss": 1.0421,
"step": 3230
},
{
"epoch": 0.64,
"grad_norm": 4.177275657653809,
"learning_rate": 8.661964644562194e-06,
"loss": 1.0504,
"step": 3240
},
{
"epoch": 0.64,
"grad_norm": 3.9053499698638916,
"learning_rate": 8.650406126163553e-06,
"loss": 1.0508,
"step": 3250
},
{
"epoch": 0.64,
"grad_norm": 3.4393839836120605,
"learning_rate": 8.638805681995052e-06,
"loss": 1.0375,
"step": 3260
},
{
"epoch": 0.64,
"grad_norm": 3.890512228012085,
"learning_rate": 8.627163445290514e-06,
"loss": 1.0453,
"step": 3270
},
{
"epoch": 0.64,
"grad_norm": 4.122755527496338,
"learning_rate": 8.615479549763756e-06,
"loss": 1.0427,
"step": 3280
},
{
"epoch": 0.64,
"grad_norm": 3.3840408325195312,
"learning_rate": 8.603754129607055e-06,
"loss": 1.0454,
"step": 3290
},
{
"epoch": 0.65,
"grad_norm": 4.196717262268066,
"learning_rate": 8.591987319489612e-06,
"loss": 1.0594,
"step": 3300
},
{
"epoch": 0.65,
"grad_norm": 3.9698941707611084,
"learning_rate": 8.580179254555997e-06,
"loss": 1.0431,
"step": 3310
},
{
"epoch": 0.65,
"grad_norm": 3.883592128753662,
"learning_rate": 8.5683300704246e-06,
"loss": 1.0257,
"step": 3320
},
{
"epoch": 0.65,
"grad_norm": 3.783325672149658,
"learning_rate": 8.556439903186082e-06,
"loss": 1.0445,
"step": 3330
},
{
"epoch": 0.65,
"grad_norm": 4.042956352233887,
"learning_rate": 8.544508889401799e-06,
"loss": 1.0507,
"step": 3340
},
{
"epoch": 0.66,
"grad_norm": 3.7064452171325684,
"learning_rate": 8.53253716610224e-06,
"loss": 1.0604,
"step": 3350
},
{
"epoch": 0.66,
"grad_norm": 3.835759401321411,
"learning_rate": 8.520524870785453e-06,
"loss": 1.0526,
"step": 3360
},
{
"epoch": 0.66,
"grad_norm": 3.9541969299316406,
"learning_rate": 8.508472141415468e-06,
"loss": 1.0365,
"step": 3370
},
{
"epoch": 0.66,
"grad_norm": 4.182323932647705,
"learning_rate": 8.4963791164207e-06,
"loss": 1.0292,
"step": 3380
},
{
"epoch": 0.66,
"grad_norm": 4.205846309661865,
"learning_rate": 8.484245934692379e-06,
"loss": 1.0236,
"step": 3390
},
{
"epoch": 0.67,
"grad_norm": 3.628838300704956,
"learning_rate": 8.472072735582942e-06,
"loss": 1.0457,
"step": 3400
},
{
"epoch": 0.67,
"grad_norm": 3.7961418628692627,
"learning_rate": 8.45985965890443e-06,
"loss": 1.0491,
"step": 3410
},
{
"epoch": 0.67,
"grad_norm": 3.6906325817108154,
"learning_rate": 8.447606844926895e-06,
"loss": 1.0315,
"step": 3420
},
{
"epoch": 0.67,
"grad_norm": 4.307608604431152,
"learning_rate": 8.435314434376773e-06,
"loss": 1.0498,
"step": 3430
},
{
"epoch": 0.67,
"grad_norm": 3.8979074954986572,
"learning_rate": 8.422982568435283e-06,
"loss": 1.0637,
"step": 3440
},
{
"epoch": 0.68,
"grad_norm": 4.03852653503418,
"learning_rate": 8.410611388736793e-06,
"loss": 1.06,
"step": 3450
},
{
"epoch": 0.68,
"grad_norm": 3.238548994064331,
"learning_rate": 8.398201037367202e-06,
"loss": 1.0385,
"step": 3460
},
{
"epoch": 0.68,
"grad_norm": 4.223562240600586,
"learning_rate": 8.385751656862305e-06,
"loss": 1.039,
"step": 3470
},
{
"epoch": 0.68,
"grad_norm": 3.7112879753112793,
"learning_rate": 8.373263390206155e-06,
"loss": 1.0412,
"step": 3480
},
{
"epoch": 0.68,
"grad_norm": 3.801882743835449,
"learning_rate": 8.36073638082942e-06,
"loss": 1.0455,
"step": 3490
},
{
"epoch": 0.69,
"grad_norm": 4.619334697723389,
"learning_rate": 8.348170772607737e-06,
"loss": 1.054,
"step": 3500
},
{
"epoch": 0.69,
"grad_norm": 3.967297077178955,
"learning_rate": 8.335566709860065e-06,
"loss": 1.0369,
"step": 3510
},
{
"epoch": 0.69,
"grad_norm": 3.9955484867095947,
"learning_rate": 8.322924337347016e-06,
"loss": 1.0631,
"step": 3520
},
{
"epoch": 0.69,
"grad_norm": 3.6488661766052246,
"learning_rate": 8.3102438002692e-06,
"loss": 1.0427,
"step": 3530
},
{
"epoch": 0.69,
"grad_norm": 4.1679534912109375,
"learning_rate": 8.29752524426556e-06,
"loss": 1.0396,
"step": 3540
},
{
"epoch": 0.7,
"grad_norm": 3.7516090869903564,
"learning_rate": 8.284768815411693e-06,
"loss": 1.0457,
"step": 3550
},
{
"epoch": 0.7,
"grad_norm": 3.902599811553955,
"learning_rate": 8.27197466021817e-06,
"loss": 1.0354,
"step": 3560
},
{
"epoch": 0.7,
"grad_norm": 3.828345775604248,
"learning_rate": 8.259142925628862e-06,
"loss": 1.0359,
"step": 3570
},
{
"epoch": 0.7,
"grad_norm": 4.114760875701904,
"learning_rate": 8.246273759019252e-06,
"loss": 1.0346,
"step": 3580
},
{
"epoch": 0.7,
"grad_norm": 4.289566993713379,
"learning_rate": 8.233367308194735e-06,
"loss": 1.038,
"step": 3590
},
{
"epoch": 0.71,
"grad_norm": 3.713040828704834,
"learning_rate": 8.220423721388918e-06,
"loss": 1.0442,
"step": 3600
},
{
"epoch": 0.71,
"grad_norm": 4.1226630210876465,
"learning_rate": 8.20744314726193e-06,
"loss": 1.0502,
"step": 3610
},
{
"epoch": 0.71,
"grad_norm": 3.980717182159424,
"learning_rate": 8.19442573489871e-06,
"loss": 1.0398,
"step": 3620
},
{
"epoch": 0.71,
"grad_norm": 3.998352527618408,
"learning_rate": 8.181371633807289e-06,
"loss": 1.0558,
"step": 3630
},
{
"epoch": 0.71,
"grad_norm": 4.392803192138672,
"learning_rate": 8.168280993917078e-06,
"loss": 1.0508,
"step": 3640
},
{
"epoch": 0.72,
"grad_norm": 4.483020305633545,
"learning_rate": 8.155153965577139e-06,
"loss": 1.028,
"step": 3650
},
{
"epoch": 0.72,
"grad_norm": 3.593369960784912,
"learning_rate": 8.141990699554476e-06,
"loss": 1.0591,
"step": 3660
},
{
"epoch": 0.72,
"grad_norm": 3.9365193843841553,
"learning_rate": 8.12879134703228e-06,
"loss": 1.0496,
"step": 3670
},
{
"epoch": 0.72,
"grad_norm": 4.063488960266113,
"learning_rate": 8.115556059608208e-06,
"loss": 1.0554,
"step": 3680
},
{
"epoch": 0.72,
"grad_norm": 3.916815996170044,
"learning_rate": 8.102284989292639e-06,
"loss": 1.0382,
"step": 3690
},
{
"epoch": 0.73,
"grad_norm": 3.987957000732422,
"learning_rate": 8.088978288506923e-06,
"loss": 1.0668,
"step": 3700
},
{
"epoch": 0.73,
"grad_norm": 4.158136367797852,
"learning_rate": 8.075636110081643e-06,
"loss": 1.0346,
"step": 3710
},
{
"epoch": 0.73,
"grad_norm": 3.6939709186553955,
"learning_rate": 8.062258607254841e-06,
"loss": 1.0401,
"step": 3720
},
{
"epoch": 0.73,
"grad_norm": 4.190096378326416,
"learning_rate": 8.048845933670274e-06,
"loss": 1.0285,
"step": 3730
},
{
"epoch": 0.73,
"grad_norm": 3.59887957572937,
"learning_rate": 8.035398243375636e-06,
"loss": 1.036,
"step": 3740
},
{
"epoch": 0.74,
"grad_norm": 3.9749696254730225,
"learning_rate": 8.021915690820808e-06,
"loss": 1.0555,
"step": 3750
},
{
"epoch": 0.74,
"grad_norm": 3.9689297676086426,
"learning_rate": 8.008398430856064e-06,
"loss": 1.038,
"step": 3760
},
{
"epoch": 0.74,
"grad_norm": 3.9846508502960205,
"learning_rate": 7.994846618730301e-06,
"loss": 1.0523,
"step": 3770
},
{
"epoch": 0.74,
"grad_norm": 4.488775730133057,
"learning_rate": 7.981260410089258e-06,
"loss": 1.0244,
"step": 3780
},
{
"epoch": 0.74,
"grad_norm": 4.135215759277344,
"learning_rate": 7.967639960973727e-06,
"loss": 1.0653,
"step": 3790
},
{
"epoch": 0.74,
"grad_norm": 4.3885884284973145,
"learning_rate": 7.953985427817757e-06,
"loss": 1.0531,
"step": 3800
},
{
"epoch": 0.75,
"grad_norm": 3.9444169998168945,
"learning_rate": 7.94029696744686e-06,
"loss": 1.04,
"step": 3810
},
{
"epoch": 0.75,
"grad_norm": 4.401015758514404,
"learning_rate": 7.92657473707621e-06,
"loss": 1.0498,
"step": 3820
},
{
"epoch": 0.75,
"grad_norm": 4.35683012008667,
"learning_rate": 7.912818894308845e-06,
"loss": 1.0288,
"step": 3830
},
{
"epoch": 0.75,
"grad_norm": 4.314106464385986,
"learning_rate": 7.899029597133836e-06,
"loss": 1.0413,
"step": 3840
},
{
"epoch": 0.75,
"grad_norm": 3.9266910552978516,
"learning_rate": 7.885207003924498e-06,
"loss": 1.0319,
"step": 3850
},
{
"epoch": 0.76,
"grad_norm": 3.997091054916382,
"learning_rate": 7.87135127343655e-06,
"loss": 1.0324,
"step": 3860
},
{
"epoch": 0.76,
"grad_norm": 4.264638423919678,
"learning_rate": 7.857462564806306e-06,
"loss": 1.0328,
"step": 3870
},
{
"epoch": 0.76,
"grad_norm": 4.032344818115234,
"learning_rate": 7.84354103754884e-06,
"loss": 1.0415,
"step": 3880
},
{
"epoch": 0.76,
"grad_norm": 4.569589614868164,
"learning_rate": 7.82958685155615e-06,
"loss": 1.0566,
"step": 3890
},
{
"epoch": 0.76,
"grad_norm": 4.405215740203857,
"learning_rate": 7.815600167095338e-06,
"loss": 1.0508,
"step": 3900
},
{
"epoch": 0.77,
"grad_norm": 3.7878050804138184,
"learning_rate": 7.801581144806752e-06,
"loss": 1.0365,
"step": 3910
},
{
"epoch": 0.77,
"grad_norm": 3.773585319519043,
"learning_rate": 7.787529945702145e-06,
"loss": 1.0366,
"step": 3920
},
{
"epoch": 0.77,
"grad_norm": 4.027467727661133,
"learning_rate": 7.773446731162835e-06,
"loss": 1.0285,
"step": 3930
},
{
"epoch": 0.77,
"grad_norm": 3.831883430480957,
"learning_rate": 7.759331662937841e-06,
"loss": 1.0342,
"step": 3940
},
{
"epoch": 0.77,
"grad_norm": 4.330446243286133,
"learning_rate": 7.745184903142029e-06,
"loss": 1.0398,
"step": 3950
},
{
"epoch": 0.78,
"grad_norm": 4.389279842376709,
"learning_rate": 7.731006614254252e-06,
"loss": 1.017,
"step": 3960
},
{
"epoch": 0.78,
"grad_norm": 4.518781661987305,
"learning_rate": 7.716796959115479e-06,
"loss": 1.0465,
"step": 3970
},
{
"epoch": 0.78,
"grad_norm": 4.294104099273682,
"learning_rate": 7.70255610092693e-06,
"loss": 1.0328,
"step": 3980
},
{
"epoch": 0.78,
"grad_norm": 3.937368154525757,
"learning_rate": 7.688284203248197e-06,
"loss": 1.0496,
"step": 3990
},
{
"epoch": 0.78,
"grad_norm": 4.222658157348633,
"learning_rate": 7.673981429995372e-06,
"loss": 1.032,
"step": 4000
},
{
"epoch": 0.78,
"eval_loss": 1.0327889919281006,
"eval_runtime": 12.4602,
"eval_samples_per_second": 52.487,
"eval_steps_per_second": 6.581,
"step": 4000
},
{
"epoch": 0.79,
"grad_norm": 3.9882421493530273,
"learning_rate": 7.659647945439157e-06,
"loss": 1.0262,
"step": 4010
},
{
"epoch": 0.79,
"grad_norm": 4.082265377044678,
"learning_rate": 7.645283914202981e-06,
"loss": 1.03,
"step": 4020
},
{
"epoch": 0.79,
"grad_norm": 4.47793436050415,
"learning_rate": 7.63088950126111e-06,
"loss": 1.0402,
"step": 4030
},
{
"epoch": 0.79,
"grad_norm": 4.539676189422607,
"learning_rate": 7.616464871936748e-06,
"loss": 1.0441,
"step": 4040
},
{
"epoch": 0.79,
"grad_norm": 4.070407867431641,
"learning_rate": 7.602010191900147e-06,
"loss": 1.0298,
"step": 4050
},
{
"epoch": 0.8,
"grad_norm": 4.478466510772705,
"learning_rate": 7.587525627166691e-06,
"loss": 1.0298,
"step": 4060
},
{
"epoch": 0.8,
"grad_norm": 4.1451005935668945,
"learning_rate": 7.573011344095002e-06,
"loss": 1.0411,
"step": 4070
},
{
"epoch": 0.8,
"grad_norm": 3.8588812351226807,
"learning_rate": 7.558467509385023e-06,
"loss": 1.0312,
"step": 4080
},
{
"epoch": 0.8,
"grad_norm": 4.136762619018555,
"learning_rate": 7.5438942900761035e-06,
"loss": 1.0436,
"step": 4090
},
{
"epoch": 0.8,
"grad_norm": 4.054186820983887,
"learning_rate": 7.529291853545082e-06,
"loss": 1.0421,
"step": 4100
},
{
"epoch": 0.81,
"grad_norm": 4.862720012664795,
"learning_rate": 7.514660367504368e-06,
"loss": 1.0355,
"step": 4110
},
{
"epoch": 0.81,
"grad_norm": 38.17692565917969,
"learning_rate": 7.500000000000001e-06,
"loss": 1.045,
"step": 4120
},
{
"epoch": 0.81,
"grad_norm": 4.037071228027344,
"learning_rate": 7.485310919409742e-06,
"loss": 1.0382,
"step": 4130
},
{
"epoch": 0.81,
"grad_norm": 4.044297218322754,
"learning_rate": 7.470593294441124e-06,
"loss": 1.0354,
"step": 4140
},
{
"epoch": 0.81,
"grad_norm": 3.8578081130981445,
"learning_rate": 7.455847294129519e-06,
"loss": 1.0475,
"step": 4150
},
{
"epoch": 0.82,
"grad_norm": 4.1042094230651855,
"learning_rate": 7.4410730878361936e-06,
"loss": 1.0302,
"step": 4160
},
{
"epoch": 0.82,
"grad_norm": 4.391599178314209,
"learning_rate": 7.426270845246373e-06,
"loss": 1.0317,
"step": 4170
},
{
"epoch": 0.82,
"grad_norm": 4.354910373687744,
"learning_rate": 7.411440736367281e-06,
"loss": 1.0291,
"step": 4180
},
{
"epoch": 0.82,
"grad_norm": 4.061986923217773,
"learning_rate": 7.396582931526194e-06,
"loss": 1.0434,
"step": 4190
},
{
"epoch": 0.82,
"grad_norm": 3.731538772583008,
"learning_rate": 7.381697601368481e-06,
"loss": 1.0472,
"step": 4200
},
{
"epoch": 0.83,
"grad_norm": 4.0257887840271,
"learning_rate": 7.36678491685565e-06,
"loss": 1.0399,
"step": 4210
},
{
"epoch": 0.83,
"grad_norm": 4.179793834686279,
"learning_rate": 7.351845049263374e-06,
"loss": 1.0518,
"step": 4220
},
{
"epoch": 0.83,
"grad_norm": 4.212937355041504,
"learning_rate": 7.3368781701795365e-06,
"loss": 1.0381,
"step": 4230
},
{
"epoch": 0.83,
"grad_norm": 4.426169395446777,
"learning_rate": 7.321884451502252e-06,
"loss": 1.0338,
"step": 4240
},
{
"epoch": 0.83,
"grad_norm": 4.190229415893555,
"learning_rate": 7.30686406543789e-06,
"loss": 1.0482,
"step": 4250
},
{
"epoch": 0.83,
"grad_norm": 3.897801160812378,
"learning_rate": 7.291817184499107e-06,
"loss": 1.0331,
"step": 4260
},
{
"epoch": 0.84,
"grad_norm": 4.616969585418701,
"learning_rate": 7.276743981502856e-06,
"loss": 1.0515,
"step": 4270
},
{
"epoch": 0.84,
"grad_norm": 3.8713490962982178,
"learning_rate": 7.2616446295684075e-06,
"loss": 1.0222,
"step": 4280
},
{
"epoch": 0.84,
"grad_norm": 3.93888783454895,
"learning_rate": 7.246519302115355e-06,
"loss": 1.0355,
"step": 4290
},
{
"epoch": 0.84,
"grad_norm": 4.489087104797363,
"learning_rate": 7.23136817286163e-06,
"loss": 1.0316,
"step": 4300
},
{
"epoch": 0.84,
"grad_norm": 3.9029769897460938,
"learning_rate": 7.216191415821503e-06,
"loss": 1.0212,
"step": 4310
},
{
"epoch": 0.85,
"grad_norm": 4.405784606933594,
"learning_rate": 7.200989205303583e-06,
"loss": 1.0421,
"step": 4320
},
{
"epoch": 0.85,
"grad_norm": 4.0875701904296875,
"learning_rate": 7.185761715908826e-06,
"loss": 1.0468,
"step": 4330
},
{
"epoch": 0.85,
"grad_norm": 4.10852575302124,
"learning_rate": 7.170509122528511e-06,
"loss": 1.0307,
"step": 4340
},
{
"epoch": 0.85,
"grad_norm": 4.401843547821045,
"learning_rate": 7.15523160034225e-06,
"loss": 1.0265,
"step": 4350
},
{
"epoch": 0.85,
"grad_norm": 4.106047630310059,
"learning_rate": 7.139929324815965e-06,
"loss": 1.021,
"step": 4360
},
{
"epoch": 0.86,
"grad_norm": 4.1536407470703125,
"learning_rate": 7.124602471699878e-06,
"loss": 1.0409,
"step": 4370
},
{
"epoch": 0.86,
"grad_norm": 4.14933443069458,
"learning_rate": 7.109251217026487e-06,
"loss": 1.0385,
"step": 4380
},
{
"epoch": 0.86,
"grad_norm": 4.064835071563721,
"learning_rate": 7.0938757371085485e-06,
"loss": 1.0312,
"step": 4390
},
{
"epoch": 0.86,
"grad_norm": 3.811549425125122,
"learning_rate": 7.078476208537057e-06,
"loss": 1.0359,
"step": 4400
},
{
"epoch": 0.86,
"grad_norm": 4.325003623962402,
"learning_rate": 7.063052808179205e-06,
"loss": 1.0483,
"step": 4410
},
{
"epoch": 0.87,
"grad_norm": 3.5266337394714355,
"learning_rate": 7.04760571317636e-06,
"loss": 1.0228,
"step": 4420
},
{
"epoch": 0.87,
"grad_norm": 4.071694850921631,
"learning_rate": 7.032135100942027e-06,
"loss": 1.0353,
"step": 4430
},
{
"epoch": 0.87,
"grad_norm": 4.121958255767822,
"learning_rate": 7.016641149159816e-06,
"loss": 1.049,
"step": 4440
},
{
"epoch": 0.87,
"grad_norm": 4.714683532714844,
"learning_rate": 7.00112403578139e-06,
"loss": 1.0361,
"step": 4450
},
{
"epoch": 0.87,
"grad_norm": 4.453790664672852,
"learning_rate": 6.985583939024436e-06,
"loss": 1.033,
"step": 4460
},
{
"epoch": 0.88,
"grad_norm": 4.712753772735596,
"learning_rate": 6.970021037370609e-06,
"loss": 1.0462,
"step": 4470
},
{
"epoch": 0.88,
"grad_norm": 4.329601287841797,
"learning_rate": 6.9544355095634775e-06,
"loss": 1.0459,
"step": 4480
},
{
"epoch": 0.88,
"grad_norm": 4.669638156890869,
"learning_rate": 6.938827534606484e-06,
"loss": 1.0335,
"step": 4490
},
{
"epoch": 0.88,
"grad_norm": 3.9964518547058105,
"learning_rate": 6.923197291760876e-06,
"loss": 1.0433,
"step": 4500
},
{
"epoch": 0.88,
"grad_norm": 3.998533248901367,
"learning_rate": 6.907544960543659e-06,
"loss": 1.035,
"step": 4510
},
{
"epoch": 0.89,
"grad_norm": 4.344484329223633,
"learning_rate": 6.891870720725522e-06,
"loss": 1.0405,
"step": 4520
},
{
"epoch": 0.89,
"grad_norm": 4.392531871795654,
"learning_rate": 6.8761747523287845e-06,
"loss": 1.0339,
"step": 4530
},
{
"epoch": 0.89,
"grad_norm": 4.274383544921875,
"learning_rate": 6.860457235625322e-06,
"loss": 1.0337,
"step": 4540
},
{
"epoch": 0.89,
"grad_norm": 4.2484846115112305,
"learning_rate": 6.844718351134496e-06,
"loss": 1.0433,
"step": 4550
},
{
"epoch": 0.89,
"grad_norm": 3.707181692123413,
"learning_rate": 6.828958279621085e-06,
"loss": 1.0497,
"step": 4560
},
{
"epoch": 0.9,
"grad_norm": 4.188033103942871,
"learning_rate": 6.813177202093203e-06,
"loss": 1.0274,
"step": 4570
},
{
"epoch": 0.9,
"grad_norm": 3.837230682373047,
"learning_rate": 6.797375299800224e-06,
"loss": 1.0395,
"step": 4580
},
{
"epoch": 0.9,
"grad_norm": 3.9512484073638916,
"learning_rate": 6.7815527542307e-06,
"loss": 1.0516,
"step": 4590
},
{
"epoch": 0.9,
"grad_norm": 4.2635297775268555,
"learning_rate": 6.765709747110274e-06,
"loss": 1.057,
"step": 4600
},
{
"epoch": 0.9,
"grad_norm": 4.248997211456299,
"learning_rate": 6.749846460399594e-06,
"loss": 1.0296,
"step": 4610
},
{
"epoch": 0.91,
"grad_norm": 4.210043430328369,
"learning_rate": 6.7339630762922295e-06,
"loss": 1.0291,
"step": 4620
},
{
"epoch": 0.91,
"grad_norm": 3.8999147415161133,
"learning_rate": 6.7180597772125665e-06,
"loss": 1.0375,
"step": 4630
},
{
"epoch": 0.91,
"grad_norm": 4.221770286560059,
"learning_rate": 6.702136745813721e-06,
"loss": 1.0206,
"step": 4640
},
{
"epoch": 0.91,
"grad_norm": 4.14971399307251,
"learning_rate": 6.686194164975446e-06,
"loss": 1.0283,
"step": 4650
},
{
"epoch": 0.91,
"grad_norm": 3.6616663932800293,
"learning_rate": 6.670232217802011e-06,
"loss": 1.0299,
"step": 4660
},
{
"epoch": 0.92,
"grad_norm": 4.623802661895752,
"learning_rate": 6.654251087620125e-06,
"loss": 1.0325,
"step": 4670
},
{
"epoch": 0.92,
"grad_norm": 3.6086490154266357,
"learning_rate": 6.638250957976813e-06,
"loss": 1.0299,
"step": 4680
},
{
"epoch": 0.92,
"grad_norm": 4.8812456130981445,
"learning_rate": 6.6222320126373105e-06,
"loss": 1.0436,
"step": 4690
},
{
"epoch": 0.92,
"grad_norm": 3.9015376567840576,
"learning_rate": 6.6061944355829634e-06,
"loss": 1.0093,
"step": 4700
},
{
"epoch": 0.92,
"grad_norm": 4.15576171875,
"learning_rate": 6.590138411009099e-06,
"loss": 1.0378,
"step": 4710
},
{
"epoch": 0.93,
"grad_norm": 4.204216957092285,
"learning_rate": 6.574064123322925e-06,
"loss": 1.032,
"step": 4720
},
{
"epoch": 0.93,
"grad_norm": 4.158588409423828,
"learning_rate": 6.557971757141402e-06,
"loss": 1.0182,
"step": 4730
},
{
"epoch": 0.93,
"grad_norm": 4.28289270401001,
"learning_rate": 6.541861497289126e-06,
"loss": 1.0324,
"step": 4740
},
{
"epoch": 0.93,
"grad_norm": 4.406084060668945,
"learning_rate": 6.525733528796207e-06,
"loss": 1.0311,
"step": 4750
},
{
"epoch": 0.93,
"grad_norm": 3.9430246353149414,
"learning_rate": 6.509588036896144e-06,
"loss": 1.0365,
"step": 4760
},
{
"epoch": 0.93,
"grad_norm": 3.8312675952911377,
"learning_rate": 6.493425207023693e-06,
"loss": 1.0313,
"step": 4770
},
{
"epoch": 0.94,
"grad_norm": 4.555315017700195,
"learning_rate": 6.477245224812746e-06,
"loss": 1.0336,
"step": 4780
},
{
"epoch": 0.94,
"grad_norm": 4.399374961853027,
"learning_rate": 6.46104827609419e-06,
"loss": 1.0309,
"step": 4790
},
{
"epoch": 0.94,
"grad_norm": 4.963261604309082,
"learning_rate": 6.444834546893773e-06,
"loss": 1.0401,
"step": 4800
},
{
"epoch": 0.94,
"grad_norm": 4.5317559242248535,
"learning_rate": 6.42860422342998e-06,
"loss": 1.0287,
"step": 4810
},
{
"epoch": 0.94,
"grad_norm": 4.08970308303833,
"learning_rate": 6.412357492111877e-06,
"loss": 1.0314,
"step": 4820
},
{
"epoch": 0.95,
"grad_norm": 4.869360446929932,
"learning_rate": 6.396094539536981e-06,
"loss": 1.0426,
"step": 4830
},
{
"epoch": 0.95,
"grad_norm": 4.279962539672852,
"learning_rate": 6.379815552489112e-06,
"loss": 1.044,
"step": 4840
},
{
"epoch": 0.95,
"grad_norm": 4.379662990570068,
"learning_rate": 6.363520717936256e-06,
"loss": 1.022,
"step": 4850
},
{
"epoch": 0.95,
"grad_norm": 4.329278469085693,
"learning_rate": 6.347210223028403e-06,
"loss": 1.0295,
"step": 4860
},
{
"epoch": 0.95,
"grad_norm": 4.4202423095703125,
"learning_rate": 6.330884255095409e-06,
"loss": 1.0391,
"step": 4870
},
{
"epoch": 0.96,
"grad_norm": 4.681463718414307,
"learning_rate": 6.3145430016448435e-06,
"loss": 1.0326,
"step": 4880
},
{
"epoch": 0.96,
"grad_norm": 4.008312225341797,
"learning_rate": 6.298186650359832e-06,
"loss": 1.0459,
"step": 4890
},
{
"epoch": 0.96,
"grad_norm": 4.1975884437561035,
"learning_rate": 6.281815389096903e-06,
"loss": 1.032,
"step": 4900
},
{
"epoch": 0.96,
"grad_norm": 4.011014461517334,
"learning_rate": 6.265429405883825e-06,
"loss": 1.0537,
"step": 4910
},
{
"epoch": 0.96,
"grad_norm": 4.628488063812256,
"learning_rate": 6.24902888891746e-06,
"loss": 1.0296,
"step": 4920
},
{
"epoch": 0.97,
"grad_norm": 4.286167621612549,
"learning_rate": 6.232614026561586e-06,
"loss": 1.0251,
"step": 4930
},
{
"epoch": 0.97,
"grad_norm": 4.162431240081787,
"learning_rate": 6.216185007344745e-06,
"loss": 1.0231,
"step": 4940
},
{
"epoch": 0.97,
"grad_norm": 4.599613189697266,
"learning_rate": 6.199742019958074e-06,
"loss": 1.0259,
"step": 4950
},
{
"epoch": 0.97,
"grad_norm": 3.7376463413238525,
"learning_rate": 6.183285253253135e-06,
"loss": 1.0308,
"step": 4960
},
{
"epoch": 0.97,
"grad_norm": 4.396124362945557,
"learning_rate": 6.1668148962397525e-06,
"loss": 1.0383,
"step": 4970
},
{
"epoch": 0.98,
"grad_norm": 4.382174015045166,
"learning_rate": 6.150331138083833e-06,
"loss": 1.0269,
"step": 4980
},
{
"epoch": 0.98,
"grad_norm": 4.524794578552246,
"learning_rate": 6.133834168105206e-06,
"loss": 1.0381,
"step": 4990
},
{
"epoch": 0.98,
"grad_norm": 4.226146221160889,
"learning_rate": 6.117324175775435e-06,
"loss": 1.0449,
"step": 5000
},
{
"epoch": 0.98,
"eval_loss": 1.023166298866272,
"eval_runtime": 12.4375,
"eval_samples_per_second": 52.583,
"eval_steps_per_second": 6.593,
"step": 5000
},
{
"epoch": 0.98,
"grad_norm": 4.120533466339111,
"learning_rate": 6.100801350715652e-06,
"loss": 1.0285,
"step": 5010
},
{
"epoch": 0.98,
"grad_norm": 3.9948532581329346,
"learning_rate": 6.084265882694378e-06,
"loss": 1.0411,
"step": 5020
},
{
"epoch": 0.99,
"grad_norm": 4.175631999969482,
"learning_rate": 6.0677179616253345e-06,
"loss": 1.0347,
"step": 5030
},
{
"epoch": 0.99,
"grad_norm": 4.19612455368042,
"learning_rate": 6.0511577775652744e-06,
"loss": 1.0367,
"step": 5040
},
{
"epoch": 0.99,
"grad_norm": 4.379330158233643,
"learning_rate": 6.034585520711792e-06,
"loss": 1.0314,
"step": 5050
},
{
"epoch": 0.99,
"grad_norm": 4.5682902336120605,
"learning_rate": 6.018001381401143e-06,
"loss": 1.0333,
"step": 5060
},
{
"epoch": 0.99,
"grad_norm": 4.473631381988525,
"learning_rate": 6.001405550106052e-06,
"loss": 1.0397,
"step": 5070
},
{
"epoch": 1.0,
"grad_norm": 4.200445175170898,
"learning_rate": 5.9847982174335314e-06,
"loss": 1.0262,
"step": 5080
},
{
"epoch": 1.0,
"grad_norm": 3.9142019748687744,
"learning_rate": 5.96817957412269e-06,
"loss": 1.034,
"step": 5090
},
{
"epoch": 1.0,
"grad_norm": 4.04217004776001,
"learning_rate": 5.951549811042539e-06,
"loss": 1.0466,
"step": 5100
}
],
"logging_steps": 10,
"max_steps": 10300,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1,
"total_flos": 1.9350228034513797e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}