{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.975062344139651,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004987531172069825,
"grad_norm": 4.53125,
"learning_rate": 1.0000000000000002e-06,
"loss": 2.7759,
"step": 1
},
{
"epoch": 0.02493765586034913,
"grad_norm": 8.125,
"learning_rate": 5e-06,
"loss": 2.8609,
"step": 5
},
{
"epoch": 0.04987531172069826,
"grad_norm": 3.984375,
"learning_rate": 1e-05,
"loss": 2.7873,
"step": 10
},
{
"epoch": 0.07481296758104738,
"grad_norm": 9.625,
"learning_rate": 1.5e-05,
"loss": 2.7334,
"step": 15
},
{
"epoch": 0.09975062344139651,
"grad_norm": 2.859375,
"learning_rate": 2e-05,
"loss": 2.5897,
"step": 20
},
{
"epoch": 0.12468827930174564,
"grad_norm": 2.0,
"learning_rate": 2.5e-05,
"loss": 2.423,
"step": 25
},
{
"epoch": 0.14962593516209477,
"grad_norm": 5.6875,
"learning_rate": 3e-05,
"loss": 2.2591,
"step": 30
},
{
"epoch": 0.1745635910224439,
"grad_norm": 2.34375,
"learning_rate": 3.5e-05,
"loss": 2.1123,
"step": 35
},
{
"epoch": 0.19950124688279303,
"grad_norm": 4.03125,
"learning_rate": 4e-05,
"loss": 1.9826,
"step": 40
},
{
"epoch": 0.22443890274314215,
"grad_norm": 3.015625,
"learning_rate": 4.5e-05,
"loss": 1.8715,
"step": 45
},
{
"epoch": 0.24937655860349128,
"grad_norm": 1.625,
"learning_rate": 5e-05,
"loss": 1.7412,
"step": 50
},
{
"epoch": 0.2743142144638404,
"grad_norm": 0.98046875,
"learning_rate": 5.500000000000001e-05,
"loss": 1.6251,
"step": 55
},
{
"epoch": 0.29925187032418954,
"grad_norm": 3.3125,
"learning_rate": 6e-05,
"loss": 1.5376,
"step": 60
},
{
"epoch": 0.32418952618453867,
"grad_norm": 0.57421875,
"learning_rate": 6.500000000000001e-05,
"loss": 1.444,
"step": 65
},
{
"epoch": 0.3491271820448878,
"grad_norm": 0.7890625,
"learning_rate": 7e-05,
"loss": 1.3852,
"step": 70
},
{
"epoch": 0.3740648379052369,
"grad_norm": 0.4765625,
"learning_rate": 7.500000000000001e-05,
"loss": 1.3384,
"step": 75
},
{
"epoch": 0.39900249376558605,
"grad_norm": 0.46484375,
"learning_rate": 8e-05,
"loss": 1.2946,
"step": 80
},
{
"epoch": 0.4239401496259352,
"grad_norm": 0.61328125,
"learning_rate": 8.5e-05,
"loss": 1.2742,
"step": 85
},
{
"epoch": 0.4488778054862843,
"grad_norm": 0.431640625,
"learning_rate": 9e-05,
"loss": 1.2416,
"step": 90
},
{
"epoch": 0.47381546134663344,
"grad_norm": 0.345703125,
"learning_rate": 9.5e-05,
"loss": 1.2248,
"step": 95
},
{
"epoch": 0.49875311720698257,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 1.1963,
"step": 100
},
{
"epoch": 0.5236907730673317,
"grad_norm": 0.48046875,
"learning_rate": 0.000105,
"loss": 1.1878,
"step": 105
},
{
"epoch": 0.5486284289276808,
"grad_norm": 0.5625,
"learning_rate": 0.00011000000000000002,
"loss": 1.1837,
"step": 110
},
{
"epoch": 0.57356608478803,
"grad_norm": 0.51953125,
"learning_rate": 0.00011499999999999999,
"loss": 1.1665,
"step": 115
},
{
"epoch": 0.5985037406483791,
"grad_norm": 1.2109375,
"learning_rate": 0.00012,
"loss": 1.1605,
"step": 120
},
{
"epoch": 0.6234413965087282,
"grad_norm": 0.490234375,
"learning_rate": 0.000125,
"loss": 1.1303,
"step": 125
},
{
"epoch": 0.6483790523690773,
"grad_norm": 0.62890625,
"learning_rate": 0.00013000000000000002,
"loss": 1.1453,
"step": 130
},
{
"epoch": 0.6733167082294265,
"grad_norm": 0.53515625,
"learning_rate": 0.00013500000000000003,
"loss": 1.138,
"step": 135
},
{
"epoch": 0.6982543640897756,
"grad_norm": 0.431640625,
"learning_rate": 0.00014,
"loss": 1.1157,
"step": 140
},
{
"epoch": 0.7231920199501247,
"grad_norm": 0.98046875,
"learning_rate": 0.000145,
"loss": 1.1255,
"step": 145
},
{
"epoch": 0.7481296758104738,
"grad_norm": 0.59375,
"learning_rate": 0.00015000000000000001,
"loss": 1.1147,
"step": 150
},
{
"epoch": 0.773067331670823,
"grad_norm": 1.2109375,
"learning_rate": 0.000155,
"loss": 1.1138,
"step": 155
},
{
"epoch": 0.7980049875311721,
"grad_norm": 0.9921875,
"learning_rate": 0.00016,
"loss": 1.102,
"step": 160
},
{
"epoch": 0.8229426433915212,
"grad_norm": 0.66015625,
"learning_rate": 0.000165,
"loss": 1.0893,
"step": 165
},
{
"epoch": 0.8478802992518704,
"grad_norm": 1.0859375,
"learning_rate": 0.00017,
"loss": 1.0974,
"step": 170
},
{
"epoch": 0.8728179551122195,
"grad_norm": 0.56640625,
"learning_rate": 0.000175,
"loss": 1.0929,
"step": 175
},
{
"epoch": 0.8977556109725686,
"grad_norm": 0.439453125,
"learning_rate": 0.00018,
"loss": 1.0945,
"step": 180
},
{
"epoch": 0.9226932668329177,
"grad_norm": 0.515625,
"learning_rate": 0.00018500000000000002,
"loss": 1.086,
"step": 185
},
{
"epoch": 0.9476309226932669,
"grad_norm": 0.3828125,
"learning_rate": 0.00019,
"loss": 1.0736,
"step": 190
},
{
"epoch": 0.972568578553616,
"grad_norm": 0.494140625,
"learning_rate": 0.000195,
"loss": 1.0721,
"step": 195
},
{
"epoch": 0.9975062344139651,
"grad_norm": 0.734375,
"learning_rate": 0.0002,
"loss": 1.0668,
"step": 200
},
{
"epoch": 0.9975062344139651,
"eval_loss": 2.471259355545044,
"eval_runtime": 0.9509,
"eval_samples_per_second": 10.517,
"eval_steps_per_second": 1.052,
"step": 200
},
{
"epoch": 1.0224438902743143,
"grad_norm": 0.921875,
"learning_rate": 0.00019999619230641713,
"loss": 1.041,
"step": 205
},
{
"epoch": 1.0473815461346634,
"grad_norm": 0.4296875,
"learning_rate": 0.00019998476951563915,
"loss": 1.0476,
"step": 210
},
{
"epoch": 1.0723192019950125,
"grad_norm": 0.57421875,
"learning_rate": 0.00019996573249755572,
"loss": 1.0466,
"step": 215
},
{
"epoch": 1.0972568578553616,
"grad_norm": 0.4765625,
"learning_rate": 0.0001999390827019096,
"loss": 1.0377,
"step": 220
},
{
"epoch": 1.1221945137157108,
"grad_norm": 0.6015625,
"learning_rate": 0.0001999048221581858,
"loss": 1.04,
"step": 225
},
{
"epoch": 1.14713216957606,
"grad_norm": 0.6953125,
"learning_rate": 0.0001998629534754574,
"loss": 1.0279,
"step": 230
},
{
"epoch": 1.172069825436409,
"grad_norm": 0.5859375,
"learning_rate": 0.0001998134798421867,
"loss": 1.0384,
"step": 235
},
{
"epoch": 1.1970074812967582,
"grad_norm": 0.46875,
"learning_rate": 0.00019975640502598244,
"loss": 1.0298,
"step": 240
},
{
"epoch": 1.2219451371571073,
"grad_norm": 0.4375,
"learning_rate": 0.0001996917333733128,
"loss": 1.0369,
"step": 245
},
{
"epoch": 1.2468827930174564,
"grad_norm": 0.462890625,
"learning_rate": 0.00019961946980917456,
"loss": 1.0218,
"step": 250
},
{
"epoch": 1.2718204488778055,
"grad_norm": 0.494140625,
"learning_rate": 0.00019953961983671788,
"loss": 1.0195,
"step": 255
},
{
"epoch": 1.2967581047381547,
"grad_norm": 0.427734375,
"learning_rate": 0.00019945218953682734,
"loss": 1.0136,
"step": 260
},
{
"epoch": 1.3216957605985038,
"grad_norm": 0.5703125,
"learning_rate": 0.00019935718556765876,
"loss": 1.0284,
"step": 265
},
{
"epoch": 1.346633416458853,
"grad_norm": 0.40234375,
"learning_rate": 0.00019925461516413223,
"loss": 1.0308,
"step": 270
},
{
"epoch": 1.371571072319202,
"grad_norm": 0.49609375,
"learning_rate": 0.00019914448613738106,
"loss": 1.0243,
"step": 275
},
{
"epoch": 1.3965087281795512,
"grad_norm": 0.71484375,
"learning_rate": 0.00019902680687415705,
"loss": 1.0262,
"step": 280
},
{
"epoch": 1.4214463840399003,
"grad_norm": 0.490234375,
"learning_rate": 0.0001989015863361917,
"loss": 1.0164,
"step": 285
},
{
"epoch": 1.4463840399002494,
"grad_norm": 0.640625,
"learning_rate": 0.00019876883405951377,
"loss": 1.0182,
"step": 290
},
{
"epoch": 1.4713216957605986,
"grad_norm": 0.396484375,
"learning_rate": 0.00019862856015372317,
"loss": 1.0073,
"step": 295
},
{
"epoch": 1.4962593516209477,
"grad_norm": 0.4921875,
"learning_rate": 0.00019848077530122083,
"loss": 1.0199,
"step": 300
},
{
"epoch": 1.5211970074812968,
"grad_norm": 0.4453125,
"learning_rate": 0.0001983254907563955,
"loss": 0.9976,
"step": 305
},
{
"epoch": 1.546134663341646,
"grad_norm": 0.60546875,
"learning_rate": 0.00019816271834476642,
"loss": 0.9939,
"step": 310
},
{
"epoch": 1.571072319201995,
"grad_norm": 0.421875,
"learning_rate": 0.00019799247046208297,
"loss": 1.0139,
"step": 315
},
{
"epoch": 1.5960099750623442,
"grad_norm": 0.42578125,
"learning_rate": 0.00019781476007338058,
"loss": 1.0017,
"step": 320
},
{
"epoch": 1.6209476309226933,
"grad_norm": 0.47265625,
"learning_rate": 0.00019762960071199333,
"loss": 0.9987,
"step": 325
},
{
"epoch": 1.6458852867830425,
"grad_norm": 0.41796875,
"learning_rate": 0.00019743700647852354,
"loss": 0.9893,
"step": 330
},
{
"epoch": 1.6708229426433916,
"grad_norm": 0.443359375,
"learning_rate": 0.00019723699203976766,
"loss": 0.9944,
"step": 335
},
{
"epoch": 1.6957605985037407,
"grad_norm": 0.353515625,
"learning_rate": 0.00019702957262759965,
"loss": 0.9848,
"step": 340
},
{
"epoch": 1.7206982543640899,
"grad_norm": 0.546875,
"learning_rate": 0.0001968147640378108,
"loss": 0.9907,
"step": 345
},
{
"epoch": 1.745635910224439,
"grad_norm": 0.46484375,
"learning_rate": 0.00019659258262890683,
"loss": 0.9837,
"step": 350
},
{
"epoch": 1.770573566084788,
"grad_norm": 0.48046875,
"learning_rate": 0.0001963630453208623,
"loss": 0.9871,
"step": 355
},
{
"epoch": 1.7955112219451372,
"grad_norm": 0.71875,
"learning_rate": 0.0001961261695938319,
"loss": 0.9928,
"step": 360
},
{
"epoch": 1.8204488778054864,
"grad_norm": 0.671875,
"learning_rate": 0.0001958819734868193,
"loss": 0.988,
"step": 365
},
{
"epoch": 1.8453865336658355,
"grad_norm": 0.5,
"learning_rate": 0.00019563047559630357,
"loss": 0.9768,
"step": 370
},
{
"epoch": 1.8703241895261846,
"grad_norm": 0.6875,
"learning_rate": 0.0001953716950748227,
"loss": 0.9947,
"step": 375
},
{
"epoch": 1.8952618453865338,
"grad_norm": 0.6484375,
"learning_rate": 0.00019510565162951537,
"loss": 0.9951,
"step": 380
},
{
"epoch": 1.9201995012468829,
"grad_norm": 0.5546875,
"learning_rate": 0.00019483236552061994,
"loss": 0.9709,
"step": 385
},
{
"epoch": 1.945137157107232,
"grad_norm": 0.36328125,
"learning_rate": 0.0001945518575599317,
"loss": 0.9837,
"step": 390
},
{
"epoch": 1.9700748129675811,
"grad_norm": 0.38671875,
"learning_rate": 0.00019426414910921787,
"loss": 0.9895,
"step": 395
},
{
"epoch": 1.9950124688279303,
"grad_norm": 0.421875,
"learning_rate": 0.00019396926207859084,
"loss": 0.9846,
"step": 400
},
{
"epoch": 2.0,
"eval_loss": 2.465005397796631,
"eval_runtime": 0.5313,
"eval_samples_per_second": 18.822,
"eval_steps_per_second": 1.882,
"step": 401
},
{
"epoch": 2.0199501246882794,
"grad_norm": 0.79296875,
"learning_rate": 0.00019366721892483978,
"loss": 0.9501,
"step": 405
},
{
"epoch": 2.0448877805486285,
"grad_norm": 0.671875,
"learning_rate": 0.00019335804264972018,
"loss": 0.9451,
"step": 410
},
{
"epoch": 2.0698254364089776,
"grad_norm": 0.59765625,
"learning_rate": 0.00019304175679820247,
"loss": 0.9343,
"step": 415
},
{
"epoch": 2.0947630922693268,
"grad_norm": 0.412109375,
"learning_rate": 0.00019271838545667876,
"loss": 0.9283,
"step": 420
},
{
"epoch": 2.119700748129676,
"grad_norm": 0.416015625,
"learning_rate": 0.0001923879532511287,
"loss": 0.9398,
"step": 425
},
{
"epoch": 2.144638403990025,
"grad_norm": 0.337890625,
"learning_rate": 0.00019205048534524406,
"loss": 0.9454,
"step": 430
},
{
"epoch": 2.169576059850374,
"grad_norm": 0.494140625,
"learning_rate": 0.0001917060074385124,
"loss": 0.9341,
"step": 435
},
{
"epoch": 2.1945137157107233,
"grad_norm": 0.38671875,
"learning_rate": 0.0001913545457642601,
"loss": 0.9405,
"step": 440
},
{
"epoch": 2.2194513715710724,
"grad_norm": 0.4296875,
"learning_rate": 0.00019099612708765434,
"loss": 0.9424,
"step": 445
},
{
"epoch": 2.2443890274314215,
"grad_norm": 0.451171875,
"learning_rate": 0.000190630778703665,
"loss": 0.9329,
"step": 450
},
{
"epoch": 2.2693266832917707,
"grad_norm": 0.41796875,
"learning_rate": 0.00019025852843498607,
"loss": 0.9377,
"step": 455
},
{
"epoch": 2.29426433915212,
"grad_norm": 0.5,
"learning_rate": 0.0001898794046299167,
"loss": 0.9409,
"step": 460
},
{
"epoch": 2.319201995012469,
"grad_norm": 0.46484375,
"learning_rate": 0.00018949343616020252,
"loss": 0.9473,
"step": 465
},
{
"epoch": 2.344139650872818,
"grad_norm": 0.38671875,
"learning_rate": 0.0001891006524188368,
"loss": 0.9384,
"step": 470
},
{
"epoch": 2.369077306733167,
"grad_norm": 0.40625,
"learning_rate": 0.00018870108331782217,
"loss": 0.9471,
"step": 475
},
{
"epoch": 2.3940149625935163,
"grad_norm": 0.392578125,
"learning_rate": 0.00018829475928589271,
"loss": 0.9393,
"step": 480
},
{
"epoch": 2.4189526184538654,
"grad_norm": 0.3671875,
"learning_rate": 0.00018788171126619653,
"loss": 0.931,
"step": 485
},
{
"epoch": 2.4438902743142146,
"grad_norm": 0.408203125,
"learning_rate": 0.00018746197071393958,
"loss": 0.9309,
"step": 490
},
{
"epoch": 2.4688279301745637,
"grad_norm": 0.361328125,
"learning_rate": 0.00018703556959398998,
"loss": 0.9375,
"step": 495
},
{
"epoch": 2.493765586034913,
"grad_norm": 0.3984375,
"learning_rate": 0.00018660254037844388,
"loss": 0.9384,
"step": 500
},
{
"epoch": 2.518703241895262,
"grad_norm": 0.37109375,
"learning_rate": 0.00018616291604415258,
"loss": 0.9365,
"step": 505
},
{
"epoch": 2.543640897755611,
"grad_norm": 0.34765625,
"learning_rate": 0.00018571673007021123,
"loss": 0.9449,
"step": 510
},
{
"epoch": 2.56857855361596,
"grad_norm": 0.36328125,
"learning_rate": 0.00018526401643540922,
"loss": 0.9422,
"step": 515
},
{
"epoch": 2.5935162094763093,
"grad_norm": 0.458984375,
"learning_rate": 0.0001848048096156426,
"loss": 0.9313,
"step": 520
},
{
"epoch": 2.6184538653366585,
"grad_norm": 0.431640625,
"learning_rate": 0.0001843391445812886,
"loss": 0.9279,
"step": 525
},
{
"epoch": 2.6433915211970076,
"grad_norm": 0.3984375,
"learning_rate": 0.00018386705679454242,
"loss": 0.9393,
"step": 530
},
{
"epoch": 2.6683291770573567,
"grad_norm": 0.34765625,
"learning_rate": 0.00018338858220671682,
"loss": 0.942,
"step": 535
},
{
"epoch": 2.693266832917706,
"grad_norm": 0.39453125,
"learning_rate": 0.00018290375725550417,
"loss": 0.9412,
"step": 540
},
{
"epoch": 2.718204488778055,
"grad_norm": 0.345703125,
"learning_rate": 0.00018241261886220154,
"loss": 0.9455,
"step": 545
},
{
"epoch": 2.743142144638404,
"grad_norm": 0.47265625,
"learning_rate": 0.0001819152044288992,
"loss": 0.9385,
"step": 550
},
{
"epoch": 2.7680798004987532,
"grad_norm": 0.392578125,
"learning_rate": 0.00018141155183563193,
"loss": 0.9438,
"step": 555
},
{
"epoch": 2.7930174563591024,
"grad_norm": 0.3984375,
"learning_rate": 0.00018090169943749476,
"loss": 0.937,
"step": 560
},
{
"epoch": 2.8179551122194515,
"grad_norm": 0.38671875,
"learning_rate": 0.00018038568606172173,
"loss": 0.9257,
"step": 565
},
{
"epoch": 2.8428927680798006,
"grad_norm": 0.3828125,
"learning_rate": 0.00017986355100472928,
"loss": 0.9311,
"step": 570
},
{
"epoch": 2.8678304239401498,
"grad_norm": 0.419921875,
"learning_rate": 0.00017933533402912354,
"loss": 0.9327,
"step": 575
},
{
"epoch": 2.892768079800499,
"grad_norm": 0.400390625,
"learning_rate": 0.00017880107536067218,
"loss": 0.9202,
"step": 580
},
{
"epoch": 2.917705735660848,
"grad_norm": 0.455078125,
"learning_rate": 0.0001782608156852414,
"loss": 0.9277,
"step": 585
},
{
"epoch": 2.942643391521197,
"grad_norm": 0.462890625,
"learning_rate": 0.0001777145961456971,
"loss": 0.9359,
"step": 590
},
{
"epoch": 2.9675810473815463,
"grad_norm": 0.41796875,
"learning_rate": 0.00017716245833877201,
"loss": 0.9207,
"step": 595
},
{
"epoch": 2.9925187032418954,
"grad_norm": 0.55078125,
"learning_rate": 0.0001766044443118978,
"loss": 0.941,
"step": 600
},
{
"epoch": 2.997506234413965,
"eval_loss": 2.487478733062744,
"eval_runtime": 0.9468,
"eval_samples_per_second": 10.562,
"eval_steps_per_second": 1.056,
"step": 601
},
{
"epoch": 3.0174563591022445,
"grad_norm": 0.453125,
"learning_rate": 0.0001760405965600031,
"loss": 0.9043,
"step": 605
},
{
"epoch": 3.0423940149625937,
"grad_norm": 0.400390625,
"learning_rate": 0.00017547095802227723,
"loss": 0.8768,
"step": 610
},
{
"epoch": 3.067331670822943,
"grad_norm": 0.404296875,
"learning_rate": 0.00017489557207890023,
"loss": 0.8886,
"step": 615
},
{
"epoch": 3.092269326683292,
"grad_norm": 0.365234375,
"learning_rate": 0.00017431448254773944,
"loss": 0.8857,
"step": 620
},
{
"epoch": 3.117206982543641,
"grad_norm": 0.416015625,
"learning_rate": 0.0001737277336810124,
"loss": 0.8933,
"step": 625
},
{
"epoch": 3.14214463840399,
"grad_norm": 0.35546875,
"learning_rate": 0.00017313537016191706,
"loss": 0.8898,
"step": 630
},
{
"epoch": 3.1670822942643393,
"grad_norm": 0.451171875,
"learning_rate": 0.00017253743710122875,
"loss": 0.8916,
"step": 635
},
{
"epoch": 3.1920199501246884,
"grad_norm": 0.56640625,
"learning_rate": 0.0001719339800338651,
"loss": 0.882,
"step": 640
},
{
"epoch": 3.2169576059850375,
"grad_norm": 0.470703125,
"learning_rate": 0.00017132504491541818,
"loss": 0.8975,
"step": 645
},
{
"epoch": 3.2418952618453867,
"grad_norm": 0.427734375,
"learning_rate": 0.00017071067811865476,
"loss": 0.8952,
"step": 650
},
{
"epoch": 3.266832917705736,
"grad_norm": 0.384765625,
"learning_rate": 0.0001700909264299851,
"loss": 0.8788,
"step": 655
},
{
"epoch": 3.291770573566085,
"grad_norm": 0.396484375,
"learning_rate": 0.00016946583704589973,
"loss": 0.8939,
"step": 660
},
{
"epoch": 3.316708229426434,
"grad_norm": 0.439453125,
"learning_rate": 0.0001688354575693754,
"loss": 0.8954,
"step": 665
},
{
"epoch": 3.341645885286783,
"grad_norm": 0.38671875,
"learning_rate": 0.00016819983600624986,
"loss": 0.8839,
"step": 670
},
{
"epoch": 3.3665835411471323,
"grad_norm": 0.392578125,
"learning_rate": 0.00016755902076156604,
"loss": 0.896,
"step": 675
},
{
"epoch": 3.3915211970074814,
"grad_norm": 0.337890625,
"learning_rate": 0.00016691306063588583,
"loss": 0.8921,
"step": 680
},
{
"epoch": 3.4164588528678306,
"grad_norm": 0.41015625,
"learning_rate": 0.00016626200482157378,
"loss": 0.8848,
"step": 685
},
{
"epoch": 3.4413965087281797,
"grad_norm": 0.35546875,
"learning_rate": 0.00016560590289905073,
"loss": 0.8909,
"step": 690
},
{
"epoch": 3.466334164588529,
"grad_norm": 0.3671875,
"learning_rate": 0.00016494480483301836,
"loss": 0.8981,
"step": 695
},
{
"epoch": 3.491271820448878,
"grad_norm": 0.341796875,
"learning_rate": 0.00016427876096865394,
"loss": 0.8871,
"step": 700
},
{
"epoch": 3.516209476309227,
"grad_norm": 0.380859375,
"learning_rate": 0.0001636078220277764,
"loss": 0.8843,
"step": 705
},
{
"epoch": 3.541147132169576,
"grad_norm": 0.404296875,
"learning_rate": 0.00016293203910498376,
"loss": 0.8867,
"step": 710
},
{
"epoch": 3.5660847880299253,
"grad_norm": 0.396484375,
"learning_rate": 0.00016225146366376198,
"loss": 0.889,
"step": 715
},
{
"epoch": 3.5910224438902745,
"grad_norm": 0.380859375,
"learning_rate": 0.0001615661475325658,
"loss": 0.8902,
"step": 720
},
{
"epoch": 3.6159600997506236,
"grad_norm": 0.412109375,
"learning_rate": 0.00016087614290087208,
"loss": 0.8821,
"step": 725
},
{
"epoch": 3.6408977556109727,
"grad_norm": 0.37890625,
"learning_rate": 0.00016018150231520486,
"loss": 0.8948,
"step": 730
},
{
"epoch": 3.665835411471322,
"grad_norm": 0.396484375,
"learning_rate": 0.00015948227867513415,
"loss": 0.8944,
"step": 735
},
{
"epoch": 3.690773067331671,
"grad_norm": 0.37890625,
"learning_rate": 0.00015877852522924732,
"loss": 0.8931,
"step": 740
},
{
"epoch": 3.71571072319202,
"grad_norm": 0.38671875,
"learning_rate": 0.00015807029557109398,
"loss": 0.8817,
"step": 745
},
{
"epoch": 3.7406483790523692,
"grad_norm": 0.4296875,
"learning_rate": 0.0001573576436351046,
"loss": 0.8938,
"step": 750
},
{
"epoch": 3.765586034912718,
"grad_norm": 0.345703125,
"learning_rate": 0.00015664062369248328,
"loss": 0.8953,
"step": 755
},
{
"epoch": 3.7905236907730675,
"grad_norm": 0.37890625,
"learning_rate": 0.0001559192903470747,
"loss": 0.8887,
"step": 760
},
{
"epoch": 3.815461346633416,
"grad_norm": 0.404296875,
"learning_rate": 0.0001551936985312058,
"loss": 0.8957,
"step": 765
},
{
"epoch": 3.8403990024937658,
"grad_norm": 0.69140625,
"learning_rate": 0.00015446390350150273,
"loss": 0.9005,
"step": 770
},
{
"epoch": 3.8653366583541144,
"grad_norm": 0.515625,
"learning_rate": 0.0001537299608346824,
"loss": 0.8937,
"step": 775
},
{
"epoch": 3.890274314214464,
"grad_norm": 0.6484375,
"learning_rate": 0.0001529919264233205,
"loss": 0.8938,
"step": 780
},
{
"epoch": 3.9152119700748127,
"grad_norm": 0.44921875,
"learning_rate": 0.0001522498564715949,
"loss": 0.8859,
"step": 785
},
{
"epoch": 3.9401496259351623,
"grad_norm": 0.34765625,
"learning_rate": 0.00015150380749100545,
"loss": 0.8847,
"step": 790
},
{
"epoch": 3.965087281795511,
"grad_norm": 0.4609375,
"learning_rate": 0.00015075383629607042,
"loss": 0.89,
"step": 795
},
{
"epoch": 3.9900249376558605,
"grad_norm": 0.353515625,
"learning_rate": 0.00015000000000000001,
"loss": 0.8872,
"step": 800
},
{
"epoch": 4.0,
"eval_loss": 2.521277666091919,
"eval_runtime": 0.5369,
"eval_samples_per_second": 18.624,
"eval_steps_per_second": 1.862,
"step": 802
},
{
"epoch": 4.014962593516209,
"grad_norm": 0.357421875,
"learning_rate": 0.00014924235601034672,
"loss": 0.8609,
"step": 805
},
{
"epoch": 4.039900249376559,
"grad_norm": 0.3359375,
"learning_rate": 0.00014848096202463372,
"loss": 0.8593,
"step": 810
},
{
"epoch": 4.0648379052369075,
"grad_norm": 0.369140625,
"learning_rate": 0.00014771587602596084,
"loss": 0.8442,
"step": 815
},
{
"epoch": 4.089775561097257,
"grad_norm": 0.37890625,
"learning_rate": 0.00014694715627858908,
"loss": 0.8414,
"step": 820
},
{
"epoch": 4.114713216957606,
"grad_norm": 0.390625,
"learning_rate": 0.00014617486132350343,
"loss": 0.8416,
"step": 825
},
{
"epoch": 4.139650872817955,
"grad_norm": 0.3515625,
"learning_rate": 0.00014539904997395468,
"loss": 0.8449,
"step": 830
},
{
"epoch": 4.164588528678304,
"grad_norm": 0.5546875,
"learning_rate": 0.00014461978131098088,
"loss": 0.8426,
"step": 835
},
{
"epoch": 4.1895261845386536,
"grad_norm": 0.3984375,
"learning_rate": 0.00014383711467890774,
"loss": 0.8485,
"step": 840
},
{
"epoch": 4.214463840399002,
"grad_norm": 0.39453125,
"learning_rate": 0.00014305110968082952,
"loss": 0.8504,
"step": 845
},
{
"epoch": 4.239401496259352,
"grad_norm": 0.36328125,
"learning_rate": 0.00014226182617406996,
"loss": 0.8498,
"step": 850
},
{
"epoch": 4.2643391521197005,
"grad_norm": 0.34375,
"learning_rate": 0.00014146932426562392,
"loss": 0.858,
"step": 855
},
{
"epoch": 4.28927680798005,
"grad_norm": 0.38671875,
"learning_rate": 0.00014067366430758004,
"loss": 0.8497,
"step": 860
},
{
"epoch": 4.314214463840399,
"grad_norm": 0.46875,
"learning_rate": 0.00013987490689252463,
"loss": 0.8493,
"step": 865
},
{
"epoch": 4.339152119700748,
"grad_norm": 0.421875,
"learning_rate": 0.00013907311284892736,
"loss": 0.852,
"step": 870
},
{
"epoch": 4.364089775561097,
"grad_norm": 0.38671875,
"learning_rate": 0.000138268343236509,
"loss": 0.8581,
"step": 875
},
{
"epoch": 4.389027431421447,
"grad_norm": 0.388671875,
"learning_rate": 0.00013746065934159123,
"loss": 0.8518,
"step": 880
},
{
"epoch": 4.413965087281795,
"grad_norm": 0.375,
"learning_rate": 0.00013665012267242974,
"loss": 0.8556,
"step": 885
},
{
"epoch": 4.438902743142145,
"grad_norm": 0.373046875,
"learning_rate": 0.00013583679495453,
"loss": 0.8618,
"step": 890
},
{
"epoch": 4.4638403990024935,
"grad_norm": 0.33203125,
"learning_rate": 0.00013502073812594675,
"loss": 0.8431,
"step": 895
},
{
"epoch": 4.488778054862843,
"grad_norm": 0.396484375,
"learning_rate": 0.00013420201433256689,
"loss": 0.8481,
"step": 900
},
{
"epoch": 4.513715710723192,
"grad_norm": 0.4140625,
"learning_rate": 0.0001333806859233771,
"loss": 0.858,
"step": 905
},
{
"epoch": 4.538653366583541,
"grad_norm": 0.51171875,
"learning_rate": 0.00013255681544571568,
"loss": 0.8491,
"step": 910
},
{
"epoch": 4.56359102244389,
"grad_norm": 0.3828125,
"learning_rate": 0.00013173046564050924,
"loss": 0.8536,
"step": 915
},
{
"epoch": 4.58852867830424,
"grad_norm": 0.37109375,
"learning_rate": 0.00013090169943749476,
"loss": 0.8505,
"step": 920
},
{
"epoch": 4.613466334164588,
"grad_norm": 0.365234375,
"learning_rate": 0.00013007057995042732,
"loss": 0.8447,
"step": 925
},
{
"epoch": 4.638403990024938,
"grad_norm": 0.39453125,
"learning_rate": 0.00012923717047227368,
"loss": 0.8519,
"step": 930
},
{
"epoch": 4.6633416458852865,
"grad_norm": 0.390625,
"learning_rate": 0.00012840153447039228,
"loss": 0.8503,
"step": 935
},
{
"epoch": 4.688279301745636,
"grad_norm": 0.38671875,
"learning_rate": 0.0001275637355816999,
"loss": 0.8525,
"step": 940
},
{
"epoch": 4.713216957605985,
"grad_norm": 0.396484375,
"learning_rate": 0.00012672383760782568,
"loss": 0.8422,
"step": 945
},
{
"epoch": 4.738154613466334,
"grad_norm": 0.38671875,
"learning_rate": 0.00012588190451025207,
"loss": 0.8513,
"step": 950
},
{
"epoch": 4.763092269326683,
"grad_norm": 0.373046875,
"learning_rate": 0.00012503800040544416,
"loss": 0.8574,
"step": 955
},
{
"epoch": 4.788029925187033,
"grad_norm": 0.34765625,
"learning_rate": 0.00012419218955996676,
"loss": 0.8386,
"step": 960
},
{
"epoch": 4.812967581047381,
"grad_norm": 0.40234375,
"learning_rate": 0.00012334453638559057,
"loss": 0.8521,
"step": 965
},
{
"epoch": 4.837905236907731,
"grad_norm": 0.4296875,
"learning_rate": 0.0001224951054343865,
"loss": 0.8469,
"step": 970
},
{
"epoch": 4.86284289276808,
"grad_norm": 0.365234375,
"learning_rate": 0.00012164396139381029,
"loss": 0.8506,
"step": 975
},
{
"epoch": 4.887780548628429,
"grad_norm": 0.365234375,
"learning_rate": 0.00012079116908177593,
"loss": 0.8524,
"step": 980
},
{
"epoch": 4.912718204488778,
"grad_norm": 0.373046875,
"learning_rate": 0.00011993679344171973,
"loss": 0.846,
"step": 985
},
{
"epoch": 4.937655860349127,
"grad_norm": 0.349609375,
"learning_rate": 0.00011908089953765449,
"loss": 0.8523,
"step": 990
},
{
"epoch": 4.962593516209476,
"grad_norm": 0.376953125,
"learning_rate": 0.00011822355254921478,
"loss": 0.8515,
"step": 995
},
{
"epoch": 4.987531172069826,
"grad_norm": 0.384765625,
"learning_rate": 0.00011736481776669306,
"loss": 0.8497,
"step": 1000
},
{
"epoch": 4.997506234413965,
"eval_loss": 2.5718774795532227,
"eval_runtime": 0.8024,
"eval_samples_per_second": 12.462,
"eval_steps_per_second": 1.246,
"step": 1002
},
{
"epoch": 5.012468827930174,
"grad_norm": 0.37109375,
"learning_rate": 0.00011650476058606777,
"loss": 0.8318,
"step": 1005
},
{
"epoch": 5.037406483790524,
"grad_norm": 0.380859375,
"learning_rate": 0.0001156434465040231,
"loss": 0.8014,
"step": 1010
},
{
"epoch": 5.062344139650873,
"grad_norm": 0.37890625,
"learning_rate": 0.00011478094111296109,
"loss": 0.8068,
"step": 1015
},
{
"epoch": 5.087281795511222,
"grad_norm": 0.353515625,
"learning_rate": 0.00011391731009600654,
"loss": 0.8065,
"step": 1020
},
{
"epoch": 5.112219451371571,
"grad_norm": 0.408203125,
"learning_rate": 0.00011305261922200519,
"loss": 0.8112,
"step": 1025
},
{
"epoch": 5.13715710723192,
"grad_norm": 0.384765625,
"learning_rate": 0.00011218693434051475,
"loss": 0.8188,
"step": 1030
},
{
"epoch": 5.162094763092269,
"grad_norm": 0.392578125,
"learning_rate": 0.0001113203213767907,
"loss": 0.8073,
"step": 1035
},
{
"epoch": 5.187032418952619,
"grad_norm": 0.376953125,
"learning_rate": 0.00011045284632676536,
"loss": 0.8124,
"step": 1040
},
{
"epoch": 5.211970074812967,
"grad_norm": 0.39453125,
"learning_rate": 0.00010958457525202241,
"loss": 0.8194,
"step": 1045
},
{
"epoch": 5.236907730673317,
"grad_norm": 0.380859375,
"learning_rate": 0.00010871557427476583,
"loss": 0.805,
"step": 1050
},
{
"epoch": 5.261845386533666,
"grad_norm": 0.359375,
"learning_rate": 0.0001078459095727845,
"loss": 0.8162,
"step": 1055
},
{
"epoch": 5.286783042394015,
"grad_norm": 0.375,
"learning_rate": 0.00010697564737441252,
"loss": 0.8079,
"step": 1060
},
{
"epoch": 5.311720698254364,
"grad_norm": 0.38671875,
"learning_rate": 0.00010610485395348571,
"loss": 0.8098,
"step": 1065
},
{
"epoch": 5.3366583541147135,
"grad_norm": 0.37890625,
"learning_rate": 0.0001052335956242944,
"loss": 0.8182,
"step": 1070
},
{
"epoch": 5.361596009975062,
"grad_norm": 0.365234375,
"learning_rate": 0.00010436193873653361,
"loss": 0.8157,
"step": 1075
},
{
"epoch": 5.386533665835412,
"grad_norm": 0.412109375,
"learning_rate": 0.00010348994967025012,
"loss": 0.8149,
"step": 1080
},
{
"epoch": 5.41147132169576,
"grad_norm": 0.376953125,
"learning_rate": 0.00010261769483078733,
"loss": 0.8144,
"step": 1085
},
{
"epoch": 5.43640897755611,
"grad_norm": 0.3671875,
"learning_rate": 0.00010174524064372837,
"loss": 0.8094,
"step": 1090
},
{
"epoch": 5.461346633416459,
"grad_norm": 0.37109375,
"learning_rate": 0.0001008726535498374,
"loss": 0.8203,
"step": 1095
},
{
"epoch": 5.486284289276808,
"grad_norm": 0.375,
"learning_rate": 0.0001,
"loss": 0.8134,
"step": 1100
},
{
"epoch": 5.511221945137157,
"grad_norm": 0.3828125,
"learning_rate": 9.912734645016263e-05,
"loss": 0.8172,
"step": 1105
},
{
"epoch": 5.5361596009975065,
"grad_norm": 0.376953125,
"learning_rate": 9.825475935627165e-05,
"loss": 0.8193,
"step": 1110
},
{
"epoch": 5.561097256857855,
"grad_norm": 0.37890625,
"learning_rate": 9.73823051692127e-05,
"loss": 0.8154,
"step": 1115
},
{
"epoch": 5.586034912718205,
"grad_norm": 0.51953125,
"learning_rate": 9.651005032974994e-05,
"loss": 0.8231,
"step": 1120
},
{
"epoch": 5.610972568578553,
"grad_norm": 0.412109375,
"learning_rate": 9.563806126346642e-05,
"loss": 0.8181,
"step": 1125
},
{
"epoch": 5.635910224438903,
"grad_norm": 0.337890625,
"learning_rate": 9.476640437570562e-05,
"loss": 0.8137,
"step": 1130
},
{
"epoch": 5.660847880299252,
"grad_norm": 0.400390625,
"learning_rate": 9.38951460465143e-05,
"loss": 0.8228,
"step": 1135
},
{
"epoch": 5.685785536159601,
"grad_norm": 0.421875,
"learning_rate": 9.302435262558747e-05,
"loss": 0.8143,
"step": 1140
},
{
"epoch": 5.71072319201995,
"grad_norm": 0.384765625,
"learning_rate": 9.215409042721552e-05,
"loss": 0.8166,
"step": 1145
},
{
"epoch": 5.7356608478802995,
"grad_norm": 0.375,
"learning_rate": 9.128442572523417e-05,
"loss": 0.8238,
"step": 1150
},
{
"epoch": 5.760598503740648,
"grad_norm": 0.392578125,
"learning_rate": 9.04154247479776e-05,
"loss": 0.8163,
"step": 1155
},
{
"epoch": 5.785536159600998,
"grad_norm": 0.3671875,
"learning_rate": 8.954715367323468e-05,
"loss": 0.8254,
"step": 1160
},
{
"epoch": 5.8104738154613464,
"grad_norm": 0.3984375,
"learning_rate": 8.867967862320934e-05,
"loss": 0.8141,
"step": 1165
},
{
"epoch": 5.835411471321696,
"grad_norm": 0.380859375,
"learning_rate": 8.781306565948528e-05,
"loss": 0.8207,
"step": 1170
},
{
"epoch": 5.860349127182045,
"grad_norm": 0.36328125,
"learning_rate": 8.694738077799488e-05,
"loss": 0.8197,
"step": 1175
},
{
"epoch": 5.885286783042394,
"grad_norm": 0.353515625,
"learning_rate": 8.608268990399349e-05,
"loss": 0.8162,
"step": 1180
},
{
"epoch": 5.910224438902743,
"grad_norm": 0.3828125,
"learning_rate": 8.521905888703893e-05,
"loss": 0.8192,
"step": 1185
},
{
"epoch": 5.9351620947630925,
"grad_norm": 0.388671875,
"learning_rate": 8.435655349597689e-05,
"loss": 0.8216,
"step": 1190
},
{
"epoch": 5.960099750623441,
"grad_norm": 0.404296875,
"learning_rate": 8.349523941393224e-05,
"loss": 0.8127,
"step": 1195
},
{
"epoch": 5.985037406483791,
"grad_norm": 0.36328125,
"learning_rate": 8.263518223330697e-05,
"loss": 0.8085,
"step": 1200
},
{
"epoch": 6.0,
"eval_loss": 2.6190178394317627,
"eval_runtime": 0.5351,
"eval_samples_per_second": 18.688,
"eval_steps_per_second": 1.869,
"step": 1203
},
{
"epoch": 6.0099750623441395,
"grad_norm": 0.353515625,
"learning_rate": 8.177644745078526e-05,
"loss": 0.7925,
"step": 1205
},
{
"epoch": 6.034912718204489,
"grad_norm": 0.5,
"learning_rate": 8.091910046234552e-05,
"loss": 0.7848,
"step": 1210
},
{
"epoch": 6.059850374064838,
"grad_norm": 0.380859375,
"learning_rate": 8.00632065582803e-05,
"loss": 0.7776,
"step": 1215
},
{
"epoch": 6.084788029925187,
"grad_norm": 0.392578125,
"learning_rate": 7.920883091822408e-05,
"loss": 0.7813,
"step": 1220
},
{
"epoch": 6.109725685785536,
"grad_norm": 0.392578125,
"learning_rate": 7.835603860618972e-05,
"loss": 0.7775,
"step": 1225
},
{
"epoch": 6.134663341645886,
"grad_norm": 0.40234375,
"learning_rate": 7.750489456561352e-05,
"loss": 0.7795,
"step": 1230
},
{
"epoch": 6.159600997506234,
"grad_norm": 0.40625,
"learning_rate": 7.66554636144095e-05,
"loss": 0.7802,
"step": 1235
},
{
"epoch": 6.184538653366584,
"grad_norm": 0.39453125,
"learning_rate": 7.580781044003324e-05,
"loss": 0.7901,
"step": 1240
},
{
"epoch": 6.2094763092269325,
"grad_norm": 0.375,
"learning_rate": 7.496199959455584e-05,
"loss": 0.7797,
"step": 1245
},
{
"epoch": 6.234413965087282,
"grad_norm": 0.404296875,
"learning_rate": 7.411809548974792e-05,
"loss": 0.7788,
"step": 1250
},
{
"epoch": 6.259351620947631,
"grad_norm": 0.384765625,
"learning_rate": 7.327616239217431e-05,
"loss": 0.7815,
"step": 1255
},
{
"epoch": 6.28428927680798,
"grad_norm": 0.486328125,
"learning_rate": 7.243626441830009e-05,
"loss": 0.7878,
"step": 1260
},
{
"epoch": 6.309226932668329,
"grad_norm": 0.38671875,
"learning_rate": 7.159846552960774e-05,
"loss": 0.7831,
"step": 1265
},
{
"epoch": 6.334164588528679,
"grad_norm": 0.388671875,
"learning_rate": 7.076282952772633e-05,
"loss": 0.7852,
"step": 1270
},
{
"epoch": 6.359102244389027,
"grad_norm": 0.3984375,
"learning_rate": 6.992942004957271e-05,
"loss": 0.7967,
"step": 1275
},
{
"epoch": 6.384039900249377,
"grad_norm": 0.3828125,
"learning_rate": 6.909830056250527e-05,
"loss": 0.7775,
"step": 1280
},
{
"epoch": 6.4089775561097255,
"grad_norm": 0.38671875,
"learning_rate": 6.826953435949081e-05,
"loss": 0.7836,
"step": 1285
},
{
"epoch": 6.433915211970075,
"grad_norm": 0.3984375,
"learning_rate": 6.744318455428436e-05,
"loss": 0.7802,
"step": 1290
},
{
"epoch": 6.458852867830424,
"grad_norm": 0.396484375,
"learning_rate": 6.661931407662292e-05,
"loss": 0.7923,
"step": 1295
},
{
"epoch": 6.483790523690773,
"grad_norm": 0.400390625,
"learning_rate": 6.579798566743314e-05,
"loss": 0.7813,
"step": 1300
},
{
"epoch": 6.508728179551122,
"grad_norm": 0.419921875,
"learning_rate": 6.497926187405326e-05,
"loss": 0.7923,
"step": 1305
},
{
"epoch": 6.533665835411472,
"grad_norm": 0.3671875,
"learning_rate": 6.416320504546997e-05,
"loss": 0.7941,
"step": 1310
},
{
"epoch": 6.55860349127182,
"grad_norm": 0.365234375,
"learning_rate": 6.334987732757029e-05,
"loss": 0.783,
"step": 1315
},
{
"epoch": 6.58354114713217,
"grad_norm": 0.37109375,
"learning_rate": 6.25393406584088e-05,
"loss": 0.7837,
"step": 1320
},
{
"epoch": 6.6084788029925186,
"grad_norm": 0.412109375,
"learning_rate": 6.173165676349103e-05,
"loss": 0.7885,
"step": 1325
},
{
"epoch": 6.633416458852868,
"grad_norm": 0.375,
"learning_rate": 6.092688715107264e-05,
"loss": 0.7984,
"step": 1330
},
{
"epoch": 6.658354114713217,
"grad_norm": 0.365234375,
"learning_rate": 6.012509310747538e-05,
"loss": 0.7971,
"step": 1335
},
{
"epoch": 6.683291770573566,
"grad_norm": 0.376953125,
"learning_rate": 5.9326335692419995e-05,
"loss": 0.798,
"step": 1340
},
{
"epoch": 6.708229426433915,
"grad_norm": 0.37890625,
"learning_rate": 5.853067573437612e-05,
"loss": 0.7786,
"step": 1345
},
{
"epoch": 6.733167082294265,
"grad_norm": 0.404296875,
"learning_rate": 5.773817382593008e-05,
"loss": 0.7939,
"step": 1350
},
{
"epoch": 6.758104738154613,
"grad_norm": 0.369140625,
"learning_rate": 5.694889031917047e-05,
"loss": 0.7881,
"step": 1355
},
{
"epoch": 6.783042394014963,
"grad_norm": 0.3828125,
"learning_rate": 5.616288532109225e-05,
"loss": 0.7872,
"step": 1360
},
{
"epoch": 6.807980049875312,
"grad_norm": 0.365234375,
"learning_rate": 5.5380218689019125e-05,
"loss": 0.7919,
"step": 1365
},
{
"epoch": 6.832917705735661,
"grad_norm": 0.41796875,
"learning_rate": 5.4600950026045326e-05,
"loss": 0.7819,
"step": 1370
},
{
"epoch": 6.85785536159601,
"grad_norm": 0.376953125,
"learning_rate": 5.382513867649663e-05,
"loss": 0.7805,
"step": 1375
},
{
"epoch": 6.882793017456359,
"grad_norm": 0.384765625,
"learning_rate": 5.305284372141095e-05,
"loss": 0.7995,
"step": 1380
},
{
"epoch": 6.907730673316708,
"grad_norm": 0.365234375,
"learning_rate": 5.2284123974039154e-05,
"loss": 0.7911,
"step": 1385
},
{
"epoch": 6.932668329177058,
"grad_norm": 0.373046875,
"learning_rate": 5.15190379753663e-05,
"loss": 0.7821,
"step": 1390
},
{
"epoch": 6.957605985037406,
"grad_norm": 0.376953125,
"learning_rate": 5.07576439896533e-05,
"loss": 0.7839,
"step": 1395
},
{
"epoch": 6.982543640897756,
"grad_norm": 0.375,
"learning_rate": 5.000000000000002e-05,
"loss": 0.7905,
"step": 1400
},
{
"epoch": 6.997506234413965,
"eval_loss": 2.6576273441314697,
"eval_runtime": 0.9682,
"eval_samples_per_second": 10.328,
"eval_steps_per_second": 1.033,
"step": 1403
},
{
"epoch": 7.007481296758105,
"grad_norm": 0.376953125,
"learning_rate": 4.924616370392961e-05,
"loss": 0.7696,
"step": 1405
},
{
"epoch": 7.032418952618454,
"grad_norm": 0.392578125,
"learning_rate": 4.8496192508994576e-05,
"loss": 0.7561,
"step": 1410
},
{
"epoch": 7.057356608478803,
"grad_norm": 0.38671875,
"learning_rate": 4.7750143528405126e-05,
"loss": 0.7558,
"step": 1415
},
{
"epoch": 7.082294264339152,
"grad_norm": 0.41796875,
"learning_rate": 4.700807357667952e-05,
"loss": 0.7591,
"step": 1420
},
{
"epoch": 7.107231920199501,
"grad_norm": 0.3828125,
"learning_rate": 4.6270039165317605e-05,
"loss": 0.7628,
"step": 1425
},
{
"epoch": 7.132169576059851,
"grad_norm": 0.38671875,
"learning_rate": 4.5536096498497295e-05,
"loss": 0.7595,
"step": 1430
},
{
"epoch": 7.157107231920199,
"grad_norm": 0.384765625,
"learning_rate": 4.480630146879419e-05,
"loss": 0.7667,
"step": 1435
},
{
"epoch": 7.182044887780549,
"grad_norm": 0.37109375,
"learning_rate": 4.4080709652925336e-05,
"loss": 0.7551,
"step": 1440
},
{
"epoch": 7.206982543640898,
"grad_norm": 0.416015625,
"learning_rate": 4.335937630751674e-05,
"loss": 0.7631,
"step": 1445
},
{
"epoch": 7.231920199501247,
"grad_norm": 0.412109375,
"learning_rate": 4.264235636489542e-05,
"loss": 0.7659,
"step": 1450
},
{
"epoch": 7.256857855361596,
"grad_norm": 0.419921875,
"learning_rate": 4.1929704428906026e-05,
"loss": 0.7652,
"step": 1455
},
{
"epoch": 7.2817955112219455,
"grad_norm": 0.376953125,
"learning_rate": 4.12214747707527e-05,
"loss": 0.7697,
"step": 1460
},
{
"epoch": 7.306733167082294,
"grad_norm": 0.36328125,
"learning_rate": 4.0517721324865884e-05,
"loss": 0.7646,
"step": 1465
},
{
"epoch": 7.331670822942644,
"grad_norm": 0.373046875,
"learning_rate": 3.981849768479517e-05,
"loss": 0.7659,
"step": 1470
},
{
"epoch": 7.356608478802992,
"grad_norm": 0.37109375,
"learning_rate": 3.9123857099127936e-05,
"loss": 0.7665,
"step": 1475
},
{
"epoch": 7.381546134663342,
"grad_norm": 0.388671875,
"learning_rate": 3.843385246743417e-05,
"loss": 0.769,
"step": 1480
},
{
"epoch": 7.406483790523691,
"grad_norm": 0.369140625,
"learning_rate": 3.774853633623806e-05,
"loss": 0.7728,
"step": 1485
},
{
"epoch": 7.43142144638404,
"grad_norm": 0.37109375,
"learning_rate": 3.7067960895016275e-05,
"loss": 0.7605,
"step": 1490
},
{
"epoch": 7.456359102244389,
"grad_norm": 0.38671875,
"learning_rate": 3.6392177972223594e-05,
"loss": 0.7678,
"step": 1495
},
{
"epoch": 7.4812967581047385,
"grad_norm": 0.37890625,
"learning_rate": 3.5721239031346066e-05,
"loss": 0.7676,
"step": 1500
},
{
"epoch": 7.506234413965087,
"grad_norm": 0.423828125,
"learning_rate": 3.5055195166981645e-05,
"loss": 0.7742,
"step": 1505
},
{
"epoch": 7.531172069825437,
"grad_norm": 0.3671875,
"learning_rate": 3.439409710094929e-05,
"loss": 0.7642,
"step": 1510
},
{
"epoch": 7.556109725685785,
"grad_norm": 0.3828125,
"learning_rate": 3.373799517842627e-05,
"loss": 0.7751,
"step": 1515
},
{
"epoch": 7.581047381546135,
"grad_norm": 0.38671875,
"learning_rate": 3.308693936411421e-05,
"loss": 0.7615,
"step": 1520
},
{
"epoch": 7.605985037406484,
"grad_norm": 0.375,
"learning_rate": 3.244097923843398e-05,
"loss": 0.7666,
"step": 1525
},
{
"epoch": 7.630922693266833,
"grad_norm": 0.388671875,
"learning_rate": 3.1800163993750166e-05,
"loss": 0.7725,
"step": 1530
},
{
"epoch": 7.655860349127182,
"grad_norm": 0.380859375,
"learning_rate": 3.116454243062459e-05,
"loss": 0.7635,
"step": 1535
},
{
"epoch": 7.6807980049875315,
"grad_norm": 0.384765625,
"learning_rate": 3.053416295410026e-05,
"loss": 0.7676,
"step": 1540
},
{
"epoch": 7.70573566084788,
"grad_norm": 0.3828125,
"learning_rate": 2.9909073570014912e-05,
"loss": 0.7621,
"step": 1545
},
{
"epoch": 7.73067331670823,
"grad_norm": 0.375,
"learning_rate": 2.9289321881345254e-05,
"loss": 0.7496,
"step": 1550
},
{
"epoch": 7.7556109725685785,
"grad_norm": 0.380859375,
"learning_rate": 2.8674955084581857e-05,
"loss": 0.7622,
"step": 1555
},
{
"epoch": 7.780548628428928,
"grad_norm": 0.365234375,
"learning_rate": 2.8066019966134904e-05,
"loss": 0.7671,
"step": 1560
},
{
"epoch": 7.805486284289277,
"grad_norm": 0.37109375,
"learning_rate": 2.746256289877126e-05,
"loss": 0.7593,
"step": 1565
},
{
"epoch": 7.830423940149626,
"grad_norm": 0.375,
"learning_rate": 2.6864629838082956e-05,
"loss": 0.7597,
"step": 1570
},
{
"epoch": 7.855361596009975,
"grad_norm": 0.37109375,
"learning_rate": 2.6272266318987603e-05,
"loss": 0.7761,
"step": 1575
},
{
"epoch": 7.8802992518703245,
"grad_norm": 0.373046875,
"learning_rate": 2.5685517452260567e-05,
"loss": 0.7646,
"step": 1580
},
{
"epoch": 7.905236907730673,
"grad_norm": 0.373046875,
"learning_rate": 2.5104427921099782e-05,
"loss": 0.763,
"step": 1585
},
{
"epoch": 7.930174563591023,
"grad_norm": 0.390625,
"learning_rate": 2.45290419777228e-05,
"loss": 0.7679,
"step": 1590
},
{
"epoch": 7.9551122194513715,
"grad_norm": 0.380859375,
"learning_rate": 2.3959403439996907e-05,
"loss": 0.7591,
"step": 1595
},
{
"epoch": 7.980049875311721,
"grad_norm": 0.369140625,
"learning_rate": 2.339555568810221e-05,
"loss": 0.7684,
"step": 1600
},
{
"epoch": 8.0,
"eval_loss": 2.6891818046569824,
"eval_runtime": 0.5383,
"eval_samples_per_second": 18.576,
"eval_steps_per_second": 1.858,
"step": 1604
},
{
"epoch": 8.00498753117207,
"grad_norm": 0.369140625,
"learning_rate": 2.2837541661228025e-05,
"loss": 0.7604,
"step": 1605
},
{
"epoch": 8.029925187032418,
"grad_norm": 0.3671875,
"learning_rate": 2.2285403854302912e-05,
"loss": 0.7569,
"step": 1610
},
{
"epoch": 8.054862842892769,
"grad_norm": 0.373046875,
"learning_rate": 2.173918431475861e-05,
"loss": 0.7565,
"step": 1615
},
{
"epoch": 8.079800498753118,
"grad_norm": 0.3828125,
"learning_rate": 2.119892463932781e-05,
"loss": 0.7619,
"step": 1620
},
{
"epoch": 8.104738154613466,
"grad_norm": 0.37109375,
"learning_rate": 2.0664665970876496e-05,
"loss": 0.7531,
"step": 1625
},
{
"epoch": 8.129675810473815,
"grad_norm": 0.375,
"learning_rate": 2.013644899527074e-05,
"loss": 0.7463,
"step": 1630
},
{
"epoch": 8.154613466334165,
"grad_norm": 0.3671875,
"learning_rate": 1.9614313938278272e-05,
"loss": 0.7611,
"step": 1635
},
{
"epoch": 8.179551122194514,
"grad_norm": 0.3671875,
"learning_rate": 1.9098300562505266e-05,
"loss": 0.7513,
"step": 1640
},
{
"epoch": 8.204488778054863,
"grad_norm": 0.3671875,
"learning_rate": 1.858844816436809e-05,
"loss": 0.7477,
"step": 1645
},
{
"epoch": 8.229426433915211,
"grad_norm": 0.3671875,
"learning_rate": 1.808479557110081e-05,
"loss": 0.7548,
"step": 1650
},
{
"epoch": 8.254364089775562,
"grad_norm": 0.3671875,
"learning_rate": 1.7587381137798432e-05,
"loss": 0.7528,
"step": 1655
},
{
"epoch": 8.27930174563591,
"grad_norm": 0.37109375,
"learning_rate": 1.7096242744495837e-05,
"loss": 0.7551,
"step": 1660
},
{
"epoch": 8.30423940149626,
"grad_norm": 0.361328125,
"learning_rate": 1.661141779328319e-05,
"loss": 0.7474,
"step": 1665
},
{
"epoch": 8.329177057356608,
"grad_norm": 0.3828125,
"learning_rate": 1.6132943205457606e-05,
"loss": 0.7582,
"step": 1670
},
{
"epoch": 8.354114713216958,
"grad_norm": 0.361328125,
"learning_rate": 1.566085541871145e-05,
"loss": 0.7383,
"step": 1675
},
{
"epoch": 8.379052369077307,
"grad_norm": 0.3671875,
"learning_rate": 1.5195190384357404e-05,
"loss": 0.7522,
"step": 1680
},
{
"epoch": 8.403990024937656,
"grad_norm": 0.38671875,
"learning_rate": 1.4735983564590783e-05,
"loss": 0.7516,
"step": 1685
},
{
"epoch": 8.428927680798004,
"grad_norm": 0.375,
"learning_rate": 1.4283269929788779e-05,
"loss": 0.755,
"step": 1690
},
{
"epoch": 8.453865336658355,
"grad_norm": 0.384765625,
"learning_rate": 1.3837083955847418e-05,
"loss": 0.7474,
"step": 1695
},
{
"epoch": 8.478802992518704,
"grad_norm": 0.369140625,
"learning_rate": 1.339745962155613e-05,
"loss": 0.762,
"step": 1700
},
{
"epoch": 8.503740648379052,
"grad_norm": 0.373046875,
"learning_rate": 1.296443040601003e-05,
"loss": 0.7577,
"step": 1705
},
{
"epoch": 8.528678304239401,
"grad_norm": 0.365234375,
"learning_rate": 1.2538029286060426e-05,
"loss": 0.7583,
"step": 1710
},
{
"epoch": 8.553615960099751,
"grad_norm": 0.38671875,
"learning_rate": 1.2118288733803473e-05,
"loss": 0.7482,
"step": 1715
},
{
"epoch": 8.5785536159601,
"grad_norm": 0.3671875,
"learning_rate": 1.1705240714107302e-05,
"loss": 0.7529,
"step": 1720
},
{
"epoch": 8.603491271820449,
"grad_norm": 0.380859375,
"learning_rate": 1.129891668217783e-05,
"loss": 0.7537,
"step": 1725
},
{
"epoch": 8.628428927680797,
"grad_norm": 0.37109375,
"learning_rate": 1.0899347581163221e-05,
"loss": 0.7528,
"step": 1730
},
{
"epoch": 8.653366583541148,
"grad_norm": 0.37109375,
"learning_rate": 1.0506563839797501e-05,
"loss": 0.7498,
"step": 1735
},
{
"epoch": 8.678304239401497,
"grad_norm": 0.3671875,
"learning_rate": 1.0120595370083318e-05,
"loss": 0.7617,
"step": 1740
},
{
"epoch": 8.703241895261845,
"grad_norm": 0.365234375,
"learning_rate": 9.74147156501396e-06,
"loss": 0.7514,
"step": 1745
},
{
"epoch": 8.728179551122194,
"grad_norm": 0.359375,
"learning_rate": 9.369221296335006e-06,
"loss": 0.7521,
"step": 1750
},
{
"epoch": 8.753117206982544,
"grad_norm": 0.369140625,
"learning_rate": 9.00387291234569e-06,
"loss": 0.7667,
"step": 1755
},
{
"epoch": 8.778054862842893,
"grad_norm": 0.361328125,
"learning_rate": 8.645454235739903e-06,
"loss": 0.753,
"step": 1760
},
{
"epoch": 8.802992518703242,
"grad_norm": 0.369140625,
"learning_rate": 8.293992561487596e-06,
"loss": 0.7532,
"step": 1765
},
{
"epoch": 8.82793017456359,
"grad_norm": 0.37109375,
"learning_rate": 7.949514654755962e-06,
"loss": 0.7516,
"step": 1770
},
{
"epoch": 8.85286783042394,
"grad_norm": 0.365234375,
"learning_rate": 7.612046748871327e-06,
"loss": 0.7586,
"step": 1775
},
{
"epoch": 8.87780548628429,
"grad_norm": 0.375,
"learning_rate": 7.281614543321269e-06,
"loss": 0.7534,
"step": 1780
},
{
"epoch": 8.902743142144638,
"grad_norm": 0.359375,
"learning_rate": 6.958243201797554e-06,
"loss": 0.7483,
"step": 1785
},
{
"epoch": 8.927680798004987,
"grad_norm": 0.3671875,
"learning_rate": 6.6419573502798374e-06,
"loss": 0.7441,
"step": 1790
},
{
"epoch": 8.952618453865338,
"grad_norm": 0.373046875,
"learning_rate": 6.332781075160243e-06,
"loss": 0.7482,
"step": 1795
},
{
"epoch": 8.977556109725686,
"grad_norm": 0.373046875,
"learning_rate": 6.030737921409169e-06,
"loss": 0.7564,
"step": 1800
},
{
"epoch": 8.997506234413965,
"eval_loss": 2.6970067024230957,
"eval_runtime": 0.8304,
"eval_samples_per_second": 12.042,
"eval_steps_per_second": 1.204,
"step": 1804
},
{
"epoch": 9.002493765586035,
"grad_norm": 0.39453125,
"learning_rate": 5.735850890782157e-06,
"loss": 0.7559,
"step": 1805
},
{
"epoch": 9.027431421446384,
"grad_norm": 0.359375,
"learning_rate": 5.448142440068316e-06,
"loss": 0.7497,
"step": 1810
},
{
"epoch": 9.052369077306734,
"grad_norm": 0.369140625,
"learning_rate": 5.167634479380068e-06,
"loss": 0.7512,
"step": 1815
},
{
"epoch": 9.077306733167083,
"grad_norm": 0.369140625,
"learning_rate": 4.8943483704846475e-06,
"loss": 0.746,
"step": 1820
},
{
"epoch": 9.102244389027431,
"grad_norm": 0.37109375,
"learning_rate": 4.628304925177318e-06,
"loss": 0.7498,
"step": 1825
},
{
"epoch": 9.12718204488778,
"grad_norm": 0.369140625,
"learning_rate": 4.369524403696457e-06,
"loss": 0.7535,
"step": 1830
},
{
"epoch": 9.15211970074813,
"grad_norm": 0.37109375,
"learning_rate": 4.118026513180695e-06,
"loss": 0.7456,
"step": 1835
},
{
"epoch": 9.17705735660848,
"grad_norm": 0.373046875,
"learning_rate": 3.873830406168111e-06,
"loss": 0.7535,
"step": 1840
},
{
"epoch": 9.201995012468828,
"grad_norm": 0.359375,
"learning_rate": 3.6369546791377052e-06,
"loss": 0.7559,
"step": 1845
},
{
"epoch": 9.226932668329177,
"grad_norm": 0.365234375,
"learning_rate": 3.40741737109318e-06,
"loss": 0.7421,
"step": 1850
},
{
"epoch": 9.251870324189527,
"grad_norm": 0.365234375,
"learning_rate": 3.1852359621892367e-06,
"loss": 0.7535,
"step": 1855
},
{
"epoch": 9.276807980049876,
"grad_norm": 0.369140625,
"learning_rate": 2.970427372400353e-06,
"loss": 0.7479,
"step": 1860
},
{
"epoch": 9.301745635910224,
"grad_norm": 0.3671875,
"learning_rate": 2.7630079602323442e-06,
"loss": 0.7545,
"step": 1865
},
{
"epoch": 9.326683291770573,
"grad_norm": 0.359375,
"learning_rate": 2.5629935214764865e-06,
"loss": 0.7456,
"step": 1870
},
{
"epoch": 9.351620947630924,
"grad_norm": 0.37109375,
"learning_rate": 2.3703992880066638e-06,
"loss": 0.7569,
"step": 1875
},
{
"epoch": 9.376558603491272,
"grad_norm": 0.3671875,
"learning_rate": 2.1852399266194314e-06,
"loss": 0.7547,
"step": 1880
},
{
"epoch": 9.401496259351621,
"grad_norm": 0.365234375,
"learning_rate": 2.0075295379170412e-06,
"loss": 0.7544,
"step": 1885
},
{
"epoch": 9.42643391521197,
"grad_norm": 0.375,
"learning_rate": 1.8372816552336026e-06,
"loss": 0.7579,
"step": 1890
},
{
"epoch": 9.451371571072318,
"grad_norm": 0.361328125,
"learning_rate": 1.6745092436045494e-06,
"loss": 0.7456,
"step": 1895
},
{
"epoch": 9.476309226932669,
"grad_norm": 0.361328125,
"learning_rate": 1.5192246987791981e-06,
"loss": 0.7461,
"step": 1900
},
{
"epoch": 9.501246882793017,
"grad_norm": 0.359375,
"learning_rate": 1.3714398462768563e-06,
"loss": 0.7453,
"step": 1905
},
{
"epoch": 9.526184538653366,
"grad_norm": 0.365234375,
"learning_rate": 1.231165940486234e-06,
"loss": 0.7542,
"step": 1910
},
{
"epoch": 9.551122194513717,
"grad_norm": 0.37109375,
"learning_rate": 1.0984136638083177e-06,
"loss": 0.7612,
"step": 1915
},
{
"epoch": 9.576059850374065,
"grad_norm": 0.373046875,
"learning_rate": 9.731931258429638e-07,
"loss": 0.7553,
"step": 1920
},
{
"epoch": 9.600997506234414,
"grad_norm": 0.365234375,
"learning_rate": 8.555138626189618e-07,
"loss": 0.7532,
"step": 1925
},
{
"epoch": 9.625935162094763,
"grad_norm": 0.3671875,
"learning_rate": 7.453848358678017e-07,
"loss": 0.7504,
"step": 1930
},
{
"epoch": 9.650872817955111,
"grad_norm": 0.369140625,
"learning_rate": 6.428144323412544e-07,
"loss": 0.7573,
"step": 1935
},
{
"epoch": 9.675810473815462,
"grad_norm": 0.365234375,
"learning_rate": 5.478104631726711e-07,
"loss": 0.7509,
"step": 1940
},
{
"epoch": 9.70074812967581,
"grad_norm": 0.369140625,
"learning_rate": 4.6038016328211476e-07,
"loss": 0.7465,
"step": 1945
},
{
"epoch": 9.72568578553616,
"grad_norm": 0.369140625,
"learning_rate": 3.805301908254455e-07,
"loss": 0.7466,
"step": 1950
},
{
"epoch": 9.75062344139651,
"grad_norm": 0.359375,
"learning_rate": 3.0826662668720364e-07,
"loss": 0.758,
"step": 1955
},
{
"epoch": 9.775561097256858,
"grad_norm": 0.37109375,
"learning_rate": 2.4359497401758024e-07,
"loss": 0.7448,
"step": 1960
},
{
"epoch": 9.800498753117207,
"grad_norm": 0.3671875,
"learning_rate": 1.86520157813308e-07,
"loss": 0.7529,
"step": 1965
},
{
"epoch": 9.825436408977556,
"grad_norm": 0.36328125,
"learning_rate": 1.3704652454261668e-07,
"loss": 0.7475,
"step": 1970
},
{
"epoch": 9.850374064837904,
"grad_norm": 0.361328125,
"learning_rate": 9.517784181422019e-08,
"loss": 0.7581,
"step": 1975
},
{
"epoch": 9.875311720698255,
"grad_norm": 0.365234375,
"learning_rate": 6.09172980904238e-08,
"loss": 0.7494,
"step": 1980
},
{
"epoch": 9.900249376558603,
"grad_norm": 0.369140625,
"learning_rate": 3.4267502444274015e-08,
"loss": 0.7507,
"step": 1985
},
{
"epoch": 9.925187032418952,
"grad_norm": 0.375,
"learning_rate": 1.5230484360873044e-08,
"loss": 0.7452,
"step": 1990
},
{
"epoch": 9.950124688279303,
"grad_norm": 0.365234375,
"learning_rate": 3.807693582869032e-09,
"loss": 0.7455,
"step": 1995
},
{
"epoch": 9.975062344139651,
"grad_norm": 0.380859375,
"learning_rate": 0.0,
"loss": 0.747,
"step": 2000
},
{
"epoch": 9.975062344139651,
"eval_loss": 2.6979904174804688,
"eval_runtime": 0.4975,
"eval_samples_per_second": 20.1,
"eval_steps_per_second": 2.01,
"step": 2000
},
{
"epoch": 9.975062344139651,
"step": 2000,
"total_flos": 1.1920978083462513e+18,
"train_loss": 0.9040022449493408,
"train_runtime": 12134.7289,
"train_samples_per_second": 7.919,
"train_steps_per_second": 0.165
}
],
"logging_steps": 5,
"max_steps": 2000,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1920978083462513e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}