zephyr-8b-sft-full / trainer_state.json
li-muyang's picture
Model save
a7dde55 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 100,
"global_step": 2853,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010515247108307045,
"grad_norm": 21.846562454867367,
"learning_rate": 6.993006993006993e-08,
"loss": 1.3669,
"step": 1
},
{
"epoch": 0.005257623554153523,
"grad_norm": 21.132009448461105,
"learning_rate": 3.496503496503497e-07,
"loss": 1.3522,
"step": 5
},
{
"epoch": 0.010515247108307046,
"grad_norm": 17.494352717419737,
"learning_rate": 6.993006993006994e-07,
"loss": 1.354,
"step": 10
},
{
"epoch": 0.015772870662460567,
"grad_norm": 3.3387986746360583,
"learning_rate": 1.0489510489510491e-06,
"loss": 1.2867,
"step": 15
},
{
"epoch": 0.02103049421661409,
"grad_norm": 2.205920355996413,
"learning_rate": 1.3986013986013987e-06,
"loss": 1.2275,
"step": 20
},
{
"epoch": 0.026288117770767613,
"grad_norm": 1.369597839526372,
"learning_rate": 1.7482517482517483e-06,
"loss": 1.1889,
"step": 25
},
{
"epoch": 0.031545741324921134,
"grad_norm": 1.1230719730143253,
"learning_rate": 2.0979020979020983e-06,
"loss": 1.1654,
"step": 30
},
{
"epoch": 0.03680336487907466,
"grad_norm": 0.8609236106610554,
"learning_rate": 2.4475524475524477e-06,
"loss": 1.1648,
"step": 35
},
{
"epoch": 0.04206098843322818,
"grad_norm": 0.797862698606503,
"learning_rate": 2.7972027972027974e-06,
"loss": 1.1519,
"step": 40
},
{
"epoch": 0.0473186119873817,
"grad_norm": 0.8072361631632317,
"learning_rate": 3.1468531468531472e-06,
"loss": 1.1237,
"step": 45
},
{
"epoch": 0.052576235541535225,
"grad_norm": 0.7629783040030311,
"learning_rate": 3.4965034965034966e-06,
"loss": 1.1325,
"step": 50
},
{
"epoch": 0.05783385909568875,
"grad_norm": 0.6873017397880803,
"learning_rate": 3.846153846153847e-06,
"loss": 1.1026,
"step": 55
},
{
"epoch": 0.06309148264984227,
"grad_norm": 0.696611266506068,
"learning_rate": 4.195804195804197e-06,
"loss": 1.1039,
"step": 60
},
{
"epoch": 0.0683491062039958,
"grad_norm": 0.7029981157873147,
"learning_rate": 4.5454545454545455e-06,
"loss": 1.0926,
"step": 65
},
{
"epoch": 0.07360672975814932,
"grad_norm": 0.7133610172269549,
"learning_rate": 4.895104895104895e-06,
"loss": 1.1062,
"step": 70
},
{
"epoch": 0.07886435331230283,
"grad_norm": 0.7217968834872122,
"learning_rate": 5.244755244755245e-06,
"loss": 1.1003,
"step": 75
},
{
"epoch": 0.08412197686645637,
"grad_norm": 0.7014523604685313,
"learning_rate": 5.594405594405595e-06,
"loss": 1.0944,
"step": 80
},
{
"epoch": 0.08937960042060988,
"grad_norm": 0.7206581604903497,
"learning_rate": 5.944055944055944e-06,
"loss": 1.1056,
"step": 85
},
{
"epoch": 0.0946372239747634,
"grad_norm": 0.7457066334039347,
"learning_rate": 6.2937062937062944e-06,
"loss": 1.1065,
"step": 90
},
{
"epoch": 0.09989484752891693,
"grad_norm": 0.7208104956061856,
"learning_rate": 6.643356643356644e-06,
"loss": 1.0892,
"step": 95
},
{
"epoch": 0.10515247108307045,
"grad_norm": 0.7717192179121782,
"learning_rate": 6.993006993006993e-06,
"loss": 1.103,
"step": 100
},
{
"epoch": 0.10515247108307045,
"eval_loss": 1.0989242792129517,
"eval_runtime": 734.3008,
"eval_samples_per_second": 18.326,
"eval_steps_per_second": 0.144,
"step": 100
},
{
"epoch": 0.11041009463722397,
"grad_norm": 0.8259894569490014,
"learning_rate": 7.342657342657343e-06,
"loss": 1.0814,
"step": 105
},
{
"epoch": 0.1156677181913775,
"grad_norm": 1.1085848778320089,
"learning_rate": 7.692307692307694e-06,
"loss": 1.1025,
"step": 110
},
{
"epoch": 0.12092534174553102,
"grad_norm": 0.7455910381865771,
"learning_rate": 8.041958041958042e-06,
"loss": 1.1079,
"step": 115
},
{
"epoch": 0.12618296529968454,
"grad_norm": 0.9674079050397694,
"learning_rate": 8.391608391608393e-06,
"loss": 1.0784,
"step": 120
},
{
"epoch": 0.13144058885383805,
"grad_norm": 0.9988378537350968,
"learning_rate": 8.741258741258743e-06,
"loss": 1.0759,
"step": 125
},
{
"epoch": 0.1366982124079916,
"grad_norm": 0.8719562477062238,
"learning_rate": 9.090909090909091e-06,
"loss": 1.1161,
"step": 130
},
{
"epoch": 0.14195583596214512,
"grad_norm": 0.7675881427867184,
"learning_rate": 9.44055944055944e-06,
"loss": 1.0935,
"step": 135
},
{
"epoch": 0.14721345951629863,
"grad_norm": 0.724214660257682,
"learning_rate": 9.79020979020979e-06,
"loss": 1.0905,
"step": 140
},
{
"epoch": 0.15247108307045215,
"grad_norm": 0.7816188167257716,
"learning_rate": 1.013986013986014e-05,
"loss": 1.1015,
"step": 145
},
{
"epoch": 0.15772870662460567,
"grad_norm": 0.8160659875512388,
"learning_rate": 1.048951048951049e-05,
"loss": 1.0841,
"step": 150
},
{
"epoch": 0.16298633017875921,
"grad_norm": 0.7024355651373789,
"learning_rate": 1.083916083916084e-05,
"loss": 1.0944,
"step": 155
},
{
"epoch": 0.16824395373291273,
"grad_norm": 0.8350948417258764,
"learning_rate": 1.118881118881119e-05,
"loss": 1.0865,
"step": 160
},
{
"epoch": 0.17350157728706625,
"grad_norm": 0.7759430616830292,
"learning_rate": 1.1538461538461538e-05,
"loss": 1.0957,
"step": 165
},
{
"epoch": 0.17875920084121977,
"grad_norm": 0.8253932883708276,
"learning_rate": 1.1888111888111888e-05,
"loss": 1.0862,
"step": 170
},
{
"epoch": 0.18401682439537329,
"grad_norm": 0.727381322987075,
"learning_rate": 1.2237762237762239e-05,
"loss": 1.0531,
"step": 175
},
{
"epoch": 0.1892744479495268,
"grad_norm": 0.7093901420339217,
"learning_rate": 1.2587412587412589e-05,
"loss": 1.0983,
"step": 180
},
{
"epoch": 0.19453207150368035,
"grad_norm": 0.953147430950059,
"learning_rate": 1.2937062937062939e-05,
"loss": 1.0971,
"step": 185
},
{
"epoch": 0.19978969505783387,
"grad_norm": 0.7716908173558169,
"learning_rate": 1.3286713286713288e-05,
"loss": 1.075,
"step": 190
},
{
"epoch": 0.20504731861198738,
"grad_norm": 0.8672736054906722,
"learning_rate": 1.3636363636363637e-05,
"loss": 1.0793,
"step": 195
},
{
"epoch": 0.2103049421661409,
"grad_norm": 0.8675965486083684,
"learning_rate": 1.3986013986013986e-05,
"loss": 1.0867,
"step": 200
},
{
"epoch": 0.2103049421661409,
"eval_loss": 1.0965888500213623,
"eval_runtime": 649.556,
"eval_samples_per_second": 20.717,
"eval_steps_per_second": 0.163,
"step": 200
},
{
"epoch": 0.21556256572029442,
"grad_norm": 0.8977881972264273,
"learning_rate": 1.4335664335664336e-05,
"loss": 1.0954,
"step": 205
},
{
"epoch": 0.22082018927444794,
"grad_norm": 0.7235719037878356,
"learning_rate": 1.4685314685314686e-05,
"loss": 1.0983,
"step": 210
},
{
"epoch": 0.22607781282860148,
"grad_norm": 0.7296340934381736,
"learning_rate": 1.5034965034965037e-05,
"loss": 1.0782,
"step": 215
},
{
"epoch": 0.231335436382755,
"grad_norm": 0.8547235635957527,
"learning_rate": 1.5384615384615387e-05,
"loss": 1.0924,
"step": 220
},
{
"epoch": 0.23659305993690852,
"grad_norm": 0.7564410266828079,
"learning_rate": 1.5734265734265734e-05,
"loss": 1.0912,
"step": 225
},
{
"epoch": 0.24185068349106204,
"grad_norm": 0.8525133534517718,
"learning_rate": 1.6083916083916083e-05,
"loss": 1.0954,
"step": 230
},
{
"epoch": 0.24710830704521555,
"grad_norm": 0.805401759340136,
"learning_rate": 1.6433566433566433e-05,
"loss": 1.0749,
"step": 235
},
{
"epoch": 0.25236593059936907,
"grad_norm": 0.7402154168479581,
"learning_rate": 1.6783216783216786e-05,
"loss": 1.0986,
"step": 240
},
{
"epoch": 0.2576235541535226,
"grad_norm": 0.7474174775706688,
"learning_rate": 1.7132867132867136e-05,
"loss": 1.0869,
"step": 245
},
{
"epoch": 0.2628811777076761,
"grad_norm": 0.7369490806417859,
"learning_rate": 1.7482517482517486e-05,
"loss": 1.0776,
"step": 250
},
{
"epoch": 0.26813880126182965,
"grad_norm": 0.9287606428487797,
"learning_rate": 1.7832167832167832e-05,
"loss": 1.1021,
"step": 255
},
{
"epoch": 0.2733964248159832,
"grad_norm": 0.6947297961768544,
"learning_rate": 1.8181818181818182e-05,
"loss": 1.1012,
"step": 260
},
{
"epoch": 0.2786540483701367,
"grad_norm": 0.8123153065370199,
"learning_rate": 1.8531468531468532e-05,
"loss": 1.1074,
"step": 265
},
{
"epoch": 0.28391167192429023,
"grad_norm": 0.8302401027015646,
"learning_rate": 1.888111888111888e-05,
"loss": 1.1083,
"step": 270
},
{
"epoch": 0.2891692954784437,
"grad_norm": 0.6881962349423542,
"learning_rate": 1.923076923076923e-05,
"loss": 1.0937,
"step": 275
},
{
"epoch": 0.29442691903259727,
"grad_norm": 0.7279189773202484,
"learning_rate": 1.958041958041958e-05,
"loss": 1.0913,
"step": 280
},
{
"epoch": 0.2996845425867508,
"grad_norm": 1.0229788845896652,
"learning_rate": 1.993006993006993e-05,
"loss": 1.062,
"step": 285
},
{
"epoch": 0.3049421661409043,
"grad_norm": 0.8087865464750971,
"learning_rate": 1.9999880177844552e-05,
"loss": 1.0943,
"step": 290
},
{
"epoch": 0.31019978969505785,
"grad_norm": 0.7888576974082969,
"learning_rate": 1.9999393405259354e-05,
"loss": 1.0814,
"step": 295
},
{
"epoch": 0.31545741324921134,
"grad_norm": 0.7387794672867187,
"learning_rate": 1.9998532211572566e-05,
"loss": 1.111,
"step": 300
},
{
"epoch": 0.31545741324921134,
"eval_loss": 1.101216435432434,
"eval_runtime": 582.6045,
"eval_samples_per_second": 23.098,
"eval_steps_per_second": 0.182,
"step": 300
},
{
"epoch": 0.3207150368033649,
"grad_norm": 0.7908623885539283,
"learning_rate": 1.999729662903106e-05,
"loss": 1.0945,
"step": 305
},
{
"epoch": 0.32597266035751843,
"grad_norm": 0.6771503993700702,
"learning_rate": 1.999568670390045e-05,
"loss": 1.0926,
"step": 310
},
{
"epoch": 0.3312302839116719,
"grad_norm": 0.6841445829487095,
"learning_rate": 1.9993702496463395e-05,
"loss": 1.1157,
"step": 315
},
{
"epoch": 0.33648790746582546,
"grad_norm": 0.6751678361124496,
"learning_rate": 1.9991344081017312e-05,
"loss": 1.1029,
"step": 320
},
{
"epoch": 0.34174553101997895,
"grad_norm": 0.6430661618178782,
"learning_rate": 1.9988611545871606e-05,
"loss": 1.0914,
"step": 325
},
{
"epoch": 0.3470031545741325,
"grad_norm": 0.6415970890637294,
"learning_rate": 1.9985504993344375e-05,
"loss": 1.095,
"step": 330
},
{
"epoch": 0.352260778128286,
"grad_norm": 0.7730277501959658,
"learning_rate": 1.9982024539758547e-05,
"loss": 1.1047,
"step": 335
},
{
"epoch": 0.35751840168243953,
"grad_norm": 0.697788892685463,
"learning_rate": 1.997817031543756e-05,
"loss": 1.0943,
"step": 340
},
{
"epoch": 0.3627760252365931,
"grad_norm": 0.7222719849480133,
"learning_rate": 1.9973942464700456e-05,
"loss": 1.0723,
"step": 345
},
{
"epoch": 0.36803364879074657,
"grad_norm": 0.7260613938093592,
"learning_rate": 1.9969341145856493e-05,
"loss": 1.0839,
"step": 350
},
{
"epoch": 0.3732912723449001,
"grad_norm": 0.733047206796414,
"learning_rate": 1.9964366531199205e-05,
"loss": 1.1031,
"step": 355
},
{
"epoch": 0.3785488958990536,
"grad_norm": 0.7026038822669134,
"learning_rate": 1.995901880699997e-05,
"loss": 1.0921,
"step": 360
},
{
"epoch": 0.38380651945320715,
"grad_norm": 0.7130129636017671,
"learning_rate": 1.9953298173501007e-05,
"loss": 1.1082,
"step": 365
},
{
"epoch": 0.3890641430073607,
"grad_norm": 0.6640121507704535,
"learning_rate": 1.9947204844907903e-05,
"loss": 1.0865,
"step": 370
},
{
"epoch": 0.3943217665615142,
"grad_norm": 0.6489539943403665,
"learning_rate": 1.994073904938157e-05,
"loss": 1.1005,
"step": 375
},
{
"epoch": 0.39957939011566773,
"grad_norm": 0.6442461845826825,
"learning_rate": 1.9933901029029732e-05,
"loss": 1.0723,
"step": 380
},
{
"epoch": 0.4048370136698212,
"grad_norm": 0.7261445745563544,
"learning_rate": 1.992669103989783e-05,
"loss": 1.1011,
"step": 385
},
{
"epoch": 0.41009463722397477,
"grad_norm": 0.7222317305626339,
"learning_rate": 1.9919109351959444e-05,
"loss": 1.0908,
"step": 390
},
{
"epoch": 0.4153522607781283,
"grad_norm": 0.6323590176184729,
"learning_rate": 1.9911156249106186e-05,
"loss": 1.089,
"step": 395
},
{
"epoch": 0.4206098843322818,
"grad_norm": 0.6782790061464099,
"learning_rate": 1.9902832029137086e-05,
"loss": 1.0974,
"step": 400
},
{
"epoch": 0.4206098843322818,
"eval_loss": 1.0965957641601562,
"eval_runtime": 611.6915,
"eval_samples_per_second": 22.0,
"eval_steps_per_second": 0.173,
"step": 400
},
{
"epoch": 0.42586750788643535,
"grad_norm": 0.7130643351963079,
"learning_rate": 1.9894137003747404e-05,
"loss": 1.0863,
"step": 405
},
{
"epoch": 0.43112513144058884,
"grad_norm": 0.7035673344638229,
"learning_rate": 1.988507149851699e-05,
"loss": 1.0928,
"step": 410
},
{
"epoch": 0.4363827549947424,
"grad_norm": 0.6366880839024881,
"learning_rate": 1.987563585289808e-05,
"loss": 1.0876,
"step": 415
},
{
"epoch": 0.4416403785488959,
"grad_norm": 0.6724589905602705,
"learning_rate": 1.9865830420202587e-05,
"loss": 1.0814,
"step": 420
},
{
"epoch": 0.4468980021030494,
"grad_norm": 0.7296112006903912,
"learning_rate": 1.9855655567588877e-05,
"loss": 1.0849,
"step": 425
},
{
"epoch": 0.45215562565720296,
"grad_norm": 0.7301202009190912,
"learning_rate": 1.984511167604801e-05,
"loss": 1.0943,
"step": 430
},
{
"epoch": 0.45741324921135645,
"grad_norm": 0.6728038801425467,
"learning_rate": 1.9834199140389485e-05,
"loss": 1.0958,
"step": 435
},
{
"epoch": 0.46267087276551,
"grad_norm": 0.6461291574762016,
"learning_rate": 1.982291836922645e-05,
"loss": 1.0814,
"step": 440
},
{
"epoch": 0.4679284963196635,
"grad_norm": 0.7354488392025322,
"learning_rate": 1.9811269784960404e-05,
"loss": 1.1019,
"step": 445
},
{
"epoch": 0.47318611987381703,
"grad_norm": 0.871639557338332,
"learning_rate": 1.9799253823765383e-05,
"loss": 1.1006,
"step": 450
},
{
"epoch": 0.4784437434279706,
"grad_norm": 0.7250167929947016,
"learning_rate": 1.9786870935571617e-05,
"loss": 1.0976,
"step": 455
},
{
"epoch": 0.48370136698212407,
"grad_norm": 0.7624377086650501,
"learning_rate": 1.97741215840487e-05,
"loss": 1.073,
"step": 460
},
{
"epoch": 0.4889589905362776,
"grad_norm": 0.7335919595002304,
"learning_rate": 1.9761006246588217e-05,
"loss": 1.0928,
"step": 465
},
{
"epoch": 0.4942166140904311,
"grad_norm": 0.6382852192610631,
"learning_rate": 1.9747525414285863e-05,
"loss": 1.0945,
"step": 470
},
{
"epoch": 0.49947423764458465,
"grad_norm": 0.718180529210079,
"learning_rate": 1.9733679591923062e-05,
"loss": 1.0749,
"step": 475
},
{
"epoch": 0.5047318611987381,
"grad_norm": 0.6600718457016724,
"learning_rate": 1.9719469297948076e-05,
"loss": 1.1181,
"step": 480
},
{
"epoch": 0.5099894847528917,
"grad_norm": 0.6689062165685349,
"learning_rate": 1.9704895064456573e-05,
"loss": 1.0952,
"step": 485
},
{
"epoch": 0.5152471083070452,
"grad_norm": 0.7143276328895771,
"learning_rate": 1.968995743717171e-05,
"loss": 1.0896,
"step": 490
},
{
"epoch": 0.5205047318611987,
"grad_norm": 0.6221294359823765,
"learning_rate": 1.9674656975423704e-05,
"loss": 1.0742,
"step": 495
},
{
"epoch": 0.5257623554153522,
"grad_norm": 0.7268351101096144,
"learning_rate": 1.9658994252128884e-05,
"loss": 1.0898,
"step": 500
},
{
"epoch": 0.5257623554153522,
"eval_loss": 1.091992974281311,
"eval_runtime": 577.2656,
"eval_samples_per_second": 23.312,
"eval_steps_per_second": 0.184,
"step": 500
},
{
"epoch": 0.5310199789695058,
"grad_norm": 0.7409815849447423,
"learning_rate": 1.964296985376823e-05,
"loss": 1.0785,
"step": 505
},
{
"epoch": 0.5362776025236593,
"grad_norm": 0.7136236155581998,
"learning_rate": 1.962658438036543e-05,
"loss": 1.0983,
"step": 510
},
{
"epoch": 0.5415352260778128,
"grad_norm": 0.7215624141555339,
"learning_rate": 1.9609838445464406e-05,
"loss": 1.1007,
"step": 515
},
{
"epoch": 0.5467928496319664,
"grad_norm": 0.6979369948772214,
"learning_rate": 1.959273267610633e-05,
"loss": 1.0806,
"step": 520
},
{
"epoch": 0.5520504731861199,
"grad_norm": 0.7255670203711404,
"learning_rate": 1.9575267712806152e-05,
"loss": 1.0753,
"step": 525
},
{
"epoch": 0.5573080967402734,
"grad_norm": 0.6378781651024482,
"learning_rate": 1.955744420952863e-05,
"loss": 1.1001,
"step": 530
},
{
"epoch": 0.562565720294427,
"grad_norm": 0.6440842622036477,
"learning_rate": 1.9539262833663813e-05,
"loss": 1.0867,
"step": 535
},
{
"epoch": 0.5678233438485805,
"grad_norm": 0.650711077304966,
"learning_rate": 1.9520724266002078e-05,
"loss": 1.0861,
"step": 540
},
{
"epoch": 0.573080967402734,
"grad_norm": 0.9412839294952584,
"learning_rate": 1.9501829200708627e-05,
"loss": 1.066,
"step": 545
},
{
"epoch": 0.5783385909568874,
"grad_norm": 0.7997373349509072,
"learning_rate": 1.948257834529749e-05,
"loss": 1.0804,
"step": 550
},
{
"epoch": 0.583596214511041,
"grad_norm": 0.6632970321629863,
"learning_rate": 1.9462972420605045e-05,
"loss": 1.0796,
"step": 555
},
{
"epoch": 0.5888538380651945,
"grad_norm": 0.6907348547616222,
"learning_rate": 1.9443012160763014e-05,
"loss": 1.0914,
"step": 560
},
{
"epoch": 0.594111461619348,
"grad_norm": 0.7602392699866063,
"learning_rate": 1.9422698313170982e-05,
"loss": 1.0782,
"step": 565
},
{
"epoch": 0.5993690851735016,
"grad_norm": 0.7425506195668518,
"learning_rate": 1.9402031638468407e-05,
"loss": 1.0728,
"step": 570
},
{
"epoch": 0.6046267087276551,
"grad_norm": 0.6057134136385478,
"learning_rate": 1.9381012910506146e-05,
"loss": 1.0944,
"step": 575
},
{
"epoch": 0.6098843322818086,
"grad_norm": 0.611926050399381,
"learning_rate": 1.935964291631746e-05,
"loss": 1.0887,
"step": 580
},
{
"epoch": 0.6151419558359621,
"grad_norm": 0.6044521957797464,
"learning_rate": 1.933792245608857e-05,
"loss": 1.0653,
"step": 585
},
{
"epoch": 0.6203995793901157,
"grad_norm": 0.6160859598416025,
"learning_rate": 1.9315852343128677e-05,
"loss": 1.0697,
"step": 590
},
{
"epoch": 0.6256572029442692,
"grad_norm": 0.6454926848454089,
"learning_rate": 1.9293433403839506e-05,
"loss": 1.0835,
"step": 595
},
{
"epoch": 0.6309148264984227,
"grad_norm": 0.6271287719549755,
"learning_rate": 1.9270666477684375e-05,
"loss": 1.0749,
"step": 600
},
{
"epoch": 0.6309148264984227,
"eval_loss": 1.0876203775405884,
"eval_runtime": 619.1152,
"eval_samples_per_second": 21.736,
"eval_steps_per_second": 0.171,
"step": 600
},
{
"epoch": 0.6361724500525763,
"grad_norm": 0.634393838535348,
"learning_rate": 1.9247552417156758e-05,
"loss": 1.0729,
"step": 605
},
{
"epoch": 0.6414300736067298,
"grad_norm": 0.6594690945271786,
"learning_rate": 1.9224092087748344e-05,
"loss": 1.0827,
"step": 610
},
{
"epoch": 0.6466876971608833,
"grad_norm": 0.611714575208264,
"learning_rate": 1.920028636791667e-05,
"loss": 1.0882,
"step": 615
},
{
"epoch": 0.6519453207150369,
"grad_norm": 0.7463577820820205,
"learning_rate": 1.9176136149052184e-05,
"loss": 1.0756,
"step": 620
},
{
"epoch": 0.6572029442691903,
"grad_norm": 0.5943822071057456,
"learning_rate": 1.9151642335444894e-05,
"loss": 1.0781,
"step": 625
},
{
"epoch": 0.6624605678233438,
"grad_norm": 0.6478466639224281,
"learning_rate": 1.9126805844250507e-05,
"loss": 1.0799,
"step": 630
},
{
"epoch": 0.6677181913774973,
"grad_norm": 1.125407499631879,
"learning_rate": 1.910162760545607e-05,
"loss": 1.0863,
"step": 635
},
{
"epoch": 0.6729758149316509,
"grad_norm": 0.6317836803464292,
"learning_rate": 1.9076108561845167e-05,
"loss": 1.068,
"step": 640
},
{
"epoch": 0.6782334384858044,
"grad_norm": 0.6782741352289255,
"learning_rate": 1.90502496689626e-05,
"loss": 1.0717,
"step": 645
},
{
"epoch": 0.6834910620399579,
"grad_norm": 0.6549048073170591,
"learning_rate": 1.902405189507862e-05,
"loss": 1.0729,
"step": 650
},
{
"epoch": 0.6887486855941115,
"grad_norm": 0.5944400668808439,
"learning_rate": 1.899751622115267e-05,
"loss": 1.073,
"step": 655
},
{
"epoch": 0.694006309148265,
"grad_norm": 0.6344443790559094,
"learning_rate": 1.8970643640796642e-05,
"loss": 1.0765,
"step": 660
},
{
"epoch": 0.6992639327024185,
"grad_norm": 0.6066328657447971,
"learning_rate": 1.8943435160237693e-05,
"loss": 1.068,
"step": 665
},
{
"epoch": 0.704521556256572,
"grad_norm": 0.7935810543521484,
"learning_rate": 1.8915891798280545e-05,
"loss": 1.075,
"step": 670
},
{
"epoch": 0.7097791798107256,
"grad_norm": 0.6311479883642119,
"learning_rate": 1.8888014586269353e-05,
"loss": 1.0605,
"step": 675
},
{
"epoch": 0.7150368033648791,
"grad_norm": 0.6247754068444527,
"learning_rate": 1.8859804568049083e-05,
"loss": 1.0853,
"step": 680
},
{
"epoch": 0.7202944269190326,
"grad_norm": 0.6133863303859032,
"learning_rate": 1.8831262799926412e-05,
"loss": 1.0751,
"step": 685
},
{
"epoch": 0.7255520504731862,
"grad_norm": 0.6378281851358015,
"learning_rate": 1.88023903506302e-05,
"loss": 1.086,
"step": 690
},
{
"epoch": 0.7308096740273397,
"grad_norm": 0.6695843196133265,
"learning_rate": 1.8773188301271458e-05,
"loss": 1.0655,
"step": 695
},
{
"epoch": 0.7360672975814931,
"grad_norm": 0.6310578043108518,
"learning_rate": 1.874365774530285e-05,
"loss": 1.0847,
"step": 700
},
{
"epoch": 0.7360672975814931,
"eval_loss": 1.083134412765503,
"eval_runtime": 594.902,
"eval_samples_per_second": 22.621,
"eval_steps_per_second": 0.178,
"step": 700
},
{
"epoch": 0.7413249211356467,
"grad_norm": 0.7538683907974313,
"learning_rate": 1.8713799788477794e-05,
"loss": 1.0691,
"step": 705
},
{
"epoch": 0.7465825446898002,
"grad_norm": 0.706371524563473,
"learning_rate": 1.8683615548809007e-05,
"loss": 1.0654,
"step": 710
},
{
"epoch": 0.7518401682439537,
"grad_norm": 0.7089836009644308,
"learning_rate": 1.865310615652668e-05,
"loss": 1.0732,
"step": 715
},
{
"epoch": 0.7570977917981072,
"grad_norm": 0.6253449282146815,
"learning_rate": 1.862227275403614e-05,
"loss": 1.0595,
"step": 720
},
{
"epoch": 0.7623554153522608,
"grad_norm": 0.6352792231235775,
"learning_rate": 1.8591116495875065e-05,
"loss": 1.0611,
"step": 725
},
{
"epoch": 0.7676130389064143,
"grad_norm": 0.6559807547521417,
"learning_rate": 1.8559638548670276e-05,
"loss": 1.0772,
"step": 730
},
{
"epoch": 0.7728706624605678,
"grad_norm": 0.660949169309788,
"learning_rate": 1.8527840091094038e-05,
"loss": 1.0723,
"step": 735
},
{
"epoch": 0.7781282860147214,
"grad_norm": 0.6485292004090661,
"learning_rate": 1.849572231381993e-05,
"loss": 1.0756,
"step": 740
},
{
"epoch": 0.7833859095688749,
"grad_norm": 0.5894518164357108,
"learning_rate": 1.8463286419478256e-05,
"loss": 1.0878,
"step": 745
},
{
"epoch": 0.7886435331230284,
"grad_norm": 0.6373909243160687,
"learning_rate": 1.843053362261102e-05,
"loss": 1.0698,
"step": 750
},
{
"epoch": 0.7939011566771819,
"grad_norm": 0.6247774742453552,
"learning_rate": 1.8397465149626438e-05,
"loss": 1.0689,
"step": 755
},
{
"epoch": 0.7991587802313355,
"grad_norm": 0.6702489085237104,
"learning_rate": 1.836408223875303e-05,
"loss": 1.0878,
"step": 760
},
{
"epoch": 0.804416403785489,
"grad_norm": 0.5901778445639561,
"learning_rate": 1.8330386139993253e-05,
"loss": 1.0615,
"step": 765
},
{
"epoch": 0.8096740273396424,
"grad_norm": 0.5690160698641555,
"learning_rate": 1.8296378115076683e-05,
"loss": 1.0627,
"step": 770
},
{
"epoch": 0.814931650893796,
"grad_norm": 0.7286612536078287,
"learning_rate": 1.826205943741277e-05,
"loss": 1.0599,
"step": 775
},
{
"epoch": 0.8201892744479495,
"grad_norm": 0.6255138205467193,
"learning_rate": 1.8227431392043188e-05,
"loss": 1.0738,
"step": 780
},
{
"epoch": 0.825446898002103,
"grad_norm": 0.6089376456915286,
"learning_rate": 1.8192495275593667e-05,
"loss": 1.0682,
"step": 785
},
{
"epoch": 0.8307045215562566,
"grad_norm": 0.6155868150283563,
"learning_rate": 1.8157252396225487e-05,
"loss": 1.065,
"step": 790
},
{
"epoch": 0.8359621451104101,
"grad_norm": 0.7289316735890606,
"learning_rate": 1.812170407358647e-05,
"loss": 1.0577,
"step": 795
},
{
"epoch": 0.8412197686645636,
"grad_norm": 0.6194611530873854,
"learning_rate": 1.8085851638761564e-05,
"loss": 1.0749,
"step": 800
},
{
"epoch": 0.8412197686645636,
"eval_loss": 1.0777511596679688,
"eval_runtime": 578.5287,
"eval_samples_per_second": 23.261,
"eval_steps_per_second": 0.183,
"step": 800
},
{
"epoch": 0.8464773922187171,
"grad_norm": 0.5897179737564566,
"learning_rate": 1.8049696434223018e-05,
"loss": 1.064,
"step": 805
},
{
"epoch": 0.8517350157728707,
"grad_norm": 0.6249138645283078,
"learning_rate": 1.801323981378011e-05,
"loss": 1.0689,
"step": 810
},
{
"epoch": 0.8569926393270242,
"grad_norm": 0.6094536651967496,
"learning_rate": 1.797648314252844e-05,
"loss": 1.0547,
"step": 815
},
{
"epoch": 0.8622502628811777,
"grad_norm": 0.6427649229281082,
"learning_rate": 1.7939427796798835e-05,
"loss": 1.0709,
"step": 820
},
{
"epoch": 0.8675078864353313,
"grad_norm": 0.625645109760211,
"learning_rate": 1.790207516410579e-05,
"loss": 1.0711,
"step": 825
},
{
"epoch": 0.8727655099894848,
"grad_norm": 0.6900102876237034,
"learning_rate": 1.7864426643095537e-05,
"loss": 1.0551,
"step": 830
},
{
"epoch": 0.8780231335436383,
"grad_norm": 0.6633694160119932,
"learning_rate": 1.7826483643493664e-05,
"loss": 1.0647,
"step": 835
},
{
"epoch": 0.8832807570977917,
"grad_norm": 0.6706740933862908,
"learning_rate": 1.7788247586052324e-05,
"loss": 1.068,
"step": 840
},
{
"epoch": 0.8885383806519453,
"grad_norm": 0.6147588746912578,
"learning_rate": 1.774971990249703e-05,
"loss": 1.0675,
"step": 845
},
{
"epoch": 0.8937960042060988,
"grad_norm": 0.650347913047383,
"learning_rate": 1.7710902035473075e-05,
"loss": 1.0563,
"step": 850
},
{
"epoch": 0.8990536277602523,
"grad_norm": 0.5896501069060196,
"learning_rate": 1.7671795438491476e-05,
"loss": 1.0549,
"step": 855
},
{
"epoch": 0.9043112513144059,
"grad_norm": 0.5865757288759952,
"learning_rate": 1.763240157587457e-05,
"loss": 1.074,
"step": 860
},
{
"epoch": 0.9095688748685594,
"grad_norm": 0.6448523425472431,
"learning_rate": 1.759272192270118e-05,
"loss": 1.0406,
"step": 865
},
{
"epoch": 0.9148264984227129,
"grad_norm": 0.628930087369231,
"learning_rate": 1.7552757964751375e-05,
"loss": 1.0604,
"step": 870
},
{
"epoch": 0.9200841219768665,
"grad_norm": 0.5573844980993936,
"learning_rate": 1.751251119845085e-05,
"loss": 1.0712,
"step": 875
},
{
"epoch": 0.92534174553102,
"grad_norm": 0.5760631844651097,
"learning_rate": 1.7471983130814872e-05,
"loss": 1.0677,
"step": 880
},
{
"epoch": 0.9305993690851735,
"grad_norm": 0.6608474625527273,
"learning_rate": 1.7431175279391864e-05,
"loss": 1.0564,
"step": 885
},
{
"epoch": 0.935856992639327,
"grad_norm": 0.6158122817932856,
"learning_rate": 1.7390089172206594e-05,
"loss": 1.0698,
"step": 890
},
{
"epoch": 0.9411146161934806,
"grad_norm": 0.6348226976928315,
"learning_rate": 1.7348726347702922e-05,
"loss": 1.0541,
"step": 895
},
{
"epoch": 0.9463722397476341,
"grad_norm": 0.5893951119046926,
"learning_rate": 1.730708835468624e-05,
"loss": 1.055,
"step": 900
},
{
"epoch": 0.9463722397476341,
"eval_loss": 1.0719902515411377,
"eval_runtime": 554.5404,
"eval_samples_per_second": 24.267,
"eval_steps_per_second": 0.191,
"step": 900
},
{
"epoch": 0.9516298633017876,
"grad_norm": 0.6398319094636862,
"learning_rate": 1.7265176752265437e-05,
"loss": 1.0606,
"step": 905
},
{
"epoch": 0.9568874868559412,
"grad_norm": 0.6048116978972946,
"learning_rate": 1.7222993109794547e-05,
"loss": 1.0602,
"step": 910
},
{
"epoch": 0.9621451104100947,
"grad_norm": 0.5840246341713026,
"learning_rate": 1.7180539006813973e-05,
"loss": 1.0479,
"step": 915
},
{
"epoch": 0.9674027339642481,
"grad_norm": 0.5778229669814231,
"learning_rate": 1.7137816032991338e-05,
"loss": 1.0552,
"step": 920
},
{
"epoch": 0.9726603575184016,
"grad_norm": 0.599559903007225,
"learning_rate": 1.7094825788061984e-05,
"loss": 1.0602,
"step": 925
},
{
"epoch": 0.9779179810725552,
"grad_norm": 0.6085935007813816,
"learning_rate": 1.7051569881769033e-05,
"loss": 1.0702,
"step": 930
},
{
"epoch": 0.9831756046267087,
"grad_norm": 0.6210127216958851,
"learning_rate": 1.7008049933803153e-05,
"loss": 1.0562,
"step": 935
},
{
"epoch": 0.9884332281808622,
"grad_norm": 0.5660970609343743,
"learning_rate": 1.696426757374187e-05,
"loss": 1.0488,
"step": 940
},
{
"epoch": 0.9936908517350158,
"grad_norm": 0.6052820312725565,
"learning_rate": 1.6920224440988578e-05,
"loss": 1.0579,
"step": 945
},
{
"epoch": 0.9989484752891693,
"grad_norm": 0.6336659141670167,
"learning_rate": 1.6875922184711152e-05,
"loss": 1.0391,
"step": 950
},
{
"epoch": 1.0042060988433228,
"grad_norm": 0.8649311407022923,
"learning_rate": 1.6831362463780173e-05,
"loss": 0.9427,
"step": 955
},
{
"epoch": 1.0094637223974763,
"grad_norm": 0.7906840430230622,
"learning_rate": 1.6786546946706826e-05,
"loss": 0.9093,
"step": 960
},
{
"epoch": 1.0147213459516298,
"grad_norm": 0.7615451637281871,
"learning_rate": 1.6741477311580442e-05,
"loss": 0.9129,
"step": 965
},
{
"epoch": 1.0199789695057835,
"grad_norm": 0.81395189037578,
"learning_rate": 1.669615524600562e-05,
"loss": 0.9116,
"step": 970
},
{
"epoch": 1.025236593059937,
"grad_norm": 0.6675565867389684,
"learning_rate": 1.6650582447039087e-05,
"loss": 0.897,
"step": 975
},
{
"epoch": 1.0304942166140905,
"grad_norm": 0.6558457233521835,
"learning_rate": 1.6604760621126104e-05,
"loss": 0.9059,
"step": 980
},
{
"epoch": 1.035751840168244,
"grad_norm": 0.791116301575079,
"learning_rate": 1.655869148403661e-05,
"loss": 0.9123,
"step": 985
},
{
"epoch": 1.0410094637223974,
"grad_norm": 0.6281691549427542,
"learning_rate": 1.6512376760800943e-05,
"loss": 0.9165,
"step": 990
},
{
"epoch": 1.046267087276551,
"grad_norm": 0.722210053446233,
"learning_rate": 1.646581818564528e-05,
"loss": 0.8885,
"step": 995
},
{
"epoch": 1.0515247108307044,
"grad_norm": 0.6566766982009167,
"learning_rate": 1.641901750192666e-05,
"loss": 0.9184,
"step": 1000
},
{
"epoch": 1.0515247108307044,
"eval_loss": 1.0817060470581055,
"eval_runtime": 548.8481,
"eval_samples_per_second": 24.519,
"eval_steps_per_second": 0.193,
"step": 1000
},
{
"epoch": 1.0567823343848581,
"grad_norm": 0.7215682123240776,
"learning_rate": 1.6371976462067744e-05,
"loss": 0.9048,
"step": 1005
},
{
"epoch": 1.0620399579390116,
"grad_norm": 0.5754913559382355,
"learning_rate": 1.6324696827491178e-05,
"loss": 0.9062,
"step": 1010
},
{
"epoch": 1.0672975814931651,
"grad_norm": 0.7713724891213452,
"learning_rate": 1.6277180368553637e-05,
"loss": 0.9003,
"step": 1015
},
{
"epoch": 1.0725552050473186,
"grad_norm": 0.6705202466831766,
"learning_rate": 1.622942886447953e-05,
"loss": 0.9076,
"step": 1020
},
{
"epoch": 1.077812828601472,
"grad_norm": 0.7709385226269342,
"learning_rate": 1.6181444103294405e-05,
"loss": 0.9016,
"step": 1025
},
{
"epoch": 1.0830704521556256,
"grad_norm": 0.6618094790250554,
"learning_rate": 1.613322788175796e-05,
"loss": 0.9087,
"step": 1030
},
{
"epoch": 1.088328075709779,
"grad_norm": 0.7111642531915952,
"learning_rate": 1.608478200529679e-05,
"loss": 0.8993,
"step": 1035
},
{
"epoch": 1.0935856992639328,
"grad_norm": 0.9967278615618546,
"learning_rate": 1.6036108287936774e-05,
"loss": 0.9053,
"step": 1040
},
{
"epoch": 1.0988433228180863,
"grad_norm": 0.7211016358920939,
"learning_rate": 1.598720855223516e-05,
"loss": 0.8967,
"step": 1045
},
{
"epoch": 1.1041009463722398,
"grad_norm": 0.681965857428634,
"learning_rate": 1.5938084629212308e-05,
"loss": 0.9069,
"step": 1050
},
{
"epoch": 1.1093585699263933,
"grad_norm": 0.7296745556202008,
"learning_rate": 1.5888738358283125e-05,
"loss": 0.8918,
"step": 1055
},
{
"epoch": 1.1146161934805467,
"grad_norm": 0.6472282910374098,
"learning_rate": 1.5839171587188213e-05,
"loss": 0.8953,
"step": 1060
},
{
"epoch": 1.1198738170347002,
"grad_norm": 0.6420578981972046,
"learning_rate": 1.5789386171924656e-05,
"loss": 0.9185,
"step": 1065
},
{
"epoch": 1.125131440588854,
"grad_norm": 0.6592365438130466,
"learning_rate": 1.5739383976676538e-05,
"loss": 0.9338,
"step": 1070
},
{
"epoch": 1.1303890641430074,
"grad_norm": 0.6668713420054354,
"learning_rate": 1.5689166873745133e-05,
"loss": 0.9071,
"step": 1075
},
{
"epoch": 1.135646687697161,
"grad_norm": 0.6314319656757978,
"learning_rate": 1.5638736743478807e-05,
"loss": 0.9094,
"step": 1080
},
{
"epoch": 1.1409043112513144,
"grad_norm": 0.6557318538936868,
"learning_rate": 1.5588095474202597e-05,
"loss": 0.9056,
"step": 1085
},
{
"epoch": 1.146161934805468,
"grad_norm": 0.6988942180423913,
"learning_rate": 1.55372449621475e-05,
"loss": 0.9093,
"step": 1090
},
{
"epoch": 1.1514195583596214,
"grad_norm": 0.6288925365676942,
"learning_rate": 1.54861871113795e-05,
"loss": 0.8931,
"step": 1095
},
{
"epoch": 1.1566771819137749,
"grad_norm": 0.6060978130757313,
"learning_rate": 1.5434923833728238e-05,
"loss": 0.8955,
"step": 1100
},
{
"epoch": 1.1566771819137749,
"eval_loss": 1.0778801441192627,
"eval_runtime": 560.7689,
"eval_samples_per_second": 23.997,
"eval_steps_per_second": 0.189,
"step": 1100
},
{
"epoch": 1.1619348054679284,
"grad_norm": 0.636138975576772,
"learning_rate": 1.538345704871544e-05,
"loss": 0.9164,
"step": 1105
},
{
"epoch": 1.167192429022082,
"grad_norm": 0.7813214708227075,
"learning_rate": 1.533178868348304e-05,
"loss": 0.9123,
"step": 1110
},
{
"epoch": 1.1724500525762356,
"grad_norm": 0.6454922302300423,
"learning_rate": 1.5279920672721014e-05,
"loss": 0.9096,
"step": 1115
},
{
"epoch": 1.177707676130389,
"grad_norm": 0.6684532969652581,
"learning_rate": 1.522785495859495e-05,
"loss": 0.913,
"step": 1120
},
{
"epoch": 1.1829652996845426,
"grad_norm": 0.659104192691736,
"learning_rate": 1.517559349067331e-05,
"loss": 0.9127,
"step": 1125
},
{
"epoch": 1.188222923238696,
"grad_norm": 0.6327096229416864,
"learning_rate": 1.5123138225854437e-05,
"loss": 0.9179,
"step": 1130
},
{
"epoch": 1.1934805467928495,
"grad_norm": 0.6821427010599724,
"learning_rate": 1.507049112829328e-05,
"loss": 0.916,
"step": 1135
},
{
"epoch": 1.1987381703470033,
"grad_norm": 0.6383663706263557,
"learning_rate": 1.5017654169327847e-05,
"loss": 0.9205,
"step": 1140
},
{
"epoch": 1.2039957939011567,
"grad_norm": 0.6642751432840621,
"learning_rate": 1.4964629327405385e-05,
"loss": 0.9064,
"step": 1145
},
{
"epoch": 1.2092534174553102,
"grad_norm": 0.6370926988086576,
"learning_rate": 1.4911418588008302e-05,
"loss": 0.9009,
"step": 1150
},
{
"epoch": 1.2145110410094637,
"grad_norm": 0.6726809074089126,
"learning_rate": 1.4858023943579831e-05,
"loss": 0.9177,
"step": 1155
},
{
"epoch": 1.2197686645636172,
"grad_norm": 0.6624168311883211,
"learning_rate": 1.4804447393449408e-05,
"loss": 0.9008,
"step": 1160
},
{
"epoch": 1.2250262881177707,
"grad_norm": 0.6736191492385858,
"learning_rate": 1.4750690943757815e-05,
"loss": 0.9177,
"step": 1165
},
{
"epoch": 1.2302839116719242,
"grad_norm": 0.6626164162916314,
"learning_rate": 1.469675660738206e-05,
"loss": 0.9125,
"step": 1170
},
{
"epoch": 1.235541535226078,
"grad_norm": 0.6561095205909978,
"learning_rate": 1.4642646403860017e-05,
"loss": 0.9224,
"step": 1175
},
{
"epoch": 1.2407991587802314,
"grad_norm": 0.6404857197573285,
"learning_rate": 1.4588362359314787e-05,
"loss": 0.9147,
"step": 1180
},
{
"epoch": 1.2460567823343849,
"grad_norm": 0.6247458161762777,
"learning_rate": 1.453390650637884e-05,
"loss": 0.9055,
"step": 1185
},
{
"epoch": 1.2513144058885384,
"grad_norm": 0.6205798650094878,
"learning_rate": 1.4479280884117919e-05,
"loss": 0.9098,
"step": 1190
},
{
"epoch": 1.2565720294426919,
"grad_norm": 0.6171085702613818,
"learning_rate": 1.4424487537954658e-05,
"loss": 0.9086,
"step": 1195
},
{
"epoch": 1.2618296529968454,
"grad_norm": 0.6817002284070426,
"learning_rate": 1.4369528519592016e-05,
"loss": 0.914,
"step": 1200
},
{
"epoch": 1.2618296529968454,
"eval_loss": 1.0758436918258667,
"eval_runtime": 554.9555,
"eval_samples_per_second": 24.249,
"eval_steps_per_second": 0.191,
"step": 1200
},
{
"epoch": 1.267087276550999,
"grad_norm": 0.6556393089241064,
"learning_rate": 1.4314405886936444e-05,
"loss": 0.907,
"step": 1205
},
{
"epoch": 1.2723449001051526,
"grad_norm": 0.6564247019338768,
"learning_rate": 1.425912170402083e-05,
"loss": 0.8947,
"step": 1210
},
{
"epoch": 1.277602523659306,
"grad_norm": 0.6909745550376631,
"learning_rate": 1.4203678040927211e-05,
"loss": 0.9015,
"step": 1215
},
{
"epoch": 1.2828601472134595,
"grad_norm": 0.6649938010634878,
"learning_rate": 1.414807697370926e-05,
"loss": 0.9147,
"step": 1220
},
{
"epoch": 1.288117770767613,
"grad_norm": 0.6827602346821062,
"learning_rate": 1.4092320584314552e-05,
"loss": 0.9223,
"step": 1225
},
{
"epoch": 1.2933753943217665,
"grad_norm": 0.6891969548538285,
"learning_rate": 1.4036410960506601e-05,
"loss": 0.909,
"step": 1230
},
{
"epoch": 1.29863301787592,
"grad_norm": 0.7488612526253159,
"learning_rate": 1.3980350195786691e-05,
"loss": 0.9063,
"step": 1235
},
{
"epoch": 1.3038906414300735,
"grad_norm": 0.8765777386899024,
"learning_rate": 1.3924140389315488e-05,
"loss": 0.8949,
"step": 1240
},
{
"epoch": 1.3091482649842272,
"grad_norm": 0.6756135072464465,
"learning_rate": 1.3867783645834428e-05,
"loss": 0.9173,
"step": 1245
},
{
"epoch": 1.3144058885383807,
"grad_norm": 0.6511543641668399,
"learning_rate": 1.3811282075586916e-05,
"loss": 0.9075,
"step": 1250
},
{
"epoch": 1.3196635120925342,
"grad_norm": 0.6171780710166301,
"learning_rate": 1.3754637794239303e-05,
"loss": 0.8977,
"step": 1255
},
{
"epoch": 1.3249211356466877,
"grad_norm": 0.658721220404947,
"learning_rate": 1.3697852922801669e-05,
"loss": 0.9072,
"step": 1260
},
{
"epoch": 1.3301787592008412,
"grad_norm": 0.6417444192429201,
"learning_rate": 1.3640929587548403e-05,
"loss": 0.9091,
"step": 1265
},
{
"epoch": 1.3354363827549949,
"grad_norm": 0.6187189724748463,
"learning_rate": 1.3583869919938597e-05,
"loss": 0.9129,
"step": 1270
},
{
"epoch": 1.3406940063091484,
"grad_norm": 0.5843959371785157,
"learning_rate": 1.3526676056536205e-05,
"loss": 0.9092,
"step": 1275
},
{
"epoch": 1.3459516298633019,
"grad_norm": 0.6932618289744372,
"learning_rate": 1.3469350138930073e-05,
"loss": 0.9079,
"step": 1280
},
{
"epoch": 1.3512092534174553,
"grad_norm": 0.6598615985676897,
"learning_rate": 1.3411894313653727e-05,
"loss": 0.8944,
"step": 1285
},
{
"epoch": 1.3564668769716088,
"grad_norm": 0.6427748827555393,
"learning_rate": 1.3354310732105014e-05,
"loss": 0.898,
"step": 1290
},
{
"epoch": 1.3617245005257623,
"grad_norm": 0.6121349209877303,
"learning_rate": 1.3296601550465525e-05,
"loss": 0.909,
"step": 1295
},
{
"epoch": 1.3669821240799158,
"grad_norm": 0.6575524447093695,
"learning_rate": 1.3238768929619874e-05,
"loss": 0.9098,
"step": 1300
},
{
"epoch": 1.3669821240799158,
"eval_loss": 1.069818139076233,
"eval_runtime": 559.1797,
"eval_samples_per_second": 24.066,
"eval_steps_per_second": 0.19,
"step": 1300
},
{
"epoch": 1.3722397476340693,
"grad_norm": 0.6531825341664897,
"learning_rate": 1.3180815035074786e-05,
"loss": 0.9171,
"step": 1305
},
{
"epoch": 1.3774973711882228,
"grad_norm": 0.6882987706313063,
"learning_rate": 1.3122742036877994e-05,
"loss": 0.8888,
"step": 1310
},
{
"epoch": 1.3827549947423765,
"grad_norm": 0.724082633852385,
"learning_rate": 1.3064552109537e-05,
"loss": 0.896,
"step": 1315
},
{
"epoch": 1.38801261829653,
"grad_norm": 0.6895669186673943,
"learning_rate": 1.3006247431937644e-05,
"loss": 0.925,
"step": 1320
},
{
"epoch": 1.3932702418506835,
"grad_norm": 0.6718431536804129,
"learning_rate": 1.2947830187262514e-05,
"loss": 0.9099,
"step": 1325
},
{
"epoch": 1.398527865404837,
"grad_norm": 0.688445352407702,
"learning_rate": 1.2889302562909214e-05,
"loss": 0.8949,
"step": 1330
},
{
"epoch": 1.4037854889589905,
"grad_norm": 0.6016293866381901,
"learning_rate": 1.2830666750408434e-05,
"loss": 0.9015,
"step": 1335
},
{
"epoch": 1.4090431125131442,
"grad_norm": 0.6182893633299666,
"learning_rate": 1.2771924945341906e-05,
"loss": 0.9075,
"step": 1340
},
{
"epoch": 1.4143007360672977,
"grad_norm": 0.6593893582600123,
"learning_rate": 1.2713079347260198e-05,
"loss": 0.8963,
"step": 1345
},
{
"epoch": 1.4195583596214512,
"grad_norm": 0.6688143172592789,
"learning_rate": 1.2654132159600327e-05,
"loss": 0.9021,
"step": 1350
},
{
"epoch": 1.4248159831756047,
"grad_norm": 0.6250269029897194,
"learning_rate": 1.2595085589603281e-05,
"loss": 0.9001,
"step": 1355
},
{
"epoch": 1.4300736067297581,
"grad_norm": 0.6184329559921266,
"learning_rate": 1.2535941848231352e-05,
"loss": 0.8931,
"step": 1360
},
{
"epoch": 1.4353312302839116,
"grad_norm": 0.6598155701237914,
"learning_rate": 1.2476703150085356e-05,
"loss": 0.9046,
"step": 1365
},
{
"epoch": 1.4405888538380651,
"grad_norm": 0.6728059285538895,
"learning_rate": 1.2417371713321713e-05,
"loss": 0.9081,
"step": 1370
},
{
"epoch": 1.4458464773922186,
"grad_norm": 0.6795053004000011,
"learning_rate": 1.2357949759569372e-05,
"loss": 0.8935,
"step": 1375
},
{
"epoch": 1.4511041009463723,
"grad_norm": 0.6370835079324721,
"learning_rate": 1.2298439513846634e-05,
"loss": 0.9134,
"step": 1380
},
{
"epoch": 1.4563617245005258,
"grad_norm": 0.6511674325575209,
"learning_rate": 1.2238843204477855e-05,
"loss": 0.9025,
"step": 1385
},
{
"epoch": 1.4616193480546793,
"grad_norm": 0.6486276822993603,
"learning_rate": 1.2179163063009974e-05,
"loss": 0.9084,
"step": 1390
},
{
"epoch": 1.4668769716088328,
"grad_norm": 0.6375900541444521,
"learning_rate": 1.2119401324128976e-05,
"loss": 0.892,
"step": 1395
},
{
"epoch": 1.4721345951629863,
"grad_norm": 0.6636437536958206,
"learning_rate": 1.2059560225576212e-05,
"loss": 0.9126,
"step": 1400
},
{
"epoch": 1.4721345951629863,
"eval_loss": 1.066650629043579,
"eval_runtime": 578.6632,
"eval_samples_per_second": 23.255,
"eval_steps_per_second": 0.183,
"step": 1400
},
{
"epoch": 1.4773922187171398,
"grad_norm": 0.6880602268392096,
"learning_rate": 1.1999642008064612e-05,
"loss": 0.9133,
"step": 1405
},
{
"epoch": 1.4826498422712935,
"grad_norm": 0.6439745800900593,
"learning_rate": 1.1939648915194766e-05,
"loss": 0.8956,
"step": 1410
},
{
"epoch": 1.487907465825447,
"grad_norm": 0.6333947925789535,
"learning_rate": 1.1879583193370934e-05,
"loss": 0.8967,
"step": 1415
},
{
"epoch": 1.4931650893796005,
"grad_norm": 0.6887095313857406,
"learning_rate": 1.1819447091716918e-05,
"loss": 0.8953,
"step": 1420
},
{
"epoch": 1.498422712933754,
"grad_norm": 0.8314454423988585,
"learning_rate": 1.1759242861991855e-05,
"loss": 0.9061,
"step": 1425
},
{
"epoch": 1.5036803364879074,
"grad_norm": 0.6207340757493971,
"learning_rate": 1.1698972758505891e-05,
"loss": 0.884,
"step": 1430
},
{
"epoch": 1.508937960042061,
"grad_norm": 0.6356005817235517,
"learning_rate": 1.1638639038035771e-05,
"loss": 0.9056,
"step": 1435
},
{
"epoch": 1.5141955835962144,
"grad_norm": 0.6341731273814719,
"learning_rate": 1.1578243959740345e-05,
"loss": 0.8926,
"step": 1440
},
{
"epoch": 1.519453207150368,
"grad_norm": 0.6524260051325438,
"learning_rate": 1.1517789785075965e-05,
"loss": 0.8925,
"step": 1445
},
{
"epoch": 1.5247108307045214,
"grad_norm": 0.6390976768866661,
"learning_rate": 1.1457278777711816e-05,
"loss": 0.896,
"step": 1450
},
{
"epoch": 1.5299684542586751,
"grad_norm": 0.672745789784435,
"learning_rate": 1.139671320344514e-05,
"loss": 0.8919,
"step": 1455
},
{
"epoch": 1.5352260778128286,
"grad_norm": 0.6849640495250097,
"learning_rate": 1.1336095330116406e-05,
"loss": 0.8908,
"step": 1460
},
{
"epoch": 1.540483701366982,
"grad_norm": 0.6909452334309092,
"learning_rate": 1.127542742752439e-05,
"loss": 0.901,
"step": 1465
},
{
"epoch": 1.5457413249211358,
"grad_norm": 0.6514347502639167,
"learning_rate": 1.1214711767341184e-05,
"loss": 0.8886,
"step": 1470
},
{
"epoch": 1.5509989484752893,
"grad_norm": 0.7670979545467012,
"learning_rate": 1.1153950623027127e-05,
"loss": 0.8915,
"step": 1475
},
{
"epoch": 1.5562565720294428,
"grad_norm": 0.7094429002966973,
"learning_rate": 1.1093146269745694e-05,
"loss": 0.8986,
"step": 1480
},
{
"epoch": 1.5615141955835963,
"grad_norm": 0.7040092519773771,
"learning_rate": 1.1032300984278286e-05,
"loss": 0.8995,
"step": 1485
},
{
"epoch": 1.5667718191377498,
"grad_norm": 0.6717747776159033,
"learning_rate": 1.0971417044938984e-05,
"loss": 0.8894,
"step": 1490
},
{
"epoch": 1.5720294426919033,
"grad_norm": 0.6111734491076107,
"learning_rate": 1.091049673148924e-05,
"loss": 0.8903,
"step": 1495
},
{
"epoch": 1.5772870662460567,
"grad_norm": 0.6339144886316356,
"learning_rate": 1.0849542325052514e-05,
"loss": 0.9032,
"step": 1500
},
{
"epoch": 1.5772870662460567,
"eval_loss": 1.060400366783142,
"eval_runtime": 553.3344,
"eval_samples_per_second": 24.32,
"eval_steps_per_second": 0.192,
"step": 1500
},
{
"epoch": 1.5825446898002102,
"grad_norm": 0.6119889525138412,
"learning_rate": 1.0788556108028854e-05,
"loss": 0.9059,
"step": 1505
},
{
"epoch": 1.5878023133543637,
"grad_norm": 0.6610719745391888,
"learning_rate": 1.072754036400944e-05,
"loss": 0.8845,
"step": 1510
},
{
"epoch": 1.5930599369085172,
"grad_norm": 0.6334246363490683,
"learning_rate": 1.0666497377691067e-05,
"loss": 0.909,
"step": 1515
},
{
"epoch": 1.598317560462671,
"grad_norm": 0.6600607162051635,
"learning_rate": 1.0605429434790607e-05,
"loss": 0.9101,
"step": 1520
},
{
"epoch": 1.6035751840168244,
"grad_norm": 0.6624807422048473,
"learning_rate": 1.0544338821959407e-05,
"loss": 0.8918,
"step": 1525
},
{
"epoch": 1.608832807570978,
"grad_norm": 0.6540415860179337,
"learning_rate": 1.0483227826697686e-05,
"loss": 0.902,
"step": 1530
},
{
"epoch": 1.6140904311251314,
"grad_norm": 0.6339684794581751,
"learning_rate": 1.0422098737268862e-05,
"loss": 0.9047,
"step": 1535
},
{
"epoch": 1.619348054679285,
"grad_norm": 0.63411282308358,
"learning_rate": 1.0360953842613886e-05,
"loss": 0.9106,
"step": 1540
},
{
"epoch": 1.6246056782334386,
"grad_norm": 0.6246624939138397,
"learning_rate": 1.0299795432265516e-05,
"loss": 0.8941,
"step": 1545
},
{
"epoch": 1.629863301787592,
"grad_norm": 0.6422075365217625,
"learning_rate": 1.0238625796262604e-05,
"loss": 0.8969,
"step": 1550
},
{
"epoch": 1.6351209253417456,
"grad_norm": 0.641718675847965,
"learning_rate": 1.0177447225064334e-05,
"loss": 0.8932,
"step": 1555
},
{
"epoch": 1.640378548895899,
"grad_norm": 0.6996379461819543,
"learning_rate": 1.0116262009464475e-05,
"loss": 0.8988,
"step": 1560
},
{
"epoch": 1.6456361724500526,
"grad_norm": 0.6496660294162664,
"learning_rate": 1.0055072440505576e-05,
"loss": 0.8857,
"step": 1565
},
{
"epoch": 1.650893796004206,
"grad_norm": 0.6913136358312865,
"learning_rate": 9.993880809393203e-06,
"loss": 0.8953,
"step": 1570
},
{
"epoch": 1.6561514195583595,
"grad_norm": 0.6323428927883549,
"learning_rate": 9.932689407410136e-06,
"loss": 0.894,
"step": 1575
},
{
"epoch": 1.661409043112513,
"grad_norm": 0.7165826659774039,
"learning_rate": 9.871500525830581e-06,
"loss": 0.8946,
"step": 1580
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.6630355223364007,
"learning_rate": 9.810316455834359e-06,
"loss": 0.8907,
"step": 1585
},
{
"epoch": 1.6719242902208202,
"grad_norm": 0.6096362135364939,
"learning_rate": 9.749139488421133e-06,
"loss": 0.893,
"step": 1590
},
{
"epoch": 1.6771819137749737,
"grad_norm": 0.6414609478289887,
"learning_rate": 9.687971914324607e-06,
"loss": 0.897,
"step": 1595
},
{
"epoch": 1.6824395373291272,
"grad_norm": 0.6909628111495161,
"learning_rate": 9.626816023926771e-06,
"loss": 0.8882,
"step": 1600
},
{
"epoch": 1.6824395373291272,
"eval_loss": 1.054638385772705,
"eval_runtime": 554.7908,
"eval_samples_per_second": 24.256,
"eval_steps_per_second": 0.191,
"step": 1600
},
{
"epoch": 1.687697160883281,
"grad_norm": 0.658967287448874,
"learning_rate": 9.565674107172109e-06,
"loss": 0.8963,
"step": 1605
},
{
"epoch": 1.6929547844374344,
"grad_norm": 0.671113099618244,
"learning_rate": 9.504548453481875e-06,
"loss": 0.9006,
"step": 1610
},
{
"epoch": 1.698212407991588,
"grad_norm": 0.6356681511467472,
"learning_rate": 9.443441351668375e-06,
"loss": 0.8855,
"step": 1615
},
{
"epoch": 1.7034700315457414,
"grad_norm": 0.6733155642148883,
"learning_rate": 9.382355089849235e-06,
"loss": 0.8918,
"step": 1620
},
{
"epoch": 1.7087276550998949,
"grad_norm": 0.6923042640634246,
"learning_rate": 9.321291955361756e-06,
"loss": 0.8933,
"step": 1625
},
{
"epoch": 1.7139852786540484,
"grad_norm": 0.6424747471753014,
"learning_rate": 9.260254234677235e-06,
"loss": 0.8816,
"step": 1630
},
{
"epoch": 1.7192429022082019,
"grad_norm": 0.6144029954554266,
"learning_rate": 9.199244213315377e-06,
"loss": 0.8905,
"step": 1635
},
{
"epoch": 1.7245005257623554,
"grad_norm": 0.6170077707358106,
"learning_rate": 9.138264175758693e-06,
"loss": 0.8863,
"step": 1640
},
{
"epoch": 1.7297581493165088,
"grad_norm": 0.6197301026220858,
"learning_rate": 9.07731640536698e-06,
"loss": 0.8796,
"step": 1645
},
{
"epoch": 1.7350157728706623,
"grad_norm": 0.6643068163348533,
"learning_rate": 9.016403184291805e-06,
"loss": 0.8908,
"step": 1650
},
{
"epoch": 1.7402733964248158,
"grad_norm": 0.5938702426426252,
"learning_rate": 8.955526793391049e-06,
"loss": 0.8902,
"step": 1655
},
{
"epoch": 1.7455310199789695,
"grad_norm": 0.6474013520993763,
"learning_rate": 8.894689512143528e-06,
"loss": 0.8862,
"step": 1660
},
{
"epoch": 1.750788643533123,
"grad_norm": 0.6350300886593221,
"learning_rate": 8.833893618563604e-06,
"loss": 0.8847,
"step": 1665
},
{
"epoch": 1.7560462670872765,
"grad_norm": 0.6377181777254709,
"learning_rate": 8.773141389115914e-06,
"loss": 0.8865,
"step": 1670
},
{
"epoch": 1.7613038906414302,
"grad_norm": 0.6115861001350186,
"learning_rate": 8.712435098630116e-06,
"loss": 0.8863,
"step": 1675
},
{
"epoch": 1.7665615141955837,
"grad_norm": 0.6631610912721477,
"learning_rate": 8.651777020215713e-06,
"loss": 0.8959,
"step": 1680
},
{
"epoch": 1.7718191377497372,
"grad_norm": 0.6241016927327407,
"learning_rate": 8.591169425176931e-06,
"loss": 0.8726,
"step": 1685
},
{
"epoch": 1.7770767613038907,
"grad_norm": 0.6207578864949994,
"learning_rate": 8.53061458292768e-06,
"loss": 0.8892,
"step": 1690
},
{
"epoch": 1.7823343848580442,
"grad_norm": 0.6848519519116634,
"learning_rate": 8.470114760906583e-06,
"loss": 0.8943,
"step": 1695
},
{
"epoch": 1.7875920084121977,
"grad_norm": 0.6571666376626863,
"learning_rate": 8.409672224492051e-06,
"loss": 0.8847,
"step": 1700
},
{
"epoch": 1.7875920084121977,
"eval_loss": 1.049035906791687,
"eval_runtime": 554.1715,
"eval_samples_per_second": 24.283,
"eval_steps_per_second": 0.191,
"step": 1700
},
{
"epoch": 1.7928496319663512,
"grad_norm": 0.645616472902103,
"learning_rate": 8.349289236917482e-06,
"loss": 0.8816,
"step": 1705
},
{
"epoch": 1.7981072555205047,
"grad_norm": 0.6574673506951342,
"learning_rate": 8.28896805918649e-06,
"loss": 0.8648,
"step": 1710
},
{
"epoch": 1.8033648790746581,
"grad_norm": 0.6469048695832662,
"learning_rate": 8.228710949988283e-06,
"loss": 0.8844,
"step": 1715
},
{
"epoch": 1.8086225026288116,
"grad_norm": 0.6503850752063266,
"learning_rate": 8.168520165613035e-06,
"loss": 0.8927,
"step": 1720
},
{
"epoch": 1.8138801261829653,
"grad_norm": 0.6478121209226875,
"learning_rate": 8.108397959867445e-06,
"loss": 0.8973,
"step": 1725
},
{
"epoch": 1.8191377497371188,
"grad_norm": 0.6456428948521569,
"learning_rate": 8.04834658399032e-06,
"loss": 0.8829,
"step": 1730
},
{
"epoch": 1.8243953732912723,
"grad_norm": 0.6790517960706193,
"learning_rate": 7.988368286568287e-06,
"loss": 0.8756,
"step": 1735
},
{
"epoch": 1.8296529968454258,
"grad_norm": 0.7206882773594423,
"learning_rate": 7.928465313451603e-06,
"loss": 0.9051,
"step": 1740
},
{
"epoch": 1.8349106203995795,
"grad_norm": 0.6598808403648849,
"learning_rate": 7.868639907670042e-06,
"loss": 0.9019,
"step": 1745
},
{
"epoch": 1.840168243953733,
"grad_norm": 0.8136122353035425,
"learning_rate": 7.808894309348925e-06,
"loss": 0.8814,
"step": 1750
},
{
"epoch": 1.8454258675078865,
"grad_norm": 0.6383354015679575,
"learning_rate": 7.749230755625228e-06,
"loss": 0.8775,
"step": 1755
},
{
"epoch": 1.85068349106204,
"grad_norm": 0.6270079438127367,
"learning_rate": 7.689651480563824e-06,
"loss": 0.8959,
"step": 1760
},
{
"epoch": 1.8559411146161935,
"grad_norm": 0.6235081226194247,
"learning_rate": 7.630158715073813e-06,
"loss": 0.8871,
"step": 1765
},
{
"epoch": 1.861198738170347,
"grad_norm": 0.6526009528156013,
"learning_rate": 7.570754686825004e-06,
"loss": 0.8867,
"step": 1770
},
{
"epoch": 1.8664563617245005,
"grad_norm": 0.6321251834720393,
"learning_rate": 7.511441620164499e-06,
"loss": 0.9111,
"step": 1775
},
{
"epoch": 1.871713985278654,
"grad_norm": 0.6579757228675541,
"learning_rate": 7.452221736033387e-06,
"loss": 0.8758,
"step": 1780
},
{
"epoch": 1.8769716088328074,
"grad_norm": 0.6288476910531294,
"learning_rate": 7.393097251883609e-06,
"loss": 0.8848,
"step": 1785
},
{
"epoch": 1.882229232386961,
"grad_norm": 0.6671896908639643,
"learning_rate": 7.334070381594904e-06,
"loss": 0.8879,
"step": 1790
},
{
"epoch": 1.8874868559411146,
"grad_norm": 0.6375714540658346,
"learning_rate": 7.275143335391927e-06,
"loss": 0.8871,
"step": 1795
},
{
"epoch": 1.8927444794952681,
"grad_norm": 0.6461378473926269,
"learning_rate": 7.21631831976147e-06,
"loss": 0.8831,
"step": 1800
},
{
"epoch": 1.8927444794952681,
"eval_loss": 1.0454537868499756,
"eval_runtime": 554.7002,
"eval_samples_per_second": 24.26,
"eval_steps_per_second": 0.191,
"step": 1800
},
{
"epoch": 1.8980021030494216,
"grad_norm": 0.6266360175385085,
"learning_rate": 7.157597537369866e-06,
"loss": 0.8836,
"step": 1805
},
{
"epoch": 1.9032597266035753,
"grad_norm": 0.7106762429735706,
"learning_rate": 7.098983186980495e-06,
"loss": 0.8894,
"step": 1810
},
{
"epoch": 1.9085173501577288,
"grad_norm": 0.6449309860617594,
"learning_rate": 7.040477463371449e-06,
"loss": 0.8961,
"step": 1815
},
{
"epoch": 1.9137749737118823,
"grad_norm": 0.6118460786718801,
"learning_rate": 6.982082557253371e-06,
"loss": 0.8898,
"step": 1820
},
{
"epoch": 1.9190325972660358,
"grad_norm": 0.6200070078112132,
"learning_rate": 6.9238006551873985e-06,
"loss": 0.8993,
"step": 1825
},
{
"epoch": 1.9242902208201893,
"grad_norm": 0.62946195709294,
"learning_rate": 6.86563393950331e-06,
"loss": 0.8746,
"step": 1830
},
{
"epoch": 1.9295478443743428,
"grad_norm": 0.6894329752058552,
"learning_rate": 6.807584588217798e-06,
"loss": 0.8768,
"step": 1835
},
{
"epoch": 1.9348054679284963,
"grad_norm": 0.6337025218810814,
"learning_rate": 6.749654774952925e-06,
"loss": 0.8774,
"step": 1840
},
{
"epoch": 1.9400630914826498,
"grad_norm": 0.6061458342443647,
"learning_rate": 6.691846668854709e-06,
"loss": 0.8925,
"step": 1845
},
{
"epoch": 1.9453207150368033,
"grad_norm": 0.6323722322620482,
"learning_rate": 6.634162434511939e-06,
"loss": 0.8878,
"step": 1850
},
{
"epoch": 1.9505783385909568,
"grad_norm": 0.6295608770739457,
"learning_rate": 6.57660423187509e-06,
"loss": 0.8894,
"step": 1855
},
{
"epoch": 1.9558359621451105,
"grad_norm": 0.7448236764255614,
"learning_rate": 6.519174216175458e-06,
"loss": 0.884,
"step": 1860
},
{
"epoch": 1.961093585699264,
"grad_norm": 0.6643005564433259,
"learning_rate": 6.461874537844465e-06,
"loss": 0.8712,
"step": 1865
},
{
"epoch": 1.9663512092534174,
"grad_norm": 0.6460805751831616,
"learning_rate": 6.404707342433123e-06,
"loss": 0.8794,
"step": 1870
},
{
"epoch": 1.971608832807571,
"grad_norm": 0.6437260367816269,
"learning_rate": 6.347674770531716e-06,
"loss": 0.8913,
"step": 1875
},
{
"epoch": 1.9768664563617246,
"grad_norm": 0.6422567155892785,
"learning_rate": 6.2907789576896125e-06,
"loss": 0.8722,
"step": 1880
},
{
"epoch": 1.9821240799158781,
"grad_norm": 0.6631332611742206,
"learning_rate": 6.2340220343353455e-06,
"loss": 0.8747,
"step": 1885
},
{
"epoch": 1.9873817034700316,
"grad_norm": 0.5923326352879508,
"learning_rate": 6.177406125696804e-06,
"loss": 0.8863,
"step": 1890
},
{
"epoch": 1.9926393270241851,
"grad_norm": 0.6040038053093328,
"learning_rate": 6.120933351721665e-06,
"loss": 0.8822,
"step": 1895
},
{
"epoch": 1.9978969505783386,
"grad_norm": 0.6155901401028533,
"learning_rate": 6.064605826998031e-06,
"loss": 0.8781,
"step": 1900
},
{
"epoch": 1.9978969505783386,
"eval_loss": 1.0413092374801636,
"eval_runtime": 562.2304,
"eval_samples_per_second": 23.935,
"eval_steps_per_second": 0.189,
"step": 1900
},
{
"epoch": 2.003154574132492,
"grad_norm": 0.9856009462574625,
"learning_rate": 6.00842566067522e-06,
"loss": 0.7565,
"step": 1905
},
{
"epoch": 2.0084121976866456,
"grad_norm": 1.1010436371290768,
"learning_rate": 5.952394956384823e-06,
"loss": 0.7157,
"step": 1910
},
{
"epoch": 2.013669821240799,
"grad_norm": 0.7976498381871772,
"learning_rate": 5.896515812161896e-06,
"loss": 0.7125,
"step": 1915
},
{
"epoch": 2.0189274447949526,
"grad_norm": 0.7307266469267819,
"learning_rate": 5.840790320366444e-06,
"loss": 0.7208,
"step": 1920
},
{
"epoch": 2.024185068349106,
"grad_norm": 0.7601898382424687,
"learning_rate": 5.7852205676050355e-06,
"loss": 0.7079,
"step": 1925
},
{
"epoch": 2.0294426919032595,
"grad_norm": 0.7636705961643997,
"learning_rate": 5.7298086346527e-06,
"loss": 0.7021,
"step": 1930
},
{
"epoch": 2.034700315457413,
"grad_norm": 0.7600945150765135,
"learning_rate": 5.674556596374993e-06,
"loss": 0.698,
"step": 1935
},
{
"epoch": 2.039957939011567,
"grad_norm": 0.7283661435999434,
"learning_rate": 5.619466521650309e-06,
"loss": 0.7135,
"step": 1940
},
{
"epoch": 2.0452155625657205,
"grad_norm": 0.673772367415323,
"learning_rate": 5.564540473292433e-06,
"loss": 0.712,
"step": 1945
},
{
"epoch": 2.050473186119874,
"grad_norm": 0.7274211486508272,
"learning_rate": 5.509780507973266e-06,
"loss": 0.7316,
"step": 1950
},
{
"epoch": 2.0557308096740274,
"grad_norm": 0.7019707530514135,
"learning_rate": 5.455188676145846e-06,
"loss": 0.7178,
"step": 1955
},
{
"epoch": 2.060988433228181,
"grad_norm": 0.727356607819457,
"learning_rate": 5.40076702196755e-06,
"loss": 0.6901,
"step": 1960
},
{
"epoch": 2.0662460567823344,
"grad_norm": 0.7393497514045044,
"learning_rate": 5.346517583223567e-06,
"loss": 0.7091,
"step": 1965
},
{
"epoch": 2.071503680336488,
"grad_norm": 0.6909005753061759,
"learning_rate": 5.292442391250567e-06,
"loss": 0.7103,
"step": 1970
},
{
"epoch": 2.0767613038906414,
"grad_norm": 0.7199779190451211,
"learning_rate": 5.238543470860677e-06,
"loss": 0.7142,
"step": 1975
},
{
"epoch": 2.082018927444795,
"grad_norm": 0.6986050924763797,
"learning_rate": 5.184822840265635e-06,
"loss": 0.719,
"step": 1980
},
{
"epoch": 2.0872765509989484,
"grad_norm": 0.6873483374112779,
"learning_rate": 5.131282511001221e-06,
"loss": 0.7188,
"step": 1985
},
{
"epoch": 2.092534174553102,
"grad_norm": 0.704017833699201,
"learning_rate": 5.077924487851954e-06,
"loss": 0.7206,
"step": 1990
},
{
"epoch": 2.0977917981072554,
"grad_norm": 0.6869215244017003,
"learning_rate": 5.024750768776011e-06,
"loss": 0.7197,
"step": 1995
},
{
"epoch": 2.103049421661409,
"grad_norm": 0.6616206251205331,
"learning_rate": 4.971763344830419e-06,
"loss": 0.7197,
"step": 2000
},
{
"epoch": 2.103049421661409,
"eval_loss": 1.0822256803512573,
"eval_runtime": 566.9236,
"eval_samples_per_second": 23.737,
"eval_steps_per_second": 0.187,
"step": 2000
},
{
"epoch": 2.108307045215563,
"grad_norm": 0.747519024431639,
"learning_rate": 4.91896420009649e-06,
"loss": 0.7115,
"step": 2005
},
{
"epoch": 2.1135646687697163,
"grad_norm": 0.7095283324919017,
"learning_rate": 4.866355311605547e-06,
"loss": 0.7215,
"step": 2010
},
{
"epoch": 2.1188222923238698,
"grad_norm": 0.7245597363837365,
"learning_rate": 4.813938649264881e-06,
"loss": 0.7038,
"step": 2015
},
{
"epoch": 2.1240799158780233,
"grad_norm": 0.7212203821120433,
"learning_rate": 4.7617161757839895e-06,
"loss": 0.715,
"step": 2020
},
{
"epoch": 2.1293375394321767,
"grad_norm": 0.7027132940392441,
"learning_rate": 4.7096898466010976e-06,
"loss": 0.716,
"step": 2025
},
{
"epoch": 2.1345951629863302,
"grad_norm": 0.6920491890608464,
"learning_rate": 4.657861609809923e-06,
"loss": 0.7027,
"step": 2030
},
{
"epoch": 2.1398527865404837,
"grad_norm": 0.7246862757367895,
"learning_rate": 4.6062334060867416e-06,
"loss": 0.7211,
"step": 2035
},
{
"epoch": 2.145110410094637,
"grad_norm": 0.6816731320053306,
"learning_rate": 4.554807168617703e-06,
"loss": 0.7127,
"step": 2040
},
{
"epoch": 2.1503680336487907,
"grad_norm": 0.7013385203267727,
"learning_rate": 4.5035848230264715e-06,
"loss": 0.7158,
"step": 2045
},
{
"epoch": 2.155625657202944,
"grad_norm": 0.7169543079018775,
"learning_rate": 4.452568287302088e-06,
"loss": 0.7071,
"step": 2050
},
{
"epoch": 2.1608832807570977,
"grad_norm": 0.713248407044651,
"learning_rate": 4.40175947172719e-06,
"loss": 0.7068,
"step": 2055
},
{
"epoch": 2.166140904311251,
"grad_norm": 0.6698951380098755,
"learning_rate": 4.351160278806444e-06,
"loss": 0.7169,
"step": 2060
},
{
"epoch": 2.1713985278654047,
"grad_norm": 0.6926886822542322,
"learning_rate": 4.300772603195335e-06,
"loss": 0.7097,
"step": 2065
},
{
"epoch": 2.176656151419558,
"grad_norm": 0.7101604887955768,
"learning_rate": 4.250598331629215e-06,
"loss": 0.7199,
"step": 2070
},
{
"epoch": 2.181913774973712,
"grad_norm": 0.6817786841786956,
"learning_rate": 4.200639342852648e-06,
"loss": 0.709,
"step": 2075
},
{
"epoch": 2.1871713985278656,
"grad_norm": 0.670024634466742,
"learning_rate": 4.150897507549076e-06,
"loss": 0.7031,
"step": 2080
},
{
"epoch": 2.192429022082019,
"grad_norm": 0.704511383930273,
"learning_rate": 4.101374688270751e-06,
"loss": 0.716,
"step": 2085
},
{
"epoch": 2.1976866456361726,
"grad_norm": 0.6737857814580686,
"learning_rate": 4.052072739369015e-06,
"loss": 0.7151,
"step": 2090
},
{
"epoch": 2.202944269190326,
"grad_norm": 0.7004818342552892,
"learning_rate": 4.0029935069248494e-06,
"loss": 0.7084,
"step": 2095
},
{
"epoch": 2.2082018927444795,
"grad_norm": 0.6938485406548258,
"learning_rate": 3.954138828679762e-06,
"loss": 0.7137,
"step": 2100
},
{
"epoch": 2.2082018927444795,
"eval_loss": 1.0840835571289062,
"eval_runtime": 554.526,
"eval_samples_per_second": 24.268,
"eval_steps_per_second": 0.191,
"step": 2100
},
{
"epoch": 2.213459516298633,
"grad_norm": 0.6902078976776752,
"learning_rate": 3.905510533966959e-06,
"loss": 0.7096,
"step": 2105
},
{
"epoch": 2.2187171398527865,
"grad_norm": 0.7110522716973304,
"learning_rate": 3.857110443642864e-06,
"loss": 0.6949,
"step": 2110
},
{
"epoch": 2.22397476340694,
"grad_norm": 0.7247408104466715,
"learning_rate": 3.8089403700189254e-06,
"loss": 0.7187,
"step": 2115
},
{
"epoch": 2.2292323869610935,
"grad_norm": 0.7097288878868501,
"learning_rate": 3.7610021167937526e-06,
"loss": 0.7036,
"step": 2120
},
{
"epoch": 2.234490010515247,
"grad_norm": 0.7612906599424331,
"learning_rate": 3.713297478985595e-06,
"loss": 0.7205,
"step": 2125
},
{
"epoch": 2.2397476340694005,
"grad_norm": 0.7985865232124967,
"learning_rate": 3.6658282428651026e-06,
"loss": 0.7018,
"step": 2130
},
{
"epoch": 2.245005257623554,
"grad_norm": 0.6445514804150951,
"learning_rate": 3.618596185888471e-06,
"loss": 0.6983,
"step": 2135
},
{
"epoch": 2.250262881177708,
"grad_norm": 0.6788252376343907,
"learning_rate": 3.5716030766308553e-06,
"loss": 0.6963,
"step": 2140
},
{
"epoch": 2.2555205047318614,
"grad_norm": 0.6558652902911214,
"learning_rate": 3.5248506747201694e-06,
"loss": 0.6988,
"step": 2145
},
{
"epoch": 2.260778128286015,
"grad_norm": 0.727190238646923,
"learning_rate": 3.4783407307711913e-06,
"loss": 0.701,
"step": 2150
},
{
"epoch": 2.2660357518401684,
"grad_norm": 0.7053251271830925,
"learning_rate": 3.4320749863199987e-06,
"loss": 0.7038,
"step": 2155
},
{
"epoch": 2.271293375394322,
"grad_norm": 0.691685408706534,
"learning_rate": 3.3860551737587857e-06,
"loss": 0.7068,
"step": 2160
},
{
"epoch": 2.2765509989484753,
"grad_norm": 0.6897266118308167,
"learning_rate": 3.3402830162709644e-06,
"loss": 0.703,
"step": 2165
},
{
"epoch": 2.281808622502629,
"grad_norm": 0.6917521598477109,
"learning_rate": 3.2947602277666678e-06,
"loss": 0.7136,
"step": 2170
},
{
"epoch": 2.2870662460567823,
"grad_norm": 0.6899343095386444,
"learning_rate": 3.2494885128185517e-06,
"loss": 0.6984,
"step": 2175
},
{
"epoch": 2.292323869610936,
"grad_norm": 0.6869089208872174,
"learning_rate": 3.2044695665979865e-06,
"loss": 0.724,
"step": 2180
},
{
"epoch": 2.2975814931650893,
"grad_norm": 0.7005346292608602,
"learning_rate": 3.1597050748115655e-06,
"loss": 0.7035,
"step": 2185
},
{
"epoch": 2.302839116719243,
"grad_norm": 0.7061499912056902,
"learning_rate": 3.115196713638e-06,
"loss": 0.6865,
"step": 2190
},
{
"epoch": 2.3080967402733963,
"grad_norm": 0.6815319705079519,
"learning_rate": 3.0709461496653504e-06,
"loss": 0.7156,
"step": 2195
},
{
"epoch": 2.3133543638275498,
"grad_norm": 0.7049825225126681,
"learning_rate": 3.0269550398286096e-06,
"loss": 0.7115,
"step": 2200
},
{
"epoch": 2.3133543638275498,
"eval_loss": 1.0800352096557617,
"eval_runtime": 568.5479,
"eval_samples_per_second": 23.669,
"eval_steps_per_second": 0.186,
"step": 2200
},
{
"epoch": 2.3186119873817033,
"grad_norm": 0.6675183707377966,
"learning_rate": 2.983225031347683e-06,
"loss": 0.7087,
"step": 2205
},
{
"epoch": 2.3238696109358568,
"grad_norm": 0.7114348169331429,
"learning_rate": 2.939757761665686e-06,
"loss": 0.7077,
"step": 2210
},
{
"epoch": 2.3291272344900107,
"grad_norm": 0.7191874914216904,
"learning_rate": 2.8965548583876534e-06,
"loss": 0.7201,
"step": 2215
},
{
"epoch": 2.334384858044164,
"grad_norm": 0.6766258501238187,
"learning_rate": 2.853617939219574e-06,
"loss": 0.7072,
"step": 2220
},
{
"epoch": 2.3396424815983177,
"grad_norm": 0.7028752741574394,
"learning_rate": 2.810948611907832e-06,
"loss": 0.6955,
"step": 2225
},
{
"epoch": 2.344900105152471,
"grad_norm": 0.7210493538085075,
"learning_rate": 2.7685484741790023e-06,
"loss": 0.7129,
"step": 2230
},
{
"epoch": 2.3501577287066246,
"grad_norm": 0.6928964162595481,
"learning_rate": 2.7264191136800112e-06,
"loss": 0.6873,
"step": 2235
},
{
"epoch": 2.355415352260778,
"grad_norm": 0.6949752358383088,
"learning_rate": 2.6845621079187122e-06,
"loss": 0.7207,
"step": 2240
},
{
"epoch": 2.3606729758149316,
"grad_norm": 0.7000497878298911,
"learning_rate": 2.6429790242047927e-06,
"loss": 0.7019,
"step": 2245
},
{
"epoch": 2.365930599369085,
"grad_norm": 0.6655488986940491,
"learning_rate": 2.6016714195911085e-06,
"loss": 0.6909,
"step": 2250
},
{
"epoch": 2.3711882229232386,
"grad_norm": 0.6946100724369102,
"learning_rate": 2.560640840815363e-06,
"loss": 0.703,
"step": 2255
},
{
"epoch": 2.376445846477392,
"grad_norm": 0.6799665527381428,
"learning_rate": 2.5198888242422014e-06,
"loss": 0.7029,
"step": 2260
},
{
"epoch": 2.3817034700315456,
"grad_norm": 0.698092499847167,
"learning_rate": 2.4794168958056854e-06,
"loss": 0.706,
"step": 2265
},
{
"epoch": 2.386961093585699,
"grad_norm": 0.6725956864860293,
"learning_rate": 2.439226570952137e-06,
"loss": 0.7087,
"step": 2270
},
{
"epoch": 2.392218717139853,
"grad_norm": 0.7109494323803826,
"learning_rate": 2.3993193545834182e-06,
"loss": 0.7125,
"step": 2275
},
{
"epoch": 2.3974763406940065,
"grad_norm": 0.7088160313512611,
"learning_rate": 2.35969674100056e-06,
"loss": 0.6979,
"step": 2280
},
{
"epoch": 2.40273396424816,
"grad_norm": 0.6826523489540324,
"learning_rate": 2.3203602138478264e-06,
"loss": 0.7055,
"step": 2285
},
{
"epoch": 2.4079915878023135,
"grad_norm": 0.6930882874841964,
"learning_rate": 2.281311246057143e-06,
"loss": 0.7201,
"step": 2290
},
{
"epoch": 2.413249211356467,
"grad_norm": 0.6782194389254947,
"learning_rate": 2.242551299792962e-06,
"loss": 0.7278,
"step": 2295
},
{
"epoch": 2.4185068349106205,
"grad_norm": 0.6611886260527141,
"learning_rate": 2.204081826397494e-06,
"loss": 0.7178,
"step": 2300
},
{
"epoch": 2.4185068349106205,
"eval_loss": 1.0789012908935547,
"eval_runtime": 548.9059,
"eval_samples_per_second": 24.516,
"eval_steps_per_second": 0.193,
"step": 2300
},
{
"epoch": 2.423764458464774,
"grad_norm": 0.6913748928617807,
"learning_rate": 2.1659042663363795e-06,
"loss": 0.7031,
"step": 2305
},
{
"epoch": 2.4290220820189274,
"grad_norm": 0.68971986235768,
"learning_rate": 2.1280200491447465e-06,
"loss": 0.6902,
"step": 2310
},
{
"epoch": 2.434279705573081,
"grad_norm": 0.7068453091320502,
"learning_rate": 2.0904305933736714e-06,
"loss": 0.7064,
"step": 2315
},
{
"epoch": 2.4395373291272344,
"grad_norm": 0.7009937280786678,
"learning_rate": 2.053137306537082e-06,
"loss": 0.702,
"step": 2320
},
{
"epoch": 2.444794952681388,
"grad_norm": 0.7009541498050648,
"learning_rate": 2.0161415850590327e-06,
"loss": 0.7072,
"step": 2325
},
{
"epoch": 2.4500525762355414,
"grad_norm": 0.6679413662712783,
"learning_rate": 1.9794448142214396e-06,
"loss": 0.7121,
"step": 2330
},
{
"epoch": 2.455310199789695,
"grad_norm": 0.6929272185822167,
"learning_rate": 1.9430483681121836e-06,
"loss": 0.7164,
"step": 2335
},
{
"epoch": 2.4605678233438484,
"grad_norm": 0.7778000958451866,
"learning_rate": 1.9069536095736817e-06,
"loss": 0.7091,
"step": 2340
},
{
"epoch": 2.465825446898002,
"grad_norm": 0.6672776696135466,
"learning_rate": 1.8711618901518446e-06,
"loss": 0.7132,
"step": 2345
},
{
"epoch": 2.471083070452156,
"grad_norm": 0.6949140160619673,
"learning_rate": 1.8356745500454699e-06,
"loss": 0.6974,
"step": 2350
},
{
"epoch": 2.4763406940063093,
"grad_norm": 0.6950911698278153,
"learning_rate": 1.8004929180560582e-06,
"loss": 0.6894,
"step": 2355
},
{
"epoch": 2.481598317560463,
"grad_norm": 0.6826148060946653,
"learning_rate": 1.7656183115380577e-06,
"loss": 0.7043,
"step": 2360
},
{
"epoch": 2.4868559411146163,
"grad_norm": 0.7310354415413428,
"learning_rate": 1.7310520363495454e-06,
"loss": 0.7021,
"step": 2365
},
{
"epoch": 2.4921135646687698,
"grad_norm": 0.6754671470342107,
"learning_rate": 1.6967953868033104e-06,
"loss": 0.7043,
"step": 2370
},
{
"epoch": 2.4973711882229233,
"grad_norm": 0.6935442287350769,
"learning_rate": 1.6628496456184107e-06,
"loss": 0.6994,
"step": 2375
},
{
"epoch": 2.5026288117770767,
"grad_norm": 0.690259266155438,
"learning_rate": 1.6292160838721316e-06,
"loss": 0.6946,
"step": 2380
},
{
"epoch": 2.5078864353312302,
"grad_norm": 0.6934285014568452,
"learning_rate": 1.5958959609523905e-06,
"loss": 0.719,
"step": 2385
},
{
"epoch": 2.5131440588853837,
"grad_norm": 0.706595235609839,
"learning_rate": 1.562890524510583e-06,
"loss": 0.699,
"step": 2390
},
{
"epoch": 2.518401682439537,
"grad_norm": 0.7031045404384867,
"learning_rate": 1.530201010414859e-06,
"loss": 0.7019,
"step": 2395
},
{
"epoch": 2.5236593059936907,
"grad_norm": 0.6611225731580428,
"learning_rate": 1.4978286427038602e-06,
"loss": 0.7063,
"step": 2400
},
{
"epoch": 2.5236593059936907,
"eval_loss": 1.0776675939559937,
"eval_runtime": 549.0786,
"eval_samples_per_second": 24.508,
"eval_steps_per_second": 0.193,
"step": 2400
},
{
"epoch": 2.5289169295478446,
"grad_norm": 0.6876289627741422,
"learning_rate": 1.4657746335408695e-06,
"loss": 0.7068,
"step": 2405
},
{
"epoch": 2.534174553101998,
"grad_norm": 0.680233555417602,
"learning_rate": 1.4340401831684413e-06,
"loss": 0.6807,
"step": 2410
},
{
"epoch": 2.5394321766561516,
"grad_norm": 0.6654932547762412,
"learning_rate": 1.4026264798634359e-06,
"loss": 0.7179,
"step": 2415
},
{
"epoch": 2.544689800210305,
"grad_norm": 0.6945732690751362,
"learning_rate": 1.371534699892547e-06,
"loss": 0.7086,
"step": 2420
},
{
"epoch": 2.5499474237644586,
"grad_norm": 0.6862420273962914,
"learning_rate": 1.3407660074682472e-06,
"loss": 0.7028,
"step": 2425
},
{
"epoch": 2.555205047318612,
"grad_norm": 0.651460129300283,
"learning_rate": 1.3103215547051962e-06,
"loss": 0.6975,
"step": 2430
},
{
"epoch": 2.5604626708727656,
"grad_norm": 0.6970590762896678,
"learning_rate": 1.2802024815770942e-06,
"loss": 0.7115,
"step": 2435
},
{
"epoch": 2.565720294426919,
"grad_norm": 0.6744240212503375,
"learning_rate": 1.250409915874007e-06,
"loss": 0.7057,
"step": 2440
},
{
"epoch": 2.5709779179810726,
"grad_norm": 0.6699733139877856,
"learning_rate": 1.220944973160133e-06,
"loss": 0.6884,
"step": 2445
},
{
"epoch": 2.576235541535226,
"grad_norm": 0.6915305368046275,
"learning_rate": 1.1918087567320257e-06,
"loss": 0.7026,
"step": 2450
},
{
"epoch": 2.5814931650893795,
"grad_norm": 0.6755768658668228,
"learning_rate": 1.1630023575772908e-06,
"loss": 0.6966,
"step": 2455
},
{
"epoch": 2.586750788643533,
"grad_norm": 0.705779731938613,
"learning_rate": 1.1345268543337283e-06,
"loss": 0.6988,
"step": 2460
},
{
"epoch": 2.5920084121976865,
"grad_norm": 0.7111985726538933,
"learning_rate": 1.1063833132489477e-06,
"loss": 0.696,
"step": 2465
},
{
"epoch": 2.59726603575184,
"grad_norm": 0.6539744158999056,
"learning_rate": 1.0785727881404329e-06,
"loss": 0.6961,
"step": 2470
},
{
"epoch": 2.6025236593059935,
"grad_norm": 0.6848492944946433,
"learning_rate": 1.051096320356103e-06,
"loss": 0.7046,
"step": 2475
},
{
"epoch": 2.607781282860147,
"grad_norm": 0.7032823101149783,
"learning_rate": 1.0239549387352954e-06,
"loss": 0.7201,
"step": 2480
},
{
"epoch": 2.6130389064143005,
"grad_norm": 0.6762173164818084,
"learning_rate": 9.97149659570259e-07,
"loss": 0.7116,
"step": 2485
},
{
"epoch": 2.6182965299684544,
"grad_norm": 0.6806035208648271,
"learning_rate": 9.706814865680957e-07,
"loss": 0.7045,
"step": 2490
},
{
"epoch": 2.623554153522608,
"grad_norm": 0.6776503088053696,
"learning_rate": 9.445514108131693e-07,
"loss": 0.6888,
"step": 2495
},
{
"epoch": 2.6288117770767614,
"grad_norm": 0.6836339268439919,
"learning_rate": 9.187604107300107e-07,
"loss": 0.6964,
"step": 2500
},
{
"epoch": 2.6288117770767614,
"eval_loss": 1.0754879713058472,
"eval_runtime": 544.4972,
"eval_samples_per_second": 24.715,
"eval_steps_per_second": 0.195,
"step": 2500
},
{
"epoch": 2.634069400630915,
"grad_norm": 0.6761130619047382,
"learning_rate": 8.933094520466634e-07,
"loss": 0.7058,
"step": 2505
},
{
"epoch": 2.6393270241850684,
"grad_norm": 0.6672694366752451,
"learning_rate": 8.681994877585365e-07,
"loss": 0.7054,
"step": 2510
},
{
"epoch": 2.644584647739222,
"grad_norm": 0.7017173692899314,
"learning_rate": 8.434314580927105e-07,
"loss": 0.7003,
"step": 2515
},
{
"epoch": 2.6498422712933754,
"grad_norm": 0.6828167224204641,
"learning_rate": 8.19006290472737e-07,
"loss": 0.7134,
"step": 2520
},
{
"epoch": 2.655099894847529,
"grad_norm": 0.6887161892823586,
"learning_rate": 7.949248994839131e-07,
"loss": 0.7107,
"step": 2525
},
{
"epoch": 2.6603575184016823,
"grad_norm": 0.6858305599284509,
"learning_rate": 7.711881868390292e-07,
"loss": 0.7185,
"step": 2530
},
{
"epoch": 2.665615141955836,
"grad_norm": 0.6919951634850794,
"learning_rate": 7.477970413446089e-07,
"loss": 0.7038,
"step": 2535
},
{
"epoch": 2.6708727655099898,
"grad_norm": 0.7059421711173827,
"learning_rate": 7.247523388676292e-07,
"loss": 0.6934,
"step": 2540
},
{
"epoch": 2.6761303890641432,
"grad_norm": 0.697370543891664,
"learning_rate": 7.020549423027223e-07,
"loss": 0.6874,
"step": 2545
},
{
"epoch": 2.6813880126182967,
"grad_norm": 0.6851210955122395,
"learning_rate": 6.797057015398634e-07,
"loss": 0.7091,
"step": 2550
},
{
"epoch": 2.6866456361724502,
"grad_norm": 0.6810814971271851,
"learning_rate": 6.577054534325511e-07,
"loss": 0.6935,
"step": 2555
},
{
"epoch": 2.6919032597266037,
"grad_norm": 0.6676833725760639,
"learning_rate": 6.360550217664685e-07,
"loss": 0.7088,
"step": 2560
},
{
"epoch": 2.697160883280757,
"grad_norm": 0.7148977742599517,
"learning_rate": 6.147552172286375e-07,
"loss": 0.6987,
"step": 2565
},
{
"epoch": 2.7024185068349107,
"grad_norm": 0.6475197510665502,
"learning_rate": 5.938068373770667e-07,
"loss": 0.6864,
"step": 2570
},
{
"epoch": 2.707676130389064,
"grad_norm": 0.685110898697612,
"learning_rate": 5.732106666108827e-07,
"loss": 0.6937,
"step": 2575
},
{
"epoch": 2.7129337539432177,
"grad_norm": 0.6850644373487722,
"learning_rate": 5.529674761409643e-07,
"loss": 0.701,
"step": 2580
},
{
"epoch": 2.718191377497371,
"grad_norm": 0.6619622645326332,
"learning_rate": 5.330780239610534e-07,
"loss": 0.705,
"step": 2585
},
{
"epoch": 2.7234490010515247,
"grad_norm": 0.6779887305496379,
"learning_rate": 5.135430548193909e-07,
"loss": 0.6912,
"step": 2590
},
{
"epoch": 2.728706624605678,
"grad_norm": 0.6695357873979283,
"learning_rate": 4.943633001908111e-07,
"loss": 0.7007,
"step": 2595
},
{
"epoch": 2.7339642481598316,
"grad_norm": 0.6851094475471325,
"learning_rate": 4.7553947824936496e-07,
"loss": 0.7121,
"step": 2600
},
{
"epoch": 2.7339642481598316,
"eval_loss": 1.0742169618606567,
"eval_runtime": 543.8651,
"eval_samples_per_second": 24.743,
"eval_steps_per_second": 0.195,
"step": 2600
},
{
"epoch": 2.739221871713985,
"grad_norm": 0.6798881286754066,
"learning_rate": 4.5707229384142184e-07,
"loss": 0.7043,
"step": 2605
},
{
"epoch": 2.7444794952681386,
"grad_norm": 0.6627199879579073,
"learning_rate": 4.3896243845927943e-07,
"loss": 0.7083,
"step": 2610
},
{
"epoch": 2.749737118822292,
"grad_norm": 0.6911107462785068,
"learning_rate": 4.21210590215273e-07,
"loss": 0.7062,
"step": 2615
},
{
"epoch": 2.7549947423764456,
"grad_norm": 0.6538298159253733,
"learning_rate": 4.0381741381638085e-07,
"loss": 0.6919,
"step": 2620
},
{
"epoch": 2.7602523659305995,
"grad_norm": 0.6913261772512153,
"learning_rate": 3.8678356053933666e-07,
"loss": 0.6899,
"step": 2625
},
{
"epoch": 2.765509989484753,
"grad_norm": 0.6731586319154937,
"learning_rate": 3.7010966820623996e-07,
"loss": 0.7115,
"step": 2630
},
{
"epoch": 2.7707676130389065,
"grad_norm": 0.6739111157184594,
"learning_rate": 3.5379636116067764e-07,
"loss": 0.6938,
"step": 2635
},
{
"epoch": 2.77602523659306,
"grad_norm": 0.6775894239204638,
"learning_rate": 3.378442502443424e-07,
"loss": 0.7018,
"step": 2640
},
{
"epoch": 2.7812828601472135,
"grad_norm": 0.6630535974515509,
"learning_rate": 3.222539327741592e-07,
"loss": 0.7108,
"step": 2645
},
{
"epoch": 2.786540483701367,
"grad_norm": 0.6476313251006354,
"learning_rate": 3.070259925199248e-07,
"loss": 0.7064,
"step": 2650
},
{
"epoch": 2.7917981072555205,
"grad_norm": 0.6793550821713811,
"learning_rate": 2.921609996824437e-07,
"loss": 0.686,
"step": 2655
},
{
"epoch": 2.797055730809674,
"grad_norm": 0.6950659181503308,
"learning_rate": 2.7765951087218134e-07,
"loss": 0.6922,
"step": 2660
},
{
"epoch": 2.8023133543638274,
"grad_norm": 0.6759277309855073,
"learning_rate": 2.6352206908841325e-07,
"loss": 0.7123,
"step": 2665
},
{
"epoch": 2.807570977917981,
"grad_norm": 0.6871290912583685,
"learning_rate": 2.497492036989058e-07,
"loss": 0.7071,
"step": 2670
},
{
"epoch": 2.812828601472135,
"grad_norm": 0.6672178424750838,
"learning_rate": 2.3634143042008396e-07,
"loss": 0.7055,
"step": 2675
},
{
"epoch": 2.8180862250262884,
"grad_norm": 0.6871427641549465,
"learning_rate": 2.2329925129772613e-07,
"loss": 0.7162,
"step": 2680
},
{
"epoch": 2.823343848580442,
"grad_norm": 0.6996639531083144,
"learning_rate": 2.1062315468816318e-07,
"loss": 0.7116,
"step": 2685
},
{
"epoch": 2.8286014721345953,
"grad_norm": 0.7057461914462779,
"learning_rate": 1.9831361523999227e-07,
"loss": 0.6978,
"step": 2690
},
{
"epoch": 2.833859095688749,
"grad_norm": 0.6606180852855636,
"learning_rate": 1.8637109387630637e-07,
"loss": 0.6872,
"step": 2695
},
{
"epoch": 2.8391167192429023,
"grad_norm": 0.6603518954437334,
"learning_rate": 1.7479603777742937e-07,
"loss": 0.7049,
"step": 2700
},
{
"epoch": 2.8391167192429023,
"eval_loss": 1.074755311012268,
"eval_runtime": 548.3041,
"eval_samples_per_second": 24.543,
"eval_steps_per_second": 0.193,
"step": 2700
},
{
"epoch": 2.844374342797056,
"grad_norm": 0.7039186631952389,
"learning_rate": 1.6358888036418053e-07,
"loss": 0.7076,
"step": 2705
},
{
"epoch": 2.8496319663512093,
"grad_norm": 0.6613941861667958,
"learning_rate": 1.5275004128163407e-07,
"loss": 0.7022,
"step": 2710
},
{
"epoch": 2.854889589905363,
"grad_norm": 0.6784432805911156,
"learning_rate": 1.422799263834196e-07,
"loss": 0.7018,
"step": 2715
},
{
"epoch": 2.8601472134595163,
"grad_norm": 0.662880920108081,
"learning_rate": 1.3217892771651087e-07,
"loss": 0.7039,
"step": 2720
},
{
"epoch": 2.8654048370136698,
"grad_norm": 0.674177068306156,
"learning_rate": 1.224474235065587e-07,
"loss": 0.6948,
"step": 2725
},
{
"epoch": 2.8706624605678233,
"grad_norm": 0.6576941034750949,
"learning_rate": 1.1308577814371669e-07,
"loss": 0.6959,
"step": 2730
},
{
"epoch": 2.8759200841219767,
"grad_norm": 0.6877738227702634,
"learning_rate": 1.040943421690055e-07,
"loss": 0.7016,
"step": 2735
},
{
"epoch": 2.8811777076761302,
"grad_norm": 0.6570796449184478,
"learning_rate": 9.547345226118666e-08,
"loss": 0.7008,
"step": 2740
},
{
"epoch": 2.8864353312302837,
"grad_norm": 0.6556870027002477,
"learning_rate": 8.722343122414823e-08,
"loss": 0.7114,
"step": 2745
},
{
"epoch": 2.891692954784437,
"grad_norm": 0.6525356309193387,
"learning_rate": 7.93445879748267e-08,
"loss": 0.705,
"step": 2750
},
{
"epoch": 2.8969505783385907,
"grad_norm": 0.6979809421888648,
"learning_rate": 7.183721753163508e-08,
"loss": 0.705,
"step": 2755
},
{
"epoch": 2.9022082018927446,
"grad_norm": 0.6680505376816218,
"learning_rate": 6.470160100341516e-08,
"loss": 0.7028,
"step": 2760
},
{
"epoch": 2.907465825446898,
"grad_norm": 0.6754425700333265,
"learning_rate": 5.793800557891471e-08,
"loss": 0.6969,
"step": 2765
},
{
"epoch": 2.9127234490010516,
"grad_norm": 0.6770770823855421,
"learning_rate": 5.154668451678224e-08,
"loss": 0.709,
"step": 2770
},
{
"epoch": 2.917981072555205,
"grad_norm": 0.6880130710385723,
"learning_rate": 4.552787713608231e-08,
"loss": 0.69,
"step": 2775
},
{
"epoch": 2.9232386961093586,
"grad_norm": 0.6625414510833385,
"learning_rate": 3.988180880733161e-08,
"loss": 0.6962,
"step": 2780
},
{
"epoch": 2.928496319663512,
"grad_norm": 0.6643252155800653,
"learning_rate": 3.460869094407127e-08,
"loss": 0.7037,
"step": 2785
},
{
"epoch": 2.9337539432176656,
"grad_norm": 0.6897645676504198,
"learning_rate": 2.9708720994934272e-08,
"loss": 0.6896,
"step": 2790
},
{
"epoch": 2.939011566771819,
"grad_norm": 0.7113672933129457,
"learning_rate": 2.5182082436266963e-08,
"loss": 0.7165,
"step": 2795
},
{
"epoch": 2.9442691903259726,
"grad_norm": 0.6781710312687059,
"learning_rate": 2.1028944765251193e-08,
"loss": 0.7024,
"step": 2800
},
{
"epoch": 2.9442691903259726,
"eval_loss": 1.074735403060913,
"eval_runtime": 544.9092,
"eval_samples_per_second": 24.696,
"eval_steps_per_second": 0.195,
"step": 2800
},
{
"epoch": 2.949526813880126,
"grad_norm": 0.7502190973801118,
"learning_rate": 1.724946349355605e-08,
"loss": 0.6952,
"step": 2805
},
{
"epoch": 2.9547844374342795,
"grad_norm": 0.6554060805074167,
"learning_rate": 1.3843780141521435e-08,
"loss": 0.7095,
"step": 2810
},
{
"epoch": 2.9600420609884335,
"grad_norm": 0.6884790361695539,
"learning_rate": 1.081202223285449e-08,
"loss": 0.7096,
"step": 2815
},
{
"epoch": 2.965299684542587,
"grad_norm": 0.6687316519292371,
"learning_rate": 8.154303289854559e-09,
"loss": 0.7071,
"step": 2820
},
{
"epoch": 2.9705573080967405,
"grad_norm": 0.6719077380861403,
"learning_rate": 5.870722829164344e-09,
"loss": 0.6954,
"step": 2825
},
{
"epoch": 2.975814931650894,
"grad_norm": 0.6445219670997994,
"learning_rate": 3.9613663580406745e-09,
"loss": 0.6844,
"step": 2830
},
{
"epoch": 2.9810725552050474,
"grad_norm": 0.6702818163839258,
"learning_rate": 2.426305371155957e-09,
"loss": 0.6924,
"step": 2835
},
{
"epoch": 2.986330178759201,
"grad_norm": 0.6546313538456479,
"learning_rate": 1.265597347920311e-09,
"loss": 0.7013,
"step": 2840
},
{
"epoch": 2.9915878023133544,
"grad_norm": 0.6790610179426215,
"learning_rate": 4.792857503266301e-10,
"loss": 0.7013,
"step": 2845
},
{
"epoch": 2.996845425867508,
"grad_norm": 0.6610872038208641,
"learning_rate": 6.740002132743506e-11,
"loss": 0.708,
"step": 2850
},
{
"epoch": 3.0,
"step": 2853,
"total_flos": 1194720315310080.0,
"train_loss": 0.8973418972260736,
"train_runtime": 76133.7056,
"train_samples_per_second": 4.793,
"train_steps_per_second": 0.037
}
],
"logging_steps": 5,
"max_steps": 2853,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1194720315310080.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}