{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 15.0,
"eval_steps": 500,
"global_step": 4785,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003134796238244514,
"grad_norm": 1200.0,
"learning_rate": 4.175365344467641e-07,
"loss": 56.0196,
"step": 1
},
{
"epoch": 0.01567398119122257,
"grad_norm": 1104.0,
"learning_rate": 2.0876826722338207e-06,
"loss": 57.1065,
"step": 5
},
{
"epoch": 0.03134796238244514,
"grad_norm": 780.0,
"learning_rate": 4.175365344467641e-06,
"loss": 53.982,
"step": 10
},
{
"epoch": 0.047021943573667714,
"grad_norm": 378.0,
"learning_rate": 6.2630480167014616e-06,
"loss": 39.7621,
"step": 15
},
{
"epoch": 0.06269592476489028,
"grad_norm": 119.0,
"learning_rate": 8.350730688935283e-06,
"loss": 31.2881,
"step": 20
},
{
"epoch": 0.07836990595611286,
"grad_norm": 59.0,
"learning_rate": 1.0438413361169103e-05,
"loss": 29.3676,
"step": 25
},
{
"epoch": 0.09404388714733543,
"grad_norm": 27.75,
"learning_rate": 1.2526096033402923e-05,
"loss": 27.221,
"step": 30
},
{
"epoch": 0.109717868338558,
"grad_norm": 17.0,
"learning_rate": 1.4613778705636743e-05,
"loss": 25.3228,
"step": 35
},
{
"epoch": 0.12539184952978055,
"grad_norm": 11.375,
"learning_rate": 1.6701461377870565e-05,
"loss": 24.7964,
"step": 40
},
{
"epoch": 0.14106583072100312,
"grad_norm": 14.6875,
"learning_rate": 1.8789144050104384e-05,
"loss": 23.6672,
"step": 45
},
{
"epoch": 0.15673981191222572,
"grad_norm": 23.5,
"learning_rate": 2.0876826722338206e-05,
"loss": 22.938,
"step": 50
},
{
"epoch": 0.1724137931034483,
"grad_norm": 46.75,
"learning_rate": 2.2964509394572024e-05,
"loss": 20.5447,
"step": 55
},
{
"epoch": 0.18808777429467086,
"grad_norm": 102.5,
"learning_rate": 2.5052192066805846e-05,
"loss": 15.6089,
"step": 60
},
{
"epoch": 0.20376175548589343,
"grad_norm": 21.875,
"learning_rate": 2.7139874739039668e-05,
"loss": 6.57,
"step": 65
},
{
"epoch": 0.219435736677116,
"grad_norm": 7.15625,
"learning_rate": 2.9227557411273487e-05,
"loss": 2.9537,
"step": 70
},
{
"epoch": 0.23510971786833856,
"grad_norm": 3.953125,
"learning_rate": 3.131524008350731e-05,
"loss": 2.4063,
"step": 75
},
{
"epoch": 0.2507836990595611,
"grad_norm": 3.234375,
"learning_rate": 3.340292275574113e-05,
"loss": 2.1568,
"step": 80
},
{
"epoch": 0.2664576802507837,
"grad_norm": 4.75,
"learning_rate": 3.5490605427974946e-05,
"loss": 1.9549,
"step": 85
},
{
"epoch": 0.28213166144200624,
"grad_norm": 5.90625,
"learning_rate": 3.757828810020877e-05,
"loss": 1.8178,
"step": 90
},
{
"epoch": 0.29780564263322884,
"grad_norm": 16.625,
"learning_rate": 3.966597077244259e-05,
"loss": 1.7357,
"step": 95
},
{
"epoch": 0.31347962382445144,
"grad_norm": 10.625,
"learning_rate": 4.175365344467641e-05,
"loss": 1.6526,
"step": 100
},
{
"epoch": 0.329153605015674,
"grad_norm": 16.875,
"learning_rate": 4.3841336116910233e-05,
"loss": 1.6119,
"step": 105
},
{
"epoch": 0.3448275862068966,
"grad_norm": 15.9375,
"learning_rate": 4.592901878914405e-05,
"loss": 1.5619,
"step": 110
},
{
"epoch": 0.3605015673981191,
"grad_norm": 6.25,
"learning_rate": 4.801670146137787e-05,
"loss": 1.553,
"step": 115
},
{
"epoch": 0.3761755485893417,
"grad_norm": 16.875,
"learning_rate": 5.010438413361169e-05,
"loss": 1.5499,
"step": 120
},
{
"epoch": 0.39184952978056425,
"grad_norm": 18.875,
"learning_rate": 5.219206680584552e-05,
"loss": 1.5358,
"step": 125
},
{
"epoch": 0.40752351097178685,
"grad_norm": 10.5,
"learning_rate": 5.4279749478079336e-05,
"loss": 1.5022,
"step": 130
},
{
"epoch": 0.4231974921630094,
"grad_norm": 8.6875,
"learning_rate": 5.636743215031316e-05,
"loss": 1.4674,
"step": 135
},
{
"epoch": 0.438871473354232,
"grad_norm": 3.25,
"learning_rate": 5.8455114822546973e-05,
"loss": 1.4238,
"step": 140
},
{
"epoch": 0.45454545454545453,
"grad_norm": 9.0,
"learning_rate": 6.05427974947808e-05,
"loss": 1.3893,
"step": 145
},
{
"epoch": 0.4702194357366771,
"grad_norm": 25.875,
"learning_rate": 6.263048016701462e-05,
"loss": 1.3631,
"step": 150
},
{
"epoch": 0.48589341692789967,
"grad_norm": 5.21875,
"learning_rate": 6.471816283924845e-05,
"loss": 1.3578,
"step": 155
},
{
"epoch": 0.5015673981191222,
"grad_norm": 4.96875,
"learning_rate": 6.680584551148226e-05,
"loss": 1.2742,
"step": 160
},
{
"epoch": 0.5172413793103449,
"grad_norm": 7.21875,
"learning_rate": 6.889352818371608e-05,
"loss": 1.2685,
"step": 165
},
{
"epoch": 0.5329153605015674,
"grad_norm": 8.5,
"learning_rate": 7.098121085594989e-05,
"loss": 1.2819,
"step": 170
},
{
"epoch": 0.54858934169279,
"grad_norm": 9.1875,
"learning_rate": 7.306889352818372e-05,
"loss": 1.284,
"step": 175
},
{
"epoch": 0.5642633228840125,
"grad_norm": 6.40625,
"learning_rate": 7.515657620041754e-05,
"loss": 1.2541,
"step": 180
},
{
"epoch": 0.5799373040752351,
"grad_norm": 11.5625,
"learning_rate": 7.724425887265136e-05,
"loss": 1.25,
"step": 185
},
{
"epoch": 0.5956112852664577,
"grad_norm": 11.875,
"learning_rate": 7.933194154488518e-05,
"loss": 1.2324,
"step": 190
},
{
"epoch": 0.6112852664576802,
"grad_norm": 4.6875,
"learning_rate": 8.141962421711901e-05,
"loss": 1.2093,
"step": 195
},
{
"epoch": 0.6269592476489029,
"grad_norm": 10.0,
"learning_rate": 8.350730688935282e-05,
"loss": 1.2225,
"step": 200
},
{
"epoch": 0.6426332288401254,
"grad_norm": 2.03125,
"learning_rate": 8.559498956158665e-05,
"loss": 1.1948,
"step": 205
},
{
"epoch": 0.658307210031348,
"grad_norm": 6.125,
"learning_rate": 8.768267223382047e-05,
"loss": 1.1823,
"step": 210
},
{
"epoch": 0.6739811912225705,
"grad_norm": 4.34375,
"learning_rate": 8.977035490605428e-05,
"loss": 1.1812,
"step": 215
},
{
"epoch": 0.6896551724137931,
"grad_norm": 5.3125,
"learning_rate": 9.18580375782881e-05,
"loss": 1.1833,
"step": 220
},
{
"epoch": 0.7053291536050157,
"grad_norm": 9.875,
"learning_rate": 9.394572025052193e-05,
"loss": 1.1606,
"step": 225
},
{
"epoch": 0.7210031347962382,
"grad_norm": 2.609375,
"learning_rate": 9.603340292275574e-05,
"loss": 1.1662,
"step": 230
},
{
"epoch": 0.7366771159874608,
"grad_norm": 2.96875,
"learning_rate": 9.812108559498957e-05,
"loss": 1.1573,
"step": 235
},
{
"epoch": 0.7523510971786834,
"grad_norm": 2.0,
"learning_rate": 0.00010020876826722338,
"loss": 1.1386,
"step": 240
},
{
"epoch": 0.768025078369906,
"grad_norm": 5.3125,
"learning_rate": 0.00010229645093945721,
"loss": 1.1822,
"step": 245
},
{
"epoch": 0.7836990595611285,
"grad_norm": 8.125,
"learning_rate": 0.00010438413361169104,
"loss": 1.1415,
"step": 250
},
{
"epoch": 0.799373040752351,
"grad_norm": 4.65625,
"learning_rate": 0.00010647181628392484,
"loss": 1.1534,
"step": 255
},
{
"epoch": 0.8150470219435737,
"grad_norm": 1.0703125,
"learning_rate": 0.00010855949895615867,
"loss": 1.1338,
"step": 260
},
{
"epoch": 0.8307210031347962,
"grad_norm": 3.03125,
"learning_rate": 0.00011064718162839249,
"loss": 1.1435,
"step": 265
},
{
"epoch": 0.8463949843260188,
"grad_norm": 2.90625,
"learning_rate": 0.00011273486430062632,
"loss": 1.1213,
"step": 270
},
{
"epoch": 0.8620689655172413,
"grad_norm": 3.875,
"learning_rate": 0.00011482254697286012,
"loss": 1.123,
"step": 275
},
{
"epoch": 0.877742946708464,
"grad_norm": 9.8125,
"learning_rate": 0.00011691022964509395,
"loss": 1.154,
"step": 280
},
{
"epoch": 0.8934169278996865,
"grad_norm": 3.15625,
"learning_rate": 0.00011899791231732778,
"loss": 1.1346,
"step": 285
},
{
"epoch": 0.9090909090909091,
"grad_norm": 15.75,
"learning_rate": 0.0001210855949895616,
"loss": 1.0998,
"step": 290
},
{
"epoch": 0.9247648902821317,
"grad_norm": 4.375,
"learning_rate": 0.0001231732776617954,
"loss": 1.0742,
"step": 295
},
{
"epoch": 0.9404388714733543,
"grad_norm": 3.015625,
"learning_rate": 0.00012526096033402923,
"loss": 1.0956,
"step": 300
},
{
"epoch": 0.9561128526645768,
"grad_norm": 3.796875,
"learning_rate": 0.00012734864300626306,
"loss": 1.0698,
"step": 305
},
{
"epoch": 0.9717868338557993,
"grad_norm": 2.75,
"learning_rate": 0.0001294363256784969,
"loss": 1.0526,
"step": 310
},
{
"epoch": 0.987460815047022,
"grad_norm": 2.078125,
"learning_rate": 0.0001315240083507307,
"loss": 1.0589,
"step": 315
},
{
"epoch": 1.0,
"eval_loss": 1.7604742050170898,
"eval_runtime": 0.8071,
"eval_samples_per_second": 2.478,
"eval_steps_per_second": 1.239,
"step": 319
},
{
"epoch": 1.0031347962382444,
"grad_norm": 2.359375,
"learning_rate": 0.00013361169102296452,
"loss": 1.06,
"step": 320
},
{
"epoch": 1.0188087774294672,
"grad_norm": 34.0,
"learning_rate": 0.00013569937369519835,
"loss": 1.0232,
"step": 325
},
{
"epoch": 1.0344827586206897,
"grad_norm": 8.6875,
"learning_rate": 0.00013778705636743215,
"loss": 1.0624,
"step": 330
},
{
"epoch": 1.0501567398119123,
"grad_norm": 10.8125,
"learning_rate": 0.00013987473903966598,
"loss": 1.0581,
"step": 335
},
{
"epoch": 1.0658307210031348,
"grad_norm": 3.109375,
"learning_rate": 0.00014196242171189978,
"loss": 1.0274,
"step": 340
},
{
"epoch": 1.0815047021943573,
"grad_norm": 3.828125,
"learning_rate": 0.0001440501043841336,
"loss": 1.0301,
"step": 345
},
{
"epoch": 1.09717868338558,
"grad_norm": 10.25,
"learning_rate": 0.00014613778705636744,
"loss": 1.0221,
"step": 350
},
{
"epoch": 1.1128526645768024,
"grad_norm": 4.0625,
"learning_rate": 0.00014822546972860124,
"loss": 1.0185,
"step": 355
},
{
"epoch": 1.1285266457680252,
"grad_norm": 6.65625,
"learning_rate": 0.00015031315240083507,
"loss": 1.0146,
"step": 360
},
{
"epoch": 1.1442006269592477,
"grad_norm": 4.21875,
"learning_rate": 0.0001524008350730689,
"loss": 1.0134,
"step": 365
},
{
"epoch": 1.1598746081504703,
"grad_norm": 5.375,
"learning_rate": 0.00015448851774530273,
"loss": 1.0372,
"step": 370
},
{
"epoch": 1.1755485893416928,
"grad_norm": 1.046875,
"learning_rate": 0.00015657620041753653,
"loss": 1.0096,
"step": 375
},
{
"epoch": 1.1912225705329154,
"grad_norm": 3.734375,
"learning_rate": 0.00015866388308977036,
"loss": 1.0143,
"step": 380
},
{
"epoch": 1.206896551724138,
"grad_norm": 3.265625,
"learning_rate": 0.0001607515657620042,
"loss": 1.0159,
"step": 385
},
{
"epoch": 1.2225705329153604,
"grad_norm": 1.875,
"learning_rate": 0.00016283924843423802,
"loss": 1.0069,
"step": 390
},
{
"epoch": 1.238244514106583,
"grad_norm": 1.9609375,
"learning_rate": 0.00016492693110647182,
"loss": 1.0019,
"step": 395
},
{
"epoch": 1.2539184952978055,
"grad_norm": 5.0,
"learning_rate": 0.00016701461377870565,
"loss": 0.9979,
"step": 400
},
{
"epoch": 1.2695924764890283,
"grad_norm": 0.85546875,
"learning_rate": 0.00016910229645093947,
"loss": 1.0066,
"step": 405
},
{
"epoch": 1.2852664576802508,
"grad_norm": 3.25,
"learning_rate": 0.0001711899791231733,
"loss": 1.0002,
"step": 410
},
{
"epoch": 1.3009404388714734,
"grad_norm": 0.92578125,
"learning_rate": 0.0001732776617954071,
"loss": 1.0036,
"step": 415
},
{
"epoch": 1.316614420062696,
"grad_norm": 3.125,
"learning_rate": 0.00017536534446764093,
"loss": 1.003,
"step": 420
},
{
"epoch": 1.3322884012539185,
"grad_norm": 3.390625,
"learning_rate": 0.00017745302713987476,
"loss": 1.0207,
"step": 425
},
{
"epoch": 1.347962382445141,
"grad_norm": 9.3125,
"learning_rate": 0.00017954070981210856,
"loss": 1.0097,
"step": 430
},
{
"epoch": 1.3636363636363638,
"grad_norm": 2.140625,
"learning_rate": 0.0001816283924843424,
"loss": 0.9984,
"step": 435
},
{
"epoch": 1.3793103448275863,
"grad_norm": 0.93359375,
"learning_rate": 0.0001837160751565762,
"loss": 1.019,
"step": 440
},
{
"epoch": 1.3949843260188088,
"grad_norm": 1.4140625,
"learning_rate": 0.00018580375782881002,
"loss": 1.0093,
"step": 445
},
{
"epoch": 1.4106583072100314,
"grad_norm": 2.90625,
"learning_rate": 0.00018789144050104385,
"loss": 0.9906,
"step": 450
},
{
"epoch": 1.426332288401254,
"grad_norm": 1.8203125,
"learning_rate": 0.00018997912317327765,
"loss": 0.9956,
"step": 455
},
{
"epoch": 1.4420062695924765,
"grad_norm": 4.21875,
"learning_rate": 0.00019206680584551148,
"loss": 1.0026,
"step": 460
},
{
"epoch": 1.457680250783699,
"grad_norm": 2.46875,
"learning_rate": 0.0001941544885177453,
"loss": 0.9779,
"step": 465
},
{
"epoch": 1.4733542319749215,
"grad_norm": 1.0859375,
"learning_rate": 0.00019624217118997914,
"loss": 0.9981,
"step": 470
},
{
"epoch": 1.489028213166144,
"grad_norm": 2.34375,
"learning_rate": 0.00019832985386221294,
"loss": 1.0087,
"step": 475
},
{
"epoch": 1.5047021943573666,
"grad_norm": 5.71875,
"learning_rate": 0.0001999999733852936,
"loss": 1.0154,
"step": 480
},
{
"epoch": 1.5203761755485894,
"grad_norm": 2.125,
"learning_rate": 0.00019999904187205744,
"loss": 1.0196,
"step": 485
},
{
"epoch": 1.536050156739812,
"grad_norm": 5.1875,
"learning_rate": 0.00019999677963766844,
"loss": 1.0196,
"step": 490
},
{
"epoch": 1.5517241379310345,
"grad_norm": 3.015625,
"learning_rate": 0.00019999318671223102,
"loss": 1.003,
"step": 495
},
{
"epoch": 1.567398119122257,
"grad_norm": 2.453125,
"learning_rate": 0.0001999882631435574,
"loss": 1.0022,
"step": 500
},
{
"epoch": 1.5830721003134798,
"grad_norm": 5.78125,
"learning_rate": 0.00019998200899716724,
"loss": 0.9932,
"step": 505
},
{
"epoch": 1.5987460815047023,
"grad_norm": 3.625,
"learning_rate": 0.00019997442435628653,
"loss": 1.0083,
"step": 510
},
{
"epoch": 1.6144200626959249,
"grad_norm": 6.375,
"learning_rate": 0.00019996550932184666,
"loss": 1.0153,
"step": 515
},
{
"epoch": 1.6300940438871474,
"grad_norm": 1.59375,
"learning_rate": 0.00019995526401248302,
"loss": 1.0158,
"step": 520
},
{
"epoch": 1.64576802507837,
"grad_norm": 4.84375,
"learning_rate": 0.00019994368856453341,
"loss": 1.0308,
"step": 525
},
{
"epoch": 1.6614420062695925,
"grad_norm": 2.796875,
"learning_rate": 0.00019993078313203632,
"loss": 0.9834,
"step": 530
},
{
"epoch": 1.677115987460815,
"grad_norm": 1.28125,
"learning_rate": 0.0001999165478867286,
"loss": 0.9937,
"step": 535
},
{
"epoch": 1.6927899686520376,
"grad_norm": 1.9921875,
"learning_rate": 0.00019990098301804357,
"loss": 0.9884,
"step": 540
},
{
"epoch": 1.70846394984326,
"grad_norm": 1.671875,
"learning_rate": 0.00019988408873310815,
"loss": 0.9846,
"step": 545
},
{
"epoch": 1.7241379310344827,
"grad_norm": 0.9921875,
"learning_rate": 0.00019986586525674036,
"loss": 0.9711,
"step": 550
},
{
"epoch": 1.7398119122257052,
"grad_norm": 1.921875,
"learning_rate": 0.00019984631283144616,
"loss": 0.9789,
"step": 555
},
{
"epoch": 1.7554858934169277,
"grad_norm": 1.953125,
"learning_rate": 0.0001998254317174163,
"loss": 0.984,
"step": 560
},
{
"epoch": 1.7711598746081505,
"grad_norm": 3.046875,
"learning_rate": 0.00019980322219252284,
"loss": 0.9609,
"step": 565
},
{
"epoch": 1.786833855799373,
"grad_norm": 1.7578125,
"learning_rate": 0.0001997796845523155,
"loss": 0.9997,
"step": 570
},
{
"epoch": 1.8025078369905956,
"grad_norm": 2.9375,
"learning_rate": 0.00019975481911001762,
"loss": 0.9772,
"step": 575
},
{
"epoch": 1.8181818181818183,
"grad_norm": 1.5234375,
"learning_rate": 0.00019972862619652203,
"loss": 0.9939,
"step": 580
},
{
"epoch": 1.8338557993730409,
"grad_norm": 1.6953125,
"learning_rate": 0.00019970110616038673,
"loss": 0.9794,
"step": 585
},
{
"epoch": 1.8495297805642634,
"grad_norm": 3.109375,
"learning_rate": 0.0001996722593678302,
"loss": 0.9702,
"step": 590
},
{
"epoch": 1.865203761755486,
"grad_norm": 1.2578125,
"learning_rate": 0.00019964208620272647,
"loss": 0.9667,
"step": 595
},
{
"epoch": 1.8808777429467085,
"grad_norm": 0.8203125,
"learning_rate": 0.00019961058706660005,
"loss": 0.9632,
"step": 600
},
{
"epoch": 1.896551724137931,
"grad_norm": 12.5,
"learning_rate": 0.00019957776237862067,
"loss": 0.9554,
"step": 605
},
{
"epoch": 1.9122257053291536,
"grad_norm": 1.6328125,
"learning_rate": 0.00019954361257559756,
"loss": 0.9902,
"step": 610
},
{
"epoch": 1.9278996865203761,
"grad_norm": 1.328125,
"learning_rate": 0.0001995081381119737,
"loss": 0.998,
"step": 615
},
{
"epoch": 1.9435736677115987,
"grad_norm": 2.28125,
"learning_rate": 0.00019947133945981987,
"loss": 0.9781,
"step": 620
},
{
"epoch": 1.9592476489028212,
"grad_norm": 2.796875,
"learning_rate": 0.00019943321710882815,
"loss": 0.9956,
"step": 625
},
{
"epoch": 1.9749216300940438,
"grad_norm": 3.046875,
"learning_rate": 0.0001993937715663056,
"loss": 0.9776,
"step": 630
},
{
"epoch": 1.9905956112852663,
"grad_norm": 1.9140625,
"learning_rate": 0.00019935300335716748,
"loss": 1.0,
"step": 635
},
{
"epoch": 2.0,
"eval_loss": 1.7305909395217896,
"eval_runtime": 0.8036,
"eval_samples_per_second": 2.489,
"eval_steps_per_second": 1.244,
"step": 638
},
{
"epoch": 2.006269592476489,
"grad_norm": 3.921875,
"learning_rate": 0.00019931091302393008,
"loss": 0.9559,
"step": 640
},
{
"epoch": 2.0219435736677114,
"grad_norm": 1.6015625,
"learning_rate": 0.00019926750112670382,
"loss": 0.8412,
"step": 645
},
{
"epoch": 2.0376175548589344,
"grad_norm": 1.8203125,
"learning_rate": 0.00019922276824318547,
"loss": 0.8475,
"step": 650
},
{
"epoch": 2.053291536050157,
"grad_norm": 1.3359375,
"learning_rate": 0.0001991767149686507,
"loss": 0.8512,
"step": 655
},
{
"epoch": 2.0689655172413794,
"grad_norm": 1.0,
"learning_rate": 0.0001991293419159461,
"loss": 0.8301,
"step": 660
},
{
"epoch": 2.084639498432602,
"grad_norm": 2.390625,
"learning_rate": 0.00019908064971548085,
"loss": 0.8622,
"step": 665
},
{
"epoch": 2.1003134796238245,
"grad_norm": 1.1953125,
"learning_rate": 0.0001990306390152186,
"loss": 0.8375,
"step": 670
},
{
"epoch": 2.115987460815047,
"grad_norm": 3.390625,
"learning_rate": 0.00019897931048066877,
"loss": 0.8571,
"step": 675
},
{
"epoch": 2.1316614420062696,
"grad_norm": 1.8203125,
"learning_rate": 0.00019892666479487744,
"loss": 0.904,
"step": 680
},
{
"epoch": 2.147335423197492,
"grad_norm": 2.859375,
"learning_rate": 0.00019887270265841868,
"loss": 0.8602,
"step": 685
},
{
"epoch": 2.1630094043887147,
"grad_norm": 2.125,
"learning_rate": 0.00019881742478938496,
"loss": 0.8618,
"step": 690
},
{
"epoch": 2.1786833855799372,
"grad_norm": 3.09375,
"learning_rate": 0.00019876083192337757,
"loss": 0.8737,
"step": 695
},
{
"epoch": 2.19435736677116,
"grad_norm": 3.9375,
"learning_rate": 0.00019870292481349698,
"loss": 0.8688,
"step": 700
},
{
"epoch": 2.2100313479623823,
"grad_norm": 1.7578125,
"learning_rate": 0.00019864370423033274,
"loss": 0.8821,
"step": 705
},
{
"epoch": 2.225705329153605,
"grad_norm": 1.3203125,
"learning_rate": 0.00019858317096195323,
"loss": 0.8746,
"step": 710
},
{
"epoch": 2.2413793103448274,
"grad_norm": 2.796875,
"learning_rate": 0.00019852132581389513,
"loss": 0.8742,
"step": 715
},
{
"epoch": 2.2570532915360504,
"grad_norm": 2.421875,
"learning_rate": 0.00019845816960915286,
"loss": 0.8747,
"step": 720
},
{
"epoch": 2.2727272727272725,
"grad_norm": 2.046875,
"learning_rate": 0.0001983937031881674,
"loss": 0.843,
"step": 725
},
{
"epoch": 2.2884012539184955,
"grad_norm": 3.140625,
"learning_rate": 0.0001983279274088153,
"loss": 0.8682,
"step": 730
},
{
"epoch": 2.304075235109718,
"grad_norm": 2.9375,
"learning_rate": 0.00019826084314639714,
"loss": 0.856,
"step": 735
},
{
"epoch": 2.3197492163009406,
"grad_norm": 1.21875,
"learning_rate": 0.00019819245129362595,
"loss": 0.8663,
"step": 740
},
{
"epoch": 2.335423197492163,
"grad_norm": 2.21875,
"learning_rate": 0.00019812275276061533,
"loss": 0.8483,
"step": 745
},
{
"epoch": 2.3510971786833856,
"grad_norm": 9.875,
"learning_rate": 0.00019805174847486721,
"loss": 0.8416,
"step": 750
},
{
"epoch": 2.366771159874608,
"grad_norm": 0.78515625,
"learning_rate": 0.00019797943938125977,
"loss": 0.8743,
"step": 755
},
{
"epoch": 2.3824451410658307,
"grad_norm": 1.21875,
"learning_rate": 0.00019790582644203458,
"loss": 0.8529,
"step": 760
},
{
"epoch": 2.3981191222570533,
"grad_norm": 6.15625,
"learning_rate": 0.00019783091063678402,
"loss": 0.8628,
"step": 765
},
{
"epoch": 2.413793103448276,
"grad_norm": 2.65625,
"learning_rate": 0.00019775469296243807,
"loss": 0.8689,
"step": 770
},
{
"epoch": 2.4294670846394983,
"grad_norm": 1.59375,
"learning_rate": 0.0001976771744332512,
"loss": 0.8671,
"step": 775
},
{
"epoch": 2.445141065830721,
"grad_norm": 2.0,
"learning_rate": 0.00019759835608078877,
"loss": 0.8832,
"step": 780
},
{
"epoch": 2.4608150470219434,
"grad_norm": 1.1796875,
"learning_rate": 0.00019751823895391323,
"loss": 0.878,
"step": 785
},
{
"epoch": 2.476489028213166,
"grad_norm": 1.5,
"learning_rate": 0.00019743682411877046,
"loss": 0.8882,
"step": 790
},
{
"epoch": 2.492163009404389,
"grad_norm": 1.7265625,
"learning_rate": 0.00019735411265877522,
"loss": 0.8934,
"step": 795
},
{
"epoch": 2.507836990595611,
"grad_norm": 1.4140625,
"learning_rate": 0.00019727010567459696,
"loss": 0.8815,
"step": 800
},
{
"epoch": 2.523510971786834,
"grad_norm": 1.1171875,
"learning_rate": 0.00019718480428414505,
"loss": 0.8925,
"step": 805
},
{
"epoch": 2.5391849529780566,
"grad_norm": 1.4765625,
"learning_rate": 0.00019709820962255409,
"loss": 0.8956,
"step": 810
},
{
"epoch": 2.554858934169279,
"grad_norm": 3.546875,
"learning_rate": 0.00019701032284216857,
"loss": 0.8828,
"step": 815
},
{
"epoch": 2.5705329153605017,
"grad_norm": 1.125,
"learning_rate": 0.00019692114511252767,
"loss": 0.9194,
"step": 820
},
{
"epoch": 2.586206896551724,
"grad_norm": 1.734375,
"learning_rate": 0.00019683067762034967,
"loss": 0.8825,
"step": 825
},
{
"epoch": 2.6018808777429467,
"grad_norm": 4.1875,
"learning_rate": 0.00019673892156951613,
"loss": 0.8725,
"step": 830
},
{
"epoch": 2.6175548589341693,
"grad_norm": 1.296875,
"learning_rate": 0.00019664587818105596,
"loss": 0.877,
"step": 835
},
{
"epoch": 2.633228840125392,
"grad_norm": 0.89453125,
"learning_rate": 0.0001965515486931291,
"loss": 0.8764,
"step": 840
},
{
"epoch": 2.6489028213166144,
"grad_norm": 0.8515625,
"learning_rate": 0.00019645593436101,
"loss": 0.8571,
"step": 845
},
{
"epoch": 2.664576802507837,
"grad_norm": 2.40625,
"learning_rate": 0.00019635903645707096,
"loss": 0.887,
"step": 850
},
{
"epoch": 2.6802507836990594,
"grad_norm": 0.8203125,
"learning_rate": 0.00019626085627076528,
"loss": 0.8755,
"step": 855
},
{
"epoch": 2.695924764890282,
"grad_norm": 1.4140625,
"learning_rate": 0.00019616139510861,
"loss": 0.8664,
"step": 860
},
{
"epoch": 2.7115987460815045,
"grad_norm": 0.9296875,
"learning_rate": 0.00019606065429416848,
"loss": 0.8888,
"step": 865
},
{
"epoch": 2.7272727272727275,
"grad_norm": 1.4296875,
"learning_rate": 0.00019595863516803293,
"loss": 0.8772,
"step": 870
},
{
"epoch": 2.7429467084639496,
"grad_norm": 1.5859375,
"learning_rate": 0.0001958553390878064,
"loss": 0.8919,
"step": 875
},
{
"epoch": 2.7586206896551726,
"grad_norm": 1.609375,
"learning_rate": 0.00019575076742808488,
"loss": 0.8806,
"step": 880
},
{
"epoch": 2.774294670846395,
"grad_norm": 0.81640625,
"learning_rate": 0.00019564492158043891,
"loss": 0.8722,
"step": 885
},
{
"epoch": 2.7899686520376177,
"grad_norm": 1.359375,
"learning_rate": 0.0001955378029533951,
"loss": 0.8887,
"step": 890
},
{
"epoch": 2.80564263322884,
"grad_norm": 1.1015625,
"learning_rate": 0.00019542941297241722,
"loss": 0.9079,
"step": 895
},
{
"epoch": 2.8213166144200628,
"grad_norm": 2.0,
"learning_rate": 0.00019531975307988763,
"loss": 0.877,
"step": 900
},
{
"epoch": 2.8369905956112853,
"grad_norm": 1.4140625,
"learning_rate": 0.00019520882473508762,
"loss": 0.8953,
"step": 905
},
{
"epoch": 2.852664576802508,
"grad_norm": 5.03125,
"learning_rate": 0.00019509662941417826,
"loss": 0.8886,
"step": 910
},
{
"epoch": 2.8683385579937304,
"grad_norm": 1.15625,
"learning_rate": 0.00019498316861018086,
"loss": 0.9104,
"step": 915
},
{
"epoch": 2.884012539184953,
"grad_norm": 1.6015625,
"learning_rate": 0.0001948684438329566,
"loss": 0.8815,
"step": 920
},
{
"epoch": 2.8996865203761755,
"grad_norm": 1.8125,
"learning_rate": 0.00019475245660918717,
"loss": 0.8718,
"step": 925
},
{
"epoch": 2.915360501567398,
"grad_norm": 6.46875,
"learning_rate": 0.00019463520848235377,
"loss": 0.8774,
"step": 930
},
{
"epoch": 2.9310344827586206,
"grad_norm": 1.015625,
"learning_rate": 0.000194516701012717,
"loss": 0.8853,
"step": 935
},
{
"epoch": 2.946708463949843,
"grad_norm": 1.5546875,
"learning_rate": 0.00019439693577729593,
"loss": 0.8833,
"step": 940
},
{
"epoch": 2.962382445141066,
"grad_norm": 1.796875,
"learning_rate": 0.0001942759143698472,
"loss": 0.8867,
"step": 945
},
{
"epoch": 2.978056426332288,
"grad_norm": 1.0703125,
"learning_rate": 0.0001941536384008437,
"loss": 0.8923,
"step": 950
},
{
"epoch": 2.993730407523511,
"grad_norm": 1.75,
"learning_rate": 0.0001940301094974531,
"loss": 0.8741,
"step": 955
},
{
"epoch": 3.0,
"eval_loss": 1.8115239143371582,
"eval_runtime": 0.806,
"eval_samples_per_second": 2.481,
"eval_steps_per_second": 1.241,
"step": 957
},
{
"epoch": 3.0094043887147337,
"grad_norm": 0.7578125,
"learning_rate": 0.00019390532930351652,
"loss": 0.7936,
"step": 960
},
{
"epoch": 3.0250783699059562,
"grad_norm": 1.7734375,
"learning_rate": 0.00019377929947952626,
"loss": 0.7324,
"step": 965
},
{
"epoch": 3.040752351097179,
"grad_norm": 1.7109375,
"learning_rate": 0.00019365202170260393,
"loss": 0.726,
"step": 970
},
{
"epoch": 3.0564263322884013,
"grad_norm": 1.0859375,
"learning_rate": 0.000193523497666478,
"loss": 0.7355,
"step": 975
},
{
"epoch": 3.072100313479624,
"grad_norm": 2.578125,
"learning_rate": 0.00019339372908146147,
"loss": 0.7393,
"step": 980
},
{
"epoch": 3.0877742946708464,
"grad_norm": 1.0,
"learning_rate": 0.00019326271767442884,
"loss": 0.736,
"step": 985
},
{
"epoch": 3.103448275862069,
"grad_norm": 1.3203125,
"learning_rate": 0.00019313046518879337,
"loss": 0.7157,
"step": 990
},
{
"epoch": 3.1191222570532915,
"grad_norm": 1.015625,
"learning_rate": 0.00019299697338448369,
"loss": 0.7231,
"step": 995
},
{
"epoch": 3.134796238244514,
"grad_norm": 1.2734375,
"learning_rate": 0.0001928622440379205,
"loss": 0.7221,
"step": 1000
},
{
"epoch": 3.1504702194357366,
"grad_norm": 0.9140625,
"learning_rate": 0.0001927262789419929,
"loss": 0.7352,
"step": 1005
},
{
"epoch": 3.166144200626959,
"grad_norm": 0.9375,
"learning_rate": 0.0001925890799060345,
"loss": 0.7196,
"step": 1010
},
{
"epoch": 3.1818181818181817,
"grad_norm": 0.90625,
"learning_rate": 0.00019245064875579942,
"loss": 0.7269,
"step": 1015
},
{
"epoch": 3.197492163009404,
"grad_norm": 0.98828125,
"learning_rate": 0.00019231098733343783,
"loss": 0.7225,
"step": 1020
},
{
"epoch": 3.2131661442006267,
"grad_norm": 1.0625,
"learning_rate": 0.00019217009749747174,
"loss": 0.734,
"step": 1025
},
{
"epoch": 3.2288401253918497,
"grad_norm": 0.828125,
"learning_rate": 0.0001920279811227699,
"loss": 0.7387,
"step": 1030
},
{
"epoch": 3.2445141065830723,
"grad_norm": 1.5703125,
"learning_rate": 0.00019188464010052312,
"loss": 0.7303,
"step": 1035
},
{
"epoch": 3.260188087774295,
"grad_norm": 1.53125,
"learning_rate": 0.00019174007633821893,
"loss": 0.7565,
"step": 1040
},
{
"epoch": 3.2758620689655173,
"grad_norm": 2.921875,
"learning_rate": 0.00019159429175961634,
"loss": 0.7588,
"step": 1045
},
{
"epoch": 3.29153605015674,
"grad_norm": 1.0,
"learning_rate": 0.0001914472883047202,
"loss": 0.7452,
"step": 1050
},
{
"epoch": 3.3072100313479624,
"grad_norm": 1.1953125,
"learning_rate": 0.00019129906792975527,
"loss": 0.7395,
"step": 1055
},
{
"epoch": 3.322884012539185,
"grad_norm": 1.203125,
"learning_rate": 0.0001911496326071404,
"loss": 0.7429,
"step": 1060
},
{
"epoch": 3.3385579937304075,
"grad_norm": 0.92578125,
"learning_rate": 0.00019099898432546202,
"loss": 0.7643,
"step": 1065
},
{
"epoch": 3.35423197492163,
"grad_norm": 1.3203125,
"learning_rate": 0.00019084712508944793,
"loss": 0.755,
"step": 1070
},
{
"epoch": 3.3699059561128526,
"grad_norm": 1.8671875,
"learning_rate": 0.00019069405691994045,
"loss": 0.7381,
"step": 1075
},
{
"epoch": 3.385579937304075,
"grad_norm": 1.46875,
"learning_rate": 0.00019053978185386964,
"loss": 0.7546,
"step": 1080
},
{
"epoch": 3.4012539184952977,
"grad_norm": 2.609375,
"learning_rate": 0.00019038430194422606,
"loss": 0.7624,
"step": 1085
},
{
"epoch": 3.41692789968652,
"grad_norm": 1.3203125,
"learning_rate": 0.00019022761926003359,
"loss": 0.7657,
"step": 1090
},
{
"epoch": 3.4326018808777428,
"grad_norm": 1.15625,
"learning_rate": 0.00019006973588632184,
"loss": 0.7433,
"step": 1095
},
{
"epoch": 3.4482758620689653,
"grad_norm": 1.6015625,
"learning_rate": 0.0001899106539240984,
"loss": 0.7767,
"step": 1100
},
{
"epoch": 3.4639498432601883,
"grad_norm": 33.75,
"learning_rate": 0.00018975037549032086,
"loss": 0.755,
"step": 1105
},
{
"epoch": 3.479623824451411,
"grad_norm": 0.8515625,
"learning_rate": 0.0001895889027178687,
"loss": 0.7631,
"step": 1110
},
{
"epoch": 3.4952978056426334,
"grad_norm": 1.1015625,
"learning_rate": 0.0001894262377555148,
"loss": 0.7545,
"step": 1115
},
{
"epoch": 3.510971786833856,
"grad_norm": 0.9765625,
"learning_rate": 0.00018926238276789704,
"loss": 0.7491,
"step": 1120
},
{
"epoch": 3.5266457680250785,
"grad_norm": 1.0234375,
"learning_rate": 0.0001890973399354892,
"loss": 0.7663,
"step": 1125
},
{
"epoch": 3.542319749216301,
"grad_norm": 1.0703125,
"learning_rate": 0.00018893111145457225,
"loss": 0.755,
"step": 1130
},
{
"epoch": 3.5579937304075235,
"grad_norm": 1.8046875,
"learning_rate": 0.00018876369953720496,
"loss": 0.7681,
"step": 1135
},
{
"epoch": 3.573667711598746,
"grad_norm": 5.125,
"learning_rate": 0.00018859510641119448,
"loss": 0.766,
"step": 1140
},
{
"epoch": 3.5893416927899686,
"grad_norm": 1.3828125,
"learning_rate": 0.00018842533432006662,
"loss": 0.7801,
"step": 1145
},
{
"epoch": 3.605015673981191,
"grad_norm": 1.0625,
"learning_rate": 0.00018825438552303621,
"loss": 0.7647,
"step": 1150
},
{
"epoch": 3.6206896551724137,
"grad_norm": 1.109375,
"learning_rate": 0.00018808226229497684,
"loss": 0.7768,
"step": 1155
},
{
"epoch": 3.6363636363636362,
"grad_norm": 1.4921875,
"learning_rate": 0.00018790896692639068,
"loss": 0.7786,
"step": 1160
},
{
"epoch": 3.652037617554859,
"grad_norm": 1.2890625,
"learning_rate": 0.00018773450172337793,
"loss": 0.762,
"step": 1165
},
{
"epoch": 3.6677115987460818,
"grad_norm": 1.0078125,
"learning_rate": 0.00018755886900760619,
"loss": 0.7612,
"step": 1170
},
{
"epoch": 3.683385579937304,
"grad_norm": 0.81640625,
"learning_rate": 0.00018738207111627958,
"loss": 0.7718,
"step": 1175
},
{
"epoch": 3.699059561128527,
"grad_norm": 0.89453125,
"learning_rate": 0.00018720411040210752,
"loss": 0.7577,
"step": 1180
},
{
"epoch": 3.714733542319749,
"grad_norm": 0.8828125,
"learning_rate": 0.00018702498923327366,
"loss": 0.7429,
"step": 1185
},
{
"epoch": 3.730407523510972,
"grad_norm": 0.828125,
"learning_rate": 0.00018684470999340405,
"loss": 0.7552,
"step": 1190
},
{
"epoch": 3.7460815047021945,
"grad_norm": 1.46875,
"learning_rate": 0.00018666327508153567,
"loss": 0.7606,
"step": 1195
},
{
"epoch": 3.761755485893417,
"grad_norm": 0.89453125,
"learning_rate": 0.0001864806869120844,
"loss": 0.7678,
"step": 1200
},
{
"epoch": 3.7774294670846396,
"grad_norm": 1.078125,
"learning_rate": 0.00018629694791481296,
"loss": 0.7985,
"step": 1205
},
{
"epoch": 3.793103448275862,
"grad_norm": 5.125,
"learning_rate": 0.00018611206053479842,
"loss": 0.7712,
"step": 1210
},
{
"epoch": 3.8087774294670846,
"grad_norm": 1.78125,
"learning_rate": 0.00018592602723239984,
"loss": 0.7745,
"step": 1215
},
{
"epoch": 3.824451410658307,
"grad_norm": 0.89453125,
"learning_rate": 0.00018573885048322547,
"loss": 0.7684,
"step": 1220
},
{
"epoch": 3.8401253918495297,
"grad_norm": 0.83984375,
"learning_rate": 0.00018555053277809975,
"loss": 0.7811,
"step": 1225
},
{
"epoch": 3.8557993730407523,
"grad_norm": 1.1484375,
"learning_rate": 0.00018536107662303026,
"loss": 0.7732,
"step": 1230
},
{
"epoch": 3.871473354231975,
"grad_norm": 1.015625,
"learning_rate": 0.00018517048453917424,
"loss": 0.7668,
"step": 1235
},
{
"epoch": 3.8871473354231973,
"grad_norm": 0.921875,
"learning_rate": 0.00018497875906280515,
"loss": 0.759,
"step": 1240
},
{
"epoch": 3.9028213166144203,
"grad_norm": 1.46875,
"learning_rate": 0.00018478590274527898,
"loss": 0.7763,
"step": 1245
},
{
"epoch": 3.9184952978056424,
"grad_norm": 1.6171875,
"learning_rate": 0.0001845919181530001,
"loss": 0.7633,
"step": 1250
},
{
"epoch": 3.9341692789968654,
"grad_norm": 0.7265625,
"learning_rate": 0.00018439680786738722,
"loss": 0.7853,
"step": 1255
},
{
"epoch": 3.9498432601880875,
"grad_norm": 1.46875,
"learning_rate": 0.00018420057448483905,
"loss": 0.7856,
"step": 1260
},
{
"epoch": 3.9655172413793105,
"grad_norm": 1.359375,
"learning_rate": 0.00018400322061669982,
"loss": 0.7831,
"step": 1265
},
{
"epoch": 3.981191222570533,
"grad_norm": 0.75390625,
"learning_rate": 0.00018380474888922426,
"loss": 0.7952,
"step": 1270
},
{
"epoch": 3.9968652037617556,
"grad_norm": 1.0625,
"learning_rate": 0.000183605161943543,
"loss": 0.7735,
"step": 1275
},
{
"epoch": 4.0,
"eval_loss": 1.923946738243103,
"eval_runtime": 0.8,
"eval_samples_per_second": 2.5,
"eval_steps_per_second": 1.25,
"step": 1276
},
{
"epoch": 4.012539184952978,
"grad_norm": 0.84375,
"learning_rate": 0.0001834044624356272,
"loss": 0.6611,
"step": 1280
},
{
"epoch": 4.028213166144201,
"grad_norm": 0.8984375,
"learning_rate": 0.0001832026530362532,
"loss": 0.5993,
"step": 1285
},
{
"epoch": 4.043887147335423,
"grad_norm": 1.3125,
"learning_rate": 0.00018299973643096714,
"loss": 0.6197,
"step": 1290
},
{
"epoch": 4.059561128526646,
"grad_norm": 1.1640625,
"learning_rate": 0.00018279571532004907,
"loss": 0.6147,
"step": 1295
},
{
"epoch": 4.075235109717869,
"grad_norm": 1.1171875,
"learning_rate": 0.00018259059241847707,
"loss": 0.6295,
"step": 1300
},
{
"epoch": 4.090909090909091,
"grad_norm": 0.97265625,
"learning_rate": 0.00018238437045589115,
"loss": 0.6219,
"step": 1305
},
{
"epoch": 4.106583072100314,
"grad_norm": 0.98828125,
"learning_rate": 0.00018217705217655689,
"loss": 0.6033,
"step": 1310
},
{
"epoch": 4.122257053291536,
"grad_norm": 0.875,
"learning_rate": 0.0001819686403393289,
"loss": 0.622,
"step": 1315
},
{
"epoch": 4.137931034482759,
"grad_norm": 0.890625,
"learning_rate": 0.00018175913771761417,
"loss": 0.6166,
"step": 1320
},
{
"epoch": 4.153605015673981,
"grad_norm": 1.234375,
"learning_rate": 0.0001815485470993351,
"loss": 0.6335,
"step": 1325
},
{
"epoch": 4.169278996865204,
"grad_norm": 0.88671875,
"learning_rate": 0.00018133687128689242,
"loss": 0.6204,
"step": 1330
},
{
"epoch": 4.184952978056426,
"grad_norm": 1.09375,
"learning_rate": 0.000181124113097128,
"loss": 0.635,
"step": 1335
},
{
"epoch": 4.200626959247649,
"grad_norm": 0.83203125,
"learning_rate": 0.00018091027536128716,
"loss": 0.6224,
"step": 1340
},
{
"epoch": 4.216300940438871,
"grad_norm": 0.890625,
"learning_rate": 0.00018069536092498112,
"loss": 0.6314,
"step": 1345
},
{
"epoch": 4.231974921630094,
"grad_norm": 0.80859375,
"learning_rate": 0.00018047937264814917,
"loss": 0.6421,
"step": 1350
},
{
"epoch": 4.247648902821316,
"grad_norm": 2.25,
"learning_rate": 0.00018026231340502057,
"loss": 0.6335,
"step": 1355
},
{
"epoch": 4.263322884012539,
"grad_norm": 1.0,
"learning_rate": 0.00018004418608407626,
"loss": 0.6365,
"step": 1360
},
{
"epoch": 4.278996865203762,
"grad_norm": 1.2421875,
"learning_rate": 0.00017982499358801037,
"loss": 0.6289,
"step": 1365
},
{
"epoch": 4.294670846394984,
"grad_norm": 0.95703125,
"learning_rate": 0.00017960473883369186,
"loss": 0.6297,
"step": 1370
},
{
"epoch": 4.310344827586207,
"grad_norm": 1.484375,
"learning_rate": 0.00017938342475212532,
"loss": 0.6496,
"step": 1375
},
{
"epoch": 4.326018808777429,
"grad_norm": 0.9140625,
"learning_rate": 0.00017916105428841234,
"loss": 0.6454,
"step": 1380
},
{
"epoch": 4.341692789968652,
"grad_norm": 1.21875,
"learning_rate": 0.00017893763040171203,
"loss": 0.6452,
"step": 1385
},
{
"epoch": 4.3573667711598745,
"grad_norm": 2.25,
"learning_rate": 0.00017871315606520183,
"loss": 0.6542,
"step": 1390
},
{
"epoch": 4.3730407523510975,
"grad_norm": 1.21875,
"learning_rate": 0.0001784876342660378,
"loss": 0.6448,
"step": 1395
},
{
"epoch": 4.38871473354232,
"grad_norm": 0.95703125,
"learning_rate": 0.00017826106800531498,
"loss": 0.63,
"step": 1400
},
{
"epoch": 4.4043887147335425,
"grad_norm": 1.828125,
"learning_rate": 0.0001780334602980275,
"loss": 0.6317,
"step": 1405
},
{
"epoch": 4.420062695924765,
"grad_norm": 1.75,
"learning_rate": 0.0001778048141730282,
"loss": 0.6375,
"step": 1410
},
{
"epoch": 4.435736677115988,
"grad_norm": 1.046875,
"learning_rate": 0.00017757513267298856,
"loss": 0.645,
"step": 1415
},
{
"epoch": 4.45141065830721,
"grad_norm": 1.6328125,
"learning_rate": 0.00017734441885435828,
"loss": 0.6431,
"step": 1420
},
{
"epoch": 4.467084639498433,
"grad_norm": 0.9296875,
"learning_rate": 0.00017711267578732423,
"loss": 0.6494,
"step": 1425
},
{
"epoch": 4.482758620689655,
"grad_norm": 1.2421875,
"learning_rate": 0.00017687990655577008,
"loss": 0.6464,
"step": 1430
},
{
"epoch": 4.498432601880878,
"grad_norm": 33.75,
"learning_rate": 0.00017664611425723486,
"loss": 0.6598,
"step": 1435
},
{
"epoch": 4.514106583072101,
"grad_norm": 1.5234375,
"learning_rate": 0.00017641130200287197,
"loss": 0.6602,
"step": 1440
},
{
"epoch": 4.529780564263323,
"grad_norm": 1.34375,
"learning_rate": 0.00017617547291740767,
"loss": 0.6443,
"step": 1445
},
{
"epoch": 4.545454545454545,
"grad_norm": 1.2890625,
"learning_rate": 0.00017593863013909956,
"loss": 0.6441,
"step": 1450
},
{
"epoch": 4.561128526645768,
"grad_norm": 1.375,
"learning_rate": 0.00017570077681969474,
"loss": 0.6405,
"step": 1455
},
{
"epoch": 4.576802507836991,
"grad_norm": 2.15625,
"learning_rate": 0.00017546191612438804,
"loss": 0.6605,
"step": 1460
},
{
"epoch": 4.592476489028213,
"grad_norm": 1.1796875,
"learning_rate": 0.0001752220512317797,
"loss": 0.6572,
"step": 1465
},
{
"epoch": 4.608150470219436,
"grad_norm": 1.96875,
"learning_rate": 0.00017498118533383316,
"loss": 0.6396,
"step": 1470
},
{
"epoch": 4.623824451410658,
"grad_norm": 1.2265625,
"learning_rate": 0.0001747393216358326,
"loss": 0.656,
"step": 1475
},
{
"epoch": 4.639498432601881,
"grad_norm": 1.0390625,
"learning_rate": 0.00017449646335634017,
"loss": 0.6602,
"step": 1480
},
{
"epoch": 4.655172413793103,
"grad_norm": 0.8203125,
"learning_rate": 0.00017425261372715345,
"loss": 0.6519,
"step": 1485
},
{
"epoch": 4.670846394984326,
"grad_norm": 0.875,
"learning_rate": 0.00017400777599326203,
"loss": 0.6475,
"step": 1490
},
{
"epoch": 4.686520376175548,
"grad_norm": 0.84765625,
"learning_rate": 0.00017376195341280468,
"loss": 0.6656,
"step": 1495
},
{
"epoch": 4.702194357366771,
"grad_norm": 0.88671875,
"learning_rate": 0.00017351514925702583,
"loss": 0.6655,
"step": 1500
},
{
"epoch": 4.717868338557993,
"grad_norm": 1.4296875,
"learning_rate": 0.00017326736681023204,
"loss": 0.672,
"step": 1505
},
{
"epoch": 4.733542319749216,
"grad_norm": 0.98828125,
"learning_rate": 0.00017301860936974834,
"loss": 0.6635,
"step": 1510
},
{
"epoch": 4.749216300940439,
"grad_norm": 1.15625,
"learning_rate": 0.00017276888024587433,
"loss": 0.6666,
"step": 1515
},
{
"epoch": 4.764890282131661,
"grad_norm": 1.265625,
"learning_rate": 0.00017251818276184012,
"loss": 0.6692,
"step": 1520
},
{
"epoch": 4.7805642633228835,
"grad_norm": 0.80859375,
"learning_rate": 0.0001722665202537621,
"loss": 0.6578,
"step": 1525
},
{
"epoch": 4.7962382445141065,
"grad_norm": 0.92578125,
"learning_rate": 0.00017201389607059863,
"loss": 0.6607,
"step": 1530
},
{
"epoch": 4.8119122257053295,
"grad_norm": 2.140625,
"learning_rate": 0.00017176031357410537,
"loss": 0.6538,
"step": 1535
},
{
"epoch": 4.827586206896552,
"grad_norm": 1.828125,
"learning_rate": 0.0001715057761387905,
"loss": 0.6703,
"step": 1540
},
{
"epoch": 4.843260188087775,
"grad_norm": 0.87109375,
"learning_rate": 0.00017125028715187,
"loss": 0.6761,
"step": 1545
},
{
"epoch": 4.858934169278997,
"grad_norm": 0.83984375,
"learning_rate": 0.0001709938500132225,
"loss": 0.6616,
"step": 1550
},
{
"epoch": 4.87460815047022,
"grad_norm": 1.015625,
"learning_rate": 0.00017073646813534388,
"loss": 0.6597,
"step": 1555
},
{
"epoch": 4.890282131661442,
"grad_norm": 0.8984375,
"learning_rate": 0.00017047814494330207,
"loss": 0.6733,
"step": 1560
},
{
"epoch": 4.905956112852665,
"grad_norm": 1.859375,
"learning_rate": 0.00017021888387469135,
"loss": 0.6737,
"step": 1565
},
{
"epoch": 4.921630094043887,
"grad_norm": 0.890625,
"learning_rate": 0.00016995868837958665,
"loss": 0.6736,
"step": 1570
},
{
"epoch": 4.93730407523511,
"grad_norm": 0.921875,
"learning_rate": 0.0001696975619204977,
"loss": 0.68,
"step": 1575
},
{
"epoch": 4.952978056426332,
"grad_norm": 2.0,
"learning_rate": 0.0001694355079723227,
"loss": 0.6755,
"step": 1580
},
{
"epoch": 4.968652037617555,
"grad_norm": 0.78125,
"learning_rate": 0.0001691725300223025,
"loss": 0.6827,
"step": 1585
},
{
"epoch": 4.984326018808778,
"grad_norm": 0.859375,
"learning_rate": 0.0001689086315699738,
"loss": 0.6681,
"step": 1590
},
{
"epoch": 5.0,
"grad_norm": 0.91015625,
"learning_rate": 0.00016864381612712276,
"loss": 0.6599,
"step": 1595
},
{
"epoch": 5.0,
"eval_loss": 2.0616867542266846,
"eval_runtime": 0.7937,
"eval_samples_per_second": 2.52,
"eval_steps_per_second": 1.26,
"step": 1595
},
{
"epoch": 5.015673981191223,
"grad_norm": 1.9296875,
"learning_rate": 0.00016837808721773827,
"loss": 0.5239,
"step": 1600
},
{
"epoch": 5.031347962382445,
"grad_norm": 1.125,
"learning_rate": 0.000168111448377965,
"loss": 0.509,
"step": 1605
},
{
"epoch": 5.047021943573668,
"grad_norm": 2.5625,
"learning_rate": 0.0001678439031560564,
"loss": 0.5041,
"step": 1610
},
{
"epoch": 5.06269592476489,
"grad_norm": 0.8203125,
"learning_rate": 0.00016757545511232746,
"loss": 0.5195,
"step": 1615
},
{
"epoch": 5.078369905956113,
"grad_norm": 0.8984375,
"learning_rate": 0.00016730610781910728,
"loss": 0.4963,
"step": 1620
},
{
"epoch": 5.094043887147335,
"grad_norm": 0.88671875,
"learning_rate": 0.00016703586486069164,
"loss": 0.5159,
"step": 1625
},
{
"epoch": 5.109717868338558,
"grad_norm": 1.0859375,
"learning_rate": 0.0001667647298332952,
"loss": 0.5111,
"step": 1630
},
{
"epoch": 5.12539184952978,
"grad_norm": 0.9765625,
"learning_rate": 0.00016649270634500366,
"loss": 0.5309,
"step": 1635
},
{
"epoch": 5.141065830721003,
"grad_norm": 0.9296875,
"learning_rate": 0.00016621979801572585,
"loss": 0.5254,
"step": 1640
},
{
"epoch": 5.156739811912225,
"grad_norm": 0.8359375,
"learning_rate": 0.00016594600847714538,
"loss": 0.5274,
"step": 1645
},
{
"epoch": 5.172413793103448,
"grad_norm": 0.8984375,
"learning_rate": 0.0001656713413726725,
"loss": 0.5239,
"step": 1650
},
{
"epoch": 5.1880877742946705,
"grad_norm": 0.984375,
"learning_rate": 0.00016539580035739547,
"loss": 0.5246,
"step": 1655
},
{
"epoch": 5.2037617554858935,
"grad_norm": 1.6953125,
"learning_rate": 0.00016511938909803204,
"loss": 0.5367,
"step": 1660
},
{
"epoch": 5.219435736677116,
"grad_norm": 0.9453125,
"learning_rate": 0.00016484211127288048,
"loss": 0.5356,
"step": 1665
},
{
"epoch": 5.235109717868339,
"grad_norm": 0.953125,
"learning_rate": 0.00016456397057177085,
"loss": 0.5367,
"step": 1670
},
{
"epoch": 5.250783699059561,
"grad_norm": 1.25,
"learning_rate": 0.00016428497069601578,
"loss": 0.534,
"step": 1675
},
{
"epoch": 5.266457680250784,
"grad_norm": 0.9296875,
"learning_rate": 0.00016400511535836118,
"loss": 0.5476,
"step": 1680
},
{
"epoch": 5.282131661442007,
"grad_norm": 1.1015625,
"learning_rate": 0.0001637244082829369,
"loss": 0.544,
"step": 1685
},
{
"epoch": 5.297805642633229,
"grad_norm": 0.84375,
"learning_rate": 0.00016344285320520717,
"loss": 0.5414,
"step": 1690
},
{
"epoch": 5.313479623824452,
"grad_norm": 1.0,
"learning_rate": 0.00016316045387192087,
"loss": 0.5435,
"step": 1695
},
{
"epoch": 5.329153605015674,
"grad_norm": 0.85546875,
"learning_rate": 0.00016287721404106167,
"loss": 0.5412,
"step": 1700
},
{
"epoch": 5.344827586206897,
"grad_norm": 1.0,
"learning_rate": 0.00016259313748179802,
"loss": 0.5448,
"step": 1705
},
{
"epoch": 5.360501567398119,
"grad_norm": 0.76171875,
"learning_rate": 0.000162308227974433,
"loss": 0.5523,
"step": 1710
},
{
"epoch": 5.376175548589342,
"grad_norm": 1.015625,
"learning_rate": 0.00016202248931035404,
"loss": 0.5382,
"step": 1715
},
{
"epoch": 5.391849529780564,
"grad_norm": 1.0546875,
"learning_rate": 0.0001617359252919824,
"loss": 0.5427,
"step": 1720
},
{
"epoch": 5.407523510971787,
"grad_norm": 1.203125,
"learning_rate": 0.00016144853973272262,
"loss": 0.5426,
"step": 1725
},
{
"epoch": 5.423197492163009,
"grad_norm": 1.09375,
"learning_rate": 0.00016116033645691174,
"loss": 0.5514,
"step": 1730
},
{
"epoch": 5.438871473354232,
"grad_norm": 1.1953125,
"learning_rate": 0.00016087131929976852,
"loss": 0.5471,
"step": 1735
},
{
"epoch": 5.454545454545454,
"grad_norm": 0.8046875,
"learning_rate": 0.00016058149210734223,
"loss": 0.5617,
"step": 1740
},
{
"epoch": 5.470219435736677,
"grad_norm": 0.90234375,
"learning_rate": 0.0001602908587364616,
"loss": 0.5598,
"step": 1745
},
{
"epoch": 5.485893416927899,
"grad_norm": 0.875,
"learning_rate": 0.00015999942305468338,
"loss": 0.5457,
"step": 1750
},
{
"epoch": 5.501567398119122,
"grad_norm": 0.99609375,
"learning_rate": 0.000159707188940241,
"loss": 0.5667,
"step": 1755
},
{
"epoch": 5.517241379310345,
"grad_norm": 2.90625,
"learning_rate": 0.00015941416028199298,
"loss": 0.552,
"step": 1760
},
{
"epoch": 5.532915360501567,
"grad_norm": 0.8046875,
"learning_rate": 0.00015912034097937094,
"loss": 0.5408,
"step": 1765
},
{
"epoch": 5.54858934169279,
"grad_norm": 0.953125,
"learning_rate": 0.00015882573494232797,
"loss": 0.5516,
"step": 1770
},
{
"epoch": 5.564263322884012,
"grad_norm": 0.94921875,
"learning_rate": 0.00015853034609128648,
"loss": 0.5649,
"step": 1775
},
{
"epoch": 5.579937304075235,
"grad_norm": 1.25,
"learning_rate": 0.00015823417835708606,
"loss": 0.5666,
"step": 1780
},
{
"epoch": 5.5956112852664575,
"grad_norm": 1.515625,
"learning_rate": 0.00015793723568093118,
"loss": 0.5653,
"step": 1785
},
{
"epoch": 5.61128526645768,
"grad_norm": 1.4296875,
"learning_rate": 0.00015763952201433866,
"loss": 0.5663,
"step": 1790
},
{
"epoch": 5.6269592476489025,
"grad_norm": 0.84765625,
"learning_rate": 0.00015734104131908522,
"loss": 0.5582,
"step": 1795
},
{
"epoch": 5.6426332288401255,
"grad_norm": 0.94140625,
"learning_rate": 0.00015704179756715467,
"loss": 0.5569,
"step": 1800
},
{
"epoch": 5.658307210031348,
"grad_norm": 0.83203125,
"learning_rate": 0.00015674179474068508,
"loss": 0.5618,
"step": 1805
},
{
"epoch": 5.673981191222571,
"grad_norm": 0.890625,
"learning_rate": 0.00015644103683191575,
"loss": 0.5636,
"step": 1810
},
{
"epoch": 5.689655172413794,
"grad_norm": 0.80859375,
"learning_rate": 0.00015613952784313418,
"loss": 0.5562,
"step": 1815
},
{
"epoch": 5.705329153605016,
"grad_norm": 1.7421875,
"learning_rate": 0.00015583727178662262,
"loss": 0.5566,
"step": 1820
},
{
"epoch": 5.721003134796238,
"grad_norm": 1.4375,
"learning_rate": 0.00015553427268460496,
"loss": 0.5591,
"step": 1825
},
{
"epoch": 5.736677115987461,
"grad_norm": 0.98046875,
"learning_rate": 0.00015523053456919294,
"loss": 0.5666,
"step": 1830
},
{
"epoch": 5.752351097178684,
"grad_norm": 0.98828125,
"learning_rate": 0.00015492606148233265,
"loss": 0.5664,
"step": 1835
},
{
"epoch": 5.768025078369906,
"grad_norm": 0.84375,
"learning_rate": 0.00015462085747575068,
"loss": 0.5624,
"step": 1840
},
{
"epoch": 5.783699059561129,
"grad_norm": 0.85546875,
"learning_rate": 0.00015431492661090022,
"loss": 0.5587,
"step": 1845
},
{
"epoch": 5.799373040752351,
"grad_norm": 0.99609375,
"learning_rate": 0.00015400827295890702,
"loss": 0.5645,
"step": 1850
},
{
"epoch": 5.815047021943574,
"grad_norm": 1.03125,
"learning_rate": 0.0001537009006005152,
"loss": 0.5631,
"step": 1855
},
{
"epoch": 5.830721003134796,
"grad_norm": 1.046875,
"learning_rate": 0.0001533928136260329,
"loss": 0.5712,
"step": 1860
},
{
"epoch": 5.846394984326019,
"grad_norm": 1.140625,
"learning_rate": 0.00015308401613527796,
"loss": 0.5676,
"step": 1865
},
{
"epoch": 5.862068965517241,
"grad_norm": 0.9375,
"learning_rate": 0.00015277451223752326,
"loss": 0.5761,
"step": 1870
},
{
"epoch": 5.877742946708464,
"grad_norm": 0.98046875,
"learning_rate": 0.00015246430605144216,
"loss": 0.5685,
"step": 1875
},
{
"epoch": 5.893416927899686,
"grad_norm": 0.83203125,
"learning_rate": 0.00015215340170505348,
"loss": 0.5678,
"step": 1880
},
{
"epoch": 5.909090909090909,
"grad_norm": 1.6484375,
"learning_rate": 0.0001518418033356668,
"loss": 0.569,
"step": 1885
},
{
"epoch": 5.924764890282132,
"grad_norm": 0.9765625,
"learning_rate": 0.00015152951508982726,
"loss": 0.5669,
"step": 1890
},
{
"epoch": 5.940438871473354,
"grad_norm": 0.91015625,
"learning_rate": 0.0001512165411232604,
"loss": 0.5759,
"step": 1895
},
{
"epoch": 5.956112852664576,
"grad_norm": 0.87890625,
"learning_rate": 0.00015090288560081692,
"loss": 0.5641,
"step": 1900
},
{
"epoch": 5.971786833855799,
"grad_norm": 1.1953125,
"learning_rate": 0.0001505885526964172,
"loss": 0.574,
"step": 1905
},
{
"epoch": 5.987460815047022,
"grad_norm": 1.078125,
"learning_rate": 0.00015027354659299578,
"loss": 0.5764,
"step": 1910
},
{
"epoch": 6.0,
"eval_loss": 2.323450803756714,
"eval_runtime": 0.8121,
"eval_samples_per_second": 2.463,
"eval_steps_per_second": 1.231,
"step": 1914
},
{
"epoch": 6.003134796238244,
"grad_norm": 0.96484375,
"learning_rate": 0.00014995787148244563,
"loss": 0.5514,
"step": 1915
},
{
"epoch": 6.018808777429467,
"grad_norm": 1.1484375,
"learning_rate": 0.00014964153156556245,
"loss": 0.4392,
"step": 1920
},
{
"epoch": 6.0344827586206895,
"grad_norm": 1.015625,
"learning_rate": 0.00014932453105198884,
"loss": 0.4499,
"step": 1925
},
{
"epoch": 6.0501567398119125,
"grad_norm": 1.0625,
"learning_rate": 0.0001490068741601581,
"loss": 0.4379,
"step": 1930
},
{
"epoch": 6.065830721003135,
"grad_norm": 0.84765625,
"learning_rate": 0.00014868856511723814,
"loss": 0.431,
"step": 1935
},
{
"epoch": 6.081504702194358,
"grad_norm": 0.98046875,
"learning_rate": 0.00014836960815907532,
"loss": 0.4428,
"step": 1940
},
{
"epoch": 6.09717868338558,
"grad_norm": 0.89453125,
"learning_rate": 0.000148050007530138,
"loss": 0.4482,
"step": 1945
},
{
"epoch": 6.112852664576803,
"grad_norm": 1.453125,
"learning_rate": 0.00014772976748346015,
"loss": 0.4478,
"step": 1950
},
{
"epoch": 6.128526645768025,
"grad_norm": 2.453125,
"learning_rate": 0.00014740889228058462,
"loss": 0.4414,
"step": 1955
},
{
"epoch": 6.144200626959248,
"grad_norm": 1.1328125,
"learning_rate": 0.0001470873861915065,
"loss": 0.4466,
"step": 1960
},
{
"epoch": 6.15987460815047,
"grad_norm": 1.2109375,
"learning_rate": 0.00014676525349461637,
"loss": 0.4356,
"step": 1965
},
{
"epoch": 6.175548589341693,
"grad_norm": 0.8984375,
"learning_rate": 0.00014644249847664317,
"loss": 0.4526,
"step": 1970
},
{
"epoch": 6.191222570532915,
"grad_norm": 1.1796875,
"learning_rate": 0.00014611912543259742,
"loss": 0.442,
"step": 1975
},
{
"epoch": 6.206896551724138,
"grad_norm": 1.21875,
"learning_rate": 0.00014579513866571378,
"loss": 0.4463,
"step": 1980
},
{
"epoch": 6.222570532915361,
"grad_norm": 0.875,
"learning_rate": 0.00014547054248739404,
"loss": 0.444,
"step": 1985
},
{
"epoch": 6.238244514106583,
"grad_norm": 0.96484375,
"learning_rate": 0.0001451453412171496,
"loss": 0.4513,
"step": 1990
},
{
"epoch": 6.253918495297806,
"grad_norm": 0.9140625,
"learning_rate": 0.000144819539182544,
"loss": 0.4494,
"step": 1995
},
{
"epoch": 6.269592476489028,
"grad_norm": 0.92578125,
"learning_rate": 0.00014449314071913533,
"loss": 0.4493,
"step": 2000
},
{
"epoch": 6.285266457680251,
"grad_norm": 0.95703125,
"learning_rate": 0.00014416615017041868,
"loss": 0.4605,
"step": 2005
},
{
"epoch": 6.300940438871473,
"grad_norm": 0.8515625,
"learning_rate": 0.00014383857188776807,
"loss": 0.4642,
"step": 2010
},
{
"epoch": 6.316614420062696,
"grad_norm": 0.91015625,
"learning_rate": 0.00014351041023037884,
"loss": 0.4526,
"step": 2015
},
{
"epoch": 6.332288401253918,
"grad_norm": 0.87109375,
"learning_rate": 0.00014318166956520936,
"loss": 0.4634,
"step": 2020
},
{
"epoch": 6.347962382445141,
"grad_norm": 0.89453125,
"learning_rate": 0.00014285235426692315,
"loss": 0.4608,
"step": 2025
},
{
"epoch": 6.363636363636363,
"grad_norm": 0.9296875,
"learning_rate": 0.00014252246871783051,
"loss": 0.4588,
"step": 2030
},
{
"epoch": 6.379310344827586,
"grad_norm": 0.83203125,
"learning_rate": 0.00014219201730783024,
"loss": 0.467,
"step": 2035
},
{
"epoch": 6.394984326018808,
"grad_norm": 0.97265625,
"learning_rate": 0.0001418610044343514,
"loss": 0.4641,
"step": 2040
},
{
"epoch": 6.410658307210031,
"grad_norm": 0.9609375,
"learning_rate": 0.00014152943450229443,
"loss": 0.4647,
"step": 2045
},
{
"epoch": 6.4263322884012535,
"grad_norm": 0.875,
"learning_rate": 0.00014119731192397284,
"loss": 0.4627,
"step": 2050
},
{
"epoch": 6.4420062695924765,
"grad_norm": 1.1796875,
"learning_rate": 0.0001408646411190544,
"loss": 0.4664,
"step": 2055
},
{
"epoch": 6.4576802507836994,
"grad_norm": 1.0625,
"learning_rate": 0.0001405314265145023,
"loss": 0.4697,
"step": 2060
},
{
"epoch": 6.4733542319749215,
"grad_norm": 0.95703125,
"learning_rate": 0.0001401976725445162,
"loss": 0.4664,
"step": 2065
},
{
"epoch": 6.4890282131661445,
"grad_norm": 0.9375,
"learning_rate": 0.0001398633836504734,
"loss": 0.4692,
"step": 2070
},
{
"epoch": 6.504702194357367,
"grad_norm": 0.8828125,
"learning_rate": 0.00013952856428086952,
"loss": 0.4658,
"step": 2075
},
{
"epoch": 6.52037617554859,
"grad_norm": 0.87109375,
"learning_rate": 0.00013919321889125941,
"loss": 0.4711,
"step": 2080
},
{
"epoch": 6.536050156739812,
"grad_norm": 0.859375,
"learning_rate": 0.0001388573519441979,
"loss": 0.4675,
"step": 2085
},
{
"epoch": 6.551724137931035,
"grad_norm": 0.94140625,
"learning_rate": 0.00013852096790918026,
"loss": 0.4677,
"step": 2090
},
{
"epoch": 6.567398119122257,
"grad_norm": 0.9140625,
"learning_rate": 0.00013818407126258293,
"loss": 0.4728,
"step": 2095
},
{
"epoch": 6.58307210031348,
"grad_norm": 2.75,
"learning_rate": 0.0001378466664876038,
"loss": 0.4769,
"step": 2100
},
{
"epoch": 6.598746081504702,
"grad_norm": 2.203125,
"learning_rate": 0.00013750875807420259,
"loss": 0.463,
"step": 2105
},
{
"epoch": 6.614420062695925,
"grad_norm": 1.1015625,
"learning_rate": 0.00013717035051904114,
"loss": 0.4663,
"step": 2110
},
{
"epoch": 6.630094043887147,
"grad_norm": 0.953125,
"learning_rate": 0.00013683144832542352,
"loss": 0.4699,
"step": 2115
},
{
"epoch": 6.64576802507837,
"grad_norm": 0.96875,
"learning_rate": 0.00013649205600323609,
"loss": 0.4703,
"step": 2120
},
{
"epoch": 6.661442006269592,
"grad_norm": 0.91015625,
"learning_rate": 0.00013615217806888755,
"loss": 0.4643,
"step": 2125
},
{
"epoch": 6.677115987460815,
"grad_norm": 1.078125,
"learning_rate": 0.0001358118190452488,
"loss": 0.4669,
"step": 2130
},
{
"epoch": 6.692789968652038,
"grad_norm": 0.9375,
"learning_rate": 0.0001354709834615928,
"loss": 0.4703,
"step": 2135
},
{
"epoch": 6.70846394984326,
"grad_norm": 0.875,
"learning_rate": 0.00013512967585353413,
"loss": 0.4714,
"step": 2140
},
{
"epoch": 6.724137931034483,
"grad_norm": 0.95703125,
"learning_rate": 0.00013478790076296892,
"loss": 0.4658,
"step": 2145
},
{
"epoch": 6.739811912225705,
"grad_norm": 0.97265625,
"learning_rate": 0.00013444566273801414,
"loss": 0.4649,
"step": 2150
},
{
"epoch": 6.755485893416928,
"grad_norm": 0.83984375,
"learning_rate": 0.00013410296633294727,
"loss": 0.4783,
"step": 2155
},
{
"epoch": 6.77115987460815,
"grad_norm": 1.171875,
"learning_rate": 0.00013375981610814545,
"loss": 0.4741,
"step": 2160
},
{
"epoch": 6.786833855799373,
"grad_norm": 0.89453125,
"learning_rate": 0.00013341621663002514,
"loss": 0.4651,
"step": 2165
},
{
"epoch": 6.802507836990595,
"grad_norm": 0.9765625,
"learning_rate": 0.0001330721724709811,
"loss": 0.4682,
"step": 2170
},
{
"epoch": 6.818181818181818,
"grad_norm": 0.8671875,
"learning_rate": 0.00013272768820932554,
"loss": 0.4761,
"step": 2175
},
{
"epoch": 6.83385579937304,
"grad_norm": 0.953125,
"learning_rate": 0.0001323827684292273,
"loss": 0.467,
"step": 2180
},
{
"epoch": 6.849529780564263,
"grad_norm": 0.9453125,
"learning_rate": 0.0001320374177206509,
"loss": 0.4719,
"step": 2185
},
{
"epoch": 6.8652037617554855,
"grad_norm": 0.87890625,
"learning_rate": 0.00013169164067929526,
"loss": 0.4829,
"step": 2190
},
{
"epoch": 6.8808777429467085,
"grad_norm": 0.9140625,
"learning_rate": 0.00013134544190653274,
"loss": 0.4743,
"step": 2195
},
{
"epoch": 6.896551724137931,
"grad_norm": 0.78515625,
"learning_rate": 0.00013099882600934773,
"loss": 0.4701,
"step": 2200
},
{
"epoch": 6.912225705329154,
"grad_norm": 0.94140625,
"learning_rate": 0.00013065179760027556,
"loss": 0.4703,
"step": 2205
},
{
"epoch": 6.927899686520377,
"grad_norm": 0.89453125,
"learning_rate": 0.00013030436129734082,
"loss": 0.4802,
"step": 2210
},
{
"epoch": 6.943573667711599,
"grad_norm": 1.2265625,
"learning_rate": 0.00012995652172399623,
"loss": 0.4781,
"step": 2215
},
{
"epoch": 6.959247648902822,
"grad_norm": 1.265625,
"learning_rate": 0.00012960828350906095,
"loss": 0.4838,
"step": 2220
},
{
"epoch": 6.974921630094044,
"grad_norm": 1.546875,
"learning_rate": 0.00012925965128665897,
"loss": 0.4751,
"step": 2225
},
{
"epoch": 6.990595611285267,
"grad_norm": 0.921875,
"learning_rate": 0.0001289106296961574,
"loss": 0.4817,
"step": 2230
},
{
"epoch": 7.0,
"eval_loss": 2.6759016513824463,
"eval_runtime": 0.7898,
"eval_samples_per_second": 2.532,
"eval_steps_per_second": 1.266,
"step": 2233
},
{
"epoch": 7.006269592476489,
"grad_norm": 0.78125,
"learning_rate": 0.00012856122338210493,
"loss": 0.4256,
"step": 2235
},
{
"epoch": 7.021943573667712,
"grad_norm": 0.8828125,
"learning_rate": 0.00012821143699416984,
"loss": 0.3775,
"step": 2240
},
{
"epoch": 7.037617554858934,
"grad_norm": 0.73828125,
"learning_rate": 0.00012786127518707818,
"loss": 0.3705,
"step": 2245
},
{
"epoch": 7.053291536050157,
"grad_norm": 0.859375,
"learning_rate": 0.00012751074262055178,
"loss": 0.3732,
"step": 2250
},
{
"epoch": 7.068965517241379,
"grad_norm": 0.86328125,
"learning_rate": 0.00012715984395924643,
"loss": 0.3737,
"step": 2255
},
{
"epoch": 7.084639498432602,
"grad_norm": 0.9296875,
"learning_rate": 0.00012680858387268952,
"loss": 0.3788,
"step": 2260
},
{
"epoch": 7.100313479623824,
"grad_norm": 0.890625,
"learning_rate": 0.00012645696703521818,
"loss": 0.3711,
"step": 2265
},
{
"epoch": 7.115987460815047,
"grad_norm": 1.015625,
"learning_rate": 0.00012610499812591673,
"loss": 0.3725,
"step": 2270
},
{
"epoch": 7.131661442006269,
"grad_norm": 0.90234375,
"learning_rate": 0.0001257526818285549,
"loss": 0.371,
"step": 2275
},
{
"epoch": 7.147335423197492,
"grad_norm": 0.9296875,
"learning_rate": 0.0001254000228315251,
"loss": 0.3751,
"step": 2280
},
{
"epoch": 7.163009404388715,
"grad_norm": 0.921875,
"learning_rate": 0.00012504702582778008,
"loss": 0.3798,
"step": 2285
},
{
"epoch": 7.178683385579937,
"grad_norm": 0.765625,
"learning_rate": 0.00012469369551477074,
"loss": 0.3826,
"step": 2290
},
{
"epoch": 7.19435736677116,
"grad_norm": 0.8671875,
"learning_rate": 0.0001243400365943833,
"loss": 0.3846,
"step": 2295
},
{
"epoch": 7.210031347962382,
"grad_norm": 0.87890625,
"learning_rate": 0.00012398605377287694,
"loss": 0.383,
"step": 2300
},
{
"epoch": 7.225705329153605,
"grad_norm": 0.78125,
"learning_rate": 0.000123631751760821,
"loss": 0.387,
"step": 2305
},
{
"epoch": 7.241379310344827,
"grad_norm": 0.796875,
"learning_rate": 0.00012327713527303255,
"loss": 0.3752,
"step": 2310
},
{
"epoch": 7.25705329153605,
"grad_norm": 0.9453125,
"learning_rate": 0.0001229222090285134,
"loss": 0.3832,
"step": 2315
},
{
"epoch": 7.2727272727272725,
"grad_norm": 0.84765625,
"learning_rate": 0.00012256697775038741,
"loss": 0.39,
"step": 2320
},
{
"epoch": 7.2884012539184955,
"grad_norm": 0.828125,
"learning_rate": 0.00012221144616583765,
"loss": 0.3902,
"step": 2325
},
{
"epoch": 7.304075235109718,
"grad_norm": 0.92578125,
"learning_rate": 0.00012185561900604341,
"loss": 0.376,
"step": 2330
},
{
"epoch": 7.3197492163009406,
"grad_norm": 0.84765625,
"learning_rate": 0.00012149950100611738,
"loss": 0.3873,
"step": 2335
},
{
"epoch": 7.335423197492163,
"grad_norm": 0.80078125,
"learning_rate": 0.00012114309690504249,
"loss": 0.388,
"step": 2340
},
{
"epoch": 7.351097178683386,
"grad_norm": 0.88671875,
"learning_rate": 0.00012078641144560898,
"loss": 0.3889,
"step": 2345
},
{
"epoch": 7.366771159874608,
"grad_norm": 0.796875,
"learning_rate": 0.00012042944937435116,
"loss": 0.395,
"step": 2350
},
{
"epoch": 7.382445141065831,
"grad_norm": 0.84765625,
"learning_rate": 0.00012007221544148435,
"loss": 0.3957,
"step": 2355
},
{
"epoch": 7.398119122257054,
"grad_norm": 0.80078125,
"learning_rate": 0.00011971471440084157,
"loss": 0.393,
"step": 2360
},
{
"epoch": 7.413793103448276,
"grad_norm": 0.91015625,
"learning_rate": 0.00011935695100981041,
"loss": 0.3884,
"step": 2365
},
{
"epoch": 7.429467084639499,
"grad_norm": 0.84375,
"learning_rate": 0.00011899893002926958,
"loss": 0.3907,
"step": 2370
},
{
"epoch": 7.445141065830721,
"grad_norm": 0.84375,
"learning_rate": 0.00011864065622352568,
"loss": 0.3865,
"step": 2375
},
{
"epoch": 7.460815047021944,
"grad_norm": 0.94140625,
"learning_rate": 0.00011828213436024968,
"loss": 0.3866,
"step": 2380
},
{
"epoch": 7.476489028213166,
"grad_norm": 0.83984375,
"learning_rate": 0.00011792336921041359,
"loss": 0.3878,
"step": 2385
},
{
"epoch": 7.492163009404389,
"grad_norm": 0.9140625,
"learning_rate": 0.00011756436554822685,
"loss": 0.3933,
"step": 2390
},
{
"epoch": 7.507836990595611,
"grad_norm": 0.89453125,
"learning_rate": 0.00011720512815107292,
"loss": 0.3825,
"step": 2395
},
{
"epoch": 7.523510971786834,
"grad_norm": 0.82421875,
"learning_rate": 0.00011684566179944567,
"loss": 0.3892,
"step": 2400
},
{
"epoch": 7.539184952978056,
"grad_norm": 0.83203125,
"learning_rate": 0.00011648597127688567,
"loss": 0.3918,
"step": 2405
},
{
"epoch": 7.554858934169279,
"grad_norm": 0.8515625,
"learning_rate": 0.00011612606136991665,
"loss": 0.3952,
"step": 2410
},
{
"epoch": 7.570532915360501,
"grad_norm": 0.84375,
"learning_rate": 0.00011576593686798181,
"loss": 0.3941,
"step": 2415
},
{
"epoch": 7.586206896551724,
"grad_norm": 1.1015625,
"learning_rate": 0.00011540560256337995,
"loss": 0.3996,
"step": 2420
},
{
"epoch": 7.601880877742946,
"grad_norm": 0.921875,
"learning_rate": 0.00011504506325120184,
"loss": 0.3973,
"step": 2425
},
{
"epoch": 7.617554858934169,
"grad_norm": 0.84765625,
"learning_rate": 0.0001146843237292663,
"loss": 0.3944,
"step": 2430
},
{
"epoch": 7.633228840125392,
"grad_norm": 0.921875,
"learning_rate": 0.0001143233887980565,
"loss": 0.4008,
"step": 2435
},
{
"epoch": 7.648902821316614,
"grad_norm": 0.83203125,
"learning_rate": 0.00011396226326065593,
"loss": 0.3967,
"step": 2440
},
{
"epoch": 7.664576802507837,
"grad_norm": 0.87890625,
"learning_rate": 0.00011360095192268454,
"loss": 0.3979,
"step": 2445
},
{
"epoch": 7.6802507836990594,
"grad_norm": 0.84765625,
"learning_rate": 0.00011323945959223477,
"loss": 0.3957,
"step": 2450
},
{
"epoch": 7.695924764890282,
"grad_norm": 0.8984375,
"learning_rate": 0.00011287779107980766,
"loss": 0.398,
"step": 2455
},
{
"epoch": 7.7115987460815045,
"grad_norm": 0.88671875,
"learning_rate": 0.0001125159511982487,
"loss": 0.3976,
"step": 2460
},
{
"epoch": 7.7272727272727275,
"grad_norm": 0.8671875,
"learning_rate": 0.00011215394476268387,
"loss": 0.3941,
"step": 2465
},
{
"epoch": 7.74294670846395,
"grad_norm": 1.0390625,
"learning_rate": 0.00011179177659045554,
"loss": 0.4061,
"step": 2470
},
{
"epoch": 7.758620689655173,
"grad_norm": 1.03125,
"learning_rate": 0.00011142945150105839,
"loss": 0.3991,
"step": 2475
},
{
"epoch": 7.774294670846395,
"grad_norm": 0.86328125,
"learning_rate": 0.00011106697431607518,
"loss": 0.3863,
"step": 2480
},
{
"epoch": 7.789968652037618,
"grad_norm": 0.90625,
"learning_rate": 0.00011070434985911271,
"loss": 0.3923,
"step": 2485
},
{
"epoch": 7.80564263322884,
"grad_norm": 0.84375,
"learning_rate": 0.0001103415829557376,
"loss": 0.3988,
"step": 2490
},
{
"epoch": 7.821316614420063,
"grad_norm": 0.8515625,
"learning_rate": 0.00010997867843341198,
"loss": 0.3868,
"step": 2495
},
{
"epoch": 7.836990595611285,
"grad_norm": 0.8984375,
"learning_rate": 0.0001096156411214294,
"loss": 0.3949,
"step": 2500
},
{
"epoch": 7.852664576802508,
"grad_norm": 0.765625,
"learning_rate": 0.00010925247585085044,
"loss": 0.4005,
"step": 2505
},
{
"epoch": 7.868338557993731,
"grad_norm": 0.84375,
"learning_rate": 0.00010888918745443845,
"loss": 0.3974,
"step": 2510
},
{
"epoch": 7.884012539184953,
"grad_norm": 0.921875,
"learning_rate": 0.00010852578076659535,
"loss": 0.4004,
"step": 2515
},
{
"epoch": 7.899686520376176,
"grad_norm": 0.80859375,
"learning_rate": 0.00010816226062329706,
"loss": 0.3997,
"step": 2520
},
{
"epoch": 7.915360501567398,
"grad_norm": 0.91796875,
"learning_rate": 0.00010779863186202943,
"loss": 0.4002,
"step": 2525
},
{
"epoch": 7.931034482758621,
"grad_norm": 0.92578125,
"learning_rate": 0.00010743489932172366,
"loss": 0.3973,
"step": 2530
},
{
"epoch": 7.946708463949843,
"grad_norm": 0.9453125,
"learning_rate": 0.00010707106784269196,
"loss": 0.3968,
"step": 2535
},
{
"epoch": 7.962382445141066,
"grad_norm": 0.953125,
"learning_rate": 0.00010670714226656315,
"loss": 0.397,
"step": 2540
},
{
"epoch": 7.978056426332288,
"grad_norm": 0.90625,
"learning_rate": 0.00010634312743621832,
"loss": 0.4065,
"step": 2545
},
{
"epoch": 7.993730407523511,
"grad_norm": 0.84375,
"learning_rate": 0.00010597902819572619,
"loss": 0.4016,
"step": 2550
},
{
"epoch": 8.0,
"eval_loss": 3.0132577419281006,
"eval_runtime": 0.7738,
"eval_samples_per_second": 2.585,
"eval_steps_per_second": 1.292,
"step": 2552
},
{
"epoch": 8.009404388714733,
"grad_norm": 0.76953125,
"learning_rate": 0.00010561484939027877,
"loss": 0.361,
"step": 2555
},
{
"epoch": 8.025078369905955,
"grad_norm": 0.88671875,
"learning_rate": 0.00010525059586612693,
"loss": 0.3253,
"step": 2560
},
{
"epoch": 8.04075235109718,
"grad_norm": 0.765625,
"learning_rate": 0.0001048862724705158,
"loss": 0.3205,
"step": 2565
},
{
"epoch": 8.056426332288401,
"grad_norm": 0.72265625,
"learning_rate": 0.00010452188405162033,
"loss": 0.3241,
"step": 2570
},
{
"epoch": 8.072100313479623,
"grad_norm": 0.828125,
"learning_rate": 0.00010415743545848072,
"loss": 0.3193,
"step": 2575
},
{
"epoch": 8.087774294670846,
"grad_norm": 0.79296875,
"learning_rate": 0.00010379293154093796,
"loss": 0.3212,
"step": 2580
},
{
"epoch": 8.10344827586207,
"grad_norm": 0.80859375,
"learning_rate": 0.00010342837714956928,
"loss": 0.3161,
"step": 2585
},
{
"epoch": 8.119122257053291,
"grad_norm": 0.765625,
"learning_rate": 0.00010306377713562354,
"loss": 0.321,
"step": 2590
},
{
"epoch": 8.134796238244514,
"grad_norm": 0.83203125,
"learning_rate": 0.00010269913635095676,
"loss": 0.3245,
"step": 2595
},
{
"epoch": 8.150470219435737,
"grad_norm": 0.83984375,
"learning_rate": 0.00010233445964796749,
"loss": 0.3279,
"step": 2600
},
{
"epoch": 8.16614420062696,
"grad_norm": 0.765625,
"learning_rate": 0.00010196975187953221,
"loss": 0.3233,
"step": 2605
},
{
"epoch": 8.181818181818182,
"grad_norm": 0.78515625,
"learning_rate": 0.00010160501789894086,
"loss": 0.3207,
"step": 2610
},
{
"epoch": 8.197492163009404,
"grad_norm": 0.7890625,
"learning_rate": 0.0001012402625598322,
"loss": 0.3217,
"step": 2615
},
{
"epoch": 8.213166144200628,
"grad_norm": 0.76953125,
"learning_rate": 0.00010087549071612919,
"loss": 0.3255,
"step": 2620
},
{
"epoch": 8.22884012539185,
"grad_norm": 0.82421875,
"learning_rate": 0.00010051070722197438,
"loss": 0.3256,
"step": 2625
},
{
"epoch": 8.244514106583072,
"grad_norm": 0.74609375,
"learning_rate": 0.0001001459169316654,
"loss": 0.3233,
"step": 2630
},
{
"epoch": 8.260188087774294,
"grad_norm": 0.8046875,
"learning_rate": 9.978112469959033e-05,
"loss": 0.3232,
"step": 2635
},
{
"epoch": 8.275862068965518,
"grad_norm": 0.80859375,
"learning_rate": 9.941633538016315e-05,
"loss": 0.3237,
"step": 2640
},
{
"epoch": 8.29153605015674,
"grad_norm": 0.79296875,
"learning_rate": 9.90515538277589e-05,
"loss": 0.3307,
"step": 2645
},
{
"epoch": 8.307210031347962,
"grad_norm": 0.765625,
"learning_rate": 9.868678489664945e-05,
"loss": 0.3228,
"step": 2650
},
{
"epoch": 8.322884012539184,
"grad_norm": 0.75390625,
"learning_rate": 9.832203344093855e-05,
"loss": 0.3238,
"step": 2655
},
{
"epoch": 8.338557993730408,
"grad_norm": 0.78125,
"learning_rate": 9.795730431449759e-05,
"loss": 0.3302,
"step": 2660
},
{
"epoch": 8.35423197492163,
"grad_norm": 0.734375,
"learning_rate": 9.759260237090058e-05,
"loss": 0.3243,
"step": 2665
},
{
"epoch": 8.369905956112852,
"grad_norm": 0.84765625,
"learning_rate": 9.722793246336006e-05,
"loss": 0.3255,
"step": 2670
},
{
"epoch": 8.385579937304076,
"grad_norm": 0.7734375,
"learning_rate": 9.686329944466203e-05,
"loss": 0.323,
"step": 2675
},
{
"epoch": 8.401253918495298,
"grad_norm": 0.8125,
"learning_rate": 9.649870816710172e-05,
"loss": 0.3278,
"step": 2680
},
{
"epoch": 8.41692789968652,
"grad_norm": 0.875,
"learning_rate": 9.613416348241887e-05,
"loss": 0.3282,
"step": 2685
},
{
"epoch": 8.432601880877742,
"grad_norm": 0.90234375,
"learning_rate": 9.576967024173323e-05,
"loss": 0.3226,
"step": 2690
},
{
"epoch": 8.448275862068966,
"grad_norm": 0.80078125,
"learning_rate": 9.540523329547984e-05,
"loss": 0.3361,
"step": 2695
},
{
"epoch": 8.463949843260188,
"grad_norm": 0.8671875,
"learning_rate": 9.504085749334479e-05,
"loss": 0.3309,
"step": 2700
},
{
"epoch": 8.47962382445141,
"grad_norm": 0.74609375,
"learning_rate": 9.467654768420032e-05,
"loss": 0.325,
"step": 2705
},
{
"epoch": 8.495297805642632,
"grad_norm": 0.78515625,
"learning_rate": 9.431230871604067e-05,
"loss": 0.3265,
"step": 2710
},
{
"epoch": 8.510971786833856,
"grad_norm": 0.8203125,
"learning_rate": 9.394814543591719e-05,
"loss": 0.3302,
"step": 2715
},
{
"epoch": 8.526645768025078,
"grad_norm": 0.77734375,
"learning_rate": 9.358406268987417e-05,
"loss": 0.3299,
"step": 2720
},
{
"epoch": 8.5423197492163,
"grad_norm": 0.80859375,
"learning_rate": 9.322006532288411e-05,
"loss": 0.3303,
"step": 2725
},
{
"epoch": 8.557993730407524,
"grad_norm": 0.87109375,
"learning_rate": 9.285615817878342e-05,
"loss": 0.3246,
"step": 2730
},
{
"epoch": 8.573667711598747,
"grad_norm": 0.7578125,
"learning_rate": 9.249234610020779e-05,
"loss": 0.3256,
"step": 2735
},
{
"epoch": 8.589341692789969,
"grad_norm": 0.80859375,
"learning_rate": 9.212863392852793e-05,
"loss": 0.3286,
"step": 2740
},
{
"epoch": 8.60501567398119,
"grad_norm": 0.8359375,
"learning_rate": 9.176502650378499e-05,
"loss": 0.3301,
"step": 2745
},
{
"epoch": 8.620689655172415,
"grad_norm": 0.73828125,
"learning_rate": 9.140152866462629e-05,
"loss": 0.3345,
"step": 2750
},
{
"epoch": 8.636363636363637,
"grad_norm": 0.76171875,
"learning_rate": 9.103814524824073e-05,
"loss": 0.3335,
"step": 2755
},
{
"epoch": 8.652037617554859,
"grad_norm": 0.73828125,
"learning_rate": 9.067488109029474e-05,
"loss": 0.3287,
"step": 2760
},
{
"epoch": 8.66771159874608,
"grad_norm": 0.77734375,
"learning_rate": 9.031174102486752e-05,
"loss": 0.3286,
"step": 2765
},
{
"epoch": 8.683385579937305,
"grad_norm": 0.79296875,
"learning_rate": 8.994872988438711e-05,
"loss": 0.3283,
"step": 2770
},
{
"epoch": 8.699059561128527,
"grad_norm": 0.80078125,
"learning_rate": 8.958585249956578e-05,
"loss": 0.3308,
"step": 2775
},
{
"epoch": 8.714733542319749,
"grad_norm": 0.859375,
"learning_rate": 8.922311369933598e-05,
"loss": 0.3308,
"step": 2780
},
{
"epoch": 8.730407523510971,
"grad_norm": 0.859375,
"learning_rate": 8.886051831078582e-05,
"loss": 0.3314,
"step": 2785
},
{
"epoch": 8.746081504702195,
"grad_norm": 0.8046875,
"learning_rate": 8.849807115909513e-05,
"loss": 0.33,
"step": 2790
},
{
"epoch": 8.761755485893417,
"grad_norm": 0.86328125,
"learning_rate": 8.8135777067471e-05,
"loss": 0.329,
"step": 2795
},
{
"epoch": 8.77742946708464,
"grad_norm": 0.84375,
"learning_rate": 8.777364085708378e-05,
"loss": 0.3356,
"step": 2800
},
{
"epoch": 8.793103448275861,
"grad_norm": 0.83984375,
"learning_rate": 8.741166734700273e-05,
"loss": 0.3384,
"step": 2805
},
{
"epoch": 8.808777429467085,
"grad_norm": 0.80859375,
"learning_rate": 8.704986135413212e-05,
"loss": 0.3324,
"step": 2810
},
{
"epoch": 8.824451410658307,
"grad_norm": 0.78125,
"learning_rate": 8.668822769314691e-05,
"loss": 0.3291,
"step": 2815
},
{
"epoch": 8.84012539184953,
"grad_norm": 0.78515625,
"learning_rate": 8.632677117642892e-05,
"loss": 0.3358,
"step": 2820
},
{
"epoch": 8.855799373040753,
"grad_norm": 0.90625,
"learning_rate": 8.596549661400248e-05,
"loss": 0.3331,
"step": 2825
},
{
"epoch": 8.871473354231975,
"grad_norm": 0.7890625,
"learning_rate": 8.560440881347071e-05,
"loss": 0.3306,
"step": 2830
},
{
"epoch": 8.887147335423197,
"grad_norm": 0.796875,
"learning_rate": 8.524351257995135e-05,
"loss": 0.3322,
"step": 2835
},
{
"epoch": 8.90282131661442,
"grad_norm": 0.78515625,
"learning_rate": 8.488281271601302e-05,
"loss": 0.3246,
"step": 2840
},
{
"epoch": 8.918495297805643,
"grad_norm": 0.81640625,
"learning_rate": 8.452231402161099e-05,
"loss": 0.3339,
"step": 2845
},
{
"epoch": 8.934169278996865,
"grad_norm": 0.74609375,
"learning_rate": 8.416202129402371e-05,
"loss": 0.3291,
"step": 2850
},
{
"epoch": 8.949843260188088,
"grad_norm": 0.765625,
"learning_rate": 8.380193932778857e-05,
"loss": 0.3268,
"step": 2855
},
{
"epoch": 8.96551724137931,
"grad_norm": 0.84375,
"learning_rate": 8.344207291463843e-05,
"loss": 0.3258,
"step": 2860
},
{
"epoch": 8.981191222570533,
"grad_norm": 0.77734375,
"learning_rate": 8.30824268434376e-05,
"loss": 0.3345,
"step": 2865
},
{
"epoch": 8.996865203761756,
"grad_norm": 0.796875,
"learning_rate": 8.27230059001184e-05,
"loss": 0.327,
"step": 2870
},
{
"epoch": 9.0,
"eval_loss": 3.4537088871002197,
"eval_runtime": 0.7983,
"eval_samples_per_second": 2.505,
"eval_steps_per_second": 1.253,
"step": 2871
},
{
"epoch": 9.012539184952978,
"grad_norm": 0.5625,
"learning_rate": 8.2363814867617e-05,
"loss": 0.2932,
"step": 2875
},
{
"epoch": 9.0282131661442,
"grad_norm": 0.7734375,
"learning_rate": 8.200485852581036e-05,
"loss": 0.2798,
"step": 2880
},
{
"epoch": 9.043887147335424,
"grad_norm": 0.78125,
"learning_rate": 8.16461416514522e-05,
"loss": 0.2751,
"step": 2885
},
{
"epoch": 9.059561128526646,
"grad_norm": 0.71484375,
"learning_rate": 8.12876690181096e-05,
"loss": 0.2771,
"step": 2890
},
{
"epoch": 9.075235109717868,
"grad_norm": 0.66796875,
"learning_rate": 8.092944539609937e-05,
"loss": 0.281,
"step": 2895
},
{
"epoch": 9.090909090909092,
"grad_norm": 0.7109375,
"learning_rate": 8.057147555242473e-05,
"loss": 0.2709,
"step": 2900
},
{
"epoch": 9.106583072100314,
"grad_norm": 0.6875,
"learning_rate": 8.021376425071175e-05,
"loss": 0.2787,
"step": 2905
},
{
"epoch": 9.122257053291536,
"grad_norm": 0.64453125,
"learning_rate": 7.985631625114603e-05,
"loss": 0.2817,
"step": 2910
},
{
"epoch": 9.137931034482758,
"grad_norm": 0.68359375,
"learning_rate": 7.94991363104092e-05,
"loss": 0.2798,
"step": 2915
},
{
"epoch": 9.153605015673982,
"grad_norm": 0.72265625,
"learning_rate": 7.914222918161589e-05,
"loss": 0.2804,
"step": 2920
},
{
"epoch": 9.169278996865204,
"grad_norm": 0.71484375,
"learning_rate": 7.878559961425025e-05,
"loss": 0.2778,
"step": 2925
},
{
"epoch": 9.184952978056426,
"grad_norm": 0.67578125,
"learning_rate": 7.842925235410288e-05,
"loss": 0.2832,
"step": 2930
},
{
"epoch": 9.200626959247648,
"grad_norm": 0.703125,
"learning_rate": 7.807319214320747e-05,
"loss": 0.2781,
"step": 2935
},
{
"epoch": 9.216300940438872,
"grad_norm": 0.67578125,
"learning_rate": 7.771742371977811e-05,
"loss": 0.277,
"step": 2940
},
{
"epoch": 9.231974921630094,
"grad_norm": 0.69921875,
"learning_rate": 7.73619518181457e-05,
"loss": 0.2701,
"step": 2945
},
{
"epoch": 9.247648902821316,
"grad_norm": 0.73046875,
"learning_rate": 7.700678116869543e-05,
"loss": 0.2771,
"step": 2950
},
{
"epoch": 9.263322884012538,
"grad_norm": 0.7265625,
"learning_rate": 7.66519164978035e-05,
"loss": 0.2738,
"step": 2955
},
{
"epoch": 9.278996865203762,
"grad_norm": 0.77734375,
"learning_rate": 7.629736252777445e-05,
"loss": 0.2735,
"step": 2960
},
{
"epoch": 9.294670846394984,
"grad_norm": 0.6875,
"learning_rate": 7.594312397677809e-05,
"loss": 0.2773,
"step": 2965
},
{
"epoch": 9.310344827586206,
"grad_norm": 0.68359375,
"learning_rate": 7.558920555878696e-05,
"loss": 0.2764,
"step": 2970
},
{
"epoch": 9.32601880877743,
"grad_norm": 0.83984375,
"learning_rate": 7.523561198351342e-05,
"loss": 0.2828,
"step": 2975
},
{
"epoch": 9.341692789968652,
"grad_norm": 0.74609375,
"learning_rate": 7.48823479563471e-05,
"loss": 0.2834,
"step": 2980
},
{
"epoch": 9.357366771159874,
"grad_norm": 0.87109375,
"learning_rate": 7.452941817829212e-05,
"loss": 0.2848,
"step": 2985
},
{
"epoch": 9.373040752351097,
"grad_norm": 0.66015625,
"learning_rate": 7.417682734590469e-05,
"loss": 0.2801,
"step": 2990
},
{
"epoch": 9.38871473354232,
"grad_norm": 0.71875,
"learning_rate": 7.382458015123057e-05,
"loss": 0.2822,
"step": 2995
},
{
"epoch": 9.404388714733543,
"grad_norm": 0.69921875,
"learning_rate": 7.347268128174265e-05,
"loss": 0.2736,
"step": 3000
},
{
"epoch": 9.420062695924765,
"grad_norm": 0.703125,
"learning_rate": 7.31211354202784e-05,
"loss": 0.2801,
"step": 3005
},
{
"epoch": 9.435736677115987,
"grad_norm": 0.703125,
"learning_rate": 7.276994724497787e-05,
"loss": 0.2799,
"step": 3010
},
{
"epoch": 9.45141065830721,
"grad_norm": 0.6953125,
"learning_rate": 7.241912142922109e-05,
"loss": 0.2768,
"step": 3015
},
{
"epoch": 9.467084639498433,
"grad_norm": 0.72265625,
"learning_rate": 7.206866264156623e-05,
"loss": 0.2851,
"step": 3020
},
{
"epoch": 9.482758620689655,
"grad_norm": 0.78125,
"learning_rate": 7.171857554568706e-05,
"loss": 0.284,
"step": 3025
},
{
"epoch": 9.498432601880877,
"grad_norm": 0.73046875,
"learning_rate": 7.136886480031138e-05,
"loss": 0.2826,
"step": 3030
},
{
"epoch": 9.5141065830721,
"grad_norm": 0.7265625,
"learning_rate": 7.101953505915857e-05,
"loss": 0.2823,
"step": 3035
},
{
"epoch": 9.529780564263323,
"grad_norm": 0.7109375,
"learning_rate": 7.067059097087796e-05,
"loss": 0.2767,
"step": 3040
},
{
"epoch": 9.545454545454545,
"grad_norm": 0.71484375,
"learning_rate": 7.03220371789868e-05,
"loss": 0.2808,
"step": 3045
},
{
"epoch": 9.561128526645769,
"grad_norm": 0.6953125,
"learning_rate": 6.997387832180864e-05,
"loss": 0.2847,
"step": 3050
},
{
"epoch": 9.576802507836991,
"grad_norm": 0.7109375,
"learning_rate": 6.962611903241132e-05,
"loss": 0.2829,
"step": 3055
},
{
"epoch": 9.592476489028213,
"grad_norm": 0.6875,
"learning_rate": 6.92787639385457e-05,
"loss": 0.2815,
"step": 3060
},
{
"epoch": 9.608150470219435,
"grad_norm": 0.94140625,
"learning_rate": 6.893181766258373e-05,
"loss": 0.281,
"step": 3065
},
{
"epoch": 9.623824451410659,
"grad_norm": 0.69140625,
"learning_rate": 6.858528482145716e-05,
"loss": 0.2807,
"step": 3070
},
{
"epoch": 9.639498432601881,
"grad_norm": 0.6953125,
"learning_rate": 6.823917002659596e-05,
"loss": 0.2884,
"step": 3075
},
{
"epoch": 9.655172413793103,
"grad_norm": 0.6484375,
"learning_rate": 6.789347788386706e-05,
"loss": 0.2789,
"step": 3080
},
{
"epoch": 9.670846394984325,
"grad_norm": 0.73828125,
"learning_rate": 6.754821299351299e-05,
"loss": 0.2733,
"step": 3085
},
{
"epoch": 9.68652037617555,
"grad_norm": 0.6953125,
"learning_rate": 6.720337995009076e-05,
"loss": 0.2876,
"step": 3090
},
{
"epoch": 9.702194357366771,
"grad_norm": 0.6875,
"learning_rate": 6.68589833424105e-05,
"loss": 0.2856,
"step": 3095
},
{
"epoch": 9.717868338557993,
"grad_norm": 0.77734375,
"learning_rate": 6.651502775347469e-05,
"loss": 0.2829,
"step": 3100
},
{
"epoch": 9.733542319749215,
"grad_norm": 0.71484375,
"learning_rate": 6.617151776041692e-05,
"loss": 0.2875,
"step": 3105
},
{
"epoch": 9.74921630094044,
"grad_norm": 0.69140625,
"learning_rate": 6.582845793444119e-05,
"loss": 0.2746,
"step": 3110
},
{
"epoch": 9.764890282131661,
"grad_norm": 0.703125,
"learning_rate": 6.548585284076084e-05,
"loss": 0.288,
"step": 3115
},
{
"epoch": 9.780564263322884,
"grad_norm": 0.7421875,
"learning_rate": 6.514370703853806e-05,
"loss": 0.2807,
"step": 3120
},
{
"epoch": 9.796238244514107,
"grad_norm": 0.7265625,
"learning_rate": 6.480202508082298e-05,
"loss": 0.2858,
"step": 3125
},
{
"epoch": 9.81191222570533,
"grad_norm": 0.69921875,
"learning_rate": 6.44608115144933e-05,
"loss": 0.2793,
"step": 3130
},
{
"epoch": 9.827586206896552,
"grad_norm": 0.7109375,
"learning_rate": 6.412007088019364e-05,
"loss": 0.2818,
"step": 3135
},
{
"epoch": 9.843260188087774,
"grad_norm": 0.75390625,
"learning_rate": 6.377980771227509e-05,
"loss": 0.2864,
"step": 3140
},
{
"epoch": 9.858934169278998,
"grad_norm": 0.73046875,
"learning_rate": 6.344002653873504e-05,
"loss": 0.2819,
"step": 3145
},
{
"epoch": 9.87460815047022,
"grad_norm": 0.75,
"learning_rate": 6.31007318811567e-05,
"loss": 0.2833,
"step": 3150
},
{
"epoch": 9.890282131661442,
"grad_norm": 0.7265625,
"learning_rate": 6.276192825464918e-05,
"loss": 0.2888,
"step": 3155
},
{
"epoch": 9.905956112852664,
"grad_norm": 0.66796875,
"learning_rate": 6.242362016778713e-05,
"loss": 0.2819,
"step": 3160
},
{
"epoch": 9.921630094043888,
"grad_norm": 0.703125,
"learning_rate": 6.208581212255104e-05,
"loss": 0.2858,
"step": 3165
},
{
"epoch": 9.93730407523511,
"grad_norm": 0.6875,
"learning_rate": 6.1748508614267e-05,
"loss": 0.2793,
"step": 3170
},
{
"epoch": 9.952978056426332,
"grad_norm": 0.73046875,
"learning_rate": 6.141171413154722e-05,
"loss": 0.2795,
"step": 3175
},
{
"epoch": 9.968652037617554,
"grad_norm": 0.6640625,
"learning_rate": 6.107543315623001e-05,
"loss": 0.2794,
"step": 3180
},
{
"epoch": 9.984326018808778,
"grad_norm": 0.7578125,
"learning_rate": 6.073967016332041e-05,
"loss": 0.2833,
"step": 3185
},
{
"epoch": 10.0,
"grad_norm": 0.6484375,
"learning_rate": 6.040442962093029e-05,
"loss": 0.2814,
"step": 3190
},
{
"epoch": 10.0,
"eval_loss": 3.8273370265960693,
"eval_runtime": 0.8006,
"eval_samples_per_second": 2.498,
"eval_steps_per_second": 1.249,
"step": 3190
},
{
"epoch": 10.015673981191222,
"grad_norm": 0.53515625,
"learning_rate": 6.006971599021928e-05,
"loss": 0.2526,
"step": 3195
},
{
"epoch": 10.031347962382446,
"grad_norm": 0.59375,
"learning_rate": 5.973553372533509e-05,
"loss": 0.2478,
"step": 3200
},
{
"epoch": 10.047021943573668,
"grad_norm": 0.6484375,
"learning_rate": 5.9401887273354475e-05,
"loss": 0.2487,
"step": 3205
},
{
"epoch": 10.06269592476489,
"grad_norm": 0.6640625,
"learning_rate": 5.9068781074223824e-05,
"loss": 0.2522,
"step": 3210
},
{
"epoch": 10.078369905956112,
"grad_norm": 0.6015625,
"learning_rate": 5.8736219560700324e-05,
"loss": 0.2503,
"step": 3215
},
{
"epoch": 10.094043887147336,
"grad_norm": 0.65625,
"learning_rate": 5.840420715829272e-05,
"loss": 0.247,
"step": 3220
},
{
"epoch": 10.109717868338558,
"grad_norm": 0.67578125,
"learning_rate": 5.807274828520266e-05,
"loss": 0.2521,
"step": 3225
},
{
"epoch": 10.12539184952978,
"grad_norm": 0.6640625,
"learning_rate": 5.774184735226571e-05,
"loss": 0.2484,
"step": 3230
},
{
"epoch": 10.141065830721002,
"grad_norm": 0.59375,
"learning_rate": 5.741150876289283e-05,
"loss": 0.2474,
"step": 3235
},
{
"epoch": 10.156739811912226,
"grad_norm": 0.58984375,
"learning_rate": 5.708173691301153e-05,
"loss": 0.2506,
"step": 3240
},
{
"epoch": 10.172413793103448,
"grad_norm": 0.61328125,
"learning_rate": 5.675253619100772e-05,
"loss": 0.2544,
"step": 3245
},
{
"epoch": 10.18808777429467,
"grad_norm": 0.66015625,
"learning_rate": 5.642391097766693e-05,
"loss": 0.249,
"step": 3250
},
{
"epoch": 10.203761755485893,
"grad_norm": 1.2578125,
"learning_rate": 5.609586564611631e-05,
"loss": 0.2535,
"step": 3255
},
{
"epoch": 10.219435736677116,
"grad_norm": 0.57421875,
"learning_rate": 5.576840456176631e-05,
"loss": 0.2504,
"step": 3260
},
{
"epoch": 10.235109717868339,
"grad_norm": 0.66015625,
"learning_rate": 5.544153208225265e-05,
"loss": 0.2524,
"step": 3265
},
{
"epoch": 10.25078369905956,
"grad_norm": 0.65234375,
"learning_rate": 5.511525255737815e-05,
"loss": 0.2549,
"step": 3270
},
{
"epoch": 10.266457680250785,
"grad_norm": 0.625,
"learning_rate": 5.478957032905514e-05,
"loss": 0.2521,
"step": 3275
},
{
"epoch": 10.282131661442007,
"grad_norm": 0.6171875,
"learning_rate": 5.446448973124736e-05,
"loss": 0.2531,
"step": 3280
},
{
"epoch": 10.297805642633229,
"grad_norm": 0.59375,
"learning_rate": 5.414001508991264e-05,
"loss": 0.2533,
"step": 3285
},
{
"epoch": 10.31347962382445,
"grad_norm": 0.63671875,
"learning_rate": 5.3816150722944916e-05,
"loss": 0.2544,
"step": 3290
},
{
"epoch": 10.329153605015675,
"grad_norm": 0.6328125,
"learning_rate": 5.3492900940117264e-05,
"loss": 0.2491,
"step": 3295
},
{
"epoch": 10.344827586206897,
"grad_norm": 0.63671875,
"learning_rate": 5.3170270043024015e-05,
"loss": 0.2522,
"step": 3300
},
{
"epoch": 10.360501567398119,
"grad_norm": 0.62890625,
"learning_rate": 5.284826232502399e-05,
"loss": 0.249,
"step": 3305
},
{
"epoch": 10.376175548589341,
"grad_norm": 0.77734375,
"learning_rate": 5.252688207118297e-05,
"loss": 0.2459,
"step": 3310
},
{
"epoch": 10.391849529780565,
"grad_norm": 0.625,
"learning_rate": 5.220613355821704e-05,
"loss": 0.2503,
"step": 3315
},
{
"epoch": 10.407523510971787,
"grad_norm": 0.6328125,
"learning_rate": 5.188602105443533e-05,
"loss": 0.2543,
"step": 3320
},
{
"epoch": 10.423197492163009,
"grad_norm": 0.65234375,
"learning_rate": 5.156654881968348e-05,
"loss": 0.2509,
"step": 3325
},
{
"epoch": 10.438871473354231,
"grad_norm": 0.65234375,
"learning_rate": 5.124772110528684e-05,
"loss": 0.2524,
"step": 3330
},
{
"epoch": 10.454545454545455,
"grad_norm": 0.6796875,
"learning_rate": 5.0929542153993926e-05,
"loss": 0.2525,
"step": 3335
},
{
"epoch": 10.470219435736677,
"grad_norm": 0.62890625,
"learning_rate": 5.061201619991984e-05,
"loss": 0.248,
"step": 3340
},
{
"epoch": 10.4858934169279,
"grad_norm": 0.72265625,
"learning_rate": 5.029514746849018e-05,
"loss": 0.2516,
"step": 3345
},
{
"epoch": 10.501567398119121,
"grad_norm": 0.75,
"learning_rate": 4.9978940176384514e-05,
"loss": 0.2509,
"step": 3350
},
{
"epoch": 10.517241379310345,
"grad_norm": 0.625,
"learning_rate": 4.966339853148055e-05,
"loss": 0.2506,
"step": 3355
},
{
"epoch": 10.532915360501567,
"grad_norm": 0.6171875,
"learning_rate": 4.934852673279787e-05,
"loss": 0.254,
"step": 3360
},
{
"epoch": 10.54858934169279,
"grad_norm": 0.63671875,
"learning_rate": 4.9034328970442275e-05,
"loss": 0.2519,
"step": 3365
},
{
"epoch": 10.564263322884013,
"grad_norm": 0.625,
"learning_rate": 4.8720809425549916e-05,
"loss": 0.2541,
"step": 3370
},
{
"epoch": 10.579937304075235,
"grad_norm": 0.6484375,
"learning_rate": 4.8407972270231704e-05,
"loss": 0.2563,
"step": 3375
},
{
"epoch": 10.595611285266457,
"grad_norm": 0.60546875,
"learning_rate": 4.809582166751765e-05,
"loss": 0.2529,
"step": 3380
},
{
"epoch": 10.61128526645768,
"grad_norm": 0.7265625,
"learning_rate": 4.778436177130173e-05,
"loss": 0.2546,
"step": 3385
},
{
"epoch": 10.626959247648903,
"grad_norm": 0.6328125,
"learning_rate": 4.747359672628631e-05,
"loss": 0.2545,
"step": 3390
},
{
"epoch": 10.642633228840126,
"grad_norm": 0.62890625,
"learning_rate": 4.7163530667927226e-05,
"loss": 0.2564,
"step": 3395
},
{
"epoch": 10.658307210031348,
"grad_norm": 0.66015625,
"learning_rate": 4.685416772237864e-05,
"loss": 0.2506,
"step": 3400
},
{
"epoch": 10.67398119122257,
"grad_norm": 0.6640625,
"learning_rate": 4.65455120064382e-05,
"loss": 0.2505,
"step": 3405
},
{
"epoch": 10.689655172413794,
"grad_norm": 0.68359375,
"learning_rate": 4.623756762749207e-05,
"loss": 0.2499,
"step": 3410
},
{
"epoch": 10.705329153605016,
"grad_norm": 0.6484375,
"learning_rate": 4.593033868346059e-05,
"loss": 0.2529,
"step": 3415
},
{
"epoch": 10.721003134796238,
"grad_norm": 0.63671875,
"learning_rate": 4.5623829262743414e-05,
"loss": 0.2549,
"step": 3420
},
{
"epoch": 10.736677115987462,
"grad_norm": 0.59375,
"learning_rate": 4.531804344416536e-05,
"loss": 0.2524,
"step": 3425
},
{
"epoch": 10.752351097178684,
"grad_norm": 0.6015625,
"learning_rate": 4.501298529692194e-05,
"loss": 0.2556,
"step": 3430
},
{
"epoch": 10.768025078369906,
"grad_norm": 0.64453125,
"learning_rate": 4.470865888052537e-05,
"loss": 0.2518,
"step": 3435
},
{
"epoch": 10.783699059561128,
"grad_norm": 0.640625,
"learning_rate": 4.4405068244750446e-05,
"loss": 0.2544,
"step": 3440
},
{
"epoch": 10.799373040752352,
"grad_norm": 0.61328125,
"learning_rate": 4.410221742958073e-05,
"loss": 0.2493,
"step": 3445
},
{
"epoch": 10.815047021943574,
"grad_norm": 0.76953125,
"learning_rate": 4.380011046515461e-05,
"loss": 0.2556,
"step": 3450
},
{
"epoch": 10.830721003134796,
"grad_norm": 0.6640625,
"learning_rate": 4.349875137171196e-05,
"loss": 0.2525,
"step": 3455
},
{
"epoch": 10.846394984326018,
"grad_norm": 0.63671875,
"learning_rate": 4.3198144159540346e-05,
"loss": 0.2532,
"step": 3460
},
{
"epoch": 10.862068965517242,
"grad_norm": 0.625,
"learning_rate": 4.289829282892188e-05,
"loss": 0.2494,
"step": 3465
},
{
"epoch": 10.877742946708464,
"grad_norm": 0.61328125,
"learning_rate": 4.2599201370079875e-05,
"loss": 0.2495,
"step": 3470
},
{
"epoch": 10.893416927899686,
"grad_norm": 0.6484375,
"learning_rate": 4.230087376312582e-05,
"loss": 0.2483,
"step": 3475
},
{
"epoch": 10.909090909090908,
"grad_norm": 0.71875,
"learning_rate": 4.2003313978006244e-05,
"loss": 0.2505,
"step": 3480
},
{
"epoch": 10.924764890282132,
"grad_norm": 0.609375,
"learning_rate": 4.170652597445016e-05,
"loss": 0.2499,
"step": 3485
},
{
"epoch": 10.940438871473354,
"grad_norm": 0.69921875,
"learning_rate": 4.1410513701916086e-05,
"loss": 0.2501,
"step": 3490
},
{
"epoch": 10.956112852664576,
"grad_norm": 0.66796875,
"learning_rate": 4.111528109953975e-05,
"loss": 0.2547,
"step": 3495
},
{
"epoch": 10.971786833855798,
"grad_norm": 0.60546875,
"learning_rate": 4.0820832096081415e-05,
"loss": 0.252,
"step": 3500
},
{
"epoch": 10.987460815047022,
"grad_norm": 0.625,
"learning_rate": 4.052717060987386e-05,
"loss": 0.2539,
"step": 3505
},
{
"epoch": 11.0,
"eval_loss": 4.135468006134033,
"eval_runtime": 0.805,
"eval_samples_per_second": 2.485,
"eval_steps_per_second": 1.242,
"step": 3509
},
{
"epoch": 11.003134796238244,
"grad_norm": 0.5078125,
"learning_rate": 4.023430054876999e-05,
"loss": 0.2514,
"step": 3510
},
{
"epoch": 11.018808777429467,
"grad_norm": 0.52734375,
"learning_rate": 3.994222581009107e-05,
"loss": 0.2418,
"step": 3515
},
{
"epoch": 11.03448275862069,
"grad_norm": 0.5390625,
"learning_rate": 3.965095028057461e-05,
"loss": 0.2369,
"step": 3520
},
{
"epoch": 11.050156739811912,
"grad_norm": 0.57421875,
"learning_rate": 3.936047783632286e-05,
"loss": 0.2349,
"step": 3525
},
{
"epoch": 11.065830721003135,
"grad_norm": 0.6015625,
"learning_rate": 3.907081234275109e-05,
"loss": 0.2392,
"step": 3530
},
{
"epoch": 11.081504702194357,
"grad_norm": 0.5859375,
"learning_rate": 3.878195765453626e-05,
"loss": 0.2316,
"step": 3535
},
{
"epoch": 11.09717868338558,
"grad_norm": 0.53515625,
"learning_rate": 3.849391761556559e-05,
"loss": 0.2321,
"step": 3540
},
{
"epoch": 11.112852664576803,
"grad_norm": 0.52734375,
"learning_rate": 3.820669605888556e-05,
"loss": 0.235,
"step": 3545
},
{
"epoch": 11.128526645768025,
"grad_norm": 0.55078125,
"learning_rate": 3.79202968066508e-05,
"loss": 0.2333,
"step": 3550
},
{
"epoch": 11.144200626959247,
"grad_norm": 0.55078125,
"learning_rate": 3.7634723670073294e-05,
"loss": 0.2376,
"step": 3555
},
{
"epoch": 11.15987460815047,
"grad_norm": 0.6171875,
"learning_rate": 3.7349980449371516e-05,
"loss": 0.2379,
"step": 3560
},
{
"epoch": 11.175548589341693,
"grad_norm": 0.5625,
"learning_rate": 3.706607093372012e-05,
"loss": 0.2344,
"step": 3565
},
{
"epoch": 11.191222570532915,
"grad_norm": 0.55078125,
"learning_rate": 3.67829989011992e-05,
"loss": 0.2355,
"step": 3570
},
{
"epoch": 11.206896551724139,
"grad_norm": 0.5703125,
"learning_rate": 3.65007681187443e-05,
"loss": 0.2332,
"step": 3575
},
{
"epoch": 11.22257053291536,
"grad_norm": 0.6015625,
"learning_rate": 3.621938234209613e-05,
"loss": 0.2346,
"step": 3580
},
{
"epoch": 11.238244514106583,
"grad_norm": 0.6015625,
"learning_rate": 3.5938845315750666e-05,
"loss": 0.2385,
"step": 3585
},
{
"epoch": 11.253918495297805,
"grad_norm": 0.5546875,
"learning_rate": 3.565916077290914e-05,
"loss": 0.2367,
"step": 3590
},
{
"epoch": 11.269592476489029,
"grad_norm": 0.66796875,
"learning_rate": 3.5380332435428655e-05,
"loss": 0.2383,
"step": 3595
},
{
"epoch": 11.285266457680251,
"grad_norm": 0.62890625,
"learning_rate": 3.510236401377236e-05,
"loss": 0.2373,
"step": 3600
},
{
"epoch": 11.300940438871473,
"grad_norm": 0.6015625,
"learning_rate": 3.482525920696036e-05,
"loss": 0.2352,
"step": 3605
},
{
"epoch": 11.316614420062695,
"grad_norm": 0.5234375,
"learning_rate": 3.454902170252019e-05,
"loss": 0.2343,
"step": 3610
},
{
"epoch": 11.33228840125392,
"grad_norm": 0.64453125,
"learning_rate": 3.4273655176438014e-05,
"loss": 0.2317,
"step": 3615
},
{
"epoch": 11.347962382445141,
"grad_norm": 0.59765625,
"learning_rate": 3.3999163293109534e-05,
"loss": 0.2375,
"step": 3620
},
{
"epoch": 11.363636363636363,
"grad_norm": 0.625,
"learning_rate": 3.372554970529137e-05,
"loss": 0.2384,
"step": 3625
},
{
"epoch": 11.379310344827585,
"grad_norm": 0.63671875,
"learning_rate": 3.345281805405219e-05,
"loss": 0.2385,
"step": 3630
},
{
"epoch": 11.39498432601881,
"grad_norm": 0.63671875,
"learning_rate": 3.318097196872464e-05,
"loss": 0.2429,
"step": 3635
},
{
"epoch": 11.410658307210031,
"grad_norm": 0.58984375,
"learning_rate": 3.291001506685666e-05,
"loss": 0.2351,
"step": 3640
},
{
"epoch": 11.426332288401253,
"grad_norm": 0.5546875,
"learning_rate": 3.2639950954163644e-05,
"loss": 0.2377,
"step": 3645
},
{
"epoch": 11.442006269592476,
"grad_norm": 0.72265625,
"learning_rate": 3.23707832244803e-05,
"loss": 0.2353,
"step": 3650
},
{
"epoch": 11.4576802507837,
"grad_norm": 0.5390625,
"learning_rate": 3.2102515459712876e-05,
"loss": 0.2361,
"step": 3655
},
{
"epoch": 11.473354231974922,
"grad_norm": 0.61328125,
"learning_rate": 3.1835151229791435e-05,
"loss": 0.2357,
"step": 3660
},
{
"epoch": 11.489028213166144,
"grad_norm": 0.54296875,
"learning_rate": 3.1568694092622475e-05,
"loss": 0.2359,
"step": 3665
},
{
"epoch": 11.504702194357368,
"grad_norm": 0.5390625,
"learning_rate": 3.1303147594041394e-05,
"loss": 0.2333,
"step": 3670
},
{
"epoch": 11.52037617554859,
"grad_norm": 0.5859375,
"learning_rate": 3.1038515267765545e-05,
"loss": 0.236,
"step": 3675
},
{
"epoch": 11.536050156739812,
"grad_norm": 0.64453125,
"learning_rate": 3.0774800635346934e-05,
"loss": 0.2323,
"step": 3680
},
{
"epoch": 11.551724137931034,
"grad_norm": 0.578125,
"learning_rate": 3.0512007206125638e-05,
"loss": 0.2358,
"step": 3685
},
{
"epoch": 11.567398119122258,
"grad_norm": 0.6328125,
"learning_rate": 3.0250138477182886e-05,
"loss": 0.2379,
"step": 3690
},
{
"epoch": 11.58307210031348,
"grad_norm": 0.640625,
"learning_rate": 2.9989197933294687e-05,
"loss": 0.2353,
"step": 3695
},
{
"epoch": 11.598746081504702,
"grad_norm": 0.56640625,
"learning_rate": 2.9729189046885266e-05,
"loss": 0.2359,
"step": 3700
},
{
"epoch": 11.614420062695924,
"grad_norm": 0.5546875,
"learning_rate": 2.947011527798107e-05,
"loss": 0.2375,
"step": 3705
},
{
"epoch": 11.630094043887148,
"grad_norm": 0.609375,
"learning_rate": 2.9211980074164514e-05,
"loss": 0.2364,
"step": 3710
},
{
"epoch": 11.64576802507837,
"grad_norm": 0.62109375,
"learning_rate": 2.895478687052835e-05,
"loss": 0.2393,
"step": 3715
},
{
"epoch": 11.661442006269592,
"grad_norm": 0.59375,
"learning_rate": 2.8698539089629662e-05,
"loss": 0.235,
"step": 3720
},
{
"epoch": 11.677115987460816,
"grad_norm": 0.56640625,
"learning_rate": 2.844324014144457e-05,
"loss": 0.2341,
"step": 3725
},
{
"epoch": 11.692789968652038,
"grad_norm": 0.56640625,
"learning_rate": 2.818889342332275e-05,
"loss": 0.2299,
"step": 3730
},
{
"epoch": 11.70846394984326,
"grad_norm": 0.61328125,
"learning_rate": 2.793550231994222e-05,
"loss": 0.2388,
"step": 3735
},
{
"epoch": 11.724137931034482,
"grad_norm": 0.57421875,
"learning_rate": 2.768307020326425e-05,
"loss": 0.2341,
"step": 3740
},
{
"epoch": 11.739811912225706,
"grad_norm": 0.578125,
"learning_rate": 2.7431600432488657e-05,
"loss": 0.234,
"step": 3745
},
{
"epoch": 11.755485893416928,
"grad_norm": 0.60546875,
"learning_rate": 2.7181096354008884e-05,
"loss": 0.2364,
"step": 3750
},
{
"epoch": 11.77115987460815,
"grad_norm": 0.62890625,
"learning_rate": 2.6931561301367646e-05,
"loss": 0.2331,
"step": 3755
},
{
"epoch": 11.786833855799372,
"grad_norm": 0.60546875,
"learning_rate": 2.6682998595212505e-05,
"loss": 0.2357,
"step": 3760
},
{
"epoch": 11.802507836990596,
"grad_norm": 0.65625,
"learning_rate": 2.6435411543251677e-05,
"loss": 0.2389,
"step": 3765
},
{
"epoch": 11.818181818181818,
"grad_norm": 0.60546875,
"learning_rate": 2.6188803440209942e-05,
"loss": 0.241,
"step": 3770
},
{
"epoch": 11.83385579937304,
"grad_norm": 0.62109375,
"learning_rate": 2.5943177567785015e-05,
"loss": 0.2361,
"step": 3775
},
{
"epoch": 11.849529780564263,
"grad_norm": 0.578125,
"learning_rate": 2.5698537194603566e-05,
"loss": 0.2352,
"step": 3780
},
{
"epoch": 11.865203761755486,
"grad_norm": 0.58203125,
"learning_rate": 2.5454885576178067e-05,
"loss": 0.2389,
"step": 3785
},
{
"epoch": 11.880877742946709,
"grad_norm": 0.71875,
"learning_rate": 2.5212225954863132e-05,
"loss": 0.2367,
"step": 3790
},
{
"epoch": 11.89655172413793,
"grad_norm": 0.5703125,
"learning_rate": 2.4970561559812645e-05,
"loss": 0.2383,
"step": 3795
},
{
"epoch": 11.912225705329153,
"grad_norm": 0.61328125,
"learning_rate": 2.472989560693665e-05,
"loss": 0.2314,
"step": 3800
},
{
"epoch": 11.927899686520377,
"grad_norm": 0.66796875,
"learning_rate": 2.449023129885859e-05,
"loss": 0.2388,
"step": 3805
},
{
"epoch": 11.943573667711599,
"grad_norm": 0.56640625,
"learning_rate": 2.425157182487262e-05,
"loss": 0.2383,
"step": 3810
},
{
"epoch": 11.95924764890282,
"grad_norm": 0.67578125,
"learning_rate": 2.401392036090132e-05,
"loss": 0.2384,
"step": 3815
},
{
"epoch": 11.974921630094045,
"grad_norm": 0.62890625,
"learning_rate": 2.3777280069453245e-05,
"loss": 0.2358,
"step": 3820
},
{
"epoch": 11.990595611285267,
"grad_norm": 0.55859375,
"learning_rate": 2.3541654099581e-05,
"loss": 0.233,
"step": 3825
},
{
"epoch": 12.0,
"eval_loss": 4.354940891265869,
"eval_runtime": 0.795,
"eval_samples_per_second": 2.516,
"eval_steps_per_second": 1.258,
"step": 3828
},
{
"epoch": 12.006269592476489,
"grad_norm": 0.5546875,
"learning_rate": 2.330704558683926e-05,
"loss": 0.2349,
"step": 3830
},
{
"epoch": 12.021943573667711,
"grad_norm": 0.51171875,
"learning_rate": 2.307345765324306e-05,
"loss": 0.2297,
"step": 3835
},
{
"epoch": 12.037617554858935,
"grad_norm": 0.5390625,
"learning_rate": 2.284089340722618e-05,
"loss": 0.2337,
"step": 3840
},
{
"epoch": 12.053291536050157,
"grad_norm": 0.51953125,
"learning_rate": 2.2609355943599942e-05,
"loss": 0.2295,
"step": 3845
},
{
"epoch": 12.068965517241379,
"grad_norm": 0.578125,
"learning_rate": 2.2378848343511804e-05,
"loss": 0.2287,
"step": 3850
},
{
"epoch": 12.084639498432601,
"grad_norm": 0.5859375,
"learning_rate": 2.214937367440463e-05,
"loss": 0.2281,
"step": 3855
},
{
"epoch": 12.100313479623825,
"grad_norm": 0.56640625,
"learning_rate": 2.192093498997555e-05,
"loss": 0.2299,
"step": 3860
},
{
"epoch": 12.115987460815047,
"grad_norm": 0.58984375,
"learning_rate": 2.169353533013565e-05,
"loss": 0.2287,
"step": 3865
},
{
"epoch": 12.13166144200627,
"grad_norm": 0.5390625,
"learning_rate": 2.1467177720969268e-05,
"loss": 0.2281,
"step": 3870
},
{
"epoch": 12.147335423197493,
"grad_norm": 0.490234375,
"learning_rate": 2.12418651746939e-05,
"loss": 0.2238,
"step": 3875
},
{
"epoch": 12.163009404388715,
"grad_norm": 0.56640625,
"learning_rate": 2.101760068961992e-05,
"loss": 0.2331,
"step": 3880
},
{
"epoch": 12.178683385579937,
"grad_norm": 0.5234375,
"learning_rate": 2.0794387250110913e-05,
"loss": 0.2286,
"step": 3885
},
{
"epoch": 12.19435736677116,
"grad_norm": 0.53125,
"learning_rate": 2.0572227826543755e-05,
"loss": 0.2323,
"step": 3890
},
{
"epoch": 12.210031347962383,
"grad_norm": 0.5390625,
"learning_rate": 2.0351125375269264e-05,
"loss": 0.2269,
"step": 3895
},
{
"epoch": 12.225705329153605,
"grad_norm": 0.51171875,
"learning_rate": 2.0131082838572655e-05,
"loss": 0.2288,
"step": 3900
},
{
"epoch": 12.241379310344827,
"grad_norm": 0.546875,
"learning_rate": 1.991210314463461e-05,
"loss": 0.236,
"step": 3905
},
{
"epoch": 12.25705329153605,
"grad_norm": 0.54296875,
"learning_rate": 1.969418920749214e-05,
"loss": 0.229,
"step": 3910
},
{
"epoch": 12.272727272727273,
"grad_norm": 0.58203125,
"learning_rate": 1.9477343926999913e-05,
"loss": 0.2305,
"step": 3915
},
{
"epoch": 12.288401253918495,
"grad_norm": 0.52734375,
"learning_rate": 1.9261570188791555e-05,
"loss": 0.2287,
"step": 3920
},
{
"epoch": 12.304075235109718,
"grad_norm": 0.55859375,
"learning_rate": 1.9046870864241384e-05,
"loss": 0.2274,
"step": 3925
},
{
"epoch": 12.31974921630094,
"grad_norm": 0.52734375,
"learning_rate": 1.8833248810426073e-05,
"loss": 0.2282,
"step": 3930
},
{
"epoch": 12.335423197492164,
"grad_norm": 0.54296875,
"learning_rate": 1.8620706870086723e-05,
"loss": 0.2282,
"step": 3935
},
{
"epoch": 12.351097178683386,
"grad_norm": 0.578125,
"learning_rate": 1.8409247871591006e-05,
"loss": 0.2323,
"step": 3940
},
{
"epoch": 12.366771159874608,
"grad_norm": 0.57421875,
"learning_rate": 1.8198874628895524e-05,
"loss": 0.228,
"step": 3945
},
{
"epoch": 12.38244514106583,
"grad_norm": 0.55078125,
"learning_rate": 1.798958994150829e-05,
"loss": 0.2227,
"step": 3950
},
{
"epoch": 12.398119122257054,
"grad_norm": 0.50390625,
"learning_rate": 1.7781396594451637e-05,
"loss": 0.235,
"step": 3955
},
{
"epoch": 12.413793103448276,
"grad_norm": 0.55078125,
"learning_rate": 1.757429735822499e-05,
"loss": 0.2303,
"step": 3960
},
{
"epoch": 12.429467084639498,
"grad_norm": 0.6875,
"learning_rate": 1.7368294988768097e-05,
"loss": 0.2318,
"step": 3965
},
{
"epoch": 12.445141065830722,
"grad_norm": 0.59765625,
"learning_rate": 1.716339222742436e-05,
"loss": 0.2305,
"step": 3970
},
{
"epoch": 12.460815047021944,
"grad_norm": 0.54296875,
"learning_rate": 1.695959180090425e-05,
"loss": 0.2268,
"step": 3975
},
{
"epoch": 12.476489028213166,
"grad_norm": 0.65234375,
"learning_rate": 1.6756896421249168e-05,
"loss": 0.2279,
"step": 3980
},
{
"epoch": 12.492163009404388,
"grad_norm": 0.57421875,
"learning_rate": 1.6555308785795232e-05,
"loss": 0.2267,
"step": 3985
},
{
"epoch": 12.507836990595612,
"grad_norm": 0.59765625,
"learning_rate": 1.6354831577137485e-05,
"loss": 0.2287,
"step": 3990
},
{
"epoch": 12.523510971786834,
"grad_norm": 0.58984375,
"learning_rate": 1.6155467463094066e-05,
"loss": 0.2248,
"step": 3995
},
{
"epoch": 12.539184952978056,
"grad_norm": 0.5703125,
"learning_rate": 1.5957219096670883e-05,
"loss": 0.2299,
"step": 4000
},
{
"epoch": 12.554858934169278,
"grad_norm": 0.59765625,
"learning_rate": 1.576008911602609e-05,
"loss": 0.2288,
"step": 4005
},
{
"epoch": 12.570532915360502,
"grad_norm": 0.56640625,
"learning_rate": 1.5564080144435212e-05,
"loss": 0.2318,
"step": 4010
},
{
"epoch": 12.586206896551724,
"grad_norm": 0.5625,
"learning_rate": 1.536919479025609e-05,
"loss": 0.233,
"step": 4015
},
{
"epoch": 12.601880877742946,
"grad_norm": 0.55078125,
"learning_rate": 1.517543564689422e-05,
"loss": 0.2253,
"step": 4020
},
{
"epoch": 12.61755485893417,
"grad_norm": 0.60546875,
"learning_rate": 1.4982805292768165e-05,
"loss": 0.2266,
"step": 4025
},
{
"epoch": 12.633228840125392,
"grad_norm": 0.5546875,
"learning_rate": 1.4791306291275398e-05,
"loss": 0.2272,
"step": 4030
},
{
"epoch": 12.648902821316614,
"grad_norm": 0.55078125,
"learning_rate": 1.4600941190758022e-05,
"loss": 0.2304,
"step": 4035
},
{
"epoch": 12.664576802507836,
"grad_norm": 0.56640625,
"learning_rate": 1.4411712524469012e-05,
"loss": 0.2314,
"step": 4040
},
{
"epoch": 12.68025078369906,
"grad_norm": 0.59375,
"learning_rate": 1.4223622810538328e-05,
"loss": 0.2303,
"step": 4045
},
{
"epoch": 12.695924764890282,
"grad_norm": 0.5546875,
"learning_rate": 1.4036674551939599e-05,
"loss": 0.2323,
"step": 4050
},
{
"epoch": 12.711598746081505,
"grad_norm": 0.6015625,
"learning_rate": 1.385087023645667e-05,
"loss": 0.2307,
"step": 4055
},
{
"epoch": 12.727272727272727,
"grad_norm": 0.515625,
"learning_rate": 1.3666212336650586e-05,
"loss": 0.2235,
"step": 4060
},
{
"epoch": 12.74294670846395,
"grad_norm": 0.51953125,
"learning_rate": 1.3482703309826584e-05,
"loss": 0.2285,
"step": 4065
},
{
"epoch": 12.758620689655173,
"grad_norm": 0.578125,
"learning_rate": 1.330034559800154e-05,
"loss": 0.2311,
"step": 4070
},
{
"epoch": 12.774294670846395,
"grad_norm": 0.578125,
"learning_rate": 1.31191416278713e-05,
"loss": 0.2286,
"step": 4075
},
{
"epoch": 12.789968652037617,
"grad_norm": 0.55859375,
"learning_rate": 1.293909381077858e-05,
"loss": 0.2294,
"step": 4080
},
{
"epoch": 12.80564263322884,
"grad_norm": 0.58203125,
"learning_rate": 1.2760204542680654e-05,
"loss": 0.2309,
"step": 4085
},
{
"epoch": 12.821316614420063,
"grad_norm": 0.57421875,
"learning_rate": 1.2582476204117755e-05,
"loss": 0.2294,
"step": 4090
},
{
"epoch": 12.836990595611285,
"grad_norm": 0.5234375,
"learning_rate": 1.2405911160181072e-05,
"loss": 0.2241,
"step": 4095
},
{
"epoch": 12.852664576802507,
"grad_norm": 0.52734375,
"learning_rate": 1.2230511760481533e-05,
"loss": 0.2253,
"step": 4100
},
{
"epoch": 12.86833855799373,
"grad_norm": 0.5703125,
"learning_rate": 1.2056280339118397e-05,
"loss": 0.2358,
"step": 4105
},
{
"epoch": 12.884012539184953,
"grad_norm": 0.5625,
"learning_rate": 1.188321921464829e-05,
"loss": 0.229,
"step": 4110
},
{
"epoch": 12.899686520376175,
"grad_norm": 0.578125,
"learning_rate": 1.1711330690054211e-05,
"loss": 0.2299,
"step": 4115
},
{
"epoch": 12.915360501567399,
"grad_norm": 0.54296875,
"learning_rate": 1.1540617052715074e-05,
"loss": 0.2283,
"step": 4120
},
{
"epoch": 12.931034482758621,
"grad_norm": 0.55859375,
"learning_rate": 1.1371080574375114e-05,
"loss": 0.2297,
"step": 4125
},
{
"epoch": 12.946708463949843,
"grad_norm": 0.5859375,
"learning_rate": 1.1202723511113766e-05,
"loss": 0.2338,
"step": 4130
},
{
"epoch": 12.962382445141065,
"grad_norm": 0.6171875,
"learning_rate": 1.1035548103315484e-05,
"loss": 0.2337,
"step": 4135
},
{
"epoch": 12.978056426332289,
"grad_norm": 0.5703125,
"learning_rate": 1.086955657564015e-05,
"loss": 0.2294,
"step": 4140
},
{
"epoch": 12.993730407523511,
"grad_norm": 0.6328125,
"learning_rate": 1.0704751136993251e-05,
"loss": 0.2281,
"step": 4145
},
{
"epoch": 13.0,
"eval_loss": 4.455935478210449,
"eval_runtime": 0.8034,
"eval_samples_per_second": 2.489,
"eval_steps_per_second": 1.245,
"step": 4147
},
{
"epoch": 13.009404388714733,
"grad_norm": 0.50390625,
"learning_rate": 1.0541133980496686e-05,
"loss": 0.2257,
"step": 4150
},
{
"epoch": 13.025078369905955,
"grad_norm": 0.57421875,
"learning_rate": 1.0378707283459376e-05,
"loss": 0.2262,
"step": 4155
},
{
"epoch": 13.04075235109718,
"grad_norm": 0.49609375,
"learning_rate": 1.0217473207348483e-05,
"loss": 0.2247,
"step": 4160
},
{
"epoch": 13.056426332288401,
"grad_norm": 0.5859375,
"learning_rate": 1.0057433897760493e-05,
"loss": 0.2277,
"step": 4165
},
{
"epoch": 13.072100313479623,
"grad_norm": 0.53515625,
"learning_rate": 9.898591484392793e-06,
"loss": 0.2256,
"step": 4170
},
{
"epoch": 13.087774294670846,
"grad_norm": 0.5625,
"learning_rate": 9.74094808101519e-06,
"loss": 0.23,
"step": 4175
},
{
"epoch": 13.10344827586207,
"grad_norm": 0.60546875,
"learning_rate": 9.584505785441932e-06,
"loss": 0.2266,
"step": 4180
},
{
"epoch": 13.119122257053291,
"grad_norm": 0.55078125,
"learning_rate": 9.429266679503657e-06,
"loss": 0.2283,
"step": 4185
},
{
"epoch": 13.134796238244514,
"grad_norm": 0.515625,
"learning_rate": 9.275232829019787e-06,
"loss": 0.2257,
"step": 4190
},
{
"epoch": 13.150470219435737,
"grad_norm": 0.51171875,
"learning_rate": 9.122406283771002e-06,
"loss": 0.2307,
"step": 4195
},
{
"epoch": 13.16614420062696,
"grad_norm": 0.4921875,
"learning_rate": 8.970789077471953e-06,
"loss": 0.2259,
"step": 4200
},
{
"epoch": 13.181818181818182,
"grad_norm": 0.54296875,
"learning_rate": 8.82038322774419e-06,
"loss": 0.2286,
"step": 4205
},
{
"epoch": 13.197492163009404,
"grad_norm": 0.5625,
"learning_rate": 8.671190736089373e-06,
"loss": 0.2277,
"step": 4210
},
{
"epoch": 13.213166144200628,
"grad_norm": 0.73046875,
"learning_rate": 8.523213587862533e-06,
"loss": 0.2287,
"step": 4215
},
{
"epoch": 13.22884012539185,
"grad_norm": 0.55859375,
"learning_rate": 8.376453752245795e-06,
"loss": 0.2266,
"step": 4220
},
{
"epoch": 13.244514106583072,
"grad_norm": 0.54296875,
"learning_rate": 8.230913182222e-06,
"loss": 0.2264,
"step": 4225
},
{
"epoch": 13.260188087774294,
"grad_norm": 0.5625,
"learning_rate": 8.086593814548882e-06,
"loss": 0.2258,
"step": 4230
},
{
"epoch": 13.275862068965518,
"grad_norm": 0.5703125,
"learning_rate": 7.943497569733183e-06,
"loss": 0.2295,
"step": 4235
},
{
"epoch": 13.29153605015674,
"grad_norm": 0.578125,
"learning_rate": 7.801626352005186e-06,
"loss": 0.2254,
"step": 4240
},
{
"epoch": 13.307210031347962,
"grad_norm": 0.51171875,
"learning_rate": 7.66098204929323e-06,
"loss": 0.2284,
"step": 4245
},
{
"epoch": 13.322884012539184,
"grad_norm": 0.578125,
"learning_rate": 7.521566533198765e-06,
"loss": 0.2263,
"step": 4250
},
{
"epoch": 13.338557993730408,
"grad_norm": 0.53125,
"learning_rate": 7.383381658971311e-06,
"loss": 0.2279,
"step": 4255
},
{
"epoch": 13.35423197492163,
"grad_norm": 0.4921875,
"learning_rate": 7.246429265483856e-06,
"loss": 0.2238,
"step": 4260
},
{
"epoch": 13.369905956112852,
"grad_norm": 0.5234375,
"learning_rate": 7.1107111752083175e-06,
"loss": 0.2273,
"step": 4265
},
{
"epoch": 13.385579937304076,
"grad_norm": 0.53125,
"learning_rate": 6.976229194191352e-06,
"loss": 0.2244,
"step": 4270
},
{
"epoch": 13.401253918495298,
"grad_norm": 0.53515625,
"learning_rate": 6.842985112030253e-06,
"loss": 0.2256,
"step": 4275
},
{
"epoch": 13.41692789968652,
"grad_norm": 0.53125,
"learning_rate": 6.710980701849223e-06,
"loss": 0.2246,
"step": 4280
},
{
"epoch": 13.432601880877742,
"grad_norm": 0.498046875,
"learning_rate": 6.580217720275661e-06,
"loss": 0.2275,
"step": 4285
},
{
"epoch": 13.448275862068966,
"grad_norm": 0.5234375,
"learning_rate": 6.450697907416936e-06,
"loss": 0.2269,
"step": 4290
},
{
"epoch": 13.463949843260188,
"grad_norm": 0.5234375,
"learning_rate": 6.3224229868370845e-06,
"loss": 0.2311,
"step": 4295
},
{
"epoch": 13.47962382445141,
"grad_norm": 0.5859375,
"learning_rate": 6.19539466553396e-06,
"loss": 0.2254,
"step": 4300
},
{
"epoch": 13.495297805642632,
"grad_norm": 0.58984375,
"learning_rate": 6.0696146339165095e-06,
"loss": 0.2289,
"step": 4305
},
{
"epoch": 13.510971786833856,
"grad_norm": 0.55078125,
"learning_rate": 5.945084565782277e-06,
"loss": 0.2319,
"step": 4310
},
{
"epoch": 13.526645768025078,
"grad_norm": 0.498046875,
"learning_rate": 5.82180611829507e-06,
"loss": 0.224,
"step": 4315
},
{
"epoch": 13.5423197492163,
"grad_norm": 0.5703125,
"learning_rate": 5.699780931963006e-06,
"loss": 0.2297,
"step": 4320
},
{
"epoch": 13.557993730407524,
"grad_norm": 0.54296875,
"learning_rate": 5.5790106306165766e-06,
"loss": 0.228,
"step": 4325
},
{
"epoch": 13.573667711598747,
"grad_norm": 0.6171875,
"learning_rate": 5.459496821387166e-06,
"loss": 0.2315,
"step": 4330
},
{
"epoch": 13.589341692789969,
"grad_norm": 0.53515625,
"learning_rate": 5.341241094685523e-06,
"loss": 0.2283,
"step": 4335
},
{
"epoch": 13.60501567398119,
"grad_norm": 0.6953125,
"learning_rate": 5.2242450241806964e-06,
"loss": 0.2277,
"step": 4340
},
{
"epoch": 13.620689655172415,
"grad_norm": 0.55859375,
"learning_rate": 5.108510166779068e-06,
"loss": 0.2264,
"step": 4345
},
{
"epoch": 13.636363636363637,
"grad_norm": 0.51953125,
"learning_rate": 4.994038062603645e-06,
"loss": 0.232,
"step": 4350
},
{
"epoch": 13.652037617554859,
"grad_norm": 0.546875,
"learning_rate": 4.880830234973499e-06,
"loss": 0.2254,
"step": 4355
},
{
"epoch": 13.66771159874608,
"grad_norm": 0.58984375,
"learning_rate": 4.7688881903835915e-06,
"loss": 0.2277,
"step": 4360
},
{
"epoch": 13.683385579937305,
"grad_norm": 0.515625,
"learning_rate": 4.658213418484636e-06,
"loss": 0.2292,
"step": 4365
},
{
"epoch": 13.699059561128527,
"grad_norm": 0.5,
"learning_rate": 4.548807392063359e-06,
"loss": 0.2246,
"step": 4370
},
{
"epoch": 13.714733542319749,
"grad_norm": 0.51171875,
"learning_rate": 4.4406715670228474e-06,
"loss": 0.226,
"step": 4375
},
{
"epoch": 13.730407523510971,
"grad_norm": 0.5546875,
"learning_rate": 4.333807382363197e-06,
"loss": 0.2292,
"step": 4380
},
{
"epoch": 13.746081504702195,
"grad_norm": 0.55078125,
"learning_rate": 4.22821626016231e-06,
"loss": 0.2289,
"step": 4385
},
{
"epoch": 13.761755485893417,
"grad_norm": 0.5234375,
"learning_rate": 4.123899605557091e-06,
"loss": 0.2247,
"step": 4390
},
{
"epoch": 13.77742946708464,
"grad_norm": 0.55078125,
"learning_rate": 4.020858806724592e-06,
"loss": 0.2221,
"step": 4395
},
{
"epoch": 13.793103448275861,
"grad_norm": 0.55078125,
"learning_rate": 3.91909523486369e-06,
"loss": 0.2277,
"step": 4400
},
{
"epoch": 13.808777429467085,
"grad_norm": 0.55859375,
"learning_rate": 3.818610244176702e-06,
"loss": 0.2298,
"step": 4405
},
{
"epoch": 13.824451410658307,
"grad_norm": 0.625,
"learning_rate": 3.719405171851487e-06,
"loss": 0.2223,
"step": 4410
},
{
"epoch": 13.84012539184953,
"grad_norm": 0.546875,
"learning_rate": 3.621481338043564e-06,
"loss": 0.2269,
"step": 4415
},
{
"epoch": 13.855799373040753,
"grad_norm": 0.57421875,
"learning_rate": 3.5248400458586127e-06,
"loss": 0.2266,
"step": 4420
},
{
"epoch": 13.871473354231975,
"grad_norm": 0.546875,
"learning_rate": 3.429482581335053e-06,
"loss": 0.2296,
"step": 4425
},
{
"epoch": 13.887147335423197,
"grad_norm": 0.54296875,
"learning_rate": 3.3354102134269927e-06,
"loss": 0.2281,
"step": 4430
},
{
"epoch": 13.90282131661442,
"grad_norm": 0.5625,
"learning_rate": 3.2426241939873313e-06,
"loss": 0.2288,
"step": 4435
},
{
"epoch": 13.918495297805643,
"grad_norm": 0.62109375,
"learning_rate": 3.151125757751083e-06,
"loss": 0.2299,
"step": 4440
},
{
"epoch": 13.934169278996865,
"grad_norm": 0.578125,
"learning_rate": 3.0609161223189575e-06,
"loss": 0.2238,
"step": 4445
},
{
"epoch": 13.949843260188088,
"grad_norm": 0.58203125,
"learning_rate": 2.9719964881411712e-06,
"loss": 0.2243,
"step": 4450
},
{
"epoch": 13.96551724137931,
"grad_norm": 0.890625,
"learning_rate": 2.8843680385014284e-06,
"loss": 0.2257,
"step": 4455
},
{
"epoch": 13.981191222570533,
"grad_norm": 0.609375,
"learning_rate": 2.798031939501222e-06,
"loss": 0.2286,
"step": 4460
},
{
"epoch": 13.996865203761756,
"grad_norm": 0.52734375,
"learning_rate": 2.7129893400442807e-06,
"loss": 0.2274,
"step": 4465
},
{
"epoch": 14.0,
"eval_loss": 4.4672441482543945,
"eval_runtime": 0.8024,
"eval_samples_per_second": 2.492,
"eval_steps_per_second": 1.246,
"step": 4466
},
{
"epoch": 14.012539184952978,
"grad_norm": 0.58984375,
"learning_rate": 2.629241371821334e-06,
"loss": 0.2259,
"step": 4470
},
{
"epoch": 14.0282131661442,
"grad_norm": 0.51953125,
"learning_rate": 2.546789149294959e-06,
"loss": 0.2248,
"step": 4475
},
{
"epoch": 14.043887147335424,
"grad_norm": 0.59765625,
"learning_rate": 2.4656337696848496e-06,
"loss": 0.226,
"step": 4480
},
{
"epoch": 14.059561128526646,
"grad_norm": 0.5625,
"learning_rate": 2.3857763129531473e-06,
"loss": 0.2239,
"step": 4485
},
{
"epoch": 14.075235109717868,
"grad_norm": 0.51953125,
"learning_rate": 2.3072178417901326e-06,
"loss": 0.2277,
"step": 4490
},
{
"epoch": 14.090909090909092,
"grad_norm": 0.57421875,
"learning_rate": 2.229959401599968e-06,
"loss": 0.2246,
"step": 4495
},
{
"epoch": 14.106583072100314,
"grad_norm": 0.5546875,
"learning_rate": 2.154002020486945e-06,
"loss": 0.2244,
"step": 4500
},
{
"epoch": 14.122257053291536,
"grad_norm": 0.5546875,
"learning_rate": 2.0793467092416696e-06,
"loss": 0.2226,
"step": 4505
},
{
"epoch": 14.137931034482758,
"grad_norm": 0.53515625,
"learning_rate": 2.005994461327698e-06,
"loss": 0.2299,
"step": 4510
},
{
"epoch": 14.153605015673982,
"grad_norm": 0.515625,
"learning_rate": 1.933946252868224e-06,
"loss": 0.2287,
"step": 4515
},
{
"epoch": 14.169278996865204,
"grad_norm": 0.54296875,
"learning_rate": 1.8632030426332215e-06,
"loss": 0.2226,
"step": 4520
},
{
"epoch": 14.184952978056426,
"grad_norm": 0.59765625,
"learning_rate": 1.7937657720265454e-06,
"loss": 0.2262,
"step": 4525
},
{
"epoch": 14.200626959247648,
"grad_norm": 0.5703125,
"learning_rate": 1.7256353650735302e-06,
"loss": 0.2298,
"step": 4530
},
{
"epoch": 14.216300940438872,
"grad_norm": 0.5859375,
"learning_rate": 1.6588127284085652e-06,
"loss": 0.2286,
"step": 4535
},
{
"epoch": 14.231974921630094,
"grad_norm": 0.65234375,
"learning_rate": 1.5932987512631614e-06,
"loss": 0.2282,
"step": 4540
},
{
"epoch": 14.247648902821316,
"grad_norm": 0.58203125,
"learning_rate": 1.529094305453993e-06,
"loss": 0.222,
"step": 4545
},
{
"epoch": 14.263322884012538,
"grad_norm": 0.5234375,
"learning_rate": 1.4662002453714074e-06,
"loss": 0.2256,
"step": 4550
},
{
"epoch": 14.278996865203762,
"grad_norm": 0.59375,
"learning_rate": 1.4046174079679787e-06,
"loss": 0.2266,
"step": 4555
},
{
"epoch": 14.294670846394984,
"grad_norm": 0.59375,
"learning_rate": 1.3443466127474046e-06,
"loss": 0.2245,
"step": 4560
},
{
"epoch": 14.310344827586206,
"grad_norm": 0.546875,
"learning_rate": 1.285388661753595e-06,
"loss": 0.2332,
"step": 4565
},
{
"epoch": 14.32601880877743,
"grad_norm": 0.50390625,
"learning_rate": 1.2277443395599886e-06,
"loss": 0.2266,
"step": 4570
},
{
"epoch": 14.341692789968652,
"grad_norm": 0.52734375,
"learning_rate": 1.1714144132591199e-06,
"loss": 0.2281,
"step": 4575
},
{
"epoch": 14.357366771159874,
"grad_norm": 0.53125,
"learning_rate": 1.116399632452414e-06,
"loss": 0.2292,
"step": 4580
},
{
"epoch": 14.373040752351097,
"grad_norm": 0.51171875,
"learning_rate": 1.062700729240218e-06,
"loss": 0.2246,
"step": 4585
},
{
"epoch": 14.38871473354232,
"grad_norm": 0.57421875,
"learning_rate": 1.0103184182120418e-06,
"loss": 0.228,
"step": 4590
},
{
"epoch": 14.404388714733543,
"grad_norm": 0.54296875,
"learning_rate": 9.592533964370542e-07,
"loss": 0.2257,
"step": 4595
},
{
"epoch": 14.420062695924765,
"grad_norm": 0.51171875,
"learning_rate": 9.095063434548135e-07,
"loss": 0.229,
"step": 4600
},
{
"epoch": 14.435736677115987,
"grad_norm": 0.490234375,
"learning_rate": 8.61077921266229e-07,
"loss": 0.2273,
"step": 4605
},
{
"epoch": 14.45141065830721,
"grad_norm": 0.5546875,
"learning_rate": 8.139687743247138e-07,
"loss": 0.2256,
"step": 4610
},
{
"epoch": 14.467084639498433,
"grad_norm": 0.640625,
"learning_rate": 7.681795295276684e-07,
"loss": 0.229,
"step": 4615
},
{
"epoch": 14.482758620689655,
"grad_norm": 0.5234375,
"learning_rate": 7.237107962080991e-07,
"loss": 0.2248,
"step": 4620
},
{
"epoch": 14.498432601880877,
"grad_norm": 0.54296875,
"learning_rate": 6.805631661265133e-07,
"loss": 0.2292,
"step": 4625
},
{
"epoch": 14.5141065830721,
"grad_norm": 0.578125,
"learning_rate": 6.387372134630587e-07,
"loss": 0.2225,
"step": 4630
},
{
"epoch": 14.529780564263323,
"grad_norm": 0.5078125,
"learning_rate": 5.982334948098522e-07,
"loss": 0.2291,
"step": 4635
},
{
"epoch": 14.545454545454545,
"grad_norm": 0.5703125,
"learning_rate": 5.5905254916363e-07,
"loss": 0.2228,
"step": 4640
},
{
"epoch": 14.561128526645769,
"grad_norm": 0.5546875,
"learning_rate": 5.211948979184978e-07,
"loss": 0.2282,
"step": 4645
},
{
"epoch": 14.576802507836991,
"grad_norm": 0.53515625,
"learning_rate": 4.846610448590804e-07,
"loss": 0.2291,
"step": 4650
},
{
"epoch": 14.592476489028213,
"grad_norm": 0.5625,
"learning_rate": 4.4945147615372827e-07,
"loss": 0.229,
"step": 4655
},
{
"epoch": 14.608150470219435,
"grad_norm": 0.56640625,
"learning_rate": 4.1556666034811007e-07,
"loss": 0.2294,
"step": 4660
},
{
"epoch": 14.623824451410659,
"grad_norm": 0.546875,
"learning_rate": 3.8300704835896316e-07,
"loss": 0.2268,
"step": 4665
},
{
"epoch": 14.639498432601881,
"grad_norm": 0.5390625,
"learning_rate": 3.517730734680869e-07,
"loss": 0.2263,
"step": 4670
},
{
"epoch": 14.655172413793103,
"grad_norm": 0.58984375,
"learning_rate": 3.2186515131655823e-07,
"loss": 0.2267,
"step": 4675
},
{
"epoch": 14.670846394984325,
"grad_norm": 0.5078125,
"learning_rate": 2.932836798992589e-07,
"loss": 0.2281,
"step": 4680
},
{
"epoch": 14.68652037617555,
"grad_norm": 0.52734375,
"learning_rate": 2.660290395595011e-07,
"loss": 0.2286,
"step": 4685
},
{
"epoch": 14.702194357366771,
"grad_norm": 0.54296875,
"learning_rate": 2.401015929840322e-07,
"loss": 0.2275,
"step": 4690
},
{
"epoch": 14.717868338557993,
"grad_norm": 0.55859375,
"learning_rate": 2.155016851981717e-07,
"loss": 0.229,
"step": 4695
},
{
"epoch": 14.733542319749215,
"grad_norm": 0.50390625,
"learning_rate": 1.9222964356123696e-07,
"loss": 0.2284,
"step": 4700
},
{
"epoch": 14.74921630094044,
"grad_norm": 0.56640625,
"learning_rate": 1.7028577776216915e-07,
"loss": 0.2328,
"step": 4705
},
{
"epoch": 14.764890282131661,
"grad_norm": 0.59765625,
"learning_rate": 1.496703798154364e-07,
"loss": 0.2272,
"step": 4710
},
{
"epoch": 14.780564263322884,
"grad_norm": 0.482421875,
"learning_rate": 1.3038372405711487e-07,
"loss": 0.2307,
"step": 4715
},
{
"epoch": 14.796238244514107,
"grad_norm": 0.5234375,
"learning_rate": 1.1242606714129134e-07,
"loss": 0.2253,
"step": 4720
},
{
"epoch": 14.81191222570533,
"grad_norm": 0.55859375,
"learning_rate": 9.579764803658853e-08,
"loss": 0.2238,
"step": 4725
},
{
"epoch": 14.827586206896552,
"grad_norm": 0.52734375,
"learning_rate": 8.049868802301187e-08,
"loss": 0.2273,
"step": 4730
},
{
"epoch": 14.843260188087774,
"grad_norm": 0.5,
"learning_rate": 6.652939068899633e-08,
"loss": 0.2315,
"step": 4735
},
{
"epoch": 14.858934169278998,
"grad_norm": 0.52734375,
"learning_rate": 5.388994192875307e-08,
"loss": 0.2232,
"step": 4740
},
{
"epoch": 14.87460815047022,
"grad_norm": 0.515625,
"learning_rate": 4.258050993967144e-08,
"loss": 0.2258,
"step": 4745
},
{
"epoch": 14.890282131661442,
"grad_norm": 0.515625,
"learning_rate": 3.260124522023178e-08,
"loss": 0.2287,
"step": 4750
},
{
"epoch": 14.905956112852664,
"grad_norm": 0.484375,
"learning_rate": 2.3952280567873796e-08,
"loss": 0.2208,
"step": 4755
},
{
"epoch": 14.921630094043888,
"grad_norm": 0.515625,
"learning_rate": 1.6633731077297933e-08,
"loss": 0.2233,
"step": 4760
},
{
"epoch": 14.93730407523511,
"grad_norm": 0.58203125,
"learning_rate": 1.0645694138933237e-08,
"loss": 0.2239,
"step": 4765
},
{
"epoch": 14.952978056426332,
"grad_norm": 0.50390625,
"learning_rate": 5.988249437627325e-09,
"loss": 0.2257,
"step": 4770
},
{
"epoch": 14.968652037617554,
"grad_norm": 0.609375,
"learning_rate": 2.6614589515583377e-09,
"loss": 0.2308,
"step": 4775
},
{
"epoch": 14.984326018808778,
"grad_norm": 0.55859375,
"learning_rate": 6.65366951457802e-10,
"loss": 0.2262,
"step": 4780
},
{
"epoch": 15.0,
"grad_norm": 0.5625,
"learning_rate": 0.0,
"loss": 0.2251,
"step": 4785
},
{
"epoch": 15.0,
"eval_loss": 4.466753005981445,
"eval_runtime": 0.7937,
"eval_samples_per_second": 2.52,
"eval_steps_per_second": 1.26,
"step": 4785
},
{
"epoch": 15.0,
"step": 4785,
"total_flos": 5.539666200113447e+18,
"train_loss": 0.8980661474674348,
"train_runtime": 27834.6576,
"train_samples_per_second": 4.124,
"train_steps_per_second": 0.172
}
],
"logging_steps": 5,
"max_steps": 4785,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.539666200113447e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}