{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.10003419972640219,
"eval_steps": 500,
"global_step": 1170,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 4.3728413581848145,
"learning_rate": 4.998290013679891e-05,
"loss": 10.0117,
"step": 4
},
{
"epoch": 0.0,
"grad_norm": 2.599367380142212,
"learning_rate": 4.996580027359781e-05,
"loss": 9.1745,
"step": 8
},
{
"epoch": 0.0,
"grad_norm": 2.4951720237731934,
"learning_rate": 4.994870041039672e-05,
"loss": 8.7624,
"step": 12
},
{
"epoch": 0.0,
"grad_norm": 2.4932913780212402,
"learning_rate": 4.9931600547195625e-05,
"loss": 8.5709,
"step": 16
},
{
"epoch": 0.0,
"grad_norm": 2.3983781337738037,
"learning_rate": 4.991450068399453e-05,
"loss": 8.4411,
"step": 20
},
{
"epoch": 0.0,
"grad_norm": 2.2340972423553467,
"learning_rate": 4.989740082079344e-05,
"loss": 8.1957,
"step": 24
},
{
"epoch": 0.0,
"grad_norm": 2.1964590549468994,
"learning_rate": 4.988030095759234e-05,
"loss": 8.0598,
"step": 28
},
{
"epoch": 0.0,
"grad_norm": 1.9686617851257324,
"learning_rate": 4.986320109439125e-05,
"loss": 7.8679,
"step": 32
},
{
"epoch": 0.0,
"grad_norm": 1.8235087394714355,
"learning_rate": 4.984610123119015e-05,
"loss": 7.7621,
"step": 36
},
{
"epoch": 0.0,
"grad_norm": 1.7598885297775269,
"learning_rate": 4.9829001367989056e-05,
"loss": 7.5417,
"step": 40
},
{
"epoch": 0.0,
"grad_norm": 1.94789719581604,
"learning_rate": 4.981190150478797e-05,
"loss": 7.3331,
"step": 44
},
{
"epoch": 0.0,
"grad_norm": 1.9366331100463867,
"learning_rate": 4.979480164158687e-05,
"loss": 7.1039,
"step": 48
},
{
"epoch": 0.0,
"grad_norm": 2.1228675842285156,
"learning_rate": 4.977770177838578e-05,
"loss": 6.9965,
"step": 52
},
{
"epoch": 0.0,
"grad_norm": 1.9855167865753174,
"learning_rate": 4.976060191518468e-05,
"loss": 6.8865,
"step": 56
},
{
"epoch": 0.01,
"grad_norm": 1.8035985231399536,
"learning_rate": 4.9743502051983585e-05,
"loss": 6.6322,
"step": 60
},
{
"epoch": 0.01,
"grad_norm": 1.8589977025985718,
"learning_rate": 4.9726402188782486e-05,
"loss": 6.5616,
"step": 64
},
{
"epoch": 0.01,
"grad_norm": 1.745139241218567,
"learning_rate": 4.97093023255814e-05,
"loss": 6.3479,
"step": 68
},
{
"epoch": 0.01,
"grad_norm": 1.266196608543396,
"learning_rate": 4.969220246238031e-05,
"loss": 6.2343,
"step": 72
},
{
"epoch": 0.01,
"grad_norm": 2.005223035812378,
"learning_rate": 4.967510259917921e-05,
"loss": 6.1866,
"step": 76
},
{
"epoch": 0.01,
"grad_norm": 1.5017377138137817,
"learning_rate": 4.9658002735978115e-05,
"loss": 5.972,
"step": 80
},
{
"epoch": 0.01,
"grad_norm": 1.6136974096298218,
"learning_rate": 4.9640902872777016e-05,
"loss": 6.1335,
"step": 84
},
{
"epoch": 0.01,
"grad_norm": 1.3865970373153687,
"learning_rate": 4.962380300957592e-05,
"loss": 5.944,
"step": 88
},
{
"epoch": 0.01,
"grad_norm": 1.283933162689209,
"learning_rate": 4.960670314637483e-05,
"loss": 5.8713,
"step": 92
},
{
"epoch": 0.01,
"grad_norm": 1.1588549613952637,
"learning_rate": 4.958960328317374e-05,
"loss": 5.7731,
"step": 96
},
{
"epoch": 0.01,
"grad_norm": 1.3081687688827515,
"learning_rate": 4.9572503419972645e-05,
"loss": 5.7429,
"step": 100
},
{
"epoch": 0.01,
"grad_norm": 1.6212745904922485,
"learning_rate": 4.9555403556771546e-05,
"loss": 5.481,
"step": 104
},
{
"epoch": 0.01,
"grad_norm": 1.709843397140503,
"learning_rate": 4.953830369357045e-05,
"loss": 5.4855,
"step": 108
},
{
"epoch": 0.01,
"grad_norm": 1.2814812660217285,
"learning_rate": 4.952120383036936e-05,
"loss": 5.4367,
"step": 112
},
{
"epoch": 0.01,
"grad_norm": 1.5539968013763428,
"learning_rate": 4.950410396716826e-05,
"loss": 5.2652,
"step": 116
},
{
"epoch": 0.01,
"grad_norm": 1.500375509262085,
"learning_rate": 4.948700410396717e-05,
"loss": 5.5137,
"step": 120
},
{
"epoch": 0.01,
"grad_norm": 1.7938899993896484,
"learning_rate": 4.9469904240766076e-05,
"loss": 5.4663,
"step": 124
},
{
"epoch": 0.01,
"grad_norm": 1.6169573068618774,
"learning_rate": 4.945280437756498e-05,
"loss": 5.3137,
"step": 128
},
{
"epoch": 0.01,
"grad_norm": 1.1904114484786987,
"learning_rate": 4.943570451436389e-05,
"loss": 5.1871,
"step": 132
},
{
"epoch": 0.01,
"grad_norm": 2.5564723014831543,
"learning_rate": 4.941860465116279e-05,
"loss": 5.4058,
"step": 136
},
{
"epoch": 0.01,
"grad_norm": 1.6187268495559692,
"learning_rate": 4.94015047879617e-05,
"loss": 5.1919,
"step": 140
},
{
"epoch": 0.01,
"grad_norm": 1.2222367525100708,
"learning_rate": 4.93844049247606e-05,
"loss": 5.1876,
"step": 144
},
{
"epoch": 0.01,
"grad_norm": 1.5898586511611938,
"learning_rate": 4.936730506155951e-05,
"loss": 5.1217,
"step": 148
},
{
"epoch": 0.01,
"grad_norm": 1.446902871131897,
"learning_rate": 4.935020519835842e-05,
"loss": 5.0582,
"step": 152
},
{
"epoch": 0.01,
"grad_norm": 1.862309217453003,
"learning_rate": 4.933310533515732e-05,
"loss": 4.9028,
"step": 156
},
{
"epoch": 0.01,
"grad_norm": 1.5455704927444458,
"learning_rate": 4.931600547195623e-05,
"loss": 5.0352,
"step": 160
},
{
"epoch": 0.01,
"grad_norm": 1.772558569908142,
"learning_rate": 4.929890560875513e-05,
"loss": 4.9707,
"step": 164
},
{
"epoch": 0.01,
"grad_norm": 1.8154480457305908,
"learning_rate": 4.9281805745554036e-05,
"loss": 5.0588,
"step": 168
},
{
"epoch": 0.01,
"grad_norm": 1.4536504745483398,
"learning_rate": 4.9264705882352944e-05,
"loss": 4.9997,
"step": 172
},
{
"epoch": 0.02,
"grad_norm": 1.6166527271270752,
"learning_rate": 4.924760601915185e-05,
"loss": 4.9472,
"step": 176
},
{
"epoch": 0.02,
"grad_norm": 2.1260979175567627,
"learning_rate": 4.923050615595076e-05,
"loss": 4.944,
"step": 180
},
{
"epoch": 0.02,
"grad_norm": 1.4778213500976562,
"learning_rate": 4.921340629274966e-05,
"loss": 4.8657,
"step": 184
},
{
"epoch": 0.02,
"grad_norm": 1.255488395690918,
"learning_rate": 4.9196306429548566e-05,
"loss": 4.7728,
"step": 188
},
{
"epoch": 0.02,
"grad_norm": 1.5698314905166626,
"learning_rate": 4.917920656634747e-05,
"loss": 4.9433,
"step": 192
},
{
"epoch": 0.02,
"grad_norm": 1.342402696609497,
"learning_rate": 4.9162106703146374e-05,
"loss": 4.6336,
"step": 196
},
{
"epoch": 0.02,
"grad_norm": 1.4427978992462158,
"learning_rate": 4.914500683994528e-05,
"loss": 4.7112,
"step": 200
},
{
"epoch": 0.02,
"grad_norm": 1.6528425216674805,
"learning_rate": 4.912790697674419e-05,
"loss": 4.876,
"step": 204
},
{
"epoch": 0.02,
"grad_norm": 1.706386923789978,
"learning_rate": 4.9110807113543096e-05,
"loss": 4.5971,
"step": 208
},
{
"epoch": 0.02,
"grad_norm": 1.7773737907409668,
"learning_rate": 4.9093707250342e-05,
"loss": 4.7944,
"step": 212
},
{
"epoch": 0.02,
"grad_norm": 1.7884222269058228,
"learning_rate": 4.9076607387140904e-05,
"loss": 4.5912,
"step": 216
},
{
"epoch": 0.02,
"grad_norm": 1.5389587879180908,
"learning_rate": 4.905950752393981e-05,
"loss": 4.6656,
"step": 220
},
{
"epoch": 0.02,
"grad_norm": 1.7006316184997559,
"learning_rate": 4.904240766073871e-05,
"loss": 4.6589,
"step": 224
},
{
"epoch": 0.02,
"grad_norm": 1.6314700841903687,
"learning_rate": 4.9025307797537626e-05,
"loss": 4.5036,
"step": 228
},
{
"epoch": 0.02,
"grad_norm": 1.7512930631637573,
"learning_rate": 4.900820793433653e-05,
"loss": 4.5105,
"step": 232
},
{
"epoch": 0.02,
"grad_norm": 1.9109033346176147,
"learning_rate": 4.8991108071135434e-05,
"loss": 4.4449,
"step": 236
},
{
"epoch": 0.02,
"grad_norm": 1.547317385673523,
"learning_rate": 4.897400820793434e-05,
"loss": 4.4906,
"step": 240
},
{
"epoch": 0.02,
"grad_norm": 1.5143060684204102,
"learning_rate": 4.895690834473324e-05,
"loss": 4.3545,
"step": 244
},
{
"epoch": 0.02,
"grad_norm": 1.6648136377334595,
"learning_rate": 4.893980848153215e-05,
"loss": 4.4093,
"step": 248
},
{
"epoch": 0.02,
"grad_norm": 1.5027951002120972,
"learning_rate": 4.892270861833106e-05,
"loss": 4.3444,
"step": 252
},
{
"epoch": 0.02,
"grad_norm": 1.9429397583007812,
"learning_rate": 4.8905608755129964e-05,
"loss": 4.4937,
"step": 256
},
{
"epoch": 0.02,
"grad_norm": 1.4009640216827393,
"learning_rate": 4.888850889192887e-05,
"loss": 4.3625,
"step": 260
},
{
"epoch": 0.02,
"grad_norm": 1.7445279359817505,
"learning_rate": 4.887140902872777e-05,
"loss": 4.2759,
"step": 264
},
{
"epoch": 0.02,
"grad_norm": 1.4381403923034668,
"learning_rate": 4.885430916552668e-05,
"loss": 4.574,
"step": 268
},
{
"epoch": 0.02,
"grad_norm": 1.5582979917526245,
"learning_rate": 4.883720930232558e-05,
"loss": 4.3831,
"step": 272
},
{
"epoch": 0.02,
"grad_norm": 1.461166501045227,
"learning_rate": 4.882010943912449e-05,
"loss": 4.5049,
"step": 276
},
{
"epoch": 0.02,
"grad_norm": 1.756549596786499,
"learning_rate": 4.8803009575923394e-05,
"loss": 4.1334,
"step": 280
},
{
"epoch": 0.02,
"grad_norm": 1.9006497859954834,
"learning_rate": 4.87859097127223e-05,
"loss": 4.3406,
"step": 284
},
{
"epoch": 0.02,
"grad_norm": 1.4551759958267212,
"learning_rate": 4.876880984952121e-05,
"loss": 4.4688,
"step": 288
},
{
"epoch": 0.02,
"grad_norm": 1.5335747003555298,
"learning_rate": 4.875170998632011e-05,
"loss": 4.2096,
"step": 292
},
{
"epoch": 0.03,
"grad_norm": 1.4802557229995728,
"learning_rate": 4.873461012311902e-05,
"loss": 4.0918,
"step": 296
},
{
"epoch": 0.03,
"grad_norm": 1.9993939399719238,
"learning_rate": 4.8717510259917924e-05,
"loss": 4.3636,
"step": 300
},
{
"epoch": 0.03,
"grad_norm": 1.706895351409912,
"learning_rate": 4.8700410396716825e-05,
"loss": 4.0891,
"step": 304
},
{
"epoch": 0.03,
"grad_norm": 1.4630484580993652,
"learning_rate": 4.868331053351574e-05,
"loss": 4.3254,
"step": 308
},
{
"epoch": 0.03,
"grad_norm": 1.4353913068771362,
"learning_rate": 4.866621067031464e-05,
"loss": 4.1935,
"step": 312
},
{
"epoch": 0.03,
"grad_norm": 1.5042275190353394,
"learning_rate": 4.864911080711355e-05,
"loss": 4.0913,
"step": 316
},
{
"epoch": 0.03,
"grad_norm": 1.792472004890442,
"learning_rate": 4.863201094391245e-05,
"loss": 4.2159,
"step": 320
},
{
"epoch": 0.03,
"grad_norm": 1.4059948921203613,
"learning_rate": 4.8614911080711355e-05,
"loss": 4.2934,
"step": 324
},
{
"epoch": 0.03,
"grad_norm": 1.8408161401748657,
"learning_rate": 4.859781121751026e-05,
"loss": 4.1482,
"step": 328
},
{
"epoch": 0.03,
"grad_norm": 1.716046690940857,
"learning_rate": 4.858071135430917e-05,
"loss": 4.0273,
"step": 332
},
{
"epoch": 0.03,
"grad_norm": 1.8415429592132568,
"learning_rate": 4.856361149110808e-05,
"loss": 4.0837,
"step": 336
},
{
"epoch": 0.03,
"grad_norm": 1.4880731105804443,
"learning_rate": 4.854651162790698e-05,
"loss": 4.1957,
"step": 340
},
{
"epoch": 0.03,
"grad_norm": 1.8150016069412231,
"learning_rate": 4.8529411764705885e-05,
"loss": 4.225,
"step": 344
},
{
"epoch": 0.03,
"grad_norm": 1.7828190326690674,
"learning_rate": 4.851231190150479e-05,
"loss": 4.1053,
"step": 348
},
{
"epoch": 0.03,
"grad_norm": 1.8549565076828003,
"learning_rate": 4.849521203830369e-05,
"loss": 3.8068,
"step": 352
},
{
"epoch": 0.03,
"grad_norm": 1.4295508861541748,
"learning_rate": 4.84781121751026e-05,
"loss": 3.9358,
"step": 356
},
{
"epoch": 0.03,
"grad_norm": 1.4277971982955933,
"learning_rate": 4.846101231190151e-05,
"loss": 4.1512,
"step": 360
},
{
"epoch": 0.03,
"grad_norm": 1.4937466382980347,
"learning_rate": 4.8443912448700415e-05,
"loss": 4.0162,
"step": 364
},
{
"epoch": 0.03,
"grad_norm": 1.3076913356781006,
"learning_rate": 4.842681258549932e-05,
"loss": 4.0336,
"step": 368
},
{
"epoch": 0.03,
"grad_norm": 1.673946738243103,
"learning_rate": 4.840971272229822e-05,
"loss": 4.0915,
"step": 372
},
{
"epoch": 0.03,
"grad_norm": 1.5978224277496338,
"learning_rate": 4.839261285909713e-05,
"loss": 4.137,
"step": 376
},
{
"epoch": 0.03,
"grad_norm": 2.214573383331299,
"learning_rate": 4.837551299589603e-05,
"loss": 3.8231,
"step": 380
},
{
"epoch": 0.03,
"grad_norm": 1.4263286590576172,
"learning_rate": 4.835841313269494e-05,
"loss": 3.9326,
"step": 384
},
{
"epoch": 0.03,
"grad_norm": 1.694931149482727,
"learning_rate": 4.8341313269493845e-05,
"loss": 4.0673,
"step": 388
},
{
"epoch": 0.03,
"grad_norm": 1.558498740196228,
"learning_rate": 4.832421340629275e-05,
"loss": 3.9454,
"step": 392
},
{
"epoch": 0.03,
"grad_norm": 1.5231866836547852,
"learning_rate": 4.830711354309166e-05,
"loss": 4.0214,
"step": 396
},
{
"epoch": 0.03,
"grad_norm": 1.6748324632644653,
"learning_rate": 4.829001367989056e-05,
"loss": 3.9983,
"step": 400
},
{
"epoch": 0.03,
"grad_norm": 1.3733506202697754,
"learning_rate": 4.827291381668947e-05,
"loss": 3.8526,
"step": 404
},
{
"epoch": 0.03,
"grad_norm": 1.6695655584335327,
"learning_rate": 4.8255813953488375e-05,
"loss": 3.8948,
"step": 408
},
{
"epoch": 0.04,
"grad_norm": 1.3915668725967407,
"learning_rate": 4.8238714090287276e-05,
"loss": 3.8511,
"step": 412
},
{
"epoch": 0.04,
"grad_norm": 1.5736514329910278,
"learning_rate": 4.822161422708619e-05,
"loss": 3.9446,
"step": 416
},
{
"epoch": 0.04,
"grad_norm": 1.562687635421753,
"learning_rate": 4.820451436388509e-05,
"loss": 3.6682,
"step": 420
},
{
"epoch": 0.04,
"grad_norm": 1.4215232133865356,
"learning_rate": 4.8187414500684e-05,
"loss": 3.6618,
"step": 424
},
{
"epoch": 0.04,
"grad_norm": 1.8750267028808594,
"learning_rate": 4.8170314637482905e-05,
"loss": 3.8246,
"step": 428
},
{
"epoch": 0.04,
"grad_norm": 1.5843207836151123,
"learning_rate": 4.8153214774281806e-05,
"loss": 3.807,
"step": 432
},
{
"epoch": 0.04,
"grad_norm": 1.6384755373001099,
"learning_rate": 4.813611491108071e-05,
"loss": 3.8329,
"step": 436
},
{
"epoch": 0.04,
"grad_norm": 1.646612286567688,
"learning_rate": 4.811901504787962e-05,
"loss": 3.9736,
"step": 440
},
{
"epoch": 0.04,
"grad_norm": 1.592463493347168,
"learning_rate": 4.810191518467853e-05,
"loss": 3.6107,
"step": 444
},
{
"epoch": 0.04,
"grad_norm": 1.6230803728103638,
"learning_rate": 4.808481532147743e-05,
"loss": 3.8141,
"step": 448
},
{
"epoch": 0.04,
"grad_norm": 1.7098625898361206,
"learning_rate": 4.8067715458276336e-05,
"loss": 3.8751,
"step": 452
},
{
"epoch": 0.04,
"grad_norm": 1.441146731376648,
"learning_rate": 4.805061559507524e-05,
"loss": 3.8461,
"step": 456
},
{
"epoch": 0.04,
"grad_norm": 1.4387036561965942,
"learning_rate": 4.8033515731874144e-05,
"loss": 3.8432,
"step": 460
},
{
"epoch": 0.04,
"grad_norm": 1.6620376110076904,
"learning_rate": 4.801641586867305e-05,
"loss": 3.7844,
"step": 464
},
{
"epoch": 0.04,
"grad_norm": 1.5403114557266235,
"learning_rate": 4.799931600547196e-05,
"loss": 3.6556,
"step": 468
},
{
"epoch": 0.04,
"grad_norm": 1.5642478466033936,
"learning_rate": 4.7982216142270866e-05,
"loss": 3.7526,
"step": 472
},
{
"epoch": 0.04,
"grad_norm": 1.5027506351470947,
"learning_rate": 4.796511627906977e-05,
"loss": 3.7862,
"step": 476
},
{
"epoch": 0.04,
"grad_norm": 1.5570485591888428,
"learning_rate": 4.7948016415868674e-05,
"loss": 3.7656,
"step": 480
},
{
"epoch": 0.04,
"grad_norm": 2.2708029747009277,
"learning_rate": 4.793091655266758e-05,
"loss": 3.8742,
"step": 484
},
{
"epoch": 0.04,
"grad_norm": 1.5446516275405884,
"learning_rate": 4.791381668946648e-05,
"loss": 3.6401,
"step": 488
},
{
"epoch": 0.04,
"grad_norm": 1.5480107069015503,
"learning_rate": 4.789671682626539e-05,
"loss": 3.7441,
"step": 492
},
{
"epoch": 0.04,
"grad_norm": 1.5571659803390503,
"learning_rate": 4.78796169630643e-05,
"loss": 3.6849,
"step": 496
},
{
"epoch": 0.04,
"grad_norm": 1.9021155834197998,
"learning_rate": 4.7862517099863204e-05,
"loss": 3.7989,
"step": 500
},
{
"epoch": 0.04,
"grad_norm": 1.5292832851409912,
"learning_rate": 4.784541723666211e-05,
"loss": 3.8129,
"step": 504
},
{
"epoch": 0.04,
"grad_norm": 1.5750986337661743,
"learning_rate": 4.782831737346101e-05,
"loss": 3.648,
"step": 508
},
{
"epoch": 0.04,
"grad_norm": 1.6995611190795898,
"learning_rate": 4.781121751025992e-05,
"loss": 3.7686,
"step": 512
},
{
"epoch": 0.04,
"grad_norm": 1.7573764324188232,
"learning_rate": 4.7794117647058826e-05,
"loss": 3.6636,
"step": 516
},
{
"epoch": 0.04,
"grad_norm": 1.5131741762161255,
"learning_rate": 4.7777017783857733e-05,
"loss": 3.5793,
"step": 520
},
{
"epoch": 0.04,
"grad_norm": 1.6838548183441162,
"learning_rate": 4.775991792065664e-05,
"loss": 3.6336,
"step": 524
},
{
"epoch": 0.05,
"grad_norm": 1.9402954578399658,
"learning_rate": 4.774281805745554e-05,
"loss": 3.6733,
"step": 528
},
{
"epoch": 0.05,
"grad_norm": 1.7752444744110107,
"learning_rate": 4.772571819425445e-05,
"loss": 3.8021,
"step": 532
},
{
"epoch": 0.05,
"grad_norm": 1.4470579624176025,
"learning_rate": 4.7708618331053356e-05,
"loss": 3.6526,
"step": 536
},
{
"epoch": 0.05,
"grad_norm": 1.5560115575790405,
"learning_rate": 4.769151846785226e-05,
"loss": 3.7236,
"step": 540
},
{
"epoch": 0.05,
"grad_norm": 1.6849740743637085,
"learning_rate": 4.7674418604651164e-05,
"loss": 3.7586,
"step": 544
},
{
"epoch": 0.05,
"grad_norm": 1.4917711019515991,
"learning_rate": 4.765731874145007e-05,
"loss": 3.7791,
"step": 548
},
{
"epoch": 0.05,
"grad_norm": 1.791801929473877,
"learning_rate": 4.764021887824898e-05,
"loss": 3.5659,
"step": 552
},
{
"epoch": 0.05,
"grad_norm": 1.531817078590393,
"learning_rate": 4.7623119015047886e-05,
"loss": 3.5811,
"step": 556
},
{
"epoch": 0.05,
"grad_norm": 1.8725793361663818,
"learning_rate": 4.7606019151846787e-05,
"loss": 3.7337,
"step": 560
},
{
"epoch": 0.05,
"grad_norm": 1.5851105451583862,
"learning_rate": 4.7588919288645694e-05,
"loss": 3.4899,
"step": 564
},
{
"epoch": 0.05,
"grad_norm": 1.4937329292297363,
"learning_rate": 4.7571819425444594e-05,
"loss": 3.5653,
"step": 568
},
{
"epoch": 0.05,
"grad_norm": 1.8102850914001465,
"learning_rate": 4.75547195622435e-05,
"loss": 3.517,
"step": 572
},
{
"epoch": 0.05,
"grad_norm": 1.4572982788085938,
"learning_rate": 4.753761969904241e-05,
"loss": 3.723,
"step": 576
},
{
"epoch": 0.05,
"grad_norm": 1.5815645456314087,
"learning_rate": 4.7520519835841317e-05,
"loss": 3.6615,
"step": 580
},
{
"epoch": 0.05,
"grad_norm": 1.6406457424163818,
"learning_rate": 4.7503419972640224e-05,
"loss": 3.6796,
"step": 584
},
{
"epoch": 0.05,
"grad_norm": 1.8512486219406128,
"learning_rate": 4.7486320109439124e-05,
"loss": 3.5586,
"step": 588
},
{
"epoch": 0.05,
"grad_norm": 1.6738507747650146,
"learning_rate": 4.746922024623803e-05,
"loss": 3.4724,
"step": 592
},
{
"epoch": 0.05,
"grad_norm": 1.766518235206604,
"learning_rate": 4.745212038303693e-05,
"loss": 3.7213,
"step": 596
},
{
"epoch": 0.05,
"grad_norm": 1.681229591369629,
"learning_rate": 4.7435020519835846e-05,
"loss": 3.6538,
"step": 600
},
{
"epoch": 0.05,
"grad_norm": 1.7900785207748413,
"learning_rate": 4.7417920656634754e-05,
"loss": 3.7952,
"step": 604
},
{
"epoch": 0.05,
"grad_norm": 1.5610637664794922,
"learning_rate": 4.7400820793433654e-05,
"loss": 3.5255,
"step": 608
},
{
"epoch": 0.05,
"grad_norm": 1.3365826606750488,
"learning_rate": 4.738372093023256e-05,
"loss": 3.5621,
"step": 612
},
{
"epoch": 0.05,
"grad_norm": 1.8946609497070312,
"learning_rate": 4.736662106703146e-05,
"loss": 3.5664,
"step": 616
},
{
"epoch": 0.05,
"grad_norm": 1.728330135345459,
"learning_rate": 4.734952120383037e-05,
"loss": 3.7577,
"step": 620
},
{
"epoch": 0.05,
"grad_norm": 1.7583847045898438,
"learning_rate": 4.733242134062928e-05,
"loss": 3.5722,
"step": 624
},
{
"epoch": 0.05,
"grad_norm": 1.9061710834503174,
"learning_rate": 4.7315321477428184e-05,
"loss": 3.3436,
"step": 628
},
{
"epoch": 0.05,
"grad_norm": 1.7074024677276611,
"learning_rate": 4.729822161422709e-05,
"loss": 3.5426,
"step": 632
},
{
"epoch": 0.05,
"grad_norm": 1.649793028831482,
"learning_rate": 4.728112175102599e-05,
"loss": 3.5826,
"step": 636
},
{
"epoch": 0.05,
"grad_norm": 1.7260462045669556,
"learning_rate": 4.72640218878249e-05,
"loss": 3.5353,
"step": 640
},
{
"epoch": 0.06,
"grad_norm": 1.745391607284546,
"learning_rate": 4.724692202462381e-05,
"loss": 3.3534,
"step": 644
},
{
"epoch": 0.06,
"grad_norm": 1.374226450920105,
"learning_rate": 4.722982216142271e-05,
"loss": 3.386,
"step": 648
},
{
"epoch": 0.06,
"grad_norm": 1.8732798099517822,
"learning_rate": 4.7212722298221615e-05,
"loss": 3.6073,
"step": 652
},
{
"epoch": 0.06,
"grad_norm": 2.4075376987457275,
"learning_rate": 4.719562243502052e-05,
"loss": 3.5393,
"step": 656
},
{
"epoch": 0.06,
"grad_norm": 1.54508638381958,
"learning_rate": 4.717852257181943e-05,
"loss": 3.475,
"step": 660
},
{
"epoch": 0.06,
"grad_norm": 1.5737498998641968,
"learning_rate": 4.716142270861834e-05,
"loss": 3.5497,
"step": 664
},
{
"epoch": 0.06,
"grad_norm": 1.8623074293136597,
"learning_rate": 4.714432284541724e-05,
"loss": 3.4603,
"step": 668
},
{
"epoch": 0.06,
"grad_norm": 1.7199251651763916,
"learning_rate": 4.7127222982216145e-05,
"loss": 3.5801,
"step": 672
},
{
"epoch": 0.06,
"grad_norm": 1.5866843461990356,
"learning_rate": 4.7110123119015045e-05,
"loss": 3.3454,
"step": 676
},
{
"epoch": 0.06,
"grad_norm": 1.9907779693603516,
"learning_rate": 4.709302325581396e-05,
"loss": 3.6247,
"step": 680
},
{
"epoch": 0.06,
"grad_norm": 1.3720687627792358,
"learning_rate": 4.707592339261287e-05,
"loss": 3.3614,
"step": 684
},
{
"epoch": 0.06,
"grad_norm": 1.739660620689392,
"learning_rate": 4.705882352941177e-05,
"loss": 3.4211,
"step": 688
},
{
"epoch": 0.06,
"grad_norm": 1.6425236463546753,
"learning_rate": 4.7041723666210675e-05,
"loss": 3.423,
"step": 692
},
{
"epoch": 0.06,
"grad_norm": 1.5457091331481934,
"learning_rate": 4.7024623803009575e-05,
"loss": 3.3258,
"step": 696
},
{
"epoch": 0.06,
"grad_norm": 1.5979949235916138,
"learning_rate": 4.700752393980848e-05,
"loss": 3.2838,
"step": 700
},
{
"epoch": 0.06,
"grad_norm": 1.6761040687561035,
"learning_rate": 4.699042407660739e-05,
"loss": 3.334,
"step": 704
},
{
"epoch": 0.06,
"grad_norm": 1.552573323249817,
"learning_rate": 4.69733242134063e-05,
"loss": 3.5355,
"step": 708
},
{
"epoch": 0.06,
"grad_norm": 1.6743354797363281,
"learning_rate": 4.6956224350205205e-05,
"loss": 3.4076,
"step": 712
},
{
"epoch": 0.06,
"grad_norm": 4.555662155151367,
"learning_rate": 4.6939124487004105e-05,
"loss": 3.7446,
"step": 716
},
{
"epoch": 0.06,
"grad_norm": 1.5942860841751099,
"learning_rate": 4.692202462380301e-05,
"loss": 3.553,
"step": 720
},
{
"epoch": 0.06,
"grad_norm": 1.9089189767837524,
"learning_rate": 4.690492476060191e-05,
"loss": 3.3584,
"step": 724
},
{
"epoch": 0.06,
"grad_norm": 2.0768373012542725,
"learning_rate": 4.688782489740082e-05,
"loss": 3.4295,
"step": 728
},
{
"epoch": 0.06,
"grad_norm": 3.3060362339019775,
"learning_rate": 4.687072503419973e-05,
"loss": 3.415,
"step": 732
},
{
"epoch": 0.06,
"grad_norm": 1.9307421445846558,
"learning_rate": 4.6853625170998635e-05,
"loss": 3.4901,
"step": 736
},
{
"epoch": 0.06,
"grad_norm": 1.7142446041107178,
"learning_rate": 4.683652530779754e-05,
"loss": 3.6096,
"step": 740
},
{
"epoch": 0.06,
"grad_norm": 1.7756657600402832,
"learning_rate": 4.681942544459644e-05,
"loss": 3.4534,
"step": 744
},
{
"epoch": 0.06,
"grad_norm": 1.5459442138671875,
"learning_rate": 4.680232558139535e-05,
"loss": 3.3813,
"step": 748
},
{
"epoch": 0.06,
"grad_norm": 2.1961963176727295,
"learning_rate": 4.678522571819426e-05,
"loss": 3.3467,
"step": 752
},
{
"epoch": 0.06,
"grad_norm": 1.7320613861083984,
"learning_rate": 4.676812585499316e-05,
"loss": 3.3927,
"step": 756
},
{
"epoch": 0.06,
"grad_norm": 1.6255303621292114,
"learning_rate": 4.6751025991792066e-05,
"loss": 3.4644,
"step": 760
},
{
"epoch": 0.07,
"grad_norm": 1.5142388343811035,
"learning_rate": 4.673392612859097e-05,
"loss": 3.4657,
"step": 764
},
{
"epoch": 0.07,
"grad_norm": 1.5093744993209839,
"learning_rate": 4.671682626538988e-05,
"loss": 3.3144,
"step": 768
},
{
"epoch": 0.07,
"grad_norm": 1.4730446338653564,
"learning_rate": 4.669972640218879e-05,
"loss": 3.3382,
"step": 772
},
{
"epoch": 0.07,
"grad_norm": 1.7650116682052612,
"learning_rate": 4.668262653898769e-05,
"loss": 3.4519,
"step": 776
},
{
"epoch": 0.07,
"grad_norm": 1.637071132659912,
"learning_rate": 4.6665526675786596e-05,
"loss": 3.3723,
"step": 780
},
{
"epoch": 0.07,
"grad_norm": 1.4582788944244385,
"learning_rate": 4.6648426812585496e-05,
"loss": 3.4839,
"step": 784
},
{
"epoch": 0.07,
"grad_norm": 1.4654922485351562,
"learning_rate": 4.663132694938441e-05,
"loss": 3.1257,
"step": 788
},
{
"epoch": 0.07,
"grad_norm": 1.4668978452682495,
"learning_rate": 4.661422708618332e-05,
"loss": 3.2506,
"step": 792
},
{
"epoch": 0.07,
"grad_norm": 1.5190536975860596,
"learning_rate": 4.659712722298222e-05,
"loss": 3.4055,
"step": 796
},
{
"epoch": 0.07,
"grad_norm": 1.9057687520980835,
"learning_rate": 4.6580027359781126e-05,
"loss": 3.4804,
"step": 800
},
{
"epoch": 0.07,
"grad_norm": 1.4573813676834106,
"learning_rate": 4.6562927496580026e-05,
"loss": 3.3792,
"step": 804
},
{
"epoch": 0.07,
"grad_norm": 2.3757171630859375,
"learning_rate": 4.6545827633378933e-05,
"loss": 3.4649,
"step": 808
},
{
"epoch": 0.07,
"grad_norm": 1.7024015188217163,
"learning_rate": 4.652872777017784e-05,
"loss": 3.3149,
"step": 812
},
{
"epoch": 0.07,
"grad_norm": 1.5424153804779053,
"learning_rate": 4.651162790697675e-05,
"loss": 3.3,
"step": 816
},
{
"epoch": 0.07,
"grad_norm": 1.4667370319366455,
"learning_rate": 4.6494528043775655e-05,
"loss": 3.2845,
"step": 820
},
{
"epoch": 0.07,
"grad_norm": 1.5216193199157715,
"learning_rate": 4.6477428180574556e-05,
"loss": 3.3062,
"step": 824
},
{
"epoch": 0.07,
"grad_norm": 1.5499827861785889,
"learning_rate": 4.6460328317373463e-05,
"loss": 3.263,
"step": 828
},
{
"epoch": 0.07,
"grad_norm": 1.644148588180542,
"learning_rate": 4.6443228454172364e-05,
"loss": 3.4374,
"step": 832
},
{
"epoch": 0.07,
"grad_norm": 1.6420782804489136,
"learning_rate": 4.642612859097127e-05,
"loss": 3.4049,
"step": 836
},
{
"epoch": 0.07,
"grad_norm": 1.4325385093688965,
"learning_rate": 4.640902872777018e-05,
"loss": 3.3499,
"step": 840
},
{
"epoch": 0.07,
"grad_norm": 1.6447879076004028,
"learning_rate": 4.6391928864569086e-05,
"loss": 3.4098,
"step": 844
},
{
"epoch": 0.07,
"grad_norm": 1.4675137996673584,
"learning_rate": 4.637482900136799e-05,
"loss": 3.2778,
"step": 848
},
{
"epoch": 0.07,
"grad_norm": 1.5847750902175903,
"learning_rate": 4.6357729138166894e-05,
"loss": 3.1695,
"step": 852
},
{
"epoch": 0.07,
"grad_norm": 1.514772653579712,
"learning_rate": 4.63406292749658e-05,
"loss": 3.1556,
"step": 856
},
{
"epoch": 0.07,
"grad_norm": 1.511367917060852,
"learning_rate": 4.632352941176471e-05,
"loss": 3.3307,
"step": 860
},
{
"epoch": 0.07,
"grad_norm": 1.7640196084976196,
"learning_rate": 4.630642954856361e-05,
"loss": 3.2985,
"step": 864
},
{
"epoch": 0.07,
"grad_norm": 1.7074265480041504,
"learning_rate": 4.628932968536252e-05,
"loss": 3.2598,
"step": 868
},
{
"epoch": 0.07,
"grad_norm": 1.52577805519104,
"learning_rate": 4.6272229822161424e-05,
"loss": 3.0872,
"step": 872
},
{
"epoch": 0.07,
"grad_norm": 1.6908501386642456,
"learning_rate": 4.625512995896033e-05,
"loss": 3.1931,
"step": 876
},
{
"epoch": 0.08,
"grad_norm": 1.6583012342453003,
"learning_rate": 4.623803009575924e-05,
"loss": 3.1849,
"step": 880
},
{
"epoch": 0.08,
"grad_norm": 1.4067027568817139,
"learning_rate": 4.622093023255814e-05,
"loss": 3.1912,
"step": 884
},
{
"epoch": 0.08,
"grad_norm": 1.388487458229065,
"learning_rate": 4.6203830369357046e-05,
"loss": 3.2845,
"step": 888
},
{
"epoch": 0.08,
"grad_norm": 1.7910948991775513,
"learning_rate": 4.6186730506155954e-05,
"loss": 3.1146,
"step": 892
},
{
"epoch": 0.08,
"grad_norm": 1.4454721212387085,
"learning_rate": 4.616963064295486e-05,
"loss": 3.2114,
"step": 896
},
{
"epoch": 0.08,
"grad_norm": 1.4809476137161255,
"learning_rate": 4.615253077975377e-05,
"loss": 3.3329,
"step": 900
},
{
"epoch": 0.08,
"grad_norm": 1.6668082475662231,
"learning_rate": 4.613543091655267e-05,
"loss": 3.2517,
"step": 904
},
{
"epoch": 0.08,
"grad_norm": 1.72063410282135,
"learning_rate": 4.6118331053351576e-05,
"loss": 3.0895,
"step": 908
},
{
"epoch": 0.08,
"grad_norm": 1.5995526313781738,
"learning_rate": 4.610123119015048e-05,
"loss": 3.2703,
"step": 912
},
{
"epoch": 0.08,
"grad_norm": 1.453747034072876,
"learning_rate": 4.6084131326949384e-05,
"loss": 3.3619,
"step": 916
},
{
"epoch": 0.08,
"grad_norm": 1.7982878684997559,
"learning_rate": 4.606703146374829e-05,
"loss": 3.2657,
"step": 920
},
{
"epoch": 0.08,
"grad_norm": 1.3937926292419434,
"learning_rate": 4.60499316005472e-05,
"loss": 3.1727,
"step": 924
},
{
"epoch": 0.08,
"grad_norm": 1.5705368518829346,
"learning_rate": 4.6032831737346106e-05,
"loss": 3.0657,
"step": 928
},
{
"epoch": 0.08,
"grad_norm": 1.6821500062942505,
"learning_rate": 4.601573187414501e-05,
"loss": 3.3191,
"step": 932
},
{
"epoch": 0.08,
"grad_norm": 1.5288244485855103,
"learning_rate": 4.5998632010943914e-05,
"loss": 3.0876,
"step": 936
},
{
"epoch": 0.08,
"grad_norm": 1.7748781442642212,
"learning_rate": 4.598153214774282e-05,
"loss": 3.18,
"step": 940
},
{
"epoch": 0.08,
"grad_norm": 1.7024669647216797,
"learning_rate": 4.596443228454172e-05,
"loss": 3.3601,
"step": 944
},
{
"epoch": 0.08,
"grad_norm": 1.7017521858215332,
"learning_rate": 4.5947332421340636e-05,
"loss": 2.9751,
"step": 948
},
{
"epoch": 0.08,
"grad_norm": 1.7405025959014893,
"learning_rate": 4.593023255813954e-05,
"loss": 3.2073,
"step": 952
},
{
"epoch": 0.08,
"grad_norm": 1.6610547304153442,
"learning_rate": 4.5913132694938444e-05,
"loss": 3.2363,
"step": 956
},
{
"epoch": 0.08,
"grad_norm": 1.4785616397857666,
"learning_rate": 4.5896032831737345e-05,
"loss": 3.3254,
"step": 960
},
{
"epoch": 0.08,
"grad_norm": 3.453533411026001,
"learning_rate": 4.587893296853625e-05,
"loss": 3.21,
"step": 964
},
{
"epoch": 0.08,
"grad_norm": 1.4303959608078003,
"learning_rate": 4.586183310533516e-05,
"loss": 3.3226,
"step": 968
},
{
"epoch": 0.08,
"grad_norm": 1.6272943019866943,
"learning_rate": 4.584473324213407e-05,
"loss": 3.5944,
"step": 972
},
{
"epoch": 0.08,
"grad_norm": 1.823716402053833,
"learning_rate": 4.5827633378932974e-05,
"loss": 3.234,
"step": 976
},
{
"epoch": 0.08,
"grad_norm": 1.7006231546401978,
"learning_rate": 4.5810533515731875e-05,
"loss": 3.0123,
"step": 980
},
{
"epoch": 0.08,
"grad_norm": 1.3645117282867432,
"learning_rate": 4.579343365253078e-05,
"loss": 3.2243,
"step": 984
},
{
"epoch": 0.08,
"grad_norm": 1.7724872827529907,
"learning_rate": 4.577633378932969e-05,
"loss": 3.1705,
"step": 988
},
{
"epoch": 0.08,
"grad_norm": 1.6377599239349365,
"learning_rate": 4.575923392612859e-05,
"loss": 3.2647,
"step": 992
},
{
"epoch": 0.09,
"grad_norm": 1.4828346967697144,
"learning_rate": 4.57421340629275e-05,
"loss": 3.0613,
"step": 996
},
{
"epoch": 0.09,
"grad_norm": 1.515553593635559,
"learning_rate": 4.5725034199726405e-05,
"loss": 3.0368,
"step": 1000
},
{
"epoch": 0.09,
"grad_norm": 1.498844861984253,
"learning_rate": 4.570793433652531e-05,
"loss": 3.102,
"step": 1004
},
{
"epoch": 0.09,
"grad_norm": 1.4339860677719116,
"learning_rate": 4.569083447332422e-05,
"loss": 3.1874,
"step": 1008
},
{
"epoch": 0.09,
"grad_norm": 1.5611178874969482,
"learning_rate": 4.567373461012312e-05,
"loss": 3.075,
"step": 1012
},
{
"epoch": 0.09,
"grad_norm": 1.6089402437210083,
"learning_rate": 4.565663474692203e-05,
"loss": 3.1544,
"step": 1016
},
{
"epoch": 0.09,
"grad_norm": 1.8874709606170654,
"learning_rate": 4.563953488372093e-05,
"loss": 3.2806,
"step": 1020
},
{
"epoch": 0.09,
"grad_norm": 1.5624805688858032,
"learning_rate": 4.5622435020519835e-05,
"loss": 3.1442,
"step": 1024
},
{
"epoch": 0.09,
"grad_norm": 1.968634009361267,
"learning_rate": 4.560533515731875e-05,
"loss": 3.2743,
"step": 1028
},
{
"epoch": 0.09,
"grad_norm": 1.4064279794692993,
"learning_rate": 4.558823529411765e-05,
"loss": 3.0726,
"step": 1032
},
{
"epoch": 0.09,
"grad_norm": 1.577048897743225,
"learning_rate": 4.557113543091656e-05,
"loss": 3.1129,
"step": 1036
},
{
"epoch": 0.09,
"grad_norm": 1.548323392868042,
"learning_rate": 4.555403556771546e-05,
"loss": 3.144,
"step": 1040
},
{
"epoch": 0.09,
"grad_norm": 1.5542961359024048,
"learning_rate": 4.5536935704514365e-05,
"loss": 3.182,
"step": 1044
},
{
"epoch": 0.09,
"grad_norm": 1.5506356954574585,
"learning_rate": 4.551983584131327e-05,
"loss": 3.2186,
"step": 1048
},
{
"epoch": 0.09,
"grad_norm": 1.6962274312973022,
"learning_rate": 4.550273597811218e-05,
"loss": 3.0385,
"step": 1052
},
{
"epoch": 0.09,
"grad_norm": 1.4382833242416382,
"learning_rate": 4.548563611491109e-05,
"loss": 3.1268,
"step": 1056
},
{
"epoch": 0.09,
"grad_norm": 2.269392728805542,
"learning_rate": 4.546853625170999e-05,
"loss": 3.2579,
"step": 1060
},
{
"epoch": 0.09,
"grad_norm": 1.5414201021194458,
"learning_rate": 4.5451436388508895e-05,
"loss": 3.1501,
"step": 1064
},
{
"epoch": 0.09,
"grad_norm": 1.936946153640747,
"learning_rate": 4.54343365253078e-05,
"loss": 3.1711,
"step": 1068
},
{
"epoch": 0.09,
"grad_norm": 1.7430529594421387,
"learning_rate": 4.54172366621067e-05,
"loss": 3.1299,
"step": 1072
},
{
"epoch": 0.09,
"grad_norm": 1.3500404357910156,
"learning_rate": 4.540013679890561e-05,
"loss": 3.0935,
"step": 1076
},
{
"epoch": 0.09,
"grad_norm": 1.5768132209777832,
"learning_rate": 4.538303693570452e-05,
"loss": 3.1091,
"step": 1080
},
{
"epoch": 0.09,
"grad_norm": 1.4829493761062622,
"learning_rate": 4.5365937072503425e-05,
"loss": 3.0862,
"step": 1084
},
{
"epoch": 0.09,
"grad_norm": 1.5560483932495117,
"learning_rate": 4.5348837209302326e-05,
"loss": 3.1024,
"step": 1088
},
{
"epoch": 0.09,
"grad_norm": 1.6295199394226074,
"learning_rate": 4.533173734610123e-05,
"loss": 2.9539,
"step": 1092
},
{
"epoch": 0.09,
"grad_norm": 2.1724135875701904,
"learning_rate": 4.531463748290014e-05,
"loss": 3.1936,
"step": 1096
},
{
"epoch": 0.09,
"grad_norm": 1.7400479316711426,
"learning_rate": 4.529753761969904e-05,
"loss": 3.1873,
"step": 1100
},
{
"epoch": 0.09,
"grad_norm": 1.554962396621704,
"learning_rate": 4.528043775649795e-05,
"loss": 3.0643,
"step": 1104
},
{
"epoch": 0.09,
"grad_norm": 1.3722119331359863,
"learning_rate": 4.5263337893296855e-05,
"loss": 3.0088,
"step": 1108
},
{
"epoch": 0.1,
"grad_norm": 1.395577311515808,
"learning_rate": 4.524623803009576e-05,
"loss": 3.0584,
"step": 1112
},
{
"epoch": 0.1,
"grad_norm": 1.509710431098938,
"learning_rate": 4.522913816689467e-05,
"loss": 3.2276,
"step": 1116
},
{
"epoch": 0.1,
"grad_norm": 1.5213786363601685,
"learning_rate": 4.521203830369357e-05,
"loss": 2.9433,
"step": 1120
},
{
"epoch": 0.1,
"grad_norm": 1.51616632938385,
"learning_rate": 4.519493844049248e-05,
"loss": 2.9902,
"step": 1124
},
{
"epoch": 0.1,
"grad_norm": 1.5429805517196655,
"learning_rate": 4.517783857729138e-05,
"loss": 3.0884,
"step": 1128
},
{
"epoch": 0.1,
"grad_norm": 1.4714343547821045,
"learning_rate": 4.5160738714090286e-05,
"loss": 3.0322,
"step": 1132
},
{
"epoch": 0.1,
"grad_norm": 1.4637647867202759,
"learning_rate": 4.51436388508892e-05,
"loss": 3.0352,
"step": 1136
},
{
"epoch": 0.1,
"grad_norm": 1.3424347639083862,
"learning_rate": 4.51265389876881e-05,
"loss": 3.0866,
"step": 1140
},
{
"epoch": 0.1,
"grad_norm": 1.689968228340149,
"learning_rate": 4.510943912448701e-05,
"loss": 3.0002,
"step": 1144
},
{
"epoch": 0.1,
"grad_norm": 1.9935293197631836,
"learning_rate": 4.509233926128591e-05,
"loss": 2.9518,
"step": 1148
},
{
"epoch": 0.1,
"grad_norm": 1.5461673736572266,
"learning_rate": 4.5075239398084816e-05,
"loss": 3.0394,
"step": 1152
},
{
"epoch": 0.1,
"grad_norm": 1.5787737369537354,
"learning_rate": 4.505813953488372e-05,
"loss": 3.0934,
"step": 1156
},
{
"epoch": 0.1,
"grad_norm": 1.4527121782302856,
"learning_rate": 4.504103967168263e-05,
"loss": 2.9734,
"step": 1160
},
{
"epoch": 0.1,
"grad_norm": 1.6236604452133179,
"learning_rate": 4.502393980848154e-05,
"loss": 3.021,
"step": 1164
},
{
"epoch": 0.1,
"grad_norm": 1.5616329908370972,
"learning_rate": 4.500683994528044e-05,
"loss": 3.0602,
"step": 1168
}
],
"logging_steps": 4,
"max_steps": 11696,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1170,
"total_flos": 1.585392327327744e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}