|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.10003419972640219, |
|
"eval_steps": 500, |
|
"global_step": 1170, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 4.3728413581848145, |
|
"learning_rate": 4.998290013679891e-05, |
|
"loss": 10.0117, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 2.599367380142212, |
|
"learning_rate": 4.996580027359781e-05, |
|
"loss": 9.1745, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 2.4951720237731934, |
|
"learning_rate": 4.994870041039672e-05, |
|
"loss": 8.7624, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 2.4932913780212402, |
|
"learning_rate": 4.9931600547195625e-05, |
|
"loss": 8.5709, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 2.3983781337738037, |
|
"learning_rate": 4.991450068399453e-05, |
|
"loss": 8.4411, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 2.2340972423553467, |
|
"learning_rate": 4.989740082079344e-05, |
|
"loss": 8.1957, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 2.1964590549468994, |
|
"learning_rate": 4.988030095759234e-05, |
|
"loss": 8.0598, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.9686617851257324, |
|
"learning_rate": 4.986320109439125e-05, |
|
"loss": 7.8679, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.8235087394714355, |
|
"learning_rate": 4.984610123119015e-05, |
|
"loss": 7.7621, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.7598885297775269, |
|
"learning_rate": 4.9829001367989056e-05, |
|
"loss": 7.5417, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.94789719581604, |
|
"learning_rate": 4.981190150478797e-05, |
|
"loss": 7.3331, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.9366331100463867, |
|
"learning_rate": 4.979480164158687e-05, |
|
"loss": 7.1039, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 2.1228675842285156, |
|
"learning_rate": 4.977770177838578e-05, |
|
"loss": 6.9965, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.9855167865753174, |
|
"learning_rate": 4.976060191518468e-05, |
|
"loss": 6.8865, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.8035985231399536, |
|
"learning_rate": 4.9743502051983585e-05, |
|
"loss": 6.6322, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.8589977025985718, |
|
"learning_rate": 4.9726402188782486e-05, |
|
"loss": 6.5616, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.745139241218567, |
|
"learning_rate": 4.97093023255814e-05, |
|
"loss": 6.3479, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.266196608543396, |
|
"learning_rate": 4.969220246238031e-05, |
|
"loss": 6.2343, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.005223035812378, |
|
"learning_rate": 4.967510259917921e-05, |
|
"loss": 6.1866, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.5017377138137817, |
|
"learning_rate": 4.9658002735978115e-05, |
|
"loss": 5.972, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.6136974096298218, |
|
"learning_rate": 4.9640902872777016e-05, |
|
"loss": 6.1335, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.3865970373153687, |
|
"learning_rate": 4.962380300957592e-05, |
|
"loss": 5.944, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.283933162689209, |
|
"learning_rate": 4.960670314637483e-05, |
|
"loss": 5.8713, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.1588549613952637, |
|
"learning_rate": 4.958960328317374e-05, |
|
"loss": 5.7731, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.3081687688827515, |
|
"learning_rate": 4.9572503419972645e-05, |
|
"loss": 5.7429, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.6212745904922485, |
|
"learning_rate": 4.9555403556771546e-05, |
|
"loss": 5.481, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.709843397140503, |
|
"learning_rate": 4.953830369357045e-05, |
|
"loss": 5.4855, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.2814812660217285, |
|
"learning_rate": 4.952120383036936e-05, |
|
"loss": 5.4367, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.5539968013763428, |
|
"learning_rate": 4.950410396716826e-05, |
|
"loss": 5.2652, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.500375509262085, |
|
"learning_rate": 4.948700410396717e-05, |
|
"loss": 5.5137, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.7938899993896484, |
|
"learning_rate": 4.9469904240766076e-05, |
|
"loss": 5.4663, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.6169573068618774, |
|
"learning_rate": 4.945280437756498e-05, |
|
"loss": 5.3137, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.1904114484786987, |
|
"learning_rate": 4.943570451436389e-05, |
|
"loss": 5.1871, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.5564723014831543, |
|
"learning_rate": 4.941860465116279e-05, |
|
"loss": 5.4058, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.6187268495559692, |
|
"learning_rate": 4.94015047879617e-05, |
|
"loss": 5.1919, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.2222367525100708, |
|
"learning_rate": 4.93844049247606e-05, |
|
"loss": 5.1876, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.5898586511611938, |
|
"learning_rate": 4.936730506155951e-05, |
|
"loss": 5.1217, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.446902871131897, |
|
"learning_rate": 4.935020519835842e-05, |
|
"loss": 5.0582, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.862309217453003, |
|
"learning_rate": 4.933310533515732e-05, |
|
"loss": 4.9028, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.5455704927444458, |
|
"learning_rate": 4.931600547195623e-05, |
|
"loss": 5.0352, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.772558569908142, |
|
"learning_rate": 4.929890560875513e-05, |
|
"loss": 4.9707, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.8154480457305908, |
|
"learning_rate": 4.9281805745554036e-05, |
|
"loss": 5.0588, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.4536504745483398, |
|
"learning_rate": 4.9264705882352944e-05, |
|
"loss": 4.9997, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.6166527271270752, |
|
"learning_rate": 4.924760601915185e-05, |
|
"loss": 4.9472, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.1260979175567627, |
|
"learning_rate": 4.923050615595076e-05, |
|
"loss": 4.944, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.4778213500976562, |
|
"learning_rate": 4.921340629274966e-05, |
|
"loss": 4.8657, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.255488395690918, |
|
"learning_rate": 4.9196306429548566e-05, |
|
"loss": 4.7728, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.5698314905166626, |
|
"learning_rate": 4.917920656634747e-05, |
|
"loss": 4.9433, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.342402696609497, |
|
"learning_rate": 4.9162106703146374e-05, |
|
"loss": 4.6336, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.4427978992462158, |
|
"learning_rate": 4.914500683994528e-05, |
|
"loss": 4.7112, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.6528425216674805, |
|
"learning_rate": 4.912790697674419e-05, |
|
"loss": 4.876, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.706386923789978, |
|
"learning_rate": 4.9110807113543096e-05, |
|
"loss": 4.5971, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.7773737907409668, |
|
"learning_rate": 4.9093707250342e-05, |
|
"loss": 4.7944, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.7884222269058228, |
|
"learning_rate": 4.9076607387140904e-05, |
|
"loss": 4.5912, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.5389587879180908, |
|
"learning_rate": 4.905950752393981e-05, |
|
"loss": 4.6656, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.7006316184997559, |
|
"learning_rate": 4.904240766073871e-05, |
|
"loss": 4.6589, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.6314700841903687, |
|
"learning_rate": 4.9025307797537626e-05, |
|
"loss": 4.5036, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.7512930631637573, |
|
"learning_rate": 4.900820793433653e-05, |
|
"loss": 4.5105, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.9109033346176147, |
|
"learning_rate": 4.8991108071135434e-05, |
|
"loss": 4.4449, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.547317385673523, |
|
"learning_rate": 4.897400820793434e-05, |
|
"loss": 4.4906, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.5143060684204102, |
|
"learning_rate": 4.895690834473324e-05, |
|
"loss": 4.3545, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.6648136377334595, |
|
"learning_rate": 4.893980848153215e-05, |
|
"loss": 4.4093, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.5027951002120972, |
|
"learning_rate": 4.892270861833106e-05, |
|
"loss": 4.3444, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.9429397583007812, |
|
"learning_rate": 4.8905608755129964e-05, |
|
"loss": 4.4937, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.4009640216827393, |
|
"learning_rate": 4.888850889192887e-05, |
|
"loss": 4.3625, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.7445279359817505, |
|
"learning_rate": 4.887140902872777e-05, |
|
"loss": 4.2759, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.4381403923034668, |
|
"learning_rate": 4.885430916552668e-05, |
|
"loss": 4.574, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.5582979917526245, |
|
"learning_rate": 4.883720930232558e-05, |
|
"loss": 4.3831, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.461166501045227, |
|
"learning_rate": 4.882010943912449e-05, |
|
"loss": 4.5049, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.756549596786499, |
|
"learning_rate": 4.8803009575923394e-05, |
|
"loss": 4.1334, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.9006497859954834, |
|
"learning_rate": 4.87859097127223e-05, |
|
"loss": 4.3406, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.4551759958267212, |
|
"learning_rate": 4.876880984952121e-05, |
|
"loss": 4.4688, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.5335747003555298, |
|
"learning_rate": 4.875170998632011e-05, |
|
"loss": 4.2096, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.4802557229995728, |
|
"learning_rate": 4.873461012311902e-05, |
|
"loss": 4.0918, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.9993939399719238, |
|
"learning_rate": 4.8717510259917924e-05, |
|
"loss": 4.3636, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.706895351409912, |
|
"learning_rate": 4.8700410396716825e-05, |
|
"loss": 4.0891, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.4630484580993652, |
|
"learning_rate": 4.868331053351574e-05, |
|
"loss": 4.3254, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.4353913068771362, |
|
"learning_rate": 4.866621067031464e-05, |
|
"loss": 4.1935, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.5042275190353394, |
|
"learning_rate": 4.864911080711355e-05, |
|
"loss": 4.0913, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.792472004890442, |
|
"learning_rate": 4.863201094391245e-05, |
|
"loss": 4.2159, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.4059948921203613, |
|
"learning_rate": 4.8614911080711355e-05, |
|
"loss": 4.2934, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.8408161401748657, |
|
"learning_rate": 4.859781121751026e-05, |
|
"loss": 4.1482, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.716046690940857, |
|
"learning_rate": 4.858071135430917e-05, |
|
"loss": 4.0273, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.8415429592132568, |
|
"learning_rate": 4.856361149110808e-05, |
|
"loss": 4.0837, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.4880731105804443, |
|
"learning_rate": 4.854651162790698e-05, |
|
"loss": 4.1957, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.8150016069412231, |
|
"learning_rate": 4.8529411764705885e-05, |
|
"loss": 4.225, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.7828190326690674, |
|
"learning_rate": 4.851231190150479e-05, |
|
"loss": 4.1053, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.8549565076828003, |
|
"learning_rate": 4.849521203830369e-05, |
|
"loss": 3.8068, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.4295508861541748, |
|
"learning_rate": 4.84781121751026e-05, |
|
"loss": 3.9358, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.4277971982955933, |
|
"learning_rate": 4.846101231190151e-05, |
|
"loss": 4.1512, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.4937466382980347, |
|
"learning_rate": 4.8443912448700415e-05, |
|
"loss": 4.0162, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.3076913356781006, |
|
"learning_rate": 4.842681258549932e-05, |
|
"loss": 4.0336, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.673946738243103, |
|
"learning_rate": 4.840971272229822e-05, |
|
"loss": 4.0915, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.5978224277496338, |
|
"learning_rate": 4.839261285909713e-05, |
|
"loss": 4.137, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.214573383331299, |
|
"learning_rate": 4.837551299589603e-05, |
|
"loss": 3.8231, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.4263286590576172, |
|
"learning_rate": 4.835841313269494e-05, |
|
"loss": 3.9326, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.694931149482727, |
|
"learning_rate": 4.8341313269493845e-05, |
|
"loss": 4.0673, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.558498740196228, |
|
"learning_rate": 4.832421340629275e-05, |
|
"loss": 3.9454, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.5231866836547852, |
|
"learning_rate": 4.830711354309166e-05, |
|
"loss": 4.0214, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.6748324632644653, |
|
"learning_rate": 4.829001367989056e-05, |
|
"loss": 3.9983, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.3733506202697754, |
|
"learning_rate": 4.827291381668947e-05, |
|
"loss": 3.8526, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.6695655584335327, |
|
"learning_rate": 4.8255813953488375e-05, |
|
"loss": 3.8948, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.3915668725967407, |
|
"learning_rate": 4.8238714090287276e-05, |
|
"loss": 3.8511, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5736514329910278, |
|
"learning_rate": 4.822161422708619e-05, |
|
"loss": 3.9446, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.562687635421753, |
|
"learning_rate": 4.820451436388509e-05, |
|
"loss": 3.6682, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.4215232133865356, |
|
"learning_rate": 4.8187414500684e-05, |
|
"loss": 3.6618, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.8750267028808594, |
|
"learning_rate": 4.8170314637482905e-05, |
|
"loss": 3.8246, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5843207836151123, |
|
"learning_rate": 4.8153214774281806e-05, |
|
"loss": 3.807, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.6384755373001099, |
|
"learning_rate": 4.813611491108071e-05, |
|
"loss": 3.8329, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.646612286567688, |
|
"learning_rate": 4.811901504787962e-05, |
|
"loss": 3.9736, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.592463493347168, |
|
"learning_rate": 4.810191518467853e-05, |
|
"loss": 3.6107, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.6230803728103638, |
|
"learning_rate": 4.808481532147743e-05, |
|
"loss": 3.8141, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.7098625898361206, |
|
"learning_rate": 4.8067715458276336e-05, |
|
"loss": 3.8751, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.441146731376648, |
|
"learning_rate": 4.805061559507524e-05, |
|
"loss": 3.8461, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.4387036561965942, |
|
"learning_rate": 4.8033515731874144e-05, |
|
"loss": 3.8432, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.6620376110076904, |
|
"learning_rate": 4.801641586867305e-05, |
|
"loss": 3.7844, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5403114557266235, |
|
"learning_rate": 4.799931600547196e-05, |
|
"loss": 3.6556, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5642478466033936, |
|
"learning_rate": 4.7982216142270866e-05, |
|
"loss": 3.7526, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5027506351470947, |
|
"learning_rate": 4.796511627906977e-05, |
|
"loss": 3.7862, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5570485591888428, |
|
"learning_rate": 4.7948016415868674e-05, |
|
"loss": 3.7656, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.2708029747009277, |
|
"learning_rate": 4.793091655266758e-05, |
|
"loss": 3.8742, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5446516275405884, |
|
"learning_rate": 4.791381668946648e-05, |
|
"loss": 3.6401, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5480107069015503, |
|
"learning_rate": 4.789671682626539e-05, |
|
"loss": 3.7441, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5571659803390503, |
|
"learning_rate": 4.78796169630643e-05, |
|
"loss": 3.6849, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.9021155834197998, |
|
"learning_rate": 4.7862517099863204e-05, |
|
"loss": 3.7989, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5292832851409912, |
|
"learning_rate": 4.784541723666211e-05, |
|
"loss": 3.8129, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5750986337661743, |
|
"learning_rate": 4.782831737346101e-05, |
|
"loss": 3.648, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.6995611190795898, |
|
"learning_rate": 4.781121751025992e-05, |
|
"loss": 3.7686, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.7573764324188232, |
|
"learning_rate": 4.7794117647058826e-05, |
|
"loss": 3.6636, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5131741762161255, |
|
"learning_rate": 4.7777017783857733e-05, |
|
"loss": 3.5793, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.6838548183441162, |
|
"learning_rate": 4.775991792065664e-05, |
|
"loss": 3.6336, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.9402954578399658, |
|
"learning_rate": 4.774281805745554e-05, |
|
"loss": 3.6733, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.7752444744110107, |
|
"learning_rate": 4.772571819425445e-05, |
|
"loss": 3.8021, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.4470579624176025, |
|
"learning_rate": 4.7708618331053356e-05, |
|
"loss": 3.6526, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.5560115575790405, |
|
"learning_rate": 4.769151846785226e-05, |
|
"loss": 3.7236, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.6849740743637085, |
|
"learning_rate": 4.7674418604651164e-05, |
|
"loss": 3.7586, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.4917711019515991, |
|
"learning_rate": 4.765731874145007e-05, |
|
"loss": 3.7791, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.791801929473877, |
|
"learning_rate": 4.764021887824898e-05, |
|
"loss": 3.5659, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.531817078590393, |
|
"learning_rate": 4.7623119015047886e-05, |
|
"loss": 3.5811, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.8725793361663818, |
|
"learning_rate": 4.7606019151846787e-05, |
|
"loss": 3.7337, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.5851105451583862, |
|
"learning_rate": 4.7588919288645694e-05, |
|
"loss": 3.4899, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.4937329292297363, |
|
"learning_rate": 4.7571819425444594e-05, |
|
"loss": 3.5653, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.8102850914001465, |
|
"learning_rate": 4.75547195622435e-05, |
|
"loss": 3.517, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.4572982788085938, |
|
"learning_rate": 4.753761969904241e-05, |
|
"loss": 3.723, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.5815645456314087, |
|
"learning_rate": 4.7520519835841317e-05, |
|
"loss": 3.6615, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.6406457424163818, |
|
"learning_rate": 4.7503419972640224e-05, |
|
"loss": 3.6796, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.8512486219406128, |
|
"learning_rate": 4.7486320109439124e-05, |
|
"loss": 3.5586, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.6738507747650146, |
|
"learning_rate": 4.746922024623803e-05, |
|
"loss": 3.4724, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.766518235206604, |
|
"learning_rate": 4.745212038303693e-05, |
|
"loss": 3.7213, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.681229591369629, |
|
"learning_rate": 4.7435020519835846e-05, |
|
"loss": 3.6538, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.7900785207748413, |
|
"learning_rate": 4.7417920656634754e-05, |
|
"loss": 3.7952, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.5610637664794922, |
|
"learning_rate": 4.7400820793433654e-05, |
|
"loss": 3.5255, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.3365826606750488, |
|
"learning_rate": 4.738372093023256e-05, |
|
"loss": 3.5621, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.8946609497070312, |
|
"learning_rate": 4.736662106703146e-05, |
|
"loss": 3.5664, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.728330135345459, |
|
"learning_rate": 4.734952120383037e-05, |
|
"loss": 3.7577, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.7583847045898438, |
|
"learning_rate": 4.733242134062928e-05, |
|
"loss": 3.5722, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.9061710834503174, |
|
"learning_rate": 4.7315321477428184e-05, |
|
"loss": 3.3436, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.7074024677276611, |
|
"learning_rate": 4.729822161422709e-05, |
|
"loss": 3.5426, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.649793028831482, |
|
"learning_rate": 4.728112175102599e-05, |
|
"loss": 3.5826, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.7260462045669556, |
|
"learning_rate": 4.72640218878249e-05, |
|
"loss": 3.5353, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.745391607284546, |
|
"learning_rate": 4.724692202462381e-05, |
|
"loss": 3.3534, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.374226450920105, |
|
"learning_rate": 4.722982216142271e-05, |
|
"loss": 3.386, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.8732798099517822, |
|
"learning_rate": 4.7212722298221615e-05, |
|
"loss": 3.6073, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.4075376987457275, |
|
"learning_rate": 4.719562243502052e-05, |
|
"loss": 3.5393, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.54508638381958, |
|
"learning_rate": 4.717852257181943e-05, |
|
"loss": 3.475, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.5737498998641968, |
|
"learning_rate": 4.716142270861834e-05, |
|
"loss": 3.5497, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.8623074293136597, |
|
"learning_rate": 4.714432284541724e-05, |
|
"loss": 3.4603, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.7199251651763916, |
|
"learning_rate": 4.7127222982216145e-05, |
|
"loss": 3.5801, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.5866843461990356, |
|
"learning_rate": 4.7110123119015045e-05, |
|
"loss": 3.3454, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.9907779693603516, |
|
"learning_rate": 4.709302325581396e-05, |
|
"loss": 3.6247, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.3720687627792358, |
|
"learning_rate": 4.707592339261287e-05, |
|
"loss": 3.3614, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.739660620689392, |
|
"learning_rate": 4.705882352941177e-05, |
|
"loss": 3.4211, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.6425236463546753, |
|
"learning_rate": 4.7041723666210675e-05, |
|
"loss": 3.423, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.5457091331481934, |
|
"learning_rate": 4.7024623803009575e-05, |
|
"loss": 3.3258, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.5979949235916138, |
|
"learning_rate": 4.700752393980848e-05, |
|
"loss": 3.2838, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.6761040687561035, |
|
"learning_rate": 4.699042407660739e-05, |
|
"loss": 3.334, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.552573323249817, |
|
"learning_rate": 4.69733242134063e-05, |
|
"loss": 3.5355, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.6743354797363281, |
|
"learning_rate": 4.6956224350205205e-05, |
|
"loss": 3.4076, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.555662155151367, |
|
"learning_rate": 4.6939124487004105e-05, |
|
"loss": 3.7446, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.5942860841751099, |
|
"learning_rate": 4.692202462380301e-05, |
|
"loss": 3.553, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.9089189767837524, |
|
"learning_rate": 4.690492476060191e-05, |
|
"loss": 3.3584, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.0768373012542725, |
|
"learning_rate": 4.688782489740082e-05, |
|
"loss": 3.4295, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.3060362339019775, |
|
"learning_rate": 4.687072503419973e-05, |
|
"loss": 3.415, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.9307421445846558, |
|
"learning_rate": 4.6853625170998635e-05, |
|
"loss": 3.4901, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.7142446041107178, |
|
"learning_rate": 4.683652530779754e-05, |
|
"loss": 3.6096, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.7756657600402832, |
|
"learning_rate": 4.681942544459644e-05, |
|
"loss": 3.4534, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.5459442138671875, |
|
"learning_rate": 4.680232558139535e-05, |
|
"loss": 3.3813, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.1961963176727295, |
|
"learning_rate": 4.678522571819426e-05, |
|
"loss": 3.3467, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.7320613861083984, |
|
"learning_rate": 4.676812585499316e-05, |
|
"loss": 3.3927, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.6255303621292114, |
|
"learning_rate": 4.6751025991792066e-05, |
|
"loss": 3.4644, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.5142388343811035, |
|
"learning_rate": 4.673392612859097e-05, |
|
"loss": 3.4657, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.5093744993209839, |
|
"learning_rate": 4.671682626538988e-05, |
|
"loss": 3.3144, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.4730446338653564, |
|
"learning_rate": 4.669972640218879e-05, |
|
"loss": 3.3382, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.7650116682052612, |
|
"learning_rate": 4.668262653898769e-05, |
|
"loss": 3.4519, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.637071132659912, |
|
"learning_rate": 4.6665526675786596e-05, |
|
"loss": 3.3723, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.4582788944244385, |
|
"learning_rate": 4.6648426812585496e-05, |
|
"loss": 3.4839, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.4654922485351562, |
|
"learning_rate": 4.663132694938441e-05, |
|
"loss": 3.1257, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.4668978452682495, |
|
"learning_rate": 4.661422708618332e-05, |
|
"loss": 3.2506, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.5190536975860596, |
|
"learning_rate": 4.659712722298222e-05, |
|
"loss": 3.4055, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.9057687520980835, |
|
"learning_rate": 4.6580027359781126e-05, |
|
"loss": 3.4804, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.4573813676834106, |
|
"learning_rate": 4.6562927496580026e-05, |
|
"loss": 3.3792, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.3757171630859375, |
|
"learning_rate": 4.6545827633378933e-05, |
|
"loss": 3.4649, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.7024015188217163, |
|
"learning_rate": 4.652872777017784e-05, |
|
"loss": 3.3149, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.5424153804779053, |
|
"learning_rate": 4.651162790697675e-05, |
|
"loss": 3.3, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.4667370319366455, |
|
"learning_rate": 4.6494528043775655e-05, |
|
"loss": 3.2845, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.5216193199157715, |
|
"learning_rate": 4.6477428180574556e-05, |
|
"loss": 3.3062, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.5499827861785889, |
|
"learning_rate": 4.6460328317373463e-05, |
|
"loss": 3.263, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.644148588180542, |
|
"learning_rate": 4.6443228454172364e-05, |
|
"loss": 3.4374, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.6420782804489136, |
|
"learning_rate": 4.642612859097127e-05, |
|
"loss": 3.4049, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.4325385093688965, |
|
"learning_rate": 4.640902872777018e-05, |
|
"loss": 3.3499, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.6447879076004028, |
|
"learning_rate": 4.6391928864569086e-05, |
|
"loss": 3.4098, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.4675137996673584, |
|
"learning_rate": 4.637482900136799e-05, |
|
"loss": 3.2778, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.5847750902175903, |
|
"learning_rate": 4.6357729138166894e-05, |
|
"loss": 3.1695, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.514772653579712, |
|
"learning_rate": 4.63406292749658e-05, |
|
"loss": 3.1556, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.511367917060852, |
|
"learning_rate": 4.632352941176471e-05, |
|
"loss": 3.3307, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.7640196084976196, |
|
"learning_rate": 4.630642954856361e-05, |
|
"loss": 3.2985, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.7074265480041504, |
|
"learning_rate": 4.628932968536252e-05, |
|
"loss": 3.2598, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.52577805519104, |
|
"learning_rate": 4.6272229822161424e-05, |
|
"loss": 3.0872, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.6908501386642456, |
|
"learning_rate": 4.625512995896033e-05, |
|
"loss": 3.1931, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.6583012342453003, |
|
"learning_rate": 4.623803009575924e-05, |
|
"loss": 3.1849, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.4067027568817139, |
|
"learning_rate": 4.622093023255814e-05, |
|
"loss": 3.1912, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.388487458229065, |
|
"learning_rate": 4.6203830369357046e-05, |
|
"loss": 3.2845, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.7910948991775513, |
|
"learning_rate": 4.6186730506155954e-05, |
|
"loss": 3.1146, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.4454721212387085, |
|
"learning_rate": 4.616963064295486e-05, |
|
"loss": 3.2114, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.4809476137161255, |
|
"learning_rate": 4.615253077975377e-05, |
|
"loss": 3.3329, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.6668082475662231, |
|
"learning_rate": 4.613543091655267e-05, |
|
"loss": 3.2517, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.72063410282135, |
|
"learning_rate": 4.6118331053351576e-05, |
|
"loss": 3.0895, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.5995526313781738, |
|
"learning_rate": 4.610123119015048e-05, |
|
"loss": 3.2703, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.453747034072876, |
|
"learning_rate": 4.6084131326949384e-05, |
|
"loss": 3.3619, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.7982878684997559, |
|
"learning_rate": 4.606703146374829e-05, |
|
"loss": 3.2657, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.3937926292419434, |
|
"learning_rate": 4.60499316005472e-05, |
|
"loss": 3.1727, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.5705368518829346, |
|
"learning_rate": 4.6032831737346106e-05, |
|
"loss": 3.0657, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.6821500062942505, |
|
"learning_rate": 4.601573187414501e-05, |
|
"loss": 3.3191, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.5288244485855103, |
|
"learning_rate": 4.5998632010943914e-05, |
|
"loss": 3.0876, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.7748781442642212, |
|
"learning_rate": 4.598153214774282e-05, |
|
"loss": 3.18, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.7024669647216797, |
|
"learning_rate": 4.596443228454172e-05, |
|
"loss": 3.3601, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.7017521858215332, |
|
"learning_rate": 4.5947332421340636e-05, |
|
"loss": 2.9751, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.7405025959014893, |
|
"learning_rate": 4.593023255813954e-05, |
|
"loss": 3.2073, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.6610547304153442, |
|
"learning_rate": 4.5913132694938444e-05, |
|
"loss": 3.2363, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.4785616397857666, |
|
"learning_rate": 4.5896032831737345e-05, |
|
"loss": 3.3254, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.453533411026001, |
|
"learning_rate": 4.587893296853625e-05, |
|
"loss": 3.21, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.4303959608078003, |
|
"learning_rate": 4.586183310533516e-05, |
|
"loss": 3.3226, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.6272943019866943, |
|
"learning_rate": 4.584473324213407e-05, |
|
"loss": 3.5944, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.823716402053833, |
|
"learning_rate": 4.5827633378932974e-05, |
|
"loss": 3.234, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.7006231546401978, |
|
"learning_rate": 4.5810533515731875e-05, |
|
"loss": 3.0123, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.3645117282867432, |
|
"learning_rate": 4.579343365253078e-05, |
|
"loss": 3.2243, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.7724872827529907, |
|
"learning_rate": 4.577633378932969e-05, |
|
"loss": 3.1705, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.6377599239349365, |
|
"learning_rate": 4.575923392612859e-05, |
|
"loss": 3.2647, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.4828346967697144, |
|
"learning_rate": 4.57421340629275e-05, |
|
"loss": 3.0613, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.515553593635559, |
|
"learning_rate": 4.5725034199726405e-05, |
|
"loss": 3.0368, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.498844861984253, |
|
"learning_rate": 4.570793433652531e-05, |
|
"loss": 3.102, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.4339860677719116, |
|
"learning_rate": 4.569083447332422e-05, |
|
"loss": 3.1874, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.5611178874969482, |
|
"learning_rate": 4.567373461012312e-05, |
|
"loss": 3.075, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.6089402437210083, |
|
"learning_rate": 4.565663474692203e-05, |
|
"loss": 3.1544, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.8874709606170654, |
|
"learning_rate": 4.563953488372093e-05, |
|
"loss": 3.2806, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.5624805688858032, |
|
"learning_rate": 4.5622435020519835e-05, |
|
"loss": 3.1442, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.968634009361267, |
|
"learning_rate": 4.560533515731875e-05, |
|
"loss": 3.2743, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.4064279794692993, |
|
"learning_rate": 4.558823529411765e-05, |
|
"loss": 3.0726, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.577048897743225, |
|
"learning_rate": 4.557113543091656e-05, |
|
"loss": 3.1129, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.548323392868042, |
|
"learning_rate": 4.555403556771546e-05, |
|
"loss": 3.144, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.5542961359024048, |
|
"learning_rate": 4.5536935704514365e-05, |
|
"loss": 3.182, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.5506356954574585, |
|
"learning_rate": 4.551983584131327e-05, |
|
"loss": 3.2186, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.6962274312973022, |
|
"learning_rate": 4.550273597811218e-05, |
|
"loss": 3.0385, |
|
"step": 1052 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.4382833242416382, |
|
"learning_rate": 4.548563611491109e-05, |
|
"loss": 3.1268, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.269392728805542, |
|
"learning_rate": 4.546853625170999e-05, |
|
"loss": 3.2579, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.5414201021194458, |
|
"learning_rate": 4.5451436388508895e-05, |
|
"loss": 3.1501, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.936946153640747, |
|
"learning_rate": 4.54343365253078e-05, |
|
"loss": 3.1711, |
|
"step": 1068 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.7430529594421387, |
|
"learning_rate": 4.54172366621067e-05, |
|
"loss": 3.1299, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.3500404357910156, |
|
"learning_rate": 4.540013679890561e-05, |
|
"loss": 3.0935, |
|
"step": 1076 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.5768132209777832, |
|
"learning_rate": 4.538303693570452e-05, |
|
"loss": 3.1091, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.4829493761062622, |
|
"learning_rate": 4.5365937072503425e-05, |
|
"loss": 3.0862, |
|
"step": 1084 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.5560483932495117, |
|
"learning_rate": 4.5348837209302326e-05, |
|
"loss": 3.1024, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.6295199394226074, |
|
"learning_rate": 4.533173734610123e-05, |
|
"loss": 2.9539, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.1724135875701904, |
|
"learning_rate": 4.531463748290014e-05, |
|
"loss": 3.1936, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.7400479316711426, |
|
"learning_rate": 4.529753761969904e-05, |
|
"loss": 3.1873, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.554962396621704, |
|
"learning_rate": 4.528043775649795e-05, |
|
"loss": 3.0643, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.3722119331359863, |
|
"learning_rate": 4.5263337893296855e-05, |
|
"loss": 3.0088, |
|
"step": 1108 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.395577311515808, |
|
"learning_rate": 4.524623803009576e-05, |
|
"loss": 3.0584, |
|
"step": 1112 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.509710431098938, |
|
"learning_rate": 4.522913816689467e-05, |
|
"loss": 3.2276, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.5213786363601685, |
|
"learning_rate": 4.521203830369357e-05, |
|
"loss": 2.9433, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.51616632938385, |
|
"learning_rate": 4.519493844049248e-05, |
|
"loss": 2.9902, |
|
"step": 1124 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.5429805517196655, |
|
"learning_rate": 4.517783857729138e-05, |
|
"loss": 3.0884, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.4714343547821045, |
|
"learning_rate": 4.5160738714090286e-05, |
|
"loss": 3.0322, |
|
"step": 1132 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.4637647867202759, |
|
"learning_rate": 4.51436388508892e-05, |
|
"loss": 3.0352, |
|
"step": 1136 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.3424347639083862, |
|
"learning_rate": 4.51265389876881e-05, |
|
"loss": 3.0866, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.689968228340149, |
|
"learning_rate": 4.510943912448701e-05, |
|
"loss": 3.0002, |
|
"step": 1144 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.9935293197631836, |
|
"learning_rate": 4.509233926128591e-05, |
|
"loss": 2.9518, |
|
"step": 1148 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.5461673736572266, |
|
"learning_rate": 4.5075239398084816e-05, |
|
"loss": 3.0394, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.5787737369537354, |
|
"learning_rate": 4.505813953488372e-05, |
|
"loss": 3.0934, |
|
"step": 1156 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.4527121782302856, |
|
"learning_rate": 4.504103967168263e-05, |
|
"loss": 2.9734, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.6236604452133179, |
|
"learning_rate": 4.502393980848154e-05, |
|
"loss": 3.021, |
|
"step": 1164 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.5616329908370972, |
|
"learning_rate": 4.500683994528044e-05, |
|
"loss": 3.0602, |
|
"step": 1168 |
|
} |
|
], |
|
"logging_steps": 4, |
|
"max_steps": 11696, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1170, |
|
"total_flos": 1.585392327327744e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|