|
{ |
|
"best_metric": NaN, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-200", |
|
"epoch": 0.5675368898978433, |
|
"eval_steps": 200, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0005675368898978433, |
|
"grad_norm": 4.242635250091553, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0895, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0011350737797956867, |
|
"grad_norm": 3.8852005004882812, |
|
"learning_rate": 2e-05, |
|
"loss": 2.5238, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.00170261066969353, |
|
"grad_norm": 4.170104026794434, |
|
"learning_rate": 3e-05, |
|
"loss": 2.7007, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0022701475595913734, |
|
"grad_norm": 4.369339942932129, |
|
"learning_rate": 4e-05, |
|
"loss": 2.7838, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0028376844494892167, |
|
"grad_norm": 4.925931453704834, |
|
"learning_rate": 5e-05, |
|
"loss": 2.8797, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00340522133938706, |
|
"grad_norm": 5.264482021331787, |
|
"learning_rate": 6e-05, |
|
"loss": 2.971, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.003972758229284903, |
|
"grad_norm": 5.356241703033447, |
|
"learning_rate": 7e-05, |
|
"loss": 3.1023, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.004540295119182747, |
|
"grad_norm": 5.722445964813232, |
|
"learning_rate": 8e-05, |
|
"loss": 2.7219, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.00510783200908059, |
|
"grad_norm": 10.222125053405762, |
|
"learning_rate": 9e-05, |
|
"loss": 2.7866, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0056753688989784334, |
|
"grad_norm": 6.646042346954346, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4618, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.006242905788876277, |
|
"grad_norm": 8.881997108459473, |
|
"learning_rate": 9.999974825027756e-05, |
|
"loss": 2.2149, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.00681044267877412, |
|
"grad_norm": 7.798340797424316, |
|
"learning_rate": 9.999899300364532e-05, |
|
"loss": 1.9245, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0073779795686719635, |
|
"grad_norm": 8.679018020629883, |
|
"learning_rate": 9.999773426770865e-05, |
|
"loss": 1.4955, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.007945516458569807, |
|
"grad_norm": 7.172414302825928, |
|
"learning_rate": 9.999597205514297e-05, |
|
"loss": 1.0912, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.00851305334846765, |
|
"grad_norm": 9.331069946289062, |
|
"learning_rate": 9.999370638369377e-05, |
|
"loss": 0.9289, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.009080590238365494, |
|
"grad_norm": 9.002270698547363, |
|
"learning_rate": 9.99909372761763e-05, |
|
"loss": 0.657, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.009648127128263337, |
|
"grad_norm": 9.825034141540527, |
|
"learning_rate": 9.998766476047547e-05, |
|
"loss": 0.6728, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.01021566401816118, |
|
"grad_norm": 14.936773300170898, |
|
"learning_rate": 9.998388886954547e-05, |
|
"loss": 0.6303, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.010783200908059024, |
|
"grad_norm": 14.347192764282227, |
|
"learning_rate": 9.997960964140947e-05, |
|
"loss": 0.6379, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.011350737797956867, |
|
"grad_norm": 12.535562515258789, |
|
"learning_rate": 9.997482711915927e-05, |
|
"loss": 0.5843, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01191827468785471, |
|
"grad_norm": 12.070816993713379, |
|
"learning_rate": 9.99695413509548e-05, |
|
"loss": 0.5889, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.012485811577752554, |
|
"grad_norm": 10.580400466918945, |
|
"learning_rate": 9.996375239002369e-05, |
|
"loss": 0.601, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.013053348467650397, |
|
"grad_norm": 8.476105690002441, |
|
"learning_rate": 9.995746029466071e-05, |
|
"loss": 0.6079, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.01362088535754824, |
|
"grad_norm": 9.052068710327148, |
|
"learning_rate": 9.99506651282272e-05, |
|
"loss": 0.6068, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.014188422247446084, |
|
"grad_norm": 7.772611141204834, |
|
"learning_rate": 9.99433669591504e-05, |
|
"loss": 0.7142, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.014755959137343927, |
|
"grad_norm": 5.843691349029541, |
|
"learning_rate": 9.993556586092281e-05, |
|
"loss": 0.4146, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.01532349602724177, |
|
"grad_norm": 6.3077392578125, |
|
"learning_rate": 9.992726191210138e-05, |
|
"loss": 0.4692, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.015891032917139614, |
|
"grad_norm": 5.899518013000488, |
|
"learning_rate": 9.991845519630678e-05, |
|
"loss": 0.3898, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.016458569807037457, |
|
"grad_norm": 7.513155460357666, |
|
"learning_rate": 9.990914580222257e-05, |
|
"loss": 0.4173, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0170261066969353, |
|
"grad_norm": 7.608649253845215, |
|
"learning_rate": 9.989933382359422e-05, |
|
"loss": 0.3552, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.017593643586833144, |
|
"grad_norm": 6.533905029296875, |
|
"learning_rate": 9.988901935922826e-05, |
|
"loss": 0.3371, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.018161180476730987, |
|
"grad_norm": 5.706025123596191, |
|
"learning_rate": 9.987820251299122e-05, |
|
"loss": 0.2471, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.01872871736662883, |
|
"grad_norm": 4.540302276611328, |
|
"learning_rate": 9.986688339380862e-05, |
|
"loss": 0.2083, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.019296254256526674, |
|
"grad_norm": 3.610647439956665, |
|
"learning_rate": 9.985506211566388e-05, |
|
"loss": 0.1795, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.019863791146424517, |
|
"grad_norm": 5.012790203094482, |
|
"learning_rate": 9.984273879759713e-05, |
|
"loss": 0.2031, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.02043132803632236, |
|
"grad_norm": 3.4943277835845947, |
|
"learning_rate": 9.982991356370404e-05, |
|
"loss": 0.1355, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.020998864926220204, |
|
"grad_norm": 4.174023628234863, |
|
"learning_rate": 9.981658654313457e-05, |
|
"loss": 0.1841, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.021566401816118047, |
|
"grad_norm": 2.6815176010131836, |
|
"learning_rate": 9.98027578700917e-05, |
|
"loss": 0.1052, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.02213393870601589, |
|
"grad_norm": 1.8553539514541626, |
|
"learning_rate": 9.978842768382998e-05, |
|
"loss": 0.0862, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.022701475595913734, |
|
"grad_norm": 2.5569417476654053, |
|
"learning_rate": 9.977359612865423e-05, |
|
"loss": 0.0708, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.023269012485811577, |
|
"grad_norm": 4.1771979331970215, |
|
"learning_rate": 9.975826335391808e-05, |
|
"loss": 0.0879, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.02383654937570942, |
|
"grad_norm": 3.163653612136841, |
|
"learning_rate": 9.974242951402235e-05, |
|
"loss": 0.1033, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.024404086265607264, |
|
"grad_norm": 4.081658840179443, |
|
"learning_rate": 9.972609476841367e-05, |
|
"loss": 0.1103, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.024971623155505107, |
|
"grad_norm": 5.564581871032715, |
|
"learning_rate": 9.970925928158274e-05, |
|
"loss": 0.1136, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.02553916004540295, |
|
"grad_norm": 86.91041564941406, |
|
"learning_rate": 9.969192322306271e-05, |
|
"loss": 1.3437, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.026106696935300794, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.967408676742751e-05, |
|
"loss": 0.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.026674233825198637, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.965575009429006e-05, |
|
"loss": 0.0, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.02724177071509648, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.963691338830044e-05, |
|
"loss": 0.0, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.027809307604994324, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.961757683914406e-05, |
|
"loss": 0.0, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.028376844494892167, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.959774064153977e-05, |
|
"loss": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02894438138479001, |
|
"grad_norm": 13.452136993408203, |
|
"learning_rate": 9.957740499523787e-05, |
|
"loss": 1.2541, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.029511918274687854, |
|
"grad_norm": 10.72777271270752, |
|
"learning_rate": 9.955657010501806e-05, |
|
"loss": 1.1451, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.030079455164585697, |
|
"grad_norm": 5.7461466789245605, |
|
"learning_rate": 9.953523618068749e-05, |
|
"loss": 0.7454, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.03064699205448354, |
|
"grad_norm": 4.224442481994629, |
|
"learning_rate": 9.951340343707852e-05, |
|
"loss": 0.6189, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.031214528944381384, |
|
"grad_norm": 5.231300354003906, |
|
"learning_rate": 9.949107209404665e-05, |
|
"loss": 0.5754, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.03178206583427923, |
|
"grad_norm": 4.194768905639648, |
|
"learning_rate": 9.946824237646824e-05, |
|
"loss": 0.4918, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.03234960272417707, |
|
"grad_norm": 3.1545193195343018, |
|
"learning_rate": 9.944491451423828e-05, |
|
"loss": 0.3889, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.032917139614074914, |
|
"grad_norm": 4.240617275238037, |
|
"learning_rate": 9.942108874226811e-05, |
|
"loss": 0.383, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.03348467650397276, |
|
"grad_norm": 3.6889731884002686, |
|
"learning_rate": 9.939676530048301e-05, |
|
"loss": 0.3308, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.0340522133938706, |
|
"grad_norm": 7.476080894470215, |
|
"learning_rate": 9.937194443381972e-05, |
|
"loss": 0.2755, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.034619750283768444, |
|
"grad_norm": 3.6329305171966553, |
|
"learning_rate": 9.934662639222412e-05, |
|
"loss": 0.2908, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.03518728717366629, |
|
"grad_norm": 3.839686870574951, |
|
"learning_rate": 9.93208114306486e-05, |
|
"loss": 0.317, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.03575482406356413, |
|
"grad_norm": 2.5644800662994385, |
|
"learning_rate": 9.929449980904952e-05, |
|
"loss": 0.2524, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.036322360953461974, |
|
"grad_norm": 2.268704652786255, |
|
"learning_rate": 9.926769179238466e-05, |
|
"loss": 0.2686, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.03688989784335982, |
|
"grad_norm": 3.147296905517578, |
|
"learning_rate": 9.924038765061042e-05, |
|
"loss": 0.2311, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.03745743473325766, |
|
"grad_norm": 2.1133084297180176, |
|
"learning_rate": 9.921258765867919e-05, |
|
"loss": 0.2072, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.038024971623155504, |
|
"grad_norm": 1.422222375869751, |
|
"learning_rate": 9.918429209653662e-05, |
|
"loss": 0.2128, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.03859250851305335, |
|
"grad_norm": 1.157293438911438, |
|
"learning_rate": 9.915550124911866e-05, |
|
"loss": 0.1427, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.03916004540295119, |
|
"grad_norm": 1.4492322206497192, |
|
"learning_rate": 9.912621540634887e-05, |
|
"loss": 0.1704, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.039727582292849034, |
|
"grad_norm": 1.1632133722305298, |
|
"learning_rate": 9.909643486313533e-05, |
|
"loss": 0.1299, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04029511918274688, |
|
"grad_norm": 3.0194833278656006, |
|
"learning_rate": 9.90661599193678e-05, |
|
"loss": 0.1088, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.04086265607264472, |
|
"grad_norm": 1.3391764163970947, |
|
"learning_rate": 9.903539087991462e-05, |
|
"loss": 0.1105, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.041430192962542564, |
|
"grad_norm": 1.761568307876587, |
|
"learning_rate": 9.900412805461967e-05, |
|
"loss": 0.1449, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.04199772985244041, |
|
"grad_norm": 1.8852134943008423, |
|
"learning_rate": 9.897237175829926e-05, |
|
"loss": 0.1381, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.04256526674233825, |
|
"grad_norm": 1.8318125009536743, |
|
"learning_rate": 9.894012231073894e-05, |
|
"loss": 0.0953, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.043132803632236094, |
|
"grad_norm": 1.9700140953063965, |
|
"learning_rate": 9.890738003669029e-05, |
|
"loss": 0.1002, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.04370034052213394, |
|
"grad_norm": 2.201488971710205, |
|
"learning_rate": 9.887414526586763e-05, |
|
"loss": 0.0935, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.04426787741203178, |
|
"grad_norm": 2.130439281463623, |
|
"learning_rate": 9.884041833294476e-05, |
|
"loss": 0.1128, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.044835414301929624, |
|
"grad_norm": 1.2699000835418701, |
|
"learning_rate": 9.880619957755151e-05, |
|
"loss": 0.0704, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.04540295119182747, |
|
"grad_norm": 1.2597098350524902, |
|
"learning_rate": 9.877148934427037e-05, |
|
"loss": 0.0913, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04597048808172531, |
|
"grad_norm": 1.9284838438034058, |
|
"learning_rate": 9.873628798263296e-05, |
|
"loss": 0.1002, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.046538024971623154, |
|
"grad_norm": 2.915311574935913, |
|
"learning_rate": 9.870059584711668e-05, |
|
"loss": 0.0878, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.047105561861521, |
|
"grad_norm": 2.436211347579956, |
|
"learning_rate": 9.866441329714088e-05, |
|
"loss": 0.0733, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.04767309875141884, |
|
"grad_norm": 1.696310043334961, |
|
"learning_rate": 9.862774069706346e-05, |
|
"loss": 0.0852, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.048240635641316684, |
|
"grad_norm": 0.9896736145019531, |
|
"learning_rate": 9.859057841617709e-05, |
|
"loss": 0.0163, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.04880817253121453, |
|
"grad_norm": 1.1014189720153809, |
|
"learning_rate": 9.855292682870551e-05, |
|
"loss": 0.0783, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.04937570942111237, |
|
"grad_norm": 1.2181459665298462, |
|
"learning_rate": 9.851478631379982e-05, |
|
"loss": 0.0454, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.049943246311010214, |
|
"grad_norm": 1.8675448894500732, |
|
"learning_rate": 9.847615725553456e-05, |
|
"loss": 0.062, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.05051078320090806, |
|
"grad_norm": 2.457249641418457, |
|
"learning_rate": 9.843704004290392e-05, |
|
"loss": 0.0158, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.0510783200908059, |
|
"grad_norm": 1.7994331121444702, |
|
"learning_rate": 9.839743506981782e-05, |
|
"loss": 0.0463, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.051645856980703744, |
|
"grad_norm": 1.6931531429290771, |
|
"learning_rate": 9.835734273509786e-05, |
|
"loss": 0.0308, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.05221339387060159, |
|
"grad_norm": 0.2792555093765259, |
|
"learning_rate": 9.831676344247342e-05, |
|
"loss": 0.0043, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.05278093076049943, |
|
"grad_norm": 1.1043611764907837, |
|
"learning_rate": 9.827569760057755e-05, |
|
"loss": 0.0266, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.053348467650397274, |
|
"grad_norm": 2.844864845275879, |
|
"learning_rate": 9.82341456229428e-05, |
|
"loss": 0.0724, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.05391600454029512, |
|
"grad_norm": 47.320552825927734, |
|
"learning_rate": 9.819210792799712e-05, |
|
"loss": 0.9477, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.05448354143019296, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.814958493905963e-05, |
|
"loss": 0.0, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.055051078320090804, |
|
"grad_norm": 8.552749633789062, |
|
"learning_rate": 9.810657708433637e-05, |
|
"loss": 0.0776, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.05561861520998865, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.806308479691595e-05, |
|
"loss": 0.0, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.05618615209988649, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.801910851476523e-05, |
|
"loss": 0.0, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.056753688989784334, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.797464868072488e-05, |
|
"loss": 0.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05732122587968218, |
|
"grad_norm": 7.161002159118652, |
|
"learning_rate": 9.792970574250493e-05, |
|
"loss": 0.57, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.05788876276958002, |
|
"grad_norm": 5.865129470825195, |
|
"learning_rate": 9.788428015268027e-05, |
|
"loss": 0.4715, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.058456299659477864, |
|
"grad_norm": 4.487433433532715, |
|
"learning_rate": 9.783837236868609e-05, |
|
"loss": 0.3068, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.05902383654937571, |
|
"grad_norm": 4.0235209465026855, |
|
"learning_rate": 9.779198285281325e-05, |
|
"loss": 0.233, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.05959137343927355, |
|
"grad_norm": 5.06863260269165, |
|
"learning_rate": 9.77451120722037e-05, |
|
"loss": 0.2298, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.060158910329171394, |
|
"grad_norm": 2.5704989433288574, |
|
"learning_rate": 9.769776049884563e-05, |
|
"loss": 0.156, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.06072644721906924, |
|
"grad_norm": 3.1615333557128906, |
|
"learning_rate": 9.764992860956889e-05, |
|
"loss": 0.2069, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.06129398410896708, |
|
"grad_norm": 2.513225555419922, |
|
"learning_rate": 9.760161688604008e-05, |
|
"loss": 0.129, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.061861520998864925, |
|
"grad_norm": 2.5690817832946777, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 0.1146, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.06242905788876277, |
|
"grad_norm": 3.713148593902588, |
|
"learning_rate": 9.750355588704727e-05, |
|
"loss": 0.1187, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.06299659477866061, |
|
"grad_norm": 2.496454954147339, |
|
"learning_rate": 9.745380759905647e-05, |
|
"loss": 0.1376, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.06356413166855845, |
|
"grad_norm": 1.6230063438415527, |
|
"learning_rate": 9.740358145174998e-05, |
|
"loss": 0.0658, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.0641316685584563, |
|
"grad_norm": 1.4402037858963013, |
|
"learning_rate": 9.735287795090455e-05, |
|
"loss": 0.067, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.06469920544835414, |
|
"grad_norm": 1.7551097869873047, |
|
"learning_rate": 9.730169760710386e-05, |
|
"loss": 0.0546, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.06526674233825198, |
|
"grad_norm": 9.121005058288574, |
|
"learning_rate": 9.725004093573342e-05, |
|
"loss": 0.0719, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.06583427922814983, |
|
"grad_norm": 1.0054181814193726, |
|
"learning_rate": 9.719790845697533e-05, |
|
"loss": 0.04, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.06640181611804767, |
|
"grad_norm": 1.0197290182113647, |
|
"learning_rate": 9.714530069580309e-05, |
|
"loss": 0.0465, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.06696935300794551, |
|
"grad_norm": 1.8132661581039429, |
|
"learning_rate": 9.709221818197624e-05, |
|
"loss": 0.0315, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.06753688989784336, |
|
"grad_norm": 1.3299674987792969, |
|
"learning_rate": 9.703866145003511e-05, |
|
"loss": 0.0777, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.0681044267877412, |
|
"grad_norm": 0.7436254024505615, |
|
"learning_rate": 9.698463103929542e-05, |
|
"loss": 0.0274, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06867196367763904, |
|
"grad_norm": 0.8951746821403503, |
|
"learning_rate": 9.693012749384279e-05, |
|
"loss": 0.0349, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.06923950056753689, |
|
"grad_norm": 1.018670678138733, |
|
"learning_rate": 9.687515136252731e-05, |
|
"loss": 0.0209, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.06980703745743473, |
|
"grad_norm": 0.5818009972572327, |
|
"learning_rate": 9.681970319895803e-05, |
|
"loss": 0.0151, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.07037457434733257, |
|
"grad_norm": 0.49394237995147705, |
|
"learning_rate": 9.676378356149734e-05, |
|
"loss": 0.0149, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.07094211123723042, |
|
"grad_norm": 1.2261513471603394, |
|
"learning_rate": 9.670739301325534e-05, |
|
"loss": 0.0279, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.07150964812712826, |
|
"grad_norm": 0.8551621437072754, |
|
"learning_rate": 9.665053212208426e-05, |
|
"loss": 0.0307, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.0720771850170261, |
|
"grad_norm": 0.7473931312561035, |
|
"learning_rate": 9.659320146057262e-05, |
|
"loss": 0.0258, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.07264472190692395, |
|
"grad_norm": 1.058668851852417, |
|
"learning_rate": 9.653540160603956e-05, |
|
"loss": 0.0333, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.07321225879682179, |
|
"grad_norm": 0.4712604284286499, |
|
"learning_rate": 9.647713314052896e-05, |
|
"loss": 0.0194, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.07377979568671963, |
|
"grad_norm": 0.554572343826294, |
|
"learning_rate": 9.641839665080363e-05, |
|
"loss": 0.012, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.07434733257661748, |
|
"grad_norm": 0.6603901386260986, |
|
"learning_rate": 9.635919272833938e-05, |
|
"loss": 0.0223, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.07491486946651532, |
|
"grad_norm": 0.8694583773612976, |
|
"learning_rate": 9.629952196931901e-05, |
|
"loss": 0.0455, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.07548240635641316, |
|
"grad_norm": 1.202289342880249, |
|
"learning_rate": 9.623938497462646e-05, |
|
"loss": 0.0438, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.07604994324631101, |
|
"grad_norm": 3.9090631008148193, |
|
"learning_rate": 9.617878234984055e-05, |
|
"loss": 0.0531, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.07661748013620885, |
|
"grad_norm": 1.0893713235855103, |
|
"learning_rate": 9.611771470522908e-05, |
|
"loss": 0.0357, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.0771850170261067, |
|
"grad_norm": 0.5505892634391785, |
|
"learning_rate": 9.60561826557425e-05, |
|
"loss": 0.0116, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.07775255391600454, |
|
"grad_norm": 3.2899348735809326, |
|
"learning_rate": 9.599418682100793e-05, |
|
"loss": 0.028, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.07832009080590238, |
|
"grad_norm": 1.5124907493591309, |
|
"learning_rate": 9.593172782532268e-05, |
|
"loss": 0.0374, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.07888762769580022, |
|
"grad_norm": 1.1038376092910767, |
|
"learning_rate": 9.586880629764817e-05, |
|
"loss": 0.0433, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.07945516458569807, |
|
"grad_norm": 0.2565675675868988, |
|
"learning_rate": 9.580542287160348e-05, |
|
"loss": 0.0037, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.08002270147559591, |
|
"grad_norm": 1.869051456451416, |
|
"learning_rate": 9.574157818545901e-05, |
|
"loss": 0.0232, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.08059023836549375, |
|
"grad_norm": 0.21874603629112244, |
|
"learning_rate": 9.567727288213005e-05, |
|
"loss": 0.004, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.0811577752553916, |
|
"grad_norm": 0.47795429825782776, |
|
"learning_rate": 9.561250760917027e-05, |
|
"loss": 0.0062, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.08172531214528944, |
|
"grad_norm": 1.0321537256240845, |
|
"learning_rate": 9.554728301876526e-05, |
|
"loss": 0.0152, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.08229284903518728, |
|
"grad_norm": 10.375513076782227, |
|
"learning_rate": 9.548159976772592e-05, |
|
"loss": 0.5914, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.08286038592508513, |
|
"grad_norm": 10.878011703491211, |
|
"learning_rate": 9.541545851748186e-05, |
|
"loss": 0.0679, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.08342792281498297, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.534885993407474e-05, |
|
"loss": 0.0, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.08399545970488081, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.528180468815155e-05, |
|
"loss": 0.0, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.08456299659477866, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.521429345495787e-05, |
|
"loss": 0.0, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.0851305334846765, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.514632691433107e-05, |
|
"loss": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.08569807037457434, |
|
"grad_norm": 5.398491859436035, |
|
"learning_rate": 9.507790575069347e-05, |
|
"loss": 0.2644, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.08626560726447219, |
|
"grad_norm": 3.6026408672332764, |
|
"learning_rate": 9.50090306530454e-05, |
|
"loss": 0.1911, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.08683314415437003, |
|
"grad_norm": 3.1517863273620605, |
|
"learning_rate": 9.493970231495835e-05, |
|
"loss": 0.0834, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.08740068104426787, |
|
"grad_norm": 3.0677297115325928, |
|
"learning_rate": 9.486992143456792e-05, |
|
"loss": 0.0744, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.08796821793416572, |
|
"grad_norm": 2.8431897163391113, |
|
"learning_rate": 9.479968871456679e-05, |
|
"loss": 0.0717, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.08853575482406356, |
|
"grad_norm": 1.704907774925232, |
|
"learning_rate": 9.472900486219769e-05, |
|
"loss": 0.0569, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.0891032917139614, |
|
"grad_norm": 1.7260456085205078, |
|
"learning_rate": 9.46578705892462e-05, |
|
"loss": 0.0341, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.08967082860385925, |
|
"grad_norm": 2.4934730529785156, |
|
"learning_rate": 9.458628661203367e-05, |
|
"loss": 0.0329, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.09023836549375709, |
|
"grad_norm": 2.195416212081909, |
|
"learning_rate": 9.451425365140996e-05, |
|
"loss": 0.0476, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.09080590238365494, |
|
"grad_norm": 0.4546680748462677, |
|
"learning_rate": 9.444177243274618e-05, |
|
"loss": 0.0112, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.09137343927355278, |
|
"grad_norm": 1.1639807224273682, |
|
"learning_rate": 9.43688436859274e-05, |
|
"loss": 0.0313, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.09194097616345062, |
|
"grad_norm": 2.623199462890625, |
|
"learning_rate": 9.429546814534529e-05, |
|
"loss": 0.0568, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.09250851305334847, |
|
"grad_norm": 0.5077713131904602, |
|
"learning_rate": 9.422164654989072e-05, |
|
"loss": 0.0125, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.09307604994324631, |
|
"grad_norm": 0.37310367822647095, |
|
"learning_rate": 9.414737964294636e-05, |
|
"loss": 0.0067, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.09364358683314415, |
|
"grad_norm": 1.5596776008605957, |
|
"learning_rate": 9.407266817237911e-05, |
|
"loss": 0.0392, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.094211123723042, |
|
"grad_norm": 1.5760366916656494, |
|
"learning_rate": 9.399751289053267e-05, |
|
"loss": 0.0296, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.09477866061293984, |
|
"grad_norm": 0.7743924856185913, |
|
"learning_rate": 9.392191455421988e-05, |
|
"loss": 0.0112, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.09534619750283768, |
|
"grad_norm": 0.9516783356666565, |
|
"learning_rate": 9.384587392471515e-05, |
|
"loss": 0.0109, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.09591373439273553, |
|
"grad_norm": 0.824831485748291, |
|
"learning_rate": 9.376939176774679e-05, |
|
"loss": 0.0133, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.09648127128263337, |
|
"grad_norm": 0.7576391696929932, |
|
"learning_rate": 9.369246885348926e-05, |
|
"loss": 0.0229, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.09704880817253121, |
|
"grad_norm": 0.7438343167304993, |
|
"learning_rate": 9.361510595655545e-05, |
|
"loss": 0.0148, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.09761634506242906, |
|
"grad_norm": 1.1500699520111084, |
|
"learning_rate": 9.353730385598887e-05, |
|
"loss": 0.0842, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.0981838819523269, |
|
"grad_norm": 0.8265398740768433, |
|
"learning_rate": 9.345906333525581e-05, |
|
"loss": 0.0161, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.09875141884222474, |
|
"grad_norm": 0.7592639923095703, |
|
"learning_rate": 9.338038518223747e-05, |
|
"loss": 0.0276, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.09931895573212259, |
|
"grad_norm": 0.9068871736526489, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 0.0173, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.09988649262202043, |
|
"grad_norm": 3.255566120147705, |
|
"learning_rate": 9.322171915289635e-05, |
|
"loss": 0.0371, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.10045402951191827, |
|
"grad_norm": 0.607418954372406, |
|
"learning_rate": 9.314173287433873e-05, |
|
"loss": 0.0184, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.10102156640181612, |
|
"grad_norm": 1.0031543970108032, |
|
"learning_rate": 9.306131215901003e-05, |
|
"loss": 0.0184, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.10158910329171396, |
|
"grad_norm": 0.9904316663742065, |
|
"learning_rate": 9.298045781674596e-05, |
|
"loss": 0.0403, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.1021566401816118, |
|
"grad_norm": 0.6840120553970337, |
|
"learning_rate": 9.289917066174886e-05, |
|
"loss": 0.012, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.10272417707150965, |
|
"grad_norm": 1.294670820236206, |
|
"learning_rate": 9.281745151257946e-05, |
|
"loss": 0.0204, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.10329171396140749, |
|
"grad_norm": 0.5927445888519287, |
|
"learning_rate": 9.273530119214868e-05, |
|
"loss": 0.0166, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.10385925085130533, |
|
"grad_norm": 1.159449815750122, |
|
"learning_rate": 9.265272052770936e-05, |
|
"loss": 0.0258, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.10442678774120318, |
|
"grad_norm": 0.5215921401977539, |
|
"learning_rate": 9.256971035084785e-05, |
|
"loss": 0.0121, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.10499432463110102, |
|
"grad_norm": 0.537041962146759, |
|
"learning_rate": 9.248627149747573e-05, |
|
"loss": 0.0125, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.10556186152099886, |
|
"grad_norm": 0.3171516954898834, |
|
"learning_rate": 9.24024048078213e-05, |
|
"loss": 0.0058, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.1061293984108967, |
|
"grad_norm": 0.13635091483592987, |
|
"learning_rate": 9.231811112642121e-05, |
|
"loss": 0.0028, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.10669693530079455, |
|
"grad_norm": 0.38101646304130554, |
|
"learning_rate": 9.223339130211192e-05, |
|
"loss": 0.0174, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.10726447219069239, |
|
"grad_norm": 0.6654653549194336, |
|
"learning_rate": 9.214824618802109e-05, |
|
"loss": 0.0101, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.10783200908059024, |
|
"grad_norm": 0.3335312008857727, |
|
"learning_rate": 9.206267664155907e-05, |
|
"loss": 0.0065, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.10839954597048808, |
|
"grad_norm": 0.4047980308532715, |
|
"learning_rate": 9.197668352441025e-05, |
|
"loss": 0.0035, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.10896708286038592, |
|
"grad_norm": 1.2880914211273193, |
|
"learning_rate": 9.189026770252436e-05, |
|
"loss": 0.0283, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.10953461975028377, |
|
"grad_norm": 0.276368111371994, |
|
"learning_rate": 9.18034300461078e-05, |
|
"loss": 0.0044, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.11010215664018161, |
|
"grad_norm": 0.5553678274154663, |
|
"learning_rate": 9.171617142961477e-05, |
|
"loss": 0.0107, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.11066969353007945, |
|
"grad_norm": 2.4395854473114014, |
|
"learning_rate": 9.162849273173857e-05, |
|
"loss": 0.0229, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.1112372304199773, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.154039483540273e-05, |
|
"loss": 0.0, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.11180476730987514, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.145187862775209e-05, |
|
"loss": 0.0, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.11237230419977298, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.136294500014386e-05, |
|
"loss": 0.0, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.11293984108967083, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.12735948481387e-05, |
|
"loss": 0.0, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.11350737797956867, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.118382907149165e-05, |
|
"loss": 0.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.11350737797956867, |
|
"eval_loss": null, |
|
"eval_runtime": 106.2144, |
|
"eval_samples_per_second": 27.943, |
|
"eval_steps_per_second": 6.986, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.11407491486946651, |
|
"grad_norm": 2.6148464679718018, |
|
"learning_rate": 9.109364857414306e-05, |
|
"loss": 0.1986, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.11464245175936436, |
|
"grad_norm": 1.466818928718567, |
|
"learning_rate": 9.100305426420956e-05, |
|
"loss": 0.0807, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.1152099886492622, |
|
"grad_norm": 0.8603935241699219, |
|
"learning_rate": 9.091204705397484e-05, |
|
"loss": 0.0535, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.11577752553916004, |
|
"grad_norm": 1.8044847249984741, |
|
"learning_rate": 9.082062785988049e-05, |
|
"loss": 0.0691, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.11634506242905789, |
|
"grad_norm": 0.43121302127838135, |
|
"learning_rate": 9.072879760251679e-05, |
|
"loss": 0.0214, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.11691259931895573, |
|
"grad_norm": 0.8256431221961975, |
|
"learning_rate": 9.06365572066134e-05, |
|
"loss": 0.0738, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.11748013620885357, |
|
"grad_norm": 0.938698410987854, |
|
"learning_rate": 9.05439076010301e-05, |
|
"loss": 0.0192, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.11804767309875142, |
|
"grad_norm": 0.551313579082489, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 0.0205, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.11861520998864926, |
|
"grad_norm": 1.3714677095413208, |
|
"learning_rate": 9.035738449685707e-05, |
|
"loss": 0.0479, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.1191827468785471, |
|
"grad_norm": 0.35658133029937744, |
|
"learning_rate": 9.026351287655294e-05, |
|
"loss": 0.0098, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.11975028376844495, |
|
"grad_norm": 0.6199024319648743, |
|
"learning_rate": 9.016923580312113e-05, |
|
"loss": 0.036, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.12031782065834279, |
|
"grad_norm": 0.315612256526947, |
|
"learning_rate": 9.007455422593077e-05, |
|
"loss": 0.0065, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.12088535754824063, |
|
"grad_norm": 0.4234760105609894, |
|
"learning_rate": 8.997946909842425e-05, |
|
"loss": 0.0294, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.12145289443813848, |
|
"grad_norm": 0.6859511137008667, |
|
"learning_rate": 8.988398137810777e-05, |
|
"loss": 0.0263, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.12202043132803632, |
|
"grad_norm": 0.41570374369621277, |
|
"learning_rate": 8.978809202654162e-05, |
|
"loss": 0.0102, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.12258796821793416, |
|
"grad_norm": 0.5462591052055359, |
|
"learning_rate": 8.969180200933047e-05, |
|
"loss": 0.027, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.123155505107832, |
|
"grad_norm": 1.4283958673477173, |
|
"learning_rate": 8.959511229611376e-05, |
|
"loss": 0.0177, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.12372304199772985, |
|
"grad_norm": 0.2500029504299164, |
|
"learning_rate": 8.949802386055581e-05, |
|
"loss": 0.0027, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.12429057888762769, |
|
"grad_norm": 0.452014297246933, |
|
"learning_rate": 8.940053768033609e-05, |
|
"loss": 0.0182, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.12485811577752554, |
|
"grad_norm": 0.896853506565094, |
|
"learning_rate": 8.930265473713938e-05, |
|
"loss": 0.0119, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.1254256526674234, |
|
"grad_norm": 0.2784510552883148, |
|
"learning_rate": 8.92043760166458e-05, |
|
"loss": 0.0081, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.12599318955732122, |
|
"grad_norm": 0.6255968809127808, |
|
"learning_rate": 8.910570250852097e-05, |
|
"loss": 0.0148, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.12656072644721908, |
|
"grad_norm": 0.7261110544204712, |
|
"learning_rate": 8.900663520640604e-05, |
|
"loss": 0.022, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.1271282633371169, |
|
"grad_norm": 1.0769401788711548, |
|
"learning_rate": 8.890717510790763e-05, |
|
"loss": 0.0061, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.12769580022701477, |
|
"grad_norm": 0.48627111315727234, |
|
"learning_rate": 8.880732321458784e-05, |
|
"loss": 0.0277, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.1282633371169126, |
|
"grad_norm": 0.23434560000896454, |
|
"learning_rate": 8.870708053195413e-05, |
|
"loss": 0.0031, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.12883087400681045, |
|
"grad_norm": 0.5444976091384888, |
|
"learning_rate": 8.860644806944918e-05, |
|
"loss": 0.0096, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.12939841089670828, |
|
"grad_norm": 1.0603594779968262, |
|
"learning_rate": 8.850542684044078e-05, |
|
"loss": 0.0314, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.12996594778660614, |
|
"grad_norm": 0.5144844651222229, |
|
"learning_rate": 8.840401786221159e-05, |
|
"loss": 0.013, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.13053348467650397, |
|
"grad_norm": 0.1403108835220337, |
|
"learning_rate": 8.83022221559489e-05, |
|
"loss": 0.0024, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.13110102156640183, |
|
"grad_norm": 0.24111324548721313, |
|
"learning_rate": 8.820004074673433e-05, |
|
"loss": 0.0034, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.13166855845629966, |
|
"grad_norm": 0.789040744304657, |
|
"learning_rate": 8.809747466353356e-05, |
|
"loss": 0.0295, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.1322360953461975, |
|
"grad_norm": 0.7863220572471619, |
|
"learning_rate": 8.799452493918585e-05, |
|
"loss": 0.0139, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.13280363223609534, |
|
"grad_norm": 0.5174320340156555, |
|
"learning_rate": 8.789119261039385e-05, |
|
"loss": 0.0097, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.1333711691259932, |
|
"grad_norm": 0.23707064986228943, |
|
"learning_rate": 8.778747871771292e-05, |
|
"loss": 0.0029, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.13393870601589103, |
|
"grad_norm": 0.20634303987026215, |
|
"learning_rate": 8.768338430554082e-05, |
|
"loss": 0.0036, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.1345062429057889, |
|
"grad_norm": 0.34142839908599854, |
|
"learning_rate": 8.757891042210714e-05, |
|
"loss": 0.0041, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.13507377979568672, |
|
"grad_norm": 0.3984658420085907, |
|
"learning_rate": 8.74740581194627e-05, |
|
"loss": 0.0062, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.13564131668558457, |
|
"grad_norm": 0.39278241991996765, |
|
"learning_rate": 8.736882845346906e-05, |
|
"loss": 0.011, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.1362088535754824, |
|
"grad_norm": 0.49513405561447144, |
|
"learning_rate": 8.726322248378775e-05, |
|
"loss": 0.0138, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.13677639046538026, |
|
"grad_norm": 0.1306513100862503, |
|
"learning_rate": 8.715724127386972e-05, |
|
"loss": 0.0009, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.1373439273552781, |
|
"grad_norm": 0.04676857590675354, |
|
"learning_rate": 8.705088589094459e-05, |
|
"loss": 0.0007, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.13791146424517595, |
|
"grad_norm": 0.1314803659915924, |
|
"learning_rate": 8.694415740600988e-05, |
|
"loss": 0.0018, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.13847900113507378, |
|
"grad_norm": 0.1374429613351822, |
|
"learning_rate": 8.683705689382024e-05, |
|
"loss": 0.0011, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.13904653802497163, |
|
"grad_norm": 0.0954061895608902, |
|
"learning_rate": 8.672958543287666e-05, |
|
"loss": 0.0008, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.13961407491486946, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.662174410541555e-05, |
|
"loss": 0.0, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.14018161180476732, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.651353399739787e-05, |
|
"loss": 0.0, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.14074914869466515, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.640495619849821e-05, |
|
"loss": 0.0, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.141316685584563, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.629601180209381e-05, |
|
"loss": 0.0, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.14188422247446084, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.618670190525352e-05, |
|
"loss": 0.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1424517593643587, |
|
"grad_norm": 5.090747356414795, |
|
"learning_rate": 8.607702760872678e-05, |
|
"loss": 0.3383, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.14301929625425652, |
|
"grad_norm": 3.2074856758117676, |
|
"learning_rate": 8.596699001693255e-05, |
|
"loss": 0.1635, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.14358683314415438, |
|
"grad_norm": 1.3404688835144043, |
|
"learning_rate": 8.585659023794818e-05, |
|
"loss": 0.0633, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.1441543700340522, |
|
"grad_norm": 0.9414183497428894, |
|
"learning_rate": 8.574582938349817e-05, |
|
"loss": 0.0453, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.14472190692395007, |
|
"grad_norm": 1.2090741395950317, |
|
"learning_rate": 8.563470856894316e-05, |
|
"loss": 0.0521, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.1452894438138479, |
|
"grad_norm": 0.47880247235298157, |
|
"learning_rate": 8.552322891326846e-05, |
|
"loss": 0.0178, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.14585698070374575, |
|
"grad_norm": 0.9624150991439819, |
|
"learning_rate": 8.541139153907296e-05, |
|
"loss": 0.0165, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.14642451759364358, |
|
"grad_norm": 0.33472761511802673, |
|
"learning_rate": 8.529919757255783e-05, |
|
"loss": 0.0167, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.14699205448354144, |
|
"grad_norm": 0.44358474016189575, |
|
"learning_rate": 8.518664814351502e-05, |
|
"loss": 0.0156, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.14755959137343927, |
|
"grad_norm": 0.7658271789550781, |
|
"learning_rate": 8.507374438531607e-05, |
|
"loss": 0.0382, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.14812712826333713, |
|
"grad_norm": 0.30857032537460327, |
|
"learning_rate": 8.496048743490053e-05, |
|
"loss": 0.0075, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.14869466515323496, |
|
"grad_norm": 1.9148088693618774, |
|
"learning_rate": 8.484687843276469e-05, |
|
"loss": 0.0156, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.1492622020431328, |
|
"grad_norm": 0.8631612658500671, |
|
"learning_rate": 8.473291852294987e-05, |
|
"loss": 0.0441, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.14982973893303064, |
|
"grad_norm": 0.5475025773048401, |
|
"learning_rate": 8.461860885303114e-05, |
|
"loss": 0.0406, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.1503972758229285, |
|
"grad_norm": 0.12846483290195465, |
|
"learning_rate": 8.450395057410561e-05, |
|
"loss": 0.003, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.15096481271282633, |
|
"grad_norm": 0.275508850812912, |
|
"learning_rate": 8.438894484078086e-05, |
|
"loss": 0.0133, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.1515323496027242, |
|
"grad_norm": 0.5227663516998291, |
|
"learning_rate": 8.427359281116334e-05, |
|
"loss": 0.0177, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.15209988649262202, |
|
"grad_norm": 0.6141021847724915, |
|
"learning_rate": 8.415789564684673e-05, |
|
"loss": 0.0131, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.15266742338251987, |
|
"grad_norm": 0.04998692125082016, |
|
"learning_rate": 8.404185451290018e-05, |
|
"loss": 0.0013, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.1532349602724177, |
|
"grad_norm": 0.46891334652900696, |
|
"learning_rate": 8.392547057785661e-05, |
|
"loss": 0.0273, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.15380249716231556, |
|
"grad_norm": 0.5121367573738098, |
|
"learning_rate": 8.380874501370097e-05, |
|
"loss": 0.019, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.1543700340522134, |
|
"grad_norm": 0.12058154493570328, |
|
"learning_rate": 8.369167899585841e-05, |
|
"loss": 0.0022, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.15493757094211125, |
|
"grad_norm": 0.31559038162231445, |
|
"learning_rate": 8.357427370318239e-05, |
|
"loss": 0.0164, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.15550510783200908, |
|
"grad_norm": 0.7393925189971924, |
|
"learning_rate": 8.345653031794292e-05, |
|
"loss": 0.0179, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.15607264472190693, |
|
"grad_norm": 0.4647248089313507, |
|
"learning_rate": 8.333845002581458e-05, |
|
"loss": 0.008, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.15664018161180476, |
|
"grad_norm": 0.21942315995693207, |
|
"learning_rate": 8.322003401586462e-05, |
|
"loss": 0.0028, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.15720771850170262, |
|
"grad_norm": 0.4762805104255676, |
|
"learning_rate": 8.310128348054094e-05, |
|
"loss": 0.0254, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.15777525539160045, |
|
"grad_norm": 0.3500935435295105, |
|
"learning_rate": 8.298219961566009e-05, |
|
"loss": 0.0085, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.1583427922814983, |
|
"grad_norm": 0.3653341233730316, |
|
"learning_rate": 8.286278362039528e-05, |
|
"loss": 0.0074, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.15891032917139614, |
|
"grad_norm": 0.2545139193534851, |
|
"learning_rate": 8.274303669726426e-05, |
|
"loss": 0.0049, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.159477866061294, |
|
"grad_norm": 0.0680074691772461, |
|
"learning_rate": 8.262296005211721e-05, |
|
"loss": 0.0014, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.16004540295119182, |
|
"grad_norm": 0.2128506749868393, |
|
"learning_rate": 8.250255489412463e-05, |
|
"loss": 0.0049, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.16061293984108968, |
|
"grad_norm": 0.0370691753923893, |
|
"learning_rate": 8.238182243576512e-05, |
|
"loss": 0.0008, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.1611804767309875, |
|
"grad_norm": 0.05422423034906387, |
|
"learning_rate": 8.226076389281316e-05, |
|
"loss": 0.0009, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.16174801362088537, |
|
"grad_norm": 0.25359049439430237, |
|
"learning_rate": 8.213938048432697e-05, |
|
"loss": 0.002, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.1623155505107832, |
|
"grad_norm": 0.12178357690572739, |
|
"learning_rate": 8.201767343263612e-05, |
|
"loss": 0.0022, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.16288308740068105, |
|
"grad_norm": 0.06412477791309357, |
|
"learning_rate": 8.189564396332928e-05, |
|
"loss": 0.0012, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.16345062429057888, |
|
"grad_norm": 0.04413165897130966, |
|
"learning_rate": 8.177329330524182e-05, |
|
"loss": 0.0009, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.16401816118047674, |
|
"grad_norm": 0.1375110000371933, |
|
"learning_rate": 8.165062269044353e-05, |
|
"loss": 0.0019, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.16458569807037457, |
|
"grad_norm": 0.04058424010872841, |
|
"learning_rate": 8.152763335422613e-05, |
|
"loss": 0.0006, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.16515323496027243, |
|
"grad_norm": 0.46348121762275696, |
|
"learning_rate": 8.140432653509089e-05, |
|
"loss": 0.0038, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.16572077185017026, |
|
"grad_norm": 0.18360164761543274, |
|
"learning_rate": 8.128070347473609e-05, |
|
"loss": 0.0018, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.1662883087400681, |
|
"grad_norm": 0.6381713151931763, |
|
"learning_rate": 8.115676541804456e-05, |
|
"loss": 0.0069, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.16685584562996594, |
|
"grad_norm": 0.11540421843528748, |
|
"learning_rate": 8.103251361307119e-05, |
|
"loss": 0.0011, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.1674233825198638, |
|
"grad_norm": 0.5324001908302307, |
|
"learning_rate": 8.090794931103026e-05, |
|
"loss": 0.0035, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.16799091940976163, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.07830737662829e-05, |
|
"loss": 0.0, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.1685584562996595, |
|
"grad_norm": 0.12677313387393951, |
|
"learning_rate": 8.065788823632451e-05, |
|
"loss": 0.0018, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.16912599318955732, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.053239398177191e-05, |
|
"loss": 0.0, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.16969353007945517, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.04065922663509e-05, |
|
"loss": 0.0, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.170261066969353, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.028048435688333e-05, |
|
"loss": 0.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.17082860385925086, |
|
"grad_norm": 3.0016751289367676, |
|
"learning_rate": 8.015407152327448e-05, |
|
"loss": 0.2288, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.1713961407491487, |
|
"grad_norm": 1.9297741651535034, |
|
"learning_rate": 8.002735503850016e-05, |
|
"loss": 0.0538, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.17196367763904655, |
|
"grad_norm": 0.6649733185768127, |
|
"learning_rate": 7.990033617859396e-05, |
|
"loss": 0.0204, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.17253121452894438, |
|
"grad_norm": 1.4802879095077515, |
|
"learning_rate": 7.97730162226344e-05, |
|
"loss": 0.0355, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.17309875141884223, |
|
"grad_norm": 1.2510706186294556, |
|
"learning_rate": 7.964539645273204e-05, |
|
"loss": 0.0712, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.17366628830874006, |
|
"grad_norm": 0.45705509185791016, |
|
"learning_rate": 7.95174781540165e-05, |
|
"loss": 0.0259, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.17423382519863792, |
|
"grad_norm": 0.5345933437347412, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 0.02, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.17480136208853575, |
|
"grad_norm": 0.3882739841938019, |
|
"learning_rate": 7.926075112568259e-05, |
|
"loss": 0.014, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.1753688989784336, |
|
"grad_norm": 0.07783171534538269, |
|
"learning_rate": 7.913194498130252e-05, |
|
"loss": 0.0029, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.17593643586833144, |
|
"grad_norm": 0.39135614037513733, |
|
"learning_rate": 7.900284547855991e-05, |
|
"loss": 0.0127, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.1765039727582293, |
|
"grad_norm": 0.6026532053947449, |
|
"learning_rate": 7.887345391748533e-05, |
|
"loss": 0.003, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.17707150964812712, |
|
"grad_norm": 0.4667406380176544, |
|
"learning_rate": 7.874377160105036e-05, |
|
"loss": 0.008, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.17763904653802498, |
|
"grad_norm": 0.4185452461242676, |
|
"learning_rate": 7.861379983515449e-05, |
|
"loss": 0.0121, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.1782065834279228, |
|
"grad_norm": 0.28240469098091125, |
|
"learning_rate": 7.848353992861195e-05, |
|
"loss": 0.007, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.17877412031782067, |
|
"grad_norm": 0.2165137082338333, |
|
"learning_rate": 7.835299319313853e-05, |
|
"loss": 0.0057, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.1793416572077185, |
|
"grad_norm": 0.4501195549964905, |
|
"learning_rate": 7.822216094333847e-05, |
|
"loss": 0.016, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.17990919409761635, |
|
"grad_norm": 0.21665939688682556, |
|
"learning_rate": 7.809104449669101e-05, |
|
"loss": 0.0053, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.18047673098751418, |
|
"grad_norm": 0.45562973618507385, |
|
"learning_rate": 7.795964517353735e-05, |
|
"loss": 0.0399, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.18104426787741204, |
|
"grad_norm": 0.2064024657011032, |
|
"learning_rate": 7.78279642970672e-05, |
|
"loss": 0.0033, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.18161180476730987, |
|
"grad_norm": 0.06030461564660072, |
|
"learning_rate": 7.769600319330552e-05, |
|
"loss": 0.0015, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.18217934165720773, |
|
"grad_norm": 0.12329499423503876, |
|
"learning_rate": 7.756376319109917e-05, |
|
"loss": 0.002, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.18274687854710556, |
|
"grad_norm": 0.09843454509973526, |
|
"learning_rate": 7.74312456221035e-05, |
|
"loss": 0.0024, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.18331441543700341, |
|
"grad_norm": 0.3691561222076416, |
|
"learning_rate": 7.729845182076895e-05, |
|
"loss": 0.0087, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.18388195232690124, |
|
"grad_norm": 0.18092408776283264, |
|
"learning_rate": 7.716538312432766e-05, |
|
"loss": 0.0042, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.1844494892167991, |
|
"grad_norm": 0.2996932566165924, |
|
"learning_rate": 7.703204087277988e-05, |
|
"loss": 0.0221, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.18501702610669693, |
|
"grad_norm": 0.12783432006835938, |
|
"learning_rate": 7.689842640888063e-05, |
|
"loss": 0.0033, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.1855845629965948, |
|
"grad_norm": 0.26556456089019775, |
|
"learning_rate": 7.676454107812607e-05, |
|
"loss": 0.0056, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.18615209988649262, |
|
"grad_norm": 0.1686052829027176, |
|
"learning_rate": 7.663038622873999e-05, |
|
"loss": 0.0029, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.18671963677639047, |
|
"grad_norm": 0.04450399801135063, |
|
"learning_rate": 7.649596321166024e-05, |
|
"loss": 0.0007, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.1872871736662883, |
|
"grad_norm": 0.2206043303012848, |
|
"learning_rate": 7.636127338052512e-05, |
|
"loss": 0.0019, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.18785471055618616, |
|
"grad_norm": 0.050599176436662674, |
|
"learning_rate": 7.622631809165973e-05, |
|
"loss": 0.0006, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.188422247446084, |
|
"grad_norm": 1.3266581296920776, |
|
"learning_rate": 7.60910987040623e-05, |
|
"loss": 0.0502, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.18898978433598185, |
|
"grad_norm": 0.012006393633782864, |
|
"learning_rate": 7.595561657939061e-05, |
|
"loss": 0.0004, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.18955732122587968, |
|
"grad_norm": 0.24034585058689117, |
|
"learning_rate": 7.58198730819481e-05, |
|
"loss": 0.0033, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.19012485811577753, |
|
"grad_norm": 0.378845751285553, |
|
"learning_rate": 7.568386957867033e-05, |
|
"loss": 0.0189, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.19069239500567536, |
|
"grad_norm": 0.019944118335843086, |
|
"learning_rate": 7.554760743911103e-05, |
|
"loss": 0.0004, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.19125993189557322, |
|
"grad_norm": 1.7960922718048096, |
|
"learning_rate": 7.541108803542846e-05, |
|
"loss": 0.0138, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.19182746878547105, |
|
"grad_norm": 0.015294855460524559, |
|
"learning_rate": 7.52743127423715e-05, |
|
"loss": 0.0004, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.1923950056753689, |
|
"grad_norm": 0.08567917346954346, |
|
"learning_rate": 7.51372829372658e-05, |
|
"loss": 0.0013, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.19296254256526674, |
|
"grad_norm": 0.07724782824516296, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.0008, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.1935300794551646, |
|
"grad_norm": 0.009706157259643078, |
|
"learning_rate": 7.486246531301177e-05, |
|
"loss": 0.0003, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.19409761634506242, |
|
"grad_norm": 0.04667770117521286, |
|
"learning_rate": 7.472468026127385e-05, |
|
"loss": 0.0004, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.19466515323496028, |
|
"grad_norm": 0.01700473390519619, |
|
"learning_rate": 7.45866462322802e-05, |
|
"loss": 0.0004, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.1952326901248581, |
|
"grad_norm": 0.6124288439750671, |
|
"learning_rate": 7.444836461603195e-05, |
|
"loss": 0.0087, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.19580022701475597, |
|
"grad_norm": 0.4518308937549591, |
|
"learning_rate": 7.430983680502344e-05, |
|
"loss": 0.0028, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.1963677639046538, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.417106419422819e-05, |
|
"loss": 0.0, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.19693530079455165, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.403204818108487e-05, |
|
"loss": 0.0, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.19750283768444948, |
|
"grad_norm": 0.14064697921276093, |
|
"learning_rate": 7.389279016548316e-05, |
|
"loss": 0.0015, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.19807037457434734, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.375329154974975e-05, |
|
"loss": 0.0, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.19863791146424517, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.361355373863414e-05, |
|
"loss": 0.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.19920544835414303, |
|
"grad_norm": 2.25301194190979, |
|
"learning_rate": 7.347357813929454e-05, |
|
"loss": 0.0754, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.19977298524404086, |
|
"grad_norm": 1.7841074466705322, |
|
"learning_rate": 7.333336616128369e-05, |
|
"loss": 0.051, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.20034052213393871, |
|
"grad_norm": 0.9712184071540833, |
|
"learning_rate": 7.319291921653464e-05, |
|
"loss": 0.016, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.20090805902383654, |
|
"grad_norm": 0.5902833342552185, |
|
"learning_rate": 7.305223871934657e-05, |
|
"loss": 0.0106, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.2014755959137344, |
|
"grad_norm": 0.2831375002861023, |
|
"learning_rate": 7.291132608637052e-05, |
|
"loss": 0.0077, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.20204313280363223, |
|
"grad_norm": 0.5281710624694824, |
|
"learning_rate": 7.277018273659517e-05, |
|
"loss": 0.0236, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.2026106696935301, |
|
"grad_norm": 0.7202128767967224, |
|
"learning_rate": 7.262881009133242e-05, |
|
"loss": 0.0256, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.20317820658342792, |
|
"grad_norm": 0.2578269839286804, |
|
"learning_rate": 7.24872095742033e-05, |
|
"loss": 0.0061, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.20374574347332577, |
|
"grad_norm": 0.21329842507839203, |
|
"learning_rate": 7.23453826111234e-05, |
|
"loss": 0.0029, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.2043132803632236, |
|
"grad_norm": 0.3269996643066406, |
|
"learning_rate": 7.220333063028872e-05, |
|
"loss": 0.0042, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.20488081725312146, |
|
"grad_norm": 0.5432631373405457, |
|
"learning_rate": 7.206105506216106e-05, |
|
"loss": 0.0272, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.2054483541430193, |
|
"grad_norm": 1.9401220083236694, |
|
"learning_rate": 7.191855733945387e-05, |
|
"loss": 0.0163, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.20601589103291715, |
|
"grad_norm": 0.24572275578975677, |
|
"learning_rate": 7.177583889711762e-05, |
|
"loss": 0.006, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.20658342792281498, |
|
"grad_norm": 0.13575679063796997, |
|
"learning_rate": 7.163290117232542e-05, |
|
"loss": 0.0026, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.20715096481271283, |
|
"grad_norm": 0.4401944577693939, |
|
"learning_rate": 7.148974560445859e-05, |
|
"loss": 0.0187, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.20771850170261066, |
|
"grad_norm": 0.45200228691101074, |
|
"learning_rate": 7.13463736350921e-05, |
|
"loss": 0.0083, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.20828603859250852, |
|
"grad_norm": 0.5528292655944824, |
|
"learning_rate": 7.120278670798009e-05, |
|
"loss": 0.0092, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.20885357548240635, |
|
"grad_norm": 0.5644862651824951, |
|
"learning_rate": 7.105898626904134e-05, |
|
"loss": 0.0147, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.2094211123723042, |
|
"grad_norm": 0.7960838675498962, |
|
"learning_rate": 7.091497376634464e-05, |
|
"loss": 0.0252, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.20998864926220204, |
|
"grad_norm": 0.26013273000717163, |
|
"learning_rate": 7.077075065009433e-05, |
|
"loss": 0.003, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.2105561861520999, |
|
"grad_norm": 0.44845283031463623, |
|
"learning_rate": 7.062631837261557e-05, |
|
"loss": 0.005, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.21112372304199772, |
|
"grad_norm": 0.48455584049224854, |
|
"learning_rate": 7.048167838833977e-05, |
|
"loss": 0.0084, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.21169125993189558, |
|
"grad_norm": 0.48875439167022705, |
|
"learning_rate": 7.033683215379002e-05, |
|
"loss": 0.0034, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.2122587968217934, |
|
"grad_norm": 1.5464515686035156, |
|
"learning_rate": 7.019178112756624e-05, |
|
"loss": 0.0422, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.21282633371169127, |
|
"grad_norm": 0.08226021379232407, |
|
"learning_rate": 7.004652677033068e-05, |
|
"loss": 0.0014, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.2133938706015891, |
|
"grad_norm": 0.9428783655166626, |
|
"learning_rate": 6.990107054479312e-05, |
|
"loss": 0.0085, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.21396140749148695, |
|
"grad_norm": 0.5083039999008179, |
|
"learning_rate": 6.97554139156961e-05, |
|
"loss": 0.0054, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.21452894438138478, |
|
"grad_norm": 0.7152851819992065, |
|
"learning_rate": 6.960955834980028e-05, |
|
"loss": 0.0118, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.21509648127128264, |
|
"grad_norm": 0.7423697113990784, |
|
"learning_rate": 6.946350531586959e-05, |
|
"loss": 0.0196, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.21566401816118047, |
|
"grad_norm": 0.19148842990398407, |
|
"learning_rate": 6.931725628465643e-05, |
|
"loss": 0.0029, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.21623155505107833, |
|
"grad_norm": 0.16525211930274963, |
|
"learning_rate": 6.917081272888697e-05, |
|
"loss": 0.0018, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.21679909194097616, |
|
"grad_norm": 0.7403731942176819, |
|
"learning_rate": 6.902417612324615e-05, |
|
"loss": 0.0156, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.21736662883087401, |
|
"grad_norm": 0.7522996068000793, |
|
"learning_rate": 6.8877347944363e-05, |
|
"loss": 0.0106, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.21793416572077184, |
|
"grad_norm": 0.35488778352737427, |
|
"learning_rate": 6.873032967079561e-05, |
|
"loss": 0.0266, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.2185017026106697, |
|
"grad_norm": 0.5787685513496399, |
|
"learning_rate": 6.858312278301637e-05, |
|
"loss": 0.0044, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.21906923950056753, |
|
"grad_norm": 0.5935757756233215, |
|
"learning_rate": 6.843572876339705e-05, |
|
"loss": 0.0101, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.2196367763904654, |
|
"grad_norm": 0.0742402896285057, |
|
"learning_rate": 6.828814909619373e-05, |
|
"loss": 0.0006, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.22020431328036322, |
|
"grad_norm": 0.08480936288833618, |
|
"learning_rate": 6.814038526753205e-05, |
|
"loss": 0.0014, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.22077185017026107, |
|
"grad_norm": 0.20840072631835938, |
|
"learning_rate": 6.799243876539212e-05, |
|
"loss": 0.0017, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.2213393870601589, |
|
"grad_norm": 0.050797827541828156, |
|
"learning_rate": 6.784431107959359e-05, |
|
"loss": 0.0007, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.22190692395005676, |
|
"grad_norm": 0.0777006596326828, |
|
"learning_rate": 6.769600370178059e-05, |
|
"loss": 0.0013, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.2224744608399546, |
|
"grad_norm": 0.04812987521290779, |
|
"learning_rate": 6.754751812540679e-05, |
|
"loss": 0.0008, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.22304199772985245, |
|
"grad_norm": 0.07565217465162277, |
|
"learning_rate": 6.739885584572026e-05, |
|
"loss": 0.0006, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.22360953461975028, |
|
"grad_norm": 0.03945764899253845, |
|
"learning_rate": 6.725001835974853e-05, |
|
"loss": 0.0005, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.22417707150964813, |
|
"grad_norm": 1.1225963830947876, |
|
"learning_rate": 6.710100716628344e-05, |
|
"loss": 0.0077, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.22474460839954596, |
|
"grad_norm": 0.24701376259326935, |
|
"learning_rate": 6.695182376586603e-05, |
|
"loss": 0.0024, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.22531214528944382, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.680246966077151e-05, |
|
"loss": 0.0, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.22587968217934165, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.665294635499404e-05, |
|
"loss": 0.0, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.2264472190692395, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.650325535423167e-05, |
|
"loss": 0.0, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.22701475595913734, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.635339816587109e-05, |
|
"loss": 0.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.22701475595913734, |
|
"eval_loss": null, |
|
"eval_runtime": 106.1666, |
|
"eval_samples_per_second": 27.956, |
|
"eval_steps_per_second": 6.989, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2275822928490352, |
|
"grad_norm": 1.7875468730926514, |
|
"learning_rate": 6.620337629897254e-05, |
|
"loss": 0.1379, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.22814982973893302, |
|
"grad_norm": 1.6237751245498657, |
|
"learning_rate": 6.605319126425454e-05, |
|
"loss": 0.02, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.22871736662883088, |
|
"grad_norm": 0.16924582421779633, |
|
"learning_rate": 6.590284457407876e-05, |
|
"loss": 0.0052, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.2292849035187287, |
|
"grad_norm": 0.3651062250137329, |
|
"learning_rate": 6.575233774243465e-05, |
|
"loss": 0.0265, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.22985244040862657, |
|
"grad_norm": 0.13029779493808746, |
|
"learning_rate": 6.560167228492436e-05, |
|
"loss": 0.0032, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.2304199772985244, |
|
"grad_norm": 0.11845195293426514, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 0.0039, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.23098751418842225, |
|
"grad_norm": 0.344365656375885, |
|
"learning_rate": 6.529987156268526e-05, |
|
"loss": 0.0043, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.23155505107832008, |
|
"grad_norm": 0.8041085004806519, |
|
"learning_rate": 6.514873933708638e-05, |
|
"loss": 0.0077, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.23212258796821794, |
|
"grad_norm": 0.05168134719133377, |
|
"learning_rate": 6.499745456385054e-05, |
|
"loss": 0.0016, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.23269012485811577, |
|
"grad_norm": 0.27078044414520264, |
|
"learning_rate": 6.484601876641375e-05, |
|
"loss": 0.0126, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.23325766174801363, |
|
"grad_norm": 0.28879401087760925, |
|
"learning_rate": 6.46944334697328e-05, |
|
"loss": 0.0115, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.23382519863791146, |
|
"grad_norm": 0.9801868200302124, |
|
"learning_rate": 6.454270020026995e-05, |
|
"loss": 0.0165, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.23439273552780931, |
|
"grad_norm": 0.28464144468307495, |
|
"learning_rate": 6.439082048597755e-05, |
|
"loss": 0.0039, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.23496027241770714, |
|
"grad_norm": 0.07441152632236481, |
|
"learning_rate": 6.423879585628261e-05, |
|
"loss": 0.0013, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.235527809307605, |
|
"grad_norm": 0.4811006188392639, |
|
"learning_rate": 6.408662784207149e-05, |
|
"loss": 0.0168, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.23609534619750283, |
|
"grad_norm": 0.15365412831306458, |
|
"learning_rate": 6.39343179756744e-05, |
|
"loss": 0.0032, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.2366628830874007, |
|
"grad_norm": 0.1440768986940384, |
|
"learning_rate": 6.378186779084995e-05, |
|
"loss": 0.0026, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.23723041997729852, |
|
"grad_norm": 0.12057225406169891, |
|
"learning_rate": 6.36292788227699e-05, |
|
"loss": 0.0019, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.23779795686719638, |
|
"grad_norm": 0.41369637846946716, |
|
"learning_rate": 6.34765526080034e-05, |
|
"loss": 0.0114, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.2383654937570942, |
|
"grad_norm": 0.06439776718616486, |
|
"learning_rate": 6.332369068450174e-05, |
|
"loss": 0.0015, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.23893303064699206, |
|
"grad_norm": 0.05166054517030716, |
|
"learning_rate": 6.317069459158284e-05, |
|
"loss": 0.0012, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.2395005675368899, |
|
"grad_norm": 0.040280867367982864, |
|
"learning_rate": 6.30175658699156e-05, |
|
"loss": 0.0011, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.24006810442678775, |
|
"grad_norm": 0.2597777843475342, |
|
"learning_rate": 6.286430606150459e-05, |
|
"loss": 0.0034, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.24063564131668558, |
|
"grad_norm": 0.34109053015708923, |
|
"learning_rate": 6.271091670967436e-05, |
|
"loss": 0.0055, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.24120317820658344, |
|
"grad_norm": 0.6494819521903992, |
|
"learning_rate": 6.255739935905396e-05, |
|
"loss": 0.0092, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.24177071509648126, |
|
"grad_norm": 0.6631916165351868, |
|
"learning_rate": 6.240375555556145e-05, |
|
"loss": 0.016, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.24233825198637912, |
|
"grad_norm": 0.23462681472301483, |
|
"learning_rate": 6.22499868463882e-05, |
|
"loss": 0.0034, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.24290578887627695, |
|
"grad_norm": 0.03203234449028969, |
|
"learning_rate": 6.209609477998338e-05, |
|
"loss": 0.0007, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.2434733257661748, |
|
"grad_norm": 0.08686164021492004, |
|
"learning_rate": 6.194208090603844e-05, |
|
"loss": 0.0013, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.24404086265607264, |
|
"grad_norm": 0.2361176759004593, |
|
"learning_rate": 6.178794677547137e-05, |
|
"loss": 0.0025, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.2446083995459705, |
|
"grad_norm": 0.30464842915534973, |
|
"learning_rate": 6.163369394041111e-05, |
|
"loss": 0.0026, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.24517593643586832, |
|
"grad_norm": 1.3310155868530273, |
|
"learning_rate": 6.147932395418205e-05, |
|
"loss": 0.0677, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.24574347332576618, |
|
"grad_norm": 0.006984487175941467, |
|
"learning_rate": 6.132483837128823e-05, |
|
"loss": 0.0002, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.246311010215664, |
|
"grad_norm": 0.7241768836975098, |
|
"learning_rate": 6.117023874739772e-05, |
|
"loss": 0.0213, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.24687854710556187, |
|
"grad_norm": 0.06802449375391006, |
|
"learning_rate": 6.1015526639327035e-05, |
|
"loss": 0.0007, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.2474460839954597, |
|
"grad_norm": 0.22861234843730927, |
|
"learning_rate": 6.0860703605025395e-05, |
|
"loss": 0.0027, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.24801362088535756, |
|
"grad_norm": 0.09795883297920227, |
|
"learning_rate": 6.0705771203559024e-05, |
|
"loss": 0.0008, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.24858115777525538, |
|
"grad_norm": 0.9199258685112, |
|
"learning_rate": 6.05507309950955e-05, |
|
"loss": 0.0459, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.24914869466515324, |
|
"grad_norm": 1.980196475982666, |
|
"learning_rate": 6.0395584540887963e-05, |
|
"loss": 0.0015, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.24971623155505107, |
|
"grad_norm": 0.2596800625324249, |
|
"learning_rate": 6.024033340325954e-05, |
|
"loss": 0.0033, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.25028376844494893, |
|
"grad_norm": 0.9777965545654297, |
|
"learning_rate": 6.008497914558744e-05, |
|
"loss": 0.017, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.2508513053348468, |
|
"grad_norm": 0.36498022079467773, |
|
"learning_rate": 5.992952333228728e-05, |
|
"loss": 0.0042, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.2514188422247446, |
|
"grad_norm": 0.022741034626960754, |
|
"learning_rate": 5.9773967528797414e-05, |
|
"loss": 0.0004, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.25198637911464244, |
|
"grad_norm": 0.27772486209869385, |
|
"learning_rate": 5.9618313301563055e-05, |
|
"loss": 0.0032, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.2525539160045403, |
|
"grad_norm": 0.0808940976858139, |
|
"learning_rate": 5.946256221802051e-05, |
|
"loss": 0.0002, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.25312145289443816, |
|
"grad_norm": 0.27334359288215637, |
|
"learning_rate": 5.9306715846581506e-05, |
|
"loss": 0.0004, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.25368898978433596, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.915077575661723e-05, |
|
"loss": 0.0, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.2542565266742338, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.8994743518442694e-05, |
|
"loss": 0.0, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.2548240635641317, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.8838620703300784e-05, |
|
"loss": 0.0, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.25539160045402953, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.868240888334653e-05, |
|
"loss": 0.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.25595913734392733, |
|
"grad_norm": 1.7918459177017212, |
|
"learning_rate": 5.85261096316312e-05, |
|
"loss": 0.1334, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.2565266742338252, |
|
"grad_norm": 1.2250529527664185, |
|
"learning_rate": 5.836972452208654e-05, |
|
"loss": 0.0459, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.25709421112372305, |
|
"grad_norm": 0.15675216913223267, |
|
"learning_rate": 5.821325512950886e-05, |
|
"loss": 0.0036, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.2576617480136209, |
|
"grad_norm": 0.18163926899433136, |
|
"learning_rate": 5.805670302954321e-05, |
|
"loss": 0.0018, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.2582292849035187, |
|
"grad_norm": 0.2342452108860016, |
|
"learning_rate": 5.79000697986675e-05, |
|
"loss": 0.0119, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.25879682179341656, |
|
"grad_norm": 0.7253749370574951, |
|
"learning_rate": 5.7743357014176624e-05, |
|
"loss": 0.0193, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.2593643586833144, |
|
"grad_norm": 0.45949435234069824, |
|
"learning_rate": 5.7586566254166583e-05, |
|
"loss": 0.0124, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.2599318955732123, |
|
"grad_norm": 0.05333389714360237, |
|
"learning_rate": 5.7429699097518585e-05, |
|
"loss": 0.0021, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.2604994324631101, |
|
"grad_norm": 0.22825823724269867, |
|
"learning_rate": 5.7272757123883184e-05, |
|
"loss": 0.0032, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.26106696935300794, |
|
"grad_norm": 0.35596853494644165, |
|
"learning_rate": 5.7115741913664264e-05, |
|
"loss": 0.0059, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.2616345062429058, |
|
"grad_norm": 0.4787977635860443, |
|
"learning_rate": 5.695865504800327e-05, |
|
"loss": 0.0123, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.26220204313280365, |
|
"grad_norm": 0.15637889504432678, |
|
"learning_rate": 5.680149810876322e-05, |
|
"loss": 0.0022, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.26276958002270145, |
|
"grad_norm": 0.1934584677219391, |
|
"learning_rate": 5.664427267851271e-05, |
|
"loss": 0.0058, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.2633371169125993, |
|
"grad_norm": 0.08029989898204803, |
|
"learning_rate": 5.6486980340510086e-05, |
|
"loss": 0.0021, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.26390465380249717, |
|
"grad_norm": 0.21124523878097534, |
|
"learning_rate": 5.6329622678687463e-05, |
|
"loss": 0.0061, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.264472190692395, |
|
"grad_norm": 0.05409558117389679, |
|
"learning_rate": 5.617220127763474e-05, |
|
"loss": 0.0013, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.26503972758229283, |
|
"grad_norm": 0.5069450736045837, |
|
"learning_rate": 5.601471772258368e-05, |
|
"loss": 0.0084, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.2656072644721907, |
|
"grad_norm": 0.30205589532852173, |
|
"learning_rate": 5.585717359939192e-05, |
|
"loss": 0.0114, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.26617480136208854, |
|
"grad_norm": 0.13869255781173706, |
|
"learning_rate": 5.569957049452703e-05, |
|
"loss": 0.0011, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.2667423382519864, |
|
"grad_norm": 0.048545245081186295, |
|
"learning_rate": 5.5541909995050554e-05, |
|
"loss": 0.001, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.2673098751418842, |
|
"grad_norm": 0.01697002351284027, |
|
"learning_rate": 5.538419368860196e-05, |
|
"loss": 0.0006, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.26787741203178206, |
|
"grad_norm": 0.3081216812133789, |
|
"learning_rate": 5.522642316338268e-05, |
|
"loss": 0.0097, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.2684449489216799, |
|
"grad_norm": 0.1498224139213562, |
|
"learning_rate": 5.506860000814017e-05, |
|
"loss": 0.0024, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.2690124858115778, |
|
"grad_norm": 0.039433132857084274, |
|
"learning_rate": 5.4910725812151864e-05, |
|
"loss": 0.0007, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.2695800227014756, |
|
"grad_norm": 0.0603884682059288, |
|
"learning_rate": 5.475280216520913e-05, |
|
"loss": 0.0012, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.27014755959137343, |
|
"grad_norm": 0.8099268078804016, |
|
"learning_rate": 5.4594830657601384e-05, |
|
"loss": 0.0319, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.2707150964812713, |
|
"grad_norm": 0.23851421475410461, |
|
"learning_rate": 5.443681288009991e-05, |
|
"loss": 0.007, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.27128263337116915, |
|
"grad_norm": 0.09010318666696548, |
|
"learning_rate": 5.427875042394199e-05, |
|
"loss": 0.0015, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.27185017026106695, |
|
"grad_norm": 0.29702991247177124, |
|
"learning_rate": 5.412064488081482e-05, |
|
"loss": 0.0017, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.2724177071509648, |
|
"grad_norm": 0.46306195855140686, |
|
"learning_rate": 5.396249784283942e-05, |
|
"loss": 0.0063, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.27298524404086266, |
|
"grad_norm": 0.40036702156066895, |
|
"learning_rate": 5.3804310902554754e-05, |
|
"loss": 0.0078, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.2735527809307605, |
|
"grad_norm": 0.014721478335559368, |
|
"learning_rate": 5.364608565290155e-05, |
|
"loss": 0.0003, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.2741203178206583, |
|
"grad_norm": 0.43365278840065, |
|
"learning_rate": 5.348782368720626e-05, |
|
"loss": 0.0272, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.2746878547105562, |
|
"grad_norm": 0.8489099144935608, |
|
"learning_rate": 5.3329526599165204e-05, |
|
"loss": 0.0232, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.27525539160045404, |
|
"grad_norm": 0.22350762784481049, |
|
"learning_rate": 5.317119598282823e-05, |
|
"loss": 0.005, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.2758229284903519, |
|
"grad_norm": 0.028759444132447243, |
|
"learning_rate": 5.301283343258293e-05, |
|
"loss": 0.0007, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.2763904653802497, |
|
"grad_norm": 0.0732959434390068, |
|
"learning_rate": 5.2854440543138406e-05, |
|
"loss": 0.0015, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.27695800227014755, |
|
"grad_norm": 0.19646039605140686, |
|
"learning_rate": 5.2696018909509306e-05, |
|
"loss": 0.0036, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.2775255391600454, |
|
"grad_norm": 0.19537141919136047, |
|
"learning_rate": 5.253757012699972e-05, |
|
"loss": 0.0038, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.27809307604994327, |
|
"grad_norm": 0.18001757562160492, |
|
"learning_rate": 5.2379095791187124e-05, |
|
"loss": 0.0029, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.27866061293984107, |
|
"grad_norm": 0.05839482694864273, |
|
"learning_rate": 5.2220597497906307e-05, |
|
"loss": 0.0013, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.2792281498297389, |
|
"grad_norm": 0.0680989921092987, |
|
"learning_rate": 5.2062076843233366e-05, |
|
"loss": 0.0016, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.2797956867196368, |
|
"grad_norm": 0.03857843577861786, |
|
"learning_rate": 5.1903535423469505e-05, |
|
"loss": 0.0009, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.28036322360953464, |
|
"grad_norm": 0.2856152355670929, |
|
"learning_rate": 5.174497483512506e-05, |
|
"loss": 0.0027, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.28093076049943244, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.158639667490339e-05, |
|
"loss": 0.0, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.2814982973893303, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.142780253968481e-05, |
|
"loss": 0.0, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.28206583427922816, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.126919402651052e-05, |
|
"loss": 0.0, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.282633371169126, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.1110572732566475e-05, |
|
"loss": 0.0, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.2832009080590238, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.095194025516733e-05, |
|
"loss": 0.0, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.28376844494892167, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.0793298191740404e-05, |
|
"loss": 0.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.28433598183881953, |
|
"grad_norm": 3.819697856903076, |
|
"learning_rate": 5.063464813980948e-05, |
|
"loss": 0.271, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.2849035187287174, |
|
"grad_norm": 0.8054677844047546, |
|
"learning_rate": 5.047599169697884e-05, |
|
"loss": 0.0246, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.2854710556186152, |
|
"grad_norm": 0.3912200629711151, |
|
"learning_rate": 5.03173304609171e-05, |
|
"loss": 0.0167, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.28603859250851305, |
|
"grad_norm": 0.3080594539642334, |
|
"learning_rate": 5.015866602934112e-05, |
|
"loss": 0.0099, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.2866061293984109, |
|
"grad_norm": 0.6711567044258118, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0091, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.28717366628830876, |
|
"grad_norm": 0.41851040720939636, |
|
"learning_rate": 4.984133397065889e-05, |
|
"loss": 0.0136, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.28774120317820656, |
|
"grad_norm": 0.15193375945091248, |
|
"learning_rate": 4.968266953908292e-05, |
|
"loss": 0.0029, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.2883087400681044, |
|
"grad_norm": 0.27360770106315613, |
|
"learning_rate": 4.952400830302117e-05, |
|
"loss": 0.0049, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.2888762769580023, |
|
"grad_norm": 0.06797119230031967, |
|
"learning_rate": 4.9365351860190526e-05, |
|
"loss": 0.0012, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.28944381384790013, |
|
"grad_norm": 0.42943570017814636, |
|
"learning_rate": 4.92067018082596e-05, |
|
"loss": 0.007, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.29001135073779793, |
|
"grad_norm": 0.41933485865592957, |
|
"learning_rate": 4.9048059744832666e-05, |
|
"loss": 0.024, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.2905788876276958, |
|
"grad_norm": 0.07279060781002045, |
|
"learning_rate": 4.888942726743353e-05, |
|
"loss": 0.0014, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.29114642451759365, |
|
"grad_norm": 0.022339163348078728, |
|
"learning_rate": 4.8730805973489476e-05, |
|
"loss": 0.0007, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.2917139614074915, |
|
"grad_norm": 0.033492498099803925, |
|
"learning_rate": 4.85721974603152e-05, |
|
"loss": 0.0011, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.2922814982973893, |
|
"grad_norm": 0.15747897326946259, |
|
"learning_rate": 4.841360332509663e-05, |
|
"loss": 0.0023, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.29284903518728717, |
|
"grad_norm": 0.4097544252872467, |
|
"learning_rate": 4.825502516487497e-05, |
|
"loss": 0.0324, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.293416572077185, |
|
"grad_norm": 0.39055225253105164, |
|
"learning_rate": 4.8096464576530507e-05, |
|
"loss": 0.0043, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.2939841089670829, |
|
"grad_norm": 0.011669347062706947, |
|
"learning_rate": 4.7937923156766646e-05, |
|
"loss": 0.0005, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.2945516458569807, |
|
"grad_norm": 0.15535907447338104, |
|
"learning_rate": 4.77794025020937e-05, |
|
"loss": 0.0019, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.29511918274687854, |
|
"grad_norm": 0.042138513177633286, |
|
"learning_rate": 4.762090420881289e-05, |
|
"loss": 0.0009, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.2956867196367764, |
|
"grad_norm": 0.24504394829273224, |
|
"learning_rate": 4.7462429873000295e-05, |
|
"loss": 0.0047, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.29625425652667425, |
|
"grad_norm": 0.08995606750249863, |
|
"learning_rate": 4.730398109049071e-05, |
|
"loss": 0.0018, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.29682179341657206, |
|
"grad_norm": 0.020904161036014557, |
|
"learning_rate": 4.71455594568616e-05, |
|
"loss": 0.0004, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.2973893303064699, |
|
"grad_norm": 0.028654640540480614, |
|
"learning_rate": 4.698716656741708e-05, |
|
"loss": 0.0007, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.29795686719636777, |
|
"grad_norm": 0.05848681926727295, |
|
"learning_rate": 4.6828804017171776e-05, |
|
"loss": 0.0009, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.2985244040862656, |
|
"grad_norm": 0.3643423020839691, |
|
"learning_rate": 4.667047340083481e-05, |
|
"loss": 0.0033, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.29909194097616343, |
|
"grad_norm": 0.12494904547929764, |
|
"learning_rate": 4.6512176312793736e-05, |
|
"loss": 0.0018, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.2996594778660613, |
|
"grad_norm": 0.03870720416307449, |
|
"learning_rate": 4.635391434709847e-05, |
|
"loss": 0.0008, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.30022701475595914, |
|
"grad_norm": 0.8066434264183044, |
|
"learning_rate": 4.619568909744524e-05, |
|
"loss": 0.0234, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.300794551645857, |
|
"grad_norm": 0.04658526927232742, |
|
"learning_rate": 4.603750215716057e-05, |
|
"loss": 0.0006, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.3013620885357548, |
|
"grad_norm": 0.362132728099823, |
|
"learning_rate": 4.587935511918521e-05, |
|
"loss": 0.04, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.30192962542565266, |
|
"grad_norm": 0.06521368026733398, |
|
"learning_rate": 4.5721249576058027e-05, |
|
"loss": 0.0009, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.3024971623155505, |
|
"grad_norm": 0.029287142679095268, |
|
"learning_rate": 4.5563187119900104e-05, |
|
"loss": 0.0005, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.3030646992054484, |
|
"grad_norm": 0.1905515044927597, |
|
"learning_rate": 4.5405169342398634e-05, |
|
"loss": 0.0018, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.3036322360953462, |
|
"grad_norm": 0.3800831735134125, |
|
"learning_rate": 4.5247197834790876e-05, |
|
"loss": 0.0213, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.30419977298524403, |
|
"grad_norm": 0.02702210657298565, |
|
"learning_rate": 4.508927418784815e-05, |
|
"loss": 0.0005, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.3047673098751419, |
|
"grad_norm": 0.021253688260912895, |
|
"learning_rate": 4.493139999185983e-05, |
|
"loss": 0.0004, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.30533484676503975, |
|
"grad_norm": 0.019332874566316605, |
|
"learning_rate": 4.477357683661734e-05, |
|
"loss": 0.0004, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.30590238365493755, |
|
"grad_norm": 0.20063234865665436, |
|
"learning_rate": 4.461580631139805e-05, |
|
"loss": 0.0033, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.3064699205448354, |
|
"grad_norm": 0.031181707978248596, |
|
"learning_rate": 4.445809000494946e-05, |
|
"loss": 0.0005, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.30703745743473326, |
|
"grad_norm": 0.6691449284553528, |
|
"learning_rate": 4.4300429505472976e-05, |
|
"loss": 0.0271, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.3076049943246311, |
|
"grad_norm": 0.009593677707016468, |
|
"learning_rate": 4.4142826400608086e-05, |
|
"loss": 0.0002, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.3081725312145289, |
|
"grad_norm": 0.009516783058643341, |
|
"learning_rate": 4.398528227741633e-05, |
|
"loss": 0.0003, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.3087400681044268, |
|
"grad_norm": 0.05990798771381378, |
|
"learning_rate": 4.3827798722365264e-05, |
|
"loss": 0.001, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.30930760499432464, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.3670377321312535e-05, |
|
"loss": 0.0, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.3098751418842225, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.351301965948991e-05, |
|
"loss": 0.0, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.3104426787741203, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.33557273214873e-05, |
|
"loss": 0.0, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.31101021566401815, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.3198501891236804e-05, |
|
"loss": 0.0, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.311577752553916, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.3041344951996746e-05, |
|
"loss": 0.0, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.31214528944381387, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.288425808633575e-05, |
|
"loss": 0.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.31271282633371167, |
|
"grad_norm": 0.8620555400848389, |
|
"learning_rate": 4.272724287611684e-05, |
|
"loss": 0.0271, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.3132803632236095, |
|
"grad_norm": 0.25762560963630676, |
|
"learning_rate": 4.2570300902481426e-05, |
|
"loss": 0.0042, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.3138479001135074, |
|
"grad_norm": 0.15001147985458374, |
|
"learning_rate": 4.241343374583343e-05, |
|
"loss": 0.0047, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.31441543700340524, |
|
"grad_norm": 0.2072601318359375, |
|
"learning_rate": 4.2256642985823395e-05, |
|
"loss": 0.0076, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.31498297389330304, |
|
"grad_norm": 0.19115956127643585, |
|
"learning_rate": 4.20999302013325e-05, |
|
"loss": 0.004, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.3155505107832009, |
|
"grad_norm": 0.19318291544914246, |
|
"learning_rate": 4.19432969704568e-05, |
|
"loss": 0.0067, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.31611804767309876, |
|
"grad_norm": 0.0738341435790062, |
|
"learning_rate": 4.178674487049116e-05, |
|
"loss": 0.0015, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.3166855845629966, |
|
"grad_norm": 0.3997495174407959, |
|
"learning_rate": 4.163027547791347e-05, |
|
"loss": 0.013, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.3172531214528944, |
|
"grad_norm": 0.10490912199020386, |
|
"learning_rate": 4.147389036836881e-05, |
|
"loss": 0.002, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.3178206583427923, |
|
"grad_norm": 0.3020564615726471, |
|
"learning_rate": 4.131759111665349e-05, |
|
"loss": 0.0184, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.31838819523269013, |
|
"grad_norm": 0.1874658763408661, |
|
"learning_rate": 4.116137929669921e-05, |
|
"loss": 0.0056, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.318955732122588, |
|
"grad_norm": 0.1652912199497223, |
|
"learning_rate": 4.100525648155731e-05, |
|
"loss": 0.0029, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.3195232690124858, |
|
"grad_norm": 0.2240392118692398, |
|
"learning_rate": 4.084922424338277e-05, |
|
"loss": 0.0084, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.32009080590238365, |
|
"grad_norm": 0.3925991654396057, |
|
"learning_rate": 4.06932841534185e-05, |
|
"loss": 0.0035, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.3206583427922815, |
|
"grad_norm": 0.23100757598876953, |
|
"learning_rate": 4.0537437781979506e-05, |
|
"loss": 0.0054, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.32122587968217936, |
|
"grad_norm": 0.05905711650848389, |
|
"learning_rate": 4.038168669843697e-05, |
|
"loss": 0.0007, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.32179341657207716, |
|
"grad_norm": 0.26876482367515564, |
|
"learning_rate": 4.0226032471202604e-05, |
|
"loss": 0.0166, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.322360953461975, |
|
"grad_norm": 0.5311969518661499, |
|
"learning_rate": 4.007047666771274e-05, |
|
"loss": 0.0041, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.3229284903518729, |
|
"grad_norm": 0.0217901561409235, |
|
"learning_rate": 3.991502085441259e-05, |
|
"loss": 0.0006, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.32349602724177073, |
|
"grad_norm": 0.04868500307202339, |
|
"learning_rate": 3.9759666596740476e-05, |
|
"loss": 0.0009, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.32406356413166854, |
|
"grad_norm": 0.01934129185974598, |
|
"learning_rate": 3.960441545911204e-05, |
|
"loss": 0.0005, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.3246311010215664, |
|
"grad_norm": 0.3002278208732605, |
|
"learning_rate": 3.944926900490452e-05, |
|
"loss": 0.006, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.32519863791146425, |
|
"grad_norm": 0.08359342068433762, |
|
"learning_rate": 3.929422879644099e-05, |
|
"loss": 0.0008, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.3257661748013621, |
|
"grad_norm": 0.05703277885913849, |
|
"learning_rate": 3.913929639497462e-05, |
|
"loss": 0.0006, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.3263337116912599, |
|
"grad_norm": 0.1300325244665146, |
|
"learning_rate": 3.898447336067297e-05, |
|
"loss": 0.0007, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.32690124858115777, |
|
"grad_norm": 0.2558203935623169, |
|
"learning_rate": 3.882976125260229e-05, |
|
"loss": 0.0116, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.3274687854710556, |
|
"grad_norm": 0.12041133642196655, |
|
"learning_rate": 3.8675161628711776e-05, |
|
"loss": 0.0024, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.3280363223609535, |
|
"grad_norm": 0.01596921868622303, |
|
"learning_rate": 3.852067604581794e-05, |
|
"loss": 0.0005, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.3286038592508513, |
|
"grad_norm": 0.35162538290023804, |
|
"learning_rate": 3.836630605958888e-05, |
|
"loss": 0.017, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.32917139614074914, |
|
"grad_norm": 0.43673884868621826, |
|
"learning_rate": 3.821205322452863e-05, |
|
"loss": 0.0116, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.329738933030647, |
|
"grad_norm": 0.9133800268173218, |
|
"learning_rate": 3.8057919093961553e-05, |
|
"loss": 0.0122, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.33030646992054485, |
|
"grad_norm": 0.024000134319067, |
|
"learning_rate": 3.790390522001662e-05, |
|
"loss": 0.0007, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.33087400681044266, |
|
"grad_norm": 0.934701681137085, |
|
"learning_rate": 3.775001315361183e-05, |
|
"loss": 0.0058, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.3314415437003405, |
|
"grad_norm": 0.904114842414856, |
|
"learning_rate": 3.759624444443858e-05, |
|
"loss": 0.036, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.33200908059023837, |
|
"grad_norm": 0.07346749305725098, |
|
"learning_rate": 3.744260064094604e-05, |
|
"loss": 0.0009, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.3325766174801362, |
|
"grad_norm": 0.03927430883049965, |
|
"learning_rate": 3.728908329032567e-05, |
|
"loss": 0.0008, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.33314415437003403, |
|
"grad_norm": 0.42941439151763916, |
|
"learning_rate": 3.713569393849543e-05, |
|
"loss": 0.015, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.3337116912599319, |
|
"grad_norm": 0.03521761670708656, |
|
"learning_rate": 3.69824341300844e-05, |
|
"loss": 0.0008, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.33427922814982974, |
|
"grad_norm": 0.019192036241292953, |
|
"learning_rate": 3.6829305408417166e-05, |
|
"loss": 0.0005, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.3348467650397276, |
|
"grad_norm": 0.07493746280670166, |
|
"learning_rate": 3.6676309315498256e-05, |
|
"loss": 0.0011, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.3354143019296254, |
|
"grad_norm": 0.43895918130874634, |
|
"learning_rate": 3.6523447391996614e-05, |
|
"loss": 0.0059, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.33598183881952326, |
|
"grad_norm": 0.6594648361206055, |
|
"learning_rate": 3.6370721177230116e-05, |
|
"loss": 0.0177, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.3365493757094211, |
|
"grad_norm": 0.12148375064134598, |
|
"learning_rate": 3.6218132209150045e-05, |
|
"loss": 0.0016, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.337116912599319, |
|
"grad_norm": 0.033791981637477875, |
|
"learning_rate": 3.606568202432562e-05, |
|
"loss": 0.0006, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.3376844494892168, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.591337215792852e-05, |
|
"loss": 0.0, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.33825198637911463, |
|
"grad_norm": 0.12862201035022736, |
|
"learning_rate": 3.5761204143717385e-05, |
|
"loss": 0.0021, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.3388195232690125, |
|
"grad_norm": 0.05550703406333923, |
|
"learning_rate": 3.560917951402245e-05, |
|
"loss": 0.0005, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.33938706015891035, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.545729979973005e-05, |
|
"loss": 0.0, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.33995459704880815, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.530556653026721e-05, |
|
"loss": 0.0, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.340522133938706, |
|
"grad_norm": 0.0, |
|
"learning_rate": 3.515398123358627e-05, |
|
"loss": 0.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.340522133938706, |
|
"eval_loss": NaN, |
|
"eval_runtime": 107.34, |
|
"eval_samples_per_second": 27.65, |
|
"eval_steps_per_second": 6.913, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.34108967082860386, |
|
"grad_norm": 0.9571943283081055, |
|
"learning_rate": 3.5002545436149474e-05, |
|
"loss": 0.0473, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.3416572077185017, |
|
"grad_norm": 0.480672150850296, |
|
"learning_rate": 3.485126066291364e-05, |
|
"loss": 0.0109, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.3422247446083995, |
|
"grad_norm": 0.6047912836074829, |
|
"learning_rate": 3.470012843731476e-05, |
|
"loss": 0.0188, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.3427922814982974, |
|
"grad_norm": 0.02252427488565445, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 0.0008, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.34335981838819524, |
|
"grad_norm": 0.1912216693162918, |
|
"learning_rate": 3.439832771507565e-05, |
|
"loss": 0.0036, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.3439273552780931, |
|
"grad_norm": 0.18382948637008667, |
|
"learning_rate": 3.424766225756537e-05, |
|
"loss": 0.0077, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.3444948921679909, |
|
"grad_norm": 0.10974773019552231, |
|
"learning_rate": 3.4097155425921254e-05, |
|
"loss": 0.0018, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.34506242905788875, |
|
"grad_norm": 0.32770535349845886, |
|
"learning_rate": 3.394680873574546e-05, |
|
"loss": 0.0061, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.3456299659477866, |
|
"grad_norm": 0.389616996049881, |
|
"learning_rate": 3.3796623701027476e-05, |
|
"loss": 0.0097, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.34619750283768447, |
|
"grad_norm": 0.11117340624332428, |
|
"learning_rate": 3.364660183412892e-05, |
|
"loss": 0.0024, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.34676503972758227, |
|
"grad_norm": 0.11243616044521332, |
|
"learning_rate": 3.349674464576834e-05, |
|
"loss": 0.0022, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.3473325766174801, |
|
"grad_norm": 0.05497328191995621, |
|
"learning_rate": 3.334705364500596e-05, |
|
"loss": 0.0012, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.347900113507378, |
|
"grad_norm": 0.03620595484972, |
|
"learning_rate": 3.3197530339228487e-05, |
|
"loss": 0.0011, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.34846765039727584, |
|
"grad_norm": 0.029850907623767853, |
|
"learning_rate": 3.304817623413397e-05, |
|
"loss": 0.0007, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.34903518728717364, |
|
"grad_norm": 0.4595206677913666, |
|
"learning_rate": 3.289899283371657e-05, |
|
"loss": 0.0065, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.3496027241770715, |
|
"grad_norm": 0.3553248941898346, |
|
"learning_rate": 3.274998164025148e-05, |
|
"loss": 0.006, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.35017026106696936, |
|
"grad_norm": 0.06678071618080139, |
|
"learning_rate": 3.260114415427975e-05, |
|
"loss": 0.0012, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.3507377979568672, |
|
"grad_norm": 0.08883315324783325, |
|
"learning_rate": 3.2452481874593234e-05, |
|
"loss": 0.0022, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.351305334846765, |
|
"grad_norm": 0.04673172906041145, |
|
"learning_rate": 3.230399629821942e-05, |
|
"loss": 0.001, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.3518728717366629, |
|
"grad_norm": 0.03680291026830673, |
|
"learning_rate": 3.215568892040641e-05, |
|
"loss": 0.0009, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.35244040862656073, |
|
"grad_norm": 0.08393888175487518, |
|
"learning_rate": 3.200756123460788e-05, |
|
"loss": 0.0013, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.3530079455164586, |
|
"grad_norm": 0.3266814649105072, |
|
"learning_rate": 3.1859614732467954e-05, |
|
"loss": 0.0146, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.3535754824063564, |
|
"grad_norm": 0.38569343090057373, |
|
"learning_rate": 3.171185090380628e-05, |
|
"loss": 0.0099, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.35414301929625425, |
|
"grad_norm": 0.02289619669318199, |
|
"learning_rate": 3.156427123660297e-05, |
|
"loss": 0.0005, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.3547105561861521, |
|
"grad_norm": 0.3161522448062897, |
|
"learning_rate": 3.141687721698363e-05, |
|
"loss": 0.0036, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.35527809307604996, |
|
"grad_norm": 0.30018478631973267, |
|
"learning_rate": 3.12696703292044e-05, |
|
"loss": 0.0065, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.35584562996594776, |
|
"grad_norm": 0.41302579641342163, |
|
"learning_rate": 3.1122652055637015e-05, |
|
"loss": 0.0059, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.3564131668558456, |
|
"grad_norm": 0.5455114245414734, |
|
"learning_rate": 3.097582387675385e-05, |
|
"loss": 0.0085, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.3569807037457435, |
|
"grad_norm": 0.028173979371786118, |
|
"learning_rate": 3.082918727111304e-05, |
|
"loss": 0.0006, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.35754824063564133, |
|
"grad_norm": 0.035780180245637894, |
|
"learning_rate": 3.0682743715343564e-05, |
|
"loss": 0.0005, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.35811577752553914, |
|
"grad_norm": 0.031802088022232056, |
|
"learning_rate": 3.053649468413043e-05, |
|
"loss": 0.0008, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.358683314415437, |
|
"grad_norm": 0.5345308184623718, |
|
"learning_rate": 3.0390441650199724e-05, |
|
"loss": 0.0111, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.35925085130533485, |
|
"grad_norm": 0.2168167382478714, |
|
"learning_rate": 3.0244586084303905e-05, |
|
"loss": 0.0022, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.3598183881952327, |
|
"grad_norm": 0.36197105050086975, |
|
"learning_rate": 3.0098929455206904e-05, |
|
"loss": 0.003, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.3603859250851305, |
|
"grad_norm": 0.31182852387428284, |
|
"learning_rate": 2.9953473229669328e-05, |
|
"loss": 0.0045, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.36095346197502837, |
|
"grad_norm": 0.013852439820766449, |
|
"learning_rate": 2.9808218872433767e-05, |
|
"loss": 0.0003, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.3615209988649262, |
|
"grad_norm": 0.09768744558095932, |
|
"learning_rate": 2.9663167846209998e-05, |
|
"loss": 0.0016, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.3620885357548241, |
|
"grad_norm": 0.8393372297286987, |
|
"learning_rate": 2.9518321611660237e-05, |
|
"loss": 0.0051, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.3626560726447219, |
|
"grad_norm": 0.03129115700721741, |
|
"learning_rate": 2.9373681627384447e-05, |
|
"loss": 0.0006, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.36322360953461974, |
|
"grad_norm": 0.20783281326293945, |
|
"learning_rate": 2.9229249349905684e-05, |
|
"loss": 0.0013, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.3637911464245176, |
|
"grad_norm": 0.42036134004592896, |
|
"learning_rate": 2.9085026233655365e-05, |
|
"loss": 0.0089, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.36435868331441545, |
|
"grad_norm": 0.18196117877960205, |
|
"learning_rate": 2.894101373095867e-05, |
|
"loss": 0.002, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.36492622020431326, |
|
"grad_norm": 0.008144269697368145, |
|
"learning_rate": 2.8797213292019926e-05, |
|
"loss": 0.0002, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.3654937570942111, |
|
"grad_norm": 4.53141450881958, |
|
"learning_rate": 2.8653626364907917e-05, |
|
"loss": 0.0394, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.36606129398410897, |
|
"grad_norm": 0.011452808044850826, |
|
"learning_rate": 2.851025439554142e-05, |
|
"loss": 0.0002, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.36662883087400683, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.8367098827674578e-05, |
|
"loss": 0.0, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.36719636776390463, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.8224161102882397e-05, |
|
"loss": 0.0, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.3677639046538025, |
|
"grad_norm": 0.11076200008392334, |
|
"learning_rate": 2.8081442660546125e-05, |
|
"loss": 0.0003, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.36833144154370034, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.7938944937838923e-05, |
|
"loss": 0.0, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.3688989784335982, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.7796669369711294e-05, |
|
"loss": 0.0, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.369466515323496, |
|
"grad_norm": 0.6546966433525085, |
|
"learning_rate": 2.7654617388876615e-05, |
|
"loss": 0.0211, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.37003405221339386, |
|
"grad_norm": 0.1961146891117096, |
|
"learning_rate": 2.7512790425796718e-05, |
|
"loss": 0.004, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.3706015891032917, |
|
"grad_norm": 0.7561377882957458, |
|
"learning_rate": 2.7371189908667604e-05, |
|
"loss": 0.0221, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.3711691259931896, |
|
"grad_norm": 0.2948407530784607, |
|
"learning_rate": 2.7229817263404866e-05, |
|
"loss": 0.0067, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.3717366628830874, |
|
"grad_norm": 0.0565456859767437, |
|
"learning_rate": 2.708867391362948e-05, |
|
"loss": 0.0011, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.37230419977298523, |
|
"grad_norm": 0.13881909847259521, |
|
"learning_rate": 2.694776128065345e-05, |
|
"loss": 0.0015, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.3728717366628831, |
|
"grad_norm": 0.3793433904647827, |
|
"learning_rate": 2.6807080783465376e-05, |
|
"loss": 0.0015, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.37343927355278095, |
|
"grad_norm": 0.03849627077579498, |
|
"learning_rate": 2.6666633838716314e-05, |
|
"loss": 0.0009, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.37400681044267875, |
|
"grad_norm": 0.5067241787910461, |
|
"learning_rate": 2.6526421860705473e-05, |
|
"loss": 0.0078, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.3745743473325766, |
|
"grad_norm": 0.2140672206878662, |
|
"learning_rate": 2.638644626136587e-05, |
|
"loss": 0.0036, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.37514188422247446, |
|
"grad_norm": 0.051408469676971436, |
|
"learning_rate": 2.6246708450250256e-05, |
|
"loss": 0.0011, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.3757094211123723, |
|
"grad_norm": 0.3721614480018616, |
|
"learning_rate": 2.6107209834516854e-05, |
|
"loss": 0.0084, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.3762769580022701, |
|
"grad_norm": 0.21189674735069275, |
|
"learning_rate": 2.596795181891514e-05, |
|
"loss": 0.0024, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.376844494892168, |
|
"grad_norm": 0.46903058886528015, |
|
"learning_rate": 2.5828935805771802e-05, |
|
"loss": 0.0079, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.37741203178206584, |
|
"grad_norm": 0.03936934471130371, |
|
"learning_rate": 2.5690163194976575e-05, |
|
"loss": 0.0007, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.3779795686719637, |
|
"grad_norm": 0.034673310816287994, |
|
"learning_rate": 2.5551635383968065e-05, |
|
"loss": 0.0006, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.3785471055618615, |
|
"grad_norm": 0.01133895106613636, |
|
"learning_rate": 2.5413353767719805e-05, |
|
"loss": 0.0004, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.37911464245175935, |
|
"grad_norm": 0.48205289244651794, |
|
"learning_rate": 2.5275319738726165e-05, |
|
"loss": 0.0069, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.3796821793416572, |
|
"grad_norm": 0.24426761269569397, |
|
"learning_rate": 2.513753468698826e-05, |
|
"loss": 0.0019, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.38024971623155507, |
|
"grad_norm": 0.03907699137926102, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 0.0003, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.38081725312145287, |
|
"grad_norm": 0.2291565239429474, |
|
"learning_rate": 2.486271706273421e-05, |
|
"loss": 0.0106, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.3813847900113507, |
|
"grad_norm": 0.019995173439383507, |
|
"learning_rate": 2.4725687257628534e-05, |
|
"loss": 0.0004, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.3819523269012486, |
|
"grad_norm": 0.10260719805955887, |
|
"learning_rate": 2.4588911964571553e-05, |
|
"loss": 0.0011, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.38251986379114644, |
|
"grad_norm": 0.024873068556189537, |
|
"learning_rate": 2.4452392560888976e-05, |
|
"loss": 0.0004, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.38308740068104424, |
|
"grad_norm": 0.5002231001853943, |
|
"learning_rate": 2.4316130421329697e-05, |
|
"loss": 0.0108, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.3836549375709421, |
|
"grad_norm": 0.037127118557691574, |
|
"learning_rate": 2.418012691805191e-05, |
|
"loss": 0.0005, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.38422247446083996, |
|
"grad_norm": 0.2599027752876282, |
|
"learning_rate": 2.4044383420609406e-05, |
|
"loss": 0.0083, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.3847900113507378, |
|
"grad_norm": 0.018559547141194344, |
|
"learning_rate": 2.3908901295937713e-05, |
|
"loss": 0.0004, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.3853575482406356, |
|
"grad_norm": 0.13668963313102722, |
|
"learning_rate": 2.3773681908340284e-05, |
|
"loss": 0.0018, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.3859250851305335, |
|
"grad_norm": 0.21998494863510132, |
|
"learning_rate": 2.363872661947488e-05, |
|
"loss": 0.0011, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.38649262202043133, |
|
"grad_norm": 0.12420105934143066, |
|
"learning_rate": 2.350403678833976e-05, |
|
"loss": 0.0014, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.3870601589103292, |
|
"grad_norm": 0.2006537914276123, |
|
"learning_rate": 2.336961377126001e-05, |
|
"loss": 0.0045, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.387627695800227, |
|
"grad_norm": 0.09202957153320312, |
|
"learning_rate": 2.3235458921873925e-05, |
|
"loss": 0.0011, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.38819523269012485, |
|
"grad_norm": 0.1019575372338295, |
|
"learning_rate": 2.310157359111938e-05, |
|
"loss": 0.0009, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.3887627695800227, |
|
"grad_norm": 0.01464917603880167, |
|
"learning_rate": 2.296795912722014e-05, |
|
"loss": 0.0003, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.38933030646992056, |
|
"grad_norm": 0.024914277717471123, |
|
"learning_rate": 2.283461687567236e-05, |
|
"loss": 0.0004, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.38989784335981836, |
|
"grad_norm": 0.03489803895354271, |
|
"learning_rate": 2.2701548179231048e-05, |
|
"loss": 0.0005, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.3904653802497162, |
|
"grad_norm": 0.0056303925812244415, |
|
"learning_rate": 2.2568754377896516e-05, |
|
"loss": 0.0002, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.3910329171396141, |
|
"grad_norm": 1.0861475467681885, |
|
"learning_rate": 2.2436236808900844e-05, |
|
"loss": 0.0469, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.39160045402951194, |
|
"grad_norm": 0.7234563231468201, |
|
"learning_rate": 2.2303996806694488e-05, |
|
"loss": 0.0229, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.39216799091940974, |
|
"grad_norm": 0.01756235770881176, |
|
"learning_rate": 2.2172035702932825e-05, |
|
"loss": 0.0004, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.3927355278093076, |
|
"grad_norm": 0.007982458919286728, |
|
"learning_rate": 2.2040354826462668e-05, |
|
"loss": 0.0002, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.39330306469920545, |
|
"grad_norm": 0.004965408705174923, |
|
"learning_rate": 2.1908955503308993e-05, |
|
"loss": 0.0001, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.3938706015891033, |
|
"grad_norm": 1.1320465803146362, |
|
"learning_rate": 2.1777839056661554e-05, |
|
"loss": 0.0237, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.3944381384790011, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.164700680686147e-05, |
|
"loss": 0.0, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.39500567536889897, |
|
"grad_norm": 0.05166594311594963, |
|
"learning_rate": 2.1516460071388062e-05, |
|
"loss": 0.0004, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.3955732122587968, |
|
"grad_norm": 0.013484718278050423, |
|
"learning_rate": 2.1386200164845526e-05, |
|
"loss": 0.0002, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.3961407491486947, |
|
"grad_norm": 26.556129455566406, |
|
"learning_rate": 2.125622839894964e-05, |
|
"loss": 0.4464, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.3967082860385925, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.1126546082514664e-05, |
|
"loss": 0.0, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.39727582292849034, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.09971545214401e-05, |
|
"loss": 0.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.3978433598183882, |
|
"grad_norm": 1.003913402557373, |
|
"learning_rate": 2.086805501869749e-05, |
|
"loss": 0.0426, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.39841089670828606, |
|
"grad_norm": 0.2675124406814575, |
|
"learning_rate": 2.073924887431744e-05, |
|
"loss": 0.0221, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.39897843359818386, |
|
"grad_norm": 0.4695562720298767, |
|
"learning_rate": 2.061073738537635e-05, |
|
"loss": 0.0276, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.3995459704880817, |
|
"grad_norm": 0.8297376036643982, |
|
"learning_rate": 2.048252184598352e-05, |
|
"loss": 0.0142, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.40011350737797957, |
|
"grad_norm": 0.12075504660606384, |
|
"learning_rate": 2.0354603547267985e-05, |
|
"loss": 0.0023, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.40068104426787743, |
|
"grad_norm": 0.36436015367507935, |
|
"learning_rate": 2.0226983777365604e-05, |
|
"loss": 0.0067, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.40124858115777523, |
|
"grad_norm": 0.21536526083946228, |
|
"learning_rate": 2.0099663821406056e-05, |
|
"loss": 0.0066, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.4018161180476731, |
|
"grad_norm": 0.1440303772687912, |
|
"learning_rate": 1.9972644961499854e-05, |
|
"loss": 0.0027, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.40238365493757094, |
|
"grad_norm": 0.31195855140686035, |
|
"learning_rate": 1.9845928476725524e-05, |
|
"loss": 0.0039, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.4029511918274688, |
|
"grad_norm": 0.19405895471572876, |
|
"learning_rate": 1.9719515643116674e-05, |
|
"loss": 0.0015, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.4035187287173666, |
|
"grad_norm": 0.118756964802742, |
|
"learning_rate": 1.959340773364911e-05, |
|
"loss": 0.0014, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.40408626560726446, |
|
"grad_norm": 0.651408851146698, |
|
"learning_rate": 1.946760601822809e-05, |
|
"loss": 0.0064, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.4046538024971623, |
|
"grad_norm": 0.09903181344270706, |
|
"learning_rate": 1.9342111763675512e-05, |
|
"loss": 0.0012, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.4052213393870602, |
|
"grad_norm": 0.09159818291664124, |
|
"learning_rate": 1.9216926233717085e-05, |
|
"loss": 0.0012, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.405788876276958, |
|
"grad_norm": 0.4659949839115143, |
|
"learning_rate": 1.9092050688969738e-05, |
|
"loss": 0.0086, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.40635641316685583, |
|
"grad_norm": 0.0276198647916317, |
|
"learning_rate": 1.8967486386928817e-05, |
|
"loss": 0.0006, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.4069239500567537, |
|
"grad_norm": 0.11385304480791092, |
|
"learning_rate": 1.8843234581955442e-05, |
|
"loss": 0.0015, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.40749148694665155, |
|
"grad_norm": 0.30209067463874817, |
|
"learning_rate": 1.8719296525263922e-05, |
|
"loss": 0.0127, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.40805902383654935, |
|
"grad_norm": 0.24259290099143982, |
|
"learning_rate": 1.859567346490913e-05, |
|
"loss": 0.0012, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.4086265607264472, |
|
"grad_norm": 0.2819889783859253, |
|
"learning_rate": 1.847236664577389e-05, |
|
"loss": 0.0059, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.40919409761634506, |
|
"grad_norm": 0.5837430953979492, |
|
"learning_rate": 1.8349377309556486e-05, |
|
"loss": 0.0041, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.4097616345062429, |
|
"grad_norm": 0.05538428574800491, |
|
"learning_rate": 1.8226706694758195e-05, |
|
"loss": 0.001, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.4103291713961407, |
|
"grad_norm": 0.08718933165073395, |
|
"learning_rate": 1.810435603667075e-05, |
|
"loss": 0.0009, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.4108967082860386, |
|
"grad_norm": 0.09461364895105362, |
|
"learning_rate": 1.7982326567363888e-05, |
|
"loss": 0.0017, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.41146424517593644, |
|
"grad_norm": 0.43470796942710876, |
|
"learning_rate": 1.7860619515673033e-05, |
|
"loss": 0.0176, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.4120317820658343, |
|
"grad_norm": 0.04546342045068741, |
|
"learning_rate": 1.773923610718686e-05, |
|
"loss": 0.0009, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.4125993189557321, |
|
"grad_norm": 0.014890948310494423, |
|
"learning_rate": 1.7618177564234905e-05, |
|
"loss": 0.0003, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.41316685584562995, |
|
"grad_norm": 0.030182119458913803, |
|
"learning_rate": 1.7497445105875377e-05, |
|
"loss": 0.0005, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.4137343927355278, |
|
"grad_norm": 0.05278665944933891, |
|
"learning_rate": 1.73770399478828e-05, |
|
"loss": 0.0008, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.41430192962542567, |
|
"grad_norm": 0.4548901915550232, |
|
"learning_rate": 1.725696330273575e-05, |
|
"loss": 0.0118, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.41486946651532347, |
|
"grad_norm": 0.33028581738471985, |
|
"learning_rate": 1.7137216379604727e-05, |
|
"loss": 0.0071, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.41543700340522133, |
|
"grad_norm": 0.0861010029911995, |
|
"learning_rate": 1.7017800384339928e-05, |
|
"loss": 0.001, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.4160045402951192, |
|
"grad_norm": 0.438575804233551, |
|
"learning_rate": 1.6898716519459074e-05, |
|
"loss": 0.0071, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.41657207718501704, |
|
"grad_norm": 0.05357427895069122, |
|
"learning_rate": 1.6779965984135377e-05, |
|
"loss": 0.0006, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.41713961407491484, |
|
"grad_norm": 0.022530531510710716, |
|
"learning_rate": 1.6661549974185424e-05, |
|
"loss": 0.0003, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.4177071509648127, |
|
"grad_norm": 0.4733809232711792, |
|
"learning_rate": 1.6543469682057106e-05, |
|
"loss": 0.0127, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.41827468785471056, |
|
"grad_norm": 0.12539038062095642, |
|
"learning_rate": 1.6425726296817633e-05, |
|
"loss": 0.0014, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.4188422247446084, |
|
"grad_norm": 0.4548875689506531, |
|
"learning_rate": 1.6308321004141607e-05, |
|
"loss": 0.0076, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.4194097616345062, |
|
"grad_norm": 0.0097389817237854, |
|
"learning_rate": 1.619125498629904e-05, |
|
"loss": 0.0002, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.4199772985244041, |
|
"grad_norm": 0.019004186615347862, |
|
"learning_rate": 1.60745294221434e-05, |
|
"loss": 0.0004, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.42054483541430193, |
|
"grad_norm": 0.03138939291238785, |
|
"learning_rate": 1.595814548709983e-05, |
|
"loss": 0.0004, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.4211123723041998, |
|
"grad_norm": 0.5367324948310852, |
|
"learning_rate": 1.5842104353153287e-05, |
|
"loss": 0.0249, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.4216799091940976, |
|
"grad_norm": 1.0344882011413574, |
|
"learning_rate": 1.5726407188836673e-05, |
|
"loss": 0.0335, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.42224744608399545, |
|
"grad_norm": 0.0143516156822443, |
|
"learning_rate": 1.5611055159219152e-05, |
|
"loss": 0.0003, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.4228149829738933, |
|
"grad_norm": 31.227991104125977, |
|
"learning_rate": 1.549604942589441e-05, |
|
"loss": 1.1122, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.42338251986379116, |
|
"grad_norm": 0.10639174282550812, |
|
"learning_rate": 1.5381391146968866e-05, |
|
"loss": 0.0009, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.42395005675368896, |
|
"grad_norm": 0.05225389450788498, |
|
"learning_rate": 1.526708147705013e-05, |
|
"loss": 0.0003, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.4245175936435868, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5153121567235335e-05, |
|
"loss": 0.0, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.4250851305334847, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5039512565099467e-05, |
|
"loss": 0.0, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.42565266742338254, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4926255614683932e-05, |
|
"loss": 0.0, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.42622020431328034, |
|
"grad_norm": 0.5881960988044739, |
|
"learning_rate": 1.481335185648498e-05, |
|
"loss": 0.0209, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.4267877412031782, |
|
"grad_norm": 0.46018141508102417, |
|
"learning_rate": 1.4700802427442179e-05, |
|
"loss": 0.009, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.42735527809307605, |
|
"grad_norm": 0.40768754482269287, |
|
"learning_rate": 1.458860846092705e-05, |
|
"loss": 0.0032, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.4279228149829739, |
|
"grad_norm": 0.2659337520599365, |
|
"learning_rate": 1.4476771086731567e-05, |
|
"loss": 0.0061, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.4284903518728717, |
|
"grad_norm": 0.10703348368406296, |
|
"learning_rate": 1.4365291431056871e-05, |
|
"loss": 0.0017, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.42905788876276957, |
|
"grad_norm": 0.3611052930355072, |
|
"learning_rate": 1.4254170616501827e-05, |
|
"loss": 0.0034, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.4296254256526674, |
|
"grad_norm": 0.05121847242116928, |
|
"learning_rate": 1.414340976205183e-05, |
|
"loss": 0.001, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.4301929625425653, |
|
"grad_norm": 0.31806862354278564, |
|
"learning_rate": 1.4033009983067452e-05, |
|
"loss": 0.0059, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.4307604994324631, |
|
"grad_norm": 0.05238351970911026, |
|
"learning_rate": 1.3922972391273226e-05, |
|
"loss": 0.0011, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.43132803632236094, |
|
"grad_norm": 0.17556647956371307, |
|
"learning_rate": 1.3813298094746491e-05, |
|
"loss": 0.0029, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.4318955732122588, |
|
"grad_norm": 0.1977948248386383, |
|
"learning_rate": 1.3703988197906209e-05, |
|
"loss": 0.0043, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.43246311010215666, |
|
"grad_norm": 0.058601368218660355, |
|
"learning_rate": 1.3595043801501794e-05, |
|
"loss": 0.0013, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.43303064699205446, |
|
"grad_norm": 0.2709505558013916, |
|
"learning_rate": 1.3486466002602133e-05, |
|
"loss": 0.0022, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.4335981838819523, |
|
"grad_norm": 0.04408566281199455, |
|
"learning_rate": 1.3378255894584463e-05, |
|
"loss": 0.0008, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.43416572077185017, |
|
"grad_norm": 0.034973569214344025, |
|
"learning_rate": 1.327041456712334e-05, |
|
"loss": 0.0006, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.43473325766174803, |
|
"grad_norm": 0.32786574959754944, |
|
"learning_rate": 1.3162943106179749e-05, |
|
"loss": 0.018, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.43530079455164583, |
|
"grad_norm": 0.05485441908240318, |
|
"learning_rate": 1.3055842593990131e-05, |
|
"loss": 0.0005, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.4358683314415437, |
|
"grad_norm": 0.07898583263158798, |
|
"learning_rate": 1.2949114109055415e-05, |
|
"loss": 0.0013, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.43643586833144155, |
|
"grad_norm": 0.03237922489643097, |
|
"learning_rate": 1.2842758726130283e-05, |
|
"loss": 0.0007, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.4370034052213394, |
|
"grad_norm": 0.05610362067818642, |
|
"learning_rate": 1.2736777516212266e-05, |
|
"loss": 0.0008, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.4375709421112372, |
|
"grad_norm": 0.24216708540916443, |
|
"learning_rate": 1.2631171546530968e-05, |
|
"loss": 0.0037, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.43813847900113506, |
|
"grad_norm": 0.07961627095937729, |
|
"learning_rate": 1.2525941880537307e-05, |
|
"loss": 0.0013, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.4387060158910329, |
|
"grad_norm": 0.11050142347812653, |
|
"learning_rate": 1.2421089577892869e-05, |
|
"loss": 0.0012, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.4392735527809308, |
|
"grad_norm": 0.04272003099322319, |
|
"learning_rate": 1.2316615694459189e-05, |
|
"loss": 0.0006, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.4398410896708286, |
|
"grad_norm": 0.02150142751634121, |
|
"learning_rate": 1.2212521282287092e-05, |
|
"loss": 0.0004, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.44040862656072643, |
|
"grad_norm": 0.08354512602090836, |
|
"learning_rate": 1.2108807389606158e-05, |
|
"loss": 0.001, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.4409761634506243, |
|
"grad_norm": 0.1265098601579666, |
|
"learning_rate": 1.2005475060814159e-05, |
|
"loss": 0.0018, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.44154370034052215, |
|
"grad_norm": 0.18211375176906586, |
|
"learning_rate": 1.1902525336466464e-05, |
|
"loss": 0.002, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.44211123723041995, |
|
"grad_norm": 0.017222406342625618, |
|
"learning_rate": 1.1799959253265668e-05, |
|
"loss": 0.0004, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.4426787741203178, |
|
"grad_norm": 0.030056394636631012, |
|
"learning_rate": 1.1697777844051105e-05, |
|
"loss": 0.0007, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.44324631101021567, |
|
"grad_norm": 0.2576983571052551, |
|
"learning_rate": 1.1595982137788403e-05, |
|
"loss": 0.002, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.4438138479001135, |
|
"grad_norm": 0.20658354461193085, |
|
"learning_rate": 1.1494573159559213e-05, |
|
"loss": 0.0021, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.4443813847900113, |
|
"grad_norm": 0.324457049369812, |
|
"learning_rate": 1.1393551930550828e-05, |
|
"loss": 0.0023, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.4449489216799092, |
|
"grad_norm": 0.2382335364818573, |
|
"learning_rate": 1.1292919468045877e-05, |
|
"loss": 0.0017, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.44551645856980704, |
|
"grad_norm": 0.3122727572917938, |
|
"learning_rate": 1.1192676785412154e-05, |
|
"loss": 0.0041, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.4460839954597049, |
|
"grad_norm": 0.06047174334526062, |
|
"learning_rate": 1.1092824892092373e-05, |
|
"loss": 0.0011, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.4466515323496027, |
|
"grad_norm": 0.12172012776136398, |
|
"learning_rate": 1.099336479359398e-05, |
|
"loss": 0.002, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.44721906923950056, |
|
"grad_norm": 0.05569892004132271, |
|
"learning_rate": 1.0894297491479045e-05, |
|
"loss": 0.0008, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.4477866061293984, |
|
"grad_norm": 0.02484039030969143, |
|
"learning_rate": 1.0795623983354215e-05, |
|
"loss": 0.0004, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.44835414301929627, |
|
"grad_norm": 0.0289757139980793, |
|
"learning_rate": 1.0697345262860636e-05, |
|
"loss": 0.0005, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.44892167990919407, |
|
"grad_norm": 0.025877099484205246, |
|
"learning_rate": 1.0599462319663905e-05, |
|
"loss": 0.0004, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.44948921679909193, |
|
"grad_norm": 0.011032159440219402, |
|
"learning_rate": 1.0501976139444191e-05, |
|
"loss": 0.0002, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.4500567536889898, |
|
"grad_norm": 0.008858336135745049, |
|
"learning_rate": 1.0404887703886251e-05, |
|
"loss": 0.0001, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.45062429057888764, |
|
"grad_norm": 0.022364582866430283, |
|
"learning_rate": 1.0308197990669538e-05, |
|
"loss": 0.0003, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.45119182746878544, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.021190797345839e-05, |
|
"loss": 0.0, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.4517593643586833, |
|
"grad_norm": 0.1561431884765625, |
|
"learning_rate": 1.0116018621892237e-05, |
|
"loss": 0.0011, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.45232690124858116, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0020530901575754e-05, |
|
"loss": 0.0, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.452894438138479, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.92544577406923e-06, |
|
"loss": 0.0, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.4534619750283768, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.830764196878872e-06, |
|
"loss": 0.0, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.4540295119182747, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.73648712344707e-06, |
|
"loss": 0.0, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.4540295119182747, |
|
"eval_loss": NaN, |
|
"eval_runtime": 105.9039, |
|
"eval_samples_per_second": 28.025, |
|
"eval_steps_per_second": 7.006, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.45459704880817253, |
|
"grad_norm": 0.623717725276947, |
|
"learning_rate": 9.642615503142926e-06, |
|
"loss": 0.0176, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.4551645856980704, |
|
"grad_norm": 0.24405649304389954, |
|
"learning_rate": 9.549150281252633e-06, |
|
"loss": 0.0043, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.4557321225879682, |
|
"grad_norm": 0.04640405625104904, |
|
"learning_rate": 9.456092398969902e-06, |
|
"loss": 0.0013, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 0.45629965947786605, |
|
"grad_norm": 0.018343493342399597, |
|
"learning_rate": 9.363442793386606e-06, |
|
"loss": 0.0005, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.4568671963677639, |
|
"grad_norm": 0.1869243085384369, |
|
"learning_rate": 9.271202397483215e-06, |
|
"loss": 0.0109, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.45743473325766176, |
|
"grad_norm": 1.0756665468215942, |
|
"learning_rate": 9.179372140119525e-06, |
|
"loss": 0.0142, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.45800227014755956, |
|
"grad_norm": 0.23151446878910065, |
|
"learning_rate": 9.087952946025175e-06, |
|
"loss": 0.0147, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.4585698070374574, |
|
"grad_norm": 0.01738697662949562, |
|
"learning_rate": 8.996945735790447e-06, |
|
"loss": 0.0004, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.4591373439273553, |
|
"grad_norm": 0.45961514115333557, |
|
"learning_rate": 8.906351425856952e-06, |
|
"loss": 0.0045, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 0.45970488081725314, |
|
"grad_norm": 0.3801596760749817, |
|
"learning_rate": 8.816170928508365e-06, |
|
"loss": 0.009, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.46027241770715094, |
|
"grad_norm": 0.03849168121814728, |
|
"learning_rate": 8.7264051518613e-06, |
|
"loss": 0.0008, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 0.4608399545970488, |
|
"grad_norm": 0.2326851785182953, |
|
"learning_rate": 8.637054999856148e-06, |
|
"loss": 0.0093, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.46140749148694665, |
|
"grad_norm": 0.048841919749975204, |
|
"learning_rate": 8.548121372247918e-06, |
|
"loss": 0.0007, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.4619750283768445, |
|
"grad_norm": 0.02097911760210991, |
|
"learning_rate": 8.459605164597267e-06, |
|
"loss": 0.0005, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.4625425652667423, |
|
"grad_norm": 0.3414818346500397, |
|
"learning_rate": 8.371507268261437e-06, |
|
"loss": 0.0023, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.46311010215664017, |
|
"grad_norm": 0.08118417859077454, |
|
"learning_rate": 8.283828570385238e-06, |
|
"loss": 0.0008, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.463677639046538, |
|
"grad_norm": 0.41794729232788086, |
|
"learning_rate": 8.196569953892202e-06, |
|
"loss": 0.0112, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 0.4642451759364359, |
|
"grad_norm": 0.3035317063331604, |
|
"learning_rate": 8.109732297475635e-06, |
|
"loss": 0.0123, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.4648127128263337, |
|
"grad_norm": 0.03364351764321327, |
|
"learning_rate": 8.023316475589754e-06, |
|
"loss": 0.0007, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.46538024971623154, |
|
"grad_norm": 0.48411476612091064, |
|
"learning_rate": 7.937323358440935e-06, |
|
"loss": 0.0161, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.4659477866061294, |
|
"grad_norm": 0.020044121891260147, |
|
"learning_rate": 7.851753811978924e-06, |
|
"loss": 0.0005, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 0.46651532349602726, |
|
"grad_norm": 0.011223547160625458, |
|
"learning_rate": 7.766608697888095e-06, |
|
"loss": 0.0002, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.46708286038592506, |
|
"grad_norm": 0.16546539962291718, |
|
"learning_rate": 7.681888873578786e-06, |
|
"loss": 0.0025, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 0.4676503972758229, |
|
"grad_norm": 0.009762106463313103, |
|
"learning_rate": 7.597595192178702e-06, |
|
"loss": 0.0003, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.4682179341657208, |
|
"grad_norm": 0.023298079147934914, |
|
"learning_rate": 7.513728502524286e-06, |
|
"loss": 0.0004, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.46878547105561863, |
|
"grad_norm": 0.18507546186447144, |
|
"learning_rate": 7.430289649152156e-06, |
|
"loss": 0.0015, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.46935300794551643, |
|
"grad_norm": 0.37739697098731995, |
|
"learning_rate": 7.347279472290647e-06, |
|
"loss": 0.0096, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 0.4699205448354143, |
|
"grad_norm": 0.10507706552743912, |
|
"learning_rate": 7.264698807851328e-06, |
|
"loss": 0.0015, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.47048808172531215, |
|
"grad_norm": 0.049794506281614304, |
|
"learning_rate": 7.182548487420554e-06, |
|
"loss": 0.0009, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 0.47105561861521, |
|
"grad_norm": 0.061873581260442734, |
|
"learning_rate": 7.100829338251147e-06, |
|
"loss": 0.001, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.4716231555051078, |
|
"grad_norm": 0.02593647502362728, |
|
"learning_rate": 7.019542183254046e-06, |
|
"loss": 0.0004, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.47219069239500566, |
|
"grad_norm": 0.6185386776924133, |
|
"learning_rate": 6.9386878409899715e-06, |
|
"loss": 0.0073, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.4727582292849035, |
|
"grad_norm": 0.666622519493103, |
|
"learning_rate": 6.858267125661272e-06, |
|
"loss": 0.0093, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.4733257661748014, |
|
"grad_norm": 0.05342670530080795, |
|
"learning_rate": 6.778280847103669e-06, |
|
"loss": 0.0008, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.4738933030646992, |
|
"grad_norm": 1.0183546543121338, |
|
"learning_rate": 6.698729810778065e-06, |
|
"loss": 0.0078, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.47446083995459704, |
|
"grad_norm": 0.1871764212846756, |
|
"learning_rate": 6.619614817762537e-06, |
|
"loss": 0.0014, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.4750283768444949, |
|
"grad_norm": 0.19295842945575714, |
|
"learning_rate": 6.540936664744196e-06, |
|
"loss": 0.002, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.47559591373439275, |
|
"grad_norm": 0.7019402384757996, |
|
"learning_rate": 6.462696144011149e-06, |
|
"loss": 0.0207, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.47616345062429055, |
|
"grad_norm": 0.02959679253399372, |
|
"learning_rate": 6.384894043444567e-06, |
|
"loss": 0.0004, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 0.4767309875141884, |
|
"grad_norm": 0.1982329785823822, |
|
"learning_rate": 6.3075311465107535e-06, |
|
"loss": 0.0008, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.47729852440408627, |
|
"grad_norm": 0.03872201591730118, |
|
"learning_rate": 6.230608232253227e-06, |
|
"loss": 0.0004, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 0.4778660612939841, |
|
"grad_norm": 0.08221829682588577, |
|
"learning_rate": 6.154126075284855e-06, |
|
"loss": 0.0006, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.4784335981838819, |
|
"grad_norm": 0.007916197180747986, |
|
"learning_rate": 6.078085445780129e-06, |
|
"loss": 0.0002, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 0.4790011350737798, |
|
"grad_norm": 0.008437985554337502, |
|
"learning_rate": 6.002487109467347e-06, |
|
"loss": 0.0002, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.47956867196367764, |
|
"grad_norm": 0.005791019182652235, |
|
"learning_rate": 5.927331827620903e-06, |
|
"loss": 0.0001, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.4801362088535755, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.852620357053651e-06, |
|
"loss": 0.0, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.4807037457434733, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.778353450109286e-06, |
|
"loss": 0.0, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.48127128263337116, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.704531854654721e-06, |
|
"loss": 0.0, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.481838819523269, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.631156314072605e-06, |
|
"loss": 0.0, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 0.48240635641316687, |
|
"grad_norm": 0.0, |
|
"learning_rate": 5.558227567253832e-06, |
|
"loss": 0.0, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.48297389330306467, |
|
"grad_norm": 0.3478143811225891, |
|
"learning_rate": 5.485746348590048e-06, |
|
"loss": 0.0098, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 0.48354143019296253, |
|
"grad_norm": 0.38309186697006226, |
|
"learning_rate": 5.413713387966329e-06, |
|
"loss": 0.0101, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.4841089670828604, |
|
"grad_norm": 0.28978919982910156, |
|
"learning_rate": 5.34212941075381e-06, |
|
"loss": 0.0048, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 0.48467650397275824, |
|
"grad_norm": 0.016624854877591133, |
|
"learning_rate": 5.270995137802315e-06, |
|
"loss": 0.0004, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.48524404086265605, |
|
"grad_norm": 0.03391743823885918, |
|
"learning_rate": 5.200311285433213e-06, |
|
"loss": 0.0006, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.4858115777525539, |
|
"grad_norm": 0.1863984912633896, |
|
"learning_rate": 5.13007856543209e-06, |
|
"loss": 0.0104, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.48637911464245176, |
|
"grad_norm": 0.01976456306874752, |
|
"learning_rate": 5.060297685041659e-06, |
|
"loss": 0.0004, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 0.4869466515323496, |
|
"grad_norm": 0.21336333453655243, |
|
"learning_rate": 4.99096934695461e-06, |
|
"loss": 0.0115, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.4875141884222474, |
|
"grad_norm": 0.0573849081993103, |
|
"learning_rate": 4.922094249306558e-06, |
|
"loss": 0.0007, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 0.4880817253121453, |
|
"grad_norm": 0.0516238808631897, |
|
"learning_rate": 4.853673085668947e-06, |
|
"loss": 0.0008, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.48864926220204313, |
|
"grad_norm": 0.548546314239502, |
|
"learning_rate": 4.78570654504214e-06, |
|
"loss": 0.0216, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.489216799091941, |
|
"grad_norm": 0.03412328287959099, |
|
"learning_rate": 4.7181953118484556e-06, |
|
"loss": 0.0006, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.4897843359818388, |
|
"grad_norm": 0.03500758484005928, |
|
"learning_rate": 4.651140065925269e-06, |
|
"loss": 0.0008, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 0.49035187287173665, |
|
"grad_norm": 0.019669918343424797, |
|
"learning_rate": 4.58454148251814e-06, |
|
"loss": 0.0004, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.4909194097616345, |
|
"grad_norm": 0.3114485442638397, |
|
"learning_rate": 4.5184002322740785e-06, |
|
"loss": 0.0028, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.49148694665153236, |
|
"grad_norm": 0.5113534331321716, |
|
"learning_rate": 4.452716981234744e-06, |
|
"loss": 0.0185, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.49205448354143017, |
|
"grad_norm": 0.07297579944133759, |
|
"learning_rate": 4.387492390829734e-06, |
|
"loss": 0.0009, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 0.492622020431328, |
|
"grad_norm": 0.4376738667488098, |
|
"learning_rate": 4.322727117869951e-06, |
|
"loss": 0.0091, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.4931895573212259, |
|
"grad_norm": 0.011865437030792236, |
|
"learning_rate": 4.258421814540992e-06, |
|
"loss": 0.0002, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 0.49375709421112374, |
|
"grad_norm": 0.16000713407993317, |
|
"learning_rate": 4.19457712839652e-06, |
|
"loss": 0.0013, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.49432463110102154, |
|
"grad_norm": 0.15423361957073212, |
|
"learning_rate": 4.131193702351827e-06, |
|
"loss": 0.0024, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 0.4948921679909194, |
|
"grad_norm": 0.022781820967793465, |
|
"learning_rate": 4.068272174677335e-06, |
|
"loss": 0.0004, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.49545970488081725, |
|
"grad_norm": 0.025359636172652245, |
|
"learning_rate": 4.005813178992091e-06, |
|
"loss": 0.0005, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 0.4960272417707151, |
|
"grad_norm": 0.029715919867157936, |
|
"learning_rate": 3.9438173442575e-06, |
|
"loss": 0.0006, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.4965947786606129, |
|
"grad_norm": 0.019626963883638382, |
|
"learning_rate": 3.8822852947709375e-06, |
|
"loss": 0.0003, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.49716231555051077, |
|
"grad_norm": 0.05726097524166107, |
|
"learning_rate": 3.821217650159453e-06, |
|
"loss": 0.0008, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.4977298524404086, |
|
"grad_norm": 0.2366546243429184, |
|
"learning_rate": 3.760615025373543e-06, |
|
"loss": 0.0064, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 0.4982973893303065, |
|
"grad_norm": 0.10293308645486832, |
|
"learning_rate": 3.700478030680987e-06, |
|
"loss": 0.0014, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.4988649262202043, |
|
"grad_norm": 0.01754389889538288, |
|
"learning_rate": 3.6408072716606346e-06, |
|
"loss": 0.0003, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 0.49943246311010214, |
|
"grad_norm": 0.8322834968566895, |
|
"learning_rate": 3.581603349196372e-06, |
|
"loss": 0.0213, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.2963551878929138, |
|
"learning_rate": 3.522866859471047e-06, |
|
"loss": 0.0038, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 0.5005675368898979, |
|
"grad_norm": 0.47917166352272034, |
|
"learning_rate": 3.4645983939604496e-06, |
|
"loss": 0.008, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.5011350737797957, |
|
"grad_norm": 0.04047662764787674, |
|
"learning_rate": 3.406798539427386e-06, |
|
"loss": 0.0004, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 0.5017026106696936, |
|
"grad_norm": 0.03451113775372505, |
|
"learning_rate": 3.349467877915746e-06, |
|
"loss": 0.0002, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.5022701475595914, |
|
"grad_norm": 0.04126652702689171, |
|
"learning_rate": 3.2926069867446675e-06, |
|
"loss": 0.0005, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.5028376844494892, |
|
"grad_norm": 0.5223488211631775, |
|
"learning_rate": 3.2362164385026706e-06, |
|
"loss": 0.0093, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.503405221339387, |
|
"grad_norm": 0.06192615255713463, |
|
"learning_rate": 3.180296801041971e-06, |
|
"loss": 0.0007, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 0.5039727582292849, |
|
"grad_norm": 0.022352147847414017, |
|
"learning_rate": 3.1248486374726883e-06, |
|
"loss": 0.0003, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.5045402951191827, |
|
"grad_norm": 0.3611301779747009, |
|
"learning_rate": 3.069872506157212e-06, |
|
"loss": 0.0028, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.5051078320090806, |
|
"grad_norm": 0.4576888978481293, |
|
"learning_rate": 3.0153689607045845e-06, |
|
"loss": 0.0058, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.5056753688989785, |
|
"grad_norm": 0.22780518233776093, |
|
"learning_rate": 2.961338549964893e-06, |
|
"loss": 0.0019, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 0.5062429057888763, |
|
"grad_norm": 0.05007459968328476, |
|
"learning_rate": 2.9077818180237693e-06, |
|
"loss": 0.0005, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.5068104426787742, |
|
"grad_norm": 0.012189110741019249, |
|
"learning_rate": 2.8546993041969173e-06, |
|
"loss": 0.0002, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 0.5073779795686719, |
|
"grad_norm": 0.009090066887438297, |
|
"learning_rate": 2.802091543024671e-06, |
|
"loss": 0.0002, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.5079455164585698, |
|
"grad_norm": 0.6339737176895142, |
|
"learning_rate": 2.7499590642665774e-06, |
|
"loss": 0.0025, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.5085130533484676, |
|
"grad_norm": 0.06957720965147018, |
|
"learning_rate": 2.6983023928961404e-06, |
|
"loss": 0.0005, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.5090805902383655, |
|
"grad_norm": 7.535338878631592, |
|
"learning_rate": 2.647122049095463e-06, |
|
"loss": 0.0468, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.5096481271282634, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.596418548250029e-06, |
|
"loss": 0.0, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.5102156640181612, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.546192400943537e-06, |
|
"loss": 0.0, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 0.5107832009080591, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.496444112952734e-06, |
|
"loss": 0.0, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.5113507377979569, |
|
"grad_norm": 0.33791133761405945, |
|
"learning_rate": 2.4471741852423237e-06, |
|
"loss": 0.01, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 0.5119182746878547, |
|
"grad_norm": 0.10187462717294693, |
|
"learning_rate": 2.3983831139599287e-06, |
|
"loss": 0.002, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.5124858115777525, |
|
"grad_norm": 0.02221851982176304, |
|
"learning_rate": 2.3500713904311024e-06, |
|
"loss": 0.0007, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 0.5130533484676504, |
|
"grad_norm": 0.014832521788775921, |
|
"learning_rate": 2.3022395011543686e-06, |
|
"loss": 0.0005, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.5136208853575482, |
|
"grad_norm": 0.26342910528182983, |
|
"learning_rate": 2.2548879277963064e-06, |
|
"loss": 0.0045, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.5141884222474461, |
|
"grad_norm": 0.08771803975105286, |
|
"learning_rate": 2.208017147186736e-06, |
|
"loss": 0.0009, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.514755959137344, |
|
"grad_norm": 0.2147466242313385, |
|
"learning_rate": 2.161627631313923e-06, |
|
"loss": 0.001, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 0.5153234960272418, |
|
"grad_norm": 0.09128167480230331, |
|
"learning_rate": 2.1157198473197414e-06, |
|
"loss": 0.0013, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.5158910329171397, |
|
"grad_norm": 0.01900799199938774, |
|
"learning_rate": 2.070294257495081e-06, |
|
"loss": 0.0004, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 0.5164585698070374, |
|
"grad_norm": 0.040807388722896576, |
|
"learning_rate": 2.0253513192751373e-06, |
|
"loss": 0.0006, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.5170261066969353, |
|
"grad_norm": 0.19579406082630157, |
|
"learning_rate": 1.9808914852347813e-06, |
|
"loss": 0.0024, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 0.5175936435868331, |
|
"grad_norm": 0.22570084035396576, |
|
"learning_rate": 1.9369152030840556e-06, |
|
"loss": 0.011, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.518161180476731, |
|
"grad_norm": 0.16357550024986267, |
|
"learning_rate": 1.8934229156636452e-06, |
|
"loss": 0.0018, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 0.5187287173666288, |
|
"grad_norm": 0.024271734058856964, |
|
"learning_rate": 1.8504150609403858e-06, |
|
"loss": 0.0004, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.5192962542565267, |
|
"grad_norm": 0.4943500757217407, |
|
"learning_rate": 1.807892072002898e-06, |
|
"loss": 0.0099, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.5198637911464246, |
|
"grad_norm": 0.22430936992168427, |
|
"learning_rate": 1.7658543770572189e-06, |
|
"loss": 0.0062, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.5204313280363224, |
|
"grad_norm": 0.02454625442624092, |
|
"learning_rate": 1.724302399422456e-06, |
|
"loss": 0.0006, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 0.5209988649262202, |
|
"grad_norm": 0.11323986947536469, |
|
"learning_rate": 1.6832365575265741e-06, |
|
"loss": 0.0014, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.521566401816118, |
|
"grad_norm": 0.03107013925909996, |
|
"learning_rate": 1.6426572649021476e-06, |
|
"loss": 0.0006, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 0.5221339387060159, |
|
"grad_norm": 0.04345984384417534, |
|
"learning_rate": 1.6025649301821876e-06, |
|
"loss": 0.0004, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.5227014755959137, |
|
"grad_norm": 0.4001345932483673, |
|
"learning_rate": 1.5629599570960718e-06, |
|
"loss": 0.0115, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 0.5232690124858116, |
|
"grad_norm": 0.09491916000843048, |
|
"learning_rate": 1.523842744465437e-06, |
|
"loss": 0.0006, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.5238365493757094, |
|
"grad_norm": 0.17167732119560242, |
|
"learning_rate": 1.4852136862001764e-06, |
|
"loss": 0.0021, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 0.5244040862656073, |
|
"grad_norm": 0.09016118198633194, |
|
"learning_rate": 1.4470731712944884e-06, |
|
"loss": 0.0004, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.5249716231555052, |
|
"grad_norm": 0.10207764804363251, |
|
"learning_rate": 1.4094215838229176e-06, |
|
"loss": 0.001, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.5255391600454029, |
|
"grad_norm": 0.009659104980528355, |
|
"learning_rate": 1.372259302936546e-06, |
|
"loss": 0.0002, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.5261066969353008, |
|
"grad_norm": 0.022223835811018944, |
|
"learning_rate": 1.3355867028591208e-06, |
|
"loss": 0.0004, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 0.5266742338251986, |
|
"grad_norm": 0.3852534890174866, |
|
"learning_rate": 1.2994041528833266e-06, |
|
"loss": 0.0176, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.5272417707150965, |
|
"grad_norm": 0.032568030059337616, |
|
"learning_rate": 1.2637120173670358e-06, |
|
"loss": 0.0005, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 0.5278093076049943, |
|
"grad_norm": 0.12185148894786835, |
|
"learning_rate": 1.2285106557296477e-06, |
|
"loss": 0.0014, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.5283768444948922, |
|
"grad_norm": 0.610008716583252, |
|
"learning_rate": 1.1938004224484988e-06, |
|
"loss": 0.0051, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 0.52894438138479, |
|
"grad_norm": 0.06961622089147568, |
|
"learning_rate": 1.1595816670552428e-06, |
|
"loss": 0.0011, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.5295119182746879, |
|
"grad_norm": 0.055191271007061005, |
|
"learning_rate": 1.1258547341323699e-06, |
|
"loss": 0.0005, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 0.5300794551645857, |
|
"grad_norm": 0.43228384852409363, |
|
"learning_rate": 1.0926199633097157e-06, |
|
"loss": 0.0173, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.5306469920544835, |
|
"grad_norm": 0.4304471015930176, |
|
"learning_rate": 1.0598776892610685e-06, |
|
"loss": 0.0156, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.5312145289443814, |
|
"grad_norm": 0.02435118891298771, |
|
"learning_rate": 1.02762824170074e-06, |
|
"loss": 0.0003, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.5317820658342792, |
|
"grad_norm": 0.18826240301132202, |
|
"learning_rate": 9.958719453803278e-07, |
|
"loss": 0.002, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 0.5323496027241771, |
|
"grad_norm": 0.5460970401763916, |
|
"learning_rate": 9.646091200853802e-07, |
|
"loss": 0.0294, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.5329171396140749, |
|
"grad_norm": 0.013325286097824574, |
|
"learning_rate": 9.338400806321978e-07, |
|
"loss": 0.0002, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 0.5334846765039728, |
|
"grad_norm": 0.010715479031205177, |
|
"learning_rate": 9.035651368646648e-07, |
|
"loss": 0.0002, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.5340522133938707, |
|
"grad_norm": 0.05249933898448944, |
|
"learning_rate": 8.737845936511335e-07, |
|
"loss": 0.0007, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 0.5346197502837684, |
|
"grad_norm": 0.6176870465278625, |
|
"learning_rate": 8.444987508813451e-07, |
|
"loss": 0.0114, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.5351872871736663, |
|
"grad_norm": 0.05264892801642418, |
|
"learning_rate": 8.157079034633974e-07, |
|
"loss": 0.0003, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 0.5357548240635641, |
|
"grad_norm": 0.022312408313155174, |
|
"learning_rate": 7.874123413208145e-07, |
|
"loss": 0.0002, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.536322360953462, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.596123493895991e-07, |
|
"loss": 0.0, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.5368898978433598, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.323082076153509e-07, |
|
"loss": 0.0, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.5374574347332577, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.055001909504755e-07, |
|
"loss": 0.0, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 0.5380249716231555, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.791885693514133e-07, |
|
"loss": 0.0, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.5385925085130534, |
|
"grad_norm": 29.0640926361084, |
|
"learning_rate": 6.533736077758868e-07, |
|
"loss": 0.3484, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 0.5391600454029511, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.280555661802856e-07, |
|
"loss": 0.0, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.539727582292849, |
|
"grad_norm": 0.7819038033485413, |
|
"learning_rate": 6.032346995169968e-07, |
|
"loss": 0.0194, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 0.5402951191827469, |
|
"grad_norm": 0.15347006916999817, |
|
"learning_rate": 5.78911257731879e-07, |
|
"loss": 0.0101, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.5408626560726447, |
|
"grad_norm": 0.2514842748641968, |
|
"learning_rate": 5.550854857617193e-07, |
|
"loss": 0.0147, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 0.5414301929625426, |
|
"grad_norm": 0.018654726445674896, |
|
"learning_rate": 5.317576235317756e-07, |
|
"loss": 0.0004, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.5419977298524404, |
|
"grad_norm": 0.017752651125192642, |
|
"learning_rate": 5.089279059533658e-07, |
|
"loss": 0.0005, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.5425652667423383, |
|
"grad_norm": 0.010392699390649796, |
|
"learning_rate": 4.865965629214819e-07, |
|
"loss": 0.0003, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.5431328036322361, |
|
"grad_norm": 0.183840811252594, |
|
"learning_rate": 4.647638193125137e-07, |
|
"loss": 0.009, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 0.5437003405221339, |
|
"grad_norm": 0.22836080193519592, |
|
"learning_rate": 4.434298949819449e-07, |
|
"loss": 0.0039, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.5442678774120318, |
|
"grad_norm": 0.19406422972679138, |
|
"learning_rate": 4.2259500476214407e-07, |
|
"loss": 0.0091, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 0.5448354143019296, |
|
"grad_norm": 0.015072612091898918, |
|
"learning_rate": 4.02259358460233e-07, |
|
"loss": 0.0004, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.5454029511918275, |
|
"grad_norm": 0.09216684103012085, |
|
"learning_rate": 3.824231608559492e-07, |
|
"loss": 0.0006, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 0.5459704880817253, |
|
"grad_norm": 0.35321223735809326, |
|
"learning_rate": 3.630866116995757e-07, |
|
"loss": 0.006, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.5465380249716232, |
|
"grad_norm": 0.1484595090150833, |
|
"learning_rate": 3.4424990570994797e-07, |
|
"loss": 0.0039, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 0.547105561861521, |
|
"grad_norm": 0.25822874903678894, |
|
"learning_rate": 3.2591323257248893e-07, |
|
"loss": 0.0036, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.5476730987514189, |
|
"grad_norm": 0.11979275941848755, |
|
"learning_rate": 3.080767769372939e-07, |
|
"loss": 0.0012, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.5482406356413166, |
|
"grad_norm": 0.013929195702075958, |
|
"learning_rate": 2.907407184172706e-07, |
|
"loss": 0.0003, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.5488081725312145, |
|
"grad_norm": 0.017956310883164406, |
|
"learning_rate": 2.7390523158633554e-07, |
|
"loss": 0.0004, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 0.5493757094211124, |
|
"grad_norm": 0.010877071879804134, |
|
"learning_rate": 2.5757048597765396e-07, |
|
"loss": 0.0003, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.5499432463110102, |
|
"grad_norm": 0.016699308529496193, |
|
"learning_rate": 2.4173664608193593e-07, |
|
"loss": 0.0004, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 0.5505107832009081, |
|
"grad_norm": 0.014117494225502014, |
|
"learning_rate": 2.2640387134577058e-07, |
|
"loss": 0.0003, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.5510783200908059, |
|
"grad_norm": 0.3465084135532379, |
|
"learning_rate": 2.1157231617002783e-07, |
|
"loss": 0.0134, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 0.5516458569807038, |
|
"grad_norm": 0.098577119410038, |
|
"learning_rate": 1.9724212990830938e-07, |
|
"loss": 0.0015, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.5522133938706016, |
|
"grad_norm": 0.008477923460304737, |
|
"learning_rate": 1.8341345686543332e-07, |
|
"loss": 0.0002, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 0.5527809307604994, |
|
"grad_norm": 0.31904277205467224, |
|
"learning_rate": 1.7008643629596866e-07, |
|
"loss": 0.0125, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 0.5533484676503972, |
|
"grad_norm": 0.02196822501718998, |
|
"learning_rate": 1.5726120240288634e-07, |
|
"loss": 0.0005, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.5539160045402951, |
|
"grad_norm": 0.03046775981783867, |
|
"learning_rate": 1.449378843361271e-07, |
|
"loss": 0.0004, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.554483541430193, |
|
"grad_norm": 0.021513327956199646, |
|
"learning_rate": 1.3311660619138578e-07, |
|
"loss": 0.0004, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 0.5550510783200908, |
|
"grad_norm": 0.01769246906042099, |
|
"learning_rate": 1.2179748700879012e-07, |
|
"loss": 0.0004, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.5556186152099887, |
|
"grad_norm": 0.12877629697322845, |
|
"learning_rate": 1.109806407717462e-07, |
|
"loss": 0.0014, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 0.5561861520998865, |
|
"grad_norm": 0.3173483610153198, |
|
"learning_rate": 1.0066617640578368e-07, |
|
"loss": 0.0027, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.5567536889897844, |
|
"grad_norm": 0.045826442539691925, |
|
"learning_rate": 9.085419777743465e-08, |
|
"loss": 0.0007, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 0.5573212258796821, |
|
"grad_norm": 0.01739875227212906, |
|
"learning_rate": 8.15448036932176e-08, |
|
"loss": 0.0002, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 0.55788876276958, |
|
"grad_norm": 0.04034648835659027, |
|
"learning_rate": 7.273808789862724e-08, |
|
"loss": 0.0005, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 0.5584562996594779, |
|
"grad_norm": 0.02119840681552887, |
|
"learning_rate": 6.443413907720186e-08, |
|
"loss": 0.0003, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.5590238365493757, |
|
"grad_norm": 0.006225614342838526, |
|
"learning_rate": 5.663304084960186e-08, |
|
"loss": 0.0001, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.5595913734392736, |
|
"grad_norm": 0.07903040200471878, |
|
"learning_rate": 4.933487177280482e-08, |
|
"loss": 0.0005, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 0.5601589103291714, |
|
"grad_norm": 0.5586972832679749, |
|
"learning_rate": 4.253970533929508e-08, |
|
"loss": 0.011, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 0.5607264472190693, |
|
"grad_norm": 0.9101510047912598, |
|
"learning_rate": 3.624760997631982e-08, |
|
"loss": 0.0241, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.5612939841089671, |
|
"grad_norm": 0.04058132693171501, |
|
"learning_rate": 3.04586490452119e-08, |
|
"loss": 0.0004, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 0.5618615209988649, |
|
"grad_norm": 0.017602860927581787, |
|
"learning_rate": 2.5172880840745873e-08, |
|
"loss": 0.0003, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.5624290578887627, |
|
"grad_norm": 0.008799027651548386, |
|
"learning_rate": 2.0390358590538504e-08, |
|
"loss": 0.0002, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 0.5629965947786606, |
|
"grad_norm": 0.022075001150369644, |
|
"learning_rate": 1.6111130454543598e-08, |
|
"loss": 0.0003, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.5635641316685585, |
|
"grad_norm": 0.00847385823726654, |
|
"learning_rate": 1.2335239524541299e-08, |
|
"loss": 0.0002, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 0.5641316685584563, |
|
"grad_norm": 0.039732273668050766, |
|
"learning_rate": 9.06272382371065e-09, |
|
"loss": 0.0004, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.5646992054483542, |
|
"grad_norm": 0.022346949204802513, |
|
"learning_rate": 6.293616306246586e-09, |
|
"loss": 0.0003, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.565266742338252, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.0279448570323954e-09, |
|
"loss": 0.0, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.5658342792281499, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.265732291356626e-09, |
|
"loss": 0.0, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 0.5664018161180476, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0069963546743832e-09, |
|
"loss": 0.0, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 0.5669693530079455, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.5174972244634833e-10, |
|
"loss": 0.0, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 0.5675368898978433, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5675368898978433, |
|
"eval_loss": NaN, |
|
"eval_runtime": 105.9662, |
|
"eval_samples_per_second": 28.009, |
|
"eval_steps_per_second": 7.002, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 4 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.3001540599808e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|