|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5904059040590406, |
|
"eval_steps": 100, |
|
"global_step": 800, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0007380073800738007, |
|
"grad_norm": 931.379638671875, |
|
"learning_rate": 6.150061500615006e-08, |
|
"loss": 2.4703, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0007380073800738007, |
|
"eval_loss": 2.1151068210601807, |
|
"eval_runtime": 311.9468, |
|
"eval_samples_per_second": 3.683, |
|
"eval_steps_per_second": 0.308, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0014760147601476014, |
|
"grad_norm": 940.4727172851562, |
|
"learning_rate": 1.2300123001230013e-07, |
|
"loss": 2.6788, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.002214022140221402, |
|
"grad_norm": 744.969970703125, |
|
"learning_rate": 1.845018450184502e-07, |
|
"loss": 2.4456, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.002952029520295203, |
|
"grad_norm": 824.5645141601562, |
|
"learning_rate": 2.4600246002460025e-07, |
|
"loss": 2.6876, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0036900369003690036, |
|
"grad_norm": 790.6527099609375, |
|
"learning_rate": 3.075030750307503e-07, |
|
"loss": 2.4629, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.004428044280442804, |
|
"grad_norm": 582.4039306640625, |
|
"learning_rate": 3.690036900369004e-07, |
|
"loss": 2.3769, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0051660516605166054, |
|
"grad_norm": 542.76513671875, |
|
"learning_rate": 4.3050430504305045e-07, |
|
"loss": 2.1601, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.005904059040590406, |
|
"grad_norm": 570.0616455078125, |
|
"learning_rate": 4.920049200492005e-07, |
|
"loss": 2.1235, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.006642066420664207, |
|
"grad_norm": 630.7283935546875, |
|
"learning_rate": 5.535055350553506e-07, |
|
"loss": 2.1857, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.007380073800738007, |
|
"grad_norm": 397.8863220214844, |
|
"learning_rate": 6.150061500615006e-07, |
|
"loss": 1.9655, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.008118081180811807, |
|
"grad_norm": 387.4375915527344, |
|
"learning_rate": 6.765067650676507e-07, |
|
"loss": 1.8162, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.008856088560885609, |
|
"grad_norm": 261.5195617675781, |
|
"learning_rate": 7.380073800738008e-07, |
|
"loss": 1.8043, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.00959409594095941, |
|
"grad_norm": 216.66661071777344, |
|
"learning_rate": 7.995079950799507e-07, |
|
"loss": 1.7341, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.010332103321033211, |
|
"grad_norm": 200.43228149414062, |
|
"learning_rate": 8.610086100861009e-07, |
|
"loss": 1.6827, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.01107011070110701, |
|
"grad_norm": 213.2593536376953, |
|
"learning_rate": 9.22509225092251e-07, |
|
"loss": 1.6452, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.011808118081180811, |
|
"grad_norm": 146.7362518310547, |
|
"learning_rate": 9.84009840098401e-07, |
|
"loss": 1.6459, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.012546125461254613, |
|
"grad_norm": 143.30966186523438, |
|
"learning_rate": 1.045510455104551e-06, |
|
"loss": 1.6676, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.013284132841328414, |
|
"grad_norm": 177.24832153320312, |
|
"learning_rate": 1.1070110701107011e-06, |
|
"loss": 1.4307, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.014022140221402213, |
|
"grad_norm": 134.13116455078125, |
|
"learning_rate": 1.1685116851168512e-06, |
|
"loss": 1.4712, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.014760147601476014, |
|
"grad_norm": 107.87165069580078, |
|
"learning_rate": 1.2300123001230013e-06, |
|
"loss": 1.5757, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.015498154981549815, |
|
"grad_norm": 100.48570251464844, |
|
"learning_rate": 1.2915129151291513e-06, |
|
"loss": 1.5647, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.016236162361623615, |
|
"grad_norm": 96.30101776123047, |
|
"learning_rate": 1.3530135301353014e-06, |
|
"loss": 1.3431, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.016974169741697416, |
|
"grad_norm": 99.80168151855469, |
|
"learning_rate": 1.4145141451414515e-06, |
|
"loss": 1.4753, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.017712177121771217, |
|
"grad_norm": 86.59078216552734, |
|
"learning_rate": 1.4760147601476015e-06, |
|
"loss": 1.4402, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.01845018450184502, |
|
"grad_norm": 107.12730407714844, |
|
"learning_rate": 1.5375153751537516e-06, |
|
"loss": 1.4129, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.01918819188191882, |
|
"grad_norm": 86.11123657226562, |
|
"learning_rate": 1.5990159901599014e-06, |
|
"loss": 1.3141, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.01992619926199262, |
|
"grad_norm": 81.71781158447266, |
|
"learning_rate": 1.6605166051660517e-06, |
|
"loss": 1.3644, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.020664206642066422, |
|
"grad_norm": 81.71916961669922, |
|
"learning_rate": 1.7220172201722018e-06, |
|
"loss": 1.3631, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.021402214022140223, |
|
"grad_norm": 65.515625, |
|
"learning_rate": 1.783517835178352e-06, |
|
"loss": 1.3915, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.02214022140221402, |
|
"grad_norm": 82.60952758789062, |
|
"learning_rate": 1.845018450184502e-06, |
|
"loss": 1.2512, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.022878228782287822, |
|
"grad_norm": 78.03673553466797, |
|
"learning_rate": 1.9065190651906518e-06, |
|
"loss": 1.4272, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.023616236162361623, |
|
"grad_norm": 73.9189453125, |
|
"learning_rate": 1.968019680196802e-06, |
|
"loss": 1.3549, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.024354243542435424, |
|
"grad_norm": 75.15375518798828, |
|
"learning_rate": 2.029520295202952e-06, |
|
"loss": 1.1933, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.025092250922509225, |
|
"grad_norm": 68.5103530883789, |
|
"learning_rate": 2.091020910209102e-06, |
|
"loss": 1.2598, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.025830258302583026, |
|
"grad_norm": 63.10990905761719, |
|
"learning_rate": 2.1525215252152524e-06, |
|
"loss": 1.2143, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.026568265682656828, |
|
"grad_norm": 75.12173461914062, |
|
"learning_rate": 2.2140221402214023e-06, |
|
"loss": 1.2827, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.02730627306273063, |
|
"grad_norm": 69.23287963867188, |
|
"learning_rate": 2.2755227552275526e-06, |
|
"loss": 1.4106, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.028044280442804426, |
|
"grad_norm": 82.09547424316406, |
|
"learning_rate": 2.3370233702337024e-06, |
|
"loss": 1.1135, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.028782287822878228, |
|
"grad_norm": 89.76222229003906, |
|
"learning_rate": 2.3985239852398527e-06, |
|
"loss": 1.3469, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.02952029520295203, |
|
"grad_norm": 75.77232360839844, |
|
"learning_rate": 2.4600246002460025e-06, |
|
"loss": 1.1857, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03025830258302583, |
|
"grad_norm": 64.25336456298828, |
|
"learning_rate": 2.5215252152521524e-06, |
|
"loss": 1.2452, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.03099630996309963, |
|
"grad_norm": 64.85978698730469, |
|
"learning_rate": 2.5830258302583027e-06, |
|
"loss": 1.1511, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.03173431734317343, |
|
"grad_norm": 61.36198043823242, |
|
"learning_rate": 2.6445264452644525e-06, |
|
"loss": 1.1056, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.03247232472324723, |
|
"grad_norm": 63.63357925415039, |
|
"learning_rate": 2.706027060270603e-06, |
|
"loss": 1.3231, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.033210332103321034, |
|
"grad_norm": 60.254825592041016, |
|
"learning_rate": 2.767527675276753e-06, |
|
"loss": 1.1552, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.03394833948339483, |
|
"grad_norm": 69.51408386230469, |
|
"learning_rate": 2.829028290282903e-06, |
|
"loss": 1.2972, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.03468634686346864, |
|
"grad_norm": 60.74787902832031, |
|
"learning_rate": 2.890528905289053e-06, |
|
"loss": 1.2444, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.035424354243542434, |
|
"grad_norm": 62.291412353515625, |
|
"learning_rate": 2.952029520295203e-06, |
|
"loss": 1.2342, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.03616236162361624, |
|
"grad_norm": 67.48091125488281, |
|
"learning_rate": 3.0135301353013533e-06, |
|
"loss": 1.2894, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.03690036900369004, |
|
"grad_norm": 57.86232376098633, |
|
"learning_rate": 3.075030750307503e-06, |
|
"loss": 1.1071, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.037638376383763834, |
|
"grad_norm": 62.488731384277344, |
|
"learning_rate": 3.136531365313653e-06, |
|
"loss": 1.2249, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.03837638376383764, |
|
"grad_norm": 56.59815979003906, |
|
"learning_rate": 3.198031980319803e-06, |
|
"loss": 1.2616, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.03911439114391144, |
|
"grad_norm": 58.92403030395508, |
|
"learning_rate": 3.2595325953259536e-06, |
|
"loss": 1.2147, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.03985239852398524, |
|
"grad_norm": 63.04093933105469, |
|
"learning_rate": 3.3210332103321034e-06, |
|
"loss": 1.2363, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.04059040590405904, |
|
"grad_norm": 57.72414779663086, |
|
"learning_rate": 3.3825338253382537e-06, |
|
"loss": 1.1719, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.041328413284132844, |
|
"grad_norm": 61.95828628540039, |
|
"learning_rate": 3.4440344403444036e-06, |
|
"loss": 1.1956, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.04206642066420664, |
|
"grad_norm": 58.07041549682617, |
|
"learning_rate": 3.5055350553505534e-06, |
|
"loss": 1.1326, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.042804428044280446, |
|
"grad_norm": 61.18100357055664, |
|
"learning_rate": 3.567035670356704e-06, |
|
"loss": 1.2313, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.043542435424354244, |
|
"grad_norm": 58.01974868774414, |
|
"learning_rate": 3.628536285362854e-06, |
|
"loss": 1.1833, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.04428044280442804, |
|
"grad_norm": 58.43510437011719, |
|
"learning_rate": 3.690036900369004e-06, |
|
"loss": 1.232, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.045018450184501846, |
|
"grad_norm": 56.53025817871094, |
|
"learning_rate": 3.7515375153751537e-06, |
|
"loss": 1.202, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.045756457564575644, |
|
"grad_norm": 59.623043060302734, |
|
"learning_rate": 3.8130381303813035e-06, |
|
"loss": 1.2188, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.04649446494464945, |
|
"grad_norm": 54.90254211425781, |
|
"learning_rate": 3.874538745387454e-06, |
|
"loss": 1.1324, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.047232472324723246, |
|
"grad_norm": 56.264732360839844, |
|
"learning_rate": 3.936039360393604e-06, |
|
"loss": 1.1797, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.04797047970479705, |
|
"grad_norm": 56.26121520996094, |
|
"learning_rate": 3.997539975399754e-06, |
|
"loss": 1.1777, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.04870848708487085, |
|
"grad_norm": 53.94155502319336, |
|
"learning_rate": 4.059040590405904e-06, |
|
"loss": 1.1077, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.04944649446494465, |
|
"grad_norm": 56.105831146240234, |
|
"learning_rate": 4.120541205412054e-06, |
|
"loss": 1.1559, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.05018450184501845, |
|
"grad_norm": 65.91514587402344, |
|
"learning_rate": 4.182041820418204e-06, |
|
"loss": 1.0554, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.05092250922509225, |
|
"grad_norm": 67.19110107421875, |
|
"learning_rate": 4.243542435424354e-06, |
|
"loss": 1.2161, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.05166051660516605, |
|
"grad_norm": 55.92790603637695, |
|
"learning_rate": 4.305043050430505e-06, |
|
"loss": 1.2577, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.05239852398523985, |
|
"grad_norm": 61.967750549316406, |
|
"learning_rate": 4.366543665436655e-06, |
|
"loss": 1.1944, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.053136531365313655, |
|
"grad_norm": 54.48695373535156, |
|
"learning_rate": 4.428044280442805e-06, |
|
"loss": 0.9314, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.05387453874538745, |
|
"grad_norm": 59.03939437866211, |
|
"learning_rate": 4.489544895448955e-06, |
|
"loss": 1.2708, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.05461254612546126, |
|
"grad_norm": 57.15635299682617, |
|
"learning_rate": 4.551045510455105e-06, |
|
"loss": 1.1294, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.055350553505535055, |
|
"grad_norm": 57.40306091308594, |
|
"learning_rate": 4.612546125461255e-06, |
|
"loss": 1.0417, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.05608856088560885, |
|
"grad_norm": 114.00467681884766, |
|
"learning_rate": 4.674046740467405e-06, |
|
"loss": 1.0973, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.05682656826568266, |
|
"grad_norm": 55.897666931152344, |
|
"learning_rate": 4.735547355473555e-06, |
|
"loss": 1.024, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.057564575645756455, |
|
"grad_norm": 54.696266174316406, |
|
"learning_rate": 4.797047970479705e-06, |
|
"loss": 1.0549, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.05830258302583026, |
|
"grad_norm": 58.518489837646484, |
|
"learning_rate": 4.858548585485855e-06, |
|
"loss": 1.1007, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.05904059040590406, |
|
"grad_norm": 55.24943923950195, |
|
"learning_rate": 4.920049200492005e-06, |
|
"loss": 1.223, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.05977859778597786, |
|
"grad_norm": 55.647605895996094, |
|
"learning_rate": 4.981549815498155e-06, |
|
"loss": 1.0745, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.06051660516605166, |
|
"grad_norm": 52.201297760009766, |
|
"learning_rate": 5.043050430504305e-06, |
|
"loss": 1.1459, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.061254612546125464, |
|
"grad_norm": 49.60506820678711, |
|
"learning_rate": 5.1045510455104555e-06, |
|
"loss": 1.0853, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.06199261992619926, |
|
"grad_norm": 53.66012191772461, |
|
"learning_rate": 5.166051660516605e-06, |
|
"loss": 0.999, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.06273062730627306, |
|
"grad_norm": 58.5854606628418, |
|
"learning_rate": 5.227552275522755e-06, |
|
"loss": 1.3074, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.06346863468634686, |
|
"grad_norm": 58.91031265258789, |
|
"learning_rate": 5.289052890528905e-06, |
|
"loss": 1.0567, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.06420664206642067, |
|
"grad_norm": 57.29990005493164, |
|
"learning_rate": 5.350553505535055e-06, |
|
"loss": 1.1709, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.06494464944649446, |
|
"grad_norm": 48.71859359741211, |
|
"learning_rate": 5.412054120541206e-06, |
|
"loss": 1.2049, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.06568265682656826, |
|
"grad_norm": 50.770084381103516, |
|
"learning_rate": 5.4735547355473555e-06, |
|
"loss": 0.9872, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.06642066420664207, |
|
"grad_norm": 58.15389633178711, |
|
"learning_rate": 5.535055350553506e-06, |
|
"loss": 1.0798, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.06715867158671587, |
|
"grad_norm": 61.212825775146484, |
|
"learning_rate": 5.596555965559656e-06, |
|
"loss": 1.0654, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.06789667896678966, |
|
"grad_norm": 56.70602798461914, |
|
"learning_rate": 5.658056580565806e-06, |
|
"loss": 1.1565, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.06863468634686347, |
|
"grad_norm": 54.07913589477539, |
|
"learning_rate": 5.7195571955719566e-06, |
|
"loss": 1.1315, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.06937269372693727, |
|
"grad_norm": 55.931495666503906, |
|
"learning_rate": 5.781057810578106e-06, |
|
"loss": 1.0347, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.07011070110701106, |
|
"grad_norm": 58.78949737548828, |
|
"learning_rate": 5.842558425584256e-06, |
|
"loss": 1.1858, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.07084870848708487, |
|
"grad_norm": 53.04726791381836, |
|
"learning_rate": 5.904059040590406e-06, |
|
"loss": 1.1227, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.07158671586715867, |
|
"grad_norm": 51.182315826416016, |
|
"learning_rate": 5.965559655596556e-06, |
|
"loss": 1.1926, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.07232472324723248, |
|
"grad_norm": 55.08806610107422, |
|
"learning_rate": 6.027060270602707e-06, |
|
"loss": 1.1339, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.07306273062730627, |
|
"grad_norm": 53.554542541503906, |
|
"learning_rate": 6.0885608856088565e-06, |
|
"loss": 1.1762, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.07380073800738007, |
|
"grad_norm": 56.95305252075195, |
|
"learning_rate": 6.150061500615006e-06, |
|
"loss": 1.1414, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07380073800738007, |
|
"eval_loss": 1.4010363817214966, |
|
"eval_runtime": 325.9171, |
|
"eval_samples_per_second": 3.525, |
|
"eval_steps_per_second": 0.295, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07453874538745388, |
|
"grad_norm": 51.188621520996094, |
|
"learning_rate": 6.211562115621156e-06, |
|
"loss": 1.129, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.07527675276752767, |
|
"grad_norm": 55.20896530151367, |
|
"learning_rate": 6.273062730627306e-06, |
|
"loss": 1.1451, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.07601476014760147, |
|
"grad_norm": 49.773399353027344, |
|
"learning_rate": 6.334563345633457e-06, |
|
"loss": 1.2308, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.07675276752767528, |
|
"grad_norm": 52.89494323730469, |
|
"learning_rate": 6.396063960639606e-06, |
|
"loss": 1.2243, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.07749077490774908, |
|
"grad_norm": 53.44047546386719, |
|
"learning_rate": 6.4575645756457565e-06, |
|
"loss": 1.0611, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.07822878228782287, |
|
"grad_norm": 53.227176666259766, |
|
"learning_rate": 6.519065190651907e-06, |
|
"loss": 1.0153, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.07896678966789668, |
|
"grad_norm": 53.29740524291992, |
|
"learning_rate": 6.580565805658056e-06, |
|
"loss": 1.0539, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.07970479704797048, |
|
"grad_norm": 52.415748596191406, |
|
"learning_rate": 6.642066420664207e-06, |
|
"loss": 1.1556, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.08044280442804429, |
|
"grad_norm": 52.891544342041016, |
|
"learning_rate": 6.703567035670357e-06, |
|
"loss": 1.0613, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.08118081180811808, |
|
"grad_norm": 56.652835845947266, |
|
"learning_rate": 6.7650676506765074e-06, |
|
"loss": 1.053, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.08191881918819188, |
|
"grad_norm": 52.22764587402344, |
|
"learning_rate": 6.826568265682657e-06, |
|
"loss": 1.0757, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.08265682656826569, |
|
"grad_norm": 51.05937576293945, |
|
"learning_rate": 6.888068880688807e-06, |
|
"loss": 1.1943, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.08339483394833948, |
|
"grad_norm": 53.054378509521484, |
|
"learning_rate": 6.949569495694958e-06, |
|
"loss": 1.0704, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.08413284132841328, |
|
"grad_norm": 54.2965202331543, |
|
"learning_rate": 7.011070110701107e-06, |
|
"loss": 1.0442, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.08487084870848709, |
|
"grad_norm": 52.170867919921875, |
|
"learning_rate": 7.0725707257072575e-06, |
|
"loss": 1.2318, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.08560885608856089, |
|
"grad_norm": 51.29275894165039, |
|
"learning_rate": 7.134071340713408e-06, |
|
"loss": 1.0306, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.08634686346863468, |
|
"grad_norm": 54.07830047607422, |
|
"learning_rate": 7.195571955719557e-06, |
|
"loss": 1.1537, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.08708487084870849, |
|
"grad_norm": 47.52810287475586, |
|
"learning_rate": 7.257072570725708e-06, |
|
"loss": 1.1096, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.08782287822878229, |
|
"grad_norm": 52.45383071899414, |
|
"learning_rate": 7.318573185731857e-06, |
|
"loss": 1.0466, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.08856088560885608, |
|
"grad_norm": 51.74037551879883, |
|
"learning_rate": 7.380073800738008e-06, |
|
"loss": 1.0032, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.08929889298892989, |
|
"grad_norm": 52.04569625854492, |
|
"learning_rate": 7.441574415744158e-06, |
|
"loss": 1.1626, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.09003690036900369, |
|
"grad_norm": 51.20045852661133, |
|
"learning_rate": 7.503075030750307e-06, |
|
"loss": 1.1133, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.0907749077490775, |
|
"grad_norm": 50.70725631713867, |
|
"learning_rate": 7.564575645756458e-06, |
|
"loss": 1.1191, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.09151291512915129, |
|
"grad_norm": 50.703460693359375, |
|
"learning_rate": 7.626076260762607e-06, |
|
"loss": 1.0609, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.09225092250922509, |
|
"grad_norm": 53.20537185668945, |
|
"learning_rate": 7.687576875768759e-06, |
|
"loss": 1.1237, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.0929889298892989, |
|
"grad_norm": 51.74738693237305, |
|
"learning_rate": 7.749077490774908e-06, |
|
"loss": 1.1255, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.09372693726937269, |
|
"grad_norm": 47.27532958984375, |
|
"learning_rate": 7.810578105781058e-06, |
|
"loss": 1.0753, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.09446494464944649, |
|
"grad_norm": 46.608150482177734, |
|
"learning_rate": 7.872078720787208e-06, |
|
"loss": 1.0709, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.0952029520295203, |
|
"grad_norm": 52.357460021972656, |
|
"learning_rate": 7.933579335793358e-06, |
|
"loss": 1.1539, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.0959409594095941, |
|
"grad_norm": 48.45564270019531, |
|
"learning_rate": 7.995079950799508e-06, |
|
"loss": 1.1129, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.09667896678966789, |
|
"grad_norm": 52.05830383300781, |
|
"learning_rate": 8.05658056580566e-06, |
|
"loss": 1.1821, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.0974169741697417, |
|
"grad_norm": 53.559852600097656, |
|
"learning_rate": 8.118081180811808e-06, |
|
"loss": 1.1161, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.0981549815498155, |
|
"grad_norm": 54.30366134643555, |
|
"learning_rate": 8.179581795817959e-06, |
|
"loss": 1.0108, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.0988929889298893, |
|
"grad_norm": 49.463932037353516, |
|
"learning_rate": 8.241082410824107e-06, |
|
"loss": 1.0384, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.0996309963099631, |
|
"grad_norm": 73.52909088134766, |
|
"learning_rate": 8.302583025830259e-06, |
|
"loss": 1.1694, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.1003690036900369, |
|
"grad_norm": 45.32145309448242, |
|
"learning_rate": 8.364083640836409e-06, |
|
"loss": 1.0721, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.1011070110701107, |
|
"grad_norm": 51.58095932006836, |
|
"learning_rate": 8.425584255842559e-06, |
|
"loss": 0.9633, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.1018450184501845, |
|
"grad_norm": 52.928436279296875, |
|
"learning_rate": 8.487084870848708e-06, |
|
"loss": 1.0783, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.1025830258302583, |
|
"grad_norm": 48.393550872802734, |
|
"learning_rate": 8.548585485854858e-06, |
|
"loss": 0.9844, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.1033210332103321, |
|
"grad_norm": 46.03611373901367, |
|
"learning_rate": 8.61008610086101e-06, |
|
"loss": 1.1052, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.10405904059040591, |
|
"grad_norm": 49.10841751098633, |
|
"learning_rate": 8.67158671586716e-06, |
|
"loss": 1.1086, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.1047970479704797, |
|
"grad_norm": 47.779212951660156, |
|
"learning_rate": 8.73308733087331e-06, |
|
"loss": 1.1376, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.1055350553505535, |
|
"grad_norm": 51.112693786621094, |
|
"learning_rate": 8.79458794587946e-06, |
|
"loss": 1.1465, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.10627306273062731, |
|
"grad_norm": 43.86711502075195, |
|
"learning_rate": 8.85608856088561e-06, |
|
"loss": 0.9845, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.1070110701107011, |
|
"grad_norm": 45.53451156616211, |
|
"learning_rate": 8.917589175891759e-06, |
|
"loss": 1.1196, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.1077490774907749, |
|
"grad_norm": 51.35363006591797, |
|
"learning_rate": 8.97908979089791e-06, |
|
"loss": 1.0202, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.10848708487084871, |
|
"grad_norm": 45.318607330322266, |
|
"learning_rate": 9.040590405904059e-06, |
|
"loss": 1.0156, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.10922509225092251, |
|
"grad_norm": 45.83018493652344, |
|
"learning_rate": 9.10209102091021e-06, |
|
"loss": 0.9637, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.1099630996309963, |
|
"grad_norm": 52.667728424072266, |
|
"learning_rate": 9.163591635916358e-06, |
|
"loss": 1.0344, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.11070110701107011, |
|
"grad_norm": 49.742897033691406, |
|
"learning_rate": 9.22509225092251e-06, |
|
"loss": 0.9486, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.11143911439114391, |
|
"grad_norm": 50.35558319091797, |
|
"learning_rate": 9.28659286592866e-06, |
|
"loss": 1.1685, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.1121771217712177, |
|
"grad_norm": 49.48957824707031, |
|
"learning_rate": 9.34809348093481e-06, |
|
"loss": 0.9666, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.11291512915129151, |
|
"grad_norm": 46.834129333496094, |
|
"learning_rate": 9.40959409594096e-06, |
|
"loss": 1.0137, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.11365313653136531, |
|
"grad_norm": 46.92979049682617, |
|
"learning_rate": 9.47109471094711e-06, |
|
"loss": 1.042, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.11439114391143912, |
|
"grad_norm": 43.96043014526367, |
|
"learning_rate": 9.53259532595326e-06, |
|
"loss": 1.0363, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.11512915129151291, |
|
"grad_norm": 48.00889587402344, |
|
"learning_rate": 9.59409594095941e-06, |
|
"loss": 0.9697, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.11586715867158671, |
|
"grad_norm": 50.71873474121094, |
|
"learning_rate": 9.65559655596556e-06, |
|
"loss": 1.1216, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.11660516605166052, |
|
"grad_norm": 51.51930236816406, |
|
"learning_rate": 9.71709717097171e-06, |
|
"loss": 1.0876, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.11734317343173432, |
|
"grad_norm": 44.15366744995117, |
|
"learning_rate": 9.77859778597786e-06, |
|
"loss": 1.0607, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.11808118081180811, |
|
"grad_norm": 41.848602294921875, |
|
"learning_rate": 9.84009840098401e-06, |
|
"loss": 1.0026, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.11881918819188192, |
|
"grad_norm": 45.18868637084961, |
|
"learning_rate": 9.90159901599016e-06, |
|
"loss": 1.1803, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.11955719557195572, |
|
"grad_norm": 45.788673400878906, |
|
"learning_rate": 9.96309963099631e-06, |
|
"loss": 1.1451, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.12029520295202951, |
|
"grad_norm": 46.45803451538086, |
|
"learning_rate": 1.0024600246002461e-05, |
|
"loss": 0.9769, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.12103321033210332, |
|
"grad_norm": 46.782840728759766, |
|
"learning_rate": 1.008610086100861e-05, |
|
"loss": 1.2505, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.12177121771217712, |
|
"grad_norm": 45.39817810058594, |
|
"learning_rate": 1.0147601476014761e-05, |
|
"loss": 1.0927, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.12250922509225093, |
|
"grad_norm": 43.27733612060547, |
|
"learning_rate": 1.0209102091020911e-05, |
|
"loss": 1.1247, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.12324723247232472, |
|
"grad_norm": 47.766231536865234, |
|
"learning_rate": 1.027060270602706e-05, |
|
"loss": 1.079, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.12398523985239852, |
|
"grad_norm": 46.73952865600586, |
|
"learning_rate": 1.033210332103321e-05, |
|
"loss": 0.8357, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.12472324723247233, |
|
"grad_norm": 46.83552551269531, |
|
"learning_rate": 1.039360393603936e-05, |
|
"loss": 1.2159, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.12546125461254612, |
|
"grad_norm": 44.146846771240234, |
|
"learning_rate": 1.045510455104551e-05, |
|
"loss": 0.9941, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.12619926199261994, |
|
"grad_norm": 45.29106140136719, |
|
"learning_rate": 1.0516605166051662e-05, |
|
"loss": 1.1314, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.12693726937269373, |
|
"grad_norm": 46.10059356689453, |
|
"learning_rate": 1.057810578105781e-05, |
|
"loss": 1.0239, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.12767527675276752, |
|
"grad_norm": 42.55729293823242, |
|
"learning_rate": 1.0639606396063962e-05, |
|
"loss": 1.0389, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.12841328413284134, |
|
"grad_norm": 43.775760650634766, |
|
"learning_rate": 1.070110701107011e-05, |
|
"loss": 1.1492, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.12915129151291513, |
|
"grad_norm": 42.141910552978516, |
|
"learning_rate": 1.0762607626076261e-05, |
|
"loss": 1.092, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.12988929889298892, |
|
"grad_norm": 44.42767333984375, |
|
"learning_rate": 1.0824108241082411e-05, |
|
"loss": 1.1159, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.13062730627306274, |
|
"grad_norm": 38.9581184387207, |
|
"learning_rate": 1.0885608856088561e-05, |
|
"loss": 1.0921, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.13136531365313653, |
|
"grad_norm": 43.585147857666016, |
|
"learning_rate": 1.0947109471094711e-05, |
|
"loss": 1.0784, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.13210332103321032, |
|
"grad_norm": 49.25750732421875, |
|
"learning_rate": 1.100861008610086e-05, |
|
"loss": 1.1589, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.13284132841328414, |
|
"grad_norm": 38.27066421508789, |
|
"learning_rate": 1.1070110701107012e-05, |
|
"loss": 0.9549, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.13357933579335793, |
|
"grad_norm": 43.95482635498047, |
|
"learning_rate": 1.1131611316113162e-05, |
|
"loss": 1.1084, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.13431734317343175, |
|
"grad_norm": 47.86146926879883, |
|
"learning_rate": 1.1193111931119312e-05, |
|
"loss": 1.0305, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.13505535055350554, |
|
"grad_norm": 41.17548370361328, |
|
"learning_rate": 1.1254612546125462e-05, |
|
"loss": 1.0341, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.13579335793357933, |
|
"grad_norm": 50.34139633178711, |
|
"learning_rate": 1.1316113161131612e-05, |
|
"loss": 0.9769, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.13653136531365315, |
|
"grad_norm": 41.7880973815918, |
|
"learning_rate": 1.1377613776137762e-05, |
|
"loss": 1.047, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.13726937269372694, |
|
"grad_norm": 43.598392486572266, |
|
"learning_rate": 1.1439114391143913e-05, |
|
"loss": 0.9553, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.13800738007380073, |
|
"grad_norm": 44.27220153808594, |
|
"learning_rate": 1.1500615006150061e-05, |
|
"loss": 1.1314, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.13874538745387455, |
|
"grad_norm": 38.91771697998047, |
|
"learning_rate": 1.1562115621156213e-05, |
|
"loss": 1.0132, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.13948339483394834, |
|
"grad_norm": 44.32412338256836, |
|
"learning_rate": 1.1623616236162361e-05, |
|
"loss": 1.0672, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.14022140221402213, |
|
"grad_norm": 43.45479202270508, |
|
"learning_rate": 1.1685116851168513e-05, |
|
"loss": 1.0519, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.14095940959409595, |
|
"grad_norm": 46.94374084472656, |
|
"learning_rate": 1.1746617466174662e-05, |
|
"loss": 1.0721, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.14169741697416974, |
|
"grad_norm": 48.714927673339844, |
|
"learning_rate": 1.1808118081180812e-05, |
|
"loss": 1.095, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.14243542435424356, |
|
"grad_norm": 48.29472732543945, |
|
"learning_rate": 1.1869618696186962e-05, |
|
"loss": 1.1482, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.14317343173431735, |
|
"grad_norm": 43.912288665771484, |
|
"learning_rate": 1.1931119311193112e-05, |
|
"loss": 1.0994, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.14391143911439114, |
|
"grad_norm": 41.308799743652344, |
|
"learning_rate": 1.1992619926199262e-05, |
|
"loss": 1.2074, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.14464944649446496, |
|
"grad_norm": 43.36037826538086, |
|
"learning_rate": 1.2054120541205413e-05, |
|
"loss": 1.1435, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.14538745387453875, |
|
"grad_norm": 40.67462158203125, |
|
"learning_rate": 1.2115621156211563e-05, |
|
"loss": 0.9609, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.14612546125461254, |
|
"grad_norm": 43.331241607666016, |
|
"learning_rate": 1.2177121771217713e-05, |
|
"loss": 1.0909, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.14686346863468636, |
|
"grad_norm": 41.213863372802734, |
|
"learning_rate": 1.2238622386223863e-05, |
|
"loss": 1.0955, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.14760147601476015, |
|
"grad_norm": 43.54401397705078, |
|
"learning_rate": 1.2300123001230013e-05, |
|
"loss": 1.1855, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.14760147601476015, |
|
"eval_loss": 1.3390393257141113, |
|
"eval_runtime": 355.9656, |
|
"eval_samples_per_second": 3.228, |
|
"eval_steps_per_second": 0.27, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.14833948339483394, |
|
"grad_norm": 45.116146087646484, |
|
"learning_rate": 1.2361623616236164e-05, |
|
"loss": 1.1331, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.14907749077490776, |
|
"grad_norm": 48.80164337158203, |
|
"learning_rate": 1.2423124231242312e-05, |
|
"loss": 1.157, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.14981549815498155, |
|
"grad_norm": 41.02751922607422, |
|
"learning_rate": 1.2484624846248464e-05, |
|
"loss": 1.1237, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.15055350553505534, |
|
"grad_norm": 42.61967086791992, |
|
"learning_rate": 1.2546125461254612e-05, |
|
"loss": 1.1693, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.15129151291512916, |
|
"grad_norm": 43.75822067260742, |
|
"learning_rate": 1.2607626076260764e-05, |
|
"loss": 1.1545, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.15202952029520295, |
|
"grad_norm": 40.50026321411133, |
|
"learning_rate": 1.2669126691266914e-05, |
|
"loss": 1.061, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.15276752767527677, |
|
"grad_norm": 41.14898681640625, |
|
"learning_rate": 1.2730627306273063e-05, |
|
"loss": 0.9864, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.15350553505535056, |
|
"grad_norm": 44.43930435180664, |
|
"learning_rate": 1.2792127921279212e-05, |
|
"loss": 1.0444, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.15424354243542435, |
|
"grad_norm": 42.351226806640625, |
|
"learning_rate": 1.2853628536285365e-05, |
|
"loss": 1.0966, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.15498154981549817, |
|
"grad_norm": 39.365440368652344, |
|
"learning_rate": 1.2915129151291513e-05, |
|
"loss": 1.0987, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.15571955719557196, |
|
"grad_norm": 44.90658950805664, |
|
"learning_rate": 1.2976629766297663e-05, |
|
"loss": 1.0399, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.15645756457564575, |
|
"grad_norm": 38.08787536621094, |
|
"learning_rate": 1.3038130381303814e-05, |
|
"loss": 0.9539, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.15719557195571957, |
|
"grad_norm": 40.93101501464844, |
|
"learning_rate": 1.3099630996309964e-05, |
|
"loss": 0.9497, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.15793357933579336, |
|
"grad_norm": 42.12691116333008, |
|
"learning_rate": 1.3161131611316112e-05, |
|
"loss": 1.0591, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.15867158671586715, |
|
"grad_norm": 39.68405532836914, |
|
"learning_rate": 1.3222632226322266e-05, |
|
"loss": 1.1084, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.15940959409594097, |
|
"grad_norm": 46.32451629638672, |
|
"learning_rate": 1.3284132841328414e-05, |
|
"loss": 0.9886, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.16014760147601476, |
|
"grad_norm": 43.83405303955078, |
|
"learning_rate": 1.3345633456334564e-05, |
|
"loss": 1.0409, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.16088560885608857, |
|
"grad_norm": 46.454429626464844, |
|
"learning_rate": 1.3407134071340713e-05, |
|
"loss": 0.927, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.16162361623616237, |
|
"grad_norm": 43.32332229614258, |
|
"learning_rate": 1.3468634686346865e-05, |
|
"loss": 1.0885, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.16236162361623616, |
|
"grad_norm": 38.92317581176758, |
|
"learning_rate": 1.3530135301353015e-05, |
|
"loss": 1.1205, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.16309963099630997, |
|
"grad_norm": 36.57090759277344, |
|
"learning_rate": 1.3591635916359163e-05, |
|
"loss": 1.0607, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.16383763837638377, |
|
"grad_norm": 39.162147521972656, |
|
"learning_rate": 1.3653136531365315e-05, |
|
"loss": 1.1395, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.16457564575645756, |
|
"grad_norm": 40.069610595703125, |
|
"learning_rate": 1.3714637146371464e-05, |
|
"loss": 0.993, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.16531365313653137, |
|
"grad_norm": 38.262664794921875, |
|
"learning_rate": 1.3776137761377614e-05, |
|
"loss": 1.0751, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.16605166051660517, |
|
"grad_norm": 38.50648498535156, |
|
"learning_rate": 1.3837638376383766e-05, |
|
"loss": 1.0874, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.16678966789667896, |
|
"grad_norm": 41.57286834716797, |
|
"learning_rate": 1.3899138991389916e-05, |
|
"loss": 1.061, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.16752767527675277, |
|
"grad_norm": 38.842124938964844, |
|
"learning_rate": 1.3960639606396064e-05, |
|
"loss": 0.9865, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.16826568265682657, |
|
"grad_norm": 40.79179382324219, |
|
"learning_rate": 1.4022140221402214e-05, |
|
"loss": 1.0104, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.16900369003690036, |
|
"grad_norm": 40.540042877197266, |
|
"learning_rate": 1.4083640836408365e-05, |
|
"loss": 0.9352, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.16974169741697417, |
|
"grad_norm": 39.385459899902344, |
|
"learning_rate": 1.4145141451414515e-05, |
|
"loss": 1.0731, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.17047970479704797, |
|
"grad_norm": 40.35080337524414, |
|
"learning_rate": 1.4206642066420663e-05, |
|
"loss": 1.1106, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.17121771217712178, |
|
"grad_norm": 37.7828254699707, |
|
"learning_rate": 1.4268142681426816e-05, |
|
"loss": 1.0902, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.17195571955719557, |
|
"grad_norm": 38.59387969970703, |
|
"learning_rate": 1.4329643296432965e-05, |
|
"loss": 1.0837, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.17269372693726937, |
|
"grad_norm": 40.220245361328125, |
|
"learning_rate": 1.4391143911439114e-05, |
|
"loss": 1.1002, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.17343173431734318, |
|
"grad_norm": 41.30938720703125, |
|
"learning_rate": 1.4452644526445266e-05, |
|
"loss": 0.9605, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.17416974169741697, |
|
"grad_norm": 42.54692840576172, |
|
"learning_rate": 1.4514145141451416e-05, |
|
"loss": 1.1135, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.17490774907749077, |
|
"grad_norm": 38.45701217651367, |
|
"learning_rate": 1.4575645756457566e-05, |
|
"loss": 1.2065, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.17564575645756458, |
|
"grad_norm": 40.34320068359375, |
|
"learning_rate": 1.4637146371463714e-05, |
|
"loss": 1.0331, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.17638376383763837, |
|
"grad_norm": 39.82585144042969, |
|
"learning_rate": 1.4698646986469865e-05, |
|
"loss": 1.1597, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.17712177121771217, |
|
"grad_norm": 39.45707321166992, |
|
"learning_rate": 1.4760147601476015e-05, |
|
"loss": 1.1008, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.17785977859778598, |
|
"grad_norm": 37.564231872558594, |
|
"learning_rate": 1.4821648216482165e-05, |
|
"loss": 0.9734, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.17859778597785977, |
|
"grad_norm": 40.75583267211914, |
|
"learning_rate": 1.4883148831488317e-05, |
|
"loss": 1.1324, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.1793357933579336, |
|
"grad_norm": 36.91340255737305, |
|
"learning_rate": 1.4944649446494467e-05, |
|
"loss": 0.8858, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.18007380073800738, |
|
"grad_norm": 41.43409729003906, |
|
"learning_rate": 1.5006150061500615e-05, |
|
"loss": 1.127, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.18081180811808117, |
|
"grad_norm": 39.64106750488281, |
|
"learning_rate": 1.5067650676506768e-05, |
|
"loss": 1.0394, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.181549815498155, |
|
"grad_norm": 39.24397277832031, |
|
"learning_rate": 1.5129151291512916e-05, |
|
"loss": 1.1139, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.18228782287822878, |
|
"grad_norm": 39.08576965332031, |
|
"learning_rate": 1.5190651906519066e-05, |
|
"loss": 1.1373, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.18302583025830257, |
|
"grad_norm": 37.38773727416992, |
|
"learning_rate": 1.5252152521525214e-05, |
|
"loss": 0.9942, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.1837638376383764, |
|
"grad_norm": 39.011505126953125, |
|
"learning_rate": 1.5313653136531367e-05, |
|
"loss": 1.1033, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.18450184501845018, |
|
"grad_norm": 38.647705078125, |
|
"learning_rate": 1.5375153751537517e-05, |
|
"loss": 1.0039, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.18523985239852397, |
|
"grad_norm": 36.8840446472168, |
|
"learning_rate": 1.5436654366543664e-05, |
|
"loss": 1.037, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.1859778597785978, |
|
"grad_norm": 39.59068298339844, |
|
"learning_rate": 1.5498154981549817e-05, |
|
"loss": 1.1113, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.18671586715867158, |
|
"grad_norm": 35.01139450073242, |
|
"learning_rate": 1.5559655596555967e-05, |
|
"loss": 1.0766, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.18745387453874537, |
|
"grad_norm": 42.80155944824219, |
|
"learning_rate": 1.5621156211562117e-05, |
|
"loss": 1.2052, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.1881918819188192, |
|
"grad_norm": 37.67293930053711, |
|
"learning_rate": 1.5682656826568266e-05, |
|
"loss": 1.054, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.18892988929889298, |
|
"grad_norm": 35.59282684326172, |
|
"learning_rate": 1.5744157441574416e-05, |
|
"loss": 1.1038, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.1896678966789668, |
|
"grad_norm": 36.562198638916016, |
|
"learning_rate": 1.5805658056580566e-05, |
|
"loss": 1.1277, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.1904059040590406, |
|
"grad_norm": 38.406944274902344, |
|
"learning_rate": 1.5867158671586716e-05, |
|
"loss": 1.0396, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.19114391143911438, |
|
"grad_norm": 37.851539611816406, |
|
"learning_rate": 1.5928659286592866e-05, |
|
"loss": 1.0541, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.1918819188191882, |
|
"grad_norm": 34.81989669799805, |
|
"learning_rate": 1.5990159901599016e-05, |
|
"loss": 1.0241, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.192619926199262, |
|
"grad_norm": 38.74085235595703, |
|
"learning_rate": 1.6051660516605166e-05, |
|
"loss": 1.0709, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.19335793357933578, |
|
"grad_norm": 41.59756088256836, |
|
"learning_rate": 1.611316113161132e-05, |
|
"loss": 1.2334, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.1940959409594096, |
|
"grad_norm": 35.79509353637695, |
|
"learning_rate": 1.617466174661747e-05, |
|
"loss": 1.0133, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.1948339483394834, |
|
"grad_norm": 39.88947677612305, |
|
"learning_rate": 1.6236162361623615e-05, |
|
"loss": 1.0831, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.19557195571955718, |
|
"grad_norm": 35.988487243652344, |
|
"learning_rate": 1.629766297662977e-05, |
|
"loss": 1.0962, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.196309963099631, |
|
"grad_norm": 36.9556999206543, |
|
"learning_rate": 1.6359163591635918e-05, |
|
"loss": 1.1309, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.1970479704797048, |
|
"grad_norm": 36.95020294189453, |
|
"learning_rate": 1.6420664206642068e-05, |
|
"loss": 1.0556, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.1977859778597786, |
|
"grad_norm": 36.589324951171875, |
|
"learning_rate": 1.6482164821648215e-05, |
|
"loss": 1.0871, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.1985239852398524, |
|
"grad_norm": 38.176605224609375, |
|
"learning_rate": 1.6543665436654368e-05, |
|
"loss": 1.0362, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.1992619926199262, |
|
"grad_norm": 40.13340759277344, |
|
"learning_rate": 1.6605166051660518e-05, |
|
"loss": 0.9606, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 40.80103302001953, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.0099, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.2007380073800738, |
|
"grad_norm": 37.991947174072266, |
|
"learning_rate": 1.6728167281672817e-05, |
|
"loss": 1.1559, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.2014760147601476, |
|
"grad_norm": 35.638126373291016, |
|
"learning_rate": 1.6789667896678967e-05, |
|
"loss": 1.0468, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.2022140221402214, |
|
"grad_norm": 36.0762825012207, |
|
"learning_rate": 1.6851168511685117e-05, |
|
"loss": 0.9843, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.2029520295202952, |
|
"grad_norm": 39.42917251586914, |
|
"learning_rate": 1.691266912669127e-05, |
|
"loss": 0.995, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.203690036900369, |
|
"grad_norm": 38.73271179199219, |
|
"learning_rate": 1.6974169741697417e-05, |
|
"loss": 1.1101, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.2044280442804428, |
|
"grad_norm": 34.4466667175293, |
|
"learning_rate": 1.7035670356703567e-05, |
|
"loss": 1.1769, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.2051660516605166, |
|
"grad_norm": 38.39332580566406, |
|
"learning_rate": 1.7097170971709716e-05, |
|
"loss": 1.2032, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.2059040590405904, |
|
"grad_norm": 36.46586227416992, |
|
"learning_rate": 1.715867158671587e-05, |
|
"loss": 1.2505, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.2066420664206642, |
|
"grad_norm": 38.546119689941406, |
|
"learning_rate": 1.722017220172202e-05, |
|
"loss": 1.0471, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.207380073800738, |
|
"grad_norm": 36.11763381958008, |
|
"learning_rate": 1.7281672816728166e-05, |
|
"loss": 1.1173, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.20811808118081182, |
|
"grad_norm": 36.332969665527344, |
|
"learning_rate": 1.734317343173432e-05, |
|
"loss": 0.989, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.2088560885608856, |
|
"grad_norm": 36.8829231262207, |
|
"learning_rate": 1.740467404674047e-05, |
|
"loss": 1.0894, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.2095940959409594, |
|
"grad_norm": 35.905765533447266, |
|
"learning_rate": 1.746617466174662e-05, |
|
"loss": 1.1755, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.21033210332103322, |
|
"grad_norm": 31.39859962463379, |
|
"learning_rate": 1.752767527675277e-05, |
|
"loss": 1.089, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.211070110701107, |
|
"grad_norm": 36.529537200927734, |
|
"learning_rate": 1.758917589175892e-05, |
|
"loss": 1.0632, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.2118081180811808, |
|
"grad_norm": 38.358001708984375, |
|
"learning_rate": 1.765067650676507e-05, |
|
"loss": 1.1177, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.21254612546125462, |
|
"grad_norm": 37.179325103759766, |
|
"learning_rate": 1.771217712177122e-05, |
|
"loss": 1.0513, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.2132841328413284, |
|
"grad_norm": 35.38275146484375, |
|
"learning_rate": 1.7773677736777368e-05, |
|
"loss": 1.0212, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.2140221402214022, |
|
"grad_norm": 37.132389068603516, |
|
"learning_rate": 1.7835178351783518e-05, |
|
"loss": 1.089, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.21476014760147602, |
|
"grad_norm": 34.594783782958984, |
|
"learning_rate": 1.7896678966789668e-05, |
|
"loss": 1.1115, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.2154981549815498, |
|
"grad_norm": 36.57194137573242, |
|
"learning_rate": 1.795817958179582e-05, |
|
"loss": 0.9911, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.21623616236162363, |
|
"grad_norm": 34.58879470825195, |
|
"learning_rate": 1.8019680196801968e-05, |
|
"loss": 1.0169, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.21697416974169742, |
|
"grad_norm": 33.588539123535156, |
|
"learning_rate": 1.8081180811808117e-05, |
|
"loss": 1.0345, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.2177121771217712, |
|
"grad_norm": 34.15876007080078, |
|
"learning_rate": 1.814268142681427e-05, |
|
"loss": 1.0387, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.21845018450184503, |
|
"grad_norm": 40.78740310668945, |
|
"learning_rate": 1.820418204182042e-05, |
|
"loss": 1.0292, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.21918819188191882, |
|
"grad_norm": 38.307064056396484, |
|
"learning_rate": 1.826568265682657e-05, |
|
"loss": 1.087, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.2199261992619926, |
|
"grad_norm": 33.9033203125, |
|
"learning_rate": 1.8327183271832717e-05, |
|
"loss": 1.0356, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.22066420664206643, |
|
"grad_norm": 38.69232940673828, |
|
"learning_rate": 1.838868388683887e-05, |
|
"loss": 1.0239, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.22140221402214022, |
|
"grad_norm": 34.63215637207031, |
|
"learning_rate": 1.845018450184502e-05, |
|
"loss": 1.1614, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.22140221402214022, |
|
"eval_loss": 1.3183945417404175, |
|
"eval_runtime": 343.8464, |
|
"eval_samples_per_second": 3.342, |
|
"eval_steps_per_second": 0.279, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.222140221402214, |
|
"grad_norm": 33.18867111206055, |
|
"learning_rate": 1.851168511685117e-05, |
|
"loss": 1.1519, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.22287822878228783, |
|
"grad_norm": 34.760982513427734, |
|
"learning_rate": 1.857318573185732e-05, |
|
"loss": 1.1005, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.22361623616236162, |
|
"grad_norm": 34.268043518066406, |
|
"learning_rate": 1.863468634686347e-05, |
|
"loss": 1.0483, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.2243542435424354, |
|
"grad_norm": 35.12160873413086, |
|
"learning_rate": 1.869618696186962e-05, |
|
"loss": 1.1201, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.22509225092250923, |
|
"grad_norm": 38.57670974731445, |
|
"learning_rate": 1.8757687576875773e-05, |
|
"loss": 1.0204, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.22583025830258302, |
|
"grad_norm": 34.495235443115234, |
|
"learning_rate": 1.881918819188192e-05, |
|
"loss": 1.1887, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.22656826568265684, |
|
"grad_norm": 36.18799591064453, |
|
"learning_rate": 1.888068880688807e-05, |
|
"loss": 0.8969, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.22730627306273063, |
|
"grad_norm": 35.36030960083008, |
|
"learning_rate": 1.894218942189422e-05, |
|
"loss": 1.1272, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.22804428044280442, |
|
"grad_norm": 34.50253677368164, |
|
"learning_rate": 1.9003690036900372e-05, |
|
"loss": 0.9908, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.22878228782287824, |
|
"grad_norm": 33.003875732421875, |
|
"learning_rate": 1.906519065190652e-05, |
|
"loss": 1.0323, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.22952029520295203, |
|
"grad_norm": 33.84071731567383, |
|
"learning_rate": 1.912669126691267e-05, |
|
"loss": 1.0924, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.23025830258302582, |
|
"grad_norm": 37.590694427490234, |
|
"learning_rate": 1.918819188191882e-05, |
|
"loss": 1.0558, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.23099630996309964, |
|
"grad_norm": 1048.1514892578125, |
|
"learning_rate": 1.924969249692497e-05, |
|
"loss": 0.9793, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.23173431734317343, |
|
"grad_norm": 32.7579460144043, |
|
"learning_rate": 1.931119311193112e-05, |
|
"loss": 0.9345, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.23247232472324722, |
|
"grad_norm": 41.32646942138672, |
|
"learning_rate": 1.937269372693727e-05, |
|
"loss": 1.0441, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.23321033210332104, |
|
"grad_norm": 39.139198303222656, |
|
"learning_rate": 1.943419434194342e-05, |
|
"loss": 1.0545, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.23394833948339483, |
|
"grad_norm": 35.99794006347656, |
|
"learning_rate": 1.949569495694957e-05, |
|
"loss": 1.1261, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.23468634686346865, |
|
"grad_norm": 34.20968246459961, |
|
"learning_rate": 1.955719557195572e-05, |
|
"loss": 0.9836, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.23542435424354244, |
|
"grad_norm": 33.9476203918457, |
|
"learning_rate": 1.961869618696187e-05, |
|
"loss": 1.0345, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.23616236162361623, |
|
"grad_norm": 35.6599235534668, |
|
"learning_rate": 1.968019680196802e-05, |
|
"loss": 1.0316, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.23690036900369005, |
|
"grad_norm": 34.30624008178711, |
|
"learning_rate": 1.974169741697417e-05, |
|
"loss": 0.9987, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.23763837638376384, |
|
"grad_norm": 34.07005310058594, |
|
"learning_rate": 1.980319803198032e-05, |
|
"loss": 1.0052, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.23837638376383763, |
|
"grad_norm": 33.085777282714844, |
|
"learning_rate": 1.986469864698647e-05, |
|
"loss": 1.1424, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.23911439114391145, |
|
"grad_norm": 34.74597930908203, |
|
"learning_rate": 1.992619926199262e-05, |
|
"loss": 1.1401, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.23985239852398524, |
|
"grad_norm": 36.55511474609375, |
|
"learning_rate": 1.9987699876998773e-05, |
|
"loss": 0.9437, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.24059040590405903, |
|
"grad_norm": 35.86470031738281, |
|
"learning_rate": 2.0049200492004923e-05, |
|
"loss": 1.1535, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.24132841328413285, |
|
"grad_norm": 33.983421325683594, |
|
"learning_rate": 2.011070110701107e-05, |
|
"loss": 1.1367, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.24206642066420664, |
|
"grad_norm": 36.45722198486328, |
|
"learning_rate": 2.017220172201722e-05, |
|
"loss": 1.0917, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.24280442804428043, |
|
"grad_norm": 36.953060150146484, |
|
"learning_rate": 2.0233702337023372e-05, |
|
"loss": 1.0107, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.24354243542435425, |
|
"grad_norm": 37.92033004760742, |
|
"learning_rate": 2.0295202952029522e-05, |
|
"loss": 1.2084, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.24428044280442804, |
|
"grad_norm": 31.74508285522461, |
|
"learning_rate": 2.035670356703567e-05, |
|
"loss": 1.0421, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.24501845018450186, |
|
"grad_norm": 37.19945526123047, |
|
"learning_rate": 2.0418204182041822e-05, |
|
"loss": 1.082, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.24575645756457565, |
|
"grad_norm": 32.649444580078125, |
|
"learning_rate": 2.0479704797047972e-05, |
|
"loss": 1.1345, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.24649446494464944, |
|
"grad_norm": 36.957977294921875, |
|
"learning_rate": 2.054120541205412e-05, |
|
"loss": 1.0192, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.24723247232472326, |
|
"grad_norm": 32.36549377441406, |
|
"learning_rate": 2.060270602706027e-05, |
|
"loss": 1.1387, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.24797047970479705, |
|
"grad_norm": 30.191532135009766, |
|
"learning_rate": 2.066420664206642e-05, |
|
"loss": 1.0083, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.24870848708487084, |
|
"grad_norm": 31.56035804748535, |
|
"learning_rate": 2.072570725707257e-05, |
|
"loss": 1.1246, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.24944649446494466, |
|
"grad_norm": 36.50621032714844, |
|
"learning_rate": 2.078720787207872e-05, |
|
"loss": 1.1289, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.25018450184501845, |
|
"grad_norm": 32.51582336425781, |
|
"learning_rate": 2.084870848708487e-05, |
|
"loss": 0.9957, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.25092250922509224, |
|
"grad_norm": 40.50331115722656, |
|
"learning_rate": 2.091020910209102e-05, |
|
"loss": 1.0179, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.25166051660516603, |
|
"grad_norm": 38.418792724609375, |
|
"learning_rate": 2.097170971709717e-05, |
|
"loss": 1.1074, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.2523985239852399, |
|
"grad_norm": 33.0310173034668, |
|
"learning_rate": 2.1033210332103324e-05, |
|
"loss": 1.1623, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.25313653136531367, |
|
"grad_norm": 30.66373062133789, |
|
"learning_rate": 2.1094710947109474e-05, |
|
"loss": 0.9796, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.25387453874538746, |
|
"grad_norm": 30.335712432861328, |
|
"learning_rate": 2.115621156211562e-05, |
|
"loss": 1.0376, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.25461254612546125, |
|
"grad_norm": 33.595855712890625, |
|
"learning_rate": 2.1217712177121773e-05, |
|
"loss": 1.0289, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.25535055350553504, |
|
"grad_norm": 30.422454833984375, |
|
"learning_rate": 2.1279212792127923e-05, |
|
"loss": 1.0815, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.25608856088560883, |
|
"grad_norm": 38.317386627197266, |
|
"learning_rate": 2.1340713407134073e-05, |
|
"loss": 1.0096, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.2568265682656827, |
|
"grad_norm": 36.44529342651367, |
|
"learning_rate": 2.140221402214022e-05, |
|
"loss": 1.006, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.25756457564575647, |
|
"grad_norm": 33.271060943603516, |
|
"learning_rate": 2.1463714637146373e-05, |
|
"loss": 0.9819, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.25830258302583026, |
|
"grad_norm": 35.99654769897461, |
|
"learning_rate": 2.1525215252152523e-05, |
|
"loss": 1.1038, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.25904059040590405, |
|
"grad_norm": 34.73610305786133, |
|
"learning_rate": 2.1586715867158673e-05, |
|
"loss": 1.065, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.25977859778597784, |
|
"grad_norm": 37.899776458740234, |
|
"learning_rate": 2.1648216482164822e-05, |
|
"loss": 1.1092, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.2605166051660517, |
|
"grad_norm": 36.49541473388672, |
|
"learning_rate": 2.1709717097170972e-05, |
|
"loss": 1.1665, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.2612546125461255, |
|
"grad_norm": 35.63615798950195, |
|
"learning_rate": 2.1771217712177122e-05, |
|
"loss": 1.1201, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.26199261992619927, |
|
"grad_norm": 34.21985626220703, |
|
"learning_rate": 2.1832718327183275e-05, |
|
"loss": 1.0518, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.26273062730627306, |
|
"grad_norm": 33.33612823486328, |
|
"learning_rate": 2.1894218942189422e-05, |
|
"loss": 0.964, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.26346863468634685, |
|
"grad_norm": 33.31211471557617, |
|
"learning_rate": 2.195571955719557e-05, |
|
"loss": 1.0508, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.26420664206642064, |
|
"grad_norm": 32.13766860961914, |
|
"learning_rate": 2.201722017220172e-05, |
|
"loss": 1.1904, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.2649446494464945, |
|
"grad_norm": 38.23426818847656, |
|
"learning_rate": 2.2078720787207875e-05, |
|
"loss": 1.044, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.2656826568265683, |
|
"grad_norm": 30.594451904296875, |
|
"learning_rate": 2.2140221402214025e-05, |
|
"loss": 0.8797, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.26642066420664207, |
|
"grad_norm": 33.05818557739258, |
|
"learning_rate": 2.220172201722017e-05, |
|
"loss": 1.1213, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.26715867158671586, |
|
"grad_norm": 31.24005126953125, |
|
"learning_rate": 2.2263222632226324e-05, |
|
"loss": 1.1148, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.26789667896678965, |
|
"grad_norm": 33.34355926513672, |
|
"learning_rate": 2.2324723247232474e-05, |
|
"loss": 1.0186, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.2686346863468635, |
|
"grad_norm": 32.711002349853516, |
|
"learning_rate": 2.2386223862238624e-05, |
|
"loss": 1.0628, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.2693726937269373, |
|
"grad_norm": 31.853166580200195, |
|
"learning_rate": 2.2447724477244774e-05, |
|
"loss": 1.0366, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.2701107011070111, |
|
"grad_norm": 32.53550720214844, |
|
"learning_rate": 2.2509225092250924e-05, |
|
"loss": 1.076, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.27084870848708487, |
|
"grad_norm": 29.53455924987793, |
|
"learning_rate": 2.2570725707257074e-05, |
|
"loss": 1.0598, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.27158671586715866, |
|
"grad_norm": 34.44631576538086, |
|
"learning_rate": 2.2632226322263223e-05, |
|
"loss": 1.1174, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.27232472324723245, |
|
"grad_norm": 33.80080032348633, |
|
"learning_rate": 2.2693726937269373e-05, |
|
"loss": 1.205, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.2730627306273063, |
|
"grad_norm": 33.64272689819336, |
|
"learning_rate": 2.2755227552275523e-05, |
|
"loss": 1.1677, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.2738007380073801, |
|
"grad_norm": 32.4225959777832, |
|
"learning_rate": 2.2816728167281673e-05, |
|
"loss": 0.9153, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.2745387453874539, |
|
"grad_norm": 32.35124969482422, |
|
"learning_rate": 2.2878228782287826e-05, |
|
"loss": 1.0536, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.27527675276752767, |
|
"grad_norm": 32.049827575683594, |
|
"learning_rate": 2.2939729397293973e-05, |
|
"loss": 1.1493, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.27601476014760146, |
|
"grad_norm": 29.892070770263672, |
|
"learning_rate": 2.3001230012300123e-05, |
|
"loss": 1.0047, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.2767527675276753, |
|
"grad_norm": 30.831012725830078, |
|
"learning_rate": 2.3062730627306276e-05, |
|
"loss": 1.0843, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.2774907749077491, |
|
"grad_norm": 31.903175354003906, |
|
"learning_rate": 2.3124231242312426e-05, |
|
"loss": 1.0552, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.2782287822878229, |
|
"grad_norm": 31.119150161743164, |
|
"learning_rate": 2.3185731857318575e-05, |
|
"loss": 1.0762, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.2789667896678967, |
|
"grad_norm": 34.476524353027344, |
|
"learning_rate": 2.3247232472324722e-05, |
|
"loss": 0.925, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.27970479704797047, |
|
"grad_norm": 33.33213806152344, |
|
"learning_rate": 2.3308733087330875e-05, |
|
"loss": 1.0427, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.28044280442804426, |
|
"grad_norm": 30.07733917236328, |
|
"learning_rate": 2.3370233702337025e-05, |
|
"loss": 1.1158, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.2811808118081181, |
|
"grad_norm": 36.79194259643555, |
|
"learning_rate": 2.3431734317343175e-05, |
|
"loss": 0.969, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.2819188191881919, |
|
"grad_norm": 32.193233489990234, |
|
"learning_rate": 2.3493234932349325e-05, |
|
"loss": 0.938, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.2826568265682657, |
|
"grad_norm": 35.39616394042969, |
|
"learning_rate": 2.3554735547355475e-05, |
|
"loss": 1.0384, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.2833948339483395, |
|
"grad_norm": 32.57839584350586, |
|
"learning_rate": 2.3616236162361624e-05, |
|
"loss": 1.0573, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.28413284132841327, |
|
"grad_norm": 34.920528411865234, |
|
"learning_rate": 2.3677736777367778e-05, |
|
"loss": 0.9427, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.2848708487084871, |
|
"grad_norm": 34.9754753112793, |
|
"learning_rate": 2.3739237392373924e-05, |
|
"loss": 1.0893, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.2856088560885609, |
|
"grad_norm": 31.592897415161133, |
|
"learning_rate": 2.3800738007380074e-05, |
|
"loss": 1.1378, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.2863468634686347, |
|
"grad_norm": 32.26739501953125, |
|
"learning_rate": 2.3862238622386224e-05, |
|
"loss": 1.0627, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.2870848708487085, |
|
"grad_norm": 30.732433319091797, |
|
"learning_rate": 2.3923739237392377e-05, |
|
"loss": 1.0358, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.2878228782287823, |
|
"grad_norm": 34.005191802978516, |
|
"learning_rate": 2.3985239852398524e-05, |
|
"loss": 1.1111, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.28856088560885607, |
|
"grad_norm": 30.67894744873047, |
|
"learning_rate": 2.4046740467404673e-05, |
|
"loss": 0.9718, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.2892988929889299, |
|
"grad_norm": 28.351181030273438, |
|
"learning_rate": 2.4108241082410827e-05, |
|
"loss": 1.0609, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.2900369003690037, |
|
"grad_norm": 32.102474212646484, |
|
"learning_rate": 2.4169741697416977e-05, |
|
"loss": 1.1381, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.2907749077490775, |
|
"grad_norm": 33.687625885009766, |
|
"learning_rate": 2.4231242312423126e-05, |
|
"loss": 1.1188, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.2915129151291513, |
|
"grad_norm": 33.333797454833984, |
|
"learning_rate": 2.4292742927429276e-05, |
|
"loss": 1.1755, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.2922509225092251, |
|
"grad_norm": 29.862483978271484, |
|
"learning_rate": 2.4354243542435426e-05, |
|
"loss": 0.9939, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.29298892988929887, |
|
"grad_norm": 34.118682861328125, |
|
"learning_rate": 2.4415744157441576e-05, |
|
"loss": 1.0769, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.2937269372693727, |
|
"grad_norm": 31.04990005493164, |
|
"learning_rate": 2.4477244772447726e-05, |
|
"loss": 0.9994, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.2944649446494465, |
|
"grad_norm": 31.455734252929688, |
|
"learning_rate": 2.4538745387453876e-05, |
|
"loss": 1.052, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.2952029520295203, |
|
"grad_norm": 33.53933334350586, |
|
"learning_rate": 2.4600246002460025e-05, |
|
"loss": 1.0479, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2952029520295203, |
|
"eval_loss": 1.3168951272964478, |
|
"eval_runtime": 307.3734, |
|
"eval_samples_per_second": 3.738, |
|
"eval_steps_per_second": 0.312, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2959409594095941, |
|
"grad_norm": 30.59261703491211, |
|
"learning_rate": 2.4661746617466175e-05, |
|
"loss": 1.0978, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.2966789667896679, |
|
"grad_norm": 30.34042739868164, |
|
"learning_rate": 2.472324723247233e-05, |
|
"loss": 0.9811, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.2974169741697417, |
|
"grad_norm": 30.172008514404297, |
|
"learning_rate": 2.4784747847478475e-05, |
|
"loss": 1.1006, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.2981549815498155, |
|
"grad_norm": 34.521026611328125, |
|
"learning_rate": 2.4846248462484625e-05, |
|
"loss": 1.0414, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.2988929889298893, |
|
"grad_norm": 32.659603118896484, |
|
"learning_rate": 2.4907749077490778e-05, |
|
"loss": 1.0581, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.2996309963099631, |
|
"grad_norm": 30.84364128112793, |
|
"learning_rate": 2.4969249692496928e-05, |
|
"loss": 1.0734, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.3003690036900369, |
|
"grad_norm": 31.31522560119629, |
|
"learning_rate": 2.5030750307503074e-05, |
|
"loss": 1.1324, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.3011070110701107, |
|
"grad_norm": 30.90158462524414, |
|
"learning_rate": 2.5092250922509224e-05, |
|
"loss": 1.0875, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.3018450184501845, |
|
"grad_norm": 32.63178634643555, |
|
"learning_rate": 2.5153751537515374e-05, |
|
"loss": 0.947, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.3025830258302583, |
|
"grad_norm": 31.25884246826172, |
|
"learning_rate": 2.5215252152521527e-05, |
|
"loss": 0.9885, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.3033210332103321, |
|
"grad_norm": 31.27341651916504, |
|
"learning_rate": 2.5276752767527677e-05, |
|
"loss": 1.0252, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.3040590405904059, |
|
"grad_norm": 32.48451232910156, |
|
"learning_rate": 2.5338253382533827e-05, |
|
"loss": 0.9561, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.3047970479704797, |
|
"grad_norm": 32.380348205566406, |
|
"learning_rate": 2.5399753997539977e-05, |
|
"loss": 1.0956, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.30553505535055353, |
|
"grad_norm": 35.79043960571289, |
|
"learning_rate": 2.5461254612546127e-05, |
|
"loss": 0.9773, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.3062730627306273, |
|
"grad_norm": 32.07080078125, |
|
"learning_rate": 2.5522755227552277e-05, |
|
"loss": 0.9709, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.3070110701107011, |
|
"grad_norm": 30.587440490722656, |
|
"learning_rate": 2.5584255842558423e-05, |
|
"loss": 1.1602, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.3077490774907749, |
|
"grad_norm": 32.147560119628906, |
|
"learning_rate": 2.564575645756458e-05, |
|
"loss": 1.0212, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.3084870848708487, |
|
"grad_norm": 28.960500717163086, |
|
"learning_rate": 2.570725707257073e-05, |
|
"loss": 1.0724, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.3092250922509225, |
|
"grad_norm": 31.89568519592285, |
|
"learning_rate": 2.5768757687576876e-05, |
|
"loss": 1.0993, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.30996309963099633, |
|
"grad_norm": 28.9609317779541, |
|
"learning_rate": 2.5830258302583026e-05, |
|
"loss": 1.0213, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.3107011070110701, |
|
"grad_norm": 32.195152282714844, |
|
"learning_rate": 2.5891758917589176e-05, |
|
"loss": 1.1312, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.3114391143911439, |
|
"grad_norm": 32.34213638305664, |
|
"learning_rate": 2.5953259532595326e-05, |
|
"loss": 1.1141, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.3121771217712177, |
|
"grad_norm": 30.041015625, |
|
"learning_rate": 2.6014760147601475e-05, |
|
"loss": 1.0912, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.3129151291512915, |
|
"grad_norm": 34.097068786621094, |
|
"learning_rate": 2.607626076260763e-05, |
|
"loss": 1.1144, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.31365313653136534, |
|
"grad_norm": 33.118072509765625, |
|
"learning_rate": 2.613776137761378e-05, |
|
"loss": 1.0424, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.31439114391143913, |
|
"grad_norm": 32.24378967285156, |
|
"learning_rate": 2.619926199261993e-05, |
|
"loss": 1.1218, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.3151291512915129, |
|
"grad_norm": 29.910358428955078, |
|
"learning_rate": 2.6260762607626078e-05, |
|
"loss": 1.0043, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.3158671586715867, |
|
"grad_norm": 28.464271545410156, |
|
"learning_rate": 2.6322263222632225e-05, |
|
"loss": 1.0377, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.3166051660516605, |
|
"grad_norm": 33.54305648803711, |
|
"learning_rate": 2.6383763837638375e-05, |
|
"loss": 1.0836, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.3173431734317343, |
|
"grad_norm": 33.36182403564453, |
|
"learning_rate": 2.644526445264453e-05, |
|
"loss": 0.935, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.31808118081180814, |
|
"grad_norm": 30.69318962097168, |
|
"learning_rate": 2.650676506765068e-05, |
|
"loss": 0.9393, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.31881918819188193, |
|
"grad_norm": 31.307289123535156, |
|
"learning_rate": 2.6568265682656828e-05, |
|
"loss": 1.0578, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.3195571955719557, |
|
"grad_norm": 30.9537353515625, |
|
"learning_rate": 2.6629766297662977e-05, |
|
"loss": 0.978, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.3202952029520295, |
|
"grad_norm": 34.1992073059082, |
|
"learning_rate": 2.6691266912669127e-05, |
|
"loss": 1.109, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.3210332103321033, |
|
"grad_norm": 35.864681243896484, |
|
"learning_rate": 2.6752767527675277e-05, |
|
"loss": 1.0984, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.32177121771217715, |
|
"grad_norm": 37.84678649902344, |
|
"learning_rate": 2.6814268142681427e-05, |
|
"loss": 1.1034, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.32250922509225094, |
|
"grad_norm": 32.07746124267578, |
|
"learning_rate": 2.687576875768758e-05, |
|
"loss": 0.9589, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.32324723247232473, |
|
"grad_norm": 30.982397079467773, |
|
"learning_rate": 2.693726937269373e-05, |
|
"loss": 0.957, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.3239852398523985, |
|
"grad_norm": 32.20938491821289, |
|
"learning_rate": 2.699876998769988e-05, |
|
"loss": 1.0958, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.3247232472324723, |
|
"grad_norm": 30.640172958374023, |
|
"learning_rate": 2.706027060270603e-05, |
|
"loss": 1.0231, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.3254612546125461, |
|
"grad_norm": 31.90199851989746, |
|
"learning_rate": 2.7121771217712176e-05, |
|
"loss": 1.1002, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.32619926199261995, |
|
"grad_norm": 30.51987075805664, |
|
"learning_rate": 2.7183271832718326e-05, |
|
"loss": 1.162, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.32693726937269374, |
|
"grad_norm": 31.501314163208008, |
|
"learning_rate": 2.7244772447724476e-05, |
|
"loss": 1.0607, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.32767527675276753, |
|
"grad_norm": 28.6356143951416, |
|
"learning_rate": 2.730627306273063e-05, |
|
"loss": 0.9796, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.3284132841328413, |
|
"grad_norm": 31.74925422668457, |
|
"learning_rate": 2.736777367773678e-05, |
|
"loss": 1.1158, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.3291512915129151, |
|
"grad_norm": 34.154579162597656, |
|
"learning_rate": 2.742927429274293e-05, |
|
"loss": 1.0649, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.3298892988929889, |
|
"grad_norm": 32.25503158569336, |
|
"learning_rate": 2.749077490774908e-05, |
|
"loss": 1.1914, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.33062730627306275, |
|
"grad_norm": 37.06145477294922, |
|
"learning_rate": 2.755227552275523e-05, |
|
"loss": 1.0985, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.33136531365313654, |
|
"grad_norm": 31.48094367980957, |
|
"learning_rate": 2.761377613776138e-05, |
|
"loss": 1.0892, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.33210332103321033, |
|
"grad_norm": 32.612770080566406, |
|
"learning_rate": 2.767527675276753e-05, |
|
"loss": 1.0109, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.3328413284132841, |
|
"grad_norm": 31.58296775817871, |
|
"learning_rate": 2.773677736777368e-05, |
|
"loss": 0.97, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.3335793357933579, |
|
"grad_norm": 34.60434341430664, |
|
"learning_rate": 2.779827798277983e-05, |
|
"loss": 1.0432, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.33431734317343176, |
|
"grad_norm": 34.914894104003906, |
|
"learning_rate": 2.7859778597785978e-05, |
|
"loss": 1.1001, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.33505535055350555, |
|
"grad_norm": 35.59685134887695, |
|
"learning_rate": 2.7921279212792128e-05, |
|
"loss": 1.2244, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.33579335793357934, |
|
"grad_norm": 29.713642120361328, |
|
"learning_rate": 2.7982779827798277e-05, |
|
"loss": 0.9019, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.33653136531365313, |
|
"grad_norm": 31.13001823425293, |
|
"learning_rate": 2.8044280442804427e-05, |
|
"loss": 1.0366, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.3372693726937269, |
|
"grad_norm": 30.281965255737305, |
|
"learning_rate": 2.810578105781058e-05, |
|
"loss": 1.0273, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.3380073800738007, |
|
"grad_norm": 31.66211700439453, |
|
"learning_rate": 2.816728167281673e-05, |
|
"loss": 1.1194, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.33874538745387456, |
|
"grad_norm": 30.275386810302734, |
|
"learning_rate": 2.822878228782288e-05, |
|
"loss": 1.0575, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.33948339483394835, |
|
"grad_norm": 29.42925453186035, |
|
"learning_rate": 2.829028290282903e-05, |
|
"loss": 0.9656, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.34022140221402214, |
|
"grad_norm": 32.71029281616211, |
|
"learning_rate": 2.835178351783518e-05, |
|
"loss": 1.1847, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.34095940959409593, |
|
"grad_norm": 29.633073806762695, |
|
"learning_rate": 2.8413284132841326e-05, |
|
"loss": 1.0942, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.3416974169741697, |
|
"grad_norm": 31.828601837158203, |
|
"learning_rate": 2.8474784747847476e-05, |
|
"loss": 1.0376, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.34243542435424357, |
|
"grad_norm": 30.043981552124023, |
|
"learning_rate": 2.8536285362853633e-05, |
|
"loss": 1.0835, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.34317343173431736, |
|
"grad_norm": 33.54213333129883, |
|
"learning_rate": 2.8597785977859783e-05, |
|
"loss": 0.996, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.34391143911439115, |
|
"grad_norm": 29.244539260864258, |
|
"learning_rate": 2.865928659286593e-05, |
|
"loss": 1.0677, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.34464944649446494, |
|
"grad_norm": 30.86827278137207, |
|
"learning_rate": 2.872078720787208e-05, |
|
"loss": 0.9887, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.34538745387453873, |
|
"grad_norm": 31.78754997253418, |
|
"learning_rate": 2.878228782287823e-05, |
|
"loss": 0.9915, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.3461254612546125, |
|
"grad_norm": 32.79195785522461, |
|
"learning_rate": 2.884378843788438e-05, |
|
"loss": 1.1147, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.34686346863468637, |
|
"grad_norm": 33.397979736328125, |
|
"learning_rate": 2.8905289052890532e-05, |
|
"loss": 0.9495, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.34760147601476016, |
|
"grad_norm": 33.192649841308594, |
|
"learning_rate": 2.8966789667896682e-05, |
|
"loss": 1.0026, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.34833948339483395, |
|
"grad_norm": 32.53486251831055, |
|
"learning_rate": 2.9028290282902832e-05, |
|
"loss": 1.0896, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.34907749077490774, |
|
"grad_norm": 29.988269805908203, |
|
"learning_rate": 2.908979089790898e-05, |
|
"loss": 1.0286, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.34981549815498153, |
|
"grad_norm": 30.389328002929688, |
|
"learning_rate": 2.915129151291513e-05, |
|
"loss": 1.0617, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.3505535055350554, |
|
"grad_norm": 32.341678619384766, |
|
"learning_rate": 2.9212792127921278e-05, |
|
"loss": 0.9784, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.35129151291512917, |
|
"grad_norm": 34.1507453918457, |
|
"learning_rate": 2.9274292742927428e-05, |
|
"loss": 1.1268, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.35202952029520296, |
|
"grad_norm": 30.625898361206055, |
|
"learning_rate": 2.9335793357933584e-05, |
|
"loss": 1.1621, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.35276752767527675, |
|
"grad_norm": 29.35662841796875, |
|
"learning_rate": 2.939729397293973e-05, |
|
"loss": 0.9967, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.35350553505535054, |
|
"grad_norm": 28.236364364624023, |
|
"learning_rate": 2.945879458794588e-05, |
|
"loss": 1.0189, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.35424354243542433, |
|
"grad_norm": 29.935972213745117, |
|
"learning_rate": 2.952029520295203e-05, |
|
"loss": 1.1403, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.3549815498154982, |
|
"grad_norm": 30.732343673706055, |
|
"learning_rate": 2.958179581795818e-05, |
|
"loss": 1.0329, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.35571955719557197, |
|
"grad_norm": 27.611663818359375, |
|
"learning_rate": 2.964329643296433e-05, |
|
"loss": 0.9701, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.35645756457564576, |
|
"grad_norm": 26.146472930908203, |
|
"learning_rate": 2.970479704797048e-05, |
|
"loss": 1.0555, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.35719557195571955, |
|
"grad_norm": 27.38328742980957, |
|
"learning_rate": 2.9766297662976633e-05, |
|
"loss": 1.0839, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.35793357933579334, |
|
"grad_norm": 30.21470832824707, |
|
"learning_rate": 2.9827798277982783e-05, |
|
"loss": 0.9601, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.3586715867158672, |
|
"grad_norm": 33.275665283203125, |
|
"learning_rate": 2.9889298892988933e-05, |
|
"loss": 0.9648, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.359409594095941, |
|
"grad_norm": 32.144935607910156, |
|
"learning_rate": 2.995079950799508e-05, |
|
"loss": 1.0774, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.36014760147601477, |
|
"grad_norm": 33.03762435913086, |
|
"learning_rate": 3.001230012300123e-05, |
|
"loss": 1.0353, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.36088560885608856, |
|
"grad_norm": 29.72600555419922, |
|
"learning_rate": 3.007380073800738e-05, |
|
"loss": 1.0075, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.36162361623616235, |
|
"grad_norm": 31.551420211791992, |
|
"learning_rate": 3.0135301353013536e-05, |
|
"loss": 1.1612, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.36236162361623614, |
|
"grad_norm": 31.255245208740234, |
|
"learning_rate": 3.0196801968019682e-05, |
|
"loss": 1.1291, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.36309963099631, |
|
"grad_norm": 28.523984909057617, |
|
"learning_rate": 3.0258302583025832e-05, |
|
"loss": 0.8965, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.3638376383763838, |
|
"grad_norm": 27.026256561279297, |
|
"learning_rate": 3.0319803198031982e-05, |
|
"loss": 0.9842, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.36457564575645757, |
|
"grad_norm": 27.513683319091797, |
|
"learning_rate": 3.0381303813038132e-05, |
|
"loss": 1.0663, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.36531365313653136, |
|
"grad_norm": 28.917890548706055, |
|
"learning_rate": 3.0442804428044282e-05, |
|
"loss": 1.0083, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.36605166051660515, |
|
"grad_norm": 30.66982650756836, |
|
"learning_rate": 3.0504305043050428e-05, |
|
"loss": 1.065, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.36678966789667894, |
|
"grad_norm": 29.29199981689453, |
|
"learning_rate": 3.056580565805658e-05, |
|
"loss": 1.1113, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.3675276752767528, |
|
"grad_norm": 30.53307342529297, |
|
"learning_rate": 3.0627306273062735e-05, |
|
"loss": 1.0564, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.3682656826568266, |
|
"grad_norm": 27.8240909576416, |
|
"learning_rate": 3.068880688806888e-05, |
|
"loss": 1.086, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.36900369003690037, |
|
"grad_norm": 33.0767936706543, |
|
"learning_rate": 3.0750307503075034e-05, |
|
"loss": 1.0258, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.36900369003690037, |
|
"eval_loss": 1.317694902420044, |
|
"eval_runtime": 307.5192, |
|
"eval_samples_per_second": 3.736, |
|
"eval_steps_per_second": 0.312, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.36974169741697416, |
|
"grad_norm": 29.415969848632812, |
|
"learning_rate": 3.081180811808118e-05, |
|
"loss": 1.1926, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.37047970479704795, |
|
"grad_norm": 28.967937469482422, |
|
"learning_rate": 3.087330873308733e-05, |
|
"loss": 1.0652, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.3712177121771218, |
|
"grad_norm": 30.757186889648438, |
|
"learning_rate": 3.093480934809348e-05, |
|
"loss": 1.0759, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.3719557195571956, |
|
"grad_norm": 29.12079429626465, |
|
"learning_rate": 3.0996309963099634e-05, |
|
"loss": 1.0171, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.3726937269372694, |
|
"grad_norm": 27.398155212402344, |
|
"learning_rate": 3.105781057810579e-05, |
|
"loss": 1.0255, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.37343173431734317, |
|
"grad_norm": 30.28290557861328, |
|
"learning_rate": 3.1119311193111933e-05, |
|
"loss": 1.0215, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.37416974169741696, |
|
"grad_norm": 32.874385833740234, |
|
"learning_rate": 3.118081180811808e-05, |
|
"loss": 0.9185, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.37490774907749075, |
|
"grad_norm": 32.606929779052734, |
|
"learning_rate": 3.124231242312423e-05, |
|
"loss": 0.921, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.3756457564575646, |
|
"grad_norm": 32.026466369628906, |
|
"learning_rate": 3.130381303813038e-05, |
|
"loss": 0.9647, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.3763837638376384, |
|
"grad_norm": 28.804256439208984, |
|
"learning_rate": 3.136531365313653e-05, |
|
"loss": 0.9783, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.3771217712177122, |
|
"grad_norm": 33.4760627746582, |
|
"learning_rate": 3.1426814268142686e-05, |
|
"loss": 1.1102, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.37785977859778597, |
|
"grad_norm": 27.7533016204834, |
|
"learning_rate": 3.148831488314883e-05, |
|
"loss": 1.0607, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.37859778597785976, |
|
"grad_norm": 30.21308135986328, |
|
"learning_rate": 3.1549815498154986e-05, |
|
"loss": 0.9921, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.3793357933579336, |
|
"grad_norm": 30.123981475830078, |
|
"learning_rate": 3.161131611316113e-05, |
|
"loss": 1.0603, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.3800738007380074, |
|
"grad_norm": 31.298110961914062, |
|
"learning_rate": 3.167281672816728e-05, |
|
"loss": 1.0396, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.3808118081180812, |
|
"grad_norm": 29.31854248046875, |
|
"learning_rate": 3.173431734317343e-05, |
|
"loss": 1.0797, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.381549815498155, |
|
"grad_norm": 32.191680908203125, |
|
"learning_rate": 3.1795817958179585e-05, |
|
"loss": 0.9568, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.38228782287822877, |
|
"grad_norm": 31.62862777709961, |
|
"learning_rate": 3.185731857318573e-05, |
|
"loss": 1.1659, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.38302583025830256, |
|
"grad_norm": 28.874908447265625, |
|
"learning_rate": 3.1918819188191885e-05, |
|
"loss": 1.0192, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.3837638376383764, |
|
"grad_norm": 28.602893829345703, |
|
"learning_rate": 3.198031980319803e-05, |
|
"loss": 1.064, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.3845018450184502, |
|
"grad_norm": 30.128530502319336, |
|
"learning_rate": 3.2041820418204185e-05, |
|
"loss": 0.9613, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.385239852398524, |
|
"grad_norm": 29.335969924926758, |
|
"learning_rate": 3.210332103321033e-05, |
|
"loss": 1.0305, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.3859778597785978, |
|
"grad_norm": 28.34609031677246, |
|
"learning_rate": 3.2164821648216484e-05, |
|
"loss": 1.1001, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.38671586715867157, |
|
"grad_norm": 29.133621215820312, |
|
"learning_rate": 3.222632226322264e-05, |
|
"loss": 1.0011, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.3874538745387454, |
|
"grad_norm": 29.79188346862793, |
|
"learning_rate": 3.2287822878228784e-05, |
|
"loss": 0.8858, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.3881918819188192, |
|
"grad_norm": 33.12505340576172, |
|
"learning_rate": 3.234932349323494e-05, |
|
"loss": 1.0749, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.388929889298893, |
|
"grad_norm": 28.103736877441406, |
|
"learning_rate": 3.2410824108241084e-05, |
|
"loss": 1.02, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.3896678966789668, |
|
"grad_norm": 29.42950439453125, |
|
"learning_rate": 3.247232472324723e-05, |
|
"loss": 1.0181, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.3904059040590406, |
|
"grad_norm": 28.812963485717773, |
|
"learning_rate": 3.2533825338253383e-05, |
|
"loss": 1.1254, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.39114391143911437, |
|
"grad_norm": 30.136219024658203, |
|
"learning_rate": 3.259532595325954e-05, |
|
"loss": 1.1222, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.3918819188191882, |
|
"grad_norm": 33.467960357666016, |
|
"learning_rate": 3.265682656826568e-05, |
|
"loss": 1.0028, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.392619926199262, |
|
"grad_norm": 32.62849044799805, |
|
"learning_rate": 3.2718327183271836e-05, |
|
"loss": 1.1019, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.3933579335793358, |
|
"grad_norm": 31.51215171813965, |
|
"learning_rate": 3.277982779827798e-05, |
|
"loss": 1.1408, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.3940959409594096, |
|
"grad_norm": 31.761720657348633, |
|
"learning_rate": 3.2841328413284136e-05, |
|
"loss": 0.9927, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.3948339483394834, |
|
"grad_norm": 28.129587173461914, |
|
"learning_rate": 3.290282902829028e-05, |
|
"loss": 0.9439, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.3955719557195572, |
|
"grad_norm": 31.913143157958984, |
|
"learning_rate": 3.296432964329643e-05, |
|
"loss": 1.0182, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.396309963099631, |
|
"grad_norm": 28.858692169189453, |
|
"learning_rate": 3.302583025830259e-05, |
|
"loss": 1.1423, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.3970479704797048, |
|
"grad_norm": 39.564964294433594, |
|
"learning_rate": 3.3087330873308736e-05, |
|
"loss": 1.0672, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.3977859778597786, |
|
"grad_norm": 35.25300216674805, |
|
"learning_rate": 3.314883148831489e-05, |
|
"loss": 1.0794, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.3985239852398524, |
|
"grad_norm": 28.474002838134766, |
|
"learning_rate": 3.3210332103321035e-05, |
|
"loss": 1.1484, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.3992619926199262, |
|
"grad_norm": 33.87021255493164, |
|
"learning_rate": 3.327183271832718e-05, |
|
"loss": 1.1114, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 28.42962074279785, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 1.0833, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.4007380073800738, |
|
"grad_norm": 30.21544075012207, |
|
"learning_rate": 3.339483394833948e-05, |
|
"loss": 1.1128, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.4014760147601476, |
|
"grad_norm": 29.623260498046875, |
|
"learning_rate": 3.3456334563345635e-05, |
|
"loss": 1.0984, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.4022140221402214, |
|
"grad_norm": 34.08790588378906, |
|
"learning_rate": 3.351783517835179e-05, |
|
"loss": 1.1091, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.4029520295202952, |
|
"grad_norm": 33.139915466308594, |
|
"learning_rate": 3.3579335793357934e-05, |
|
"loss": 1.0295, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.40369003690036903, |
|
"grad_norm": 35.7862663269043, |
|
"learning_rate": 3.364083640836409e-05, |
|
"loss": 1.1139, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.4044280442804428, |
|
"grad_norm": 28.253767013549805, |
|
"learning_rate": 3.3702337023370234e-05, |
|
"loss": 1.0673, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.4051660516605166, |
|
"grad_norm": 32.525115966796875, |
|
"learning_rate": 3.376383763837638e-05, |
|
"loss": 1.0096, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.4059040590405904, |
|
"grad_norm": 27.90035057067871, |
|
"learning_rate": 3.382533825338254e-05, |
|
"loss": 0.9391, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.4066420664206642, |
|
"grad_norm": 30.637134552001953, |
|
"learning_rate": 3.388683886838869e-05, |
|
"loss": 1.187, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.407380073800738, |
|
"grad_norm": 29.55883026123047, |
|
"learning_rate": 3.3948339483394833e-05, |
|
"loss": 1.0611, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.40811808118081183, |
|
"grad_norm": 30.938365936279297, |
|
"learning_rate": 3.400984009840099e-05, |
|
"loss": 1.0229, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.4088560885608856, |
|
"grad_norm": 29.01971435546875, |
|
"learning_rate": 3.407134071340713e-05, |
|
"loss": 1.1979, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.4095940959409594, |
|
"grad_norm": 28.88690185546875, |
|
"learning_rate": 3.4132841328413286e-05, |
|
"loss": 0.9652, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.4103321033210332, |
|
"grad_norm": 30.13008689880371, |
|
"learning_rate": 3.419434194341943e-05, |
|
"loss": 1.023, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.411070110701107, |
|
"grad_norm": 30.277244567871094, |
|
"learning_rate": 3.4255842558425586e-05, |
|
"loss": 1.0419, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.4118081180811808, |
|
"grad_norm": 31.770061492919922, |
|
"learning_rate": 3.431734317343174e-05, |
|
"loss": 1.1385, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.41254612546125463, |
|
"grad_norm": 28.85527992248535, |
|
"learning_rate": 3.4378843788437886e-05, |
|
"loss": 1.0576, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.4132841328413284, |
|
"grad_norm": 27.674936294555664, |
|
"learning_rate": 3.444034440344404e-05, |
|
"loss": 0.9695, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.4140221402214022, |
|
"grad_norm": 30.44672203063965, |
|
"learning_rate": 3.4501845018450186e-05, |
|
"loss": 1.1668, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.414760147601476, |
|
"grad_norm": 26.084020614624023, |
|
"learning_rate": 3.456334563345633e-05, |
|
"loss": 1.0662, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.4154981549815498, |
|
"grad_norm": 29.204233169555664, |
|
"learning_rate": 3.4624846248462485e-05, |
|
"loss": 1.0049, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.41623616236162364, |
|
"grad_norm": 31.064088821411133, |
|
"learning_rate": 3.468634686346864e-05, |
|
"loss": 1.0733, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.41697416974169743, |
|
"grad_norm": 28.714794158935547, |
|
"learning_rate": 3.4747847478474785e-05, |
|
"loss": 1.1252, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.4177121771217712, |
|
"grad_norm": 36.692623138427734, |
|
"learning_rate": 3.480934809348094e-05, |
|
"loss": 1.1517, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.418450184501845, |
|
"grad_norm": 29.342973709106445, |
|
"learning_rate": 3.4870848708487085e-05, |
|
"loss": 1.1617, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.4191881918819188, |
|
"grad_norm": 30.187889099121094, |
|
"learning_rate": 3.493234932349324e-05, |
|
"loss": 1.1766, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.4199261992619926, |
|
"grad_norm": 27.71148681640625, |
|
"learning_rate": 3.4993849938499384e-05, |
|
"loss": 1.1307, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.42066420664206644, |
|
"grad_norm": 26.817026138305664, |
|
"learning_rate": 3.505535055350554e-05, |
|
"loss": 1.1422, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.42140221402214023, |
|
"grad_norm": 29.25654411315918, |
|
"learning_rate": 3.511685116851169e-05, |
|
"loss": 1.0934, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.422140221402214, |
|
"grad_norm": 28.460424423217773, |
|
"learning_rate": 3.517835178351784e-05, |
|
"loss": 1.1308, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.4228782287822878, |
|
"grad_norm": 27.779157638549805, |
|
"learning_rate": 3.5239852398523984e-05, |
|
"loss": 1.0648, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.4236162361623616, |
|
"grad_norm": 32.28572082519531, |
|
"learning_rate": 3.530135301353014e-05, |
|
"loss": 1.0195, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.42435424354243545, |
|
"grad_norm": 30.577444076538086, |
|
"learning_rate": 3.5362853628536283e-05, |
|
"loss": 1.0581, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.42509225092250924, |
|
"grad_norm": 27.929576873779297, |
|
"learning_rate": 3.542435424354244e-05, |
|
"loss": 1.0838, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.42583025830258303, |
|
"grad_norm": 30.955745697021484, |
|
"learning_rate": 3.548585485854859e-05, |
|
"loss": 1.0065, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.4265682656826568, |
|
"grad_norm": 30.847639083862305, |
|
"learning_rate": 3.5547355473554736e-05, |
|
"loss": 1.0464, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.4273062730627306, |
|
"grad_norm": 26.83955192565918, |
|
"learning_rate": 3.560885608856089e-05, |
|
"loss": 1.0382, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.4280442804428044, |
|
"grad_norm": 28.2490177154541, |
|
"learning_rate": 3.5670356703567036e-05, |
|
"loss": 0.9712, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.42878228782287825, |
|
"grad_norm": 28.63175392150879, |
|
"learning_rate": 3.573185731857319e-05, |
|
"loss": 0.9944, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.42952029520295204, |
|
"grad_norm": 27.138669967651367, |
|
"learning_rate": 3.5793357933579336e-05, |
|
"loss": 1.1288, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.43025830258302583, |
|
"grad_norm": 28.75208282470703, |
|
"learning_rate": 3.585485854858548e-05, |
|
"loss": 1.0389, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.4309963099630996, |
|
"grad_norm": 29.765209197998047, |
|
"learning_rate": 3.591635916359164e-05, |
|
"loss": 1.0375, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.4317343173431734, |
|
"grad_norm": 31.77211570739746, |
|
"learning_rate": 3.597785977859779e-05, |
|
"loss": 1.1282, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.43247232472324726, |
|
"grad_norm": 28.593671798706055, |
|
"learning_rate": 3.6039360393603935e-05, |
|
"loss": 1.0487, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.43321033210332105, |
|
"grad_norm": 28.624773025512695, |
|
"learning_rate": 3.610086100861009e-05, |
|
"loss": 1.0686, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.43394833948339484, |
|
"grad_norm": 27.676698684692383, |
|
"learning_rate": 3.6162361623616235e-05, |
|
"loss": 1.1286, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.43468634686346863, |
|
"grad_norm": 28.334789276123047, |
|
"learning_rate": 3.622386223862239e-05, |
|
"loss": 1.0686, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.4354243542435424, |
|
"grad_norm": 24.738544464111328, |
|
"learning_rate": 3.628536285362854e-05, |
|
"loss": 0.8636, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.4361623616236162, |
|
"grad_norm": 29.112049102783203, |
|
"learning_rate": 3.634686346863469e-05, |
|
"loss": 0.9369, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.43690036900369006, |
|
"grad_norm": 29.67219352722168, |
|
"learning_rate": 3.640836408364084e-05, |
|
"loss": 1.0318, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.43763837638376385, |
|
"grad_norm": 32.45582580566406, |
|
"learning_rate": 3.646986469864699e-05, |
|
"loss": 1.0077, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.43837638376383764, |
|
"grad_norm": 30.126052856445312, |
|
"learning_rate": 3.653136531365314e-05, |
|
"loss": 1.1069, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.43911439114391143, |
|
"grad_norm": 30.43257713317871, |
|
"learning_rate": 3.659286592865929e-05, |
|
"loss": 1.0041, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.4398523985239852, |
|
"grad_norm": 28.884113311767578, |
|
"learning_rate": 3.6654366543665434e-05, |
|
"loss": 1.0136, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.44059040590405907, |
|
"grad_norm": 28.1043758392334, |
|
"learning_rate": 3.6715867158671594e-05, |
|
"loss": 1.0784, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.44132841328413286, |
|
"grad_norm": 29.222322463989258, |
|
"learning_rate": 3.677736777367774e-05, |
|
"loss": 0.994, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.44206642066420665, |
|
"grad_norm": 31.78004264831543, |
|
"learning_rate": 3.683886838868389e-05, |
|
"loss": 0.9989, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.44280442804428044, |
|
"grad_norm": 26.486068725585938, |
|
"learning_rate": 3.690036900369004e-05, |
|
"loss": 1.0936, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.44280442804428044, |
|
"eval_loss": 1.3267521858215332, |
|
"eval_runtime": 309.179, |
|
"eval_samples_per_second": 3.716, |
|
"eval_steps_per_second": 0.31, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.44354243542435423, |
|
"grad_norm": 28.453187942504883, |
|
"learning_rate": 3.6961869618696186e-05, |
|
"loss": 1.0252, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.444280442804428, |
|
"grad_norm": 30.434410095214844, |
|
"learning_rate": 3.702337023370234e-05, |
|
"loss": 1.1192, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.44501845018450187, |
|
"grad_norm": 28.11585807800293, |
|
"learning_rate": 3.7084870848708486e-05, |
|
"loss": 0.9912, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.44575645756457566, |
|
"grad_norm": 32.852027893066406, |
|
"learning_rate": 3.714637146371464e-05, |
|
"loss": 0.9976, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.44649446494464945, |
|
"grad_norm": 26.785593032836914, |
|
"learning_rate": 3.720787207872079e-05, |
|
"loss": 1.0799, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.44723247232472324, |
|
"grad_norm": 28.873849868774414, |
|
"learning_rate": 3.726937269372694e-05, |
|
"loss": 1.0319, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.44797047970479703, |
|
"grad_norm": 31.951059341430664, |
|
"learning_rate": 3.7330873308733085e-05, |
|
"loss": 1.0665, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.4487084870848708, |
|
"grad_norm": 26.902822494506836, |
|
"learning_rate": 3.739237392373924e-05, |
|
"loss": 1.0973, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.44944649446494467, |
|
"grad_norm": 31.43962287902832, |
|
"learning_rate": 3.7453874538745385e-05, |
|
"loss": 0.997, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.45018450184501846, |
|
"grad_norm": 28.310514450073242, |
|
"learning_rate": 3.7515375153751545e-05, |
|
"loss": 1.1017, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.45092250922509225, |
|
"grad_norm": 26.364179611206055, |
|
"learning_rate": 3.757687576875769e-05, |
|
"loss": 1.087, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.45166051660516604, |
|
"grad_norm": 26.653833389282227, |
|
"learning_rate": 3.763837638376384e-05, |
|
"loss": 1.0061, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.45239852398523983, |
|
"grad_norm": 30.07135581970215, |
|
"learning_rate": 3.769987699876999e-05, |
|
"loss": 1.0957, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.4531365313653137, |
|
"grad_norm": 27.822776794433594, |
|
"learning_rate": 3.776137761377614e-05, |
|
"loss": 1.0992, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.45387453874538747, |
|
"grad_norm": 31.2148494720459, |
|
"learning_rate": 3.782287822878229e-05, |
|
"loss": 0.9902, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.45461254612546126, |
|
"grad_norm": 34.85270309448242, |
|
"learning_rate": 3.788437884378844e-05, |
|
"loss": 1.1163, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.45535055350553505, |
|
"grad_norm": 27.64411735534668, |
|
"learning_rate": 3.794587945879459e-05, |
|
"loss": 1.1273, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.45608856088560884, |
|
"grad_norm": 28.515451431274414, |
|
"learning_rate": 3.8007380073800744e-05, |
|
"loss": 1.0642, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.45682656826568263, |
|
"grad_norm": 34.522491455078125, |
|
"learning_rate": 3.806888068880689e-05, |
|
"loss": 0.9994, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.4575645756457565, |
|
"grad_norm": 30.255014419555664, |
|
"learning_rate": 3.813038130381304e-05, |
|
"loss": 1.091, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.45830258302583027, |
|
"grad_norm": 30.578969955444336, |
|
"learning_rate": 3.819188191881919e-05, |
|
"loss": 1.1066, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.45904059040590406, |
|
"grad_norm": 27.243410110473633, |
|
"learning_rate": 3.825338253382534e-05, |
|
"loss": 1.1007, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.45977859778597785, |
|
"grad_norm": 29.49376678466797, |
|
"learning_rate": 3.831488314883149e-05, |
|
"loss": 1.1645, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.46051660516605164, |
|
"grad_norm": 30.315433502197266, |
|
"learning_rate": 3.837638376383764e-05, |
|
"loss": 1.0911, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.4612546125461255, |
|
"grad_norm": 31.19307518005371, |
|
"learning_rate": 3.843788437884379e-05, |
|
"loss": 1.1693, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.4619926199261993, |
|
"grad_norm": 27.844942092895508, |
|
"learning_rate": 3.849938499384994e-05, |
|
"loss": 1.0592, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.46273062730627307, |
|
"grad_norm": 29.83812141418457, |
|
"learning_rate": 3.856088560885609e-05, |
|
"loss": 1.0263, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.46346863468634686, |
|
"grad_norm": 27.992292404174805, |
|
"learning_rate": 3.862238622386224e-05, |
|
"loss": 1.0808, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.46420664206642065, |
|
"grad_norm": 27.693565368652344, |
|
"learning_rate": 3.868388683886839e-05, |
|
"loss": 0.9699, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.46494464944649444, |
|
"grad_norm": 29.0965633392334, |
|
"learning_rate": 3.874538745387454e-05, |
|
"loss": 1.1051, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.4656826568265683, |
|
"grad_norm": 29.10242462158203, |
|
"learning_rate": 3.8806888068880695e-05, |
|
"loss": 1.0039, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.4664206642066421, |
|
"grad_norm": 32.43134307861328, |
|
"learning_rate": 3.886838868388684e-05, |
|
"loss": 1.064, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.46715867158671587, |
|
"grad_norm": 29.64716148376465, |
|
"learning_rate": 3.892988929889299e-05, |
|
"loss": 1.0935, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.46789667896678966, |
|
"grad_norm": 29.36592674255371, |
|
"learning_rate": 3.899138991389914e-05, |
|
"loss": 1.0937, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.46863468634686345, |
|
"grad_norm": 28.95639991760254, |
|
"learning_rate": 3.905289052890529e-05, |
|
"loss": 1.0853, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.4693726937269373, |
|
"grad_norm": 29.89202308654785, |
|
"learning_rate": 3.911439114391144e-05, |
|
"loss": 1.0811, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.4701107011070111, |
|
"grad_norm": 29.48238754272461, |
|
"learning_rate": 3.9175891758917595e-05, |
|
"loss": 1.1506, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.4708487084870849, |
|
"grad_norm": 28.27334213256836, |
|
"learning_rate": 3.923739237392374e-05, |
|
"loss": 1.1197, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.47158671586715867, |
|
"grad_norm": 28.055349349975586, |
|
"learning_rate": 3.9298892988929894e-05, |
|
"loss": 1.0456, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.47232472324723246, |
|
"grad_norm": 27.234216690063477, |
|
"learning_rate": 3.936039360393604e-05, |
|
"loss": 0.9234, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.47306273062730625, |
|
"grad_norm": 28.06637191772461, |
|
"learning_rate": 3.942189421894219e-05, |
|
"loss": 1.0727, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.4738007380073801, |
|
"grad_norm": 32.17995834350586, |
|
"learning_rate": 3.948339483394834e-05, |
|
"loss": 0.9436, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.4745387453874539, |
|
"grad_norm": 34.09589767456055, |
|
"learning_rate": 3.954489544895449e-05, |
|
"loss": 1.1217, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.4752767527675277, |
|
"grad_norm": 28.410308837890625, |
|
"learning_rate": 3.960639606396064e-05, |
|
"loss": 1.2026, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.47601476014760147, |
|
"grad_norm": 30.437602996826172, |
|
"learning_rate": 3.9667896678966793e-05, |
|
"loss": 1.197, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.47675276752767526, |
|
"grad_norm": 25.85258674621582, |
|
"learning_rate": 3.972939729397294e-05, |
|
"loss": 0.9193, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.4774907749077491, |
|
"grad_norm": 30.591075897216797, |
|
"learning_rate": 3.979089790897909e-05, |
|
"loss": 1.0067, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.4782287822878229, |
|
"grad_norm": 35.56831741333008, |
|
"learning_rate": 3.985239852398524e-05, |
|
"loss": 1.0091, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.4789667896678967, |
|
"grad_norm": 28.925878524780273, |
|
"learning_rate": 3.991389913899139e-05, |
|
"loss": 1.0451, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.4797047970479705, |
|
"grad_norm": 26.45174789428711, |
|
"learning_rate": 3.9975399753997546e-05, |
|
"loss": 1.1113, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.48044280442804427, |
|
"grad_norm": 32.575260162353516, |
|
"learning_rate": 4.003690036900369e-05, |
|
"loss": 1.1129, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.48118081180811806, |
|
"grad_norm": 31.939918518066406, |
|
"learning_rate": 4.0098400984009846e-05, |
|
"loss": 1.0175, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.4819188191881919, |
|
"grad_norm": 72.9084701538086, |
|
"learning_rate": 4.015990159901599e-05, |
|
"loss": 0.966, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.4826568265682657, |
|
"grad_norm": 32.10757827758789, |
|
"learning_rate": 4.022140221402214e-05, |
|
"loss": 1.124, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.4833948339483395, |
|
"grad_norm": 35.528778076171875, |
|
"learning_rate": 4.028290282902829e-05, |
|
"loss": 1.133, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.4841328413284133, |
|
"grad_norm": 29.31783676147461, |
|
"learning_rate": 4.034440344403444e-05, |
|
"loss": 1.0159, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.48487084870848707, |
|
"grad_norm": 28.90894889831543, |
|
"learning_rate": 4.040590405904059e-05, |
|
"loss": 1.0186, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.48560885608856086, |
|
"grad_norm": 27.08976173400879, |
|
"learning_rate": 4.0467404674046745e-05, |
|
"loss": 0.8565, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.4863468634686347, |
|
"grad_norm": 32.08723831176758, |
|
"learning_rate": 4.052890528905289e-05, |
|
"loss": 0.9848, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.4870848708487085, |
|
"grad_norm": 31.9980525970459, |
|
"learning_rate": 4.0590405904059045e-05, |
|
"loss": 1.2406, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.4878228782287823, |
|
"grad_norm": 27.090219497680664, |
|
"learning_rate": 4.065190651906519e-05, |
|
"loss": 1.2235, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.4885608856088561, |
|
"grad_norm": 42.83357620239258, |
|
"learning_rate": 4.071340713407134e-05, |
|
"loss": 1.1278, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.48929889298892987, |
|
"grad_norm": 28.690671920776367, |
|
"learning_rate": 4.077490774907749e-05, |
|
"loss": 1.19, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.4900369003690037, |
|
"grad_norm": 32.07972717285156, |
|
"learning_rate": 4.0836408364083644e-05, |
|
"loss": 1.1053, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.4907749077490775, |
|
"grad_norm": 29.517995834350586, |
|
"learning_rate": 4.08979089790898e-05, |
|
"loss": 0.9924, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.4915129151291513, |
|
"grad_norm": 36.88546371459961, |
|
"learning_rate": 4.0959409594095944e-05, |
|
"loss": 1.097, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.4922509225092251, |
|
"grad_norm": 27.41716957092285, |
|
"learning_rate": 4.102091020910209e-05, |
|
"loss": 1.1743, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.4929889298892989, |
|
"grad_norm": 36.04215621948242, |
|
"learning_rate": 4.108241082410824e-05, |
|
"loss": 1.0724, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.49372693726937267, |
|
"grad_norm": 31.058218002319336, |
|
"learning_rate": 4.114391143911439e-05, |
|
"loss": 1.2224, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.4944649446494465, |
|
"grad_norm": 30.21110725402832, |
|
"learning_rate": 4.120541205412054e-05, |
|
"loss": 1.0559, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.4952029520295203, |
|
"grad_norm": 317.8634033203125, |
|
"learning_rate": 4.1266912669126696e-05, |
|
"loss": 1.1668, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.4959409594095941, |
|
"grad_norm": 30.09259605407715, |
|
"learning_rate": 4.132841328413284e-05, |
|
"loss": 1.107, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.4966789667896679, |
|
"grad_norm": 30.432334899902344, |
|
"learning_rate": 4.1389913899138996e-05, |
|
"loss": 1.0314, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.4974169741697417, |
|
"grad_norm": 29.147876739501953, |
|
"learning_rate": 4.145141451414514e-05, |
|
"loss": 1.1293, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.4981549815498155, |
|
"grad_norm": 28.299036026000977, |
|
"learning_rate": 4.151291512915129e-05, |
|
"loss": 0.9126, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.4988929889298893, |
|
"grad_norm": 27.47956085205078, |
|
"learning_rate": 4.157441574415744e-05, |
|
"loss": 1.1542, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.4996309963099631, |
|
"grad_norm": 31.645191192626953, |
|
"learning_rate": 4.1635916359163595e-05, |
|
"loss": 1.0069, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.5003690036900369, |
|
"grad_norm": 28.30335235595703, |
|
"learning_rate": 4.169741697416974e-05, |
|
"loss": 1.1189, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.5011070110701107, |
|
"grad_norm": 28.922136306762695, |
|
"learning_rate": 4.1758917589175895e-05, |
|
"loss": 1.0529, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.5018450184501845, |
|
"grad_norm": 29.070533752441406, |
|
"learning_rate": 4.182041820418204e-05, |
|
"loss": 1.0434, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.5025830258302583, |
|
"grad_norm": 34.41718292236328, |
|
"learning_rate": 4.1881918819188195e-05, |
|
"loss": 1.0136, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.5033210332103321, |
|
"grad_norm": 30.644197463989258, |
|
"learning_rate": 4.194341943419434e-05, |
|
"loss": 1.2139, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.5040590405904058, |
|
"grad_norm": 31.38071060180664, |
|
"learning_rate": 4.2004920049200495e-05, |
|
"loss": 1.0473, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.5047970479704798, |
|
"grad_norm": 28.35428237915039, |
|
"learning_rate": 4.206642066420665e-05, |
|
"loss": 1.1185, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.5055350553505535, |
|
"grad_norm": 30.84862518310547, |
|
"learning_rate": 4.2127921279212794e-05, |
|
"loss": 1.0605, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.5062730627306273, |
|
"grad_norm": 28.12001609802246, |
|
"learning_rate": 4.218942189421895e-05, |
|
"loss": 1.0421, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.5070110701107011, |
|
"grad_norm": 94.46589660644531, |
|
"learning_rate": 4.2250922509225094e-05, |
|
"loss": 1.1904, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.5077490774907749, |
|
"grad_norm": 28.075532913208008, |
|
"learning_rate": 4.231242312423124e-05, |
|
"loss": 0.9612, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.5084870848708487, |
|
"grad_norm": 33.0609245300293, |
|
"learning_rate": 4.2373923739237394e-05, |
|
"loss": 1.1283, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.5092250922509225, |
|
"grad_norm": 31.729276657104492, |
|
"learning_rate": 4.243542435424355e-05, |
|
"loss": 1.1253, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.5099630996309963, |
|
"grad_norm": 29.362197875976562, |
|
"learning_rate": 4.249692496924969e-05, |
|
"loss": 1.0577, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.5107011070110701, |
|
"grad_norm": 27.433551788330078, |
|
"learning_rate": 4.2558425584255847e-05, |
|
"loss": 0.9175, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.5114391143911439, |
|
"grad_norm": 28.477914810180664, |
|
"learning_rate": 4.261992619926199e-05, |
|
"loss": 1.1097, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.5121771217712177, |
|
"grad_norm": 26.180309295654297, |
|
"learning_rate": 4.2681426814268146e-05, |
|
"loss": 0.9585, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.5129151291512916, |
|
"grad_norm": 28.950037002563477, |
|
"learning_rate": 4.274292742927429e-05, |
|
"loss": 0.9945, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.5136531365313654, |
|
"grad_norm": 33.97092819213867, |
|
"learning_rate": 4.280442804428044e-05, |
|
"loss": 1.1573, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.5143911439114391, |
|
"grad_norm": 31.86573600769043, |
|
"learning_rate": 4.28659286592866e-05, |
|
"loss": 1.1061, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.5151291512915129, |
|
"grad_norm": 35.693443298339844, |
|
"learning_rate": 4.2927429274292746e-05, |
|
"loss": 1.1284, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.5158671586715867, |
|
"grad_norm": 29.409988403320312, |
|
"learning_rate": 4.29889298892989e-05, |
|
"loss": 1.2092, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.5166051660516605, |
|
"grad_norm": 28.83966636657715, |
|
"learning_rate": 4.3050430504305045e-05, |
|
"loss": 0.9437, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5166051660516605, |
|
"eval_loss": 1.3298412561416626, |
|
"eval_runtime": 307.823, |
|
"eval_samples_per_second": 3.733, |
|
"eval_steps_per_second": 0.312, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5173431734317343, |
|
"grad_norm": 26.844846725463867, |
|
"learning_rate": 4.311193111931119e-05, |
|
"loss": 1.112, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.5180811808118081, |
|
"grad_norm": 28.658428192138672, |
|
"learning_rate": 4.3173431734317345e-05, |
|
"loss": 0.9918, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.5188191881918819, |
|
"grad_norm": 32.5452995300293, |
|
"learning_rate": 4.323493234932349e-05, |
|
"loss": 1.0447, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.5195571955719557, |
|
"grad_norm": 26.970304489135742, |
|
"learning_rate": 4.3296432964329645e-05, |
|
"loss": 1.1313, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.5202952029520295, |
|
"grad_norm": 28.920679092407227, |
|
"learning_rate": 4.33579335793358e-05, |
|
"loss": 1.0252, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.5210332103321034, |
|
"grad_norm": 40.62504959106445, |
|
"learning_rate": 4.3419434194341945e-05, |
|
"loss": 1.1523, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.5217712177121772, |
|
"grad_norm": 30.851390838623047, |
|
"learning_rate": 4.34809348093481e-05, |
|
"loss": 0.9797, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.522509225092251, |
|
"grad_norm": 27.900365829467773, |
|
"learning_rate": 4.3542435424354244e-05, |
|
"loss": 1.0221, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.5232472324723247, |
|
"grad_norm": 26.0831356048584, |
|
"learning_rate": 4.360393603936039e-05, |
|
"loss": 1.0887, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.5239852398523985, |
|
"grad_norm": 29.75108528137207, |
|
"learning_rate": 4.366543665436655e-05, |
|
"loss": 0.9471, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.5247232472324723, |
|
"grad_norm": 31.546483993530273, |
|
"learning_rate": 4.37269372693727e-05, |
|
"loss": 1.1382, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.5254612546125461, |
|
"grad_norm": 27.857818603515625, |
|
"learning_rate": 4.3788437884378844e-05, |
|
"loss": 1.1366, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.5261992619926199, |
|
"grad_norm": 26.583192825317383, |
|
"learning_rate": 4.3849938499385e-05, |
|
"loss": 1.0916, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.5269372693726937, |
|
"grad_norm": 30.150146484375, |
|
"learning_rate": 4.391143911439114e-05, |
|
"loss": 1.0575, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.5276752767527675, |
|
"grad_norm": 27.24560546875, |
|
"learning_rate": 4.3972939729397297e-05, |
|
"loss": 1.0683, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.5284132841328413, |
|
"grad_norm": 29.45226287841797, |
|
"learning_rate": 4.403444034440344e-05, |
|
"loss": 1.1279, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.5291512915129152, |
|
"grad_norm": 28.790172576904297, |
|
"learning_rate": 4.4095940959409596e-05, |
|
"loss": 1.1934, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.529889298892989, |
|
"grad_norm": 42.536705017089844, |
|
"learning_rate": 4.415744157441575e-05, |
|
"loss": 1.061, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.5306273062730628, |
|
"grad_norm": 28.66362953186035, |
|
"learning_rate": 4.4218942189421896e-05, |
|
"loss": 1.0786, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.5313653136531366, |
|
"grad_norm": 25.908044815063477, |
|
"learning_rate": 4.428044280442805e-05, |
|
"loss": 1.0297, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.5321033210332103, |
|
"grad_norm": 28.063125610351562, |
|
"learning_rate": 4.4341943419434196e-05, |
|
"loss": 0.9617, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.5328413284132841, |
|
"grad_norm": 27.69817352294922, |
|
"learning_rate": 4.440344403444034e-05, |
|
"loss": 1.1288, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.5335793357933579, |
|
"grad_norm": 30.366674423217773, |
|
"learning_rate": 4.4464944649446495e-05, |
|
"loss": 1.1191, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.5343173431734317, |
|
"grad_norm": 30.783306121826172, |
|
"learning_rate": 4.452644526445265e-05, |
|
"loss": 1.1135, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.5350553505535055, |
|
"grad_norm": 28.302270889282227, |
|
"learning_rate": 4.4587945879458795e-05, |
|
"loss": 1.1435, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.5357933579335793, |
|
"grad_norm": 28.51706314086914, |
|
"learning_rate": 4.464944649446495e-05, |
|
"loss": 1.1788, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.5365313653136531, |
|
"grad_norm": 31.32042121887207, |
|
"learning_rate": 4.4710947109471095e-05, |
|
"loss": 1.1667, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.537269372693727, |
|
"grad_norm": 28.812145233154297, |
|
"learning_rate": 4.477244772447725e-05, |
|
"loss": 1.0405, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.5380073800738008, |
|
"grad_norm": 26.23000717163086, |
|
"learning_rate": 4.4833948339483395e-05, |
|
"loss": 1.0401, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.5387453874538746, |
|
"grad_norm": 81.6714859008789, |
|
"learning_rate": 4.489544895448955e-05, |
|
"loss": 1.1202, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.5394833948339484, |
|
"grad_norm": 27.881044387817383, |
|
"learning_rate": 4.49569495694957e-05, |
|
"loss": 1.0516, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.5402214022140222, |
|
"grad_norm": 29.472396850585938, |
|
"learning_rate": 4.501845018450185e-05, |
|
"loss": 1.1779, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.5409594095940959, |
|
"grad_norm": 28.200910568237305, |
|
"learning_rate": 4.5079950799507994e-05, |
|
"loss": 1.018, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.5416974169741697, |
|
"grad_norm": 28.53663444519043, |
|
"learning_rate": 4.514145141451415e-05, |
|
"loss": 1.1831, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.5424354243542435, |
|
"grad_norm": 36.12836837768555, |
|
"learning_rate": 4.5202952029520294e-05, |
|
"loss": 1.1069, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.5431734317343173, |
|
"grad_norm": 29.165285110473633, |
|
"learning_rate": 4.526445264452645e-05, |
|
"loss": 0.9832, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.5439114391143911, |
|
"grad_norm": 27.385562896728516, |
|
"learning_rate": 4.53259532595326e-05, |
|
"loss": 1.0543, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.5446494464944649, |
|
"grad_norm": 32.897945404052734, |
|
"learning_rate": 4.5387453874538747e-05, |
|
"loss": 1.1396, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.5453874538745388, |
|
"grad_norm": 29.424503326416016, |
|
"learning_rate": 4.54489544895449e-05, |
|
"loss": 1.0489, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.5461254612546126, |
|
"grad_norm": 31.19598960876465, |
|
"learning_rate": 4.5510455104551046e-05, |
|
"loss": 1.2388, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.5468634686346864, |
|
"grad_norm": 28.53763198852539, |
|
"learning_rate": 4.55719557195572e-05, |
|
"loss": 1.0937, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.5476014760147602, |
|
"grad_norm": 29.64959716796875, |
|
"learning_rate": 4.5633456334563346e-05, |
|
"loss": 1.0849, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.548339483394834, |
|
"grad_norm": 27.357303619384766, |
|
"learning_rate": 4.569495694956949e-05, |
|
"loss": 1.1383, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.5490774907749078, |
|
"grad_norm": 27.413957595825195, |
|
"learning_rate": 4.575645756457565e-05, |
|
"loss": 1.079, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.5498154981549815, |
|
"grad_norm": 29.784135818481445, |
|
"learning_rate": 4.58179581795818e-05, |
|
"loss": 0.9767, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.5505535055350553, |
|
"grad_norm": 55.847591400146484, |
|
"learning_rate": 4.5879458794587945e-05, |
|
"loss": 1.0553, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.5512915129151291, |
|
"grad_norm": 320.68597412109375, |
|
"learning_rate": 4.59409594095941e-05, |
|
"loss": 1.1933, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.5520295202952029, |
|
"grad_norm": 26.938758850097656, |
|
"learning_rate": 4.6002460024600245e-05, |
|
"loss": 1.0217, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.5527675276752767, |
|
"grad_norm": 32.755672454833984, |
|
"learning_rate": 4.60639606396064e-05, |
|
"loss": 1.099, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.5535055350553506, |
|
"grad_norm": 30.825178146362305, |
|
"learning_rate": 4.612546125461255e-05, |
|
"loss": 1.1831, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.5542435424354244, |
|
"grad_norm": 26.865983963012695, |
|
"learning_rate": 4.61869618696187e-05, |
|
"loss": 1.172, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.5549815498154982, |
|
"grad_norm": 27.207359313964844, |
|
"learning_rate": 4.624846248462485e-05, |
|
"loss": 1.1431, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.555719557195572, |
|
"grad_norm": 31.474943161010742, |
|
"learning_rate": 4.6309963099631e-05, |
|
"loss": 1.0282, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.5564575645756458, |
|
"grad_norm": 31.235960006713867, |
|
"learning_rate": 4.637146371463715e-05, |
|
"loss": 1.0688, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.5571955719557196, |
|
"grad_norm": 26.043094635009766, |
|
"learning_rate": 4.64329643296433e-05, |
|
"loss": 1.0858, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.5579335793357934, |
|
"grad_norm": 28.13475227355957, |
|
"learning_rate": 4.6494464944649444e-05, |
|
"loss": 0.9878, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.5586715867158671, |
|
"grad_norm": 28.513853073120117, |
|
"learning_rate": 4.6555965559655604e-05, |
|
"loss": 0.9452, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.5594095940959409, |
|
"grad_norm": 28.906461715698242, |
|
"learning_rate": 4.661746617466175e-05, |
|
"loss": 0.9423, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.5601476014760147, |
|
"grad_norm": 33.28678894042969, |
|
"learning_rate": 4.66789667896679e-05, |
|
"loss": 1.1832, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.5608856088560885, |
|
"grad_norm": 29.69910430908203, |
|
"learning_rate": 4.674046740467405e-05, |
|
"loss": 1.0487, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.5616236162361624, |
|
"grad_norm": 81.67484283447266, |
|
"learning_rate": 4.6801968019680197e-05, |
|
"loss": 1.0789, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.5623616236162362, |
|
"grad_norm": 32.282474517822266, |
|
"learning_rate": 4.686346863468635e-05, |
|
"loss": 1.0681, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.56309963099631, |
|
"grad_norm": 28.49372673034668, |
|
"learning_rate": 4.6924969249692496e-05, |
|
"loss": 1.2814, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.5638376383763838, |
|
"grad_norm": 33.509033203125, |
|
"learning_rate": 4.698646986469865e-05, |
|
"loss": 1.056, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.5645756457564576, |
|
"grad_norm": 31.451663970947266, |
|
"learning_rate": 4.70479704797048e-05, |
|
"loss": 1.1701, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.5653136531365314, |
|
"grad_norm": 28.21207618713379, |
|
"learning_rate": 4.710947109471095e-05, |
|
"loss": 1.1759, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.5660516605166052, |
|
"grad_norm": 25.11651611328125, |
|
"learning_rate": 4.7170971709717096e-05, |
|
"loss": 1.0304, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.566789667896679, |
|
"grad_norm": 26.841819763183594, |
|
"learning_rate": 4.723247232472325e-05, |
|
"loss": 1.035, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.5675276752767527, |
|
"grad_norm": 26.381568908691406, |
|
"learning_rate": 4.7293972939729395e-05, |
|
"loss": 1.1387, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.5682656826568265, |
|
"grad_norm": 29.644023895263672, |
|
"learning_rate": 4.7355473554735555e-05, |
|
"loss": 1.0994, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.5690036900369003, |
|
"grad_norm": 31.37369728088379, |
|
"learning_rate": 4.74169741697417e-05, |
|
"loss": 1.124, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.5697416974169742, |
|
"grad_norm": 29.403026580810547, |
|
"learning_rate": 4.747847478474785e-05, |
|
"loss": 1.0507, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.570479704797048, |
|
"grad_norm": 28.384349822998047, |
|
"learning_rate": 4.7539975399754e-05, |
|
"loss": 1.0764, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.5712177121771218, |
|
"grad_norm": 67.28231811523438, |
|
"learning_rate": 4.760147601476015e-05, |
|
"loss": 1.0503, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.5719557195571956, |
|
"grad_norm": 29.146886825561523, |
|
"learning_rate": 4.76629766297663e-05, |
|
"loss": 1.0136, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.5726937269372694, |
|
"grad_norm": 60.2903938293457, |
|
"learning_rate": 4.772447724477245e-05, |
|
"loss": 1.0508, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.5734317343173432, |
|
"grad_norm": 28.743024826049805, |
|
"learning_rate": 4.77859778597786e-05, |
|
"loss": 1.0318, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.574169741697417, |
|
"grad_norm": 30.6608943939209, |
|
"learning_rate": 4.7847478474784754e-05, |
|
"loss": 1.09, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.5749077490774908, |
|
"grad_norm": 56.827152252197266, |
|
"learning_rate": 4.79089790897909e-05, |
|
"loss": 0.9897, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.5756457564575646, |
|
"grad_norm": 32.71049499511719, |
|
"learning_rate": 4.797047970479705e-05, |
|
"loss": 1.1841, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.5763837638376383, |
|
"grad_norm": 29.06208038330078, |
|
"learning_rate": 4.80319803198032e-05, |
|
"loss": 1.2583, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.5771217712177121, |
|
"grad_norm": 26.83561897277832, |
|
"learning_rate": 4.809348093480935e-05, |
|
"loss": 1.07, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.5778597785977859, |
|
"grad_norm": 28.882770538330078, |
|
"learning_rate": 4.81549815498155e-05, |
|
"loss": 1.1521, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.5785977859778598, |
|
"grad_norm": 90.4433822631836, |
|
"learning_rate": 4.821648216482165e-05, |
|
"loss": 1.0745, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.5793357933579336, |
|
"grad_norm": 30.003938674926758, |
|
"learning_rate": 4.82779827798278e-05, |
|
"loss": 1.1911, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.5800738007380074, |
|
"grad_norm": 63.82630920410156, |
|
"learning_rate": 4.833948339483395e-05, |
|
"loss": 1.1678, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.5808118081180812, |
|
"grad_norm": 238.10055541992188, |
|
"learning_rate": 4.84009840098401e-05, |
|
"loss": 1.3465, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.581549815498155, |
|
"grad_norm": 311.88134765625, |
|
"learning_rate": 4.846248462484625e-05, |
|
"loss": 2.3681, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.5822878228782288, |
|
"grad_norm": 79.10831451416016, |
|
"learning_rate": 4.85239852398524e-05, |
|
"loss": 1.3595, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.5830258302583026, |
|
"grad_norm": 195.71676635742188, |
|
"learning_rate": 4.858548585485855e-05, |
|
"loss": 1.8711, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.5837638376383764, |
|
"grad_norm": 223.3916015625, |
|
"learning_rate": 4.8646986469864706e-05, |
|
"loss": 2.4052, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.5845018450184502, |
|
"grad_norm": 33.84809875488281, |
|
"learning_rate": 4.870848708487085e-05, |
|
"loss": 1.04, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.5852398523985239, |
|
"grad_norm": 134.23912048339844, |
|
"learning_rate": 4.8769987699877e-05, |
|
"loss": 2.6438, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.5859778597785977, |
|
"grad_norm": 44.83888244628906, |
|
"learning_rate": 4.883148831488315e-05, |
|
"loss": 1.4395, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.5867158671586716, |
|
"grad_norm": 299.56988525390625, |
|
"learning_rate": 4.88929889298893e-05, |
|
"loss": 1.6989, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.5874538745387454, |
|
"grad_norm": 284.4837341308594, |
|
"learning_rate": 4.895448954489545e-05, |
|
"loss": 2.2906, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.5881918819188192, |
|
"grad_norm": 53.7056884765625, |
|
"learning_rate": 4.9015990159901605e-05, |
|
"loss": 1.3129, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.588929889298893, |
|
"grad_norm": 117.8404769897461, |
|
"learning_rate": 4.907749077490775e-05, |
|
"loss": 1.916, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.5896678966789668, |
|
"grad_norm": 51.02519607543945, |
|
"learning_rate": 4.9138991389913904e-05, |
|
"loss": 2.0983, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.5904059040590406, |
|
"grad_norm": 376.12225341796875, |
|
"learning_rate": 4.920049200492005e-05, |
|
"loss": 1.5877, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5904059040590406, |
|
"eval_loss": 1.3728182315826416, |
|
"eval_runtime": 305.6963, |
|
"eval_samples_per_second": 3.759, |
|
"eval_steps_per_second": 0.314, |
|
"step": 800 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 4065, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.5604991497978511e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|