{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5904059040590406, "eval_steps": 100, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007380073800738007, "grad_norm": 931.379638671875, "learning_rate": 6.150061500615006e-08, "loss": 2.4703, "step": 1 }, { "epoch": 0.0007380073800738007, "eval_loss": 2.1151068210601807, "eval_runtime": 311.9468, "eval_samples_per_second": 3.683, "eval_steps_per_second": 0.308, "step": 1 }, { "epoch": 0.0014760147601476014, "grad_norm": 940.4727172851562, "learning_rate": 1.2300123001230013e-07, "loss": 2.6788, "step": 2 }, { "epoch": 0.002214022140221402, "grad_norm": 744.969970703125, "learning_rate": 1.845018450184502e-07, "loss": 2.4456, "step": 3 }, { "epoch": 0.002952029520295203, "grad_norm": 824.5645141601562, "learning_rate": 2.4600246002460025e-07, "loss": 2.6876, "step": 4 }, { "epoch": 0.0036900369003690036, "grad_norm": 790.6527099609375, "learning_rate": 3.075030750307503e-07, "loss": 2.4629, "step": 5 }, { "epoch": 0.004428044280442804, "grad_norm": 582.4039306640625, "learning_rate": 3.690036900369004e-07, "loss": 2.3769, "step": 6 }, { "epoch": 0.0051660516605166054, "grad_norm": 542.76513671875, "learning_rate": 4.3050430504305045e-07, "loss": 2.1601, "step": 7 }, { "epoch": 0.005904059040590406, "grad_norm": 570.0616455078125, "learning_rate": 4.920049200492005e-07, "loss": 2.1235, "step": 8 }, { "epoch": 0.006642066420664207, "grad_norm": 630.7283935546875, "learning_rate": 5.535055350553506e-07, "loss": 2.1857, "step": 9 }, { "epoch": 0.007380073800738007, "grad_norm": 397.8863220214844, "learning_rate": 6.150061500615006e-07, "loss": 1.9655, "step": 10 }, { "epoch": 0.008118081180811807, "grad_norm": 387.4375915527344, "learning_rate": 6.765067650676507e-07, "loss": 1.8162, "step": 11 }, { "epoch": 0.008856088560885609, "grad_norm": 261.5195617675781, "learning_rate": 7.380073800738008e-07, "loss": 1.8043, "step": 12 }, { "epoch": 0.00959409594095941, "grad_norm": 216.66661071777344, "learning_rate": 7.995079950799507e-07, "loss": 1.7341, "step": 13 }, { "epoch": 0.010332103321033211, "grad_norm": 200.43228149414062, "learning_rate": 8.610086100861009e-07, "loss": 1.6827, "step": 14 }, { "epoch": 0.01107011070110701, "grad_norm": 213.2593536376953, "learning_rate": 9.22509225092251e-07, "loss": 1.6452, "step": 15 }, { "epoch": 0.011808118081180811, "grad_norm": 146.7362518310547, "learning_rate": 9.84009840098401e-07, "loss": 1.6459, "step": 16 }, { "epoch": 0.012546125461254613, "grad_norm": 143.30966186523438, "learning_rate": 1.045510455104551e-06, "loss": 1.6676, "step": 17 }, { "epoch": 0.013284132841328414, "grad_norm": 177.24832153320312, "learning_rate": 1.1070110701107011e-06, "loss": 1.4307, "step": 18 }, { "epoch": 0.014022140221402213, "grad_norm": 134.13116455078125, "learning_rate": 1.1685116851168512e-06, "loss": 1.4712, "step": 19 }, { "epoch": 0.014760147601476014, "grad_norm": 107.87165069580078, "learning_rate": 1.2300123001230013e-06, "loss": 1.5757, "step": 20 }, { "epoch": 0.015498154981549815, "grad_norm": 100.48570251464844, "learning_rate": 1.2915129151291513e-06, "loss": 1.5647, "step": 21 }, { "epoch": 0.016236162361623615, "grad_norm": 96.30101776123047, "learning_rate": 1.3530135301353014e-06, "loss": 1.3431, "step": 22 }, { "epoch": 0.016974169741697416, "grad_norm": 99.80168151855469, "learning_rate": 1.4145141451414515e-06, "loss": 1.4753, "step": 23 }, { "epoch": 0.017712177121771217, "grad_norm": 86.59078216552734, "learning_rate": 1.4760147601476015e-06, "loss": 1.4402, "step": 24 }, { "epoch": 0.01845018450184502, "grad_norm": 107.12730407714844, "learning_rate": 1.5375153751537516e-06, "loss": 1.4129, "step": 25 }, { "epoch": 0.01918819188191882, "grad_norm": 86.11123657226562, "learning_rate": 1.5990159901599014e-06, "loss": 1.3141, "step": 26 }, { "epoch": 0.01992619926199262, "grad_norm": 81.71781158447266, "learning_rate": 1.6605166051660517e-06, "loss": 1.3644, "step": 27 }, { "epoch": 0.020664206642066422, "grad_norm": 81.71916961669922, "learning_rate": 1.7220172201722018e-06, "loss": 1.3631, "step": 28 }, { "epoch": 0.021402214022140223, "grad_norm": 65.515625, "learning_rate": 1.783517835178352e-06, "loss": 1.3915, "step": 29 }, { "epoch": 0.02214022140221402, "grad_norm": 82.60952758789062, "learning_rate": 1.845018450184502e-06, "loss": 1.2512, "step": 30 }, { "epoch": 0.022878228782287822, "grad_norm": 78.03673553466797, "learning_rate": 1.9065190651906518e-06, "loss": 1.4272, "step": 31 }, { "epoch": 0.023616236162361623, "grad_norm": 73.9189453125, "learning_rate": 1.968019680196802e-06, "loss": 1.3549, "step": 32 }, { "epoch": 0.024354243542435424, "grad_norm": 75.15375518798828, "learning_rate": 2.029520295202952e-06, "loss": 1.1933, "step": 33 }, { "epoch": 0.025092250922509225, "grad_norm": 68.5103530883789, "learning_rate": 2.091020910209102e-06, "loss": 1.2598, "step": 34 }, { "epoch": 0.025830258302583026, "grad_norm": 63.10990905761719, "learning_rate": 2.1525215252152524e-06, "loss": 1.2143, "step": 35 }, { "epoch": 0.026568265682656828, "grad_norm": 75.12173461914062, "learning_rate": 2.2140221402214023e-06, "loss": 1.2827, "step": 36 }, { "epoch": 0.02730627306273063, "grad_norm": 69.23287963867188, "learning_rate": 2.2755227552275526e-06, "loss": 1.4106, "step": 37 }, { "epoch": 0.028044280442804426, "grad_norm": 82.09547424316406, "learning_rate": 2.3370233702337024e-06, "loss": 1.1135, "step": 38 }, { "epoch": 0.028782287822878228, "grad_norm": 89.76222229003906, "learning_rate": 2.3985239852398527e-06, "loss": 1.3469, "step": 39 }, { "epoch": 0.02952029520295203, "grad_norm": 75.77232360839844, "learning_rate": 2.4600246002460025e-06, "loss": 1.1857, "step": 40 }, { "epoch": 0.03025830258302583, "grad_norm": 64.25336456298828, "learning_rate": 2.5215252152521524e-06, "loss": 1.2452, "step": 41 }, { "epoch": 0.03099630996309963, "grad_norm": 64.85978698730469, "learning_rate": 2.5830258302583027e-06, "loss": 1.1511, "step": 42 }, { "epoch": 0.03173431734317343, "grad_norm": 61.36198043823242, "learning_rate": 2.6445264452644525e-06, "loss": 1.1056, "step": 43 }, { "epoch": 0.03247232472324723, "grad_norm": 63.63357925415039, "learning_rate": 2.706027060270603e-06, "loss": 1.3231, "step": 44 }, { "epoch": 0.033210332103321034, "grad_norm": 60.254825592041016, "learning_rate": 2.767527675276753e-06, "loss": 1.1552, "step": 45 }, { "epoch": 0.03394833948339483, "grad_norm": 69.51408386230469, "learning_rate": 2.829028290282903e-06, "loss": 1.2972, "step": 46 }, { "epoch": 0.03468634686346864, "grad_norm": 60.74787902832031, "learning_rate": 2.890528905289053e-06, "loss": 1.2444, "step": 47 }, { "epoch": 0.035424354243542434, "grad_norm": 62.291412353515625, "learning_rate": 2.952029520295203e-06, "loss": 1.2342, "step": 48 }, { "epoch": 0.03616236162361624, "grad_norm": 67.48091125488281, "learning_rate": 3.0135301353013533e-06, "loss": 1.2894, "step": 49 }, { "epoch": 0.03690036900369004, "grad_norm": 57.86232376098633, "learning_rate": 3.075030750307503e-06, "loss": 1.1071, "step": 50 }, { "epoch": 0.037638376383763834, "grad_norm": 62.488731384277344, "learning_rate": 3.136531365313653e-06, "loss": 1.2249, "step": 51 }, { "epoch": 0.03837638376383764, "grad_norm": 56.59815979003906, "learning_rate": 3.198031980319803e-06, "loss": 1.2616, "step": 52 }, { "epoch": 0.03911439114391144, "grad_norm": 58.92403030395508, "learning_rate": 3.2595325953259536e-06, "loss": 1.2147, "step": 53 }, { "epoch": 0.03985239852398524, "grad_norm": 63.04093933105469, "learning_rate": 3.3210332103321034e-06, "loss": 1.2363, "step": 54 }, { "epoch": 0.04059040590405904, "grad_norm": 57.72414779663086, "learning_rate": 3.3825338253382537e-06, "loss": 1.1719, "step": 55 }, { "epoch": 0.041328413284132844, "grad_norm": 61.95828628540039, "learning_rate": 3.4440344403444036e-06, "loss": 1.1956, "step": 56 }, { "epoch": 0.04206642066420664, "grad_norm": 58.07041549682617, "learning_rate": 3.5055350553505534e-06, "loss": 1.1326, "step": 57 }, { "epoch": 0.042804428044280446, "grad_norm": 61.18100357055664, "learning_rate": 3.567035670356704e-06, "loss": 1.2313, "step": 58 }, { "epoch": 0.043542435424354244, "grad_norm": 58.01974868774414, "learning_rate": 3.628536285362854e-06, "loss": 1.1833, "step": 59 }, { "epoch": 0.04428044280442804, "grad_norm": 58.43510437011719, "learning_rate": 3.690036900369004e-06, "loss": 1.232, "step": 60 }, { "epoch": 0.045018450184501846, "grad_norm": 56.53025817871094, "learning_rate": 3.7515375153751537e-06, "loss": 1.202, "step": 61 }, { "epoch": 0.045756457564575644, "grad_norm": 59.623043060302734, "learning_rate": 3.8130381303813035e-06, "loss": 1.2188, "step": 62 }, { "epoch": 0.04649446494464945, "grad_norm": 54.90254211425781, "learning_rate": 3.874538745387454e-06, "loss": 1.1324, "step": 63 }, { "epoch": 0.047232472324723246, "grad_norm": 56.264732360839844, "learning_rate": 3.936039360393604e-06, "loss": 1.1797, "step": 64 }, { "epoch": 0.04797047970479705, "grad_norm": 56.26121520996094, "learning_rate": 3.997539975399754e-06, "loss": 1.1777, "step": 65 }, { "epoch": 0.04870848708487085, "grad_norm": 53.94155502319336, "learning_rate": 4.059040590405904e-06, "loss": 1.1077, "step": 66 }, { "epoch": 0.04944649446494465, "grad_norm": 56.105831146240234, "learning_rate": 4.120541205412054e-06, "loss": 1.1559, "step": 67 }, { "epoch": 0.05018450184501845, "grad_norm": 65.91514587402344, "learning_rate": 4.182041820418204e-06, "loss": 1.0554, "step": 68 }, { "epoch": 0.05092250922509225, "grad_norm": 67.19110107421875, "learning_rate": 4.243542435424354e-06, "loss": 1.2161, "step": 69 }, { "epoch": 0.05166051660516605, "grad_norm": 55.92790603637695, "learning_rate": 4.305043050430505e-06, "loss": 1.2577, "step": 70 }, { "epoch": 0.05239852398523985, "grad_norm": 61.967750549316406, "learning_rate": 4.366543665436655e-06, "loss": 1.1944, "step": 71 }, { "epoch": 0.053136531365313655, "grad_norm": 54.48695373535156, "learning_rate": 4.428044280442805e-06, "loss": 0.9314, "step": 72 }, { "epoch": 0.05387453874538745, "grad_norm": 59.03939437866211, "learning_rate": 4.489544895448955e-06, "loss": 1.2708, "step": 73 }, { "epoch": 0.05461254612546126, "grad_norm": 57.15635299682617, "learning_rate": 4.551045510455105e-06, "loss": 1.1294, "step": 74 }, { "epoch": 0.055350553505535055, "grad_norm": 57.40306091308594, "learning_rate": 4.612546125461255e-06, "loss": 1.0417, "step": 75 }, { "epoch": 0.05608856088560885, "grad_norm": 114.00467681884766, "learning_rate": 4.674046740467405e-06, "loss": 1.0973, "step": 76 }, { "epoch": 0.05682656826568266, "grad_norm": 55.897666931152344, "learning_rate": 4.735547355473555e-06, "loss": 1.024, "step": 77 }, { "epoch": 0.057564575645756455, "grad_norm": 54.696266174316406, "learning_rate": 4.797047970479705e-06, "loss": 1.0549, "step": 78 }, { "epoch": 0.05830258302583026, "grad_norm": 58.518489837646484, "learning_rate": 4.858548585485855e-06, "loss": 1.1007, "step": 79 }, { "epoch": 0.05904059040590406, "grad_norm": 55.24943923950195, "learning_rate": 4.920049200492005e-06, "loss": 1.223, "step": 80 }, { "epoch": 0.05977859778597786, "grad_norm": 55.647605895996094, "learning_rate": 4.981549815498155e-06, "loss": 1.0745, "step": 81 }, { "epoch": 0.06051660516605166, "grad_norm": 52.201297760009766, "learning_rate": 5.043050430504305e-06, "loss": 1.1459, "step": 82 }, { "epoch": 0.061254612546125464, "grad_norm": 49.60506820678711, "learning_rate": 5.1045510455104555e-06, "loss": 1.0853, "step": 83 }, { "epoch": 0.06199261992619926, "grad_norm": 53.66012191772461, "learning_rate": 5.166051660516605e-06, "loss": 0.999, "step": 84 }, { "epoch": 0.06273062730627306, "grad_norm": 58.5854606628418, "learning_rate": 5.227552275522755e-06, "loss": 1.3074, "step": 85 }, { "epoch": 0.06346863468634686, "grad_norm": 58.91031265258789, "learning_rate": 5.289052890528905e-06, "loss": 1.0567, "step": 86 }, { "epoch": 0.06420664206642067, "grad_norm": 57.29990005493164, "learning_rate": 5.350553505535055e-06, "loss": 1.1709, "step": 87 }, { "epoch": 0.06494464944649446, "grad_norm": 48.71859359741211, "learning_rate": 5.412054120541206e-06, "loss": 1.2049, "step": 88 }, { "epoch": 0.06568265682656826, "grad_norm": 50.770084381103516, "learning_rate": 5.4735547355473555e-06, "loss": 0.9872, "step": 89 }, { "epoch": 0.06642066420664207, "grad_norm": 58.15389633178711, "learning_rate": 5.535055350553506e-06, "loss": 1.0798, "step": 90 }, { "epoch": 0.06715867158671587, "grad_norm": 61.212825775146484, "learning_rate": 5.596555965559656e-06, "loss": 1.0654, "step": 91 }, { "epoch": 0.06789667896678966, "grad_norm": 56.70602798461914, "learning_rate": 5.658056580565806e-06, "loss": 1.1565, "step": 92 }, { "epoch": 0.06863468634686347, "grad_norm": 54.07913589477539, "learning_rate": 5.7195571955719566e-06, "loss": 1.1315, "step": 93 }, { "epoch": 0.06937269372693727, "grad_norm": 55.931495666503906, "learning_rate": 5.781057810578106e-06, "loss": 1.0347, "step": 94 }, { "epoch": 0.07011070110701106, "grad_norm": 58.78949737548828, "learning_rate": 5.842558425584256e-06, "loss": 1.1858, "step": 95 }, { "epoch": 0.07084870848708487, "grad_norm": 53.04726791381836, "learning_rate": 5.904059040590406e-06, "loss": 1.1227, "step": 96 }, { "epoch": 0.07158671586715867, "grad_norm": 51.182315826416016, "learning_rate": 5.965559655596556e-06, "loss": 1.1926, "step": 97 }, { "epoch": 0.07232472324723248, "grad_norm": 55.08806610107422, "learning_rate": 6.027060270602707e-06, "loss": 1.1339, "step": 98 }, { "epoch": 0.07306273062730627, "grad_norm": 53.554542541503906, "learning_rate": 6.0885608856088565e-06, "loss": 1.1762, "step": 99 }, { "epoch": 0.07380073800738007, "grad_norm": 56.95305252075195, "learning_rate": 6.150061500615006e-06, "loss": 1.1414, "step": 100 }, { "epoch": 0.07380073800738007, "eval_loss": 1.4010363817214966, "eval_runtime": 325.9171, "eval_samples_per_second": 3.525, "eval_steps_per_second": 0.295, "step": 100 }, { "epoch": 0.07453874538745388, "grad_norm": 51.188621520996094, "learning_rate": 6.211562115621156e-06, "loss": 1.129, "step": 101 }, { "epoch": 0.07527675276752767, "grad_norm": 55.20896530151367, "learning_rate": 6.273062730627306e-06, "loss": 1.1451, "step": 102 }, { "epoch": 0.07601476014760147, "grad_norm": 49.773399353027344, "learning_rate": 6.334563345633457e-06, "loss": 1.2308, "step": 103 }, { "epoch": 0.07675276752767528, "grad_norm": 52.89494323730469, "learning_rate": 6.396063960639606e-06, "loss": 1.2243, "step": 104 }, { "epoch": 0.07749077490774908, "grad_norm": 53.44047546386719, "learning_rate": 6.4575645756457565e-06, "loss": 1.0611, "step": 105 }, { "epoch": 0.07822878228782287, "grad_norm": 53.227176666259766, "learning_rate": 6.519065190651907e-06, "loss": 1.0153, "step": 106 }, { "epoch": 0.07896678966789668, "grad_norm": 53.29740524291992, "learning_rate": 6.580565805658056e-06, "loss": 1.0539, "step": 107 }, { "epoch": 0.07970479704797048, "grad_norm": 52.415748596191406, "learning_rate": 6.642066420664207e-06, "loss": 1.1556, "step": 108 }, { "epoch": 0.08044280442804429, "grad_norm": 52.891544342041016, "learning_rate": 6.703567035670357e-06, "loss": 1.0613, "step": 109 }, { "epoch": 0.08118081180811808, "grad_norm": 56.652835845947266, "learning_rate": 6.7650676506765074e-06, "loss": 1.053, "step": 110 }, { "epoch": 0.08191881918819188, "grad_norm": 52.22764587402344, "learning_rate": 6.826568265682657e-06, "loss": 1.0757, "step": 111 }, { "epoch": 0.08265682656826569, "grad_norm": 51.05937576293945, "learning_rate": 6.888068880688807e-06, "loss": 1.1943, "step": 112 }, { "epoch": 0.08339483394833948, "grad_norm": 53.054378509521484, "learning_rate": 6.949569495694958e-06, "loss": 1.0704, "step": 113 }, { "epoch": 0.08413284132841328, "grad_norm": 54.2965202331543, "learning_rate": 7.011070110701107e-06, "loss": 1.0442, "step": 114 }, { "epoch": 0.08487084870848709, "grad_norm": 52.170867919921875, "learning_rate": 7.0725707257072575e-06, "loss": 1.2318, "step": 115 }, { "epoch": 0.08560885608856089, "grad_norm": 51.29275894165039, "learning_rate": 7.134071340713408e-06, "loss": 1.0306, "step": 116 }, { "epoch": 0.08634686346863468, "grad_norm": 54.07830047607422, "learning_rate": 7.195571955719557e-06, "loss": 1.1537, "step": 117 }, { "epoch": 0.08708487084870849, "grad_norm": 47.52810287475586, "learning_rate": 7.257072570725708e-06, "loss": 1.1096, "step": 118 }, { "epoch": 0.08782287822878229, "grad_norm": 52.45383071899414, "learning_rate": 7.318573185731857e-06, "loss": 1.0466, "step": 119 }, { "epoch": 0.08856088560885608, "grad_norm": 51.74037551879883, "learning_rate": 7.380073800738008e-06, "loss": 1.0032, "step": 120 }, { "epoch": 0.08929889298892989, "grad_norm": 52.04569625854492, "learning_rate": 7.441574415744158e-06, "loss": 1.1626, "step": 121 }, { "epoch": 0.09003690036900369, "grad_norm": 51.20045852661133, "learning_rate": 7.503075030750307e-06, "loss": 1.1133, "step": 122 }, { "epoch": 0.0907749077490775, "grad_norm": 50.70725631713867, "learning_rate": 7.564575645756458e-06, "loss": 1.1191, "step": 123 }, { "epoch": 0.09151291512915129, "grad_norm": 50.703460693359375, "learning_rate": 7.626076260762607e-06, "loss": 1.0609, "step": 124 }, { "epoch": 0.09225092250922509, "grad_norm": 53.20537185668945, "learning_rate": 7.687576875768759e-06, "loss": 1.1237, "step": 125 }, { "epoch": 0.0929889298892989, "grad_norm": 51.74738693237305, "learning_rate": 7.749077490774908e-06, "loss": 1.1255, "step": 126 }, { "epoch": 0.09372693726937269, "grad_norm": 47.27532958984375, "learning_rate": 7.810578105781058e-06, "loss": 1.0753, "step": 127 }, { "epoch": 0.09446494464944649, "grad_norm": 46.608150482177734, "learning_rate": 7.872078720787208e-06, "loss": 1.0709, "step": 128 }, { "epoch": 0.0952029520295203, "grad_norm": 52.357460021972656, "learning_rate": 7.933579335793358e-06, "loss": 1.1539, "step": 129 }, { "epoch": 0.0959409594095941, "grad_norm": 48.45564270019531, "learning_rate": 7.995079950799508e-06, "loss": 1.1129, "step": 130 }, { "epoch": 0.09667896678966789, "grad_norm": 52.05830383300781, "learning_rate": 8.05658056580566e-06, "loss": 1.1821, "step": 131 }, { "epoch": 0.0974169741697417, "grad_norm": 53.559852600097656, "learning_rate": 8.118081180811808e-06, "loss": 1.1161, "step": 132 }, { "epoch": 0.0981549815498155, "grad_norm": 54.30366134643555, "learning_rate": 8.179581795817959e-06, "loss": 1.0108, "step": 133 }, { "epoch": 0.0988929889298893, "grad_norm": 49.463932037353516, "learning_rate": 8.241082410824107e-06, "loss": 1.0384, "step": 134 }, { "epoch": 0.0996309963099631, "grad_norm": 73.52909088134766, "learning_rate": 8.302583025830259e-06, "loss": 1.1694, "step": 135 }, { "epoch": 0.1003690036900369, "grad_norm": 45.32145309448242, "learning_rate": 8.364083640836409e-06, "loss": 1.0721, "step": 136 }, { "epoch": 0.1011070110701107, "grad_norm": 51.58095932006836, "learning_rate": 8.425584255842559e-06, "loss": 0.9633, "step": 137 }, { "epoch": 0.1018450184501845, "grad_norm": 52.928436279296875, "learning_rate": 8.487084870848708e-06, "loss": 1.0783, "step": 138 }, { "epoch": 0.1025830258302583, "grad_norm": 48.393550872802734, "learning_rate": 8.548585485854858e-06, "loss": 0.9844, "step": 139 }, { "epoch": 0.1033210332103321, "grad_norm": 46.03611373901367, "learning_rate": 8.61008610086101e-06, "loss": 1.1052, "step": 140 }, { "epoch": 0.10405904059040591, "grad_norm": 49.10841751098633, "learning_rate": 8.67158671586716e-06, "loss": 1.1086, "step": 141 }, { "epoch": 0.1047970479704797, "grad_norm": 47.779212951660156, "learning_rate": 8.73308733087331e-06, "loss": 1.1376, "step": 142 }, { "epoch": 0.1055350553505535, "grad_norm": 51.112693786621094, "learning_rate": 8.79458794587946e-06, "loss": 1.1465, "step": 143 }, { "epoch": 0.10627306273062731, "grad_norm": 43.86711502075195, "learning_rate": 8.85608856088561e-06, "loss": 0.9845, "step": 144 }, { "epoch": 0.1070110701107011, "grad_norm": 45.53451156616211, "learning_rate": 8.917589175891759e-06, "loss": 1.1196, "step": 145 }, { "epoch": 0.1077490774907749, "grad_norm": 51.35363006591797, "learning_rate": 8.97908979089791e-06, "loss": 1.0202, "step": 146 }, { "epoch": 0.10848708487084871, "grad_norm": 45.318607330322266, "learning_rate": 9.040590405904059e-06, "loss": 1.0156, "step": 147 }, { "epoch": 0.10922509225092251, "grad_norm": 45.83018493652344, "learning_rate": 9.10209102091021e-06, "loss": 0.9637, "step": 148 }, { "epoch": 0.1099630996309963, "grad_norm": 52.667728424072266, "learning_rate": 9.163591635916358e-06, "loss": 1.0344, "step": 149 }, { "epoch": 0.11070110701107011, "grad_norm": 49.742897033691406, "learning_rate": 9.22509225092251e-06, "loss": 0.9486, "step": 150 }, { "epoch": 0.11143911439114391, "grad_norm": 50.35558319091797, "learning_rate": 9.28659286592866e-06, "loss": 1.1685, "step": 151 }, { "epoch": 0.1121771217712177, "grad_norm": 49.48957824707031, "learning_rate": 9.34809348093481e-06, "loss": 0.9666, "step": 152 }, { "epoch": 0.11291512915129151, "grad_norm": 46.834129333496094, "learning_rate": 9.40959409594096e-06, "loss": 1.0137, "step": 153 }, { "epoch": 0.11365313653136531, "grad_norm": 46.92979049682617, "learning_rate": 9.47109471094711e-06, "loss": 1.042, "step": 154 }, { "epoch": 0.11439114391143912, "grad_norm": 43.96043014526367, "learning_rate": 9.53259532595326e-06, "loss": 1.0363, "step": 155 }, { "epoch": 0.11512915129151291, "grad_norm": 48.00889587402344, "learning_rate": 9.59409594095941e-06, "loss": 0.9697, "step": 156 }, { "epoch": 0.11586715867158671, "grad_norm": 50.71873474121094, "learning_rate": 9.65559655596556e-06, "loss": 1.1216, "step": 157 }, { "epoch": 0.11660516605166052, "grad_norm": 51.51930236816406, "learning_rate": 9.71709717097171e-06, "loss": 1.0876, "step": 158 }, { "epoch": 0.11734317343173432, "grad_norm": 44.15366744995117, "learning_rate": 9.77859778597786e-06, "loss": 1.0607, "step": 159 }, { "epoch": 0.11808118081180811, "grad_norm": 41.848602294921875, "learning_rate": 9.84009840098401e-06, "loss": 1.0026, "step": 160 }, { "epoch": 0.11881918819188192, "grad_norm": 45.18868637084961, "learning_rate": 9.90159901599016e-06, "loss": 1.1803, "step": 161 }, { "epoch": 0.11955719557195572, "grad_norm": 45.788673400878906, "learning_rate": 9.96309963099631e-06, "loss": 1.1451, "step": 162 }, { "epoch": 0.12029520295202951, "grad_norm": 46.45803451538086, "learning_rate": 1.0024600246002461e-05, "loss": 0.9769, "step": 163 }, { "epoch": 0.12103321033210332, "grad_norm": 46.782840728759766, "learning_rate": 1.008610086100861e-05, "loss": 1.2505, "step": 164 }, { "epoch": 0.12177121771217712, "grad_norm": 45.39817810058594, "learning_rate": 1.0147601476014761e-05, "loss": 1.0927, "step": 165 }, { "epoch": 0.12250922509225093, "grad_norm": 43.27733612060547, "learning_rate": 1.0209102091020911e-05, "loss": 1.1247, "step": 166 }, { "epoch": 0.12324723247232472, "grad_norm": 47.766231536865234, "learning_rate": 1.027060270602706e-05, "loss": 1.079, "step": 167 }, { "epoch": 0.12398523985239852, "grad_norm": 46.73952865600586, "learning_rate": 1.033210332103321e-05, "loss": 0.8357, "step": 168 }, { "epoch": 0.12472324723247233, "grad_norm": 46.83552551269531, "learning_rate": 1.039360393603936e-05, "loss": 1.2159, "step": 169 }, { "epoch": 0.12546125461254612, "grad_norm": 44.146846771240234, "learning_rate": 1.045510455104551e-05, "loss": 0.9941, "step": 170 }, { "epoch": 0.12619926199261994, "grad_norm": 45.29106140136719, "learning_rate": 1.0516605166051662e-05, "loss": 1.1314, "step": 171 }, { "epoch": 0.12693726937269373, "grad_norm": 46.10059356689453, "learning_rate": 1.057810578105781e-05, "loss": 1.0239, "step": 172 }, { "epoch": 0.12767527675276752, "grad_norm": 42.55729293823242, "learning_rate": 1.0639606396063962e-05, "loss": 1.0389, "step": 173 }, { "epoch": 0.12841328413284134, "grad_norm": 43.775760650634766, "learning_rate": 1.070110701107011e-05, "loss": 1.1492, "step": 174 }, { "epoch": 0.12915129151291513, "grad_norm": 42.141910552978516, "learning_rate": 1.0762607626076261e-05, "loss": 1.092, "step": 175 }, { "epoch": 0.12988929889298892, "grad_norm": 44.42767333984375, "learning_rate": 1.0824108241082411e-05, "loss": 1.1159, "step": 176 }, { "epoch": 0.13062730627306274, "grad_norm": 38.9581184387207, "learning_rate": 1.0885608856088561e-05, "loss": 1.0921, "step": 177 }, { "epoch": 0.13136531365313653, "grad_norm": 43.585147857666016, "learning_rate": 1.0947109471094711e-05, "loss": 1.0784, "step": 178 }, { "epoch": 0.13210332103321032, "grad_norm": 49.25750732421875, "learning_rate": 1.100861008610086e-05, "loss": 1.1589, "step": 179 }, { "epoch": 0.13284132841328414, "grad_norm": 38.27066421508789, "learning_rate": 1.1070110701107012e-05, "loss": 0.9549, "step": 180 }, { "epoch": 0.13357933579335793, "grad_norm": 43.95482635498047, "learning_rate": 1.1131611316113162e-05, "loss": 1.1084, "step": 181 }, { "epoch": 0.13431734317343175, "grad_norm": 47.86146926879883, "learning_rate": 1.1193111931119312e-05, "loss": 1.0305, "step": 182 }, { "epoch": 0.13505535055350554, "grad_norm": 41.17548370361328, "learning_rate": 1.1254612546125462e-05, "loss": 1.0341, "step": 183 }, { "epoch": 0.13579335793357933, "grad_norm": 50.34139633178711, "learning_rate": 1.1316113161131612e-05, "loss": 0.9769, "step": 184 }, { "epoch": 0.13653136531365315, "grad_norm": 41.7880973815918, "learning_rate": 1.1377613776137762e-05, "loss": 1.047, "step": 185 }, { "epoch": 0.13726937269372694, "grad_norm": 43.598392486572266, "learning_rate": 1.1439114391143913e-05, "loss": 0.9553, "step": 186 }, { "epoch": 0.13800738007380073, "grad_norm": 44.27220153808594, "learning_rate": 1.1500615006150061e-05, "loss": 1.1314, "step": 187 }, { "epoch": 0.13874538745387455, "grad_norm": 38.91771697998047, "learning_rate": 1.1562115621156213e-05, "loss": 1.0132, "step": 188 }, { "epoch": 0.13948339483394834, "grad_norm": 44.32412338256836, "learning_rate": 1.1623616236162361e-05, "loss": 1.0672, "step": 189 }, { "epoch": 0.14022140221402213, "grad_norm": 43.45479202270508, "learning_rate": 1.1685116851168513e-05, "loss": 1.0519, "step": 190 }, { "epoch": 0.14095940959409595, "grad_norm": 46.94374084472656, "learning_rate": 1.1746617466174662e-05, "loss": 1.0721, "step": 191 }, { "epoch": 0.14169741697416974, "grad_norm": 48.714927673339844, "learning_rate": 1.1808118081180812e-05, "loss": 1.095, "step": 192 }, { "epoch": 0.14243542435424356, "grad_norm": 48.29472732543945, "learning_rate": 1.1869618696186962e-05, "loss": 1.1482, "step": 193 }, { "epoch": 0.14317343173431735, "grad_norm": 43.912288665771484, "learning_rate": 1.1931119311193112e-05, "loss": 1.0994, "step": 194 }, { "epoch": 0.14391143911439114, "grad_norm": 41.308799743652344, "learning_rate": 1.1992619926199262e-05, "loss": 1.2074, "step": 195 }, { "epoch": 0.14464944649446496, "grad_norm": 43.36037826538086, "learning_rate": 1.2054120541205413e-05, "loss": 1.1435, "step": 196 }, { "epoch": 0.14538745387453875, "grad_norm": 40.67462158203125, "learning_rate": 1.2115621156211563e-05, "loss": 0.9609, "step": 197 }, { "epoch": 0.14612546125461254, "grad_norm": 43.331241607666016, "learning_rate": 1.2177121771217713e-05, "loss": 1.0909, "step": 198 }, { "epoch": 0.14686346863468636, "grad_norm": 41.213863372802734, "learning_rate": 1.2238622386223863e-05, "loss": 1.0955, "step": 199 }, { "epoch": 0.14760147601476015, "grad_norm": 43.54401397705078, "learning_rate": 1.2300123001230013e-05, "loss": 1.1855, "step": 200 }, { "epoch": 0.14760147601476015, "eval_loss": 1.3390393257141113, "eval_runtime": 355.9656, "eval_samples_per_second": 3.228, "eval_steps_per_second": 0.27, "step": 200 }, { "epoch": 0.14833948339483394, "grad_norm": 45.116146087646484, "learning_rate": 1.2361623616236164e-05, "loss": 1.1331, "step": 201 }, { "epoch": 0.14907749077490776, "grad_norm": 48.80164337158203, "learning_rate": 1.2423124231242312e-05, "loss": 1.157, "step": 202 }, { "epoch": 0.14981549815498155, "grad_norm": 41.02751922607422, "learning_rate": 1.2484624846248464e-05, "loss": 1.1237, "step": 203 }, { "epoch": 0.15055350553505534, "grad_norm": 42.61967086791992, "learning_rate": 1.2546125461254612e-05, "loss": 1.1693, "step": 204 }, { "epoch": 0.15129151291512916, "grad_norm": 43.75822067260742, "learning_rate": 1.2607626076260764e-05, "loss": 1.1545, "step": 205 }, { "epoch": 0.15202952029520295, "grad_norm": 40.50026321411133, "learning_rate": 1.2669126691266914e-05, "loss": 1.061, "step": 206 }, { "epoch": 0.15276752767527677, "grad_norm": 41.14898681640625, "learning_rate": 1.2730627306273063e-05, "loss": 0.9864, "step": 207 }, { "epoch": 0.15350553505535056, "grad_norm": 44.43930435180664, "learning_rate": 1.2792127921279212e-05, "loss": 1.0444, "step": 208 }, { "epoch": 0.15424354243542435, "grad_norm": 42.351226806640625, "learning_rate": 1.2853628536285365e-05, "loss": 1.0966, "step": 209 }, { "epoch": 0.15498154981549817, "grad_norm": 39.365440368652344, "learning_rate": 1.2915129151291513e-05, "loss": 1.0987, "step": 210 }, { "epoch": 0.15571955719557196, "grad_norm": 44.90658950805664, "learning_rate": 1.2976629766297663e-05, "loss": 1.0399, "step": 211 }, { "epoch": 0.15645756457564575, "grad_norm": 38.08787536621094, "learning_rate": 1.3038130381303814e-05, "loss": 0.9539, "step": 212 }, { "epoch": 0.15719557195571957, "grad_norm": 40.93101501464844, "learning_rate": 1.3099630996309964e-05, "loss": 0.9497, "step": 213 }, { "epoch": 0.15793357933579336, "grad_norm": 42.12691116333008, "learning_rate": 1.3161131611316112e-05, "loss": 1.0591, "step": 214 }, { "epoch": 0.15867158671586715, "grad_norm": 39.68405532836914, "learning_rate": 1.3222632226322266e-05, "loss": 1.1084, "step": 215 }, { "epoch": 0.15940959409594097, "grad_norm": 46.32451629638672, "learning_rate": 1.3284132841328414e-05, "loss": 0.9886, "step": 216 }, { "epoch": 0.16014760147601476, "grad_norm": 43.83405303955078, "learning_rate": 1.3345633456334564e-05, "loss": 1.0409, "step": 217 }, { "epoch": 0.16088560885608857, "grad_norm": 46.454429626464844, "learning_rate": 1.3407134071340713e-05, "loss": 0.927, "step": 218 }, { "epoch": 0.16162361623616237, "grad_norm": 43.32332229614258, "learning_rate": 1.3468634686346865e-05, "loss": 1.0885, "step": 219 }, { "epoch": 0.16236162361623616, "grad_norm": 38.92317581176758, "learning_rate": 1.3530135301353015e-05, "loss": 1.1205, "step": 220 }, { "epoch": 0.16309963099630997, "grad_norm": 36.57090759277344, "learning_rate": 1.3591635916359163e-05, "loss": 1.0607, "step": 221 }, { "epoch": 0.16383763837638377, "grad_norm": 39.162147521972656, "learning_rate": 1.3653136531365315e-05, "loss": 1.1395, "step": 222 }, { "epoch": 0.16457564575645756, "grad_norm": 40.069610595703125, "learning_rate": 1.3714637146371464e-05, "loss": 0.993, "step": 223 }, { "epoch": 0.16531365313653137, "grad_norm": 38.262664794921875, "learning_rate": 1.3776137761377614e-05, "loss": 1.0751, "step": 224 }, { "epoch": 0.16605166051660517, "grad_norm": 38.50648498535156, "learning_rate": 1.3837638376383766e-05, "loss": 1.0874, "step": 225 }, { "epoch": 0.16678966789667896, "grad_norm": 41.57286834716797, "learning_rate": 1.3899138991389916e-05, "loss": 1.061, "step": 226 }, { "epoch": 0.16752767527675277, "grad_norm": 38.842124938964844, "learning_rate": 1.3960639606396064e-05, "loss": 0.9865, "step": 227 }, { "epoch": 0.16826568265682657, "grad_norm": 40.79179382324219, "learning_rate": 1.4022140221402214e-05, "loss": 1.0104, "step": 228 }, { "epoch": 0.16900369003690036, "grad_norm": 40.540042877197266, "learning_rate": 1.4083640836408365e-05, "loss": 0.9352, "step": 229 }, { "epoch": 0.16974169741697417, "grad_norm": 39.385459899902344, "learning_rate": 1.4145141451414515e-05, "loss": 1.0731, "step": 230 }, { "epoch": 0.17047970479704797, "grad_norm": 40.35080337524414, "learning_rate": 1.4206642066420663e-05, "loss": 1.1106, "step": 231 }, { "epoch": 0.17121771217712178, "grad_norm": 37.7828254699707, "learning_rate": 1.4268142681426816e-05, "loss": 1.0902, "step": 232 }, { "epoch": 0.17195571955719557, "grad_norm": 38.59387969970703, "learning_rate": 1.4329643296432965e-05, "loss": 1.0837, "step": 233 }, { "epoch": 0.17269372693726937, "grad_norm": 40.220245361328125, "learning_rate": 1.4391143911439114e-05, "loss": 1.1002, "step": 234 }, { "epoch": 0.17343173431734318, "grad_norm": 41.30938720703125, "learning_rate": 1.4452644526445266e-05, "loss": 0.9605, "step": 235 }, { "epoch": 0.17416974169741697, "grad_norm": 42.54692840576172, "learning_rate": 1.4514145141451416e-05, "loss": 1.1135, "step": 236 }, { "epoch": 0.17490774907749077, "grad_norm": 38.45701217651367, "learning_rate": 1.4575645756457566e-05, "loss": 1.2065, "step": 237 }, { "epoch": 0.17564575645756458, "grad_norm": 40.34320068359375, "learning_rate": 1.4637146371463714e-05, "loss": 1.0331, "step": 238 }, { "epoch": 0.17638376383763837, "grad_norm": 39.82585144042969, "learning_rate": 1.4698646986469865e-05, "loss": 1.1597, "step": 239 }, { "epoch": 0.17712177121771217, "grad_norm": 39.45707321166992, "learning_rate": 1.4760147601476015e-05, "loss": 1.1008, "step": 240 }, { "epoch": 0.17785977859778598, "grad_norm": 37.564231872558594, "learning_rate": 1.4821648216482165e-05, "loss": 0.9734, "step": 241 }, { "epoch": 0.17859778597785977, "grad_norm": 40.75583267211914, "learning_rate": 1.4883148831488317e-05, "loss": 1.1324, "step": 242 }, { "epoch": 0.1793357933579336, "grad_norm": 36.91340255737305, "learning_rate": 1.4944649446494467e-05, "loss": 0.8858, "step": 243 }, { "epoch": 0.18007380073800738, "grad_norm": 41.43409729003906, "learning_rate": 1.5006150061500615e-05, "loss": 1.127, "step": 244 }, { "epoch": 0.18081180811808117, "grad_norm": 39.64106750488281, "learning_rate": 1.5067650676506768e-05, "loss": 1.0394, "step": 245 }, { "epoch": 0.181549815498155, "grad_norm": 39.24397277832031, "learning_rate": 1.5129151291512916e-05, "loss": 1.1139, "step": 246 }, { "epoch": 0.18228782287822878, "grad_norm": 39.08576965332031, "learning_rate": 1.5190651906519066e-05, "loss": 1.1373, "step": 247 }, { "epoch": 0.18302583025830257, "grad_norm": 37.38773727416992, "learning_rate": 1.5252152521525214e-05, "loss": 0.9942, "step": 248 }, { "epoch": 0.1837638376383764, "grad_norm": 39.011505126953125, "learning_rate": 1.5313653136531367e-05, "loss": 1.1033, "step": 249 }, { "epoch": 0.18450184501845018, "grad_norm": 38.647705078125, "learning_rate": 1.5375153751537517e-05, "loss": 1.0039, "step": 250 }, { "epoch": 0.18523985239852397, "grad_norm": 36.8840446472168, "learning_rate": 1.5436654366543664e-05, "loss": 1.037, "step": 251 }, { "epoch": 0.1859778597785978, "grad_norm": 39.59068298339844, "learning_rate": 1.5498154981549817e-05, "loss": 1.1113, "step": 252 }, { "epoch": 0.18671586715867158, "grad_norm": 35.01139450073242, "learning_rate": 1.5559655596555967e-05, "loss": 1.0766, "step": 253 }, { "epoch": 0.18745387453874537, "grad_norm": 42.80155944824219, "learning_rate": 1.5621156211562117e-05, "loss": 1.2052, "step": 254 }, { "epoch": 0.1881918819188192, "grad_norm": 37.67293930053711, "learning_rate": 1.5682656826568266e-05, "loss": 1.054, "step": 255 }, { "epoch": 0.18892988929889298, "grad_norm": 35.59282684326172, "learning_rate": 1.5744157441574416e-05, "loss": 1.1038, "step": 256 }, { "epoch": 0.1896678966789668, "grad_norm": 36.562198638916016, "learning_rate": 1.5805658056580566e-05, "loss": 1.1277, "step": 257 }, { "epoch": 0.1904059040590406, "grad_norm": 38.406944274902344, "learning_rate": 1.5867158671586716e-05, "loss": 1.0396, "step": 258 }, { "epoch": 0.19114391143911438, "grad_norm": 37.851539611816406, "learning_rate": 1.5928659286592866e-05, "loss": 1.0541, "step": 259 }, { "epoch": 0.1918819188191882, "grad_norm": 34.81989669799805, "learning_rate": 1.5990159901599016e-05, "loss": 1.0241, "step": 260 }, { "epoch": 0.192619926199262, "grad_norm": 38.74085235595703, "learning_rate": 1.6051660516605166e-05, "loss": 1.0709, "step": 261 }, { "epoch": 0.19335793357933578, "grad_norm": 41.59756088256836, "learning_rate": 1.611316113161132e-05, "loss": 1.2334, "step": 262 }, { "epoch": 0.1940959409594096, "grad_norm": 35.79509353637695, "learning_rate": 1.617466174661747e-05, "loss": 1.0133, "step": 263 }, { "epoch": 0.1948339483394834, "grad_norm": 39.88947677612305, "learning_rate": 1.6236162361623615e-05, "loss": 1.0831, "step": 264 }, { "epoch": 0.19557195571955718, "grad_norm": 35.988487243652344, "learning_rate": 1.629766297662977e-05, "loss": 1.0962, "step": 265 }, { "epoch": 0.196309963099631, "grad_norm": 36.9556999206543, "learning_rate": 1.6359163591635918e-05, "loss": 1.1309, "step": 266 }, { "epoch": 0.1970479704797048, "grad_norm": 36.95020294189453, "learning_rate": 1.6420664206642068e-05, "loss": 1.0556, "step": 267 }, { "epoch": 0.1977859778597786, "grad_norm": 36.589324951171875, "learning_rate": 1.6482164821648215e-05, "loss": 1.0871, "step": 268 }, { "epoch": 0.1985239852398524, "grad_norm": 38.176605224609375, "learning_rate": 1.6543665436654368e-05, "loss": 1.0362, "step": 269 }, { "epoch": 0.1992619926199262, "grad_norm": 40.13340759277344, "learning_rate": 1.6605166051660518e-05, "loss": 0.9606, "step": 270 }, { "epoch": 0.2, "grad_norm": 40.80103302001953, "learning_rate": 1.6666666666666667e-05, "loss": 1.0099, "step": 271 }, { "epoch": 0.2007380073800738, "grad_norm": 37.991947174072266, "learning_rate": 1.6728167281672817e-05, "loss": 1.1559, "step": 272 }, { "epoch": 0.2014760147601476, "grad_norm": 35.638126373291016, "learning_rate": 1.6789667896678967e-05, "loss": 1.0468, "step": 273 }, { "epoch": 0.2022140221402214, "grad_norm": 36.0762825012207, "learning_rate": 1.6851168511685117e-05, "loss": 0.9843, "step": 274 }, { "epoch": 0.2029520295202952, "grad_norm": 39.42917251586914, "learning_rate": 1.691266912669127e-05, "loss": 0.995, "step": 275 }, { "epoch": 0.203690036900369, "grad_norm": 38.73271179199219, "learning_rate": 1.6974169741697417e-05, "loss": 1.1101, "step": 276 }, { "epoch": 0.2044280442804428, "grad_norm": 34.4466667175293, "learning_rate": 1.7035670356703567e-05, "loss": 1.1769, "step": 277 }, { "epoch": 0.2051660516605166, "grad_norm": 38.39332580566406, "learning_rate": 1.7097170971709716e-05, "loss": 1.2032, "step": 278 }, { "epoch": 0.2059040590405904, "grad_norm": 36.46586227416992, "learning_rate": 1.715867158671587e-05, "loss": 1.2505, "step": 279 }, { "epoch": 0.2066420664206642, "grad_norm": 38.546119689941406, "learning_rate": 1.722017220172202e-05, "loss": 1.0471, "step": 280 }, { "epoch": 0.207380073800738, "grad_norm": 36.11763381958008, "learning_rate": 1.7281672816728166e-05, "loss": 1.1173, "step": 281 }, { "epoch": 0.20811808118081182, "grad_norm": 36.332969665527344, "learning_rate": 1.734317343173432e-05, "loss": 0.989, "step": 282 }, { "epoch": 0.2088560885608856, "grad_norm": 36.8829231262207, "learning_rate": 1.740467404674047e-05, "loss": 1.0894, "step": 283 }, { "epoch": 0.2095940959409594, "grad_norm": 35.905765533447266, "learning_rate": 1.746617466174662e-05, "loss": 1.1755, "step": 284 }, { "epoch": 0.21033210332103322, "grad_norm": 31.39859962463379, "learning_rate": 1.752767527675277e-05, "loss": 1.089, "step": 285 }, { "epoch": 0.211070110701107, "grad_norm": 36.529537200927734, "learning_rate": 1.758917589175892e-05, "loss": 1.0632, "step": 286 }, { "epoch": 0.2118081180811808, "grad_norm": 38.358001708984375, "learning_rate": 1.765067650676507e-05, "loss": 1.1177, "step": 287 }, { "epoch": 0.21254612546125462, "grad_norm": 37.179325103759766, "learning_rate": 1.771217712177122e-05, "loss": 1.0513, "step": 288 }, { "epoch": 0.2132841328413284, "grad_norm": 35.38275146484375, "learning_rate": 1.7773677736777368e-05, "loss": 1.0212, "step": 289 }, { "epoch": 0.2140221402214022, "grad_norm": 37.132389068603516, "learning_rate": 1.7835178351783518e-05, "loss": 1.089, "step": 290 }, { "epoch": 0.21476014760147602, "grad_norm": 34.594783782958984, "learning_rate": 1.7896678966789668e-05, "loss": 1.1115, "step": 291 }, { "epoch": 0.2154981549815498, "grad_norm": 36.57194137573242, "learning_rate": 1.795817958179582e-05, "loss": 0.9911, "step": 292 }, { "epoch": 0.21623616236162363, "grad_norm": 34.58879470825195, "learning_rate": 1.8019680196801968e-05, "loss": 1.0169, "step": 293 }, { "epoch": 0.21697416974169742, "grad_norm": 33.588539123535156, "learning_rate": 1.8081180811808117e-05, "loss": 1.0345, "step": 294 }, { "epoch": 0.2177121771217712, "grad_norm": 34.15876007080078, "learning_rate": 1.814268142681427e-05, "loss": 1.0387, "step": 295 }, { "epoch": 0.21845018450184503, "grad_norm": 40.78740310668945, "learning_rate": 1.820418204182042e-05, "loss": 1.0292, "step": 296 }, { "epoch": 0.21918819188191882, "grad_norm": 38.307064056396484, "learning_rate": 1.826568265682657e-05, "loss": 1.087, "step": 297 }, { "epoch": 0.2199261992619926, "grad_norm": 33.9033203125, "learning_rate": 1.8327183271832717e-05, "loss": 1.0356, "step": 298 }, { "epoch": 0.22066420664206643, "grad_norm": 38.69232940673828, "learning_rate": 1.838868388683887e-05, "loss": 1.0239, "step": 299 }, { "epoch": 0.22140221402214022, "grad_norm": 34.63215637207031, "learning_rate": 1.845018450184502e-05, "loss": 1.1614, "step": 300 }, { "epoch": 0.22140221402214022, "eval_loss": 1.3183945417404175, "eval_runtime": 343.8464, "eval_samples_per_second": 3.342, "eval_steps_per_second": 0.279, "step": 300 }, { "epoch": 0.222140221402214, "grad_norm": 33.18867111206055, "learning_rate": 1.851168511685117e-05, "loss": 1.1519, "step": 301 }, { "epoch": 0.22287822878228783, "grad_norm": 34.760982513427734, "learning_rate": 1.857318573185732e-05, "loss": 1.1005, "step": 302 }, { "epoch": 0.22361623616236162, "grad_norm": 34.268043518066406, "learning_rate": 1.863468634686347e-05, "loss": 1.0483, "step": 303 }, { "epoch": 0.2243542435424354, "grad_norm": 35.12160873413086, "learning_rate": 1.869618696186962e-05, "loss": 1.1201, "step": 304 }, { "epoch": 0.22509225092250923, "grad_norm": 38.57670974731445, "learning_rate": 1.8757687576875773e-05, "loss": 1.0204, "step": 305 }, { "epoch": 0.22583025830258302, "grad_norm": 34.495235443115234, "learning_rate": 1.881918819188192e-05, "loss": 1.1887, "step": 306 }, { "epoch": 0.22656826568265684, "grad_norm": 36.18799591064453, "learning_rate": 1.888068880688807e-05, "loss": 0.8969, "step": 307 }, { "epoch": 0.22730627306273063, "grad_norm": 35.36030960083008, "learning_rate": 1.894218942189422e-05, "loss": 1.1272, "step": 308 }, { "epoch": 0.22804428044280442, "grad_norm": 34.50253677368164, "learning_rate": 1.9003690036900372e-05, "loss": 0.9908, "step": 309 }, { "epoch": 0.22878228782287824, "grad_norm": 33.003875732421875, "learning_rate": 1.906519065190652e-05, "loss": 1.0323, "step": 310 }, { "epoch": 0.22952029520295203, "grad_norm": 33.84071731567383, "learning_rate": 1.912669126691267e-05, "loss": 1.0924, "step": 311 }, { "epoch": 0.23025830258302582, "grad_norm": 37.590694427490234, "learning_rate": 1.918819188191882e-05, "loss": 1.0558, "step": 312 }, { "epoch": 0.23099630996309964, "grad_norm": 1048.1514892578125, "learning_rate": 1.924969249692497e-05, "loss": 0.9793, "step": 313 }, { "epoch": 0.23173431734317343, "grad_norm": 32.7579460144043, "learning_rate": 1.931119311193112e-05, "loss": 0.9345, "step": 314 }, { "epoch": 0.23247232472324722, "grad_norm": 41.32646942138672, "learning_rate": 1.937269372693727e-05, "loss": 1.0441, "step": 315 }, { "epoch": 0.23321033210332104, "grad_norm": 39.139198303222656, "learning_rate": 1.943419434194342e-05, "loss": 1.0545, "step": 316 }, { "epoch": 0.23394833948339483, "grad_norm": 35.99794006347656, "learning_rate": 1.949569495694957e-05, "loss": 1.1261, "step": 317 }, { "epoch": 0.23468634686346865, "grad_norm": 34.20968246459961, "learning_rate": 1.955719557195572e-05, "loss": 0.9836, "step": 318 }, { "epoch": 0.23542435424354244, "grad_norm": 33.9476203918457, "learning_rate": 1.961869618696187e-05, "loss": 1.0345, "step": 319 }, { "epoch": 0.23616236162361623, "grad_norm": 35.6599235534668, "learning_rate": 1.968019680196802e-05, "loss": 1.0316, "step": 320 }, { "epoch": 0.23690036900369005, "grad_norm": 34.30624008178711, "learning_rate": 1.974169741697417e-05, "loss": 0.9987, "step": 321 }, { "epoch": 0.23763837638376384, "grad_norm": 34.07005310058594, "learning_rate": 1.980319803198032e-05, "loss": 1.0052, "step": 322 }, { "epoch": 0.23837638376383763, "grad_norm": 33.085777282714844, "learning_rate": 1.986469864698647e-05, "loss": 1.1424, "step": 323 }, { "epoch": 0.23911439114391145, "grad_norm": 34.74597930908203, "learning_rate": 1.992619926199262e-05, "loss": 1.1401, "step": 324 }, { "epoch": 0.23985239852398524, "grad_norm": 36.55511474609375, "learning_rate": 1.9987699876998773e-05, "loss": 0.9437, "step": 325 }, { "epoch": 0.24059040590405903, "grad_norm": 35.86470031738281, "learning_rate": 2.0049200492004923e-05, "loss": 1.1535, "step": 326 }, { "epoch": 0.24132841328413285, "grad_norm": 33.983421325683594, "learning_rate": 2.011070110701107e-05, "loss": 1.1367, "step": 327 }, { "epoch": 0.24206642066420664, "grad_norm": 36.45722198486328, "learning_rate": 2.017220172201722e-05, "loss": 1.0917, "step": 328 }, { "epoch": 0.24280442804428043, "grad_norm": 36.953060150146484, "learning_rate": 2.0233702337023372e-05, "loss": 1.0107, "step": 329 }, { "epoch": 0.24354243542435425, "grad_norm": 37.92033004760742, "learning_rate": 2.0295202952029522e-05, "loss": 1.2084, "step": 330 }, { "epoch": 0.24428044280442804, "grad_norm": 31.74508285522461, "learning_rate": 2.035670356703567e-05, "loss": 1.0421, "step": 331 }, { "epoch": 0.24501845018450186, "grad_norm": 37.19945526123047, "learning_rate": 2.0418204182041822e-05, "loss": 1.082, "step": 332 }, { "epoch": 0.24575645756457565, "grad_norm": 32.649444580078125, "learning_rate": 2.0479704797047972e-05, "loss": 1.1345, "step": 333 }, { "epoch": 0.24649446494464944, "grad_norm": 36.957977294921875, "learning_rate": 2.054120541205412e-05, "loss": 1.0192, "step": 334 }, { "epoch": 0.24723247232472326, "grad_norm": 32.36549377441406, "learning_rate": 2.060270602706027e-05, "loss": 1.1387, "step": 335 }, { "epoch": 0.24797047970479705, "grad_norm": 30.191532135009766, "learning_rate": 2.066420664206642e-05, "loss": 1.0083, "step": 336 }, { "epoch": 0.24870848708487084, "grad_norm": 31.56035804748535, "learning_rate": 2.072570725707257e-05, "loss": 1.1246, "step": 337 }, { "epoch": 0.24944649446494466, "grad_norm": 36.50621032714844, "learning_rate": 2.078720787207872e-05, "loss": 1.1289, "step": 338 }, { "epoch": 0.25018450184501845, "grad_norm": 32.51582336425781, "learning_rate": 2.084870848708487e-05, "loss": 0.9957, "step": 339 }, { "epoch": 0.25092250922509224, "grad_norm": 40.50331115722656, "learning_rate": 2.091020910209102e-05, "loss": 1.0179, "step": 340 }, { "epoch": 0.25166051660516603, "grad_norm": 38.418792724609375, "learning_rate": 2.097170971709717e-05, "loss": 1.1074, "step": 341 }, { "epoch": 0.2523985239852399, "grad_norm": 33.0310173034668, "learning_rate": 2.1033210332103324e-05, "loss": 1.1623, "step": 342 }, { "epoch": 0.25313653136531367, "grad_norm": 30.66373062133789, "learning_rate": 2.1094710947109474e-05, "loss": 0.9796, "step": 343 }, { "epoch": 0.25387453874538746, "grad_norm": 30.335712432861328, "learning_rate": 2.115621156211562e-05, "loss": 1.0376, "step": 344 }, { "epoch": 0.25461254612546125, "grad_norm": 33.595855712890625, "learning_rate": 2.1217712177121773e-05, "loss": 1.0289, "step": 345 }, { "epoch": 0.25535055350553504, "grad_norm": 30.422454833984375, "learning_rate": 2.1279212792127923e-05, "loss": 1.0815, "step": 346 }, { "epoch": 0.25608856088560883, "grad_norm": 38.317386627197266, "learning_rate": 2.1340713407134073e-05, "loss": 1.0096, "step": 347 }, { "epoch": 0.2568265682656827, "grad_norm": 36.44529342651367, "learning_rate": 2.140221402214022e-05, "loss": 1.006, "step": 348 }, { "epoch": 0.25756457564575647, "grad_norm": 33.271060943603516, "learning_rate": 2.1463714637146373e-05, "loss": 0.9819, "step": 349 }, { "epoch": 0.25830258302583026, "grad_norm": 35.99654769897461, "learning_rate": 2.1525215252152523e-05, "loss": 1.1038, "step": 350 }, { "epoch": 0.25904059040590405, "grad_norm": 34.73610305786133, "learning_rate": 2.1586715867158673e-05, "loss": 1.065, "step": 351 }, { "epoch": 0.25977859778597784, "grad_norm": 37.899776458740234, "learning_rate": 2.1648216482164822e-05, "loss": 1.1092, "step": 352 }, { "epoch": 0.2605166051660517, "grad_norm": 36.49541473388672, "learning_rate": 2.1709717097170972e-05, "loss": 1.1665, "step": 353 }, { "epoch": 0.2612546125461255, "grad_norm": 35.63615798950195, "learning_rate": 2.1771217712177122e-05, "loss": 1.1201, "step": 354 }, { "epoch": 0.26199261992619927, "grad_norm": 34.21985626220703, "learning_rate": 2.1832718327183275e-05, "loss": 1.0518, "step": 355 }, { "epoch": 0.26273062730627306, "grad_norm": 33.33612823486328, "learning_rate": 2.1894218942189422e-05, "loss": 0.964, "step": 356 }, { "epoch": 0.26346863468634685, "grad_norm": 33.31211471557617, "learning_rate": 2.195571955719557e-05, "loss": 1.0508, "step": 357 }, { "epoch": 0.26420664206642064, "grad_norm": 32.13766860961914, "learning_rate": 2.201722017220172e-05, "loss": 1.1904, "step": 358 }, { "epoch": 0.2649446494464945, "grad_norm": 38.23426818847656, "learning_rate": 2.2078720787207875e-05, "loss": 1.044, "step": 359 }, { "epoch": 0.2656826568265683, "grad_norm": 30.594451904296875, "learning_rate": 2.2140221402214025e-05, "loss": 0.8797, "step": 360 }, { "epoch": 0.26642066420664207, "grad_norm": 33.05818557739258, "learning_rate": 2.220172201722017e-05, "loss": 1.1213, "step": 361 }, { "epoch": 0.26715867158671586, "grad_norm": 31.24005126953125, "learning_rate": 2.2263222632226324e-05, "loss": 1.1148, "step": 362 }, { "epoch": 0.26789667896678965, "grad_norm": 33.34355926513672, "learning_rate": 2.2324723247232474e-05, "loss": 1.0186, "step": 363 }, { "epoch": 0.2686346863468635, "grad_norm": 32.711002349853516, "learning_rate": 2.2386223862238624e-05, "loss": 1.0628, "step": 364 }, { "epoch": 0.2693726937269373, "grad_norm": 31.853166580200195, "learning_rate": 2.2447724477244774e-05, "loss": 1.0366, "step": 365 }, { "epoch": 0.2701107011070111, "grad_norm": 32.53550720214844, "learning_rate": 2.2509225092250924e-05, "loss": 1.076, "step": 366 }, { "epoch": 0.27084870848708487, "grad_norm": 29.53455924987793, "learning_rate": 2.2570725707257074e-05, "loss": 1.0598, "step": 367 }, { "epoch": 0.27158671586715866, "grad_norm": 34.44631576538086, "learning_rate": 2.2632226322263223e-05, "loss": 1.1174, "step": 368 }, { "epoch": 0.27232472324723245, "grad_norm": 33.80080032348633, "learning_rate": 2.2693726937269373e-05, "loss": 1.205, "step": 369 }, { "epoch": 0.2730627306273063, "grad_norm": 33.64272689819336, "learning_rate": 2.2755227552275523e-05, "loss": 1.1677, "step": 370 }, { "epoch": 0.2738007380073801, "grad_norm": 32.4225959777832, "learning_rate": 2.2816728167281673e-05, "loss": 0.9153, "step": 371 }, { "epoch": 0.2745387453874539, "grad_norm": 32.35124969482422, "learning_rate": 2.2878228782287826e-05, "loss": 1.0536, "step": 372 }, { "epoch": 0.27527675276752767, "grad_norm": 32.049827575683594, "learning_rate": 2.2939729397293973e-05, "loss": 1.1493, "step": 373 }, { "epoch": 0.27601476014760146, "grad_norm": 29.892070770263672, "learning_rate": 2.3001230012300123e-05, "loss": 1.0047, "step": 374 }, { "epoch": 0.2767527675276753, "grad_norm": 30.831012725830078, "learning_rate": 2.3062730627306276e-05, "loss": 1.0843, "step": 375 }, { "epoch": 0.2774907749077491, "grad_norm": 31.903175354003906, "learning_rate": 2.3124231242312426e-05, "loss": 1.0552, "step": 376 }, { "epoch": 0.2782287822878229, "grad_norm": 31.119150161743164, "learning_rate": 2.3185731857318575e-05, "loss": 1.0762, "step": 377 }, { "epoch": 0.2789667896678967, "grad_norm": 34.476524353027344, "learning_rate": 2.3247232472324722e-05, "loss": 0.925, "step": 378 }, { "epoch": 0.27970479704797047, "grad_norm": 33.33213806152344, "learning_rate": 2.3308733087330875e-05, "loss": 1.0427, "step": 379 }, { "epoch": 0.28044280442804426, "grad_norm": 30.07733917236328, "learning_rate": 2.3370233702337025e-05, "loss": 1.1158, "step": 380 }, { "epoch": 0.2811808118081181, "grad_norm": 36.79194259643555, "learning_rate": 2.3431734317343175e-05, "loss": 0.969, "step": 381 }, { "epoch": 0.2819188191881919, "grad_norm": 32.193233489990234, "learning_rate": 2.3493234932349325e-05, "loss": 0.938, "step": 382 }, { "epoch": 0.2826568265682657, "grad_norm": 35.39616394042969, "learning_rate": 2.3554735547355475e-05, "loss": 1.0384, "step": 383 }, { "epoch": 0.2833948339483395, "grad_norm": 32.57839584350586, "learning_rate": 2.3616236162361624e-05, "loss": 1.0573, "step": 384 }, { "epoch": 0.28413284132841327, "grad_norm": 34.920528411865234, "learning_rate": 2.3677736777367778e-05, "loss": 0.9427, "step": 385 }, { "epoch": 0.2848708487084871, "grad_norm": 34.9754753112793, "learning_rate": 2.3739237392373924e-05, "loss": 1.0893, "step": 386 }, { "epoch": 0.2856088560885609, "grad_norm": 31.592897415161133, "learning_rate": 2.3800738007380074e-05, "loss": 1.1378, "step": 387 }, { "epoch": 0.2863468634686347, "grad_norm": 32.26739501953125, "learning_rate": 2.3862238622386224e-05, "loss": 1.0627, "step": 388 }, { "epoch": 0.2870848708487085, "grad_norm": 30.732433319091797, "learning_rate": 2.3923739237392377e-05, "loss": 1.0358, "step": 389 }, { "epoch": 0.2878228782287823, "grad_norm": 34.005191802978516, "learning_rate": 2.3985239852398524e-05, "loss": 1.1111, "step": 390 }, { "epoch": 0.28856088560885607, "grad_norm": 30.67894744873047, "learning_rate": 2.4046740467404673e-05, "loss": 0.9718, "step": 391 }, { "epoch": 0.2892988929889299, "grad_norm": 28.351181030273438, "learning_rate": 2.4108241082410827e-05, "loss": 1.0609, "step": 392 }, { "epoch": 0.2900369003690037, "grad_norm": 32.102474212646484, "learning_rate": 2.4169741697416977e-05, "loss": 1.1381, "step": 393 }, { "epoch": 0.2907749077490775, "grad_norm": 33.687625885009766, "learning_rate": 2.4231242312423126e-05, "loss": 1.1188, "step": 394 }, { "epoch": 0.2915129151291513, "grad_norm": 33.333797454833984, "learning_rate": 2.4292742927429276e-05, "loss": 1.1755, "step": 395 }, { "epoch": 0.2922509225092251, "grad_norm": 29.862483978271484, "learning_rate": 2.4354243542435426e-05, "loss": 0.9939, "step": 396 }, { "epoch": 0.29298892988929887, "grad_norm": 34.118682861328125, "learning_rate": 2.4415744157441576e-05, "loss": 1.0769, "step": 397 }, { "epoch": 0.2937269372693727, "grad_norm": 31.04990005493164, "learning_rate": 2.4477244772447726e-05, "loss": 0.9994, "step": 398 }, { "epoch": 0.2944649446494465, "grad_norm": 31.455734252929688, "learning_rate": 2.4538745387453876e-05, "loss": 1.052, "step": 399 }, { "epoch": 0.2952029520295203, "grad_norm": 33.53933334350586, "learning_rate": 2.4600246002460025e-05, "loss": 1.0479, "step": 400 }, { "epoch": 0.2952029520295203, "eval_loss": 1.3168951272964478, "eval_runtime": 307.3734, "eval_samples_per_second": 3.738, "eval_steps_per_second": 0.312, "step": 400 }, { "epoch": 0.2959409594095941, "grad_norm": 30.59261703491211, "learning_rate": 2.4661746617466175e-05, "loss": 1.0978, "step": 401 }, { "epoch": 0.2966789667896679, "grad_norm": 30.34042739868164, "learning_rate": 2.472324723247233e-05, "loss": 0.9811, "step": 402 }, { "epoch": 0.2974169741697417, "grad_norm": 30.172008514404297, "learning_rate": 2.4784747847478475e-05, "loss": 1.1006, "step": 403 }, { "epoch": 0.2981549815498155, "grad_norm": 34.521026611328125, "learning_rate": 2.4846248462484625e-05, "loss": 1.0414, "step": 404 }, { "epoch": 0.2988929889298893, "grad_norm": 32.659603118896484, "learning_rate": 2.4907749077490778e-05, "loss": 1.0581, "step": 405 }, { "epoch": 0.2996309963099631, "grad_norm": 30.84364128112793, "learning_rate": 2.4969249692496928e-05, "loss": 1.0734, "step": 406 }, { "epoch": 0.3003690036900369, "grad_norm": 31.31522560119629, "learning_rate": 2.5030750307503074e-05, "loss": 1.1324, "step": 407 }, { "epoch": 0.3011070110701107, "grad_norm": 30.90158462524414, "learning_rate": 2.5092250922509224e-05, "loss": 1.0875, "step": 408 }, { "epoch": 0.3018450184501845, "grad_norm": 32.63178634643555, "learning_rate": 2.5153751537515374e-05, "loss": 0.947, "step": 409 }, { "epoch": 0.3025830258302583, "grad_norm": 31.25884246826172, "learning_rate": 2.5215252152521527e-05, "loss": 0.9885, "step": 410 }, { "epoch": 0.3033210332103321, "grad_norm": 31.27341651916504, "learning_rate": 2.5276752767527677e-05, "loss": 1.0252, "step": 411 }, { "epoch": 0.3040590405904059, "grad_norm": 32.48451232910156, "learning_rate": 2.5338253382533827e-05, "loss": 0.9561, "step": 412 }, { "epoch": 0.3047970479704797, "grad_norm": 32.380348205566406, "learning_rate": 2.5399753997539977e-05, "loss": 1.0956, "step": 413 }, { "epoch": 0.30553505535055353, "grad_norm": 35.79043960571289, "learning_rate": 2.5461254612546127e-05, "loss": 0.9773, "step": 414 }, { "epoch": 0.3062730627306273, "grad_norm": 32.07080078125, "learning_rate": 2.5522755227552277e-05, "loss": 0.9709, "step": 415 }, { "epoch": 0.3070110701107011, "grad_norm": 30.587440490722656, "learning_rate": 2.5584255842558423e-05, "loss": 1.1602, "step": 416 }, { "epoch": 0.3077490774907749, "grad_norm": 32.147560119628906, "learning_rate": 2.564575645756458e-05, "loss": 1.0212, "step": 417 }, { "epoch": 0.3084870848708487, "grad_norm": 28.960500717163086, "learning_rate": 2.570725707257073e-05, "loss": 1.0724, "step": 418 }, { "epoch": 0.3092250922509225, "grad_norm": 31.89568519592285, "learning_rate": 2.5768757687576876e-05, "loss": 1.0993, "step": 419 }, { "epoch": 0.30996309963099633, "grad_norm": 28.9609317779541, "learning_rate": 2.5830258302583026e-05, "loss": 1.0213, "step": 420 }, { "epoch": 0.3107011070110701, "grad_norm": 32.195152282714844, "learning_rate": 2.5891758917589176e-05, "loss": 1.1312, "step": 421 }, { "epoch": 0.3114391143911439, "grad_norm": 32.34213638305664, "learning_rate": 2.5953259532595326e-05, "loss": 1.1141, "step": 422 }, { "epoch": 0.3121771217712177, "grad_norm": 30.041015625, "learning_rate": 2.6014760147601475e-05, "loss": 1.0912, "step": 423 }, { "epoch": 0.3129151291512915, "grad_norm": 34.097068786621094, "learning_rate": 2.607626076260763e-05, "loss": 1.1144, "step": 424 }, { "epoch": 0.31365313653136534, "grad_norm": 33.118072509765625, "learning_rate": 2.613776137761378e-05, "loss": 1.0424, "step": 425 }, { "epoch": 0.31439114391143913, "grad_norm": 32.24378967285156, "learning_rate": 2.619926199261993e-05, "loss": 1.1218, "step": 426 }, { "epoch": 0.3151291512915129, "grad_norm": 29.910358428955078, "learning_rate": 2.6260762607626078e-05, "loss": 1.0043, "step": 427 }, { "epoch": 0.3158671586715867, "grad_norm": 28.464271545410156, "learning_rate": 2.6322263222632225e-05, "loss": 1.0377, "step": 428 }, { "epoch": 0.3166051660516605, "grad_norm": 33.54305648803711, "learning_rate": 2.6383763837638375e-05, "loss": 1.0836, "step": 429 }, { "epoch": 0.3173431734317343, "grad_norm": 33.36182403564453, "learning_rate": 2.644526445264453e-05, "loss": 0.935, "step": 430 }, { "epoch": 0.31808118081180814, "grad_norm": 30.69318962097168, "learning_rate": 2.650676506765068e-05, "loss": 0.9393, "step": 431 }, { "epoch": 0.31881918819188193, "grad_norm": 31.307289123535156, "learning_rate": 2.6568265682656828e-05, "loss": 1.0578, "step": 432 }, { "epoch": 0.3195571955719557, "grad_norm": 30.9537353515625, "learning_rate": 2.6629766297662977e-05, "loss": 0.978, "step": 433 }, { "epoch": 0.3202952029520295, "grad_norm": 34.1992073059082, "learning_rate": 2.6691266912669127e-05, "loss": 1.109, "step": 434 }, { "epoch": 0.3210332103321033, "grad_norm": 35.864681243896484, "learning_rate": 2.6752767527675277e-05, "loss": 1.0984, "step": 435 }, { "epoch": 0.32177121771217715, "grad_norm": 37.84678649902344, "learning_rate": 2.6814268142681427e-05, "loss": 1.1034, "step": 436 }, { "epoch": 0.32250922509225094, "grad_norm": 32.07746124267578, "learning_rate": 2.687576875768758e-05, "loss": 0.9589, "step": 437 }, { "epoch": 0.32324723247232473, "grad_norm": 30.982397079467773, "learning_rate": 2.693726937269373e-05, "loss": 0.957, "step": 438 }, { "epoch": 0.3239852398523985, "grad_norm": 32.20938491821289, "learning_rate": 2.699876998769988e-05, "loss": 1.0958, "step": 439 }, { "epoch": 0.3247232472324723, "grad_norm": 30.640172958374023, "learning_rate": 2.706027060270603e-05, "loss": 1.0231, "step": 440 }, { "epoch": 0.3254612546125461, "grad_norm": 31.90199851989746, "learning_rate": 2.7121771217712176e-05, "loss": 1.1002, "step": 441 }, { "epoch": 0.32619926199261995, "grad_norm": 30.51987075805664, "learning_rate": 2.7183271832718326e-05, "loss": 1.162, "step": 442 }, { "epoch": 0.32693726937269374, "grad_norm": 31.501314163208008, "learning_rate": 2.7244772447724476e-05, "loss": 1.0607, "step": 443 }, { "epoch": 0.32767527675276753, "grad_norm": 28.6356143951416, "learning_rate": 2.730627306273063e-05, "loss": 0.9796, "step": 444 }, { "epoch": 0.3284132841328413, "grad_norm": 31.74925422668457, "learning_rate": 2.736777367773678e-05, "loss": 1.1158, "step": 445 }, { "epoch": 0.3291512915129151, "grad_norm": 34.154579162597656, "learning_rate": 2.742927429274293e-05, "loss": 1.0649, "step": 446 }, { "epoch": 0.3298892988929889, "grad_norm": 32.25503158569336, "learning_rate": 2.749077490774908e-05, "loss": 1.1914, "step": 447 }, { "epoch": 0.33062730627306275, "grad_norm": 37.06145477294922, "learning_rate": 2.755227552275523e-05, "loss": 1.0985, "step": 448 }, { "epoch": 0.33136531365313654, "grad_norm": 31.48094367980957, "learning_rate": 2.761377613776138e-05, "loss": 1.0892, "step": 449 }, { "epoch": 0.33210332103321033, "grad_norm": 32.612770080566406, "learning_rate": 2.767527675276753e-05, "loss": 1.0109, "step": 450 }, { "epoch": 0.3328413284132841, "grad_norm": 31.58296775817871, "learning_rate": 2.773677736777368e-05, "loss": 0.97, "step": 451 }, { "epoch": 0.3335793357933579, "grad_norm": 34.60434341430664, "learning_rate": 2.779827798277983e-05, "loss": 1.0432, "step": 452 }, { "epoch": 0.33431734317343176, "grad_norm": 34.914894104003906, "learning_rate": 2.7859778597785978e-05, "loss": 1.1001, "step": 453 }, { "epoch": 0.33505535055350555, "grad_norm": 35.59685134887695, "learning_rate": 2.7921279212792128e-05, "loss": 1.2244, "step": 454 }, { "epoch": 0.33579335793357934, "grad_norm": 29.713642120361328, "learning_rate": 2.7982779827798277e-05, "loss": 0.9019, "step": 455 }, { "epoch": 0.33653136531365313, "grad_norm": 31.13001823425293, "learning_rate": 2.8044280442804427e-05, "loss": 1.0366, "step": 456 }, { "epoch": 0.3372693726937269, "grad_norm": 30.281965255737305, "learning_rate": 2.810578105781058e-05, "loss": 1.0273, "step": 457 }, { "epoch": 0.3380073800738007, "grad_norm": 31.66211700439453, "learning_rate": 2.816728167281673e-05, "loss": 1.1194, "step": 458 }, { "epoch": 0.33874538745387456, "grad_norm": 30.275386810302734, "learning_rate": 2.822878228782288e-05, "loss": 1.0575, "step": 459 }, { "epoch": 0.33948339483394835, "grad_norm": 29.42925453186035, "learning_rate": 2.829028290282903e-05, "loss": 0.9656, "step": 460 }, { "epoch": 0.34022140221402214, "grad_norm": 32.71029281616211, "learning_rate": 2.835178351783518e-05, "loss": 1.1847, "step": 461 }, { "epoch": 0.34095940959409593, "grad_norm": 29.633073806762695, "learning_rate": 2.8413284132841326e-05, "loss": 1.0942, "step": 462 }, { "epoch": 0.3416974169741697, "grad_norm": 31.828601837158203, "learning_rate": 2.8474784747847476e-05, "loss": 1.0376, "step": 463 }, { "epoch": 0.34243542435424357, "grad_norm": 30.043981552124023, "learning_rate": 2.8536285362853633e-05, "loss": 1.0835, "step": 464 }, { "epoch": 0.34317343173431736, "grad_norm": 33.54213333129883, "learning_rate": 2.8597785977859783e-05, "loss": 0.996, "step": 465 }, { "epoch": 0.34391143911439115, "grad_norm": 29.244539260864258, "learning_rate": 2.865928659286593e-05, "loss": 1.0677, "step": 466 }, { "epoch": 0.34464944649446494, "grad_norm": 30.86827278137207, "learning_rate": 2.872078720787208e-05, "loss": 0.9887, "step": 467 }, { "epoch": 0.34538745387453873, "grad_norm": 31.78754997253418, "learning_rate": 2.878228782287823e-05, "loss": 0.9915, "step": 468 }, { "epoch": 0.3461254612546125, "grad_norm": 32.79195785522461, "learning_rate": 2.884378843788438e-05, "loss": 1.1147, "step": 469 }, { "epoch": 0.34686346863468637, "grad_norm": 33.397979736328125, "learning_rate": 2.8905289052890532e-05, "loss": 0.9495, "step": 470 }, { "epoch": 0.34760147601476016, "grad_norm": 33.192649841308594, "learning_rate": 2.8966789667896682e-05, "loss": 1.0026, "step": 471 }, { "epoch": 0.34833948339483395, "grad_norm": 32.53486251831055, "learning_rate": 2.9028290282902832e-05, "loss": 1.0896, "step": 472 }, { "epoch": 0.34907749077490774, "grad_norm": 29.988269805908203, "learning_rate": 2.908979089790898e-05, "loss": 1.0286, "step": 473 }, { "epoch": 0.34981549815498153, "grad_norm": 30.389328002929688, "learning_rate": 2.915129151291513e-05, "loss": 1.0617, "step": 474 }, { "epoch": 0.3505535055350554, "grad_norm": 32.341678619384766, "learning_rate": 2.9212792127921278e-05, "loss": 0.9784, "step": 475 }, { "epoch": 0.35129151291512917, "grad_norm": 34.1507453918457, "learning_rate": 2.9274292742927428e-05, "loss": 1.1268, "step": 476 }, { "epoch": 0.35202952029520296, "grad_norm": 30.625898361206055, "learning_rate": 2.9335793357933584e-05, "loss": 1.1621, "step": 477 }, { "epoch": 0.35276752767527675, "grad_norm": 29.35662841796875, "learning_rate": 2.939729397293973e-05, "loss": 0.9967, "step": 478 }, { "epoch": 0.35350553505535054, "grad_norm": 28.236364364624023, "learning_rate": 2.945879458794588e-05, "loss": 1.0189, "step": 479 }, { "epoch": 0.35424354243542433, "grad_norm": 29.935972213745117, "learning_rate": 2.952029520295203e-05, "loss": 1.1403, "step": 480 }, { "epoch": 0.3549815498154982, "grad_norm": 30.732343673706055, "learning_rate": 2.958179581795818e-05, "loss": 1.0329, "step": 481 }, { "epoch": 0.35571955719557197, "grad_norm": 27.611663818359375, "learning_rate": 2.964329643296433e-05, "loss": 0.9701, "step": 482 }, { "epoch": 0.35645756457564576, "grad_norm": 26.146472930908203, "learning_rate": 2.970479704797048e-05, "loss": 1.0555, "step": 483 }, { "epoch": 0.35719557195571955, "grad_norm": 27.38328742980957, "learning_rate": 2.9766297662976633e-05, "loss": 1.0839, "step": 484 }, { "epoch": 0.35793357933579334, "grad_norm": 30.21470832824707, "learning_rate": 2.9827798277982783e-05, "loss": 0.9601, "step": 485 }, { "epoch": 0.3586715867158672, "grad_norm": 33.275665283203125, "learning_rate": 2.9889298892988933e-05, "loss": 0.9648, "step": 486 }, { "epoch": 0.359409594095941, "grad_norm": 32.144935607910156, "learning_rate": 2.995079950799508e-05, "loss": 1.0774, "step": 487 }, { "epoch": 0.36014760147601477, "grad_norm": 33.03762435913086, "learning_rate": 3.001230012300123e-05, "loss": 1.0353, "step": 488 }, { "epoch": 0.36088560885608856, "grad_norm": 29.72600555419922, "learning_rate": 3.007380073800738e-05, "loss": 1.0075, "step": 489 }, { "epoch": 0.36162361623616235, "grad_norm": 31.551420211791992, "learning_rate": 3.0135301353013536e-05, "loss": 1.1612, "step": 490 }, { "epoch": 0.36236162361623614, "grad_norm": 31.255245208740234, "learning_rate": 3.0196801968019682e-05, "loss": 1.1291, "step": 491 }, { "epoch": 0.36309963099631, "grad_norm": 28.523984909057617, "learning_rate": 3.0258302583025832e-05, "loss": 0.8965, "step": 492 }, { "epoch": 0.3638376383763838, "grad_norm": 27.026256561279297, "learning_rate": 3.0319803198031982e-05, "loss": 0.9842, "step": 493 }, { "epoch": 0.36457564575645757, "grad_norm": 27.513683319091797, "learning_rate": 3.0381303813038132e-05, "loss": 1.0663, "step": 494 }, { "epoch": 0.36531365313653136, "grad_norm": 28.917890548706055, "learning_rate": 3.0442804428044282e-05, "loss": 1.0083, "step": 495 }, { "epoch": 0.36605166051660515, "grad_norm": 30.66982650756836, "learning_rate": 3.0504305043050428e-05, "loss": 1.065, "step": 496 }, { "epoch": 0.36678966789667894, "grad_norm": 29.29199981689453, "learning_rate": 3.056580565805658e-05, "loss": 1.1113, "step": 497 }, { "epoch": 0.3675276752767528, "grad_norm": 30.53307342529297, "learning_rate": 3.0627306273062735e-05, "loss": 1.0564, "step": 498 }, { "epoch": 0.3682656826568266, "grad_norm": 27.8240909576416, "learning_rate": 3.068880688806888e-05, "loss": 1.086, "step": 499 }, { "epoch": 0.36900369003690037, "grad_norm": 33.0767936706543, "learning_rate": 3.0750307503075034e-05, "loss": 1.0258, "step": 500 }, { "epoch": 0.36900369003690037, "eval_loss": 1.317694902420044, "eval_runtime": 307.5192, "eval_samples_per_second": 3.736, "eval_steps_per_second": 0.312, "step": 500 }, { "epoch": 0.36974169741697416, "grad_norm": 29.415969848632812, "learning_rate": 3.081180811808118e-05, "loss": 1.1926, "step": 501 }, { "epoch": 0.37047970479704795, "grad_norm": 28.967937469482422, "learning_rate": 3.087330873308733e-05, "loss": 1.0652, "step": 502 }, { "epoch": 0.3712177121771218, "grad_norm": 30.757186889648438, "learning_rate": 3.093480934809348e-05, "loss": 1.0759, "step": 503 }, { "epoch": 0.3719557195571956, "grad_norm": 29.12079429626465, "learning_rate": 3.0996309963099634e-05, "loss": 1.0171, "step": 504 }, { "epoch": 0.3726937269372694, "grad_norm": 27.398155212402344, "learning_rate": 3.105781057810579e-05, "loss": 1.0255, "step": 505 }, { "epoch": 0.37343173431734317, "grad_norm": 30.28290557861328, "learning_rate": 3.1119311193111933e-05, "loss": 1.0215, "step": 506 }, { "epoch": 0.37416974169741696, "grad_norm": 32.874385833740234, "learning_rate": 3.118081180811808e-05, "loss": 0.9185, "step": 507 }, { "epoch": 0.37490774907749075, "grad_norm": 32.606929779052734, "learning_rate": 3.124231242312423e-05, "loss": 0.921, "step": 508 }, { "epoch": 0.3756457564575646, "grad_norm": 32.026466369628906, "learning_rate": 3.130381303813038e-05, "loss": 0.9647, "step": 509 }, { "epoch": 0.3763837638376384, "grad_norm": 28.804256439208984, "learning_rate": 3.136531365313653e-05, "loss": 0.9783, "step": 510 }, { "epoch": 0.3771217712177122, "grad_norm": 33.4760627746582, "learning_rate": 3.1426814268142686e-05, "loss": 1.1102, "step": 511 }, { "epoch": 0.37785977859778597, "grad_norm": 27.7533016204834, "learning_rate": 3.148831488314883e-05, "loss": 1.0607, "step": 512 }, { "epoch": 0.37859778597785976, "grad_norm": 30.21308135986328, "learning_rate": 3.1549815498154986e-05, "loss": 0.9921, "step": 513 }, { "epoch": 0.3793357933579336, "grad_norm": 30.123981475830078, "learning_rate": 3.161131611316113e-05, "loss": 1.0603, "step": 514 }, { "epoch": 0.3800738007380074, "grad_norm": 31.298110961914062, "learning_rate": 3.167281672816728e-05, "loss": 1.0396, "step": 515 }, { "epoch": 0.3808118081180812, "grad_norm": 29.31854248046875, "learning_rate": 3.173431734317343e-05, "loss": 1.0797, "step": 516 }, { "epoch": 0.381549815498155, "grad_norm": 32.191680908203125, "learning_rate": 3.1795817958179585e-05, "loss": 0.9568, "step": 517 }, { "epoch": 0.38228782287822877, "grad_norm": 31.62862777709961, "learning_rate": 3.185731857318573e-05, "loss": 1.1659, "step": 518 }, { "epoch": 0.38302583025830256, "grad_norm": 28.874908447265625, "learning_rate": 3.1918819188191885e-05, "loss": 1.0192, "step": 519 }, { "epoch": 0.3837638376383764, "grad_norm": 28.602893829345703, "learning_rate": 3.198031980319803e-05, "loss": 1.064, "step": 520 }, { "epoch": 0.3845018450184502, "grad_norm": 30.128530502319336, "learning_rate": 3.2041820418204185e-05, "loss": 0.9613, "step": 521 }, { "epoch": 0.385239852398524, "grad_norm": 29.335969924926758, "learning_rate": 3.210332103321033e-05, "loss": 1.0305, "step": 522 }, { "epoch": 0.3859778597785978, "grad_norm": 28.34609031677246, "learning_rate": 3.2164821648216484e-05, "loss": 1.1001, "step": 523 }, { "epoch": 0.38671586715867157, "grad_norm": 29.133621215820312, "learning_rate": 3.222632226322264e-05, "loss": 1.0011, "step": 524 }, { "epoch": 0.3874538745387454, "grad_norm": 29.79188346862793, "learning_rate": 3.2287822878228784e-05, "loss": 0.8858, "step": 525 }, { "epoch": 0.3881918819188192, "grad_norm": 33.12505340576172, "learning_rate": 3.234932349323494e-05, "loss": 1.0749, "step": 526 }, { "epoch": 0.388929889298893, "grad_norm": 28.103736877441406, "learning_rate": 3.2410824108241084e-05, "loss": 1.02, "step": 527 }, { "epoch": 0.3896678966789668, "grad_norm": 29.42950439453125, "learning_rate": 3.247232472324723e-05, "loss": 1.0181, "step": 528 }, { "epoch": 0.3904059040590406, "grad_norm": 28.812963485717773, "learning_rate": 3.2533825338253383e-05, "loss": 1.1254, "step": 529 }, { "epoch": 0.39114391143911437, "grad_norm": 30.136219024658203, "learning_rate": 3.259532595325954e-05, "loss": 1.1222, "step": 530 }, { "epoch": 0.3918819188191882, "grad_norm": 33.467960357666016, "learning_rate": 3.265682656826568e-05, "loss": 1.0028, "step": 531 }, { "epoch": 0.392619926199262, "grad_norm": 32.62849044799805, "learning_rate": 3.2718327183271836e-05, "loss": 1.1019, "step": 532 }, { "epoch": 0.3933579335793358, "grad_norm": 31.51215171813965, "learning_rate": 3.277982779827798e-05, "loss": 1.1408, "step": 533 }, { "epoch": 0.3940959409594096, "grad_norm": 31.761720657348633, "learning_rate": 3.2841328413284136e-05, "loss": 0.9927, "step": 534 }, { "epoch": 0.3948339483394834, "grad_norm": 28.129587173461914, "learning_rate": 3.290282902829028e-05, "loss": 0.9439, "step": 535 }, { "epoch": 0.3955719557195572, "grad_norm": 31.913143157958984, "learning_rate": 3.296432964329643e-05, "loss": 1.0182, "step": 536 }, { "epoch": 0.396309963099631, "grad_norm": 28.858692169189453, "learning_rate": 3.302583025830259e-05, "loss": 1.1423, "step": 537 }, { "epoch": 0.3970479704797048, "grad_norm": 39.564964294433594, "learning_rate": 3.3087330873308736e-05, "loss": 1.0672, "step": 538 }, { "epoch": 0.3977859778597786, "grad_norm": 35.25300216674805, "learning_rate": 3.314883148831489e-05, "loss": 1.0794, "step": 539 }, { "epoch": 0.3985239852398524, "grad_norm": 28.474002838134766, "learning_rate": 3.3210332103321035e-05, "loss": 1.1484, "step": 540 }, { "epoch": 0.3992619926199262, "grad_norm": 33.87021255493164, "learning_rate": 3.327183271832718e-05, "loss": 1.1114, "step": 541 }, { "epoch": 0.4, "grad_norm": 28.42962074279785, "learning_rate": 3.3333333333333335e-05, "loss": 1.0833, "step": 542 }, { "epoch": 0.4007380073800738, "grad_norm": 30.21544075012207, "learning_rate": 3.339483394833948e-05, "loss": 1.1128, "step": 543 }, { "epoch": 0.4014760147601476, "grad_norm": 29.623260498046875, "learning_rate": 3.3456334563345635e-05, "loss": 1.0984, "step": 544 }, { "epoch": 0.4022140221402214, "grad_norm": 34.08790588378906, "learning_rate": 3.351783517835179e-05, "loss": 1.1091, "step": 545 }, { "epoch": 0.4029520295202952, "grad_norm": 33.139915466308594, "learning_rate": 3.3579335793357934e-05, "loss": 1.0295, "step": 546 }, { "epoch": 0.40369003690036903, "grad_norm": 35.7862663269043, "learning_rate": 3.364083640836409e-05, "loss": 1.1139, "step": 547 }, { "epoch": 0.4044280442804428, "grad_norm": 28.253767013549805, "learning_rate": 3.3702337023370234e-05, "loss": 1.0673, "step": 548 }, { "epoch": 0.4051660516605166, "grad_norm": 32.525115966796875, "learning_rate": 3.376383763837638e-05, "loss": 1.0096, "step": 549 }, { "epoch": 0.4059040590405904, "grad_norm": 27.90035057067871, "learning_rate": 3.382533825338254e-05, "loss": 0.9391, "step": 550 }, { "epoch": 0.4066420664206642, "grad_norm": 30.637134552001953, "learning_rate": 3.388683886838869e-05, "loss": 1.187, "step": 551 }, { "epoch": 0.407380073800738, "grad_norm": 29.55883026123047, "learning_rate": 3.3948339483394833e-05, "loss": 1.0611, "step": 552 }, { "epoch": 0.40811808118081183, "grad_norm": 30.938365936279297, "learning_rate": 3.400984009840099e-05, "loss": 1.0229, "step": 553 }, { "epoch": 0.4088560885608856, "grad_norm": 29.01971435546875, "learning_rate": 3.407134071340713e-05, "loss": 1.1979, "step": 554 }, { "epoch": 0.4095940959409594, "grad_norm": 28.88690185546875, "learning_rate": 3.4132841328413286e-05, "loss": 0.9652, "step": 555 }, { "epoch": 0.4103321033210332, "grad_norm": 30.13008689880371, "learning_rate": 3.419434194341943e-05, "loss": 1.023, "step": 556 }, { "epoch": 0.411070110701107, "grad_norm": 30.277244567871094, "learning_rate": 3.4255842558425586e-05, "loss": 1.0419, "step": 557 }, { "epoch": 0.4118081180811808, "grad_norm": 31.770061492919922, "learning_rate": 3.431734317343174e-05, "loss": 1.1385, "step": 558 }, { "epoch": 0.41254612546125463, "grad_norm": 28.85527992248535, "learning_rate": 3.4378843788437886e-05, "loss": 1.0576, "step": 559 }, { "epoch": 0.4132841328413284, "grad_norm": 27.674936294555664, "learning_rate": 3.444034440344404e-05, "loss": 0.9695, "step": 560 }, { "epoch": 0.4140221402214022, "grad_norm": 30.44672203063965, "learning_rate": 3.4501845018450186e-05, "loss": 1.1668, "step": 561 }, { "epoch": 0.414760147601476, "grad_norm": 26.084020614624023, "learning_rate": 3.456334563345633e-05, "loss": 1.0662, "step": 562 }, { "epoch": 0.4154981549815498, "grad_norm": 29.204233169555664, "learning_rate": 3.4624846248462485e-05, "loss": 1.0049, "step": 563 }, { "epoch": 0.41623616236162364, "grad_norm": 31.064088821411133, "learning_rate": 3.468634686346864e-05, "loss": 1.0733, "step": 564 }, { "epoch": 0.41697416974169743, "grad_norm": 28.714794158935547, "learning_rate": 3.4747847478474785e-05, "loss": 1.1252, "step": 565 }, { "epoch": 0.4177121771217712, "grad_norm": 36.692623138427734, "learning_rate": 3.480934809348094e-05, "loss": 1.1517, "step": 566 }, { "epoch": 0.418450184501845, "grad_norm": 29.342973709106445, "learning_rate": 3.4870848708487085e-05, "loss": 1.1617, "step": 567 }, { "epoch": 0.4191881918819188, "grad_norm": 30.187889099121094, "learning_rate": 3.493234932349324e-05, "loss": 1.1766, "step": 568 }, { "epoch": 0.4199261992619926, "grad_norm": 27.71148681640625, "learning_rate": 3.4993849938499384e-05, "loss": 1.1307, "step": 569 }, { "epoch": 0.42066420664206644, "grad_norm": 26.817026138305664, "learning_rate": 3.505535055350554e-05, "loss": 1.1422, "step": 570 }, { "epoch": 0.42140221402214023, "grad_norm": 29.25654411315918, "learning_rate": 3.511685116851169e-05, "loss": 1.0934, "step": 571 }, { "epoch": 0.422140221402214, "grad_norm": 28.460424423217773, "learning_rate": 3.517835178351784e-05, "loss": 1.1308, "step": 572 }, { "epoch": 0.4228782287822878, "grad_norm": 27.779157638549805, "learning_rate": 3.5239852398523984e-05, "loss": 1.0648, "step": 573 }, { "epoch": 0.4236162361623616, "grad_norm": 32.28572082519531, "learning_rate": 3.530135301353014e-05, "loss": 1.0195, "step": 574 }, { "epoch": 0.42435424354243545, "grad_norm": 30.577444076538086, "learning_rate": 3.5362853628536283e-05, "loss": 1.0581, "step": 575 }, { "epoch": 0.42509225092250924, "grad_norm": 27.929576873779297, "learning_rate": 3.542435424354244e-05, "loss": 1.0838, "step": 576 }, { "epoch": 0.42583025830258303, "grad_norm": 30.955745697021484, "learning_rate": 3.548585485854859e-05, "loss": 1.0065, "step": 577 }, { "epoch": 0.4265682656826568, "grad_norm": 30.847639083862305, "learning_rate": 3.5547355473554736e-05, "loss": 1.0464, "step": 578 }, { "epoch": 0.4273062730627306, "grad_norm": 26.83955192565918, "learning_rate": 3.560885608856089e-05, "loss": 1.0382, "step": 579 }, { "epoch": 0.4280442804428044, "grad_norm": 28.2490177154541, "learning_rate": 3.5670356703567036e-05, "loss": 0.9712, "step": 580 }, { "epoch": 0.42878228782287825, "grad_norm": 28.63175392150879, "learning_rate": 3.573185731857319e-05, "loss": 0.9944, "step": 581 }, { "epoch": 0.42952029520295204, "grad_norm": 27.138669967651367, "learning_rate": 3.5793357933579336e-05, "loss": 1.1288, "step": 582 }, { "epoch": 0.43025830258302583, "grad_norm": 28.75208282470703, "learning_rate": 3.585485854858548e-05, "loss": 1.0389, "step": 583 }, { "epoch": 0.4309963099630996, "grad_norm": 29.765209197998047, "learning_rate": 3.591635916359164e-05, "loss": 1.0375, "step": 584 }, { "epoch": 0.4317343173431734, "grad_norm": 31.77211570739746, "learning_rate": 3.597785977859779e-05, "loss": 1.1282, "step": 585 }, { "epoch": 0.43247232472324726, "grad_norm": 28.593671798706055, "learning_rate": 3.6039360393603935e-05, "loss": 1.0487, "step": 586 }, { "epoch": 0.43321033210332105, "grad_norm": 28.624773025512695, "learning_rate": 3.610086100861009e-05, "loss": 1.0686, "step": 587 }, { "epoch": 0.43394833948339484, "grad_norm": 27.676698684692383, "learning_rate": 3.6162361623616235e-05, "loss": 1.1286, "step": 588 }, { "epoch": 0.43468634686346863, "grad_norm": 28.334789276123047, "learning_rate": 3.622386223862239e-05, "loss": 1.0686, "step": 589 }, { "epoch": 0.4354243542435424, "grad_norm": 24.738544464111328, "learning_rate": 3.628536285362854e-05, "loss": 0.8636, "step": 590 }, { "epoch": 0.4361623616236162, "grad_norm": 29.112049102783203, "learning_rate": 3.634686346863469e-05, "loss": 0.9369, "step": 591 }, { "epoch": 0.43690036900369006, "grad_norm": 29.67219352722168, "learning_rate": 3.640836408364084e-05, "loss": 1.0318, "step": 592 }, { "epoch": 0.43763837638376385, "grad_norm": 32.45582580566406, "learning_rate": 3.646986469864699e-05, "loss": 1.0077, "step": 593 }, { "epoch": 0.43837638376383764, "grad_norm": 30.126052856445312, "learning_rate": 3.653136531365314e-05, "loss": 1.1069, "step": 594 }, { "epoch": 0.43911439114391143, "grad_norm": 30.43257713317871, "learning_rate": 3.659286592865929e-05, "loss": 1.0041, "step": 595 }, { "epoch": 0.4398523985239852, "grad_norm": 28.884113311767578, "learning_rate": 3.6654366543665434e-05, "loss": 1.0136, "step": 596 }, { "epoch": 0.44059040590405907, "grad_norm": 28.1043758392334, "learning_rate": 3.6715867158671594e-05, "loss": 1.0784, "step": 597 }, { "epoch": 0.44132841328413286, "grad_norm": 29.222322463989258, "learning_rate": 3.677736777367774e-05, "loss": 0.994, "step": 598 }, { "epoch": 0.44206642066420665, "grad_norm": 31.78004264831543, "learning_rate": 3.683886838868389e-05, "loss": 0.9989, "step": 599 }, { "epoch": 0.44280442804428044, "grad_norm": 26.486068725585938, "learning_rate": 3.690036900369004e-05, "loss": 1.0936, "step": 600 }, { "epoch": 0.44280442804428044, "eval_loss": 1.3267521858215332, "eval_runtime": 309.179, "eval_samples_per_second": 3.716, "eval_steps_per_second": 0.31, "step": 600 }, { "epoch": 0.44354243542435423, "grad_norm": 28.453187942504883, "learning_rate": 3.6961869618696186e-05, "loss": 1.0252, "step": 601 }, { "epoch": 0.444280442804428, "grad_norm": 30.434410095214844, "learning_rate": 3.702337023370234e-05, "loss": 1.1192, "step": 602 }, { "epoch": 0.44501845018450187, "grad_norm": 28.11585807800293, "learning_rate": 3.7084870848708486e-05, "loss": 0.9912, "step": 603 }, { "epoch": 0.44575645756457566, "grad_norm": 32.852027893066406, "learning_rate": 3.714637146371464e-05, "loss": 0.9976, "step": 604 }, { "epoch": 0.44649446494464945, "grad_norm": 26.785593032836914, "learning_rate": 3.720787207872079e-05, "loss": 1.0799, "step": 605 }, { "epoch": 0.44723247232472324, "grad_norm": 28.873849868774414, "learning_rate": 3.726937269372694e-05, "loss": 1.0319, "step": 606 }, { "epoch": 0.44797047970479703, "grad_norm": 31.951059341430664, "learning_rate": 3.7330873308733085e-05, "loss": 1.0665, "step": 607 }, { "epoch": 0.4487084870848708, "grad_norm": 26.902822494506836, "learning_rate": 3.739237392373924e-05, "loss": 1.0973, "step": 608 }, { "epoch": 0.44944649446494467, "grad_norm": 31.43962287902832, "learning_rate": 3.7453874538745385e-05, "loss": 0.997, "step": 609 }, { "epoch": 0.45018450184501846, "grad_norm": 28.310514450073242, "learning_rate": 3.7515375153751545e-05, "loss": 1.1017, "step": 610 }, { "epoch": 0.45092250922509225, "grad_norm": 26.364179611206055, "learning_rate": 3.757687576875769e-05, "loss": 1.087, "step": 611 }, { "epoch": 0.45166051660516604, "grad_norm": 26.653833389282227, "learning_rate": 3.763837638376384e-05, "loss": 1.0061, "step": 612 }, { "epoch": 0.45239852398523983, "grad_norm": 30.07135581970215, "learning_rate": 3.769987699876999e-05, "loss": 1.0957, "step": 613 }, { "epoch": 0.4531365313653137, "grad_norm": 27.822776794433594, "learning_rate": 3.776137761377614e-05, "loss": 1.0992, "step": 614 }, { "epoch": 0.45387453874538747, "grad_norm": 31.2148494720459, "learning_rate": 3.782287822878229e-05, "loss": 0.9902, "step": 615 }, { "epoch": 0.45461254612546126, "grad_norm": 34.85270309448242, "learning_rate": 3.788437884378844e-05, "loss": 1.1163, "step": 616 }, { "epoch": 0.45535055350553505, "grad_norm": 27.64411735534668, "learning_rate": 3.794587945879459e-05, "loss": 1.1273, "step": 617 }, { "epoch": 0.45608856088560884, "grad_norm": 28.515451431274414, "learning_rate": 3.8007380073800744e-05, "loss": 1.0642, "step": 618 }, { "epoch": 0.45682656826568263, "grad_norm": 34.522491455078125, "learning_rate": 3.806888068880689e-05, "loss": 0.9994, "step": 619 }, { "epoch": 0.4575645756457565, "grad_norm": 30.255014419555664, "learning_rate": 3.813038130381304e-05, "loss": 1.091, "step": 620 }, { "epoch": 0.45830258302583027, "grad_norm": 30.578969955444336, "learning_rate": 3.819188191881919e-05, "loss": 1.1066, "step": 621 }, { "epoch": 0.45904059040590406, "grad_norm": 27.243410110473633, "learning_rate": 3.825338253382534e-05, "loss": 1.1007, "step": 622 }, { "epoch": 0.45977859778597785, "grad_norm": 29.49376678466797, "learning_rate": 3.831488314883149e-05, "loss": 1.1645, "step": 623 }, { "epoch": 0.46051660516605164, "grad_norm": 30.315433502197266, "learning_rate": 3.837638376383764e-05, "loss": 1.0911, "step": 624 }, { "epoch": 0.4612546125461255, "grad_norm": 31.19307518005371, "learning_rate": 3.843788437884379e-05, "loss": 1.1693, "step": 625 }, { "epoch": 0.4619926199261993, "grad_norm": 27.844942092895508, "learning_rate": 3.849938499384994e-05, "loss": 1.0592, "step": 626 }, { "epoch": 0.46273062730627307, "grad_norm": 29.83812141418457, "learning_rate": 3.856088560885609e-05, "loss": 1.0263, "step": 627 }, { "epoch": 0.46346863468634686, "grad_norm": 27.992292404174805, "learning_rate": 3.862238622386224e-05, "loss": 1.0808, "step": 628 }, { "epoch": 0.46420664206642065, "grad_norm": 27.693565368652344, "learning_rate": 3.868388683886839e-05, "loss": 0.9699, "step": 629 }, { "epoch": 0.46494464944649444, "grad_norm": 29.0965633392334, "learning_rate": 3.874538745387454e-05, "loss": 1.1051, "step": 630 }, { "epoch": 0.4656826568265683, "grad_norm": 29.10242462158203, "learning_rate": 3.8806888068880695e-05, "loss": 1.0039, "step": 631 }, { "epoch": 0.4664206642066421, "grad_norm": 32.43134307861328, "learning_rate": 3.886838868388684e-05, "loss": 1.064, "step": 632 }, { "epoch": 0.46715867158671587, "grad_norm": 29.64716148376465, "learning_rate": 3.892988929889299e-05, "loss": 1.0935, "step": 633 }, { "epoch": 0.46789667896678966, "grad_norm": 29.36592674255371, "learning_rate": 3.899138991389914e-05, "loss": 1.0937, "step": 634 }, { "epoch": 0.46863468634686345, "grad_norm": 28.95639991760254, "learning_rate": 3.905289052890529e-05, "loss": 1.0853, "step": 635 }, { "epoch": 0.4693726937269373, "grad_norm": 29.89202308654785, "learning_rate": 3.911439114391144e-05, "loss": 1.0811, "step": 636 }, { "epoch": 0.4701107011070111, "grad_norm": 29.48238754272461, "learning_rate": 3.9175891758917595e-05, "loss": 1.1506, "step": 637 }, { "epoch": 0.4708487084870849, "grad_norm": 28.27334213256836, "learning_rate": 3.923739237392374e-05, "loss": 1.1197, "step": 638 }, { "epoch": 0.47158671586715867, "grad_norm": 28.055349349975586, "learning_rate": 3.9298892988929894e-05, "loss": 1.0456, "step": 639 }, { "epoch": 0.47232472324723246, "grad_norm": 27.234216690063477, "learning_rate": 3.936039360393604e-05, "loss": 0.9234, "step": 640 }, { "epoch": 0.47306273062730625, "grad_norm": 28.06637191772461, "learning_rate": 3.942189421894219e-05, "loss": 1.0727, "step": 641 }, { "epoch": 0.4738007380073801, "grad_norm": 32.17995834350586, "learning_rate": 3.948339483394834e-05, "loss": 0.9436, "step": 642 }, { "epoch": 0.4745387453874539, "grad_norm": 34.09589767456055, "learning_rate": 3.954489544895449e-05, "loss": 1.1217, "step": 643 }, { "epoch": 0.4752767527675277, "grad_norm": 28.410308837890625, "learning_rate": 3.960639606396064e-05, "loss": 1.2026, "step": 644 }, { "epoch": 0.47601476014760147, "grad_norm": 30.437602996826172, "learning_rate": 3.9667896678966793e-05, "loss": 1.197, "step": 645 }, { "epoch": 0.47675276752767526, "grad_norm": 25.85258674621582, "learning_rate": 3.972939729397294e-05, "loss": 0.9193, "step": 646 }, { "epoch": 0.4774907749077491, "grad_norm": 30.591075897216797, "learning_rate": 3.979089790897909e-05, "loss": 1.0067, "step": 647 }, { "epoch": 0.4782287822878229, "grad_norm": 35.56831741333008, "learning_rate": 3.985239852398524e-05, "loss": 1.0091, "step": 648 }, { "epoch": 0.4789667896678967, "grad_norm": 28.925878524780273, "learning_rate": 3.991389913899139e-05, "loss": 1.0451, "step": 649 }, { "epoch": 0.4797047970479705, "grad_norm": 26.45174789428711, "learning_rate": 3.9975399753997546e-05, "loss": 1.1113, "step": 650 }, { "epoch": 0.48044280442804427, "grad_norm": 32.575260162353516, "learning_rate": 4.003690036900369e-05, "loss": 1.1129, "step": 651 }, { "epoch": 0.48118081180811806, "grad_norm": 31.939918518066406, "learning_rate": 4.0098400984009846e-05, "loss": 1.0175, "step": 652 }, { "epoch": 0.4819188191881919, "grad_norm": 72.9084701538086, "learning_rate": 4.015990159901599e-05, "loss": 0.966, "step": 653 }, { "epoch": 0.4826568265682657, "grad_norm": 32.10757827758789, "learning_rate": 4.022140221402214e-05, "loss": 1.124, "step": 654 }, { "epoch": 0.4833948339483395, "grad_norm": 35.528778076171875, "learning_rate": 4.028290282902829e-05, "loss": 1.133, "step": 655 }, { "epoch": 0.4841328413284133, "grad_norm": 29.31783676147461, "learning_rate": 4.034440344403444e-05, "loss": 1.0159, "step": 656 }, { "epoch": 0.48487084870848707, "grad_norm": 28.90894889831543, "learning_rate": 4.040590405904059e-05, "loss": 1.0186, "step": 657 }, { "epoch": 0.48560885608856086, "grad_norm": 27.08976173400879, "learning_rate": 4.0467404674046745e-05, "loss": 0.8565, "step": 658 }, { "epoch": 0.4863468634686347, "grad_norm": 32.08723831176758, "learning_rate": 4.052890528905289e-05, "loss": 0.9848, "step": 659 }, { "epoch": 0.4870848708487085, "grad_norm": 31.9980525970459, "learning_rate": 4.0590405904059045e-05, "loss": 1.2406, "step": 660 }, { "epoch": 0.4878228782287823, "grad_norm": 27.090219497680664, "learning_rate": 4.065190651906519e-05, "loss": 1.2235, "step": 661 }, { "epoch": 0.4885608856088561, "grad_norm": 42.83357620239258, "learning_rate": 4.071340713407134e-05, "loss": 1.1278, "step": 662 }, { "epoch": 0.48929889298892987, "grad_norm": 28.690671920776367, "learning_rate": 4.077490774907749e-05, "loss": 1.19, "step": 663 }, { "epoch": 0.4900369003690037, "grad_norm": 32.07972717285156, "learning_rate": 4.0836408364083644e-05, "loss": 1.1053, "step": 664 }, { "epoch": 0.4907749077490775, "grad_norm": 29.517995834350586, "learning_rate": 4.08979089790898e-05, "loss": 0.9924, "step": 665 }, { "epoch": 0.4915129151291513, "grad_norm": 36.88546371459961, "learning_rate": 4.0959409594095944e-05, "loss": 1.097, "step": 666 }, { "epoch": 0.4922509225092251, "grad_norm": 27.41716957092285, "learning_rate": 4.102091020910209e-05, "loss": 1.1743, "step": 667 }, { "epoch": 0.4929889298892989, "grad_norm": 36.04215621948242, "learning_rate": 4.108241082410824e-05, "loss": 1.0724, "step": 668 }, { "epoch": 0.49372693726937267, "grad_norm": 31.058218002319336, "learning_rate": 4.114391143911439e-05, "loss": 1.2224, "step": 669 }, { "epoch": 0.4944649446494465, "grad_norm": 30.21110725402832, "learning_rate": 4.120541205412054e-05, "loss": 1.0559, "step": 670 }, { "epoch": 0.4952029520295203, "grad_norm": 317.8634033203125, "learning_rate": 4.1266912669126696e-05, "loss": 1.1668, "step": 671 }, { "epoch": 0.4959409594095941, "grad_norm": 30.09259605407715, "learning_rate": 4.132841328413284e-05, "loss": 1.107, "step": 672 }, { "epoch": 0.4966789667896679, "grad_norm": 30.432334899902344, "learning_rate": 4.1389913899138996e-05, "loss": 1.0314, "step": 673 }, { "epoch": 0.4974169741697417, "grad_norm": 29.147876739501953, "learning_rate": 4.145141451414514e-05, "loss": 1.1293, "step": 674 }, { "epoch": 0.4981549815498155, "grad_norm": 28.299036026000977, "learning_rate": 4.151291512915129e-05, "loss": 0.9126, "step": 675 }, { "epoch": 0.4988929889298893, "grad_norm": 27.47956085205078, "learning_rate": 4.157441574415744e-05, "loss": 1.1542, "step": 676 }, { "epoch": 0.4996309963099631, "grad_norm": 31.645191192626953, "learning_rate": 4.1635916359163595e-05, "loss": 1.0069, "step": 677 }, { "epoch": 0.5003690036900369, "grad_norm": 28.30335235595703, "learning_rate": 4.169741697416974e-05, "loss": 1.1189, "step": 678 }, { "epoch": 0.5011070110701107, "grad_norm": 28.922136306762695, "learning_rate": 4.1758917589175895e-05, "loss": 1.0529, "step": 679 }, { "epoch": 0.5018450184501845, "grad_norm": 29.070533752441406, "learning_rate": 4.182041820418204e-05, "loss": 1.0434, "step": 680 }, { "epoch": 0.5025830258302583, "grad_norm": 34.41718292236328, "learning_rate": 4.1881918819188195e-05, "loss": 1.0136, "step": 681 }, { "epoch": 0.5033210332103321, "grad_norm": 30.644197463989258, "learning_rate": 4.194341943419434e-05, "loss": 1.2139, "step": 682 }, { "epoch": 0.5040590405904058, "grad_norm": 31.38071060180664, "learning_rate": 4.2004920049200495e-05, "loss": 1.0473, "step": 683 }, { "epoch": 0.5047970479704798, "grad_norm": 28.35428237915039, "learning_rate": 4.206642066420665e-05, "loss": 1.1185, "step": 684 }, { "epoch": 0.5055350553505535, "grad_norm": 30.84862518310547, "learning_rate": 4.2127921279212794e-05, "loss": 1.0605, "step": 685 }, { "epoch": 0.5062730627306273, "grad_norm": 28.12001609802246, "learning_rate": 4.218942189421895e-05, "loss": 1.0421, "step": 686 }, { "epoch": 0.5070110701107011, "grad_norm": 94.46589660644531, "learning_rate": 4.2250922509225094e-05, "loss": 1.1904, "step": 687 }, { "epoch": 0.5077490774907749, "grad_norm": 28.075532913208008, "learning_rate": 4.231242312423124e-05, "loss": 0.9612, "step": 688 }, { "epoch": 0.5084870848708487, "grad_norm": 33.0609245300293, "learning_rate": 4.2373923739237394e-05, "loss": 1.1283, "step": 689 }, { "epoch": 0.5092250922509225, "grad_norm": 31.729276657104492, "learning_rate": 4.243542435424355e-05, "loss": 1.1253, "step": 690 }, { "epoch": 0.5099630996309963, "grad_norm": 29.362197875976562, "learning_rate": 4.249692496924969e-05, "loss": 1.0577, "step": 691 }, { "epoch": 0.5107011070110701, "grad_norm": 27.433551788330078, "learning_rate": 4.2558425584255847e-05, "loss": 0.9175, "step": 692 }, { "epoch": 0.5114391143911439, "grad_norm": 28.477914810180664, "learning_rate": 4.261992619926199e-05, "loss": 1.1097, "step": 693 }, { "epoch": 0.5121771217712177, "grad_norm": 26.180309295654297, "learning_rate": 4.2681426814268146e-05, "loss": 0.9585, "step": 694 }, { "epoch": 0.5129151291512916, "grad_norm": 28.950037002563477, "learning_rate": 4.274292742927429e-05, "loss": 0.9945, "step": 695 }, { "epoch": 0.5136531365313654, "grad_norm": 33.97092819213867, "learning_rate": 4.280442804428044e-05, "loss": 1.1573, "step": 696 }, { "epoch": 0.5143911439114391, "grad_norm": 31.86573600769043, "learning_rate": 4.28659286592866e-05, "loss": 1.1061, "step": 697 }, { "epoch": 0.5151291512915129, "grad_norm": 35.693443298339844, "learning_rate": 4.2927429274292746e-05, "loss": 1.1284, "step": 698 }, { "epoch": 0.5158671586715867, "grad_norm": 29.409988403320312, "learning_rate": 4.29889298892989e-05, "loss": 1.2092, "step": 699 }, { "epoch": 0.5166051660516605, "grad_norm": 28.83966636657715, "learning_rate": 4.3050430504305045e-05, "loss": 0.9437, "step": 700 }, { "epoch": 0.5166051660516605, "eval_loss": 1.3298412561416626, "eval_runtime": 307.823, "eval_samples_per_second": 3.733, "eval_steps_per_second": 0.312, "step": 700 }, { "epoch": 0.5173431734317343, "grad_norm": 26.844846725463867, "learning_rate": 4.311193111931119e-05, "loss": 1.112, "step": 701 }, { "epoch": 0.5180811808118081, "grad_norm": 28.658428192138672, "learning_rate": 4.3173431734317345e-05, "loss": 0.9918, "step": 702 }, { "epoch": 0.5188191881918819, "grad_norm": 32.5452995300293, "learning_rate": 4.323493234932349e-05, "loss": 1.0447, "step": 703 }, { "epoch": 0.5195571955719557, "grad_norm": 26.970304489135742, "learning_rate": 4.3296432964329645e-05, "loss": 1.1313, "step": 704 }, { "epoch": 0.5202952029520295, "grad_norm": 28.920679092407227, "learning_rate": 4.33579335793358e-05, "loss": 1.0252, "step": 705 }, { "epoch": 0.5210332103321034, "grad_norm": 40.62504959106445, "learning_rate": 4.3419434194341945e-05, "loss": 1.1523, "step": 706 }, { "epoch": 0.5217712177121772, "grad_norm": 30.851390838623047, "learning_rate": 4.34809348093481e-05, "loss": 0.9797, "step": 707 }, { "epoch": 0.522509225092251, "grad_norm": 27.900365829467773, "learning_rate": 4.3542435424354244e-05, "loss": 1.0221, "step": 708 }, { "epoch": 0.5232472324723247, "grad_norm": 26.0831356048584, "learning_rate": 4.360393603936039e-05, "loss": 1.0887, "step": 709 }, { "epoch": 0.5239852398523985, "grad_norm": 29.75108528137207, "learning_rate": 4.366543665436655e-05, "loss": 0.9471, "step": 710 }, { "epoch": 0.5247232472324723, "grad_norm": 31.546483993530273, "learning_rate": 4.37269372693727e-05, "loss": 1.1382, "step": 711 }, { "epoch": 0.5254612546125461, "grad_norm": 27.857818603515625, "learning_rate": 4.3788437884378844e-05, "loss": 1.1366, "step": 712 }, { "epoch": 0.5261992619926199, "grad_norm": 26.583192825317383, "learning_rate": 4.3849938499385e-05, "loss": 1.0916, "step": 713 }, { "epoch": 0.5269372693726937, "grad_norm": 30.150146484375, "learning_rate": 4.391143911439114e-05, "loss": 1.0575, "step": 714 }, { "epoch": 0.5276752767527675, "grad_norm": 27.24560546875, "learning_rate": 4.3972939729397297e-05, "loss": 1.0683, "step": 715 }, { "epoch": 0.5284132841328413, "grad_norm": 29.45226287841797, "learning_rate": 4.403444034440344e-05, "loss": 1.1279, "step": 716 }, { "epoch": 0.5291512915129152, "grad_norm": 28.790172576904297, "learning_rate": 4.4095940959409596e-05, "loss": 1.1934, "step": 717 }, { "epoch": 0.529889298892989, "grad_norm": 42.536705017089844, "learning_rate": 4.415744157441575e-05, "loss": 1.061, "step": 718 }, { "epoch": 0.5306273062730628, "grad_norm": 28.66362953186035, "learning_rate": 4.4218942189421896e-05, "loss": 1.0786, "step": 719 }, { "epoch": 0.5313653136531366, "grad_norm": 25.908044815063477, "learning_rate": 4.428044280442805e-05, "loss": 1.0297, "step": 720 }, { "epoch": 0.5321033210332103, "grad_norm": 28.063125610351562, "learning_rate": 4.4341943419434196e-05, "loss": 0.9617, "step": 721 }, { "epoch": 0.5328413284132841, "grad_norm": 27.69817352294922, "learning_rate": 4.440344403444034e-05, "loss": 1.1288, "step": 722 }, { "epoch": 0.5335793357933579, "grad_norm": 30.366674423217773, "learning_rate": 4.4464944649446495e-05, "loss": 1.1191, "step": 723 }, { "epoch": 0.5343173431734317, "grad_norm": 30.783306121826172, "learning_rate": 4.452644526445265e-05, "loss": 1.1135, "step": 724 }, { "epoch": 0.5350553505535055, "grad_norm": 28.302270889282227, "learning_rate": 4.4587945879458795e-05, "loss": 1.1435, "step": 725 }, { "epoch": 0.5357933579335793, "grad_norm": 28.51706314086914, "learning_rate": 4.464944649446495e-05, "loss": 1.1788, "step": 726 }, { "epoch": 0.5365313653136531, "grad_norm": 31.32042121887207, "learning_rate": 4.4710947109471095e-05, "loss": 1.1667, "step": 727 }, { "epoch": 0.537269372693727, "grad_norm": 28.812145233154297, "learning_rate": 4.477244772447725e-05, "loss": 1.0405, "step": 728 }, { "epoch": 0.5380073800738008, "grad_norm": 26.23000717163086, "learning_rate": 4.4833948339483395e-05, "loss": 1.0401, "step": 729 }, { "epoch": 0.5387453874538746, "grad_norm": 81.6714859008789, "learning_rate": 4.489544895448955e-05, "loss": 1.1202, "step": 730 }, { "epoch": 0.5394833948339484, "grad_norm": 27.881044387817383, "learning_rate": 4.49569495694957e-05, "loss": 1.0516, "step": 731 }, { "epoch": 0.5402214022140222, "grad_norm": 29.472396850585938, "learning_rate": 4.501845018450185e-05, "loss": 1.1779, "step": 732 }, { "epoch": 0.5409594095940959, "grad_norm": 28.200910568237305, "learning_rate": 4.5079950799507994e-05, "loss": 1.018, "step": 733 }, { "epoch": 0.5416974169741697, "grad_norm": 28.53663444519043, "learning_rate": 4.514145141451415e-05, "loss": 1.1831, "step": 734 }, { "epoch": 0.5424354243542435, "grad_norm": 36.12836837768555, "learning_rate": 4.5202952029520294e-05, "loss": 1.1069, "step": 735 }, { "epoch": 0.5431734317343173, "grad_norm": 29.165285110473633, "learning_rate": 4.526445264452645e-05, "loss": 0.9832, "step": 736 }, { "epoch": 0.5439114391143911, "grad_norm": 27.385562896728516, "learning_rate": 4.53259532595326e-05, "loss": 1.0543, "step": 737 }, { "epoch": 0.5446494464944649, "grad_norm": 32.897945404052734, "learning_rate": 4.5387453874538747e-05, "loss": 1.1396, "step": 738 }, { "epoch": 0.5453874538745388, "grad_norm": 29.424503326416016, "learning_rate": 4.54489544895449e-05, "loss": 1.0489, "step": 739 }, { "epoch": 0.5461254612546126, "grad_norm": 31.19598960876465, "learning_rate": 4.5510455104551046e-05, "loss": 1.2388, "step": 740 }, { "epoch": 0.5468634686346864, "grad_norm": 28.53763198852539, "learning_rate": 4.55719557195572e-05, "loss": 1.0937, "step": 741 }, { "epoch": 0.5476014760147602, "grad_norm": 29.64959716796875, "learning_rate": 4.5633456334563346e-05, "loss": 1.0849, "step": 742 }, { "epoch": 0.548339483394834, "grad_norm": 27.357303619384766, "learning_rate": 4.569495694956949e-05, "loss": 1.1383, "step": 743 }, { "epoch": 0.5490774907749078, "grad_norm": 27.413957595825195, "learning_rate": 4.575645756457565e-05, "loss": 1.079, "step": 744 }, { "epoch": 0.5498154981549815, "grad_norm": 29.784135818481445, "learning_rate": 4.58179581795818e-05, "loss": 0.9767, "step": 745 }, { "epoch": 0.5505535055350553, "grad_norm": 55.847591400146484, "learning_rate": 4.5879458794587945e-05, "loss": 1.0553, "step": 746 }, { "epoch": 0.5512915129151291, "grad_norm": 320.68597412109375, "learning_rate": 4.59409594095941e-05, "loss": 1.1933, "step": 747 }, { "epoch": 0.5520295202952029, "grad_norm": 26.938758850097656, "learning_rate": 4.6002460024600245e-05, "loss": 1.0217, "step": 748 }, { "epoch": 0.5527675276752767, "grad_norm": 32.755672454833984, "learning_rate": 4.60639606396064e-05, "loss": 1.099, "step": 749 }, { "epoch": 0.5535055350553506, "grad_norm": 30.825178146362305, "learning_rate": 4.612546125461255e-05, "loss": 1.1831, "step": 750 }, { "epoch": 0.5542435424354244, "grad_norm": 26.865983963012695, "learning_rate": 4.61869618696187e-05, "loss": 1.172, "step": 751 }, { "epoch": 0.5549815498154982, "grad_norm": 27.207359313964844, "learning_rate": 4.624846248462485e-05, "loss": 1.1431, "step": 752 }, { "epoch": 0.555719557195572, "grad_norm": 31.474943161010742, "learning_rate": 4.6309963099631e-05, "loss": 1.0282, "step": 753 }, { "epoch": 0.5564575645756458, "grad_norm": 31.235960006713867, "learning_rate": 4.637146371463715e-05, "loss": 1.0688, "step": 754 }, { "epoch": 0.5571955719557196, "grad_norm": 26.043094635009766, "learning_rate": 4.64329643296433e-05, "loss": 1.0858, "step": 755 }, { "epoch": 0.5579335793357934, "grad_norm": 28.13475227355957, "learning_rate": 4.6494464944649444e-05, "loss": 0.9878, "step": 756 }, { "epoch": 0.5586715867158671, "grad_norm": 28.513853073120117, "learning_rate": 4.6555965559655604e-05, "loss": 0.9452, "step": 757 }, { "epoch": 0.5594095940959409, "grad_norm": 28.906461715698242, "learning_rate": 4.661746617466175e-05, "loss": 0.9423, "step": 758 }, { "epoch": 0.5601476014760147, "grad_norm": 33.28678894042969, "learning_rate": 4.66789667896679e-05, "loss": 1.1832, "step": 759 }, { "epoch": 0.5608856088560885, "grad_norm": 29.69910430908203, "learning_rate": 4.674046740467405e-05, "loss": 1.0487, "step": 760 }, { "epoch": 0.5616236162361624, "grad_norm": 81.67484283447266, "learning_rate": 4.6801968019680197e-05, "loss": 1.0789, "step": 761 }, { "epoch": 0.5623616236162362, "grad_norm": 32.282474517822266, "learning_rate": 4.686346863468635e-05, "loss": 1.0681, "step": 762 }, { "epoch": 0.56309963099631, "grad_norm": 28.49372673034668, "learning_rate": 4.6924969249692496e-05, "loss": 1.2814, "step": 763 }, { "epoch": 0.5638376383763838, "grad_norm": 33.509033203125, "learning_rate": 4.698646986469865e-05, "loss": 1.056, "step": 764 }, { "epoch": 0.5645756457564576, "grad_norm": 31.451663970947266, "learning_rate": 4.70479704797048e-05, "loss": 1.1701, "step": 765 }, { "epoch": 0.5653136531365314, "grad_norm": 28.21207618713379, "learning_rate": 4.710947109471095e-05, "loss": 1.1759, "step": 766 }, { "epoch": 0.5660516605166052, "grad_norm": 25.11651611328125, "learning_rate": 4.7170971709717096e-05, "loss": 1.0304, "step": 767 }, { "epoch": 0.566789667896679, "grad_norm": 26.841819763183594, "learning_rate": 4.723247232472325e-05, "loss": 1.035, "step": 768 }, { "epoch": 0.5675276752767527, "grad_norm": 26.381568908691406, "learning_rate": 4.7293972939729395e-05, "loss": 1.1387, "step": 769 }, { "epoch": 0.5682656826568265, "grad_norm": 29.644023895263672, "learning_rate": 4.7355473554735555e-05, "loss": 1.0994, "step": 770 }, { "epoch": 0.5690036900369003, "grad_norm": 31.37369728088379, "learning_rate": 4.74169741697417e-05, "loss": 1.124, "step": 771 }, { "epoch": 0.5697416974169742, "grad_norm": 29.403026580810547, "learning_rate": 4.747847478474785e-05, "loss": 1.0507, "step": 772 }, { "epoch": 0.570479704797048, "grad_norm": 28.384349822998047, "learning_rate": 4.7539975399754e-05, "loss": 1.0764, "step": 773 }, { "epoch": 0.5712177121771218, "grad_norm": 67.28231811523438, "learning_rate": 4.760147601476015e-05, "loss": 1.0503, "step": 774 }, { "epoch": 0.5719557195571956, "grad_norm": 29.146886825561523, "learning_rate": 4.76629766297663e-05, "loss": 1.0136, "step": 775 }, { "epoch": 0.5726937269372694, "grad_norm": 60.2903938293457, "learning_rate": 4.772447724477245e-05, "loss": 1.0508, "step": 776 }, { "epoch": 0.5734317343173432, "grad_norm": 28.743024826049805, "learning_rate": 4.77859778597786e-05, "loss": 1.0318, "step": 777 }, { "epoch": 0.574169741697417, "grad_norm": 30.6608943939209, "learning_rate": 4.7847478474784754e-05, "loss": 1.09, "step": 778 }, { "epoch": 0.5749077490774908, "grad_norm": 56.827152252197266, "learning_rate": 4.79089790897909e-05, "loss": 0.9897, "step": 779 }, { "epoch": 0.5756457564575646, "grad_norm": 32.71049499511719, "learning_rate": 4.797047970479705e-05, "loss": 1.1841, "step": 780 }, { "epoch": 0.5763837638376383, "grad_norm": 29.06208038330078, "learning_rate": 4.80319803198032e-05, "loss": 1.2583, "step": 781 }, { "epoch": 0.5771217712177121, "grad_norm": 26.83561897277832, "learning_rate": 4.809348093480935e-05, "loss": 1.07, "step": 782 }, { "epoch": 0.5778597785977859, "grad_norm": 28.882770538330078, "learning_rate": 4.81549815498155e-05, "loss": 1.1521, "step": 783 }, { "epoch": 0.5785977859778598, "grad_norm": 90.4433822631836, "learning_rate": 4.821648216482165e-05, "loss": 1.0745, "step": 784 }, { "epoch": 0.5793357933579336, "grad_norm": 30.003938674926758, "learning_rate": 4.82779827798278e-05, "loss": 1.1911, "step": 785 }, { "epoch": 0.5800738007380074, "grad_norm": 63.82630920410156, "learning_rate": 4.833948339483395e-05, "loss": 1.1678, "step": 786 }, { "epoch": 0.5808118081180812, "grad_norm": 238.10055541992188, "learning_rate": 4.84009840098401e-05, "loss": 1.3465, "step": 787 }, { "epoch": 0.581549815498155, "grad_norm": 311.88134765625, "learning_rate": 4.846248462484625e-05, "loss": 2.3681, "step": 788 }, { "epoch": 0.5822878228782288, "grad_norm": 79.10831451416016, "learning_rate": 4.85239852398524e-05, "loss": 1.3595, "step": 789 }, { "epoch": 0.5830258302583026, "grad_norm": 195.71676635742188, "learning_rate": 4.858548585485855e-05, "loss": 1.8711, "step": 790 }, { "epoch": 0.5837638376383764, "grad_norm": 223.3916015625, "learning_rate": 4.8646986469864706e-05, "loss": 2.4052, "step": 791 }, { "epoch": 0.5845018450184502, "grad_norm": 33.84809875488281, "learning_rate": 4.870848708487085e-05, "loss": 1.04, "step": 792 }, { "epoch": 0.5852398523985239, "grad_norm": 134.23912048339844, "learning_rate": 4.8769987699877e-05, "loss": 2.6438, "step": 793 }, { "epoch": 0.5859778597785977, "grad_norm": 44.83888244628906, "learning_rate": 4.883148831488315e-05, "loss": 1.4395, "step": 794 }, { "epoch": 0.5867158671586716, "grad_norm": 299.56988525390625, "learning_rate": 4.88929889298893e-05, "loss": 1.6989, "step": 795 }, { "epoch": 0.5874538745387454, "grad_norm": 284.4837341308594, "learning_rate": 4.895448954489545e-05, "loss": 2.2906, "step": 796 }, { "epoch": 0.5881918819188192, "grad_norm": 53.7056884765625, "learning_rate": 4.9015990159901605e-05, "loss": 1.3129, "step": 797 }, { "epoch": 0.588929889298893, "grad_norm": 117.8404769897461, "learning_rate": 4.907749077490775e-05, "loss": 1.916, "step": 798 }, { "epoch": 0.5896678966789668, "grad_norm": 51.02519607543945, "learning_rate": 4.9138991389913904e-05, "loss": 2.0983, "step": 799 }, { "epoch": 0.5904059040590406, "grad_norm": 376.12225341796875, "learning_rate": 4.920049200492005e-05, "loss": 1.5877, "step": 800 }, { "epoch": 0.5904059040590406, "eval_loss": 1.3728182315826416, "eval_runtime": 305.6963, "eval_samples_per_second": 3.759, "eval_steps_per_second": 0.314, "step": 800 } ], "logging_steps": 1, "max_steps": 4065, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5604991497978511e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }