{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5904059040590406,
"eval_steps": 100,
"global_step": 800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007380073800738007,
"grad_norm": 931.379638671875,
"learning_rate": 6.150061500615006e-08,
"loss": 2.4703,
"step": 1
},
{
"epoch": 0.0007380073800738007,
"eval_loss": 2.1151068210601807,
"eval_runtime": 311.9468,
"eval_samples_per_second": 3.683,
"eval_steps_per_second": 0.308,
"step": 1
},
{
"epoch": 0.0014760147601476014,
"grad_norm": 940.4727172851562,
"learning_rate": 1.2300123001230013e-07,
"loss": 2.6788,
"step": 2
},
{
"epoch": 0.002214022140221402,
"grad_norm": 744.969970703125,
"learning_rate": 1.845018450184502e-07,
"loss": 2.4456,
"step": 3
},
{
"epoch": 0.002952029520295203,
"grad_norm": 824.5645141601562,
"learning_rate": 2.4600246002460025e-07,
"loss": 2.6876,
"step": 4
},
{
"epoch": 0.0036900369003690036,
"grad_norm": 790.6527099609375,
"learning_rate": 3.075030750307503e-07,
"loss": 2.4629,
"step": 5
},
{
"epoch": 0.004428044280442804,
"grad_norm": 582.4039306640625,
"learning_rate": 3.690036900369004e-07,
"loss": 2.3769,
"step": 6
},
{
"epoch": 0.0051660516605166054,
"grad_norm": 542.76513671875,
"learning_rate": 4.3050430504305045e-07,
"loss": 2.1601,
"step": 7
},
{
"epoch": 0.005904059040590406,
"grad_norm": 570.0616455078125,
"learning_rate": 4.920049200492005e-07,
"loss": 2.1235,
"step": 8
},
{
"epoch": 0.006642066420664207,
"grad_norm": 630.7283935546875,
"learning_rate": 5.535055350553506e-07,
"loss": 2.1857,
"step": 9
},
{
"epoch": 0.007380073800738007,
"grad_norm": 397.8863220214844,
"learning_rate": 6.150061500615006e-07,
"loss": 1.9655,
"step": 10
},
{
"epoch": 0.008118081180811807,
"grad_norm": 387.4375915527344,
"learning_rate": 6.765067650676507e-07,
"loss": 1.8162,
"step": 11
},
{
"epoch": 0.008856088560885609,
"grad_norm": 261.5195617675781,
"learning_rate": 7.380073800738008e-07,
"loss": 1.8043,
"step": 12
},
{
"epoch": 0.00959409594095941,
"grad_norm": 216.66661071777344,
"learning_rate": 7.995079950799507e-07,
"loss": 1.7341,
"step": 13
},
{
"epoch": 0.010332103321033211,
"grad_norm": 200.43228149414062,
"learning_rate": 8.610086100861009e-07,
"loss": 1.6827,
"step": 14
},
{
"epoch": 0.01107011070110701,
"grad_norm": 213.2593536376953,
"learning_rate": 9.22509225092251e-07,
"loss": 1.6452,
"step": 15
},
{
"epoch": 0.011808118081180811,
"grad_norm": 146.7362518310547,
"learning_rate": 9.84009840098401e-07,
"loss": 1.6459,
"step": 16
},
{
"epoch": 0.012546125461254613,
"grad_norm": 143.30966186523438,
"learning_rate": 1.045510455104551e-06,
"loss": 1.6676,
"step": 17
},
{
"epoch": 0.013284132841328414,
"grad_norm": 177.24832153320312,
"learning_rate": 1.1070110701107011e-06,
"loss": 1.4307,
"step": 18
},
{
"epoch": 0.014022140221402213,
"grad_norm": 134.13116455078125,
"learning_rate": 1.1685116851168512e-06,
"loss": 1.4712,
"step": 19
},
{
"epoch": 0.014760147601476014,
"grad_norm": 107.87165069580078,
"learning_rate": 1.2300123001230013e-06,
"loss": 1.5757,
"step": 20
},
{
"epoch": 0.015498154981549815,
"grad_norm": 100.48570251464844,
"learning_rate": 1.2915129151291513e-06,
"loss": 1.5647,
"step": 21
},
{
"epoch": 0.016236162361623615,
"grad_norm": 96.30101776123047,
"learning_rate": 1.3530135301353014e-06,
"loss": 1.3431,
"step": 22
},
{
"epoch": 0.016974169741697416,
"grad_norm": 99.80168151855469,
"learning_rate": 1.4145141451414515e-06,
"loss": 1.4753,
"step": 23
},
{
"epoch": 0.017712177121771217,
"grad_norm": 86.59078216552734,
"learning_rate": 1.4760147601476015e-06,
"loss": 1.4402,
"step": 24
},
{
"epoch": 0.01845018450184502,
"grad_norm": 107.12730407714844,
"learning_rate": 1.5375153751537516e-06,
"loss": 1.4129,
"step": 25
},
{
"epoch": 0.01918819188191882,
"grad_norm": 86.11123657226562,
"learning_rate": 1.5990159901599014e-06,
"loss": 1.3141,
"step": 26
},
{
"epoch": 0.01992619926199262,
"grad_norm": 81.71781158447266,
"learning_rate": 1.6605166051660517e-06,
"loss": 1.3644,
"step": 27
},
{
"epoch": 0.020664206642066422,
"grad_norm": 81.71916961669922,
"learning_rate": 1.7220172201722018e-06,
"loss": 1.3631,
"step": 28
},
{
"epoch": 0.021402214022140223,
"grad_norm": 65.515625,
"learning_rate": 1.783517835178352e-06,
"loss": 1.3915,
"step": 29
},
{
"epoch": 0.02214022140221402,
"grad_norm": 82.60952758789062,
"learning_rate": 1.845018450184502e-06,
"loss": 1.2512,
"step": 30
},
{
"epoch": 0.022878228782287822,
"grad_norm": 78.03673553466797,
"learning_rate": 1.9065190651906518e-06,
"loss": 1.4272,
"step": 31
},
{
"epoch": 0.023616236162361623,
"grad_norm": 73.9189453125,
"learning_rate": 1.968019680196802e-06,
"loss": 1.3549,
"step": 32
},
{
"epoch": 0.024354243542435424,
"grad_norm": 75.15375518798828,
"learning_rate": 2.029520295202952e-06,
"loss": 1.1933,
"step": 33
},
{
"epoch": 0.025092250922509225,
"grad_norm": 68.5103530883789,
"learning_rate": 2.091020910209102e-06,
"loss": 1.2598,
"step": 34
},
{
"epoch": 0.025830258302583026,
"grad_norm": 63.10990905761719,
"learning_rate": 2.1525215252152524e-06,
"loss": 1.2143,
"step": 35
},
{
"epoch": 0.026568265682656828,
"grad_norm": 75.12173461914062,
"learning_rate": 2.2140221402214023e-06,
"loss": 1.2827,
"step": 36
},
{
"epoch": 0.02730627306273063,
"grad_norm": 69.23287963867188,
"learning_rate": 2.2755227552275526e-06,
"loss": 1.4106,
"step": 37
},
{
"epoch": 0.028044280442804426,
"grad_norm": 82.09547424316406,
"learning_rate": 2.3370233702337024e-06,
"loss": 1.1135,
"step": 38
},
{
"epoch": 0.028782287822878228,
"grad_norm": 89.76222229003906,
"learning_rate": 2.3985239852398527e-06,
"loss": 1.3469,
"step": 39
},
{
"epoch": 0.02952029520295203,
"grad_norm": 75.77232360839844,
"learning_rate": 2.4600246002460025e-06,
"loss": 1.1857,
"step": 40
},
{
"epoch": 0.03025830258302583,
"grad_norm": 64.25336456298828,
"learning_rate": 2.5215252152521524e-06,
"loss": 1.2452,
"step": 41
},
{
"epoch": 0.03099630996309963,
"grad_norm": 64.85978698730469,
"learning_rate": 2.5830258302583027e-06,
"loss": 1.1511,
"step": 42
},
{
"epoch": 0.03173431734317343,
"grad_norm": 61.36198043823242,
"learning_rate": 2.6445264452644525e-06,
"loss": 1.1056,
"step": 43
},
{
"epoch": 0.03247232472324723,
"grad_norm": 63.63357925415039,
"learning_rate": 2.706027060270603e-06,
"loss": 1.3231,
"step": 44
},
{
"epoch": 0.033210332103321034,
"grad_norm": 60.254825592041016,
"learning_rate": 2.767527675276753e-06,
"loss": 1.1552,
"step": 45
},
{
"epoch": 0.03394833948339483,
"grad_norm": 69.51408386230469,
"learning_rate": 2.829028290282903e-06,
"loss": 1.2972,
"step": 46
},
{
"epoch": 0.03468634686346864,
"grad_norm": 60.74787902832031,
"learning_rate": 2.890528905289053e-06,
"loss": 1.2444,
"step": 47
},
{
"epoch": 0.035424354243542434,
"grad_norm": 62.291412353515625,
"learning_rate": 2.952029520295203e-06,
"loss": 1.2342,
"step": 48
},
{
"epoch": 0.03616236162361624,
"grad_norm": 67.48091125488281,
"learning_rate": 3.0135301353013533e-06,
"loss": 1.2894,
"step": 49
},
{
"epoch": 0.03690036900369004,
"grad_norm": 57.86232376098633,
"learning_rate": 3.075030750307503e-06,
"loss": 1.1071,
"step": 50
},
{
"epoch": 0.037638376383763834,
"grad_norm": 62.488731384277344,
"learning_rate": 3.136531365313653e-06,
"loss": 1.2249,
"step": 51
},
{
"epoch": 0.03837638376383764,
"grad_norm": 56.59815979003906,
"learning_rate": 3.198031980319803e-06,
"loss": 1.2616,
"step": 52
},
{
"epoch": 0.03911439114391144,
"grad_norm": 58.92403030395508,
"learning_rate": 3.2595325953259536e-06,
"loss": 1.2147,
"step": 53
},
{
"epoch": 0.03985239852398524,
"grad_norm": 63.04093933105469,
"learning_rate": 3.3210332103321034e-06,
"loss": 1.2363,
"step": 54
},
{
"epoch": 0.04059040590405904,
"grad_norm": 57.72414779663086,
"learning_rate": 3.3825338253382537e-06,
"loss": 1.1719,
"step": 55
},
{
"epoch": 0.041328413284132844,
"grad_norm": 61.95828628540039,
"learning_rate": 3.4440344403444036e-06,
"loss": 1.1956,
"step": 56
},
{
"epoch": 0.04206642066420664,
"grad_norm": 58.07041549682617,
"learning_rate": 3.5055350553505534e-06,
"loss": 1.1326,
"step": 57
},
{
"epoch": 0.042804428044280446,
"grad_norm": 61.18100357055664,
"learning_rate": 3.567035670356704e-06,
"loss": 1.2313,
"step": 58
},
{
"epoch": 0.043542435424354244,
"grad_norm": 58.01974868774414,
"learning_rate": 3.628536285362854e-06,
"loss": 1.1833,
"step": 59
},
{
"epoch": 0.04428044280442804,
"grad_norm": 58.43510437011719,
"learning_rate": 3.690036900369004e-06,
"loss": 1.232,
"step": 60
},
{
"epoch": 0.045018450184501846,
"grad_norm": 56.53025817871094,
"learning_rate": 3.7515375153751537e-06,
"loss": 1.202,
"step": 61
},
{
"epoch": 0.045756457564575644,
"grad_norm": 59.623043060302734,
"learning_rate": 3.8130381303813035e-06,
"loss": 1.2188,
"step": 62
},
{
"epoch": 0.04649446494464945,
"grad_norm": 54.90254211425781,
"learning_rate": 3.874538745387454e-06,
"loss": 1.1324,
"step": 63
},
{
"epoch": 0.047232472324723246,
"grad_norm": 56.264732360839844,
"learning_rate": 3.936039360393604e-06,
"loss": 1.1797,
"step": 64
},
{
"epoch": 0.04797047970479705,
"grad_norm": 56.26121520996094,
"learning_rate": 3.997539975399754e-06,
"loss": 1.1777,
"step": 65
},
{
"epoch": 0.04870848708487085,
"grad_norm": 53.94155502319336,
"learning_rate": 4.059040590405904e-06,
"loss": 1.1077,
"step": 66
},
{
"epoch": 0.04944649446494465,
"grad_norm": 56.105831146240234,
"learning_rate": 4.120541205412054e-06,
"loss": 1.1559,
"step": 67
},
{
"epoch": 0.05018450184501845,
"grad_norm": 65.91514587402344,
"learning_rate": 4.182041820418204e-06,
"loss": 1.0554,
"step": 68
},
{
"epoch": 0.05092250922509225,
"grad_norm": 67.19110107421875,
"learning_rate": 4.243542435424354e-06,
"loss": 1.2161,
"step": 69
},
{
"epoch": 0.05166051660516605,
"grad_norm": 55.92790603637695,
"learning_rate": 4.305043050430505e-06,
"loss": 1.2577,
"step": 70
},
{
"epoch": 0.05239852398523985,
"grad_norm": 61.967750549316406,
"learning_rate": 4.366543665436655e-06,
"loss": 1.1944,
"step": 71
},
{
"epoch": 0.053136531365313655,
"grad_norm": 54.48695373535156,
"learning_rate": 4.428044280442805e-06,
"loss": 0.9314,
"step": 72
},
{
"epoch": 0.05387453874538745,
"grad_norm": 59.03939437866211,
"learning_rate": 4.489544895448955e-06,
"loss": 1.2708,
"step": 73
},
{
"epoch": 0.05461254612546126,
"grad_norm": 57.15635299682617,
"learning_rate": 4.551045510455105e-06,
"loss": 1.1294,
"step": 74
},
{
"epoch": 0.055350553505535055,
"grad_norm": 57.40306091308594,
"learning_rate": 4.612546125461255e-06,
"loss": 1.0417,
"step": 75
},
{
"epoch": 0.05608856088560885,
"grad_norm": 114.00467681884766,
"learning_rate": 4.674046740467405e-06,
"loss": 1.0973,
"step": 76
},
{
"epoch": 0.05682656826568266,
"grad_norm": 55.897666931152344,
"learning_rate": 4.735547355473555e-06,
"loss": 1.024,
"step": 77
},
{
"epoch": 0.057564575645756455,
"grad_norm": 54.696266174316406,
"learning_rate": 4.797047970479705e-06,
"loss": 1.0549,
"step": 78
},
{
"epoch": 0.05830258302583026,
"grad_norm": 58.518489837646484,
"learning_rate": 4.858548585485855e-06,
"loss": 1.1007,
"step": 79
},
{
"epoch": 0.05904059040590406,
"grad_norm": 55.24943923950195,
"learning_rate": 4.920049200492005e-06,
"loss": 1.223,
"step": 80
},
{
"epoch": 0.05977859778597786,
"grad_norm": 55.647605895996094,
"learning_rate": 4.981549815498155e-06,
"loss": 1.0745,
"step": 81
},
{
"epoch": 0.06051660516605166,
"grad_norm": 52.201297760009766,
"learning_rate": 5.043050430504305e-06,
"loss": 1.1459,
"step": 82
},
{
"epoch": 0.061254612546125464,
"grad_norm": 49.60506820678711,
"learning_rate": 5.1045510455104555e-06,
"loss": 1.0853,
"step": 83
},
{
"epoch": 0.06199261992619926,
"grad_norm": 53.66012191772461,
"learning_rate": 5.166051660516605e-06,
"loss": 0.999,
"step": 84
},
{
"epoch": 0.06273062730627306,
"grad_norm": 58.5854606628418,
"learning_rate": 5.227552275522755e-06,
"loss": 1.3074,
"step": 85
},
{
"epoch": 0.06346863468634686,
"grad_norm": 58.91031265258789,
"learning_rate": 5.289052890528905e-06,
"loss": 1.0567,
"step": 86
},
{
"epoch": 0.06420664206642067,
"grad_norm": 57.29990005493164,
"learning_rate": 5.350553505535055e-06,
"loss": 1.1709,
"step": 87
},
{
"epoch": 0.06494464944649446,
"grad_norm": 48.71859359741211,
"learning_rate": 5.412054120541206e-06,
"loss": 1.2049,
"step": 88
},
{
"epoch": 0.06568265682656826,
"grad_norm": 50.770084381103516,
"learning_rate": 5.4735547355473555e-06,
"loss": 0.9872,
"step": 89
},
{
"epoch": 0.06642066420664207,
"grad_norm": 58.15389633178711,
"learning_rate": 5.535055350553506e-06,
"loss": 1.0798,
"step": 90
},
{
"epoch": 0.06715867158671587,
"grad_norm": 61.212825775146484,
"learning_rate": 5.596555965559656e-06,
"loss": 1.0654,
"step": 91
},
{
"epoch": 0.06789667896678966,
"grad_norm": 56.70602798461914,
"learning_rate": 5.658056580565806e-06,
"loss": 1.1565,
"step": 92
},
{
"epoch": 0.06863468634686347,
"grad_norm": 54.07913589477539,
"learning_rate": 5.7195571955719566e-06,
"loss": 1.1315,
"step": 93
},
{
"epoch": 0.06937269372693727,
"grad_norm": 55.931495666503906,
"learning_rate": 5.781057810578106e-06,
"loss": 1.0347,
"step": 94
},
{
"epoch": 0.07011070110701106,
"grad_norm": 58.78949737548828,
"learning_rate": 5.842558425584256e-06,
"loss": 1.1858,
"step": 95
},
{
"epoch": 0.07084870848708487,
"grad_norm": 53.04726791381836,
"learning_rate": 5.904059040590406e-06,
"loss": 1.1227,
"step": 96
},
{
"epoch": 0.07158671586715867,
"grad_norm": 51.182315826416016,
"learning_rate": 5.965559655596556e-06,
"loss": 1.1926,
"step": 97
},
{
"epoch": 0.07232472324723248,
"grad_norm": 55.08806610107422,
"learning_rate": 6.027060270602707e-06,
"loss": 1.1339,
"step": 98
},
{
"epoch": 0.07306273062730627,
"grad_norm": 53.554542541503906,
"learning_rate": 6.0885608856088565e-06,
"loss": 1.1762,
"step": 99
},
{
"epoch": 0.07380073800738007,
"grad_norm": 56.95305252075195,
"learning_rate": 6.150061500615006e-06,
"loss": 1.1414,
"step": 100
},
{
"epoch": 0.07380073800738007,
"eval_loss": 1.4010363817214966,
"eval_runtime": 325.9171,
"eval_samples_per_second": 3.525,
"eval_steps_per_second": 0.295,
"step": 100
},
{
"epoch": 0.07453874538745388,
"grad_norm": 51.188621520996094,
"learning_rate": 6.211562115621156e-06,
"loss": 1.129,
"step": 101
},
{
"epoch": 0.07527675276752767,
"grad_norm": 55.20896530151367,
"learning_rate": 6.273062730627306e-06,
"loss": 1.1451,
"step": 102
},
{
"epoch": 0.07601476014760147,
"grad_norm": 49.773399353027344,
"learning_rate": 6.334563345633457e-06,
"loss": 1.2308,
"step": 103
},
{
"epoch": 0.07675276752767528,
"grad_norm": 52.89494323730469,
"learning_rate": 6.396063960639606e-06,
"loss": 1.2243,
"step": 104
},
{
"epoch": 0.07749077490774908,
"grad_norm": 53.44047546386719,
"learning_rate": 6.4575645756457565e-06,
"loss": 1.0611,
"step": 105
},
{
"epoch": 0.07822878228782287,
"grad_norm": 53.227176666259766,
"learning_rate": 6.519065190651907e-06,
"loss": 1.0153,
"step": 106
},
{
"epoch": 0.07896678966789668,
"grad_norm": 53.29740524291992,
"learning_rate": 6.580565805658056e-06,
"loss": 1.0539,
"step": 107
},
{
"epoch": 0.07970479704797048,
"grad_norm": 52.415748596191406,
"learning_rate": 6.642066420664207e-06,
"loss": 1.1556,
"step": 108
},
{
"epoch": 0.08044280442804429,
"grad_norm": 52.891544342041016,
"learning_rate": 6.703567035670357e-06,
"loss": 1.0613,
"step": 109
},
{
"epoch": 0.08118081180811808,
"grad_norm": 56.652835845947266,
"learning_rate": 6.7650676506765074e-06,
"loss": 1.053,
"step": 110
},
{
"epoch": 0.08191881918819188,
"grad_norm": 52.22764587402344,
"learning_rate": 6.826568265682657e-06,
"loss": 1.0757,
"step": 111
},
{
"epoch": 0.08265682656826569,
"grad_norm": 51.05937576293945,
"learning_rate": 6.888068880688807e-06,
"loss": 1.1943,
"step": 112
},
{
"epoch": 0.08339483394833948,
"grad_norm": 53.054378509521484,
"learning_rate": 6.949569495694958e-06,
"loss": 1.0704,
"step": 113
},
{
"epoch": 0.08413284132841328,
"grad_norm": 54.2965202331543,
"learning_rate": 7.011070110701107e-06,
"loss": 1.0442,
"step": 114
},
{
"epoch": 0.08487084870848709,
"grad_norm": 52.170867919921875,
"learning_rate": 7.0725707257072575e-06,
"loss": 1.2318,
"step": 115
},
{
"epoch": 0.08560885608856089,
"grad_norm": 51.29275894165039,
"learning_rate": 7.134071340713408e-06,
"loss": 1.0306,
"step": 116
},
{
"epoch": 0.08634686346863468,
"grad_norm": 54.07830047607422,
"learning_rate": 7.195571955719557e-06,
"loss": 1.1537,
"step": 117
},
{
"epoch": 0.08708487084870849,
"grad_norm": 47.52810287475586,
"learning_rate": 7.257072570725708e-06,
"loss": 1.1096,
"step": 118
},
{
"epoch": 0.08782287822878229,
"grad_norm": 52.45383071899414,
"learning_rate": 7.318573185731857e-06,
"loss": 1.0466,
"step": 119
},
{
"epoch": 0.08856088560885608,
"grad_norm": 51.74037551879883,
"learning_rate": 7.380073800738008e-06,
"loss": 1.0032,
"step": 120
},
{
"epoch": 0.08929889298892989,
"grad_norm": 52.04569625854492,
"learning_rate": 7.441574415744158e-06,
"loss": 1.1626,
"step": 121
},
{
"epoch": 0.09003690036900369,
"grad_norm": 51.20045852661133,
"learning_rate": 7.503075030750307e-06,
"loss": 1.1133,
"step": 122
},
{
"epoch": 0.0907749077490775,
"grad_norm": 50.70725631713867,
"learning_rate": 7.564575645756458e-06,
"loss": 1.1191,
"step": 123
},
{
"epoch": 0.09151291512915129,
"grad_norm": 50.703460693359375,
"learning_rate": 7.626076260762607e-06,
"loss": 1.0609,
"step": 124
},
{
"epoch": 0.09225092250922509,
"grad_norm": 53.20537185668945,
"learning_rate": 7.687576875768759e-06,
"loss": 1.1237,
"step": 125
},
{
"epoch": 0.0929889298892989,
"grad_norm": 51.74738693237305,
"learning_rate": 7.749077490774908e-06,
"loss": 1.1255,
"step": 126
},
{
"epoch": 0.09372693726937269,
"grad_norm": 47.27532958984375,
"learning_rate": 7.810578105781058e-06,
"loss": 1.0753,
"step": 127
},
{
"epoch": 0.09446494464944649,
"grad_norm": 46.608150482177734,
"learning_rate": 7.872078720787208e-06,
"loss": 1.0709,
"step": 128
},
{
"epoch": 0.0952029520295203,
"grad_norm": 52.357460021972656,
"learning_rate": 7.933579335793358e-06,
"loss": 1.1539,
"step": 129
},
{
"epoch": 0.0959409594095941,
"grad_norm": 48.45564270019531,
"learning_rate": 7.995079950799508e-06,
"loss": 1.1129,
"step": 130
},
{
"epoch": 0.09667896678966789,
"grad_norm": 52.05830383300781,
"learning_rate": 8.05658056580566e-06,
"loss": 1.1821,
"step": 131
},
{
"epoch": 0.0974169741697417,
"grad_norm": 53.559852600097656,
"learning_rate": 8.118081180811808e-06,
"loss": 1.1161,
"step": 132
},
{
"epoch": 0.0981549815498155,
"grad_norm": 54.30366134643555,
"learning_rate": 8.179581795817959e-06,
"loss": 1.0108,
"step": 133
},
{
"epoch": 0.0988929889298893,
"grad_norm": 49.463932037353516,
"learning_rate": 8.241082410824107e-06,
"loss": 1.0384,
"step": 134
},
{
"epoch": 0.0996309963099631,
"grad_norm": 73.52909088134766,
"learning_rate": 8.302583025830259e-06,
"loss": 1.1694,
"step": 135
},
{
"epoch": 0.1003690036900369,
"grad_norm": 45.32145309448242,
"learning_rate": 8.364083640836409e-06,
"loss": 1.0721,
"step": 136
},
{
"epoch": 0.1011070110701107,
"grad_norm": 51.58095932006836,
"learning_rate": 8.425584255842559e-06,
"loss": 0.9633,
"step": 137
},
{
"epoch": 0.1018450184501845,
"grad_norm": 52.928436279296875,
"learning_rate": 8.487084870848708e-06,
"loss": 1.0783,
"step": 138
},
{
"epoch": 0.1025830258302583,
"grad_norm": 48.393550872802734,
"learning_rate": 8.548585485854858e-06,
"loss": 0.9844,
"step": 139
},
{
"epoch": 0.1033210332103321,
"grad_norm": 46.03611373901367,
"learning_rate": 8.61008610086101e-06,
"loss": 1.1052,
"step": 140
},
{
"epoch": 0.10405904059040591,
"grad_norm": 49.10841751098633,
"learning_rate": 8.67158671586716e-06,
"loss": 1.1086,
"step": 141
},
{
"epoch": 0.1047970479704797,
"grad_norm": 47.779212951660156,
"learning_rate": 8.73308733087331e-06,
"loss": 1.1376,
"step": 142
},
{
"epoch": 0.1055350553505535,
"grad_norm": 51.112693786621094,
"learning_rate": 8.79458794587946e-06,
"loss": 1.1465,
"step": 143
},
{
"epoch": 0.10627306273062731,
"grad_norm": 43.86711502075195,
"learning_rate": 8.85608856088561e-06,
"loss": 0.9845,
"step": 144
},
{
"epoch": 0.1070110701107011,
"grad_norm": 45.53451156616211,
"learning_rate": 8.917589175891759e-06,
"loss": 1.1196,
"step": 145
},
{
"epoch": 0.1077490774907749,
"grad_norm": 51.35363006591797,
"learning_rate": 8.97908979089791e-06,
"loss": 1.0202,
"step": 146
},
{
"epoch": 0.10848708487084871,
"grad_norm": 45.318607330322266,
"learning_rate": 9.040590405904059e-06,
"loss": 1.0156,
"step": 147
},
{
"epoch": 0.10922509225092251,
"grad_norm": 45.83018493652344,
"learning_rate": 9.10209102091021e-06,
"loss": 0.9637,
"step": 148
},
{
"epoch": 0.1099630996309963,
"grad_norm": 52.667728424072266,
"learning_rate": 9.163591635916358e-06,
"loss": 1.0344,
"step": 149
},
{
"epoch": 0.11070110701107011,
"grad_norm": 49.742897033691406,
"learning_rate": 9.22509225092251e-06,
"loss": 0.9486,
"step": 150
},
{
"epoch": 0.11143911439114391,
"grad_norm": 50.35558319091797,
"learning_rate": 9.28659286592866e-06,
"loss": 1.1685,
"step": 151
},
{
"epoch": 0.1121771217712177,
"grad_norm": 49.48957824707031,
"learning_rate": 9.34809348093481e-06,
"loss": 0.9666,
"step": 152
},
{
"epoch": 0.11291512915129151,
"grad_norm": 46.834129333496094,
"learning_rate": 9.40959409594096e-06,
"loss": 1.0137,
"step": 153
},
{
"epoch": 0.11365313653136531,
"grad_norm": 46.92979049682617,
"learning_rate": 9.47109471094711e-06,
"loss": 1.042,
"step": 154
},
{
"epoch": 0.11439114391143912,
"grad_norm": 43.96043014526367,
"learning_rate": 9.53259532595326e-06,
"loss": 1.0363,
"step": 155
},
{
"epoch": 0.11512915129151291,
"grad_norm": 48.00889587402344,
"learning_rate": 9.59409594095941e-06,
"loss": 0.9697,
"step": 156
},
{
"epoch": 0.11586715867158671,
"grad_norm": 50.71873474121094,
"learning_rate": 9.65559655596556e-06,
"loss": 1.1216,
"step": 157
},
{
"epoch": 0.11660516605166052,
"grad_norm": 51.51930236816406,
"learning_rate": 9.71709717097171e-06,
"loss": 1.0876,
"step": 158
},
{
"epoch": 0.11734317343173432,
"grad_norm": 44.15366744995117,
"learning_rate": 9.77859778597786e-06,
"loss": 1.0607,
"step": 159
},
{
"epoch": 0.11808118081180811,
"grad_norm": 41.848602294921875,
"learning_rate": 9.84009840098401e-06,
"loss": 1.0026,
"step": 160
},
{
"epoch": 0.11881918819188192,
"grad_norm": 45.18868637084961,
"learning_rate": 9.90159901599016e-06,
"loss": 1.1803,
"step": 161
},
{
"epoch": 0.11955719557195572,
"grad_norm": 45.788673400878906,
"learning_rate": 9.96309963099631e-06,
"loss": 1.1451,
"step": 162
},
{
"epoch": 0.12029520295202951,
"grad_norm": 46.45803451538086,
"learning_rate": 1.0024600246002461e-05,
"loss": 0.9769,
"step": 163
},
{
"epoch": 0.12103321033210332,
"grad_norm": 46.782840728759766,
"learning_rate": 1.008610086100861e-05,
"loss": 1.2505,
"step": 164
},
{
"epoch": 0.12177121771217712,
"grad_norm": 45.39817810058594,
"learning_rate": 1.0147601476014761e-05,
"loss": 1.0927,
"step": 165
},
{
"epoch": 0.12250922509225093,
"grad_norm": 43.27733612060547,
"learning_rate": 1.0209102091020911e-05,
"loss": 1.1247,
"step": 166
},
{
"epoch": 0.12324723247232472,
"grad_norm": 47.766231536865234,
"learning_rate": 1.027060270602706e-05,
"loss": 1.079,
"step": 167
},
{
"epoch": 0.12398523985239852,
"grad_norm": 46.73952865600586,
"learning_rate": 1.033210332103321e-05,
"loss": 0.8357,
"step": 168
},
{
"epoch": 0.12472324723247233,
"grad_norm": 46.83552551269531,
"learning_rate": 1.039360393603936e-05,
"loss": 1.2159,
"step": 169
},
{
"epoch": 0.12546125461254612,
"grad_norm": 44.146846771240234,
"learning_rate": 1.045510455104551e-05,
"loss": 0.9941,
"step": 170
},
{
"epoch": 0.12619926199261994,
"grad_norm": 45.29106140136719,
"learning_rate": 1.0516605166051662e-05,
"loss": 1.1314,
"step": 171
},
{
"epoch": 0.12693726937269373,
"grad_norm": 46.10059356689453,
"learning_rate": 1.057810578105781e-05,
"loss": 1.0239,
"step": 172
},
{
"epoch": 0.12767527675276752,
"grad_norm": 42.55729293823242,
"learning_rate": 1.0639606396063962e-05,
"loss": 1.0389,
"step": 173
},
{
"epoch": 0.12841328413284134,
"grad_norm": 43.775760650634766,
"learning_rate": 1.070110701107011e-05,
"loss": 1.1492,
"step": 174
},
{
"epoch": 0.12915129151291513,
"grad_norm": 42.141910552978516,
"learning_rate": 1.0762607626076261e-05,
"loss": 1.092,
"step": 175
},
{
"epoch": 0.12988929889298892,
"grad_norm": 44.42767333984375,
"learning_rate": 1.0824108241082411e-05,
"loss": 1.1159,
"step": 176
},
{
"epoch": 0.13062730627306274,
"grad_norm": 38.9581184387207,
"learning_rate": 1.0885608856088561e-05,
"loss": 1.0921,
"step": 177
},
{
"epoch": 0.13136531365313653,
"grad_norm": 43.585147857666016,
"learning_rate": 1.0947109471094711e-05,
"loss": 1.0784,
"step": 178
},
{
"epoch": 0.13210332103321032,
"grad_norm": 49.25750732421875,
"learning_rate": 1.100861008610086e-05,
"loss": 1.1589,
"step": 179
},
{
"epoch": 0.13284132841328414,
"grad_norm": 38.27066421508789,
"learning_rate": 1.1070110701107012e-05,
"loss": 0.9549,
"step": 180
},
{
"epoch": 0.13357933579335793,
"grad_norm": 43.95482635498047,
"learning_rate": 1.1131611316113162e-05,
"loss": 1.1084,
"step": 181
},
{
"epoch": 0.13431734317343175,
"grad_norm": 47.86146926879883,
"learning_rate": 1.1193111931119312e-05,
"loss": 1.0305,
"step": 182
},
{
"epoch": 0.13505535055350554,
"grad_norm": 41.17548370361328,
"learning_rate": 1.1254612546125462e-05,
"loss": 1.0341,
"step": 183
},
{
"epoch": 0.13579335793357933,
"grad_norm": 50.34139633178711,
"learning_rate": 1.1316113161131612e-05,
"loss": 0.9769,
"step": 184
},
{
"epoch": 0.13653136531365315,
"grad_norm": 41.7880973815918,
"learning_rate": 1.1377613776137762e-05,
"loss": 1.047,
"step": 185
},
{
"epoch": 0.13726937269372694,
"grad_norm": 43.598392486572266,
"learning_rate": 1.1439114391143913e-05,
"loss": 0.9553,
"step": 186
},
{
"epoch": 0.13800738007380073,
"grad_norm": 44.27220153808594,
"learning_rate": 1.1500615006150061e-05,
"loss": 1.1314,
"step": 187
},
{
"epoch": 0.13874538745387455,
"grad_norm": 38.91771697998047,
"learning_rate": 1.1562115621156213e-05,
"loss": 1.0132,
"step": 188
},
{
"epoch": 0.13948339483394834,
"grad_norm": 44.32412338256836,
"learning_rate": 1.1623616236162361e-05,
"loss": 1.0672,
"step": 189
},
{
"epoch": 0.14022140221402213,
"grad_norm": 43.45479202270508,
"learning_rate": 1.1685116851168513e-05,
"loss": 1.0519,
"step": 190
},
{
"epoch": 0.14095940959409595,
"grad_norm": 46.94374084472656,
"learning_rate": 1.1746617466174662e-05,
"loss": 1.0721,
"step": 191
},
{
"epoch": 0.14169741697416974,
"grad_norm": 48.714927673339844,
"learning_rate": 1.1808118081180812e-05,
"loss": 1.095,
"step": 192
},
{
"epoch": 0.14243542435424356,
"grad_norm": 48.29472732543945,
"learning_rate": 1.1869618696186962e-05,
"loss": 1.1482,
"step": 193
},
{
"epoch": 0.14317343173431735,
"grad_norm": 43.912288665771484,
"learning_rate": 1.1931119311193112e-05,
"loss": 1.0994,
"step": 194
},
{
"epoch": 0.14391143911439114,
"grad_norm": 41.308799743652344,
"learning_rate": 1.1992619926199262e-05,
"loss": 1.2074,
"step": 195
},
{
"epoch": 0.14464944649446496,
"grad_norm": 43.36037826538086,
"learning_rate": 1.2054120541205413e-05,
"loss": 1.1435,
"step": 196
},
{
"epoch": 0.14538745387453875,
"grad_norm": 40.67462158203125,
"learning_rate": 1.2115621156211563e-05,
"loss": 0.9609,
"step": 197
},
{
"epoch": 0.14612546125461254,
"grad_norm": 43.331241607666016,
"learning_rate": 1.2177121771217713e-05,
"loss": 1.0909,
"step": 198
},
{
"epoch": 0.14686346863468636,
"grad_norm": 41.213863372802734,
"learning_rate": 1.2238622386223863e-05,
"loss": 1.0955,
"step": 199
},
{
"epoch": 0.14760147601476015,
"grad_norm": 43.54401397705078,
"learning_rate": 1.2300123001230013e-05,
"loss": 1.1855,
"step": 200
},
{
"epoch": 0.14760147601476015,
"eval_loss": 1.3390393257141113,
"eval_runtime": 355.9656,
"eval_samples_per_second": 3.228,
"eval_steps_per_second": 0.27,
"step": 200
},
{
"epoch": 0.14833948339483394,
"grad_norm": 45.116146087646484,
"learning_rate": 1.2361623616236164e-05,
"loss": 1.1331,
"step": 201
},
{
"epoch": 0.14907749077490776,
"grad_norm": 48.80164337158203,
"learning_rate": 1.2423124231242312e-05,
"loss": 1.157,
"step": 202
},
{
"epoch": 0.14981549815498155,
"grad_norm": 41.02751922607422,
"learning_rate": 1.2484624846248464e-05,
"loss": 1.1237,
"step": 203
},
{
"epoch": 0.15055350553505534,
"grad_norm": 42.61967086791992,
"learning_rate": 1.2546125461254612e-05,
"loss": 1.1693,
"step": 204
},
{
"epoch": 0.15129151291512916,
"grad_norm": 43.75822067260742,
"learning_rate": 1.2607626076260764e-05,
"loss": 1.1545,
"step": 205
},
{
"epoch": 0.15202952029520295,
"grad_norm": 40.50026321411133,
"learning_rate": 1.2669126691266914e-05,
"loss": 1.061,
"step": 206
},
{
"epoch": 0.15276752767527677,
"grad_norm": 41.14898681640625,
"learning_rate": 1.2730627306273063e-05,
"loss": 0.9864,
"step": 207
},
{
"epoch": 0.15350553505535056,
"grad_norm": 44.43930435180664,
"learning_rate": 1.2792127921279212e-05,
"loss": 1.0444,
"step": 208
},
{
"epoch": 0.15424354243542435,
"grad_norm": 42.351226806640625,
"learning_rate": 1.2853628536285365e-05,
"loss": 1.0966,
"step": 209
},
{
"epoch": 0.15498154981549817,
"grad_norm": 39.365440368652344,
"learning_rate": 1.2915129151291513e-05,
"loss": 1.0987,
"step": 210
},
{
"epoch": 0.15571955719557196,
"grad_norm": 44.90658950805664,
"learning_rate": 1.2976629766297663e-05,
"loss": 1.0399,
"step": 211
},
{
"epoch": 0.15645756457564575,
"grad_norm": 38.08787536621094,
"learning_rate": 1.3038130381303814e-05,
"loss": 0.9539,
"step": 212
},
{
"epoch": 0.15719557195571957,
"grad_norm": 40.93101501464844,
"learning_rate": 1.3099630996309964e-05,
"loss": 0.9497,
"step": 213
},
{
"epoch": 0.15793357933579336,
"grad_norm": 42.12691116333008,
"learning_rate": 1.3161131611316112e-05,
"loss": 1.0591,
"step": 214
},
{
"epoch": 0.15867158671586715,
"grad_norm": 39.68405532836914,
"learning_rate": 1.3222632226322266e-05,
"loss": 1.1084,
"step": 215
},
{
"epoch": 0.15940959409594097,
"grad_norm": 46.32451629638672,
"learning_rate": 1.3284132841328414e-05,
"loss": 0.9886,
"step": 216
},
{
"epoch": 0.16014760147601476,
"grad_norm": 43.83405303955078,
"learning_rate": 1.3345633456334564e-05,
"loss": 1.0409,
"step": 217
},
{
"epoch": 0.16088560885608857,
"grad_norm": 46.454429626464844,
"learning_rate": 1.3407134071340713e-05,
"loss": 0.927,
"step": 218
},
{
"epoch": 0.16162361623616237,
"grad_norm": 43.32332229614258,
"learning_rate": 1.3468634686346865e-05,
"loss": 1.0885,
"step": 219
},
{
"epoch": 0.16236162361623616,
"grad_norm": 38.92317581176758,
"learning_rate": 1.3530135301353015e-05,
"loss": 1.1205,
"step": 220
},
{
"epoch": 0.16309963099630997,
"grad_norm": 36.57090759277344,
"learning_rate": 1.3591635916359163e-05,
"loss": 1.0607,
"step": 221
},
{
"epoch": 0.16383763837638377,
"grad_norm": 39.162147521972656,
"learning_rate": 1.3653136531365315e-05,
"loss": 1.1395,
"step": 222
},
{
"epoch": 0.16457564575645756,
"grad_norm": 40.069610595703125,
"learning_rate": 1.3714637146371464e-05,
"loss": 0.993,
"step": 223
},
{
"epoch": 0.16531365313653137,
"grad_norm": 38.262664794921875,
"learning_rate": 1.3776137761377614e-05,
"loss": 1.0751,
"step": 224
},
{
"epoch": 0.16605166051660517,
"grad_norm": 38.50648498535156,
"learning_rate": 1.3837638376383766e-05,
"loss": 1.0874,
"step": 225
},
{
"epoch": 0.16678966789667896,
"grad_norm": 41.57286834716797,
"learning_rate": 1.3899138991389916e-05,
"loss": 1.061,
"step": 226
},
{
"epoch": 0.16752767527675277,
"grad_norm": 38.842124938964844,
"learning_rate": 1.3960639606396064e-05,
"loss": 0.9865,
"step": 227
},
{
"epoch": 0.16826568265682657,
"grad_norm": 40.79179382324219,
"learning_rate": 1.4022140221402214e-05,
"loss": 1.0104,
"step": 228
},
{
"epoch": 0.16900369003690036,
"grad_norm": 40.540042877197266,
"learning_rate": 1.4083640836408365e-05,
"loss": 0.9352,
"step": 229
},
{
"epoch": 0.16974169741697417,
"grad_norm": 39.385459899902344,
"learning_rate": 1.4145141451414515e-05,
"loss": 1.0731,
"step": 230
},
{
"epoch": 0.17047970479704797,
"grad_norm": 40.35080337524414,
"learning_rate": 1.4206642066420663e-05,
"loss": 1.1106,
"step": 231
},
{
"epoch": 0.17121771217712178,
"grad_norm": 37.7828254699707,
"learning_rate": 1.4268142681426816e-05,
"loss": 1.0902,
"step": 232
},
{
"epoch": 0.17195571955719557,
"grad_norm": 38.59387969970703,
"learning_rate": 1.4329643296432965e-05,
"loss": 1.0837,
"step": 233
},
{
"epoch": 0.17269372693726937,
"grad_norm": 40.220245361328125,
"learning_rate": 1.4391143911439114e-05,
"loss": 1.1002,
"step": 234
},
{
"epoch": 0.17343173431734318,
"grad_norm": 41.30938720703125,
"learning_rate": 1.4452644526445266e-05,
"loss": 0.9605,
"step": 235
},
{
"epoch": 0.17416974169741697,
"grad_norm": 42.54692840576172,
"learning_rate": 1.4514145141451416e-05,
"loss": 1.1135,
"step": 236
},
{
"epoch": 0.17490774907749077,
"grad_norm": 38.45701217651367,
"learning_rate": 1.4575645756457566e-05,
"loss": 1.2065,
"step": 237
},
{
"epoch": 0.17564575645756458,
"grad_norm": 40.34320068359375,
"learning_rate": 1.4637146371463714e-05,
"loss": 1.0331,
"step": 238
},
{
"epoch": 0.17638376383763837,
"grad_norm": 39.82585144042969,
"learning_rate": 1.4698646986469865e-05,
"loss": 1.1597,
"step": 239
},
{
"epoch": 0.17712177121771217,
"grad_norm": 39.45707321166992,
"learning_rate": 1.4760147601476015e-05,
"loss": 1.1008,
"step": 240
},
{
"epoch": 0.17785977859778598,
"grad_norm": 37.564231872558594,
"learning_rate": 1.4821648216482165e-05,
"loss": 0.9734,
"step": 241
},
{
"epoch": 0.17859778597785977,
"grad_norm": 40.75583267211914,
"learning_rate": 1.4883148831488317e-05,
"loss": 1.1324,
"step": 242
},
{
"epoch": 0.1793357933579336,
"grad_norm": 36.91340255737305,
"learning_rate": 1.4944649446494467e-05,
"loss": 0.8858,
"step": 243
},
{
"epoch": 0.18007380073800738,
"grad_norm": 41.43409729003906,
"learning_rate": 1.5006150061500615e-05,
"loss": 1.127,
"step": 244
},
{
"epoch": 0.18081180811808117,
"grad_norm": 39.64106750488281,
"learning_rate": 1.5067650676506768e-05,
"loss": 1.0394,
"step": 245
},
{
"epoch": 0.181549815498155,
"grad_norm": 39.24397277832031,
"learning_rate": 1.5129151291512916e-05,
"loss": 1.1139,
"step": 246
},
{
"epoch": 0.18228782287822878,
"grad_norm": 39.08576965332031,
"learning_rate": 1.5190651906519066e-05,
"loss": 1.1373,
"step": 247
},
{
"epoch": 0.18302583025830257,
"grad_norm": 37.38773727416992,
"learning_rate": 1.5252152521525214e-05,
"loss": 0.9942,
"step": 248
},
{
"epoch": 0.1837638376383764,
"grad_norm": 39.011505126953125,
"learning_rate": 1.5313653136531367e-05,
"loss": 1.1033,
"step": 249
},
{
"epoch": 0.18450184501845018,
"grad_norm": 38.647705078125,
"learning_rate": 1.5375153751537517e-05,
"loss": 1.0039,
"step": 250
},
{
"epoch": 0.18523985239852397,
"grad_norm": 36.8840446472168,
"learning_rate": 1.5436654366543664e-05,
"loss": 1.037,
"step": 251
},
{
"epoch": 0.1859778597785978,
"grad_norm": 39.59068298339844,
"learning_rate": 1.5498154981549817e-05,
"loss": 1.1113,
"step": 252
},
{
"epoch": 0.18671586715867158,
"grad_norm": 35.01139450073242,
"learning_rate": 1.5559655596555967e-05,
"loss": 1.0766,
"step": 253
},
{
"epoch": 0.18745387453874537,
"grad_norm": 42.80155944824219,
"learning_rate": 1.5621156211562117e-05,
"loss": 1.2052,
"step": 254
},
{
"epoch": 0.1881918819188192,
"grad_norm": 37.67293930053711,
"learning_rate": 1.5682656826568266e-05,
"loss": 1.054,
"step": 255
},
{
"epoch": 0.18892988929889298,
"grad_norm": 35.59282684326172,
"learning_rate": 1.5744157441574416e-05,
"loss": 1.1038,
"step": 256
},
{
"epoch": 0.1896678966789668,
"grad_norm": 36.562198638916016,
"learning_rate": 1.5805658056580566e-05,
"loss": 1.1277,
"step": 257
},
{
"epoch": 0.1904059040590406,
"grad_norm": 38.406944274902344,
"learning_rate": 1.5867158671586716e-05,
"loss": 1.0396,
"step": 258
},
{
"epoch": 0.19114391143911438,
"grad_norm": 37.851539611816406,
"learning_rate": 1.5928659286592866e-05,
"loss": 1.0541,
"step": 259
},
{
"epoch": 0.1918819188191882,
"grad_norm": 34.81989669799805,
"learning_rate": 1.5990159901599016e-05,
"loss": 1.0241,
"step": 260
},
{
"epoch": 0.192619926199262,
"grad_norm": 38.74085235595703,
"learning_rate": 1.6051660516605166e-05,
"loss": 1.0709,
"step": 261
},
{
"epoch": 0.19335793357933578,
"grad_norm": 41.59756088256836,
"learning_rate": 1.611316113161132e-05,
"loss": 1.2334,
"step": 262
},
{
"epoch": 0.1940959409594096,
"grad_norm": 35.79509353637695,
"learning_rate": 1.617466174661747e-05,
"loss": 1.0133,
"step": 263
},
{
"epoch": 0.1948339483394834,
"grad_norm": 39.88947677612305,
"learning_rate": 1.6236162361623615e-05,
"loss": 1.0831,
"step": 264
},
{
"epoch": 0.19557195571955718,
"grad_norm": 35.988487243652344,
"learning_rate": 1.629766297662977e-05,
"loss": 1.0962,
"step": 265
},
{
"epoch": 0.196309963099631,
"grad_norm": 36.9556999206543,
"learning_rate": 1.6359163591635918e-05,
"loss": 1.1309,
"step": 266
},
{
"epoch": 0.1970479704797048,
"grad_norm": 36.95020294189453,
"learning_rate": 1.6420664206642068e-05,
"loss": 1.0556,
"step": 267
},
{
"epoch": 0.1977859778597786,
"grad_norm": 36.589324951171875,
"learning_rate": 1.6482164821648215e-05,
"loss": 1.0871,
"step": 268
},
{
"epoch": 0.1985239852398524,
"grad_norm": 38.176605224609375,
"learning_rate": 1.6543665436654368e-05,
"loss": 1.0362,
"step": 269
},
{
"epoch": 0.1992619926199262,
"grad_norm": 40.13340759277344,
"learning_rate": 1.6605166051660518e-05,
"loss": 0.9606,
"step": 270
},
{
"epoch": 0.2,
"grad_norm": 40.80103302001953,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.0099,
"step": 271
},
{
"epoch": 0.2007380073800738,
"grad_norm": 37.991947174072266,
"learning_rate": 1.6728167281672817e-05,
"loss": 1.1559,
"step": 272
},
{
"epoch": 0.2014760147601476,
"grad_norm": 35.638126373291016,
"learning_rate": 1.6789667896678967e-05,
"loss": 1.0468,
"step": 273
},
{
"epoch": 0.2022140221402214,
"grad_norm": 36.0762825012207,
"learning_rate": 1.6851168511685117e-05,
"loss": 0.9843,
"step": 274
},
{
"epoch": 0.2029520295202952,
"grad_norm": 39.42917251586914,
"learning_rate": 1.691266912669127e-05,
"loss": 0.995,
"step": 275
},
{
"epoch": 0.203690036900369,
"grad_norm": 38.73271179199219,
"learning_rate": 1.6974169741697417e-05,
"loss": 1.1101,
"step": 276
},
{
"epoch": 0.2044280442804428,
"grad_norm": 34.4466667175293,
"learning_rate": 1.7035670356703567e-05,
"loss": 1.1769,
"step": 277
},
{
"epoch": 0.2051660516605166,
"grad_norm": 38.39332580566406,
"learning_rate": 1.7097170971709716e-05,
"loss": 1.2032,
"step": 278
},
{
"epoch": 0.2059040590405904,
"grad_norm": 36.46586227416992,
"learning_rate": 1.715867158671587e-05,
"loss": 1.2505,
"step": 279
},
{
"epoch": 0.2066420664206642,
"grad_norm": 38.546119689941406,
"learning_rate": 1.722017220172202e-05,
"loss": 1.0471,
"step": 280
},
{
"epoch": 0.207380073800738,
"grad_norm": 36.11763381958008,
"learning_rate": 1.7281672816728166e-05,
"loss": 1.1173,
"step": 281
},
{
"epoch": 0.20811808118081182,
"grad_norm": 36.332969665527344,
"learning_rate": 1.734317343173432e-05,
"loss": 0.989,
"step": 282
},
{
"epoch": 0.2088560885608856,
"grad_norm": 36.8829231262207,
"learning_rate": 1.740467404674047e-05,
"loss": 1.0894,
"step": 283
},
{
"epoch": 0.2095940959409594,
"grad_norm": 35.905765533447266,
"learning_rate": 1.746617466174662e-05,
"loss": 1.1755,
"step": 284
},
{
"epoch": 0.21033210332103322,
"grad_norm": 31.39859962463379,
"learning_rate": 1.752767527675277e-05,
"loss": 1.089,
"step": 285
},
{
"epoch": 0.211070110701107,
"grad_norm": 36.529537200927734,
"learning_rate": 1.758917589175892e-05,
"loss": 1.0632,
"step": 286
},
{
"epoch": 0.2118081180811808,
"grad_norm": 38.358001708984375,
"learning_rate": 1.765067650676507e-05,
"loss": 1.1177,
"step": 287
},
{
"epoch": 0.21254612546125462,
"grad_norm": 37.179325103759766,
"learning_rate": 1.771217712177122e-05,
"loss": 1.0513,
"step": 288
},
{
"epoch": 0.2132841328413284,
"grad_norm": 35.38275146484375,
"learning_rate": 1.7773677736777368e-05,
"loss": 1.0212,
"step": 289
},
{
"epoch": 0.2140221402214022,
"grad_norm": 37.132389068603516,
"learning_rate": 1.7835178351783518e-05,
"loss": 1.089,
"step": 290
},
{
"epoch": 0.21476014760147602,
"grad_norm": 34.594783782958984,
"learning_rate": 1.7896678966789668e-05,
"loss": 1.1115,
"step": 291
},
{
"epoch": 0.2154981549815498,
"grad_norm": 36.57194137573242,
"learning_rate": 1.795817958179582e-05,
"loss": 0.9911,
"step": 292
},
{
"epoch": 0.21623616236162363,
"grad_norm": 34.58879470825195,
"learning_rate": 1.8019680196801968e-05,
"loss": 1.0169,
"step": 293
},
{
"epoch": 0.21697416974169742,
"grad_norm": 33.588539123535156,
"learning_rate": 1.8081180811808117e-05,
"loss": 1.0345,
"step": 294
},
{
"epoch": 0.2177121771217712,
"grad_norm": 34.15876007080078,
"learning_rate": 1.814268142681427e-05,
"loss": 1.0387,
"step": 295
},
{
"epoch": 0.21845018450184503,
"grad_norm": 40.78740310668945,
"learning_rate": 1.820418204182042e-05,
"loss": 1.0292,
"step": 296
},
{
"epoch": 0.21918819188191882,
"grad_norm": 38.307064056396484,
"learning_rate": 1.826568265682657e-05,
"loss": 1.087,
"step": 297
},
{
"epoch": 0.2199261992619926,
"grad_norm": 33.9033203125,
"learning_rate": 1.8327183271832717e-05,
"loss": 1.0356,
"step": 298
},
{
"epoch": 0.22066420664206643,
"grad_norm": 38.69232940673828,
"learning_rate": 1.838868388683887e-05,
"loss": 1.0239,
"step": 299
},
{
"epoch": 0.22140221402214022,
"grad_norm": 34.63215637207031,
"learning_rate": 1.845018450184502e-05,
"loss": 1.1614,
"step": 300
},
{
"epoch": 0.22140221402214022,
"eval_loss": 1.3183945417404175,
"eval_runtime": 343.8464,
"eval_samples_per_second": 3.342,
"eval_steps_per_second": 0.279,
"step": 300
},
{
"epoch": 0.222140221402214,
"grad_norm": 33.18867111206055,
"learning_rate": 1.851168511685117e-05,
"loss": 1.1519,
"step": 301
},
{
"epoch": 0.22287822878228783,
"grad_norm": 34.760982513427734,
"learning_rate": 1.857318573185732e-05,
"loss": 1.1005,
"step": 302
},
{
"epoch": 0.22361623616236162,
"grad_norm": 34.268043518066406,
"learning_rate": 1.863468634686347e-05,
"loss": 1.0483,
"step": 303
},
{
"epoch": 0.2243542435424354,
"grad_norm": 35.12160873413086,
"learning_rate": 1.869618696186962e-05,
"loss": 1.1201,
"step": 304
},
{
"epoch": 0.22509225092250923,
"grad_norm": 38.57670974731445,
"learning_rate": 1.8757687576875773e-05,
"loss": 1.0204,
"step": 305
},
{
"epoch": 0.22583025830258302,
"grad_norm": 34.495235443115234,
"learning_rate": 1.881918819188192e-05,
"loss": 1.1887,
"step": 306
},
{
"epoch": 0.22656826568265684,
"grad_norm": 36.18799591064453,
"learning_rate": 1.888068880688807e-05,
"loss": 0.8969,
"step": 307
},
{
"epoch": 0.22730627306273063,
"grad_norm": 35.36030960083008,
"learning_rate": 1.894218942189422e-05,
"loss": 1.1272,
"step": 308
},
{
"epoch": 0.22804428044280442,
"grad_norm": 34.50253677368164,
"learning_rate": 1.9003690036900372e-05,
"loss": 0.9908,
"step": 309
},
{
"epoch": 0.22878228782287824,
"grad_norm": 33.003875732421875,
"learning_rate": 1.906519065190652e-05,
"loss": 1.0323,
"step": 310
},
{
"epoch": 0.22952029520295203,
"grad_norm": 33.84071731567383,
"learning_rate": 1.912669126691267e-05,
"loss": 1.0924,
"step": 311
},
{
"epoch": 0.23025830258302582,
"grad_norm": 37.590694427490234,
"learning_rate": 1.918819188191882e-05,
"loss": 1.0558,
"step": 312
},
{
"epoch": 0.23099630996309964,
"grad_norm": 1048.1514892578125,
"learning_rate": 1.924969249692497e-05,
"loss": 0.9793,
"step": 313
},
{
"epoch": 0.23173431734317343,
"grad_norm": 32.7579460144043,
"learning_rate": 1.931119311193112e-05,
"loss": 0.9345,
"step": 314
},
{
"epoch": 0.23247232472324722,
"grad_norm": 41.32646942138672,
"learning_rate": 1.937269372693727e-05,
"loss": 1.0441,
"step": 315
},
{
"epoch": 0.23321033210332104,
"grad_norm": 39.139198303222656,
"learning_rate": 1.943419434194342e-05,
"loss": 1.0545,
"step": 316
},
{
"epoch": 0.23394833948339483,
"grad_norm": 35.99794006347656,
"learning_rate": 1.949569495694957e-05,
"loss": 1.1261,
"step": 317
},
{
"epoch": 0.23468634686346865,
"grad_norm": 34.20968246459961,
"learning_rate": 1.955719557195572e-05,
"loss": 0.9836,
"step": 318
},
{
"epoch": 0.23542435424354244,
"grad_norm": 33.9476203918457,
"learning_rate": 1.961869618696187e-05,
"loss": 1.0345,
"step": 319
},
{
"epoch": 0.23616236162361623,
"grad_norm": 35.6599235534668,
"learning_rate": 1.968019680196802e-05,
"loss": 1.0316,
"step": 320
},
{
"epoch": 0.23690036900369005,
"grad_norm": 34.30624008178711,
"learning_rate": 1.974169741697417e-05,
"loss": 0.9987,
"step": 321
},
{
"epoch": 0.23763837638376384,
"grad_norm": 34.07005310058594,
"learning_rate": 1.980319803198032e-05,
"loss": 1.0052,
"step": 322
},
{
"epoch": 0.23837638376383763,
"grad_norm": 33.085777282714844,
"learning_rate": 1.986469864698647e-05,
"loss": 1.1424,
"step": 323
},
{
"epoch": 0.23911439114391145,
"grad_norm": 34.74597930908203,
"learning_rate": 1.992619926199262e-05,
"loss": 1.1401,
"step": 324
},
{
"epoch": 0.23985239852398524,
"grad_norm": 36.55511474609375,
"learning_rate": 1.9987699876998773e-05,
"loss": 0.9437,
"step": 325
},
{
"epoch": 0.24059040590405903,
"grad_norm": 35.86470031738281,
"learning_rate": 2.0049200492004923e-05,
"loss": 1.1535,
"step": 326
},
{
"epoch": 0.24132841328413285,
"grad_norm": 33.983421325683594,
"learning_rate": 2.011070110701107e-05,
"loss": 1.1367,
"step": 327
},
{
"epoch": 0.24206642066420664,
"grad_norm": 36.45722198486328,
"learning_rate": 2.017220172201722e-05,
"loss": 1.0917,
"step": 328
},
{
"epoch": 0.24280442804428043,
"grad_norm": 36.953060150146484,
"learning_rate": 2.0233702337023372e-05,
"loss": 1.0107,
"step": 329
},
{
"epoch": 0.24354243542435425,
"grad_norm": 37.92033004760742,
"learning_rate": 2.0295202952029522e-05,
"loss": 1.2084,
"step": 330
},
{
"epoch": 0.24428044280442804,
"grad_norm": 31.74508285522461,
"learning_rate": 2.035670356703567e-05,
"loss": 1.0421,
"step": 331
},
{
"epoch": 0.24501845018450186,
"grad_norm": 37.19945526123047,
"learning_rate": 2.0418204182041822e-05,
"loss": 1.082,
"step": 332
},
{
"epoch": 0.24575645756457565,
"grad_norm": 32.649444580078125,
"learning_rate": 2.0479704797047972e-05,
"loss": 1.1345,
"step": 333
},
{
"epoch": 0.24649446494464944,
"grad_norm": 36.957977294921875,
"learning_rate": 2.054120541205412e-05,
"loss": 1.0192,
"step": 334
},
{
"epoch": 0.24723247232472326,
"grad_norm": 32.36549377441406,
"learning_rate": 2.060270602706027e-05,
"loss": 1.1387,
"step": 335
},
{
"epoch": 0.24797047970479705,
"grad_norm": 30.191532135009766,
"learning_rate": 2.066420664206642e-05,
"loss": 1.0083,
"step": 336
},
{
"epoch": 0.24870848708487084,
"grad_norm": 31.56035804748535,
"learning_rate": 2.072570725707257e-05,
"loss": 1.1246,
"step": 337
},
{
"epoch": 0.24944649446494466,
"grad_norm": 36.50621032714844,
"learning_rate": 2.078720787207872e-05,
"loss": 1.1289,
"step": 338
},
{
"epoch": 0.25018450184501845,
"grad_norm": 32.51582336425781,
"learning_rate": 2.084870848708487e-05,
"loss": 0.9957,
"step": 339
},
{
"epoch": 0.25092250922509224,
"grad_norm": 40.50331115722656,
"learning_rate": 2.091020910209102e-05,
"loss": 1.0179,
"step": 340
},
{
"epoch": 0.25166051660516603,
"grad_norm": 38.418792724609375,
"learning_rate": 2.097170971709717e-05,
"loss": 1.1074,
"step": 341
},
{
"epoch": 0.2523985239852399,
"grad_norm": 33.0310173034668,
"learning_rate": 2.1033210332103324e-05,
"loss": 1.1623,
"step": 342
},
{
"epoch": 0.25313653136531367,
"grad_norm": 30.66373062133789,
"learning_rate": 2.1094710947109474e-05,
"loss": 0.9796,
"step": 343
},
{
"epoch": 0.25387453874538746,
"grad_norm": 30.335712432861328,
"learning_rate": 2.115621156211562e-05,
"loss": 1.0376,
"step": 344
},
{
"epoch": 0.25461254612546125,
"grad_norm": 33.595855712890625,
"learning_rate": 2.1217712177121773e-05,
"loss": 1.0289,
"step": 345
},
{
"epoch": 0.25535055350553504,
"grad_norm": 30.422454833984375,
"learning_rate": 2.1279212792127923e-05,
"loss": 1.0815,
"step": 346
},
{
"epoch": 0.25608856088560883,
"grad_norm": 38.317386627197266,
"learning_rate": 2.1340713407134073e-05,
"loss": 1.0096,
"step": 347
},
{
"epoch": 0.2568265682656827,
"grad_norm": 36.44529342651367,
"learning_rate": 2.140221402214022e-05,
"loss": 1.006,
"step": 348
},
{
"epoch": 0.25756457564575647,
"grad_norm": 33.271060943603516,
"learning_rate": 2.1463714637146373e-05,
"loss": 0.9819,
"step": 349
},
{
"epoch": 0.25830258302583026,
"grad_norm": 35.99654769897461,
"learning_rate": 2.1525215252152523e-05,
"loss": 1.1038,
"step": 350
},
{
"epoch": 0.25904059040590405,
"grad_norm": 34.73610305786133,
"learning_rate": 2.1586715867158673e-05,
"loss": 1.065,
"step": 351
},
{
"epoch": 0.25977859778597784,
"grad_norm": 37.899776458740234,
"learning_rate": 2.1648216482164822e-05,
"loss": 1.1092,
"step": 352
},
{
"epoch": 0.2605166051660517,
"grad_norm": 36.49541473388672,
"learning_rate": 2.1709717097170972e-05,
"loss": 1.1665,
"step": 353
},
{
"epoch": 0.2612546125461255,
"grad_norm": 35.63615798950195,
"learning_rate": 2.1771217712177122e-05,
"loss": 1.1201,
"step": 354
},
{
"epoch": 0.26199261992619927,
"grad_norm": 34.21985626220703,
"learning_rate": 2.1832718327183275e-05,
"loss": 1.0518,
"step": 355
},
{
"epoch": 0.26273062730627306,
"grad_norm": 33.33612823486328,
"learning_rate": 2.1894218942189422e-05,
"loss": 0.964,
"step": 356
},
{
"epoch": 0.26346863468634685,
"grad_norm": 33.31211471557617,
"learning_rate": 2.195571955719557e-05,
"loss": 1.0508,
"step": 357
},
{
"epoch": 0.26420664206642064,
"grad_norm": 32.13766860961914,
"learning_rate": 2.201722017220172e-05,
"loss": 1.1904,
"step": 358
},
{
"epoch": 0.2649446494464945,
"grad_norm": 38.23426818847656,
"learning_rate": 2.2078720787207875e-05,
"loss": 1.044,
"step": 359
},
{
"epoch": 0.2656826568265683,
"grad_norm": 30.594451904296875,
"learning_rate": 2.2140221402214025e-05,
"loss": 0.8797,
"step": 360
},
{
"epoch": 0.26642066420664207,
"grad_norm": 33.05818557739258,
"learning_rate": 2.220172201722017e-05,
"loss": 1.1213,
"step": 361
},
{
"epoch": 0.26715867158671586,
"grad_norm": 31.24005126953125,
"learning_rate": 2.2263222632226324e-05,
"loss": 1.1148,
"step": 362
},
{
"epoch": 0.26789667896678965,
"grad_norm": 33.34355926513672,
"learning_rate": 2.2324723247232474e-05,
"loss": 1.0186,
"step": 363
},
{
"epoch": 0.2686346863468635,
"grad_norm": 32.711002349853516,
"learning_rate": 2.2386223862238624e-05,
"loss": 1.0628,
"step": 364
},
{
"epoch": 0.2693726937269373,
"grad_norm": 31.853166580200195,
"learning_rate": 2.2447724477244774e-05,
"loss": 1.0366,
"step": 365
},
{
"epoch": 0.2701107011070111,
"grad_norm": 32.53550720214844,
"learning_rate": 2.2509225092250924e-05,
"loss": 1.076,
"step": 366
},
{
"epoch": 0.27084870848708487,
"grad_norm": 29.53455924987793,
"learning_rate": 2.2570725707257074e-05,
"loss": 1.0598,
"step": 367
},
{
"epoch": 0.27158671586715866,
"grad_norm": 34.44631576538086,
"learning_rate": 2.2632226322263223e-05,
"loss": 1.1174,
"step": 368
},
{
"epoch": 0.27232472324723245,
"grad_norm": 33.80080032348633,
"learning_rate": 2.2693726937269373e-05,
"loss": 1.205,
"step": 369
},
{
"epoch": 0.2730627306273063,
"grad_norm": 33.64272689819336,
"learning_rate": 2.2755227552275523e-05,
"loss": 1.1677,
"step": 370
},
{
"epoch": 0.2738007380073801,
"grad_norm": 32.4225959777832,
"learning_rate": 2.2816728167281673e-05,
"loss": 0.9153,
"step": 371
},
{
"epoch": 0.2745387453874539,
"grad_norm": 32.35124969482422,
"learning_rate": 2.2878228782287826e-05,
"loss": 1.0536,
"step": 372
},
{
"epoch": 0.27527675276752767,
"grad_norm": 32.049827575683594,
"learning_rate": 2.2939729397293973e-05,
"loss": 1.1493,
"step": 373
},
{
"epoch": 0.27601476014760146,
"grad_norm": 29.892070770263672,
"learning_rate": 2.3001230012300123e-05,
"loss": 1.0047,
"step": 374
},
{
"epoch": 0.2767527675276753,
"grad_norm": 30.831012725830078,
"learning_rate": 2.3062730627306276e-05,
"loss": 1.0843,
"step": 375
},
{
"epoch": 0.2774907749077491,
"grad_norm": 31.903175354003906,
"learning_rate": 2.3124231242312426e-05,
"loss": 1.0552,
"step": 376
},
{
"epoch": 0.2782287822878229,
"grad_norm": 31.119150161743164,
"learning_rate": 2.3185731857318575e-05,
"loss": 1.0762,
"step": 377
},
{
"epoch": 0.2789667896678967,
"grad_norm": 34.476524353027344,
"learning_rate": 2.3247232472324722e-05,
"loss": 0.925,
"step": 378
},
{
"epoch": 0.27970479704797047,
"grad_norm": 33.33213806152344,
"learning_rate": 2.3308733087330875e-05,
"loss": 1.0427,
"step": 379
},
{
"epoch": 0.28044280442804426,
"grad_norm": 30.07733917236328,
"learning_rate": 2.3370233702337025e-05,
"loss": 1.1158,
"step": 380
},
{
"epoch": 0.2811808118081181,
"grad_norm": 36.79194259643555,
"learning_rate": 2.3431734317343175e-05,
"loss": 0.969,
"step": 381
},
{
"epoch": 0.2819188191881919,
"grad_norm": 32.193233489990234,
"learning_rate": 2.3493234932349325e-05,
"loss": 0.938,
"step": 382
},
{
"epoch": 0.2826568265682657,
"grad_norm": 35.39616394042969,
"learning_rate": 2.3554735547355475e-05,
"loss": 1.0384,
"step": 383
},
{
"epoch": 0.2833948339483395,
"grad_norm": 32.57839584350586,
"learning_rate": 2.3616236162361624e-05,
"loss": 1.0573,
"step": 384
},
{
"epoch": 0.28413284132841327,
"grad_norm": 34.920528411865234,
"learning_rate": 2.3677736777367778e-05,
"loss": 0.9427,
"step": 385
},
{
"epoch": 0.2848708487084871,
"grad_norm": 34.9754753112793,
"learning_rate": 2.3739237392373924e-05,
"loss": 1.0893,
"step": 386
},
{
"epoch": 0.2856088560885609,
"grad_norm": 31.592897415161133,
"learning_rate": 2.3800738007380074e-05,
"loss": 1.1378,
"step": 387
},
{
"epoch": 0.2863468634686347,
"grad_norm": 32.26739501953125,
"learning_rate": 2.3862238622386224e-05,
"loss": 1.0627,
"step": 388
},
{
"epoch": 0.2870848708487085,
"grad_norm": 30.732433319091797,
"learning_rate": 2.3923739237392377e-05,
"loss": 1.0358,
"step": 389
},
{
"epoch": 0.2878228782287823,
"grad_norm": 34.005191802978516,
"learning_rate": 2.3985239852398524e-05,
"loss": 1.1111,
"step": 390
},
{
"epoch": 0.28856088560885607,
"grad_norm": 30.67894744873047,
"learning_rate": 2.4046740467404673e-05,
"loss": 0.9718,
"step": 391
},
{
"epoch": 0.2892988929889299,
"grad_norm": 28.351181030273438,
"learning_rate": 2.4108241082410827e-05,
"loss": 1.0609,
"step": 392
},
{
"epoch": 0.2900369003690037,
"grad_norm": 32.102474212646484,
"learning_rate": 2.4169741697416977e-05,
"loss": 1.1381,
"step": 393
},
{
"epoch": 0.2907749077490775,
"grad_norm": 33.687625885009766,
"learning_rate": 2.4231242312423126e-05,
"loss": 1.1188,
"step": 394
},
{
"epoch": 0.2915129151291513,
"grad_norm": 33.333797454833984,
"learning_rate": 2.4292742927429276e-05,
"loss": 1.1755,
"step": 395
},
{
"epoch": 0.2922509225092251,
"grad_norm": 29.862483978271484,
"learning_rate": 2.4354243542435426e-05,
"loss": 0.9939,
"step": 396
},
{
"epoch": 0.29298892988929887,
"grad_norm": 34.118682861328125,
"learning_rate": 2.4415744157441576e-05,
"loss": 1.0769,
"step": 397
},
{
"epoch": 0.2937269372693727,
"grad_norm": 31.04990005493164,
"learning_rate": 2.4477244772447726e-05,
"loss": 0.9994,
"step": 398
},
{
"epoch": 0.2944649446494465,
"grad_norm": 31.455734252929688,
"learning_rate": 2.4538745387453876e-05,
"loss": 1.052,
"step": 399
},
{
"epoch": 0.2952029520295203,
"grad_norm": 33.53933334350586,
"learning_rate": 2.4600246002460025e-05,
"loss": 1.0479,
"step": 400
},
{
"epoch": 0.2952029520295203,
"eval_loss": 1.3168951272964478,
"eval_runtime": 307.3734,
"eval_samples_per_second": 3.738,
"eval_steps_per_second": 0.312,
"step": 400
},
{
"epoch": 0.2959409594095941,
"grad_norm": 30.59261703491211,
"learning_rate": 2.4661746617466175e-05,
"loss": 1.0978,
"step": 401
},
{
"epoch": 0.2966789667896679,
"grad_norm": 30.34042739868164,
"learning_rate": 2.472324723247233e-05,
"loss": 0.9811,
"step": 402
},
{
"epoch": 0.2974169741697417,
"grad_norm": 30.172008514404297,
"learning_rate": 2.4784747847478475e-05,
"loss": 1.1006,
"step": 403
},
{
"epoch": 0.2981549815498155,
"grad_norm": 34.521026611328125,
"learning_rate": 2.4846248462484625e-05,
"loss": 1.0414,
"step": 404
},
{
"epoch": 0.2988929889298893,
"grad_norm": 32.659603118896484,
"learning_rate": 2.4907749077490778e-05,
"loss": 1.0581,
"step": 405
},
{
"epoch": 0.2996309963099631,
"grad_norm": 30.84364128112793,
"learning_rate": 2.4969249692496928e-05,
"loss": 1.0734,
"step": 406
},
{
"epoch": 0.3003690036900369,
"grad_norm": 31.31522560119629,
"learning_rate": 2.5030750307503074e-05,
"loss": 1.1324,
"step": 407
},
{
"epoch": 0.3011070110701107,
"grad_norm": 30.90158462524414,
"learning_rate": 2.5092250922509224e-05,
"loss": 1.0875,
"step": 408
},
{
"epoch": 0.3018450184501845,
"grad_norm": 32.63178634643555,
"learning_rate": 2.5153751537515374e-05,
"loss": 0.947,
"step": 409
},
{
"epoch": 0.3025830258302583,
"grad_norm": 31.25884246826172,
"learning_rate": 2.5215252152521527e-05,
"loss": 0.9885,
"step": 410
},
{
"epoch": 0.3033210332103321,
"grad_norm": 31.27341651916504,
"learning_rate": 2.5276752767527677e-05,
"loss": 1.0252,
"step": 411
},
{
"epoch": 0.3040590405904059,
"grad_norm": 32.48451232910156,
"learning_rate": 2.5338253382533827e-05,
"loss": 0.9561,
"step": 412
},
{
"epoch": 0.3047970479704797,
"grad_norm": 32.380348205566406,
"learning_rate": 2.5399753997539977e-05,
"loss": 1.0956,
"step": 413
},
{
"epoch": 0.30553505535055353,
"grad_norm": 35.79043960571289,
"learning_rate": 2.5461254612546127e-05,
"loss": 0.9773,
"step": 414
},
{
"epoch": 0.3062730627306273,
"grad_norm": 32.07080078125,
"learning_rate": 2.5522755227552277e-05,
"loss": 0.9709,
"step": 415
},
{
"epoch": 0.3070110701107011,
"grad_norm": 30.587440490722656,
"learning_rate": 2.5584255842558423e-05,
"loss": 1.1602,
"step": 416
},
{
"epoch": 0.3077490774907749,
"grad_norm": 32.147560119628906,
"learning_rate": 2.564575645756458e-05,
"loss": 1.0212,
"step": 417
},
{
"epoch": 0.3084870848708487,
"grad_norm": 28.960500717163086,
"learning_rate": 2.570725707257073e-05,
"loss": 1.0724,
"step": 418
},
{
"epoch": 0.3092250922509225,
"grad_norm": 31.89568519592285,
"learning_rate": 2.5768757687576876e-05,
"loss": 1.0993,
"step": 419
},
{
"epoch": 0.30996309963099633,
"grad_norm": 28.9609317779541,
"learning_rate": 2.5830258302583026e-05,
"loss": 1.0213,
"step": 420
},
{
"epoch": 0.3107011070110701,
"grad_norm": 32.195152282714844,
"learning_rate": 2.5891758917589176e-05,
"loss": 1.1312,
"step": 421
},
{
"epoch": 0.3114391143911439,
"grad_norm": 32.34213638305664,
"learning_rate": 2.5953259532595326e-05,
"loss": 1.1141,
"step": 422
},
{
"epoch": 0.3121771217712177,
"grad_norm": 30.041015625,
"learning_rate": 2.6014760147601475e-05,
"loss": 1.0912,
"step": 423
},
{
"epoch": 0.3129151291512915,
"grad_norm": 34.097068786621094,
"learning_rate": 2.607626076260763e-05,
"loss": 1.1144,
"step": 424
},
{
"epoch": 0.31365313653136534,
"grad_norm": 33.118072509765625,
"learning_rate": 2.613776137761378e-05,
"loss": 1.0424,
"step": 425
},
{
"epoch": 0.31439114391143913,
"grad_norm": 32.24378967285156,
"learning_rate": 2.619926199261993e-05,
"loss": 1.1218,
"step": 426
},
{
"epoch": 0.3151291512915129,
"grad_norm": 29.910358428955078,
"learning_rate": 2.6260762607626078e-05,
"loss": 1.0043,
"step": 427
},
{
"epoch": 0.3158671586715867,
"grad_norm": 28.464271545410156,
"learning_rate": 2.6322263222632225e-05,
"loss": 1.0377,
"step": 428
},
{
"epoch": 0.3166051660516605,
"grad_norm": 33.54305648803711,
"learning_rate": 2.6383763837638375e-05,
"loss": 1.0836,
"step": 429
},
{
"epoch": 0.3173431734317343,
"grad_norm": 33.36182403564453,
"learning_rate": 2.644526445264453e-05,
"loss": 0.935,
"step": 430
},
{
"epoch": 0.31808118081180814,
"grad_norm": 30.69318962097168,
"learning_rate": 2.650676506765068e-05,
"loss": 0.9393,
"step": 431
},
{
"epoch": 0.31881918819188193,
"grad_norm": 31.307289123535156,
"learning_rate": 2.6568265682656828e-05,
"loss": 1.0578,
"step": 432
},
{
"epoch": 0.3195571955719557,
"grad_norm": 30.9537353515625,
"learning_rate": 2.6629766297662977e-05,
"loss": 0.978,
"step": 433
},
{
"epoch": 0.3202952029520295,
"grad_norm": 34.1992073059082,
"learning_rate": 2.6691266912669127e-05,
"loss": 1.109,
"step": 434
},
{
"epoch": 0.3210332103321033,
"grad_norm": 35.864681243896484,
"learning_rate": 2.6752767527675277e-05,
"loss": 1.0984,
"step": 435
},
{
"epoch": 0.32177121771217715,
"grad_norm": 37.84678649902344,
"learning_rate": 2.6814268142681427e-05,
"loss": 1.1034,
"step": 436
},
{
"epoch": 0.32250922509225094,
"grad_norm": 32.07746124267578,
"learning_rate": 2.687576875768758e-05,
"loss": 0.9589,
"step": 437
},
{
"epoch": 0.32324723247232473,
"grad_norm": 30.982397079467773,
"learning_rate": 2.693726937269373e-05,
"loss": 0.957,
"step": 438
},
{
"epoch": 0.3239852398523985,
"grad_norm": 32.20938491821289,
"learning_rate": 2.699876998769988e-05,
"loss": 1.0958,
"step": 439
},
{
"epoch": 0.3247232472324723,
"grad_norm": 30.640172958374023,
"learning_rate": 2.706027060270603e-05,
"loss": 1.0231,
"step": 440
},
{
"epoch": 0.3254612546125461,
"grad_norm": 31.90199851989746,
"learning_rate": 2.7121771217712176e-05,
"loss": 1.1002,
"step": 441
},
{
"epoch": 0.32619926199261995,
"grad_norm": 30.51987075805664,
"learning_rate": 2.7183271832718326e-05,
"loss": 1.162,
"step": 442
},
{
"epoch": 0.32693726937269374,
"grad_norm": 31.501314163208008,
"learning_rate": 2.7244772447724476e-05,
"loss": 1.0607,
"step": 443
},
{
"epoch": 0.32767527675276753,
"grad_norm": 28.6356143951416,
"learning_rate": 2.730627306273063e-05,
"loss": 0.9796,
"step": 444
},
{
"epoch": 0.3284132841328413,
"grad_norm": 31.74925422668457,
"learning_rate": 2.736777367773678e-05,
"loss": 1.1158,
"step": 445
},
{
"epoch": 0.3291512915129151,
"grad_norm": 34.154579162597656,
"learning_rate": 2.742927429274293e-05,
"loss": 1.0649,
"step": 446
},
{
"epoch": 0.3298892988929889,
"grad_norm": 32.25503158569336,
"learning_rate": 2.749077490774908e-05,
"loss": 1.1914,
"step": 447
},
{
"epoch": 0.33062730627306275,
"grad_norm": 37.06145477294922,
"learning_rate": 2.755227552275523e-05,
"loss": 1.0985,
"step": 448
},
{
"epoch": 0.33136531365313654,
"grad_norm": 31.48094367980957,
"learning_rate": 2.761377613776138e-05,
"loss": 1.0892,
"step": 449
},
{
"epoch": 0.33210332103321033,
"grad_norm": 32.612770080566406,
"learning_rate": 2.767527675276753e-05,
"loss": 1.0109,
"step": 450
},
{
"epoch": 0.3328413284132841,
"grad_norm": 31.58296775817871,
"learning_rate": 2.773677736777368e-05,
"loss": 0.97,
"step": 451
},
{
"epoch": 0.3335793357933579,
"grad_norm": 34.60434341430664,
"learning_rate": 2.779827798277983e-05,
"loss": 1.0432,
"step": 452
},
{
"epoch": 0.33431734317343176,
"grad_norm": 34.914894104003906,
"learning_rate": 2.7859778597785978e-05,
"loss": 1.1001,
"step": 453
},
{
"epoch": 0.33505535055350555,
"grad_norm": 35.59685134887695,
"learning_rate": 2.7921279212792128e-05,
"loss": 1.2244,
"step": 454
},
{
"epoch": 0.33579335793357934,
"grad_norm": 29.713642120361328,
"learning_rate": 2.7982779827798277e-05,
"loss": 0.9019,
"step": 455
},
{
"epoch": 0.33653136531365313,
"grad_norm": 31.13001823425293,
"learning_rate": 2.8044280442804427e-05,
"loss": 1.0366,
"step": 456
},
{
"epoch": 0.3372693726937269,
"grad_norm": 30.281965255737305,
"learning_rate": 2.810578105781058e-05,
"loss": 1.0273,
"step": 457
},
{
"epoch": 0.3380073800738007,
"grad_norm": 31.66211700439453,
"learning_rate": 2.816728167281673e-05,
"loss": 1.1194,
"step": 458
},
{
"epoch": 0.33874538745387456,
"grad_norm": 30.275386810302734,
"learning_rate": 2.822878228782288e-05,
"loss": 1.0575,
"step": 459
},
{
"epoch": 0.33948339483394835,
"grad_norm": 29.42925453186035,
"learning_rate": 2.829028290282903e-05,
"loss": 0.9656,
"step": 460
},
{
"epoch": 0.34022140221402214,
"grad_norm": 32.71029281616211,
"learning_rate": 2.835178351783518e-05,
"loss": 1.1847,
"step": 461
},
{
"epoch": 0.34095940959409593,
"grad_norm": 29.633073806762695,
"learning_rate": 2.8413284132841326e-05,
"loss": 1.0942,
"step": 462
},
{
"epoch": 0.3416974169741697,
"grad_norm": 31.828601837158203,
"learning_rate": 2.8474784747847476e-05,
"loss": 1.0376,
"step": 463
},
{
"epoch": 0.34243542435424357,
"grad_norm": 30.043981552124023,
"learning_rate": 2.8536285362853633e-05,
"loss": 1.0835,
"step": 464
},
{
"epoch": 0.34317343173431736,
"grad_norm": 33.54213333129883,
"learning_rate": 2.8597785977859783e-05,
"loss": 0.996,
"step": 465
},
{
"epoch": 0.34391143911439115,
"grad_norm": 29.244539260864258,
"learning_rate": 2.865928659286593e-05,
"loss": 1.0677,
"step": 466
},
{
"epoch": 0.34464944649446494,
"grad_norm": 30.86827278137207,
"learning_rate": 2.872078720787208e-05,
"loss": 0.9887,
"step": 467
},
{
"epoch": 0.34538745387453873,
"grad_norm": 31.78754997253418,
"learning_rate": 2.878228782287823e-05,
"loss": 0.9915,
"step": 468
},
{
"epoch": 0.3461254612546125,
"grad_norm": 32.79195785522461,
"learning_rate": 2.884378843788438e-05,
"loss": 1.1147,
"step": 469
},
{
"epoch": 0.34686346863468637,
"grad_norm": 33.397979736328125,
"learning_rate": 2.8905289052890532e-05,
"loss": 0.9495,
"step": 470
},
{
"epoch": 0.34760147601476016,
"grad_norm": 33.192649841308594,
"learning_rate": 2.8966789667896682e-05,
"loss": 1.0026,
"step": 471
},
{
"epoch": 0.34833948339483395,
"grad_norm": 32.53486251831055,
"learning_rate": 2.9028290282902832e-05,
"loss": 1.0896,
"step": 472
},
{
"epoch": 0.34907749077490774,
"grad_norm": 29.988269805908203,
"learning_rate": 2.908979089790898e-05,
"loss": 1.0286,
"step": 473
},
{
"epoch": 0.34981549815498153,
"grad_norm": 30.389328002929688,
"learning_rate": 2.915129151291513e-05,
"loss": 1.0617,
"step": 474
},
{
"epoch": 0.3505535055350554,
"grad_norm": 32.341678619384766,
"learning_rate": 2.9212792127921278e-05,
"loss": 0.9784,
"step": 475
},
{
"epoch": 0.35129151291512917,
"grad_norm": 34.1507453918457,
"learning_rate": 2.9274292742927428e-05,
"loss": 1.1268,
"step": 476
},
{
"epoch": 0.35202952029520296,
"grad_norm": 30.625898361206055,
"learning_rate": 2.9335793357933584e-05,
"loss": 1.1621,
"step": 477
},
{
"epoch": 0.35276752767527675,
"grad_norm": 29.35662841796875,
"learning_rate": 2.939729397293973e-05,
"loss": 0.9967,
"step": 478
},
{
"epoch": 0.35350553505535054,
"grad_norm": 28.236364364624023,
"learning_rate": 2.945879458794588e-05,
"loss": 1.0189,
"step": 479
},
{
"epoch": 0.35424354243542433,
"grad_norm": 29.935972213745117,
"learning_rate": 2.952029520295203e-05,
"loss": 1.1403,
"step": 480
},
{
"epoch": 0.3549815498154982,
"grad_norm": 30.732343673706055,
"learning_rate": 2.958179581795818e-05,
"loss": 1.0329,
"step": 481
},
{
"epoch": 0.35571955719557197,
"grad_norm": 27.611663818359375,
"learning_rate": 2.964329643296433e-05,
"loss": 0.9701,
"step": 482
},
{
"epoch": 0.35645756457564576,
"grad_norm": 26.146472930908203,
"learning_rate": 2.970479704797048e-05,
"loss": 1.0555,
"step": 483
},
{
"epoch": 0.35719557195571955,
"grad_norm": 27.38328742980957,
"learning_rate": 2.9766297662976633e-05,
"loss": 1.0839,
"step": 484
},
{
"epoch": 0.35793357933579334,
"grad_norm": 30.21470832824707,
"learning_rate": 2.9827798277982783e-05,
"loss": 0.9601,
"step": 485
},
{
"epoch": 0.3586715867158672,
"grad_norm": 33.275665283203125,
"learning_rate": 2.9889298892988933e-05,
"loss": 0.9648,
"step": 486
},
{
"epoch": 0.359409594095941,
"grad_norm": 32.144935607910156,
"learning_rate": 2.995079950799508e-05,
"loss": 1.0774,
"step": 487
},
{
"epoch": 0.36014760147601477,
"grad_norm": 33.03762435913086,
"learning_rate": 3.001230012300123e-05,
"loss": 1.0353,
"step": 488
},
{
"epoch": 0.36088560885608856,
"grad_norm": 29.72600555419922,
"learning_rate": 3.007380073800738e-05,
"loss": 1.0075,
"step": 489
},
{
"epoch": 0.36162361623616235,
"grad_norm": 31.551420211791992,
"learning_rate": 3.0135301353013536e-05,
"loss": 1.1612,
"step": 490
},
{
"epoch": 0.36236162361623614,
"grad_norm": 31.255245208740234,
"learning_rate": 3.0196801968019682e-05,
"loss": 1.1291,
"step": 491
},
{
"epoch": 0.36309963099631,
"grad_norm": 28.523984909057617,
"learning_rate": 3.0258302583025832e-05,
"loss": 0.8965,
"step": 492
},
{
"epoch": 0.3638376383763838,
"grad_norm": 27.026256561279297,
"learning_rate": 3.0319803198031982e-05,
"loss": 0.9842,
"step": 493
},
{
"epoch": 0.36457564575645757,
"grad_norm": 27.513683319091797,
"learning_rate": 3.0381303813038132e-05,
"loss": 1.0663,
"step": 494
},
{
"epoch": 0.36531365313653136,
"grad_norm": 28.917890548706055,
"learning_rate": 3.0442804428044282e-05,
"loss": 1.0083,
"step": 495
},
{
"epoch": 0.36605166051660515,
"grad_norm": 30.66982650756836,
"learning_rate": 3.0504305043050428e-05,
"loss": 1.065,
"step": 496
},
{
"epoch": 0.36678966789667894,
"grad_norm": 29.29199981689453,
"learning_rate": 3.056580565805658e-05,
"loss": 1.1113,
"step": 497
},
{
"epoch": 0.3675276752767528,
"grad_norm": 30.53307342529297,
"learning_rate": 3.0627306273062735e-05,
"loss": 1.0564,
"step": 498
},
{
"epoch": 0.3682656826568266,
"grad_norm": 27.8240909576416,
"learning_rate": 3.068880688806888e-05,
"loss": 1.086,
"step": 499
},
{
"epoch": 0.36900369003690037,
"grad_norm": 33.0767936706543,
"learning_rate": 3.0750307503075034e-05,
"loss": 1.0258,
"step": 500
},
{
"epoch": 0.36900369003690037,
"eval_loss": 1.317694902420044,
"eval_runtime": 307.5192,
"eval_samples_per_second": 3.736,
"eval_steps_per_second": 0.312,
"step": 500
},
{
"epoch": 0.36974169741697416,
"grad_norm": 29.415969848632812,
"learning_rate": 3.081180811808118e-05,
"loss": 1.1926,
"step": 501
},
{
"epoch": 0.37047970479704795,
"grad_norm": 28.967937469482422,
"learning_rate": 3.087330873308733e-05,
"loss": 1.0652,
"step": 502
},
{
"epoch": 0.3712177121771218,
"grad_norm": 30.757186889648438,
"learning_rate": 3.093480934809348e-05,
"loss": 1.0759,
"step": 503
},
{
"epoch": 0.3719557195571956,
"grad_norm": 29.12079429626465,
"learning_rate": 3.0996309963099634e-05,
"loss": 1.0171,
"step": 504
},
{
"epoch": 0.3726937269372694,
"grad_norm": 27.398155212402344,
"learning_rate": 3.105781057810579e-05,
"loss": 1.0255,
"step": 505
},
{
"epoch": 0.37343173431734317,
"grad_norm": 30.28290557861328,
"learning_rate": 3.1119311193111933e-05,
"loss": 1.0215,
"step": 506
},
{
"epoch": 0.37416974169741696,
"grad_norm": 32.874385833740234,
"learning_rate": 3.118081180811808e-05,
"loss": 0.9185,
"step": 507
},
{
"epoch": 0.37490774907749075,
"grad_norm": 32.606929779052734,
"learning_rate": 3.124231242312423e-05,
"loss": 0.921,
"step": 508
},
{
"epoch": 0.3756457564575646,
"grad_norm": 32.026466369628906,
"learning_rate": 3.130381303813038e-05,
"loss": 0.9647,
"step": 509
},
{
"epoch": 0.3763837638376384,
"grad_norm": 28.804256439208984,
"learning_rate": 3.136531365313653e-05,
"loss": 0.9783,
"step": 510
},
{
"epoch": 0.3771217712177122,
"grad_norm": 33.4760627746582,
"learning_rate": 3.1426814268142686e-05,
"loss": 1.1102,
"step": 511
},
{
"epoch": 0.37785977859778597,
"grad_norm": 27.7533016204834,
"learning_rate": 3.148831488314883e-05,
"loss": 1.0607,
"step": 512
},
{
"epoch": 0.37859778597785976,
"grad_norm": 30.21308135986328,
"learning_rate": 3.1549815498154986e-05,
"loss": 0.9921,
"step": 513
},
{
"epoch": 0.3793357933579336,
"grad_norm": 30.123981475830078,
"learning_rate": 3.161131611316113e-05,
"loss": 1.0603,
"step": 514
},
{
"epoch": 0.3800738007380074,
"grad_norm": 31.298110961914062,
"learning_rate": 3.167281672816728e-05,
"loss": 1.0396,
"step": 515
},
{
"epoch": 0.3808118081180812,
"grad_norm": 29.31854248046875,
"learning_rate": 3.173431734317343e-05,
"loss": 1.0797,
"step": 516
},
{
"epoch": 0.381549815498155,
"grad_norm": 32.191680908203125,
"learning_rate": 3.1795817958179585e-05,
"loss": 0.9568,
"step": 517
},
{
"epoch": 0.38228782287822877,
"grad_norm": 31.62862777709961,
"learning_rate": 3.185731857318573e-05,
"loss": 1.1659,
"step": 518
},
{
"epoch": 0.38302583025830256,
"grad_norm": 28.874908447265625,
"learning_rate": 3.1918819188191885e-05,
"loss": 1.0192,
"step": 519
},
{
"epoch": 0.3837638376383764,
"grad_norm": 28.602893829345703,
"learning_rate": 3.198031980319803e-05,
"loss": 1.064,
"step": 520
},
{
"epoch": 0.3845018450184502,
"grad_norm": 30.128530502319336,
"learning_rate": 3.2041820418204185e-05,
"loss": 0.9613,
"step": 521
},
{
"epoch": 0.385239852398524,
"grad_norm": 29.335969924926758,
"learning_rate": 3.210332103321033e-05,
"loss": 1.0305,
"step": 522
},
{
"epoch": 0.3859778597785978,
"grad_norm": 28.34609031677246,
"learning_rate": 3.2164821648216484e-05,
"loss": 1.1001,
"step": 523
},
{
"epoch": 0.38671586715867157,
"grad_norm": 29.133621215820312,
"learning_rate": 3.222632226322264e-05,
"loss": 1.0011,
"step": 524
},
{
"epoch": 0.3874538745387454,
"grad_norm": 29.79188346862793,
"learning_rate": 3.2287822878228784e-05,
"loss": 0.8858,
"step": 525
},
{
"epoch": 0.3881918819188192,
"grad_norm": 33.12505340576172,
"learning_rate": 3.234932349323494e-05,
"loss": 1.0749,
"step": 526
},
{
"epoch": 0.388929889298893,
"grad_norm": 28.103736877441406,
"learning_rate": 3.2410824108241084e-05,
"loss": 1.02,
"step": 527
},
{
"epoch": 0.3896678966789668,
"grad_norm": 29.42950439453125,
"learning_rate": 3.247232472324723e-05,
"loss": 1.0181,
"step": 528
},
{
"epoch": 0.3904059040590406,
"grad_norm": 28.812963485717773,
"learning_rate": 3.2533825338253383e-05,
"loss": 1.1254,
"step": 529
},
{
"epoch": 0.39114391143911437,
"grad_norm": 30.136219024658203,
"learning_rate": 3.259532595325954e-05,
"loss": 1.1222,
"step": 530
},
{
"epoch": 0.3918819188191882,
"grad_norm": 33.467960357666016,
"learning_rate": 3.265682656826568e-05,
"loss": 1.0028,
"step": 531
},
{
"epoch": 0.392619926199262,
"grad_norm": 32.62849044799805,
"learning_rate": 3.2718327183271836e-05,
"loss": 1.1019,
"step": 532
},
{
"epoch": 0.3933579335793358,
"grad_norm": 31.51215171813965,
"learning_rate": 3.277982779827798e-05,
"loss": 1.1408,
"step": 533
},
{
"epoch": 0.3940959409594096,
"grad_norm": 31.761720657348633,
"learning_rate": 3.2841328413284136e-05,
"loss": 0.9927,
"step": 534
},
{
"epoch": 0.3948339483394834,
"grad_norm": 28.129587173461914,
"learning_rate": 3.290282902829028e-05,
"loss": 0.9439,
"step": 535
},
{
"epoch": 0.3955719557195572,
"grad_norm": 31.913143157958984,
"learning_rate": 3.296432964329643e-05,
"loss": 1.0182,
"step": 536
},
{
"epoch": 0.396309963099631,
"grad_norm": 28.858692169189453,
"learning_rate": 3.302583025830259e-05,
"loss": 1.1423,
"step": 537
},
{
"epoch": 0.3970479704797048,
"grad_norm": 39.564964294433594,
"learning_rate": 3.3087330873308736e-05,
"loss": 1.0672,
"step": 538
},
{
"epoch": 0.3977859778597786,
"grad_norm": 35.25300216674805,
"learning_rate": 3.314883148831489e-05,
"loss": 1.0794,
"step": 539
},
{
"epoch": 0.3985239852398524,
"grad_norm": 28.474002838134766,
"learning_rate": 3.3210332103321035e-05,
"loss": 1.1484,
"step": 540
},
{
"epoch": 0.3992619926199262,
"grad_norm": 33.87021255493164,
"learning_rate": 3.327183271832718e-05,
"loss": 1.1114,
"step": 541
},
{
"epoch": 0.4,
"grad_norm": 28.42962074279785,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.0833,
"step": 542
},
{
"epoch": 0.4007380073800738,
"grad_norm": 30.21544075012207,
"learning_rate": 3.339483394833948e-05,
"loss": 1.1128,
"step": 543
},
{
"epoch": 0.4014760147601476,
"grad_norm": 29.623260498046875,
"learning_rate": 3.3456334563345635e-05,
"loss": 1.0984,
"step": 544
},
{
"epoch": 0.4022140221402214,
"grad_norm": 34.08790588378906,
"learning_rate": 3.351783517835179e-05,
"loss": 1.1091,
"step": 545
},
{
"epoch": 0.4029520295202952,
"grad_norm": 33.139915466308594,
"learning_rate": 3.3579335793357934e-05,
"loss": 1.0295,
"step": 546
},
{
"epoch": 0.40369003690036903,
"grad_norm": 35.7862663269043,
"learning_rate": 3.364083640836409e-05,
"loss": 1.1139,
"step": 547
},
{
"epoch": 0.4044280442804428,
"grad_norm": 28.253767013549805,
"learning_rate": 3.3702337023370234e-05,
"loss": 1.0673,
"step": 548
},
{
"epoch": 0.4051660516605166,
"grad_norm": 32.525115966796875,
"learning_rate": 3.376383763837638e-05,
"loss": 1.0096,
"step": 549
},
{
"epoch": 0.4059040590405904,
"grad_norm": 27.90035057067871,
"learning_rate": 3.382533825338254e-05,
"loss": 0.9391,
"step": 550
},
{
"epoch": 0.4066420664206642,
"grad_norm": 30.637134552001953,
"learning_rate": 3.388683886838869e-05,
"loss": 1.187,
"step": 551
},
{
"epoch": 0.407380073800738,
"grad_norm": 29.55883026123047,
"learning_rate": 3.3948339483394833e-05,
"loss": 1.0611,
"step": 552
},
{
"epoch": 0.40811808118081183,
"grad_norm": 30.938365936279297,
"learning_rate": 3.400984009840099e-05,
"loss": 1.0229,
"step": 553
},
{
"epoch": 0.4088560885608856,
"grad_norm": 29.01971435546875,
"learning_rate": 3.407134071340713e-05,
"loss": 1.1979,
"step": 554
},
{
"epoch": 0.4095940959409594,
"grad_norm": 28.88690185546875,
"learning_rate": 3.4132841328413286e-05,
"loss": 0.9652,
"step": 555
},
{
"epoch": 0.4103321033210332,
"grad_norm": 30.13008689880371,
"learning_rate": 3.419434194341943e-05,
"loss": 1.023,
"step": 556
},
{
"epoch": 0.411070110701107,
"grad_norm": 30.277244567871094,
"learning_rate": 3.4255842558425586e-05,
"loss": 1.0419,
"step": 557
},
{
"epoch": 0.4118081180811808,
"grad_norm": 31.770061492919922,
"learning_rate": 3.431734317343174e-05,
"loss": 1.1385,
"step": 558
},
{
"epoch": 0.41254612546125463,
"grad_norm": 28.85527992248535,
"learning_rate": 3.4378843788437886e-05,
"loss": 1.0576,
"step": 559
},
{
"epoch": 0.4132841328413284,
"grad_norm": 27.674936294555664,
"learning_rate": 3.444034440344404e-05,
"loss": 0.9695,
"step": 560
},
{
"epoch": 0.4140221402214022,
"grad_norm": 30.44672203063965,
"learning_rate": 3.4501845018450186e-05,
"loss": 1.1668,
"step": 561
},
{
"epoch": 0.414760147601476,
"grad_norm": 26.084020614624023,
"learning_rate": 3.456334563345633e-05,
"loss": 1.0662,
"step": 562
},
{
"epoch": 0.4154981549815498,
"grad_norm": 29.204233169555664,
"learning_rate": 3.4624846248462485e-05,
"loss": 1.0049,
"step": 563
},
{
"epoch": 0.41623616236162364,
"grad_norm": 31.064088821411133,
"learning_rate": 3.468634686346864e-05,
"loss": 1.0733,
"step": 564
},
{
"epoch": 0.41697416974169743,
"grad_norm": 28.714794158935547,
"learning_rate": 3.4747847478474785e-05,
"loss": 1.1252,
"step": 565
},
{
"epoch": 0.4177121771217712,
"grad_norm": 36.692623138427734,
"learning_rate": 3.480934809348094e-05,
"loss": 1.1517,
"step": 566
},
{
"epoch": 0.418450184501845,
"grad_norm": 29.342973709106445,
"learning_rate": 3.4870848708487085e-05,
"loss": 1.1617,
"step": 567
},
{
"epoch": 0.4191881918819188,
"grad_norm": 30.187889099121094,
"learning_rate": 3.493234932349324e-05,
"loss": 1.1766,
"step": 568
},
{
"epoch": 0.4199261992619926,
"grad_norm": 27.71148681640625,
"learning_rate": 3.4993849938499384e-05,
"loss": 1.1307,
"step": 569
},
{
"epoch": 0.42066420664206644,
"grad_norm": 26.817026138305664,
"learning_rate": 3.505535055350554e-05,
"loss": 1.1422,
"step": 570
},
{
"epoch": 0.42140221402214023,
"grad_norm": 29.25654411315918,
"learning_rate": 3.511685116851169e-05,
"loss": 1.0934,
"step": 571
},
{
"epoch": 0.422140221402214,
"grad_norm": 28.460424423217773,
"learning_rate": 3.517835178351784e-05,
"loss": 1.1308,
"step": 572
},
{
"epoch": 0.4228782287822878,
"grad_norm": 27.779157638549805,
"learning_rate": 3.5239852398523984e-05,
"loss": 1.0648,
"step": 573
},
{
"epoch": 0.4236162361623616,
"grad_norm": 32.28572082519531,
"learning_rate": 3.530135301353014e-05,
"loss": 1.0195,
"step": 574
},
{
"epoch": 0.42435424354243545,
"grad_norm": 30.577444076538086,
"learning_rate": 3.5362853628536283e-05,
"loss": 1.0581,
"step": 575
},
{
"epoch": 0.42509225092250924,
"grad_norm": 27.929576873779297,
"learning_rate": 3.542435424354244e-05,
"loss": 1.0838,
"step": 576
},
{
"epoch": 0.42583025830258303,
"grad_norm": 30.955745697021484,
"learning_rate": 3.548585485854859e-05,
"loss": 1.0065,
"step": 577
},
{
"epoch": 0.4265682656826568,
"grad_norm": 30.847639083862305,
"learning_rate": 3.5547355473554736e-05,
"loss": 1.0464,
"step": 578
},
{
"epoch": 0.4273062730627306,
"grad_norm": 26.83955192565918,
"learning_rate": 3.560885608856089e-05,
"loss": 1.0382,
"step": 579
},
{
"epoch": 0.4280442804428044,
"grad_norm": 28.2490177154541,
"learning_rate": 3.5670356703567036e-05,
"loss": 0.9712,
"step": 580
},
{
"epoch": 0.42878228782287825,
"grad_norm": 28.63175392150879,
"learning_rate": 3.573185731857319e-05,
"loss": 0.9944,
"step": 581
},
{
"epoch": 0.42952029520295204,
"grad_norm": 27.138669967651367,
"learning_rate": 3.5793357933579336e-05,
"loss": 1.1288,
"step": 582
},
{
"epoch": 0.43025830258302583,
"grad_norm": 28.75208282470703,
"learning_rate": 3.585485854858548e-05,
"loss": 1.0389,
"step": 583
},
{
"epoch": 0.4309963099630996,
"grad_norm": 29.765209197998047,
"learning_rate": 3.591635916359164e-05,
"loss": 1.0375,
"step": 584
},
{
"epoch": 0.4317343173431734,
"grad_norm": 31.77211570739746,
"learning_rate": 3.597785977859779e-05,
"loss": 1.1282,
"step": 585
},
{
"epoch": 0.43247232472324726,
"grad_norm": 28.593671798706055,
"learning_rate": 3.6039360393603935e-05,
"loss": 1.0487,
"step": 586
},
{
"epoch": 0.43321033210332105,
"grad_norm": 28.624773025512695,
"learning_rate": 3.610086100861009e-05,
"loss": 1.0686,
"step": 587
},
{
"epoch": 0.43394833948339484,
"grad_norm": 27.676698684692383,
"learning_rate": 3.6162361623616235e-05,
"loss": 1.1286,
"step": 588
},
{
"epoch": 0.43468634686346863,
"grad_norm": 28.334789276123047,
"learning_rate": 3.622386223862239e-05,
"loss": 1.0686,
"step": 589
},
{
"epoch": 0.4354243542435424,
"grad_norm": 24.738544464111328,
"learning_rate": 3.628536285362854e-05,
"loss": 0.8636,
"step": 590
},
{
"epoch": 0.4361623616236162,
"grad_norm": 29.112049102783203,
"learning_rate": 3.634686346863469e-05,
"loss": 0.9369,
"step": 591
},
{
"epoch": 0.43690036900369006,
"grad_norm": 29.67219352722168,
"learning_rate": 3.640836408364084e-05,
"loss": 1.0318,
"step": 592
},
{
"epoch": 0.43763837638376385,
"grad_norm": 32.45582580566406,
"learning_rate": 3.646986469864699e-05,
"loss": 1.0077,
"step": 593
},
{
"epoch": 0.43837638376383764,
"grad_norm": 30.126052856445312,
"learning_rate": 3.653136531365314e-05,
"loss": 1.1069,
"step": 594
},
{
"epoch": 0.43911439114391143,
"grad_norm": 30.43257713317871,
"learning_rate": 3.659286592865929e-05,
"loss": 1.0041,
"step": 595
},
{
"epoch": 0.4398523985239852,
"grad_norm": 28.884113311767578,
"learning_rate": 3.6654366543665434e-05,
"loss": 1.0136,
"step": 596
},
{
"epoch": 0.44059040590405907,
"grad_norm": 28.1043758392334,
"learning_rate": 3.6715867158671594e-05,
"loss": 1.0784,
"step": 597
},
{
"epoch": 0.44132841328413286,
"grad_norm": 29.222322463989258,
"learning_rate": 3.677736777367774e-05,
"loss": 0.994,
"step": 598
},
{
"epoch": 0.44206642066420665,
"grad_norm": 31.78004264831543,
"learning_rate": 3.683886838868389e-05,
"loss": 0.9989,
"step": 599
},
{
"epoch": 0.44280442804428044,
"grad_norm": 26.486068725585938,
"learning_rate": 3.690036900369004e-05,
"loss": 1.0936,
"step": 600
},
{
"epoch": 0.44280442804428044,
"eval_loss": 1.3267521858215332,
"eval_runtime": 309.179,
"eval_samples_per_second": 3.716,
"eval_steps_per_second": 0.31,
"step": 600
},
{
"epoch": 0.44354243542435423,
"grad_norm": 28.453187942504883,
"learning_rate": 3.6961869618696186e-05,
"loss": 1.0252,
"step": 601
},
{
"epoch": 0.444280442804428,
"grad_norm": 30.434410095214844,
"learning_rate": 3.702337023370234e-05,
"loss": 1.1192,
"step": 602
},
{
"epoch": 0.44501845018450187,
"grad_norm": 28.11585807800293,
"learning_rate": 3.7084870848708486e-05,
"loss": 0.9912,
"step": 603
},
{
"epoch": 0.44575645756457566,
"grad_norm": 32.852027893066406,
"learning_rate": 3.714637146371464e-05,
"loss": 0.9976,
"step": 604
},
{
"epoch": 0.44649446494464945,
"grad_norm": 26.785593032836914,
"learning_rate": 3.720787207872079e-05,
"loss": 1.0799,
"step": 605
},
{
"epoch": 0.44723247232472324,
"grad_norm": 28.873849868774414,
"learning_rate": 3.726937269372694e-05,
"loss": 1.0319,
"step": 606
},
{
"epoch": 0.44797047970479703,
"grad_norm": 31.951059341430664,
"learning_rate": 3.7330873308733085e-05,
"loss": 1.0665,
"step": 607
},
{
"epoch": 0.4487084870848708,
"grad_norm": 26.902822494506836,
"learning_rate": 3.739237392373924e-05,
"loss": 1.0973,
"step": 608
},
{
"epoch": 0.44944649446494467,
"grad_norm": 31.43962287902832,
"learning_rate": 3.7453874538745385e-05,
"loss": 0.997,
"step": 609
},
{
"epoch": 0.45018450184501846,
"grad_norm": 28.310514450073242,
"learning_rate": 3.7515375153751545e-05,
"loss": 1.1017,
"step": 610
},
{
"epoch": 0.45092250922509225,
"grad_norm": 26.364179611206055,
"learning_rate": 3.757687576875769e-05,
"loss": 1.087,
"step": 611
},
{
"epoch": 0.45166051660516604,
"grad_norm": 26.653833389282227,
"learning_rate": 3.763837638376384e-05,
"loss": 1.0061,
"step": 612
},
{
"epoch": 0.45239852398523983,
"grad_norm": 30.07135581970215,
"learning_rate": 3.769987699876999e-05,
"loss": 1.0957,
"step": 613
},
{
"epoch": 0.4531365313653137,
"grad_norm": 27.822776794433594,
"learning_rate": 3.776137761377614e-05,
"loss": 1.0992,
"step": 614
},
{
"epoch": 0.45387453874538747,
"grad_norm": 31.2148494720459,
"learning_rate": 3.782287822878229e-05,
"loss": 0.9902,
"step": 615
},
{
"epoch": 0.45461254612546126,
"grad_norm": 34.85270309448242,
"learning_rate": 3.788437884378844e-05,
"loss": 1.1163,
"step": 616
},
{
"epoch": 0.45535055350553505,
"grad_norm": 27.64411735534668,
"learning_rate": 3.794587945879459e-05,
"loss": 1.1273,
"step": 617
},
{
"epoch": 0.45608856088560884,
"grad_norm": 28.515451431274414,
"learning_rate": 3.8007380073800744e-05,
"loss": 1.0642,
"step": 618
},
{
"epoch": 0.45682656826568263,
"grad_norm": 34.522491455078125,
"learning_rate": 3.806888068880689e-05,
"loss": 0.9994,
"step": 619
},
{
"epoch": 0.4575645756457565,
"grad_norm": 30.255014419555664,
"learning_rate": 3.813038130381304e-05,
"loss": 1.091,
"step": 620
},
{
"epoch": 0.45830258302583027,
"grad_norm": 30.578969955444336,
"learning_rate": 3.819188191881919e-05,
"loss": 1.1066,
"step": 621
},
{
"epoch": 0.45904059040590406,
"grad_norm": 27.243410110473633,
"learning_rate": 3.825338253382534e-05,
"loss": 1.1007,
"step": 622
},
{
"epoch": 0.45977859778597785,
"grad_norm": 29.49376678466797,
"learning_rate": 3.831488314883149e-05,
"loss": 1.1645,
"step": 623
},
{
"epoch": 0.46051660516605164,
"grad_norm": 30.315433502197266,
"learning_rate": 3.837638376383764e-05,
"loss": 1.0911,
"step": 624
},
{
"epoch": 0.4612546125461255,
"grad_norm": 31.19307518005371,
"learning_rate": 3.843788437884379e-05,
"loss": 1.1693,
"step": 625
},
{
"epoch": 0.4619926199261993,
"grad_norm": 27.844942092895508,
"learning_rate": 3.849938499384994e-05,
"loss": 1.0592,
"step": 626
},
{
"epoch": 0.46273062730627307,
"grad_norm": 29.83812141418457,
"learning_rate": 3.856088560885609e-05,
"loss": 1.0263,
"step": 627
},
{
"epoch": 0.46346863468634686,
"grad_norm": 27.992292404174805,
"learning_rate": 3.862238622386224e-05,
"loss": 1.0808,
"step": 628
},
{
"epoch": 0.46420664206642065,
"grad_norm": 27.693565368652344,
"learning_rate": 3.868388683886839e-05,
"loss": 0.9699,
"step": 629
},
{
"epoch": 0.46494464944649444,
"grad_norm": 29.0965633392334,
"learning_rate": 3.874538745387454e-05,
"loss": 1.1051,
"step": 630
},
{
"epoch": 0.4656826568265683,
"grad_norm": 29.10242462158203,
"learning_rate": 3.8806888068880695e-05,
"loss": 1.0039,
"step": 631
},
{
"epoch": 0.4664206642066421,
"grad_norm": 32.43134307861328,
"learning_rate": 3.886838868388684e-05,
"loss": 1.064,
"step": 632
},
{
"epoch": 0.46715867158671587,
"grad_norm": 29.64716148376465,
"learning_rate": 3.892988929889299e-05,
"loss": 1.0935,
"step": 633
},
{
"epoch": 0.46789667896678966,
"grad_norm": 29.36592674255371,
"learning_rate": 3.899138991389914e-05,
"loss": 1.0937,
"step": 634
},
{
"epoch": 0.46863468634686345,
"grad_norm": 28.95639991760254,
"learning_rate": 3.905289052890529e-05,
"loss": 1.0853,
"step": 635
},
{
"epoch": 0.4693726937269373,
"grad_norm": 29.89202308654785,
"learning_rate": 3.911439114391144e-05,
"loss": 1.0811,
"step": 636
},
{
"epoch": 0.4701107011070111,
"grad_norm": 29.48238754272461,
"learning_rate": 3.9175891758917595e-05,
"loss": 1.1506,
"step": 637
},
{
"epoch": 0.4708487084870849,
"grad_norm": 28.27334213256836,
"learning_rate": 3.923739237392374e-05,
"loss": 1.1197,
"step": 638
},
{
"epoch": 0.47158671586715867,
"grad_norm": 28.055349349975586,
"learning_rate": 3.9298892988929894e-05,
"loss": 1.0456,
"step": 639
},
{
"epoch": 0.47232472324723246,
"grad_norm": 27.234216690063477,
"learning_rate": 3.936039360393604e-05,
"loss": 0.9234,
"step": 640
},
{
"epoch": 0.47306273062730625,
"grad_norm": 28.06637191772461,
"learning_rate": 3.942189421894219e-05,
"loss": 1.0727,
"step": 641
},
{
"epoch": 0.4738007380073801,
"grad_norm": 32.17995834350586,
"learning_rate": 3.948339483394834e-05,
"loss": 0.9436,
"step": 642
},
{
"epoch": 0.4745387453874539,
"grad_norm": 34.09589767456055,
"learning_rate": 3.954489544895449e-05,
"loss": 1.1217,
"step": 643
},
{
"epoch": 0.4752767527675277,
"grad_norm": 28.410308837890625,
"learning_rate": 3.960639606396064e-05,
"loss": 1.2026,
"step": 644
},
{
"epoch": 0.47601476014760147,
"grad_norm": 30.437602996826172,
"learning_rate": 3.9667896678966793e-05,
"loss": 1.197,
"step": 645
},
{
"epoch": 0.47675276752767526,
"grad_norm": 25.85258674621582,
"learning_rate": 3.972939729397294e-05,
"loss": 0.9193,
"step": 646
},
{
"epoch": 0.4774907749077491,
"grad_norm": 30.591075897216797,
"learning_rate": 3.979089790897909e-05,
"loss": 1.0067,
"step": 647
},
{
"epoch": 0.4782287822878229,
"grad_norm": 35.56831741333008,
"learning_rate": 3.985239852398524e-05,
"loss": 1.0091,
"step": 648
},
{
"epoch": 0.4789667896678967,
"grad_norm": 28.925878524780273,
"learning_rate": 3.991389913899139e-05,
"loss": 1.0451,
"step": 649
},
{
"epoch": 0.4797047970479705,
"grad_norm": 26.45174789428711,
"learning_rate": 3.9975399753997546e-05,
"loss": 1.1113,
"step": 650
},
{
"epoch": 0.48044280442804427,
"grad_norm": 32.575260162353516,
"learning_rate": 4.003690036900369e-05,
"loss": 1.1129,
"step": 651
},
{
"epoch": 0.48118081180811806,
"grad_norm": 31.939918518066406,
"learning_rate": 4.0098400984009846e-05,
"loss": 1.0175,
"step": 652
},
{
"epoch": 0.4819188191881919,
"grad_norm": 72.9084701538086,
"learning_rate": 4.015990159901599e-05,
"loss": 0.966,
"step": 653
},
{
"epoch": 0.4826568265682657,
"grad_norm": 32.10757827758789,
"learning_rate": 4.022140221402214e-05,
"loss": 1.124,
"step": 654
},
{
"epoch": 0.4833948339483395,
"grad_norm": 35.528778076171875,
"learning_rate": 4.028290282902829e-05,
"loss": 1.133,
"step": 655
},
{
"epoch": 0.4841328413284133,
"grad_norm": 29.31783676147461,
"learning_rate": 4.034440344403444e-05,
"loss": 1.0159,
"step": 656
},
{
"epoch": 0.48487084870848707,
"grad_norm": 28.90894889831543,
"learning_rate": 4.040590405904059e-05,
"loss": 1.0186,
"step": 657
},
{
"epoch": 0.48560885608856086,
"grad_norm": 27.08976173400879,
"learning_rate": 4.0467404674046745e-05,
"loss": 0.8565,
"step": 658
},
{
"epoch": 0.4863468634686347,
"grad_norm": 32.08723831176758,
"learning_rate": 4.052890528905289e-05,
"loss": 0.9848,
"step": 659
},
{
"epoch": 0.4870848708487085,
"grad_norm": 31.9980525970459,
"learning_rate": 4.0590405904059045e-05,
"loss": 1.2406,
"step": 660
},
{
"epoch": 0.4878228782287823,
"grad_norm": 27.090219497680664,
"learning_rate": 4.065190651906519e-05,
"loss": 1.2235,
"step": 661
},
{
"epoch": 0.4885608856088561,
"grad_norm": 42.83357620239258,
"learning_rate": 4.071340713407134e-05,
"loss": 1.1278,
"step": 662
},
{
"epoch": 0.48929889298892987,
"grad_norm": 28.690671920776367,
"learning_rate": 4.077490774907749e-05,
"loss": 1.19,
"step": 663
},
{
"epoch": 0.4900369003690037,
"grad_norm": 32.07972717285156,
"learning_rate": 4.0836408364083644e-05,
"loss": 1.1053,
"step": 664
},
{
"epoch": 0.4907749077490775,
"grad_norm": 29.517995834350586,
"learning_rate": 4.08979089790898e-05,
"loss": 0.9924,
"step": 665
},
{
"epoch": 0.4915129151291513,
"grad_norm": 36.88546371459961,
"learning_rate": 4.0959409594095944e-05,
"loss": 1.097,
"step": 666
},
{
"epoch": 0.4922509225092251,
"grad_norm": 27.41716957092285,
"learning_rate": 4.102091020910209e-05,
"loss": 1.1743,
"step": 667
},
{
"epoch": 0.4929889298892989,
"grad_norm": 36.04215621948242,
"learning_rate": 4.108241082410824e-05,
"loss": 1.0724,
"step": 668
},
{
"epoch": 0.49372693726937267,
"grad_norm": 31.058218002319336,
"learning_rate": 4.114391143911439e-05,
"loss": 1.2224,
"step": 669
},
{
"epoch": 0.4944649446494465,
"grad_norm": 30.21110725402832,
"learning_rate": 4.120541205412054e-05,
"loss": 1.0559,
"step": 670
},
{
"epoch": 0.4952029520295203,
"grad_norm": 317.8634033203125,
"learning_rate": 4.1266912669126696e-05,
"loss": 1.1668,
"step": 671
},
{
"epoch": 0.4959409594095941,
"grad_norm": 30.09259605407715,
"learning_rate": 4.132841328413284e-05,
"loss": 1.107,
"step": 672
},
{
"epoch": 0.4966789667896679,
"grad_norm": 30.432334899902344,
"learning_rate": 4.1389913899138996e-05,
"loss": 1.0314,
"step": 673
},
{
"epoch": 0.4974169741697417,
"grad_norm": 29.147876739501953,
"learning_rate": 4.145141451414514e-05,
"loss": 1.1293,
"step": 674
},
{
"epoch": 0.4981549815498155,
"grad_norm": 28.299036026000977,
"learning_rate": 4.151291512915129e-05,
"loss": 0.9126,
"step": 675
},
{
"epoch": 0.4988929889298893,
"grad_norm": 27.47956085205078,
"learning_rate": 4.157441574415744e-05,
"loss": 1.1542,
"step": 676
},
{
"epoch": 0.4996309963099631,
"grad_norm": 31.645191192626953,
"learning_rate": 4.1635916359163595e-05,
"loss": 1.0069,
"step": 677
},
{
"epoch": 0.5003690036900369,
"grad_norm": 28.30335235595703,
"learning_rate": 4.169741697416974e-05,
"loss": 1.1189,
"step": 678
},
{
"epoch": 0.5011070110701107,
"grad_norm": 28.922136306762695,
"learning_rate": 4.1758917589175895e-05,
"loss": 1.0529,
"step": 679
},
{
"epoch": 0.5018450184501845,
"grad_norm": 29.070533752441406,
"learning_rate": 4.182041820418204e-05,
"loss": 1.0434,
"step": 680
},
{
"epoch": 0.5025830258302583,
"grad_norm": 34.41718292236328,
"learning_rate": 4.1881918819188195e-05,
"loss": 1.0136,
"step": 681
},
{
"epoch": 0.5033210332103321,
"grad_norm": 30.644197463989258,
"learning_rate": 4.194341943419434e-05,
"loss": 1.2139,
"step": 682
},
{
"epoch": 0.5040590405904058,
"grad_norm": 31.38071060180664,
"learning_rate": 4.2004920049200495e-05,
"loss": 1.0473,
"step": 683
},
{
"epoch": 0.5047970479704798,
"grad_norm": 28.35428237915039,
"learning_rate": 4.206642066420665e-05,
"loss": 1.1185,
"step": 684
},
{
"epoch": 0.5055350553505535,
"grad_norm": 30.84862518310547,
"learning_rate": 4.2127921279212794e-05,
"loss": 1.0605,
"step": 685
},
{
"epoch": 0.5062730627306273,
"grad_norm": 28.12001609802246,
"learning_rate": 4.218942189421895e-05,
"loss": 1.0421,
"step": 686
},
{
"epoch": 0.5070110701107011,
"grad_norm": 94.46589660644531,
"learning_rate": 4.2250922509225094e-05,
"loss": 1.1904,
"step": 687
},
{
"epoch": 0.5077490774907749,
"grad_norm": 28.075532913208008,
"learning_rate": 4.231242312423124e-05,
"loss": 0.9612,
"step": 688
},
{
"epoch": 0.5084870848708487,
"grad_norm": 33.0609245300293,
"learning_rate": 4.2373923739237394e-05,
"loss": 1.1283,
"step": 689
},
{
"epoch": 0.5092250922509225,
"grad_norm": 31.729276657104492,
"learning_rate": 4.243542435424355e-05,
"loss": 1.1253,
"step": 690
},
{
"epoch": 0.5099630996309963,
"grad_norm": 29.362197875976562,
"learning_rate": 4.249692496924969e-05,
"loss": 1.0577,
"step": 691
},
{
"epoch": 0.5107011070110701,
"grad_norm": 27.433551788330078,
"learning_rate": 4.2558425584255847e-05,
"loss": 0.9175,
"step": 692
},
{
"epoch": 0.5114391143911439,
"grad_norm": 28.477914810180664,
"learning_rate": 4.261992619926199e-05,
"loss": 1.1097,
"step": 693
},
{
"epoch": 0.5121771217712177,
"grad_norm": 26.180309295654297,
"learning_rate": 4.2681426814268146e-05,
"loss": 0.9585,
"step": 694
},
{
"epoch": 0.5129151291512916,
"grad_norm": 28.950037002563477,
"learning_rate": 4.274292742927429e-05,
"loss": 0.9945,
"step": 695
},
{
"epoch": 0.5136531365313654,
"grad_norm": 33.97092819213867,
"learning_rate": 4.280442804428044e-05,
"loss": 1.1573,
"step": 696
},
{
"epoch": 0.5143911439114391,
"grad_norm": 31.86573600769043,
"learning_rate": 4.28659286592866e-05,
"loss": 1.1061,
"step": 697
},
{
"epoch": 0.5151291512915129,
"grad_norm": 35.693443298339844,
"learning_rate": 4.2927429274292746e-05,
"loss": 1.1284,
"step": 698
},
{
"epoch": 0.5158671586715867,
"grad_norm": 29.409988403320312,
"learning_rate": 4.29889298892989e-05,
"loss": 1.2092,
"step": 699
},
{
"epoch": 0.5166051660516605,
"grad_norm": 28.83966636657715,
"learning_rate": 4.3050430504305045e-05,
"loss": 0.9437,
"step": 700
},
{
"epoch": 0.5166051660516605,
"eval_loss": 1.3298412561416626,
"eval_runtime": 307.823,
"eval_samples_per_second": 3.733,
"eval_steps_per_second": 0.312,
"step": 700
},
{
"epoch": 0.5173431734317343,
"grad_norm": 26.844846725463867,
"learning_rate": 4.311193111931119e-05,
"loss": 1.112,
"step": 701
},
{
"epoch": 0.5180811808118081,
"grad_norm": 28.658428192138672,
"learning_rate": 4.3173431734317345e-05,
"loss": 0.9918,
"step": 702
},
{
"epoch": 0.5188191881918819,
"grad_norm": 32.5452995300293,
"learning_rate": 4.323493234932349e-05,
"loss": 1.0447,
"step": 703
},
{
"epoch": 0.5195571955719557,
"grad_norm": 26.970304489135742,
"learning_rate": 4.3296432964329645e-05,
"loss": 1.1313,
"step": 704
},
{
"epoch": 0.5202952029520295,
"grad_norm": 28.920679092407227,
"learning_rate": 4.33579335793358e-05,
"loss": 1.0252,
"step": 705
},
{
"epoch": 0.5210332103321034,
"grad_norm": 40.62504959106445,
"learning_rate": 4.3419434194341945e-05,
"loss": 1.1523,
"step": 706
},
{
"epoch": 0.5217712177121772,
"grad_norm": 30.851390838623047,
"learning_rate": 4.34809348093481e-05,
"loss": 0.9797,
"step": 707
},
{
"epoch": 0.522509225092251,
"grad_norm": 27.900365829467773,
"learning_rate": 4.3542435424354244e-05,
"loss": 1.0221,
"step": 708
},
{
"epoch": 0.5232472324723247,
"grad_norm": 26.0831356048584,
"learning_rate": 4.360393603936039e-05,
"loss": 1.0887,
"step": 709
},
{
"epoch": 0.5239852398523985,
"grad_norm": 29.75108528137207,
"learning_rate": 4.366543665436655e-05,
"loss": 0.9471,
"step": 710
},
{
"epoch": 0.5247232472324723,
"grad_norm": 31.546483993530273,
"learning_rate": 4.37269372693727e-05,
"loss": 1.1382,
"step": 711
},
{
"epoch": 0.5254612546125461,
"grad_norm": 27.857818603515625,
"learning_rate": 4.3788437884378844e-05,
"loss": 1.1366,
"step": 712
},
{
"epoch": 0.5261992619926199,
"grad_norm": 26.583192825317383,
"learning_rate": 4.3849938499385e-05,
"loss": 1.0916,
"step": 713
},
{
"epoch": 0.5269372693726937,
"grad_norm": 30.150146484375,
"learning_rate": 4.391143911439114e-05,
"loss": 1.0575,
"step": 714
},
{
"epoch": 0.5276752767527675,
"grad_norm": 27.24560546875,
"learning_rate": 4.3972939729397297e-05,
"loss": 1.0683,
"step": 715
},
{
"epoch": 0.5284132841328413,
"grad_norm": 29.45226287841797,
"learning_rate": 4.403444034440344e-05,
"loss": 1.1279,
"step": 716
},
{
"epoch": 0.5291512915129152,
"grad_norm": 28.790172576904297,
"learning_rate": 4.4095940959409596e-05,
"loss": 1.1934,
"step": 717
},
{
"epoch": 0.529889298892989,
"grad_norm": 42.536705017089844,
"learning_rate": 4.415744157441575e-05,
"loss": 1.061,
"step": 718
},
{
"epoch": 0.5306273062730628,
"grad_norm": 28.66362953186035,
"learning_rate": 4.4218942189421896e-05,
"loss": 1.0786,
"step": 719
},
{
"epoch": 0.5313653136531366,
"grad_norm": 25.908044815063477,
"learning_rate": 4.428044280442805e-05,
"loss": 1.0297,
"step": 720
},
{
"epoch": 0.5321033210332103,
"grad_norm": 28.063125610351562,
"learning_rate": 4.4341943419434196e-05,
"loss": 0.9617,
"step": 721
},
{
"epoch": 0.5328413284132841,
"grad_norm": 27.69817352294922,
"learning_rate": 4.440344403444034e-05,
"loss": 1.1288,
"step": 722
},
{
"epoch": 0.5335793357933579,
"grad_norm": 30.366674423217773,
"learning_rate": 4.4464944649446495e-05,
"loss": 1.1191,
"step": 723
},
{
"epoch": 0.5343173431734317,
"grad_norm": 30.783306121826172,
"learning_rate": 4.452644526445265e-05,
"loss": 1.1135,
"step": 724
},
{
"epoch": 0.5350553505535055,
"grad_norm": 28.302270889282227,
"learning_rate": 4.4587945879458795e-05,
"loss": 1.1435,
"step": 725
},
{
"epoch": 0.5357933579335793,
"grad_norm": 28.51706314086914,
"learning_rate": 4.464944649446495e-05,
"loss": 1.1788,
"step": 726
},
{
"epoch": 0.5365313653136531,
"grad_norm": 31.32042121887207,
"learning_rate": 4.4710947109471095e-05,
"loss": 1.1667,
"step": 727
},
{
"epoch": 0.537269372693727,
"grad_norm": 28.812145233154297,
"learning_rate": 4.477244772447725e-05,
"loss": 1.0405,
"step": 728
},
{
"epoch": 0.5380073800738008,
"grad_norm": 26.23000717163086,
"learning_rate": 4.4833948339483395e-05,
"loss": 1.0401,
"step": 729
},
{
"epoch": 0.5387453874538746,
"grad_norm": 81.6714859008789,
"learning_rate": 4.489544895448955e-05,
"loss": 1.1202,
"step": 730
},
{
"epoch": 0.5394833948339484,
"grad_norm": 27.881044387817383,
"learning_rate": 4.49569495694957e-05,
"loss": 1.0516,
"step": 731
},
{
"epoch": 0.5402214022140222,
"grad_norm": 29.472396850585938,
"learning_rate": 4.501845018450185e-05,
"loss": 1.1779,
"step": 732
},
{
"epoch": 0.5409594095940959,
"grad_norm": 28.200910568237305,
"learning_rate": 4.5079950799507994e-05,
"loss": 1.018,
"step": 733
},
{
"epoch": 0.5416974169741697,
"grad_norm": 28.53663444519043,
"learning_rate": 4.514145141451415e-05,
"loss": 1.1831,
"step": 734
},
{
"epoch": 0.5424354243542435,
"grad_norm": 36.12836837768555,
"learning_rate": 4.5202952029520294e-05,
"loss": 1.1069,
"step": 735
},
{
"epoch": 0.5431734317343173,
"grad_norm": 29.165285110473633,
"learning_rate": 4.526445264452645e-05,
"loss": 0.9832,
"step": 736
},
{
"epoch": 0.5439114391143911,
"grad_norm": 27.385562896728516,
"learning_rate": 4.53259532595326e-05,
"loss": 1.0543,
"step": 737
},
{
"epoch": 0.5446494464944649,
"grad_norm": 32.897945404052734,
"learning_rate": 4.5387453874538747e-05,
"loss": 1.1396,
"step": 738
},
{
"epoch": 0.5453874538745388,
"grad_norm": 29.424503326416016,
"learning_rate": 4.54489544895449e-05,
"loss": 1.0489,
"step": 739
},
{
"epoch": 0.5461254612546126,
"grad_norm": 31.19598960876465,
"learning_rate": 4.5510455104551046e-05,
"loss": 1.2388,
"step": 740
},
{
"epoch": 0.5468634686346864,
"grad_norm": 28.53763198852539,
"learning_rate": 4.55719557195572e-05,
"loss": 1.0937,
"step": 741
},
{
"epoch": 0.5476014760147602,
"grad_norm": 29.64959716796875,
"learning_rate": 4.5633456334563346e-05,
"loss": 1.0849,
"step": 742
},
{
"epoch": 0.548339483394834,
"grad_norm": 27.357303619384766,
"learning_rate": 4.569495694956949e-05,
"loss": 1.1383,
"step": 743
},
{
"epoch": 0.5490774907749078,
"grad_norm": 27.413957595825195,
"learning_rate": 4.575645756457565e-05,
"loss": 1.079,
"step": 744
},
{
"epoch": 0.5498154981549815,
"grad_norm": 29.784135818481445,
"learning_rate": 4.58179581795818e-05,
"loss": 0.9767,
"step": 745
},
{
"epoch": 0.5505535055350553,
"grad_norm": 55.847591400146484,
"learning_rate": 4.5879458794587945e-05,
"loss": 1.0553,
"step": 746
},
{
"epoch": 0.5512915129151291,
"grad_norm": 320.68597412109375,
"learning_rate": 4.59409594095941e-05,
"loss": 1.1933,
"step": 747
},
{
"epoch": 0.5520295202952029,
"grad_norm": 26.938758850097656,
"learning_rate": 4.6002460024600245e-05,
"loss": 1.0217,
"step": 748
},
{
"epoch": 0.5527675276752767,
"grad_norm": 32.755672454833984,
"learning_rate": 4.60639606396064e-05,
"loss": 1.099,
"step": 749
},
{
"epoch": 0.5535055350553506,
"grad_norm": 30.825178146362305,
"learning_rate": 4.612546125461255e-05,
"loss": 1.1831,
"step": 750
},
{
"epoch": 0.5542435424354244,
"grad_norm": 26.865983963012695,
"learning_rate": 4.61869618696187e-05,
"loss": 1.172,
"step": 751
},
{
"epoch": 0.5549815498154982,
"grad_norm": 27.207359313964844,
"learning_rate": 4.624846248462485e-05,
"loss": 1.1431,
"step": 752
},
{
"epoch": 0.555719557195572,
"grad_norm": 31.474943161010742,
"learning_rate": 4.6309963099631e-05,
"loss": 1.0282,
"step": 753
},
{
"epoch": 0.5564575645756458,
"grad_norm": 31.235960006713867,
"learning_rate": 4.637146371463715e-05,
"loss": 1.0688,
"step": 754
},
{
"epoch": 0.5571955719557196,
"grad_norm": 26.043094635009766,
"learning_rate": 4.64329643296433e-05,
"loss": 1.0858,
"step": 755
},
{
"epoch": 0.5579335793357934,
"grad_norm": 28.13475227355957,
"learning_rate": 4.6494464944649444e-05,
"loss": 0.9878,
"step": 756
},
{
"epoch": 0.5586715867158671,
"grad_norm": 28.513853073120117,
"learning_rate": 4.6555965559655604e-05,
"loss": 0.9452,
"step": 757
},
{
"epoch": 0.5594095940959409,
"grad_norm": 28.906461715698242,
"learning_rate": 4.661746617466175e-05,
"loss": 0.9423,
"step": 758
},
{
"epoch": 0.5601476014760147,
"grad_norm": 33.28678894042969,
"learning_rate": 4.66789667896679e-05,
"loss": 1.1832,
"step": 759
},
{
"epoch": 0.5608856088560885,
"grad_norm": 29.69910430908203,
"learning_rate": 4.674046740467405e-05,
"loss": 1.0487,
"step": 760
},
{
"epoch": 0.5616236162361624,
"grad_norm": 81.67484283447266,
"learning_rate": 4.6801968019680197e-05,
"loss": 1.0789,
"step": 761
},
{
"epoch": 0.5623616236162362,
"grad_norm": 32.282474517822266,
"learning_rate": 4.686346863468635e-05,
"loss": 1.0681,
"step": 762
},
{
"epoch": 0.56309963099631,
"grad_norm": 28.49372673034668,
"learning_rate": 4.6924969249692496e-05,
"loss": 1.2814,
"step": 763
},
{
"epoch": 0.5638376383763838,
"grad_norm": 33.509033203125,
"learning_rate": 4.698646986469865e-05,
"loss": 1.056,
"step": 764
},
{
"epoch": 0.5645756457564576,
"grad_norm": 31.451663970947266,
"learning_rate": 4.70479704797048e-05,
"loss": 1.1701,
"step": 765
},
{
"epoch": 0.5653136531365314,
"grad_norm": 28.21207618713379,
"learning_rate": 4.710947109471095e-05,
"loss": 1.1759,
"step": 766
},
{
"epoch": 0.5660516605166052,
"grad_norm": 25.11651611328125,
"learning_rate": 4.7170971709717096e-05,
"loss": 1.0304,
"step": 767
},
{
"epoch": 0.566789667896679,
"grad_norm": 26.841819763183594,
"learning_rate": 4.723247232472325e-05,
"loss": 1.035,
"step": 768
},
{
"epoch": 0.5675276752767527,
"grad_norm": 26.381568908691406,
"learning_rate": 4.7293972939729395e-05,
"loss": 1.1387,
"step": 769
},
{
"epoch": 0.5682656826568265,
"grad_norm": 29.644023895263672,
"learning_rate": 4.7355473554735555e-05,
"loss": 1.0994,
"step": 770
},
{
"epoch": 0.5690036900369003,
"grad_norm": 31.37369728088379,
"learning_rate": 4.74169741697417e-05,
"loss": 1.124,
"step": 771
},
{
"epoch": 0.5697416974169742,
"grad_norm": 29.403026580810547,
"learning_rate": 4.747847478474785e-05,
"loss": 1.0507,
"step": 772
},
{
"epoch": 0.570479704797048,
"grad_norm": 28.384349822998047,
"learning_rate": 4.7539975399754e-05,
"loss": 1.0764,
"step": 773
},
{
"epoch": 0.5712177121771218,
"grad_norm": 67.28231811523438,
"learning_rate": 4.760147601476015e-05,
"loss": 1.0503,
"step": 774
},
{
"epoch": 0.5719557195571956,
"grad_norm": 29.146886825561523,
"learning_rate": 4.76629766297663e-05,
"loss": 1.0136,
"step": 775
},
{
"epoch": 0.5726937269372694,
"grad_norm": 60.2903938293457,
"learning_rate": 4.772447724477245e-05,
"loss": 1.0508,
"step": 776
},
{
"epoch": 0.5734317343173432,
"grad_norm": 28.743024826049805,
"learning_rate": 4.77859778597786e-05,
"loss": 1.0318,
"step": 777
},
{
"epoch": 0.574169741697417,
"grad_norm": 30.6608943939209,
"learning_rate": 4.7847478474784754e-05,
"loss": 1.09,
"step": 778
},
{
"epoch": 0.5749077490774908,
"grad_norm": 56.827152252197266,
"learning_rate": 4.79089790897909e-05,
"loss": 0.9897,
"step": 779
},
{
"epoch": 0.5756457564575646,
"grad_norm": 32.71049499511719,
"learning_rate": 4.797047970479705e-05,
"loss": 1.1841,
"step": 780
},
{
"epoch": 0.5763837638376383,
"grad_norm": 29.06208038330078,
"learning_rate": 4.80319803198032e-05,
"loss": 1.2583,
"step": 781
},
{
"epoch": 0.5771217712177121,
"grad_norm": 26.83561897277832,
"learning_rate": 4.809348093480935e-05,
"loss": 1.07,
"step": 782
},
{
"epoch": 0.5778597785977859,
"grad_norm": 28.882770538330078,
"learning_rate": 4.81549815498155e-05,
"loss": 1.1521,
"step": 783
},
{
"epoch": 0.5785977859778598,
"grad_norm": 90.4433822631836,
"learning_rate": 4.821648216482165e-05,
"loss": 1.0745,
"step": 784
},
{
"epoch": 0.5793357933579336,
"grad_norm": 30.003938674926758,
"learning_rate": 4.82779827798278e-05,
"loss": 1.1911,
"step": 785
},
{
"epoch": 0.5800738007380074,
"grad_norm": 63.82630920410156,
"learning_rate": 4.833948339483395e-05,
"loss": 1.1678,
"step": 786
},
{
"epoch": 0.5808118081180812,
"grad_norm": 238.10055541992188,
"learning_rate": 4.84009840098401e-05,
"loss": 1.3465,
"step": 787
},
{
"epoch": 0.581549815498155,
"grad_norm": 311.88134765625,
"learning_rate": 4.846248462484625e-05,
"loss": 2.3681,
"step": 788
},
{
"epoch": 0.5822878228782288,
"grad_norm": 79.10831451416016,
"learning_rate": 4.85239852398524e-05,
"loss": 1.3595,
"step": 789
},
{
"epoch": 0.5830258302583026,
"grad_norm": 195.71676635742188,
"learning_rate": 4.858548585485855e-05,
"loss": 1.8711,
"step": 790
},
{
"epoch": 0.5837638376383764,
"grad_norm": 223.3916015625,
"learning_rate": 4.8646986469864706e-05,
"loss": 2.4052,
"step": 791
},
{
"epoch": 0.5845018450184502,
"grad_norm": 33.84809875488281,
"learning_rate": 4.870848708487085e-05,
"loss": 1.04,
"step": 792
},
{
"epoch": 0.5852398523985239,
"grad_norm": 134.23912048339844,
"learning_rate": 4.8769987699877e-05,
"loss": 2.6438,
"step": 793
},
{
"epoch": 0.5859778597785977,
"grad_norm": 44.83888244628906,
"learning_rate": 4.883148831488315e-05,
"loss": 1.4395,
"step": 794
},
{
"epoch": 0.5867158671586716,
"grad_norm": 299.56988525390625,
"learning_rate": 4.88929889298893e-05,
"loss": 1.6989,
"step": 795
},
{
"epoch": 0.5874538745387454,
"grad_norm": 284.4837341308594,
"learning_rate": 4.895448954489545e-05,
"loss": 2.2906,
"step": 796
},
{
"epoch": 0.5881918819188192,
"grad_norm": 53.7056884765625,
"learning_rate": 4.9015990159901605e-05,
"loss": 1.3129,
"step": 797
},
{
"epoch": 0.588929889298893,
"grad_norm": 117.8404769897461,
"learning_rate": 4.907749077490775e-05,
"loss": 1.916,
"step": 798
},
{
"epoch": 0.5896678966789668,
"grad_norm": 51.02519607543945,
"learning_rate": 4.9138991389913904e-05,
"loss": 2.0983,
"step": 799
},
{
"epoch": 0.5904059040590406,
"grad_norm": 376.12225341796875,
"learning_rate": 4.920049200492005e-05,
"loss": 1.5877,
"step": 800
},
{
"epoch": 0.5904059040590406,
"eval_loss": 1.3728182315826416,
"eval_runtime": 305.6963,
"eval_samples_per_second": 3.759,
"eval_steps_per_second": 0.314,
"step": 800
}
],
"logging_steps": 1,
"max_steps": 4065,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.5604991497978511e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}