{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.17380271653645946,
  "eval_steps": 500,
  "global_step": 20000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0017380271653645947,
      "grad_norm": 1.1632381677627563,
      "learning_rate": 4.9978491913828615e-05,
      "loss": 3.6439,
      "step": 200
    },
    {
      "epoch": 0.0034760543307291894,
      "grad_norm": 0.6136592626571655,
      "learning_rate": 4.995676657426156e-05,
      "loss": 2.3619,
      "step": 400
    },
    {
      "epoch": 0.005214081496093784,
      "grad_norm": 0.6270021796226501,
      "learning_rate": 4.99350412346945e-05,
      "loss": 2.0395,
      "step": 600
    },
    {
      "epoch": 0.006952108661458379,
      "grad_norm": 0.9146378636360168,
      "learning_rate": 4.991331589512744e-05,
      "loss": 1.8894,
      "step": 800
    },
    {
      "epoch": 0.008690135826822973,
      "grad_norm": 0.7162560224533081,
      "learning_rate": 4.989159055556039e-05,
      "loss": 1.8242,
      "step": 1000
    },
    {
      "epoch": 0.010428162992187568,
      "grad_norm": 0.31322506070137024,
      "learning_rate": 4.9869865215993326e-05,
      "loss": 1.8681,
      "step": 1200
    },
    {
      "epoch": 0.012166190157552163,
      "grad_norm": 0.5570130348205566,
      "learning_rate": 4.984813987642627e-05,
      "loss": 1.8099,
      "step": 1400
    },
    {
      "epoch": 0.013904217322916758,
      "grad_norm": 0.6080171465873718,
      "learning_rate": 4.982641453685921e-05,
      "loss": 1.7641,
      "step": 1600
    },
    {
      "epoch": 0.015642244488281352,
      "grad_norm": 0.553460955619812,
      "learning_rate": 4.980468919729215e-05,
      "loss": 1.7712,
      "step": 1800
    },
    {
      "epoch": 0.017380271653645946,
      "grad_norm": 0.625199019908905,
      "learning_rate": 4.97829638577251e-05,
      "loss": 1.7565,
      "step": 2000
    },
    {
      "epoch": 0.019118298819010542,
      "grad_norm": 0.579010546207428,
      "learning_rate": 4.9761238518158044e-05,
      "loss": 1.7322,
      "step": 2200
    },
    {
      "epoch": 0.020856325984375135,
      "grad_norm": 0.7429983615875244,
      "learning_rate": 4.9739513178590984e-05,
      "loss": 1.7407,
      "step": 2400
    },
    {
      "epoch": 0.022594353149739732,
      "grad_norm": 0.5801926255226135,
      "learning_rate": 4.971778783902393e-05,
      "loss": 1.6487,
      "step": 2600
    },
    {
      "epoch": 0.024332380315104325,
      "grad_norm": 0.7074835300445557,
      "learning_rate": 4.969606249945687e-05,
      "loss": 1.6959,
      "step": 2800
    },
    {
      "epoch": 0.02607040748046892,
      "grad_norm": 0.6824275255203247,
      "learning_rate": 4.967433715988981e-05,
      "loss": 1.6958,
      "step": 3000
    },
    {
      "epoch": 0.027808434645833515,
      "grad_norm": 0.43216443061828613,
      "learning_rate": 4.9652611820322756e-05,
      "loss": 1.6824,
      "step": 3200
    },
    {
      "epoch": 0.029546461811198108,
      "grad_norm": 0.7867545485496521,
      "learning_rate": 4.9630886480755695e-05,
      "loss": 1.6671,
      "step": 3400
    },
    {
      "epoch": 0.031284488976562705,
      "grad_norm": 0.77516108751297,
      "learning_rate": 4.9609161141188635e-05,
      "loss": 1.6389,
      "step": 3600
    },
    {
      "epoch": 0.0330225161419273,
      "grad_norm": 0.5014050602912903,
      "learning_rate": 4.958743580162158e-05,
      "loss": 1.629,
      "step": 3800
    },
    {
      "epoch": 0.03476054330729189,
      "grad_norm": 0.6006432771682739,
      "learning_rate": 4.956571046205453e-05,
      "loss": 1.5977,
      "step": 4000
    },
    {
      "epoch": 0.036498570472656484,
      "grad_norm": 0.6153438091278076,
      "learning_rate": 4.954398512248747e-05,
      "loss": 1.5879,
      "step": 4200
    },
    {
      "epoch": 0.038236597638021085,
      "grad_norm": 0.8877372145652771,
      "learning_rate": 4.952225978292041e-05,
      "loss": 1.599,
      "step": 4400
    },
    {
      "epoch": 0.03997462480338568,
      "grad_norm": 0.7173994183540344,
      "learning_rate": 4.950053444335335e-05,
      "loss": 1.6205,
      "step": 4600
    },
    {
      "epoch": 0.04171265196875027,
      "grad_norm": 0.8379663228988647,
      "learning_rate": 4.947880910378629e-05,
      "loss": 1.5794,
      "step": 4800
    },
    {
      "epoch": 0.043450679134114864,
      "grad_norm": 0.6160171031951904,
      "learning_rate": 4.945708376421924e-05,
      "loss": 1.5656,
      "step": 5000
    },
    {
      "epoch": 0.045188706299479464,
      "grad_norm": 0.8642494082450867,
      "learning_rate": 4.943535842465218e-05,
      "loss": 1.5665,
      "step": 5200
    },
    {
      "epoch": 0.04692673346484406,
      "grad_norm": 0.6872414350509644,
      "learning_rate": 4.941363308508512e-05,
      "loss": 1.5552,
      "step": 5400
    },
    {
      "epoch": 0.04866476063020865,
      "grad_norm": 0.9998211860656738,
      "learning_rate": 4.9391907745518064e-05,
      "loss": 1.5458,
      "step": 5600
    },
    {
      "epoch": 0.050402787795573244,
      "grad_norm": 1.2175588607788086,
      "learning_rate": 4.937018240595101e-05,
      "loss": 1.5295,
      "step": 5800
    },
    {
      "epoch": 0.05214081496093784,
      "grad_norm": 1.0134257078170776,
      "learning_rate": 4.934845706638395e-05,
      "loss": 1.516,
      "step": 6000
    },
    {
      "epoch": 0.05387884212630244,
      "grad_norm": 0.8104642033576965,
      "learning_rate": 4.9326731726816896e-05,
      "loss": 1.5285,
      "step": 6200
    },
    {
      "epoch": 0.05561686929166703,
      "grad_norm": 0.9005429148674011,
      "learning_rate": 4.9305006387249836e-05,
      "loss": 1.5069,
      "step": 6400
    },
    {
      "epoch": 0.05735489645703162,
      "grad_norm": 0.8855582475662231,
      "learning_rate": 4.9283281047682775e-05,
      "loss": 1.5046,
      "step": 6600
    },
    {
      "epoch": 0.059092923622396216,
      "grad_norm": 0.7807704210281372,
      "learning_rate": 4.926155570811572e-05,
      "loss": 1.4663,
      "step": 6800
    },
    {
      "epoch": 0.06083095078776081,
      "grad_norm": 1.2552438974380493,
      "learning_rate": 4.923983036854866e-05,
      "loss": 1.486,
      "step": 7000
    },
    {
      "epoch": 0.06256897795312541,
      "grad_norm": 1.0079654455184937,
      "learning_rate": 4.92181050289816e-05,
      "loss": 1.4569,
      "step": 7200
    },
    {
      "epoch": 0.06430700511849,
      "grad_norm": 1.0267302989959717,
      "learning_rate": 4.919637968941455e-05,
      "loss": 1.4746,
      "step": 7400
    },
    {
      "epoch": 0.0660450322838546,
      "grad_norm": 1.1427829265594482,
      "learning_rate": 4.9174654349847494e-05,
      "loss": 1.4867,
      "step": 7600
    },
    {
      "epoch": 0.0677830594492192,
      "grad_norm": 0.9080005884170532,
      "learning_rate": 4.915292901028043e-05,
      "loss": 1.4789,
      "step": 7800
    },
    {
      "epoch": 0.06952108661458378,
      "grad_norm": 0.78159499168396,
      "learning_rate": 4.913120367071338e-05,
      "loss": 1.4435,
      "step": 8000
    },
    {
      "epoch": 0.07125911377994838,
      "grad_norm": 0.9199485778808594,
      "learning_rate": 4.910947833114632e-05,
      "loss": 1.4698,
      "step": 8200
    },
    {
      "epoch": 0.07299714094531297,
      "grad_norm": 1.1556053161621094,
      "learning_rate": 4.908775299157926e-05,
      "loss": 1.4233,
      "step": 8400
    },
    {
      "epoch": 0.07473516811067757,
      "grad_norm": 0.6093395948410034,
      "learning_rate": 4.9066027652012205e-05,
      "loss": 1.4607,
      "step": 8600
    },
    {
      "epoch": 0.07647319527604217,
      "grad_norm": 0.7765551209449768,
      "learning_rate": 4.9044302312445144e-05,
      "loss": 1.4067,
      "step": 8800
    },
    {
      "epoch": 0.07821122244140676,
      "grad_norm": 0.9261316061019897,
      "learning_rate": 4.9022576972878084e-05,
      "loss": 1.4437,
      "step": 9000
    },
    {
      "epoch": 0.07994924960677136,
      "grad_norm": 0.737016499042511,
      "learning_rate": 4.900085163331103e-05,
      "loss": 1.4394,
      "step": 9200
    },
    {
      "epoch": 0.08168727677213594,
      "grad_norm": 1.0518062114715576,
      "learning_rate": 4.897912629374397e-05,
      "loss": 1.442,
      "step": 9400
    },
    {
      "epoch": 0.08342530393750054,
      "grad_norm": 0.9163209795951843,
      "learning_rate": 4.8957400954176916e-05,
      "loss": 1.4126,
      "step": 9600
    },
    {
      "epoch": 0.08516333110286514,
      "grad_norm": 1.1651362180709839,
      "learning_rate": 4.893567561460986e-05,
      "loss": 1.4397,
      "step": 9800
    },
    {
      "epoch": 0.08690135826822973,
      "grad_norm": 1.2389508485794067,
      "learning_rate": 4.89139502750428e-05,
      "loss": 1.4226,
      "step": 10000
    },
    {
      "epoch": 0.08863938543359433,
      "grad_norm": 1.009730339050293,
      "learning_rate": 4.889222493547574e-05,
      "loss": 1.4643,
      "step": 10200
    },
    {
      "epoch": 0.09037741259895893,
      "grad_norm": 1.3371009826660156,
      "learning_rate": 4.887049959590869e-05,
      "loss": 1.4221,
      "step": 10400
    },
    {
      "epoch": 0.09211543976432351,
      "grad_norm": 1.0338963270187378,
      "learning_rate": 4.884877425634163e-05,
      "loss": 1.4122,
      "step": 10600
    },
    {
      "epoch": 0.09385346692968811,
      "grad_norm": 1.0023767948150635,
      "learning_rate": 4.8827048916774574e-05,
      "loss": 1.4034,
      "step": 10800
    },
    {
      "epoch": 0.0955914940950527,
      "grad_norm": 1.4514521360397339,
      "learning_rate": 4.880532357720751e-05,
      "loss": 1.4356,
      "step": 11000
    },
    {
      "epoch": 0.0973295212604173,
      "grad_norm": 1.0462247133255005,
      "learning_rate": 4.878359823764045e-05,
      "loss": 1.4038,
      "step": 11200
    },
    {
      "epoch": 0.0990675484257819,
      "grad_norm": 1.0881024599075317,
      "learning_rate": 4.87618728980734e-05,
      "loss": 1.3521,
      "step": 11400
    },
    {
      "epoch": 0.10080557559114649,
      "grad_norm": 1.1503826379776,
      "learning_rate": 4.8740147558506345e-05,
      "loss": 1.3455,
      "step": 11600
    },
    {
      "epoch": 0.10254360275651109,
      "grad_norm": 1.1788356304168701,
      "learning_rate": 4.8718422218939285e-05,
      "loss": 1.4246,
      "step": 11800
    },
    {
      "epoch": 0.10428162992187567,
      "grad_norm": 0.9009695649147034,
      "learning_rate": 4.8696696879372225e-05,
      "loss": 1.3701,
      "step": 12000
    },
    {
      "epoch": 0.10601965708724027,
      "grad_norm": 0.7886667251586914,
      "learning_rate": 4.867497153980517e-05,
      "loss": 1.3843,
      "step": 12200
    },
    {
      "epoch": 0.10775768425260487,
      "grad_norm": 1.0017770528793335,
      "learning_rate": 4.865335482693595e-05,
      "loss": 1.3785,
      "step": 12400
    },
    {
      "epoch": 0.10949571141796946,
      "grad_norm": 0.901871383190155,
      "learning_rate": 4.863162948736889e-05,
      "loss": 1.3627,
      "step": 12600
    },
    {
      "epoch": 0.11123373858333406,
      "grad_norm": 0.9240642189979553,
      "learning_rate": 4.860990414780183e-05,
      "loss": 1.3397,
      "step": 12800
    },
    {
      "epoch": 0.11297176574869865,
      "grad_norm": 1.2550582885742188,
      "learning_rate": 4.8588178808234776e-05,
      "loss": 1.368,
      "step": 13000
    },
    {
      "epoch": 0.11470979291406325,
      "grad_norm": 0.9313985705375671,
      "learning_rate": 4.8566453468667715e-05,
      "loss": 1.344,
      "step": 13200
    },
    {
      "epoch": 0.11644782007942785,
      "grad_norm": 0.8634843826293945,
      "learning_rate": 4.854472812910066e-05,
      "loss": 1.3308,
      "step": 13400
    },
    {
      "epoch": 0.11818584724479243,
      "grad_norm": 1.2060052156448364,
      "learning_rate": 4.85230027895336e-05,
      "loss": 1.355,
      "step": 13600
    },
    {
      "epoch": 0.11992387441015703,
      "grad_norm": 1.0419443845748901,
      "learning_rate": 4.850127744996655e-05,
      "loss": 1.3469,
      "step": 13800
    },
    {
      "epoch": 0.12166190157552162,
      "grad_norm": 1.2425956726074219,
      "learning_rate": 4.847955211039949e-05,
      "loss": 1.3368,
      "step": 14000
    },
    {
      "epoch": 0.12339992874088622,
      "grad_norm": 1.0397825241088867,
      "learning_rate": 4.8457826770832433e-05,
      "loss": 1.3211,
      "step": 14200
    },
    {
      "epoch": 0.12513795590625082,
      "grad_norm": 0.8406294584274292,
      "learning_rate": 4.843621005796321e-05,
      "loss": 1.3375,
      "step": 14400
    },
    {
      "epoch": 0.1268759830716154,
      "grad_norm": 0.816184401512146,
      "learning_rate": 4.841448471839615e-05,
      "loss": 1.3351,
      "step": 14600
    },
    {
      "epoch": 0.12861401023698,
      "grad_norm": 1.1904360055923462,
      "learning_rate": 4.839275937882909e-05,
      "loss": 1.3174,
      "step": 14800
    },
    {
      "epoch": 0.1303520374023446,
      "grad_norm": 1.2890825271606445,
      "learning_rate": 4.837103403926204e-05,
      "loss": 1.3294,
      "step": 15000
    },
    {
      "epoch": 0.1320900645677092,
      "grad_norm": 0.9586935639381409,
      "learning_rate": 4.834930869969498e-05,
      "loss": 1.2934,
      "step": 15200
    },
    {
      "epoch": 0.13382809173307378,
      "grad_norm": 0.9654845595359802,
      "learning_rate": 4.832758336012792e-05,
      "loss": 1.3386,
      "step": 15400
    },
    {
      "epoch": 0.1355661188984384,
      "grad_norm": 1.1789395809173584,
      "learning_rate": 4.8305858020560864e-05,
      "loss": 1.3499,
      "step": 15600
    },
    {
      "epoch": 0.13730414606380298,
      "grad_norm": 1.2728456258773804,
      "learning_rate": 4.82841326809938e-05,
      "loss": 1.3396,
      "step": 15800
    },
    {
      "epoch": 0.13904217322916756,
      "grad_norm": 1.0807838439941406,
      "learning_rate": 4.826240734142675e-05,
      "loss": 1.3369,
      "step": 16000
    },
    {
      "epoch": 0.14078020039453218,
      "grad_norm": 1.11849045753479,
      "learning_rate": 4.8240682001859696e-05,
      "loss": 1.3664,
      "step": 16200
    },
    {
      "epoch": 0.14251822755989677,
      "grad_norm": 1.5169202089309692,
      "learning_rate": 4.821906528899047e-05,
      "loss": 1.3352,
      "step": 16400
    },
    {
      "epoch": 0.14425625472526135,
      "grad_norm": 0.8817140460014343,
      "learning_rate": 4.819733994942341e-05,
      "loss": 1.2924,
      "step": 16600
    },
    {
      "epoch": 0.14599428189062594,
      "grad_norm": 1.1285990476608276,
      "learning_rate": 4.8175614609856355e-05,
      "loss": 1.3497,
      "step": 16800
    },
    {
      "epoch": 0.14773230905599055,
      "grad_norm": 1.1072745323181152,
      "learning_rate": 4.81538892702893e-05,
      "loss": 1.3129,
      "step": 17000
    },
    {
      "epoch": 0.14947033622135514,
      "grad_norm": 1.1911921501159668,
      "learning_rate": 4.813216393072224e-05,
      "loss": 1.312,
      "step": 17200
    },
    {
      "epoch": 0.15120836338671972,
      "grad_norm": 0.7891075611114502,
      "learning_rate": 4.811043859115518e-05,
      "loss": 1.281,
      "step": 17400
    },
    {
      "epoch": 0.15294639055208434,
      "grad_norm": 0.9016463756561279,
      "learning_rate": 4.8088713251588126e-05,
      "loss": 1.3118,
      "step": 17600
    },
    {
      "epoch": 0.15468441771744892,
      "grad_norm": 1.1260063648223877,
      "learning_rate": 4.8066987912021066e-05,
      "loss": 1.2743,
      "step": 17800
    },
    {
      "epoch": 0.1564224448828135,
      "grad_norm": 1.0370497703552246,
      "learning_rate": 4.8045262572454005e-05,
      "loss": 1.3013,
      "step": 18000
    },
    {
      "epoch": 0.15816047204817812,
      "grad_norm": 1.4182652235031128,
      "learning_rate": 4.802353723288695e-05,
      "loss": 1.2994,
      "step": 18200
    },
    {
      "epoch": 0.1598984992135427,
      "grad_norm": 1.1322426795959473,
      "learning_rate": 4.800192052001773e-05,
      "loss": 1.3339,
      "step": 18400
    },
    {
      "epoch": 0.1616365263789073,
      "grad_norm": 1.4774497747421265,
      "learning_rate": 4.798019518045067e-05,
      "loss": 1.3381,
      "step": 18600
    },
    {
      "epoch": 0.16337455354427188,
      "grad_norm": 1.3371450901031494,
      "learning_rate": 4.795846984088361e-05,
      "loss": 1.304,
      "step": 18800
    },
    {
      "epoch": 0.1651125807096365,
      "grad_norm": 0.8607128858566284,
      "learning_rate": 4.793674450131656e-05,
      "loss": 1.2686,
      "step": 19000
    },
    {
      "epoch": 0.16685060787500108,
      "grad_norm": 1.1792031526565552,
      "learning_rate": 4.79150191617495e-05,
      "loss": 1.3099,
      "step": 19200
    },
    {
      "epoch": 0.16858863504036567,
      "grad_norm": 1.274556040763855,
      "learning_rate": 4.789329382218244e-05,
      "loss": 1.2745,
      "step": 19400
    },
    {
      "epoch": 0.17032666220573028,
      "grad_norm": 0.7774292230606079,
      "learning_rate": 4.787156848261539e-05,
      "loss": 1.2905,
      "step": 19600
    },
    {
      "epoch": 0.17206468937109487,
      "grad_norm": 1.204541802406311,
      "learning_rate": 4.784984314304833e-05,
      "loss": 1.3014,
      "step": 19800
    },
    {
      "epoch": 0.17380271653645946,
      "grad_norm": 0.9959656000137329,
      "learning_rate": 4.782811780348127e-05,
      "loss": 1.2798,
      "step": 20000
    }
  ],
  "logging_steps": 200,
  "max_steps": 460292,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.178779779072e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}