{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.992412746585736,
  "eval_steps": 500,
  "global_step": 246,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.12139605462822459,
      "grad_norm": 1.9068916730195116,
      "learning_rate": 5e-06,
      "loss": 0.7721,
      "step": 10
    },
    {
      "epoch": 0.24279210925644917,
      "grad_norm": 5.44810384092203,
      "learning_rate": 5e-06,
      "loss": 0.6638,
      "step": 20
    },
    {
      "epoch": 0.36418816388467373,
      "grad_norm": 0.8162292861303865,
      "learning_rate": 5e-06,
      "loss": 0.6289,
      "step": 30
    },
    {
      "epoch": 0.48558421851289835,
      "grad_norm": 0.724722400146013,
      "learning_rate": 5e-06,
      "loss": 0.6062,
      "step": 40
    },
    {
      "epoch": 0.6069802731411229,
      "grad_norm": 0.6325755456980601,
      "learning_rate": 5e-06,
      "loss": 0.5886,
      "step": 50
    },
    {
      "epoch": 0.7283763277693475,
      "grad_norm": 0.5102096530669636,
      "learning_rate": 5e-06,
      "loss": 0.5763,
      "step": 60
    },
    {
      "epoch": 0.849772382397572,
      "grad_norm": 0.6134528530146113,
      "learning_rate": 5e-06,
      "loss": 0.5635,
      "step": 70
    },
    {
      "epoch": 0.9711684370257967,
      "grad_norm": 0.6520975040339092,
      "learning_rate": 5e-06,
      "loss": 0.5578,
      "step": 80
    },
    {
      "epoch": 0.9954476479514416,
      "eval_loss": 0.5475569367408752,
      "eval_runtime": 31.1849,
      "eval_samples_per_second": 71.156,
      "eval_steps_per_second": 1.122,
      "step": 82
    },
    {
      "epoch": 1.095599393019727,
      "grad_norm": 0.9030012716394636,
      "learning_rate": 5e-06,
      "loss": 0.5739,
      "step": 90
    },
    {
      "epoch": 1.2169954476479514,
      "grad_norm": 0.5546978323548724,
      "learning_rate": 5e-06,
      "loss": 0.5098,
      "step": 100
    },
    {
      "epoch": 1.338391502276176,
      "grad_norm": 0.7373972665017838,
      "learning_rate": 5e-06,
      "loss": 0.5099,
      "step": 110
    },
    {
      "epoch": 1.4597875569044005,
      "grad_norm": 0.8335652060900699,
      "learning_rate": 5e-06,
      "loss": 0.5086,
      "step": 120
    },
    {
      "epoch": 1.5811836115326252,
      "grad_norm": 0.9482742766383457,
      "learning_rate": 5e-06,
      "loss": 0.4988,
      "step": 130
    },
    {
      "epoch": 1.7025796661608497,
      "grad_norm": 0.7107692585969188,
      "learning_rate": 5e-06,
      "loss": 0.4973,
      "step": 140
    },
    {
      "epoch": 1.8239757207890743,
      "grad_norm": 0.6956293579831972,
      "learning_rate": 5e-06,
      "loss": 0.5006,
      "step": 150
    },
    {
      "epoch": 1.945371775417299,
      "grad_norm": 0.5537353905530825,
      "learning_rate": 5e-06,
      "loss": 0.4962,
      "step": 160
    },
    {
      "epoch": 1.9939301972685888,
      "eval_loss": 0.5237926244735718,
      "eval_runtime": 31.2236,
      "eval_samples_per_second": 71.068,
      "eval_steps_per_second": 1.121,
      "step": 164
    },
    {
      "epoch": 2.069802731411229,
      "grad_norm": 0.9676846382246841,
      "learning_rate": 5e-06,
      "loss": 0.5181,
      "step": 170
    },
    {
      "epoch": 2.191198786039454,
      "grad_norm": 1.0604432739536909,
      "learning_rate": 5e-06,
      "loss": 0.4483,
      "step": 180
    },
    {
      "epoch": 2.3125948406676784,
      "grad_norm": 0.9072874578025836,
      "learning_rate": 5e-06,
      "loss": 0.4481,
      "step": 190
    },
    {
      "epoch": 2.433990895295903,
      "grad_norm": 0.93397296136386,
      "learning_rate": 5e-06,
      "loss": 0.4503,
      "step": 200
    },
    {
      "epoch": 2.5553869499241273,
      "grad_norm": 0.5608137627707893,
      "learning_rate": 5e-06,
      "loss": 0.4475,
      "step": 210
    },
    {
      "epoch": 2.676783004552352,
      "grad_norm": 0.7216376866533744,
      "learning_rate": 5e-06,
      "loss": 0.4491,
      "step": 220
    },
    {
      "epoch": 2.7981790591805766,
      "grad_norm": 0.7502162152741092,
      "learning_rate": 5e-06,
      "loss": 0.4475,
      "step": 230
    },
    {
      "epoch": 2.919575113808801,
      "grad_norm": 0.6902724753233441,
      "learning_rate": 5e-06,
      "loss": 0.4505,
      "step": 240
    },
    {
      "epoch": 2.992412746585736,
      "eval_loss": 0.5186718702316284,
      "eval_runtime": 31.6067,
      "eval_samples_per_second": 70.207,
      "eval_steps_per_second": 1.107,
      "step": 246
    },
    {
      "epoch": 2.992412746585736,
      "step": 246,
      "total_flos": 411849782722560.0,
      "train_loss": 0.5282489497487138,
      "train_runtime": 6430.2484,
      "train_samples_per_second": 19.666,
      "train_steps_per_second": 0.038
    }
  ],
  "logging_steps": 10,
  "max_steps": 246,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 411849782722560.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}