{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.984,
  "eval_steps": 1,
  "global_step": 124,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.016,
      "grad_norm": 58.75,
      "learning_rate": 2.5e-05,
      "loss": 1.6327,
      "step": 1
    },
    {
      "epoch": 0.016,
      "eval_accuracy": 0.344,
      "eval_loss": 1.8050549030303955,
      "eval_runtime": 8.5979,
      "eval_samples_per_second": 29.077,
      "eval_steps_per_second": 3.722,
      "step": 1
    },
    {
      "epoch": 0.032,
      "grad_norm": 173.0,
      "learning_rate": 5e-05,
      "loss": 1.2182,
      "step": 2
    },
    {
      "epoch": 0.032,
      "eval_accuracy": 0.368,
      "eval_loss": 1.5831865072250366,
      "eval_runtime": 8.6593,
      "eval_samples_per_second": 28.871,
      "eval_steps_per_second": 3.695,
      "step": 2
    },
    {
      "epoch": 0.048,
      "grad_norm": 78.5,
      "learning_rate": 4.959016393442623e-05,
      "loss": 1.0166,
      "step": 3
    },
    {
      "epoch": 0.048,
      "eval_accuracy": 0.48,
      "eval_loss": 1.2497016191482544,
      "eval_runtime": 8.6547,
      "eval_samples_per_second": 28.886,
      "eval_steps_per_second": 3.697,
      "step": 3
    },
    {
      "epoch": 0.064,
      "grad_norm": 205.0,
      "learning_rate": 4.918032786885246e-05,
      "loss": 1.1151,
      "step": 4
    },
    {
      "epoch": 0.064,
      "eval_accuracy": 0.592,
      "eval_loss": 0.9809591174125671,
      "eval_runtime": 8.6606,
      "eval_samples_per_second": 28.866,
      "eval_steps_per_second": 3.695,
      "step": 4
    },
    {
      "epoch": 0.08,
      "grad_norm": 92.0,
      "learning_rate": 4.8770491803278687e-05,
      "loss": 1.1203,
      "step": 5
    },
    {
      "epoch": 0.08,
      "eval_accuracy": 0.616,
      "eval_loss": 0.9002195000648499,
      "eval_runtime": 8.6562,
      "eval_samples_per_second": 28.881,
      "eval_steps_per_second": 3.697,
      "step": 5
    },
    {
      "epoch": 0.096,
      "grad_norm": 39.5,
      "learning_rate": 4.836065573770492e-05,
      "loss": 0.3129,
      "step": 6
    },
    {
      "epoch": 0.096,
      "eval_accuracy": 0.692,
      "eval_loss": 0.8504685759544373,
      "eval_runtime": 8.6632,
      "eval_samples_per_second": 28.858,
      "eval_steps_per_second": 3.694,
      "step": 6
    },
    {
      "epoch": 0.112,
      "grad_norm": 93.0,
      "learning_rate": 4.795081967213115e-05,
      "loss": 0.989,
      "step": 7
    },
    {
      "epoch": 0.112,
      "eval_accuracy": 0.72,
      "eval_loss": 0.8811690807342529,
      "eval_runtime": 8.6664,
      "eval_samples_per_second": 28.847,
      "eval_steps_per_second": 3.692,
      "step": 7
    },
    {
      "epoch": 0.128,
      "grad_norm": 69.5,
      "learning_rate": 4.754098360655738e-05,
      "loss": 0.6991,
      "step": 8
    },
    {
      "epoch": 0.128,
      "eval_accuracy": 0.68,
      "eval_loss": 1.079397439956665,
      "eval_runtime": 8.6622,
      "eval_samples_per_second": 28.861,
      "eval_steps_per_second": 3.694,
      "step": 8
    },
    {
      "epoch": 0.144,
      "grad_norm": 161.0,
      "learning_rate": 4.713114754098361e-05,
      "loss": 1.2626,
      "step": 9
    },
    {
      "epoch": 0.144,
      "eval_accuracy": 0.688,
      "eval_loss": 1.0678237676620483,
      "eval_runtime": 8.6644,
      "eval_samples_per_second": 28.854,
      "eval_steps_per_second": 3.693,
      "step": 9
    },
    {
      "epoch": 0.16,
      "grad_norm": 155.0,
      "learning_rate": 4.672131147540984e-05,
      "loss": 0.7883,
      "step": 10
    },
    {
      "epoch": 0.16,
      "eval_accuracy": 0.696,
      "eval_loss": 0.88979172706604,
      "eval_runtime": 8.6685,
      "eval_samples_per_second": 28.84,
      "eval_steps_per_second": 3.692,
      "step": 10
    },
    {
      "epoch": 0.176,
      "grad_norm": 71.5,
      "learning_rate": 4.631147540983607e-05,
      "loss": 0.2973,
      "step": 11
    },
    {
      "epoch": 0.176,
      "eval_accuracy": 0.768,
      "eval_loss": 0.7034730315208435,
      "eval_runtime": 8.6628,
      "eval_samples_per_second": 28.859,
      "eval_steps_per_second": 3.694,
      "step": 11
    },
    {
      "epoch": 0.192,
      "grad_norm": 32.5,
      "learning_rate": 4.59016393442623e-05,
      "loss": 0.3976,
      "step": 12
    },
    {
      "epoch": 0.192,
      "eval_accuracy": 0.772,
      "eval_loss": 0.64277583360672,
      "eval_runtime": 8.6669,
      "eval_samples_per_second": 28.845,
      "eval_steps_per_second": 3.692,
      "step": 12
    },
    {
      "epoch": 0.208,
      "grad_norm": 85.0,
      "learning_rate": 4.549180327868853e-05,
      "loss": 0.8966,
      "step": 13
    },
    {
      "epoch": 0.208,
      "eval_accuracy": 0.776,
      "eval_loss": 0.5894673466682434,
      "eval_runtime": 8.6718,
      "eval_samples_per_second": 28.829,
      "eval_steps_per_second": 3.69,
      "step": 13
    },
    {
      "epoch": 0.224,
      "grad_norm": 93.0,
      "learning_rate": 4.508196721311476e-05,
      "loss": 0.3748,
      "step": 14
    },
    {
      "epoch": 0.224,
      "eval_accuracy": 0.748,
      "eval_loss": 0.6436864137649536,
      "eval_runtime": 8.6619,
      "eval_samples_per_second": 28.862,
      "eval_steps_per_second": 3.694,
      "step": 14
    },
    {
      "epoch": 0.24,
      "grad_norm": 102.0,
      "learning_rate": 4.467213114754098e-05,
      "loss": 0.6883,
      "step": 15
    },
    {
      "epoch": 0.24,
      "eval_accuracy": 0.74,
      "eval_loss": 0.6454311609268188,
      "eval_runtime": 8.6684,
      "eval_samples_per_second": 28.841,
      "eval_steps_per_second": 3.692,
      "step": 15
    },
    {
      "epoch": 0.256,
      "grad_norm": 40.5,
      "learning_rate": 4.426229508196721e-05,
      "loss": 0.3292,
      "step": 16
    },
    {
      "epoch": 0.256,
      "eval_accuracy": 0.708,
      "eval_loss": 0.8357064127922058,
      "eval_runtime": 8.6666,
      "eval_samples_per_second": 28.846,
      "eval_steps_per_second": 3.692,
      "step": 16
    },
    {
      "epoch": 0.272,
      "grad_norm": 138.0,
      "learning_rate": 4.3852459016393444e-05,
      "loss": 1.0341,
      "step": 17
    },
    {
      "epoch": 0.272,
      "eval_accuracy": 0.692,
      "eval_loss": 0.920940101146698,
      "eval_runtime": 8.6644,
      "eval_samples_per_second": 28.854,
      "eval_steps_per_second": 3.693,
      "step": 17
    },
    {
      "epoch": 0.288,
      "grad_norm": 97.5,
      "learning_rate": 4.3442622950819674e-05,
      "loss": 0.8867,
      "step": 18
    },
    {
      "epoch": 0.288,
      "eval_accuracy": 0.708,
      "eval_loss": 0.8621469736099243,
      "eval_runtime": 8.6638,
      "eval_samples_per_second": 28.856,
      "eval_steps_per_second": 3.694,
      "step": 18
    },
    {
      "epoch": 0.304,
      "grad_norm": 176.0,
      "learning_rate": 4.3032786885245904e-05,
      "loss": 1.2041,
      "step": 19
    },
    {
      "epoch": 0.304,
      "eval_accuracy": 0.744,
      "eval_loss": 0.67635178565979,
      "eval_runtime": 8.6671,
      "eval_samples_per_second": 28.845,
      "eval_steps_per_second": 3.692,
      "step": 19
    },
    {
      "epoch": 0.32,
      "grad_norm": 150.0,
      "learning_rate": 4.262295081967213e-05,
      "loss": 0.9002,
      "step": 20
    },
    {
      "epoch": 0.32,
      "eval_accuracy": 0.732,
      "eval_loss": 0.5985668301582336,
      "eval_runtime": 8.6631,
      "eval_samples_per_second": 28.858,
      "eval_steps_per_second": 3.694,
      "step": 20
    },
    {
      "epoch": 0.336,
      "grad_norm": 53.0,
      "learning_rate": 4.2213114754098365e-05,
      "loss": 0.8948,
      "step": 21
    },
    {
      "epoch": 0.336,
      "eval_accuracy": 0.716,
      "eval_loss": 0.652230978012085,
      "eval_runtime": 8.6655,
      "eval_samples_per_second": 28.85,
      "eval_steps_per_second": 3.693,
      "step": 21
    },
    {
      "epoch": 0.352,
      "grad_norm": 174.0,
      "learning_rate": 4.1803278688524595e-05,
      "loss": 0.86,
      "step": 22
    },
    {
      "epoch": 0.352,
      "eval_accuracy": 0.728,
      "eval_loss": 0.6597179174423218,
      "eval_runtime": 8.6672,
      "eval_samples_per_second": 28.844,
      "eval_steps_per_second": 3.692,
      "step": 22
    },
    {
      "epoch": 0.368,
      "grad_norm": 156.0,
      "learning_rate": 4.1393442622950826e-05,
      "loss": 0.6364,
      "step": 23
    },
    {
      "epoch": 0.368,
      "eval_accuracy": 0.744,
      "eval_loss": 0.5796850919723511,
      "eval_runtime": 8.664,
      "eval_samples_per_second": 28.855,
      "eval_steps_per_second": 3.693,
      "step": 23
    },
    {
      "epoch": 0.384,
      "grad_norm": 53.25,
      "learning_rate": 4.098360655737705e-05,
      "loss": 0.2094,
      "step": 24
    },
    {
      "epoch": 0.384,
      "eval_accuracy": 0.748,
      "eval_loss": 0.5883631706237793,
      "eval_runtime": 8.6686,
      "eval_samples_per_second": 28.84,
      "eval_steps_per_second": 3.692,
      "step": 24
    },
    {
      "epoch": 0.4,
      "grad_norm": 87.5,
      "learning_rate": 4.057377049180328e-05,
      "loss": 0.4607,
      "step": 25
    },
    {
      "epoch": 0.4,
      "eval_accuracy": 0.768,
      "eval_loss": 0.5390456318855286,
      "eval_runtime": 8.6866,
      "eval_samples_per_second": 28.78,
      "eval_steps_per_second": 3.684,
      "step": 25
    },
    {
      "epoch": 0.416,
      "grad_norm": 155.0,
      "learning_rate": 4.016393442622951e-05,
      "loss": 0.814,
      "step": 26
    },
    {
      "epoch": 0.416,
      "eval_accuracy": 0.78,
      "eval_loss": 0.4743637144565582,
      "eval_runtime": 8.6531,
      "eval_samples_per_second": 28.892,
      "eval_steps_per_second": 3.698,
      "step": 26
    },
    {
      "epoch": 0.432,
      "grad_norm": 41.0,
      "learning_rate": 3.975409836065574e-05,
      "loss": 0.5358,
      "step": 27
    },
    {
      "epoch": 0.432,
      "eval_accuracy": 0.776,
      "eval_loss": 0.4668542146682739,
      "eval_runtime": 8.6595,
      "eval_samples_per_second": 28.87,
      "eval_steps_per_second": 3.695,
      "step": 27
    },
    {
      "epoch": 0.448,
      "grad_norm": 131.0,
      "learning_rate": 3.934426229508197e-05,
      "loss": 0.5556,
      "step": 28
    },
    {
      "epoch": 0.448,
      "eval_accuracy": 0.736,
      "eval_loss": 0.6067003011703491,
      "eval_runtime": 8.6518,
      "eval_samples_per_second": 28.896,
      "eval_steps_per_second": 3.699,
      "step": 28
    },
    {
      "epoch": 0.464,
      "grad_norm": 126.5,
      "learning_rate": 3.89344262295082e-05,
      "loss": 0.505,
      "step": 29
    },
    {
      "epoch": 0.464,
      "eval_accuracy": 0.712,
      "eval_loss": 0.7375366687774658,
      "eval_runtime": 8.6519,
      "eval_samples_per_second": 28.895,
      "eval_steps_per_second": 3.699,
      "step": 29
    },
    {
      "epoch": 0.48,
      "grad_norm": 171.0,
      "learning_rate": 3.8524590163934424e-05,
      "loss": 0.9589,
      "step": 30
    },
    {
      "epoch": 0.48,
      "eval_accuracy": 0.704,
      "eval_loss": 0.7679601311683655,
      "eval_runtime": 8.6582,
      "eval_samples_per_second": 28.874,
      "eval_steps_per_second": 3.696,
      "step": 30
    },
    {
      "epoch": 0.496,
      "grad_norm": 150.0,
      "learning_rate": 3.8114754098360655e-05,
      "loss": 0.74,
      "step": 31
    },
    {
      "epoch": 0.496,
      "eval_accuracy": 0.732,
      "eval_loss": 0.6937733888626099,
      "eval_runtime": 8.6569,
      "eval_samples_per_second": 28.879,
      "eval_steps_per_second": 3.696,
      "step": 31
    },
    {
      "epoch": 0.512,
      "grad_norm": 79.5,
      "learning_rate": 3.7704918032786885e-05,
      "loss": 0.5474,
      "step": 32
    },
    {
      "epoch": 0.512,
      "eval_accuracy": 0.748,
      "eval_loss": 0.5756805539131165,
      "eval_runtime": 8.6562,
      "eval_samples_per_second": 28.881,
      "eval_steps_per_second": 3.697,
      "step": 32
    },
    {
      "epoch": 0.528,
      "grad_norm": 112.5,
      "learning_rate": 3.729508196721312e-05,
      "loss": 0.4916,
      "step": 33
    },
    {
      "epoch": 0.528,
      "eval_accuracy": 0.792,
      "eval_loss": 0.47289371490478516,
      "eval_runtime": 8.6581,
      "eval_samples_per_second": 28.875,
      "eval_steps_per_second": 3.696,
      "step": 33
    },
    {
      "epoch": 0.544,
      "grad_norm": 33.0,
      "learning_rate": 3.6885245901639346e-05,
      "loss": 0.8822,
      "step": 34
    },
    {
      "epoch": 0.544,
      "eval_accuracy": 0.82,
      "eval_loss": 0.4487142264842987,
      "eval_runtime": 8.6584,
      "eval_samples_per_second": 28.874,
      "eval_steps_per_second": 3.696,
      "step": 34
    },
    {
      "epoch": 0.56,
      "grad_norm": 84.5,
      "learning_rate": 3.6475409836065576e-05,
      "loss": 0.7691,
      "step": 35
    },
    {
      "epoch": 0.56,
      "eval_accuracy": 0.812,
      "eval_loss": 0.45519721508026123,
      "eval_runtime": 8.6547,
      "eval_samples_per_second": 28.886,
      "eval_steps_per_second": 3.697,
      "step": 35
    },
    {
      "epoch": 0.576,
      "grad_norm": 28.625,
      "learning_rate": 3.6065573770491806e-05,
      "loss": 0.4743,
      "step": 36
    },
    {
      "epoch": 0.576,
      "eval_accuracy": 0.764,
      "eval_loss": 0.5331873893737793,
      "eval_runtime": 8.6566,
      "eval_samples_per_second": 28.88,
      "eval_steps_per_second": 3.697,
      "step": 36
    },
    {
      "epoch": 0.592,
      "grad_norm": 23.875,
      "learning_rate": 3.5655737704918037e-05,
      "loss": 0.3101,
      "step": 37
    },
    {
      "epoch": 0.592,
      "eval_accuracy": 0.744,
      "eval_loss": 0.6849313974380493,
      "eval_runtime": 8.6571,
      "eval_samples_per_second": 28.878,
      "eval_steps_per_second": 3.696,
      "step": 37
    },
    {
      "epoch": 0.608,
      "grad_norm": 103.5,
      "learning_rate": 3.524590163934427e-05,
      "loss": 0.962,
      "step": 38
    },
    {
      "epoch": 0.608,
      "eval_accuracy": 0.724,
      "eval_loss": 0.7783421874046326,
      "eval_runtime": 8.6583,
      "eval_samples_per_second": 28.874,
      "eval_steps_per_second": 3.696,
      "step": 38
    },
    {
      "epoch": 0.624,
      "grad_norm": 133.0,
      "learning_rate": 3.483606557377049e-05,
      "loss": 0.5671,
      "step": 39
    },
    {
      "epoch": 0.624,
      "eval_accuracy": 0.712,
      "eval_loss": 0.7919518947601318,
      "eval_runtime": 8.661,
      "eval_samples_per_second": 28.865,
      "eval_steps_per_second": 3.695,
      "step": 39
    },
    {
      "epoch": 0.64,
      "grad_norm": 193.0,
      "learning_rate": 3.442622950819672e-05,
      "loss": 0.7741,
      "step": 40
    },
    {
      "epoch": 0.64,
      "eval_accuracy": 0.724,
      "eval_loss": 0.7195008397102356,
      "eval_runtime": 8.6644,
      "eval_samples_per_second": 28.854,
      "eval_steps_per_second": 3.693,
      "step": 40
    },
    {
      "epoch": 0.656,
      "grad_norm": 236.0,
      "learning_rate": 3.401639344262295e-05,
      "loss": 0.9336,
      "step": 41
    },
    {
      "epoch": 0.656,
      "eval_accuracy": 0.784,
      "eval_loss": 0.5999830365180969,
      "eval_runtime": 8.6611,
      "eval_samples_per_second": 28.865,
      "eval_steps_per_second": 3.695,
      "step": 41
    },
    {
      "epoch": 0.672,
      "grad_norm": 194.0,
      "learning_rate": 3.360655737704918e-05,
      "loss": 0.9252,
      "step": 42
    },
    {
      "epoch": 0.672,
      "eval_accuracy": 0.812,
      "eval_loss": 0.4787631928920746,
      "eval_runtime": 8.6643,
      "eval_samples_per_second": 28.854,
      "eval_steps_per_second": 3.693,
      "step": 42
    },
    {
      "epoch": 0.688,
      "grad_norm": 102.0,
      "learning_rate": 3.319672131147541e-05,
      "loss": 0.2934,
      "step": 43
    },
    {
      "epoch": 0.688,
      "eval_accuracy": 0.812,
      "eval_loss": 0.41090723872184753,
      "eval_runtime": 8.6614,
      "eval_samples_per_second": 28.864,
      "eval_steps_per_second": 3.695,
      "step": 43
    },
    {
      "epoch": 0.704,
      "grad_norm": 87.5,
      "learning_rate": 3.2786885245901635e-05,
      "loss": 0.4936,
      "step": 44
    },
    {
      "epoch": 0.704,
      "eval_accuracy": 0.78,
      "eval_loss": 0.46753987669944763,
      "eval_runtime": 8.6615,
      "eval_samples_per_second": 28.863,
      "eval_steps_per_second": 3.694,
      "step": 44
    },
    {
      "epoch": 0.72,
      "grad_norm": 18.375,
      "learning_rate": 3.237704918032787e-05,
      "loss": 0.3223,
      "step": 45
    },
    {
      "epoch": 0.72,
      "eval_accuracy": 0.748,
      "eval_loss": 0.5864301919937134,
      "eval_runtime": 8.6597,
      "eval_samples_per_second": 28.869,
      "eval_steps_per_second": 3.695,
      "step": 45
    },
    {
      "epoch": 0.736,
      "grad_norm": 121.0,
      "learning_rate": 3.19672131147541e-05,
      "loss": 0.408,
      "step": 46
    },
    {
      "epoch": 0.736,
      "eval_accuracy": 0.728,
      "eval_loss": 0.6596755981445312,
      "eval_runtime": 8.663,
      "eval_samples_per_second": 28.858,
      "eval_steps_per_second": 3.694,
      "step": 46
    },
    {
      "epoch": 0.752,
      "grad_norm": 54.0,
      "learning_rate": 3.155737704918033e-05,
      "loss": 0.759,
      "step": 47
    },
    {
      "epoch": 0.752,
      "eval_accuracy": 0.728,
      "eval_loss": 0.6460751891136169,
      "eval_runtime": 8.6565,
      "eval_samples_per_second": 28.88,
      "eval_steps_per_second": 3.697,
      "step": 47
    },
    {
      "epoch": 0.768,
      "grad_norm": 114.0,
      "learning_rate": 3.114754098360656e-05,
      "loss": 0.6628,
      "step": 48
    },
    {
      "epoch": 0.768,
      "eval_accuracy": 0.744,
      "eval_loss": 0.5938560962677002,
      "eval_runtime": 8.6567,
      "eval_samples_per_second": 28.879,
      "eval_steps_per_second": 3.697,
      "step": 48
    },
    {
      "epoch": 0.784,
      "grad_norm": 111.0,
      "learning_rate": 3.073770491803279e-05,
      "loss": 0.761,
      "step": 49
    },
    {
      "epoch": 0.784,
      "eval_accuracy": 0.804,
      "eval_loss": 0.5164662003517151,
      "eval_runtime": 8.6557,
      "eval_samples_per_second": 28.883,
      "eval_steps_per_second": 3.697,
      "step": 49
    },
    {
      "epoch": 0.8,
      "grad_norm": 32.0,
      "learning_rate": 3.0327868852459017e-05,
      "loss": 0.308,
      "step": 50
    },
    {
      "epoch": 0.8,
      "eval_accuracy": 0.836,
      "eval_loss": 0.43705108761787415,
      "eval_runtime": 8.6584,
      "eval_samples_per_second": 28.874,
      "eval_steps_per_second": 3.696,
      "step": 50
    },
    {
      "epoch": 0.816,
      "grad_norm": 78.0,
      "learning_rate": 2.9918032786885248e-05,
      "loss": 0.4859,
      "step": 51
    },
    {
      "epoch": 0.816,
      "eval_accuracy": 0.856,
      "eval_loss": 0.3826155364513397,
      "eval_runtime": 8.6539,
      "eval_samples_per_second": 28.889,
      "eval_steps_per_second": 3.698,
      "step": 51
    },
    {
      "epoch": 0.832,
      "grad_norm": 24.5,
      "learning_rate": 2.9508196721311478e-05,
      "loss": 0.6841,
      "step": 52
    },
    {
      "epoch": 0.832,
      "eval_accuracy": 0.828,
      "eval_loss": 0.3742530345916748,
      "eval_runtime": 8.6541,
      "eval_samples_per_second": 28.888,
      "eval_steps_per_second": 3.698,
      "step": 52
    },
    {
      "epoch": 0.848,
      "grad_norm": 37.5,
      "learning_rate": 2.9098360655737705e-05,
      "loss": 0.7852,
      "step": 53
    },
    {
      "epoch": 0.848,
      "eval_accuracy": 0.8,
      "eval_loss": 0.43144190311431885,
      "eval_runtime": 8.653,
      "eval_samples_per_second": 28.892,
      "eval_steps_per_second": 3.698,
      "step": 53
    },
    {
      "epoch": 0.864,
      "grad_norm": 91.0,
      "learning_rate": 2.8688524590163935e-05,
      "loss": 0.3388,
      "step": 54
    },
    {
      "epoch": 0.864,
      "eval_accuracy": 0.792,
      "eval_loss": 0.501422107219696,
      "eval_runtime": 8.6518,
      "eval_samples_per_second": 28.896,
      "eval_steps_per_second": 3.699,
      "step": 54
    },
    {
      "epoch": 0.88,
      "grad_norm": 17.625,
      "learning_rate": 2.8278688524590162e-05,
      "loss": 0.3829,
      "step": 55
    },
    {
      "epoch": 0.88,
      "eval_accuracy": 0.768,
      "eval_loss": 0.5729050040245056,
      "eval_runtime": 8.6468,
      "eval_samples_per_second": 28.912,
      "eval_steps_per_second": 3.701,
      "step": 55
    },
    {
      "epoch": 0.896,
      "grad_norm": 93.5,
      "learning_rate": 2.7868852459016392e-05,
      "loss": 0.6144,
      "step": 56
    },
    {
      "epoch": 0.896,
      "eval_accuracy": 0.764,
      "eval_loss": 0.6807990074157715,
      "eval_runtime": 8.6452,
      "eval_samples_per_second": 28.918,
      "eval_steps_per_second": 3.701,
      "step": 56
    },
    {
      "epoch": 0.912,
      "grad_norm": 28.5,
      "learning_rate": 2.7459016393442626e-05,
      "loss": 0.3515,
      "step": 57
    },
    {
      "epoch": 0.912,
      "eval_accuracy": 0.756,
      "eval_loss": 0.7396586537361145,
      "eval_runtime": 8.6535,
      "eval_samples_per_second": 28.89,
      "eval_steps_per_second": 3.698,
      "step": 57
    },
    {
      "epoch": 0.928,
      "grad_norm": 112.5,
      "learning_rate": 2.7049180327868856e-05,
      "loss": 0.3028,
      "step": 58
    },
    {
      "epoch": 0.928,
      "eval_accuracy": 0.756,
      "eval_loss": 0.745948314666748,
      "eval_runtime": 8.6584,
      "eval_samples_per_second": 28.874,
      "eval_steps_per_second": 3.696,
      "step": 58
    },
    {
      "epoch": 0.944,
      "grad_norm": 164.0,
      "learning_rate": 2.6639344262295087e-05,
      "loss": 0.6729,
      "step": 59
    },
    {
      "epoch": 0.944,
      "eval_accuracy": 0.752,
      "eval_loss": 0.7118371725082397,
      "eval_runtime": 8.6567,
      "eval_samples_per_second": 28.879,
      "eval_steps_per_second": 3.697,
      "step": 59
    },
    {
      "epoch": 0.96,
      "grad_norm": 131.0,
      "learning_rate": 2.6229508196721314e-05,
      "loss": 0.4634,
      "step": 60
    },
    {
      "epoch": 0.96,
      "eval_accuracy": 0.76,
      "eval_loss": 0.6441870331764221,
      "eval_runtime": 8.6557,
      "eval_samples_per_second": 28.883,
      "eval_steps_per_second": 3.697,
      "step": 60
    },
    {
      "epoch": 0.976,
      "grad_norm": 127.5,
      "learning_rate": 2.5819672131147544e-05,
      "loss": 0.5924,
      "step": 61
    },
    {
      "epoch": 0.976,
      "eval_accuracy": 0.776,
      "eval_loss": 0.5635260939598083,
      "eval_runtime": 8.6577,
      "eval_samples_per_second": 28.876,
      "eval_steps_per_second": 3.696,
      "step": 61
    },
    {
      "epoch": 0.992,
      "grad_norm": 130.0,
      "learning_rate": 2.540983606557377e-05,
      "loss": 0.5527,
      "step": 62
    },
    {
      "epoch": 0.992,
      "eval_accuracy": 0.796,
      "eval_loss": 0.4781284034252167,
      "eval_runtime": 8.6636,
      "eval_samples_per_second": 28.856,
      "eval_steps_per_second": 3.694,
      "step": 62
    },
    {
      "epoch": 1.008,
      "grad_norm": 43.0,
      "learning_rate": 2.5e-05,
      "loss": 0.1542,
      "step": 63
    },
    {
      "epoch": 1.008,
      "eval_accuracy": 0.82,
      "eval_loss": 0.4085061252117157,
      "eval_runtime": 8.6612,
      "eval_samples_per_second": 28.864,
      "eval_steps_per_second": 3.695,
      "step": 63
    },
    {
      "epoch": 1.024,
      "grad_norm": 147.0,
      "learning_rate": 2.459016393442623e-05,
      "loss": 0.3714,
      "step": 64
    },
    {
      "epoch": 1.024,
      "eval_accuracy": 0.848,
      "eval_loss": 0.37276288866996765,
      "eval_runtime": 8.6673,
      "eval_samples_per_second": 28.844,
      "eval_steps_per_second": 3.692,
      "step": 64
    },
    {
      "epoch": 1.04,
      "grad_norm": 34.5,
      "learning_rate": 2.418032786885246e-05,
      "loss": 0.1124,
      "step": 65
    },
    {
      "epoch": 1.04,
      "eval_accuracy": 0.848,
      "eval_loss": 0.36895105242729187,
      "eval_runtime": 8.6746,
      "eval_samples_per_second": 28.82,
      "eval_steps_per_second": 3.689,
      "step": 65
    },
    {
      "epoch": 1.056,
      "grad_norm": 50.25,
      "learning_rate": 2.377049180327869e-05,
      "loss": 0.1433,
      "step": 66
    },
    {
      "epoch": 1.056,
      "eval_accuracy": 0.844,
      "eval_loss": 0.3762807548046112,
      "eval_runtime": 8.6794,
      "eval_samples_per_second": 28.804,
      "eval_steps_per_second": 3.687,
      "step": 66
    },
    {
      "epoch": 1.072,
      "grad_norm": 85.5,
      "learning_rate": 2.336065573770492e-05,
      "loss": 0.2446,
      "step": 67
    },
    {
      "epoch": 1.072,
      "eval_accuracy": 0.84,
      "eval_loss": 0.38033661246299744,
      "eval_runtime": 8.6709,
      "eval_samples_per_second": 28.832,
      "eval_steps_per_second": 3.691,
      "step": 67
    },
    {
      "epoch": 1.088,
      "grad_norm": 120.5,
      "learning_rate": 2.295081967213115e-05,
      "loss": 0.6573,
      "step": 68
    },
    {
      "epoch": 1.088,
      "eval_accuracy": 0.848,
      "eval_loss": 0.37577661871910095,
      "eval_runtime": 8.6746,
      "eval_samples_per_second": 28.82,
      "eval_steps_per_second": 3.689,
      "step": 68
    },
    {
      "epoch": 1.104,
      "grad_norm": 32.25,
      "learning_rate": 2.254098360655738e-05,
      "loss": 0.1509,
      "step": 69
    },
    {
      "epoch": 1.104,
      "eval_accuracy": 0.848,
      "eval_loss": 0.36732277274131775,
      "eval_runtime": 8.6668,
      "eval_samples_per_second": 28.846,
      "eval_steps_per_second": 3.692,
      "step": 69
    },
    {
      "epoch": 1.12,
      "grad_norm": 36.0,
      "learning_rate": 2.2131147540983607e-05,
      "loss": 0.2131,
      "step": 70
    },
    {
      "epoch": 1.12,
      "eval_accuracy": 0.856,
      "eval_loss": 0.36693572998046875,
      "eval_runtime": 8.667,
      "eval_samples_per_second": 28.845,
      "eval_steps_per_second": 3.692,
      "step": 70
    },
    {
      "epoch": 1.1360000000000001,
      "grad_norm": 35.0,
      "learning_rate": 2.1721311475409837e-05,
      "loss": 0.077,
      "step": 71
    },
    {
      "epoch": 1.1360000000000001,
      "eval_accuracy": 0.836,
      "eval_loss": 0.3619978427886963,
      "eval_runtime": 8.671,
      "eval_samples_per_second": 28.832,
      "eval_steps_per_second": 3.69,
      "step": 71
    },
    {
      "epoch": 1.152,
      "grad_norm": 21.625,
      "learning_rate": 2.1311475409836064e-05,
      "loss": 0.2332,
      "step": 72
    },
    {
      "epoch": 1.152,
      "eval_accuracy": 0.832,
      "eval_loss": 0.36414313316345215,
      "eval_runtime": 8.6706,
      "eval_samples_per_second": 28.833,
      "eval_steps_per_second": 3.691,
      "step": 72
    },
    {
      "epoch": 1.168,
      "grad_norm": 69.5,
      "learning_rate": 2.0901639344262298e-05,
      "loss": 0.2056,
      "step": 73
    },
    {
      "epoch": 1.168,
      "eval_accuracy": 0.836,
      "eval_loss": 0.36293938755989075,
      "eval_runtime": 8.6724,
      "eval_samples_per_second": 28.827,
      "eval_steps_per_second": 3.69,
      "step": 73
    },
    {
      "epoch": 1.184,
      "grad_norm": 9.5,
      "learning_rate": 2.0491803278688525e-05,
      "loss": 0.1412,
      "step": 74
    },
    {
      "epoch": 1.184,
      "eval_accuracy": 0.844,
      "eval_loss": 0.3655231297016144,
      "eval_runtime": 8.6711,
      "eval_samples_per_second": 28.831,
      "eval_steps_per_second": 3.69,
      "step": 74
    },
    {
      "epoch": 1.2,
      "grad_norm": 35.25,
      "learning_rate": 2.0081967213114755e-05,
      "loss": 0.1982,
      "step": 75
    },
    {
      "epoch": 1.2,
      "eval_accuracy": 0.84,
      "eval_loss": 0.3644102215766907,
      "eval_runtime": 8.6641,
      "eval_samples_per_second": 28.855,
      "eval_steps_per_second": 3.693,
      "step": 75
    },
    {
      "epoch": 1.216,
      "grad_norm": 12.875,
      "learning_rate": 1.9672131147540985e-05,
      "loss": 0.2003,
      "step": 76
    },
    {
      "epoch": 1.216,
      "eval_accuracy": 0.84,
      "eval_loss": 0.3651863932609558,
      "eval_runtime": 8.6665,
      "eval_samples_per_second": 28.847,
      "eval_steps_per_second": 3.692,
      "step": 76
    },
    {
      "epoch": 1.232,
      "grad_norm": 7.28125,
      "learning_rate": 1.9262295081967212e-05,
      "loss": 0.0934,
      "step": 77
    },
    {
      "epoch": 1.232,
      "eval_accuracy": 0.84,
      "eval_loss": 0.3709143102169037,
      "eval_runtime": 8.6583,
      "eval_samples_per_second": 28.874,
      "eval_steps_per_second": 3.696,
      "step": 77
    },
    {
      "epoch": 1.248,
      "grad_norm": 42.25,
      "learning_rate": 1.8852459016393442e-05,
      "loss": 0.1577,
      "step": 78
    },
    {
      "epoch": 1.248,
      "eval_accuracy": 0.836,
      "eval_loss": 0.37103718519210815,
      "eval_runtime": 8.6594,
      "eval_samples_per_second": 28.87,
      "eval_steps_per_second": 3.695,
      "step": 78
    },
    {
      "epoch": 1.264,
      "grad_norm": 25.25,
      "learning_rate": 1.8442622950819673e-05,
      "loss": 0.3063,
      "step": 79
    },
    {
      "epoch": 1.264,
      "eval_accuracy": 0.832,
      "eval_loss": 0.3689051866531372,
      "eval_runtime": 8.6658,
      "eval_samples_per_second": 28.849,
      "eval_steps_per_second": 3.693,
      "step": 79
    },
    {
      "epoch": 1.28,
      "grad_norm": 31.625,
      "learning_rate": 1.8032786885245903e-05,
      "loss": 0.2724,
      "step": 80
    },
    {
      "epoch": 1.28,
      "eval_accuracy": 0.832,
      "eval_loss": 0.3685128688812256,
      "eval_runtime": 8.6623,
      "eval_samples_per_second": 28.861,
      "eval_steps_per_second": 3.694,
      "step": 80
    },
    {
      "epoch": 1.296,
      "grad_norm": 34.75,
      "learning_rate": 1.7622950819672133e-05,
      "loss": 0.4324,
      "step": 81
    },
    {
      "epoch": 1.296,
      "eval_accuracy": 0.836,
      "eval_loss": 0.3717711567878723,
      "eval_runtime": 8.6564,
      "eval_samples_per_second": 28.88,
      "eval_steps_per_second": 3.697,
      "step": 81
    },
    {
      "epoch": 1.312,
      "grad_norm": 33.0,
      "learning_rate": 1.721311475409836e-05,
      "loss": 0.1911,
      "step": 82
    },
    {
      "epoch": 1.312,
      "eval_accuracy": 0.84,
      "eval_loss": 0.3723936080932617,
      "eval_runtime": 8.6687,
      "eval_samples_per_second": 28.839,
      "eval_steps_per_second": 3.691,
      "step": 82
    },
    {
      "epoch": 1.328,
      "grad_norm": 16.125,
      "learning_rate": 1.680327868852459e-05,
      "loss": 0.1936,
      "step": 83
    },
    {
      "epoch": 1.328,
      "eval_accuracy": 0.84,
      "eval_loss": 0.3704240024089813,
      "eval_runtime": 8.6668,
      "eval_samples_per_second": 28.846,
      "eval_steps_per_second": 3.692,
      "step": 83
    },
    {
      "epoch": 1.3439999999999999,
      "grad_norm": 34.75,
      "learning_rate": 1.6393442622950818e-05,
      "loss": 0.0839,
      "step": 84
    },
    {
      "epoch": 1.3439999999999999,
      "eval_accuracy": 0.832,
      "eval_loss": 0.36510899662971497,
      "eval_runtime": 8.661,
      "eval_samples_per_second": 28.865,
      "eval_steps_per_second": 3.695,
      "step": 84
    },
    {
      "epoch": 1.3599999999999999,
      "grad_norm": 40.0,
      "learning_rate": 1.598360655737705e-05,
      "loss": 0.2661,
      "step": 85
    },
    {
      "epoch": 1.3599999999999999,
      "eval_accuracy": 0.84,
      "eval_loss": 0.3661534786224365,
      "eval_runtime": 8.6702,
      "eval_samples_per_second": 28.834,
      "eval_steps_per_second": 3.691,
      "step": 85
    },
    {
      "epoch": 1.376,
      "grad_norm": 52.5,
      "learning_rate": 1.557377049180328e-05,
      "loss": 0.1679,
      "step": 86
    },
    {
      "epoch": 1.376,
      "eval_accuracy": 0.848,
      "eval_loss": 0.36859577894210815,
      "eval_runtime": 8.6649,
      "eval_samples_per_second": 28.852,
      "eval_steps_per_second": 3.693,
      "step": 86
    },
    {
      "epoch": 1.392,
      "grad_norm": 12.75,
      "learning_rate": 1.5163934426229509e-05,
      "loss": 0.0698,
      "step": 87
    },
    {
      "epoch": 1.392,
      "eval_accuracy": 0.852,
      "eval_loss": 0.3691750466823578,
      "eval_runtime": 8.6861,
      "eval_samples_per_second": 28.782,
      "eval_steps_per_second": 3.684,
      "step": 87
    },
    {
      "epoch": 1.408,
      "grad_norm": 39.25,
      "learning_rate": 1.4754098360655739e-05,
      "loss": 0.1173,
      "step": 88
    },
    {
      "epoch": 1.408,
      "eval_accuracy": 0.856,
      "eval_loss": 0.3779418170452118,
      "eval_runtime": 8.6673,
      "eval_samples_per_second": 28.844,
      "eval_steps_per_second": 3.692,
      "step": 88
    },
    {
      "epoch": 1.424,
      "grad_norm": 21.5,
      "learning_rate": 1.4344262295081968e-05,
      "loss": 0.3727,
      "step": 89
    },
    {
      "epoch": 1.424,
      "eval_accuracy": 0.86,
      "eval_loss": 0.38709089159965515,
      "eval_runtime": 8.6636,
      "eval_samples_per_second": 28.856,
      "eval_steps_per_second": 3.694,
      "step": 89
    },
    {
      "epoch": 1.44,
      "grad_norm": 18.25,
      "learning_rate": 1.3934426229508196e-05,
      "loss": 0.3828,
      "step": 90
    },
    {
      "epoch": 1.44,
      "eval_accuracy": 0.86,
      "eval_loss": 0.3986479640007019,
      "eval_runtime": 8.6565,
      "eval_samples_per_second": 28.88,
      "eval_steps_per_second": 3.697,
      "step": 90
    },
    {
      "epoch": 1.456,
      "grad_norm": 29.875,
      "learning_rate": 1.3524590163934428e-05,
      "loss": 0.0911,
      "step": 91
    },
    {
      "epoch": 1.456,
      "eval_accuracy": 0.84,
      "eval_loss": 0.4078799784183502,
      "eval_runtime": 8.654,
      "eval_samples_per_second": 28.888,
      "eval_steps_per_second": 3.698,
      "step": 91
    },
    {
      "epoch": 1.472,
      "grad_norm": 40.75,
      "learning_rate": 1.3114754098360657e-05,
      "loss": 0.1798,
      "step": 92
    },
    {
      "epoch": 1.472,
      "eval_accuracy": 0.832,
      "eval_loss": 0.4203779399394989,
      "eval_runtime": 8.6654,
      "eval_samples_per_second": 28.85,
      "eval_steps_per_second": 3.693,
      "step": 92
    },
    {
      "epoch": 1.488,
      "grad_norm": 15.6875,
      "learning_rate": 1.2704918032786885e-05,
      "loss": 0.0851,
      "step": 93
    },
    {
      "epoch": 1.488,
      "eval_accuracy": 0.832,
      "eval_loss": 0.4253535568714142,
      "eval_runtime": 8.6605,
      "eval_samples_per_second": 28.867,
      "eval_steps_per_second": 3.695,
      "step": 93
    },
    {
      "epoch": 1.504,
      "grad_norm": 21.0,
      "learning_rate": 1.2295081967213116e-05,
      "loss": 0.0962,
      "step": 94
    },
    {
      "epoch": 1.504,
      "eval_accuracy": 0.832,
      "eval_loss": 0.42336249351501465,
      "eval_runtime": 8.6599,
      "eval_samples_per_second": 28.869,
      "eval_steps_per_second": 3.695,
      "step": 94
    },
    {
      "epoch": 1.52,
      "grad_norm": 111.0,
      "learning_rate": 1.1885245901639344e-05,
      "loss": 0.3427,
      "step": 95
    },
    {
      "epoch": 1.52,
      "eval_accuracy": 0.828,
      "eval_loss": 0.4188750684261322,
      "eval_runtime": 8.6648,
      "eval_samples_per_second": 28.852,
      "eval_steps_per_second": 3.693,
      "step": 95
    },
    {
      "epoch": 1.536,
      "grad_norm": 27.5,
      "learning_rate": 1.1475409836065575e-05,
      "loss": 0.0881,
      "step": 96
    },
    {
      "epoch": 1.536,
      "eval_accuracy": 0.84,
      "eval_loss": 0.4100199043750763,
      "eval_runtime": 8.6603,
      "eval_samples_per_second": 28.867,
      "eval_steps_per_second": 3.695,
      "step": 96
    },
    {
      "epoch": 1.552,
      "grad_norm": 69.0,
      "learning_rate": 1.1065573770491803e-05,
      "loss": 0.1664,
      "step": 97
    },
    {
      "epoch": 1.552,
      "eval_accuracy": 0.844,
      "eval_loss": 0.39891311526298523,
      "eval_runtime": 8.6581,
      "eval_samples_per_second": 28.875,
      "eval_steps_per_second": 3.696,
      "step": 97
    },
    {
      "epoch": 1.568,
      "grad_norm": 46.25,
      "learning_rate": 1.0655737704918032e-05,
      "loss": 0.6757,
      "step": 98
    },
    {
      "epoch": 1.568,
      "eval_accuracy": 0.856,
      "eval_loss": 0.3860258162021637,
      "eval_runtime": 8.66,
      "eval_samples_per_second": 28.868,
      "eval_steps_per_second": 3.695,
      "step": 98
    },
    {
      "epoch": 1.584,
      "grad_norm": 5.875,
      "learning_rate": 1.0245901639344262e-05,
      "loss": 0.0751,
      "step": 99
    },
    {
      "epoch": 1.584,
      "eval_accuracy": 0.86,
      "eval_loss": 0.3817059397697449,
      "eval_runtime": 8.6608,
      "eval_samples_per_second": 28.866,
      "eval_steps_per_second": 3.695,
      "step": 99
    },
    {
      "epoch": 1.6,
      "grad_norm": 64.0,
      "learning_rate": 9.836065573770493e-06,
      "loss": 0.1923,
      "step": 100
    },
    {
      "epoch": 1.6,
      "eval_accuracy": 0.856,
      "eval_loss": 0.37669360637664795,
      "eval_runtime": 8.6602,
      "eval_samples_per_second": 28.868,
      "eval_steps_per_second": 3.695,
      "step": 100
    },
    {
      "epoch": 1.616,
      "grad_norm": 11.75,
      "learning_rate": 9.426229508196721e-06,
      "loss": 0.0365,
      "step": 101
    },
    {
      "epoch": 1.616,
      "eval_accuracy": 0.848,
      "eval_loss": 0.3779665231704712,
      "eval_runtime": 8.6661,
      "eval_samples_per_second": 28.848,
      "eval_steps_per_second": 3.693,
      "step": 101
    },
    {
      "epoch": 1.6320000000000001,
      "grad_norm": 13.75,
      "learning_rate": 9.016393442622952e-06,
      "loss": 0.0895,
      "step": 102
    },
    {
      "epoch": 1.6320000000000001,
      "eval_accuracy": 0.848,
      "eval_loss": 0.3783411383628845,
      "eval_runtime": 8.6625,
      "eval_samples_per_second": 28.86,
      "eval_steps_per_second": 3.694,
      "step": 102
    },
    {
      "epoch": 1.6480000000000001,
      "grad_norm": 87.0,
      "learning_rate": 8.60655737704918e-06,
      "loss": 0.3337,
      "step": 103
    },
    {
      "epoch": 1.6480000000000001,
      "eval_accuracy": 0.852,
      "eval_loss": 0.3828529417514801,
      "eval_runtime": 8.6587,
      "eval_samples_per_second": 28.873,
      "eval_steps_per_second": 3.696,
      "step": 103
    },
    {
      "epoch": 1.6640000000000001,
      "grad_norm": 29.375,
      "learning_rate": 8.196721311475409e-06,
      "loss": 0.1525,
      "step": 104
    },
    {
      "epoch": 1.6640000000000001,
      "eval_accuracy": 0.852,
      "eval_loss": 0.38398581743240356,
      "eval_runtime": 8.6634,
      "eval_samples_per_second": 28.857,
      "eval_steps_per_second": 3.694,
      "step": 104
    },
    {
      "epoch": 1.6800000000000002,
      "grad_norm": 10.0625,
      "learning_rate": 7.78688524590164e-06,
      "loss": 0.0973,
      "step": 105
    },
    {
      "epoch": 1.6800000000000002,
      "eval_accuracy": 0.844,
      "eval_loss": 0.3848567605018616,
      "eval_runtime": 8.6582,
      "eval_samples_per_second": 28.874,
      "eval_steps_per_second": 3.696,
      "step": 105
    },
    {
      "epoch": 1.696,
      "grad_norm": 77.5,
      "learning_rate": 7.3770491803278695e-06,
      "loss": 0.2626,
      "step": 106
    },
    {
      "epoch": 1.696,
      "eval_accuracy": 0.852,
      "eval_loss": 0.385408878326416,
      "eval_runtime": 8.6577,
      "eval_samples_per_second": 28.876,
      "eval_steps_per_second": 3.696,
      "step": 106
    },
    {
      "epoch": 1.712,
      "grad_norm": 9.5,
      "learning_rate": 6.967213114754098e-06,
      "loss": 0.0585,
      "step": 107
    },
    {
      "epoch": 1.712,
      "eval_accuracy": 0.848,
      "eval_loss": 0.38454535603523254,
      "eval_runtime": 8.6544,
      "eval_samples_per_second": 28.887,
      "eval_steps_per_second": 3.698,
      "step": 107
    },
    {
      "epoch": 1.728,
      "grad_norm": 55.0,
      "learning_rate": 6.557377049180328e-06,
      "loss": 0.2257,
      "step": 108
    },
    {
      "epoch": 1.728,
      "eval_accuracy": 0.852,
      "eval_loss": 0.38379326462745667,
      "eval_runtime": 8.6523,
      "eval_samples_per_second": 28.894,
      "eval_steps_per_second": 3.698,
      "step": 108
    },
    {
      "epoch": 1.744,
      "grad_norm": 32.5,
      "learning_rate": 6.147540983606558e-06,
      "loss": 0.1137,
      "step": 109
    },
    {
      "epoch": 1.744,
      "eval_accuracy": 0.852,
      "eval_loss": 0.3833220303058624,
      "eval_runtime": 8.652,
      "eval_samples_per_second": 28.895,
      "eval_steps_per_second": 3.699,
      "step": 109
    },
    {
      "epoch": 1.76,
      "grad_norm": 15.4375,
      "learning_rate": 5.737704918032787e-06,
      "loss": 0.283,
      "step": 110
    },
    {
      "epoch": 1.76,
      "eval_accuracy": 0.852,
      "eval_loss": 0.37939703464508057,
      "eval_runtime": 8.6539,
      "eval_samples_per_second": 28.889,
      "eval_steps_per_second": 3.698,
      "step": 110
    },
    {
      "epoch": 1.776,
      "grad_norm": 27.375,
      "learning_rate": 5.327868852459016e-06,
      "loss": 0.1111,
      "step": 111
    },
    {
      "epoch": 1.776,
      "eval_accuracy": 0.852,
      "eval_loss": 0.3771066665649414,
      "eval_runtime": 8.6565,
      "eval_samples_per_second": 28.88,
      "eval_steps_per_second": 3.697,
      "step": 111
    },
    {
      "epoch": 1.792,
      "grad_norm": 26.125,
      "learning_rate": 4.918032786885246e-06,
      "loss": 0.1367,
      "step": 112
    },
    {
      "epoch": 1.792,
      "eval_accuracy": 0.852,
      "eval_loss": 0.3757225275039673,
      "eval_runtime": 8.6575,
      "eval_samples_per_second": 28.877,
      "eval_steps_per_second": 3.696,
      "step": 112
    },
    {
      "epoch": 1.808,
      "grad_norm": 24.875,
      "learning_rate": 4.508196721311476e-06,
      "loss": 0.0762,
      "step": 113
    },
    {
      "epoch": 1.808,
      "eval_accuracy": 0.852,
      "eval_loss": 0.3756250739097595,
      "eval_runtime": 8.6535,
      "eval_samples_per_second": 28.89,
      "eval_steps_per_second": 3.698,
      "step": 113
    },
    {
      "epoch": 1.8239999999999998,
      "grad_norm": 47.5,
      "learning_rate": 4.098360655737704e-06,
      "loss": 0.133,
      "step": 114
    },
    {
      "epoch": 1.8239999999999998,
      "eval_accuracy": 0.852,
      "eval_loss": 0.37420740723609924,
      "eval_runtime": 8.6587,
      "eval_samples_per_second": 28.873,
      "eval_steps_per_second": 3.696,
      "step": 114
    },
    {
      "epoch": 1.8399999999999999,
      "grad_norm": 22.625,
      "learning_rate": 3.6885245901639347e-06,
      "loss": 0.2904,
      "step": 115
    },
    {
      "epoch": 1.8399999999999999,
      "eval_accuracy": 0.852,
      "eval_loss": 0.372751921415329,
      "eval_runtime": 8.6548,
      "eval_samples_per_second": 28.886,
      "eval_steps_per_second": 3.697,
      "step": 115
    },
    {
      "epoch": 1.8559999999999999,
      "grad_norm": 16.75,
      "learning_rate": 3.278688524590164e-06,
      "loss": 0.1686,
      "step": 116
    },
    {
      "epoch": 1.8559999999999999,
      "eval_accuracy": 0.848,
      "eval_loss": 0.3734797239303589,
      "eval_runtime": 8.6629,
      "eval_samples_per_second": 28.859,
      "eval_steps_per_second": 3.694,
      "step": 116
    },
    {
      "epoch": 1.8719999999999999,
      "grad_norm": 35.25,
      "learning_rate": 2.8688524590163937e-06,
      "loss": 0.0737,
      "step": 117
    },
    {
      "epoch": 1.8719999999999999,
      "eval_accuracy": 0.848,
      "eval_loss": 0.3713564872741699,
      "eval_runtime": 8.6633,
      "eval_samples_per_second": 28.857,
      "eval_steps_per_second": 3.694,
      "step": 117
    },
    {
      "epoch": 1.888,
      "grad_norm": 100.0,
      "learning_rate": 2.459016393442623e-06,
      "loss": 0.2758,
      "step": 118
    },
    {
      "epoch": 1.888,
      "eval_accuracy": 0.848,
      "eval_loss": 0.3682093322277069,
      "eval_runtime": 8.6631,
      "eval_samples_per_second": 28.858,
      "eval_steps_per_second": 3.694,
      "step": 118
    },
    {
      "epoch": 1.904,
      "grad_norm": 22.5,
      "learning_rate": 2.049180327868852e-06,
      "loss": 0.0542,
      "step": 119
    },
    {
      "epoch": 1.904,
      "eval_accuracy": 0.848,
      "eval_loss": 0.3716946542263031,
      "eval_runtime": 8.6618,
      "eval_samples_per_second": 28.862,
      "eval_steps_per_second": 3.694,
      "step": 119
    },
    {
      "epoch": 1.92,
      "grad_norm": 69.0,
      "learning_rate": 1.639344262295082e-06,
      "loss": 0.1646,
      "step": 120
    },
    {
      "epoch": 1.92,
      "eval_accuracy": 0.848,
      "eval_loss": 0.3682910203933716,
      "eval_runtime": 8.6617,
      "eval_samples_per_second": 28.863,
      "eval_steps_per_second": 3.694,
      "step": 120
    },
    {
      "epoch": 1.936,
      "grad_norm": 31.125,
      "learning_rate": 1.2295081967213116e-06,
      "loss": 0.4908,
      "step": 121
    },
    {
      "epoch": 1.936,
      "eval_accuracy": 0.848,
      "eval_loss": 0.3708224594593048,
      "eval_runtime": 8.6585,
      "eval_samples_per_second": 28.873,
      "eval_steps_per_second": 3.696,
      "step": 121
    },
    {
      "epoch": 1.952,
      "grad_norm": 55.75,
      "learning_rate": 8.19672131147541e-07,
      "loss": 0.3249,
      "step": 122
    },
    {
      "epoch": 1.952,
      "eval_accuracy": 0.848,
      "eval_loss": 0.36828938126564026,
      "eval_runtime": 8.6603,
      "eval_samples_per_second": 28.867,
      "eval_steps_per_second": 3.695,
      "step": 122
    },
    {
      "epoch": 1.968,
      "grad_norm": 22.375,
      "learning_rate": 4.098360655737705e-07,
      "loss": 0.1096,
      "step": 123
    },
    {
      "epoch": 1.968,
      "eval_accuracy": 0.852,
      "eval_loss": 0.3706204891204834,
      "eval_runtime": 8.6631,
      "eval_samples_per_second": 28.858,
      "eval_steps_per_second": 3.694,
      "step": 123
    },
    {
      "epoch": 1.984,
      "grad_norm": 52.5,
      "learning_rate": 0.0,
      "loss": 0.1758,
      "step": 124
    },
    {
      "epoch": 1.984,
      "eval_accuracy": 0.852,
      "eval_loss": 0.36876150965690613,
      "eval_runtime": 8.6609,
      "eval_samples_per_second": 28.865,
      "eval_steps_per_second": 3.695,
      "step": 124
    },
    {
      "epoch": 1.984,
      "step": 124,
      "total_flos": 1.693315531538432e+16,
      "train_loss": 0.4407010670871504,
      "train_runtime": 1267.2396,
      "train_samples_per_second": 1.578,
      "train_steps_per_second": 0.098
    }
  ],
  "logging_steps": 1,
  "max_steps": 124,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.693315531538432e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}