{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9968454258675079,
  "eval_steps": 60,
  "global_step": 237,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004206098843322818,
      "grad_norm": 0.5299676656723022,
      "learning_rate": 2e-05,
      "loss": 1.77,
      "step": 1
    },
    {
      "epoch": 0.004206098843322818,
      "eval_loss": 1.9898090362548828,
      "eval_runtime": 65.4901,
      "eval_samples_per_second": 41.167,
      "eval_steps_per_second": 20.583,
      "step": 1
    },
    {
      "epoch": 0.008412197686645636,
      "grad_norm": 0.5349143743515015,
      "learning_rate": 4e-05,
      "loss": 1.745,
      "step": 2
    },
    {
      "epoch": 0.012618296529968454,
      "grad_norm": 0.5094612240791321,
      "learning_rate": 6e-05,
      "loss": 1.7007,
      "step": 3
    },
    {
      "epoch": 0.016824395373291272,
      "grad_norm": 0.5268917083740234,
      "learning_rate": 8e-05,
      "loss": 1.6582,
      "step": 4
    },
    {
      "epoch": 0.02103049421661409,
      "grad_norm": 0.5398459434509277,
      "learning_rate": 0.0001,
      "loss": 1.7391,
      "step": 5
    },
    {
      "epoch": 0.025236593059936908,
      "grad_norm": 0.5613242983818054,
      "learning_rate": 0.00012,
      "loss": 1.6436,
      "step": 6
    },
    {
      "epoch": 0.029442691903259727,
      "grad_norm": 0.45200833678245544,
      "learning_rate": 0.00014,
      "loss": 1.5859,
      "step": 7
    },
    {
      "epoch": 0.033648790746582544,
      "grad_norm": 0.3515471816062927,
      "learning_rate": 0.00016,
      "loss": 1.4385,
      "step": 8
    },
    {
      "epoch": 0.03785488958990536,
      "grad_norm": 0.2859397530555725,
      "learning_rate": 0.00018,
      "loss": 1.427,
      "step": 9
    },
    {
      "epoch": 0.04206098843322818,
      "grad_norm": 0.3456244170665741,
      "learning_rate": 0.0002,
      "loss": 1.408,
      "step": 10
    },
    {
      "epoch": 0.046267087276550996,
      "grad_norm": 0.42806366086006165,
      "learning_rate": 0.0001999904234053922,
      "loss": 1.4541,
      "step": 11
    },
    {
      "epoch": 0.050473186119873815,
      "grad_norm": 0.5130056142807007,
      "learning_rate": 0.00019996169545579207,
      "loss": 1.3664,
      "step": 12
    },
    {
      "epoch": 0.054679284963196635,
      "grad_norm": 0.39732199907302856,
      "learning_rate": 0.00019991382165351814,
      "loss": 1.3276,
      "step": 13
    },
    {
      "epoch": 0.058885383806519455,
      "grad_norm": 0.3794059157371521,
      "learning_rate": 0.00019984681116793038,
      "loss": 1.3153,
      "step": 14
    },
    {
      "epoch": 0.06309148264984227,
      "grad_norm": 0.27593305706977844,
      "learning_rate": 0.00019976067683367385,
      "loss": 1.2554,
      "step": 15
    },
    {
      "epoch": 0.06729758149316509,
      "grad_norm": 0.28591713309288025,
      "learning_rate": 0.00019965543514822062,
      "loss": 1.283,
      "step": 16
    },
    {
      "epoch": 0.07150368033648791,
      "grad_norm": 0.26724520325660706,
      "learning_rate": 0.00019953110626870979,
      "loss": 1.1646,
      "step": 17
    },
    {
      "epoch": 0.07570977917981073,
      "grad_norm": 0.24611811339855194,
      "learning_rate": 0.0001993877140080869,
      "loss": 1.1762,
      "step": 18
    },
    {
      "epoch": 0.07991587802313355,
      "grad_norm": 0.2281356304883957,
      "learning_rate": 0.000199225285830543,
      "loss": 1.1467,
      "step": 19
    },
    {
      "epoch": 0.08412197686645637,
      "grad_norm": 0.22052225470542908,
      "learning_rate": 0.00019904385284625424,
      "loss": 1.1377,
      "step": 20
    },
    {
      "epoch": 0.08832807570977919,
      "grad_norm": 0.23453611135482788,
      "learning_rate": 0.00019884344980542338,
      "loss": 1.1162,
      "step": 21
    },
    {
      "epoch": 0.09253417455310199,
      "grad_norm": 0.22467325627803802,
      "learning_rate": 0.00019862411509162406,
      "loss": 1.155,
      "step": 22
    },
    {
      "epoch": 0.09674027339642481,
      "grad_norm": 0.2170630544424057,
      "learning_rate": 0.00019838589071444903,
      "loss": 1.1279,
      "step": 23
    },
    {
      "epoch": 0.10094637223974763,
      "grad_norm": 0.21346993744373322,
      "learning_rate": 0.00019812882230146398,
      "loss": 1.0946,
      "step": 24
    },
    {
      "epoch": 0.10515247108307045,
      "grad_norm": 0.21408380568027496,
      "learning_rate": 0.00019785295908946848,
      "loss": 1.0889,
      "step": 25
    },
    {
      "epoch": 0.10935856992639327,
      "grad_norm": 0.22000430524349213,
      "learning_rate": 0.0001975583539150655,
      "loss": 1.0476,
      "step": 26
    },
    {
      "epoch": 0.11356466876971609,
      "grad_norm": 0.20778758823871613,
      "learning_rate": 0.00019724506320454153,
      "loss": 1.0954,
      "step": 27
    },
    {
      "epoch": 0.11777076761303891,
      "grad_norm": 0.22037693858146667,
      "learning_rate": 0.00019691314696305913,
      "loss": 1.055,
      "step": 28
    },
    {
      "epoch": 0.12197686645636173,
      "grad_norm": 0.20428280532360077,
      "learning_rate": 0.0001965626687631641,
      "loss": 1.0159,
      "step": 29
    },
    {
      "epoch": 0.12618296529968454,
      "grad_norm": 0.20502522587776184,
      "learning_rate": 0.00019619369573260924,
      "loss": 1.0254,
      "step": 30
    },
    {
      "epoch": 0.13038906414300735,
      "grad_norm": 0.2062043696641922,
      "learning_rate": 0.0001958062985414972,
      "loss": 0.9779,
      "step": 31
    },
    {
      "epoch": 0.13459516298633017,
      "grad_norm": 0.22229152917861938,
      "learning_rate": 0.00019540055138874505,
      "loss": 1.0201,
      "step": 32
    },
    {
      "epoch": 0.138801261829653,
      "grad_norm": 0.21910454332828522,
      "learning_rate": 0.00019497653198787264,
      "loss": 0.9958,
      "step": 33
    },
    {
      "epoch": 0.14300736067297581,
      "grad_norm": 0.22630847990512848,
      "learning_rate": 0.0001945343215521182,
      "loss": 0.9892,
      "step": 34
    },
    {
      "epoch": 0.14721345951629863,
      "grad_norm": 0.21370179951190948,
      "learning_rate": 0.00019407400477888315,
      "loss": 0.9409,
      "step": 35
    },
    {
      "epoch": 0.15141955835962145,
      "grad_norm": 0.22368259727954865,
      "learning_rate": 0.00019359566983351013,
      "loss": 0.9626,
      "step": 36
    },
    {
      "epoch": 0.15562565720294427,
      "grad_norm": 0.24231955409049988,
      "learning_rate": 0.00019309940833239626,
      "loss": 0.9914,
      "step": 37
    },
    {
      "epoch": 0.1598317560462671,
      "grad_norm": 0.24762062728405,
      "learning_rate": 0.00019258531532544585,
      "loss": 0.9311,
      "step": 38
    },
    {
      "epoch": 0.1640378548895899,
      "grad_norm": 0.21248659491539001,
      "learning_rate": 0.00019205348927786532,
      "loss": 0.9399,
      "step": 39
    },
    {
      "epoch": 0.16824395373291273,
      "grad_norm": 0.2374017834663391,
      "learning_rate": 0.00019150403205130383,
      "loss": 0.9664,
      "step": 40
    },
    {
      "epoch": 0.17245005257623555,
      "grad_norm": 0.25241079926490784,
      "learning_rate": 0.0001909370488843436,
      "loss": 0.9475,
      "step": 41
    },
    {
      "epoch": 0.17665615141955837,
      "grad_norm": 0.24083252251148224,
      "learning_rate": 0.00019035264837234347,
      "loss": 0.9602,
      "step": 42
    },
    {
      "epoch": 0.1808622502628812,
      "grad_norm": 0.24024806916713715,
      "learning_rate": 0.0001897509424466393,
      "loss": 0.9167,
      "step": 43
    },
    {
      "epoch": 0.18506834910620398,
      "grad_norm": 0.2538228929042816,
      "learning_rate": 0.0001891320463531055,
      "loss": 0.904,
      "step": 44
    },
    {
      "epoch": 0.1892744479495268,
      "grad_norm": 0.2393723875284195,
      "learning_rate": 0.00018849607863008193,
      "loss": 0.8927,
      "step": 45
    },
    {
      "epoch": 0.19348054679284962,
      "grad_norm": 0.2394389659166336,
      "learning_rate": 0.00018784316108566996,
      "loss": 0.8675,
      "step": 46
    },
    {
      "epoch": 0.19768664563617244,
      "grad_norm": 0.24351197481155396,
      "learning_rate": 0.00018717341877440226,
      "loss": 0.873,
      "step": 47
    },
    {
      "epoch": 0.20189274447949526,
      "grad_norm": 0.2396727055311203,
      "learning_rate": 0.000186486979973291,
      "loss": 0.8972,
      "step": 48
    },
    {
      "epoch": 0.20609884332281808,
      "grad_norm": 0.2674885392189026,
      "learning_rate": 0.0001857839761572586,
      "loss": 0.8613,
      "step": 49
    },
    {
      "epoch": 0.2103049421661409,
      "grad_norm": 0.25012922286987305,
      "learning_rate": 0.00018506454197395606,
      "loss": 0.8481,
      "step": 50
    },
    {
      "epoch": 0.21451104100946372,
      "grad_norm": 0.23941218852996826,
      "learning_rate": 0.0001843288152179739,
      "loss": 0.8638,
      "step": 51
    },
    {
      "epoch": 0.21871713985278654,
      "grad_norm": 0.25679612159729004,
      "learning_rate": 0.00018357693680444976,
      "loss": 0.8928,
      "step": 52
    },
    {
      "epoch": 0.22292323869610936,
      "grad_norm": 0.25766387581825256,
      "learning_rate": 0.00018280905074207884,
      "loss": 0.8997,
      "step": 53
    },
    {
      "epoch": 0.22712933753943218,
      "grad_norm": 0.24009671807289124,
      "learning_rate": 0.00018202530410553163,
      "loss": 0.8536,
      "step": 54
    },
    {
      "epoch": 0.231335436382755,
      "grad_norm": 0.24763701856136322,
      "learning_rate": 0.00018122584700728443,
      "loss": 0.8581,
      "step": 55
    },
    {
      "epoch": 0.23554153522607782,
      "grad_norm": 0.2651236653327942,
      "learning_rate": 0.0001804108325688679,
      "loss": 0.8164,
      "step": 56
    },
    {
      "epoch": 0.23974763406940064,
      "grad_norm": 0.23978441953659058,
      "learning_rate": 0.0001795804168915396,
      "loss": 0.8321,
      "step": 57
    },
    {
      "epoch": 0.24395373291272346,
      "grad_norm": 0.2508217394351959,
      "learning_rate": 0.00017873475902638553,
      "loss": 0.815,
      "step": 58
    },
    {
      "epoch": 0.24815983175604628,
      "grad_norm": 0.2765346169471741,
      "learning_rate": 0.00017787402094385666,
      "loss": 0.8674,
      "step": 59
    },
    {
      "epoch": 0.25236593059936907,
      "grad_norm": 0.27468088269233704,
      "learning_rate": 0.00017699836750274662,
      "loss": 0.8841,
      "step": 60
    },
    {
      "epoch": 0.25236593059936907,
      "eval_loss": 1.075273036956787,
      "eval_runtime": 66.5472,
      "eval_samples_per_second": 40.513,
      "eval_steps_per_second": 20.256,
      "step": 60
    },
    {
      "epoch": 0.2565720294426919,
      "grad_norm": 0.27056071162223816,
      "learning_rate": 0.00017610796641861581,
      "loss": 0.8459,
      "step": 61
    },
    {
      "epoch": 0.2607781282860147,
      "grad_norm": 0.2631956934928894,
      "learning_rate": 0.00017520298823166873,
      "loss": 0.8853,
      "step": 62
    },
    {
      "epoch": 0.26498422712933756,
      "grad_norm": 0.28352680802345276,
      "learning_rate": 0.00017428360627408978,
      "loss": 0.8625,
      "step": 63
    },
    {
      "epoch": 0.26919032597266035,
      "grad_norm": 0.24897028505802155,
      "learning_rate": 0.00017334999663684504,
      "loss": 0.8627,
      "step": 64
    },
    {
      "epoch": 0.2733964248159832,
      "grad_norm": 0.2620624303817749,
      "learning_rate": 0.00017240233813595478,
      "loss": 0.8088,
      "step": 65
    },
    {
      "epoch": 0.277602523659306,
      "grad_norm": 0.24983716011047363,
      "learning_rate": 0.0001714408122782448,
      "loss": 0.8318,
      "step": 66
    },
    {
      "epoch": 0.28180862250262884,
      "grad_norm": 0.2667708694934845,
      "learning_rate": 0.000170465603226582,
      "loss": 0.8368,
      "step": 67
    },
    {
      "epoch": 0.28601472134595163,
      "grad_norm": 0.2828388214111328,
      "learning_rate": 0.0001694768977646013,
      "loss": 0.8282,
      "step": 68
    },
    {
      "epoch": 0.2902208201892745,
      "grad_norm": 0.2581498324871063,
      "learning_rate": 0.0001684748852609306,
      "loss": 0.8375,
      "step": 69
    },
    {
      "epoch": 0.29442691903259727,
      "grad_norm": 0.27101799845695496,
      "learning_rate": 0.0001674597576329207,
      "loss": 0.818,
      "step": 70
    },
    {
      "epoch": 0.29863301787592006,
      "grad_norm": 0.27231255173683167,
      "learning_rate": 0.00016643170930988698,
      "loss": 0.843,
      "step": 71
    },
    {
      "epoch": 0.3028391167192429,
      "grad_norm": 0.2566690444946289,
      "learning_rate": 0.00016539093719586994,
      "loss": 0.8348,
      "step": 72
    },
    {
      "epoch": 0.3070452155625657,
      "grad_norm": 0.2482360601425171,
      "learning_rate": 0.00016433764063192194,
      "loss": 0.8122,
      "step": 73
    },
    {
      "epoch": 0.31125131440588855,
      "grad_norm": 0.25742995738983154,
      "learning_rate": 0.00016327202135792685,
      "loss": 0.776,
      "step": 74
    },
    {
      "epoch": 0.31545741324921134,
      "grad_norm": 0.25104233622550964,
      "learning_rate": 0.00016219428347396053,
      "loss": 0.7823,
      "step": 75
    },
    {
      "epoch": 0.3196635120925342,
      "grad_norm": 0.2921640872955322,
      "learning_rate": 0.00016110463340119913,
      "loss": 0.8127,
      "step": 76
    },
    {
      "epoch": 0.323869610935857,
      "grad_norm": 0.26554426550865173,
      "learning_rate": 0.00016000327984238292,
      "loss": 0.7716,
      "step": 77
    },
    {
      "epoch": 0.3280757097791798,
      "grad_norm": 0.24784542620182037,
      "learning_rate": 0.00015889043374184286,
      "loss": 0.7714,
      "step": 78
    },
    {
      "epoch": 0.3322818086225026,
      "grad_norm": 0.26592087745666504,
      "learning_rate": 0.0001577663082450984,
      "loss": 0.7397,
      "step": 79
    },
    {
      "epoch": 0.33648790746582546,
      "grad_norm": 0.3072431683540344,
      "learning_rate": 0.00015663111865803285,
      "loss": 0.7579,
      "step": 80
    },
    {
      "epoch": 0.34069400630914826,
      "grad_norm": 0.29445305466651917,
      "learning_rate": 0.00015548508240565583,
      "loss": 0.7998,
      "step": 81
    },
    {
      "epoch": 0.3449001051524711,
      "grad_norm": 0.26053521037101746,
      "learning_rate": 0.0001543284189904592,
      "loss": 0.7832,
      "step": 82
    },
    {
      "epoch": 0.3491062039957939,
      "grad_norm": 0.2956802248954773,
      "learning_rate": 0.00015316134995037545,
      "loss": 0.8054,
      "step": 83
    },
    {
      "epoch": 0.35331230283911674,
      "grad_norm": 0.2673921287059784,
      "learning_rate": 0.00015198409881634617,
      "loss": 0.8061,
      "step": 84
    },
    {
      "epoch": 0.35751840168243953,
      "grad_norm": 0.2793889045715332,
      "learning_rate": 0.00015079689106950854,
      "loss": 0.7902,
      "step": 85
    },
    {
      "epoch": 0.3617245005257624,
      "grad_norm": 0.26718223094940186,
      "learning_rate": 0.00014959995409800873,
      "loss": 0.7769,
      "step": 86
    },
    {
      "epoch": 0.3659305993690852,
      "grad_norm": 0.300536572933197,
      "learning_rate": 0.00014839351715344968,
      "loss": 0.8245,
      "step": 87
    },
    {
      "epoch": 0.37013669821240797,
      "grad_norm": 0.2824515998363495,
      "learning_rate": 0.00014717781130698212,
      "loss": 0.8122,
      "step": 88
    },
    {
      "epoch": 0.3743427970557308,
      "grad_norm": 0.28050506114959717,
      "learning_rate": 0.00014595306940504716,
      "loss": 0.778,
      "step": 89
    },
    {
      "epoch": 0.3785488958990536,
      "grad_norm": 0.2906787395477295,
      "learning_rate": 0.00014471952602477866,
      "loss": 0.7703,
      "step": 90
    },
    {
      "epoch": 0.38275499474237645,
      "grad_norm": 0.298177033662796,
      "learning_rate": 0.00014347741742907433,
      "loss": 0.7672,
      "step": 91
    },
    {
      "epoch": 0.38696109358569925,
      "grad_norm": 0.27583765983581543,
      "learning_rate": 0.00014222698152134374,
      "loss": 0.7784,
      "step": 92
    },
    {
      "epoch": 0.3911671924290221,
      "grad_norm": 0.28834670782089233,
      "learning_rate": 0.0001409684577999423,
      "loss": 0.8278,
      "step": 93
    },
    {
      "epoch": 0.3953732912723449,
      "grad_norm": 0.29721811413764954,
      "learning_rate": 0.00013970208731229974,
      "loss": 0.7997,
      "step": 94
    },
    {
      "epoch": 0.39957939011566773,
      "grad_norm": 0.2688146233558655,
      "learning_rate": 0.00013842811260875168,
      "loss": 0.7465,
      "step": 95
    },
    {
      "epoch": 0.4037854889589905,
      "grad_norm": 0.27095234394073486,
      "learning_rate": 0.0001371467776960837,
      "loss": 0.757,
      "step": 96
    },
    {
      "epoch": 0.40799158780231337,
      "grad_norm": 0.30743858218193054,
      "learning_rate": 0.0001358583279907961,
      "loss": 0.7882,
      "step": 97
    },
    {
      "epoch": 0.41219768664563616,
      "grad_norm": 0.274873822927475,
      "learning_rate": 0.00013456301027209882,
      "loss": 0.7737,
      "step": 98
    },
    {
      "epoch": 0.416403785488959,
      "grad_norm": 0.25485867261886597,
      "learning_rate": 0.00013326107263464558,
      "loss": 0.7454,
      "step": 99
    },
    {
      "epoch": 0.4206098843322818,
      "grad_norm": 0.2994694709777832,
      "learning_rate": 0.00013195276444101547,
      "loss": 0.8133,
      "step": 100
    },
    {
      "epoch": 0.42481598317560465,
      "grad_norm": 0.2943129241466522,
      "learning_rate": 0.0001306383362739523,
      "loss": 0.7501,
      "step": 101
    },
    {
      "epoch": 0.42902208201892744,
      "grad_norm": 0.2888595163822174,
      "learning_rate": 0.0001293180398883701,
      "loss": 0.7522,
      "step": 102
    },
    {
      "epoch": 0.4332281808622503,
      "grad_norm": 0.28455743193626404,
      "learning_rate": 0.00012799212816313376,
      "loss": 0.7278,
      "step": 103
    },
    {
      "epoch": 0.4374342797055731,
      "grad_norm": 0.32477039098739624,
      "learning_rate": 0.00012666085505262485,
      "loss": 0.819,
      "step": 104
    },
    {
      "epoch": 0.4416403785488959,
      "grad_norm": 0.28067031502723694,
      "learning_rate": 0.00012532447553810126,
      "loss": 0.7979,
      "step": 105
    },
    {
      "epoch": 0.4458464773922187,
      "grad_norm": 0.26430413126945496,
      "learning_rate": 0.00012398324557885994,
      "loss": 0.7497,
      "step": 106
    },
    {
      "epoch": 0.4500525762355415,
      "grad_norm": 0.27110588550567627,
      "learning_rate": 0.00012263742206321287,
      "loss": 0.7937,
      "step": 107
    },
    {
      "epoch": 0.45425867507886436,
      "grad_norm": 0.287041574716568,
      "learning_rate": 0.0001212872627592845,
      "loss": 0.7897,
      "step": 108
    },
    {
      "epoch": 0.45846477392218715,
      "grad_norm": 0.28561776876449585,
      "learning_rate": 0.00011993302626564102,
      "loss": 0.8011,
      "step": 109
    },
    {
      "epoch": 0.46267087276551,
      "grad_norm": 0.2852155566215515,
      "learning_rate": 0.00011857497196176049,
      "loss": 0.7426,
      "step": 110
    },
    {
      "epoch": 0.4668769716088328,
      "grad_norm": 0.2712121903896332,
      "learning_rate": 0.00011721335995835336,
      "loss": 0.7277,
      "step": 111
    },
    {
      "epoch": 0.47108307045215564,
      "grad_norm": 0.2779647409915924,
      "learning_rate": 0.00011584845104754304,
      "loss": 0.7698,
      "step": 112
    },
    {
      "epoch": 0.47528916929547843,
      "grad_norm": 0.2774654030799866,
      "learning_rate": 0.00011448050665291587,
      "loss": 0.7583,
      "step": 113
    },
    {
      "epoch": 0.4794952681388013,
      "grad_norm": 0.3046507239341736,
      "learning_rate": 0.00011310978877945007,
      "loss": 0.7987,
      "step": 114
    },
    {
      "epoch": 0.48370136698212407,
      "grad_norm": 0.2816363573074341,
      "learning_rate": 0.00011173655996333357,
      "loss": 0.7898,
      "step": 115
    },
    {
      "epoch": 0.4879074658254469,
      "grad_norm": 0.27383196353912354,
      "learning_rate": 0.00011036108322167988,
      "loss": 0.7248,
      "step": 116
    },
    {
      "epoch": 0.4921135646687697,
      "grad_norm": 0.28104445338249207,
      "learning_rate": 0.00010898362200215197,
      "loss": 0.7144,
      "step": 117
    },
    {
      "epoch": 0.49631966351209256,
      "grad_norm": 0.28643152117729187,
      "learning_rate": 0.0001076044401325036,
      "loss": 0.7856,
      "step": 118
    },
    {
      "epoch": 0.5005257623554153,
      "grad_norm": 0.261483371257782,
      "learning_rate": 0.0001062238017700478,
      "loss": 0.7429,
      "step": 119
    },
    {
      "epoch": 0.5047318611987381,
      "grad_norm": 0.2796306908130646,
      "learning_rate": 0.00010484197135106263,
      "loss": 0.7772,
      "step": 120
    },
    {
      "epoch": 0.5047318611987381,
      "eval_loss": 0.9960550665855408,
      "eval_runtime": 65.8413,
      "eval_samples_per_second": 40.947,
      "eval_steps_per_second": 20.473,
      "step": 120
    },
    {
      "epoch": 0.508937960042061,
      "grad_norm": 0.3079998791217804,
      "learning_rate": 0.00010345921354014279,
      "loss": 0.7497,
      "step": 121
    },
    {
      "epoch": 0.5131440588853838,
      "grad_norm": 0.3106074929237366,
      "learning_rate": 0.00010207579317950827,
      "loss": 0.7568,
      "step": 122
    },
    {
      "epoch": 0.5173501577287066,
      "grad_norm": 0.27859166264533997,
      "learning_rate": 0.00010069197523827833,
      "loss": 0.7695,
      "step": 123
    },
    {
      "epoch": 0.5215562565720294,
      "grad_norm": 0.2840277850627899,
      "learning_rate": 9.930802476172169e-05,
      "loss": 0.7815,
      "step": 124
    },
    {
      "epoch": 0.5257623554153522,
      "grad_norm": 0.28042981028556824,
      "learning_rate": 9.792420682049174e-05,
      "loss": 0.7546,
      "step": 125
    },
    {
      "epoch": 0.5299684542586751,
      "grad_norm": 0.2857164144515991,
      "learning_rate": 9.654078645985722e-05,
      "loss": 0.7617,
      "step": 126
    },
    {
      "epoch": 0.5341745531019979,
      "grad_norm": 0.29590827226638794,
      "learning_rate": 9.515802864893739e-05,
      "loss": 0.748,
      "step": 127
    },
    {
      "epoch": 0.5383806519453207,
      "grad_norm": 0.29375162720680237,
      "learning_rate": 9.377619822995219e-05,
      "loss": 0.7532,
      "step": 128
    },
    {
      "epoch": 0.5425867507886435,
      "grad_norm": 0.28436464071273804,
      "learning_rate": 9.239555986749645e-05,
      "loss": 0.7511,
      "step": 129
    },
    {
      "epoch": 0.5467928496319664,
      "grad_norm": 0.29677248001098633,
      "learning_rate": 9.101637799784804e-05,
      "loss": 0.7456,
      "step": 130
    },
    {
      "epoch": 0.5509989484752892,
      "grad_norm": 0.27983585000038147,
      "learning_rate": 8.963891677832011e-05,
      "loss": 0.6888,
      "step": 131
    },
    {
      "epoch": 0.555205047318612,
      "grad_norm": 0.27517008781433105,
      "learning_rate": 8.826344003666647e-05,
      "loss": 0.7431,
      "step": 132
    },
    {
      "epoch": 0.5594111461619348,
      "grad_norm": 0.28197160363197327,
      "learning_rate": 8.689021122054996e-05,
      "loss": 0.7379,
      "step": 133
    },
    {
      "epoch": 0.5636172450052577,
      "grad_norm": 0.29125264286994934,
      "learning_rate": 8.551949334708415e-05,
      "loss": 0.7639,
      "step": 134
    },
    {
      "epoch": 0.5678233438485805,
      "grad_norm": 0.2851899266242981,
      "learning_rate": 8.415154895245697e-05,
      "loss": 0.7764,
      "step": 135
    },
    {
      "epoch": 0.5720294426919033,
      "grad_norm": 0.2771802544593811,
      "learning_rate": 8.278664004164665e-05,
      "loss": 0.6961,
      "step": 136
    },
    {
      "epoch": 0.576235541535226,
      "grad_norm": 0.27956414222717285,
      "learning_rate": 8.142502803823955e-05,
      "loss": 0.7454,
      "step": 137
    },
    {
      "epoch": 0.580441640378549,
      "grad_norm": 0.30068668723106384,
      "learning_rate": 8.0066973734359e-05,
      "loss": 0.7683,
      "step": 138
    },
    {
      "epoch": 0.5846477392218717,
      "grad_norm": 0.2820778489112854,
      "learning_rate": 7.871273724071553e-05,
      "loss": 0.7412,
      "step": 139
    },
    {
      "epoch": 0.5888538380651945,
      "grad_norm": 0.2672085165977478,
      "learning_rate": 7.736257793678714e-05,
      "loss": 0.716,
      "step": 140
    },
    {
      "epoch": 0.5930599369085173,
      "grad_norm": 0.27900293469429016,
      "learning_rate": 7.601675442114009e-05,
      "loss": 0.7259,
      "step": 141
    },
    {
      "epoch": 0.5972660357518401,
      "grad_norm": 0.2954063415527344,
      "learning_rate": 7.46755244618988e-05,
      "loss": 0.7047,
      "step": 142
    },
    {
      "epoch": 0.601472134595163,
      "grad_norm": 0.3212134838104248,
      "learning_rate": 7.333914494737514e-05,
      "loss": 0.7657,
      "step": 143
    },
    {
      "epoch": 0.6056782334384858,
      "grad_norm": 0.30651283264160156,
      "learning_rate": 7.200787183686625e-05,
      "loss": 0.7489,
      "step": 144
    },
    {
      "epoch": 0.6098843322818086,
      "grad_norm": 0.26834797859191895,
      "learning_rate": 7.068196011162994e-05,
      "loss": 0.7484,
      "step": 145
    },
    {
      "epoch": 0.6140904311251314,
      "grad_norm": 0.2777973711490631,
      "learning_rate": 6.936166372604773e-05,
      "loss": 0.7245,
      "step": 146
    },
    {
      "epoch": 0.6182965299684543,
      "grad_norm": 0.293694406747818,
      "learning_rate": 6.804723555898458e-05,
      "loss": 0.7211,
      "step": 147
    },
    {
      "epoch": 0.6225026288117771,
      "grad_norm": 0.28515610098838806,
      "learning_rate": 6.673892736535448e-05,
      "loss": 0.7439,
      "step": 148
    },
    {
      "epoch": 0.6267087276550999,
      "grad_norm": 0.2929891049861908,
      "learning_rate": 6.543698972790117e-05,
      "loss": 0.7434,
      "step": 149
    },
    {
      "epoch": 0.6309148264984227,
      "grad_norm": 0.29031944274902344,
      "learning_rate": 6.414167200920391e-05,
      "loss": 0.7176,
      "step": 150
    },
    {
      "epoch": 0.6351209253417456,
      "grad_norm": 0.2764637768268585,
      "learning_rate": 6.28532223039163e-05,
      "loss": 0.7503,
      "step": 151
    },
    {
      "epoch": 0.6393270241850684,
      "grad_norm": 0.2900468707084656,
      "learning_rate": 6.157188739124834e-05,
      "loss": 0.6879,
      "step": 152
    },
    {
      "epoch": 0.6435331230283912,
      "grad_norm": 0.2989012897014618,
      "learning_rate": 6.029791268770029e-05,
      "loss": 0.7135,
      "step": 153
    },
    {
      "epoch": 0.647739221871714,
      "grad_norm": 0.2998535931110382,
      "learning_rate": 5.903154220005771e-05,
      "loss": 0.7171,
      "step": 154
    },
    {
      "epoch": 0.6519453207150369,
      "grad_norm": 0.27283868193626404,
      "learning_rate": 5.777301847865629e-05,
      "loss": 0.7112,
      "step": 155
    },
    {
      "epoch": 0.6561514195583596,
      "grad_norm": 0.2988041341304779,
      "learning_rate": 5.652258257092569e-05,
      "loss": 0.7444,
      "step": 156
    },
    {
      "epoch": 0.6603575184016824,
      "grad_norm": 0.2845938205718994,
      "learning_rate": 5.528047397522133e-05,
      "loss": 0.716,
      "step": 157
    },
    {
      "epoch": 0.6645636172450052,
      "grad_norm": 0.29695218801498413,
      "learning_rate": 5.404693059495285e-05,
      "loss": 0.7585,
      "step": 158
    },
    {
      "epoch": 0.668769716088328,
      "grad_norm": 0.28558245301246643,
      "learning_rate": 5.282218869301788e-05,
      "loss": 0.6908,
      "step": 159
    },
    {
      "epoch": 0.6729758149316509,
      "grad_norm": 0.280200719833374,
      "learning_rate": 5.160648284655032e-05,
      "loss": 0.7508,
      "step": 160
    },
    {
      "epoch": 0.6771819137749737,
      "grad_norm": 0.2981257438659668,
      "learning_rate": 5.040004590199128e-05,
      "loss": 0.7147,
      "step": 161
    },
    {
      "epoch": 0.6813880126182965,
      "grad_norm": 0.2873106598854065,
      "learning_rate": 4.920310893049146e-05,
      "loss": 0.7011,
      "step": 162
    },
    {
      "epoch": 0.6855941114616193,
      "grad_norm": 0.2717635929584503,
      "learning_rate": 4.801590118365383e-05,
      "loss": 0.6668,
      "step": 163
    },
    {
      "epoch": 0.6898002103049422,
      "grad_norm": 0.27607038617134094,
      "learning_rate": 4.683865004962452e-05,
      "loss": 0.7033,
      "step": 164
    },
    {
      "epoch": 0.694006309148265,
      "grad_norm": 0.2881218194961548,
      "learning_rate": 4.567158100954083e-05,
      "loss": 0.7275,
      "step": 165
    },
    {
      "epoch": 0.6982124079915878,
      "grad_norm": 0.2758018672466278,
      "learning_rate": 4.4514917594344184e-05,
      "loss": 0.737,
      "step": 166
    },
    {
      "epoch": 0.7024185068349106,
      "grad_norm": 0.29527172446250916,
      "learning_rate": 4.3368881341967135e-05,
      "loss": 0.7433,
      "step": 167
    },
    {
      "epoch": 0.7066246056782335,
      "grad_norm": 0.2847643792629242,
      "learning_rate": 4.223369175490162e-05,
      "loss": 0.7471,
      "step": 168
    },
    {
      "epoch": 0.7108307045215563,
      "grad_norm": 0.2958676815032959,
      "learning_rate": 4.110956625815713e-05,
      "loss": 0.6838,
      "step": 169
    },
    {
      "epoch": 0.7150368033648791,
      "grad_norm": 0.28350576758384705,
      "learning_rate": 3.9996720157617094e-05,
      "loss": 0.7306,
      "step": 170
    },
    {
      "epoch": 0.7192429022082019,
      "grad_norm": 0.2808986008167267,
      "learning_rate": 3.8895366598800896e-05,
      "loss": 0.6823,
      "step": 171
    },
    {
      "epoch": 0.7234490010515248,
      "grad_norm": 0.2684039771556854,
      "learning_rate": 3.780571652603949e-05,
      "loss": 0.7105,
      "step": 172
    },
    {
      "epoch": 0.7276550998948476,
      "grad_norm": 0.28138425946235657,
      "learning_rate": 3.672797864207316e-05,
      "loss": 0.7221,
      "step": 173
    },
    {
      "epoch": 0.7318611987381703,
      "grad_norm": 0.2772335708141327,
      "learning_rate": 3.566235936807808e-05,
      "loss": 0.6835,
      "step": 174
    },
    {
      "epoch": 0.7360672975814931,
      "grad_norm": 0.27244430780410767,
      "learning_rate": 3.460906280413007e-05,
      "loss": 0.6577,
      "step": 175
    },
    {
      "epoch": 0.7402733964248159,
      "grad_norm": 0.2977088689804077,
      "learning_rate": 3.3568290690113034e-05,
      "loss": 0.7213,
      "step": 176
    },
    {
      "epoch": 0.7444794952681388,
      "grad_norm": 0.289736270904541,
      "learning_rate": 3.25402423670793e-05,
      "loss": 0.7154,
      "step": 177
    },
    {
      "epoch": 0.7486855941114616,
      "grad_norm": 0.287818044424057,
      "learning_rate": 3.1525114739069415e-05,
      "loss": 0.6977,
      "step": 178
    },
    {
      "epoch": 0.7528916929547844,
      "grad_norm": 0.31408464908599854,
      "learning_rate": 3.0523102235398714e-05,
      "loss": 0.781,
      "step": 179
    },
    {
      "epoch": 0.7570977917981072,
      "grad_norm": 0.27790582180023193,
      "learning_rate": 2.9534396773417994e-05,
      "loss": 0.7169,
      "step": 180
    },
    {
      "epoch": 0.7570977917981072,
      "eval_loss": 0.9679059386253357,
      "eval_runtime": 66.127,
      "eval_samples_per_second": 40.77,
      "eval_steps_per_second": 20.385,
      "step": 180
    },
    {
      "epoch": 0.7613038906414301,
      "grad_norm": 0.28392866253852844,
      "learning_rate": 2.855918772175522e-05,
      "loss": 0.6662,
      "step": 181
    },
    {
      "epoch": 0.7655099894847529,
      "grad_norm": 0.2941664159297943,
      "learning_rate": 2.7597661864045233e-05,
      "loss": 0.6816,
      "step": 182
    },
    {
      "epoch": 0.7697160883280757,
      "grad_norm": 0.2740324139595032,
      "learning_rate": 2.6650003363154963e-05,
      "loss": 0.7046,
      "step": 183
    },
    {
      "epoch": 0.7739221871713985,
      "grad_norm": 0.2933352291584015,
      "learning_rate": 2.5716393725910215e-05,
      "loss": 0.7208,
      "step": 184
    },
    {
      "epoch": 0.7781282860147214,
      "grad_norm": 0.2843799591064453,
      "learning_rate": 2.47970117683313e-05,
      "loss": 0.685,
      "step": 185
    },
    {
      "epoch": 0.7823343848580442,
      "grad_norm": 0.27152329683303833,
      "learning_rate": 2.389203358138419e-05,
      "loss": 0.7176,
      "step": 186
    },
    {
      "epoch": 0.786540483701367,
      "grad_norm": 0.2916063964366913,
      "learning_rate": 2.3001632497253424e-05,
      "loss": 0.7439,
      "step": 187
    },
    {
      "epoch": 0.7907465825446898,
      "grad_norm": 0.27915897965431213,
      "learning_rate": 2.2125979056143364e-05,
      "loss": 0.7,
      "step": 188
    },
    {
      "epoch": 0.7949526813880127,
      "grad_norm": 0.30191752314567566,
      "learning_rate": 2.1265240973614486e-05,
      "loss": 0.7377,
      "step": 189
    },
    {
      "epoch": 0.7991587802313355,
      "grad_norm": 0.286101758480072,
      "learning_rate": 2.0419583108460418e-05,
      "loss": 0.6916,
      "step": 190
    },
    {
      "epoch": 0.8033648790746583,
      "grad_norm": 0.2800692319869995,
      "learning_rate": 1.958916743113214e-05,
      "loss": 0.7374,
      "step": 191
    },
    {
      "epoch": 0.807570977917981,
      "grad_norm": 0.27292168140411377,
      "learning_rate": 1.877415299271561e-05,
      "loss": 0.6757,
      "step": 192
    },
    {
      "epoch": 0.8117770767613038,
      "grad_norm": 0.28094640374183655,
      "learning_rate": 1.7974695894468384e-05,
      "loss": 0.7024,
      "step": 193
    },
    {
      "epoch": 0.8159831756046267,
      "grad_norm": 0.2871862053871155,
      "learning_rate": 1.7190949257921196e-05,
      "loss": 0.7173,
      "step": 194
    },
    {
      "epoch": 0.8201892744479495,
      "grad_norm": 0.27189600467681885,
      "learning_rate": 1.642306319555027e-05,
      "loss": 0.7019,
      "step": 195
    },
    {
      "epoch": 0.8243953732912723,
      "grad_norm": 0.28526559472084045,
      "learning_rate": 1.5671184782026106e-05,
      "loss": 0.7113,
      "step": 196
    },
    {
      "epoch": 0.8286014721345951,
      "grad_norm": 0.2855590283870697,
      "learning_rate": 1.4935458026043959e-05,
      "loss": 0.6977,
      "step": 197
    },
    {
      "epoch": 0.832807570977918,
      "grad_norm": 0.28118449449539185,
      "learning_rate": 1.4216023842741455e-05,
      "loss": 0.7241,
      "step": 198
    },
    {
      "epoch": 0.8370136698212408,
      "grad_norm": 0.28818827867507935,
      "learning_rate": 1.3513020026709023e-05,
      "loss": 0.6964,
      "step": 199
    },
    {
      "epoch": 0.8412197686645636,
      "grad_norm": 0.3235337436199188,
      "learning_rate": 1.2826581225597767e-05,
      "loss": 0.7406,
      "step": 200
    },
    {
      "epoch": 0.8454258675078864,
      "grad_norm": 0.2899198830127716,
      "learning_rate": 1.2156838914330072e-05,
      "loss": 0.7374,
      "step": 201
    },
    {
      "epoch": 0.8496319663512093,
      "grad_norm": 0.28662335872650146,
      "learning_rate": 1.1503921369918091e-05,
      "loss": 0.7039,
      "step": 202
    },
    {
      "epoch": 0.8538380651945321,
      "grad_norm": 0.2748032510280609,
      "learning_rate": 1.0867953646894525e-05,
      "loss": 0.7517,
      "step": 203
    },
    {
      "epoch": 0.8580441640378549,
      "grad_norm": 0.27125102281570435,
      "learning_rate": 1.0249057553360742e-05,
      "loss": 0.6948,
      "step": 204
    },
    {
      "epoch": 0.8622502628811777,
      "grad_norm": 0.2795623242855072,
      "learning_rate": 9.647351627656543e-06,
      "loss": 0.7123,
      "step": 205
    },
    {
      "epoch": 0.8664563617245006,
      "grad_norm": 0.28939002752304077,
      "learning_rate": 9.062951115656403e-06,
      "loss": 0.7266,
      "step": 206
    },
    {
      "epoch": 0.8706624605678234,
      "grad_norm": 0.2878707945346832,
      "learning_rate": 8.495967948696192e-06,
      "loss": 0.7335,
      "step": 207
    },
    {
      "epoch": 0.8748685594111462,
      "grad_norm": 0.27489086985588074,
      "learning_rate": 7.946510722134692e-06,
      "loss": 0.692,
      "step": 208
    },
    {
      "epoch": 0.879074658254469,
      "grad_norm": 0.2869216799736023,
      "learning_rate": 7.4146846745541506e-06,
      "loss": 0.7193,
      "step": 209
    },
    {
      "epoch": 0.8832807570977917,
      "grad_norm": 0.2801933288574219,
      "learning_rate": 6.900591667603751e-06,
      "loss": 0.7178,
      "step": 210
    },
    {
      "epoch": 0.8874868559411146,
      "grad_norm": 0.2767332196235657,
      "learning_rate": 6.40433016648988e-06,
      "loss": 0.7499,
      "step": 211
    },
    {
      "epoch": 0.8916929547844374,
      "grad_norm": 0.2783336043357849,
      "learning_rate": 5.925995221116853e-06,
      "loss": 0.7152,
      "step": 212
    },
    {
      "epoch": 0.8958990536277602,
      "grad_norm": 0.27832481265068054,
      "learning_rate": 5.465678447881828e-06,
      "loss": 0.6977,
      "step": 213
    },
    {
      "epoch": 0.900105152471083,
      "grad_norm": 0.2835717499256134,
      "learning_rate": 5.023468012127364e-06,
      "loss": 0.7251,
      "step": 214
    },
    {
      "epoch": 0.9043112513144059,
      "grad_norm": 0.27503538131713867,
      "learning_rate": 4.599448611254964e-06,
      "loss": 0.7166,
      "step": 215
    },
    {
      "epoch": 0.9085173501577287,
      "grad_norm": 0.26619476079940796,
      "learning_rate": 4.193701458502807e-06,
      "loss": 0.7095,
      "step": 216
    },
    {
      "epoch": 0.9127234490010515,
      "grad_norm": 0.2752280533313751,
      "learning_rate": 3.80630426739077e-06,
      "loss": 0.7412,
      "step": 217
    },
    {
      "epoch": 0.9169295478443743,
      "grad_norm": 0.281093567609787,
      "learning_rate": 3.4373312368358944e-06,
      "loss": 0.7592,
      "step": 218
    },
    {
      "epoch": 0.9211356466876972,
      "grad_norm": 0.28015753626823425,
      "learning_rate": 3.086853036940862e-06,
      "loss": 0.7104,
      "step": 219
    },
    {
      "epoch": 0.92534174553102,
      "grad_norm": 0.2644014358520508,
      "learning_rate": 2.754936795458485e-06,
      "loss": 0.6985,
      "step": 220
    },
    {
      "epoch": 0.9295478443743428,
      "grad_norm": 0.2755027413368225,
      "learning_rate": 2.4416460849345123e-06,
      "loss": 0.7159,
      "step": 221
    },
    {
      "epoch": 0.9337539432176656,
      "grad_norm": 0.28020283579826355,
      "learning_rate": 2.1470409105315283e-06,
      "loss": 0.7389,
      "step": 222
    },
    {
      "epoch": 0.9379600420609885,
      "grad_norm": 0.2773683965206146,
      "learning_rate": 1.8711776985360308e-06,
      "loss": 0.686,
      "step": 223
    },
    {
      "epoch": 0.9421661409043113,
      "grad_norm": 0.2784758508205414,
      "learning_rate": 1.61410928555098e-06,
      "loss": 0.6857,
      "step": 224
    },
    {
      "epoch": 0.9463722397476341,
      "grad_norm": 0.2857016623020172,
      "learning_rate": 1.3758849083759352e-06,
      "loss": 0.6982,
      "step": 225
    },
    {
      "epoch": 0.9505783385909569,
      "grad_norm": 0.27618998289108276,
      "learning_rate": 1.1565501945766222e-06,
      "loss": 0.7328,
      "step": 226
    },
    {
      "epoch": 0.9547844374342797,
      "grad_norm": 0.273423969745636,
      "learning_rate": 9.56147153745779e-07,
      "loss": 0.6762,
      "step": 227
    },
    {
      "epoch": 0.9589905362776026,
      "grad_norm": 0.2603454291820526,
      "learning_rate": 7.747141694570026e-07,
      "loss": 0.6784,
      "step": 228
    },
    {
      "epoch": 0.9631966351209253,
      "grad_norm": 0.2638219892978668,
      "learning_rate": 6.122859919130974e-07,
      "loss": 0.731,
      "step": 229
    },
    {
      "epoch": 0.9674027339642481,
      "grad_norm": 0.28604456782341003,
      "learning_rate": 4.6889373129022085e-07,
      "loss": 0.6937,
      "step": 230
    },
    {
      "epoch": 0.9716088328075709,
      "grad_norm": 0.2867179811000824,
      "learning_rate": 3.445648517793942e-07,
      "loss": 0.7492,
      "step": 231
    },
    {
      "epoch": 0.9758149316508938,
      "grad_norm": 0.27991774678230286,
      "learning_rate": 2.3932316632614416e-07,
      "loss": 0.7411,
      "step": 232
    },
    {
      "epoch": 0.9800210304942166,
      "grad_norm": 0.2658878266811371,
      "learning_rate": 1.5318883206962842e-07,
      "loss": 0.7317,
      "step": 233
    },
    {
      "epoch": 0.9842271293375394,
      "grad_norm": 0.26533135771751404,
      "learning_rate": 8.617834648185774e-08,
      "loss": 0.6636,
      "step": 234
    },
    {
      "epoch": 0.9884332281808622,
      "grad_norm": 0.26577314734458923,
      "learning_rate": 3.8304544207945495e-08,
      "loss": 0.7273,
      "step": 235
    },
    {
      "epoch": 0.9926393270241851,
      "grad_norm": 0.2715383768081665,
      "learning_rate": 9.576594607807465e-09,
      "loss": 0.7253,
      "step": 236
    },
    {
      "epoch": 0.9968454258675079,
      "grad_norm": 0.28140708804130554,
      "learning_rate": 0.0,
      "loss": 0.6756,
      "step": 237
    }
  ],
  "logging_steps": 1,
  "max_steps": 237,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.319764496895181e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}