Upload 8 files

Browse files

Files changed (8) hide show

config.json +29 -0
generation_config.json +5 -0
model.safetensors +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
trainer_state.json +3921 -0
training_args.bin +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "architectures": [
+    "GPTNeoXForCausalLM"
+  ],
+  "attention_bias": true,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "classifier_dropout": 0.1,
+  "eos_token_id": 1,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.0,
+  "hidden_size": 512,
+  "initializer_range": 0.02,
+  "intermediate_size": 2048,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 2048,
+  "model_type": "gpt_neox",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 6,
+  "rope_scaling": null,
+  "rotary_emb_base": 10000,
+  "rotary_pct": 0.25,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.39.2",
+  "use_cache": true,
+  "use_parallel_residual": true,
+  "vocab_size": 384
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "_from_model_config": true,
+  "eos_token_id": 1,
+  "transformers_version": "4.39.2"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cd75b7f8c67702815af97644adc44cb8293ea3d6ab29636c11810aa72fd78a47
+size 77242672

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b51a5b06a16148881aef4419f115048eae5462c2e56366c6ba1c9a8c9d2c3c5d
+size 154513210

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:592c43fcd4f224a713b94e9b0547d56fff2d04c9a3f50aaae922858b09cf4f47
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:54170defda7f8314ce25911dc27a6c89c123f5c8df2ad3e4ecbdadd81aee4d82
+size 1000

trainer_state.json ADDED Viewed

	@@ -0,0 +1,3921 @@

+{
+  "best_metric": 2.5235376358032227,
+  "best_model_checkpoint": "./results/checkpoint-50000",
+  "epoch": 0.3637362257638006,
+  "eval_steps": 1000,
+  "global_step": 50000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "grad_norm": 3.156810998916626,
+      "learning_rate": 5e-06,
+      "loss": 5.0881,
+      "step": 100
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.8936021327972412,
+      "learning_rate": 1e-05,
+      "loss": 3.4609,
+      "step": 200
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.8277048468589783,
+      "learning_rate": 1.5e-05,
+      "loss": 2.9956,
+      "step": 300
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.5409151315689087,
+      "learning_rate": 2e-05,
+      "loss": 2.7702,
+      "step": 400
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.37298905849456787,
+      "learning_rate": 2.5e-05,
+      "loss": 2.7116,
+      "step": 500
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.4856789112091064,
+      "learning_rate": 3e-05,
+      "loss": 2.676,
+      "step": 600
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.9477183222770691,
+      "learning_rate": 3.5e-05,
+      "loss": 2.6509,
+      "step": 700
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.42317140102386475,
+      "learning_rate": 4e-05,
+      "loss": 2.6364,
+      "step": 800
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.9827988743782043,
+      "learning_rate": 4.5e-05,
+      "loss": 2.6289,
+      "step": 900
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.647011935710907,
+      "learning_rate": 5e-05,
+      "loss": 2.6236,
+      "step": 1000
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 2.6208438873291016,
+      "eval_runtime": 5241.4331,
+      "eval_samples_per_second": 1118.979,
+      "eval_steps_per_second": 69.936,
+      "step": 1000
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.3872681260108948,
+      "learning_rate": 5.500000000000001e-05,
+      "loss": 2.6201,
+      "step": 1100
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.6852623224258423,
+      "learning_rate": 6e-05,
+      "loss": 2.6152,
+      "step": 1200
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.5895536541938782,
+      "learning_rate": 6.500000000000001e-05,
+      "loss": 2.6152,
+      "step": 1300
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.5872493386268616,
+      "learning_rate": 7e-05,
+      "loss": 2.6124,
+      "step": 1400
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.6140819787979126,
+      "learning_rate": 7.500000000000001e-05,
+      "loss": 2.6106,
+      "step": 1500
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.6721034646034241,
+      "learning_rate": 8e-05,
+      "loss": 2.6097,
+      "step": 1600
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.3682301640510559,
+      "learning_rate": 8.5e-05,
+      "loss": 2.6075,
+      "step": 1700
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.4152977764606476,
+      "learning_rate": 9e-05,
+      "loss": 2.6065,
+      "step": 1800
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.30114710330963135,
+      "learning_rate": 9.5e-05,
+      "loss": 2.6067,
+      "step": 1900
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.29172569513320923,
+      "learning_rate": 0.0001,
+      "loss": 2.6056,
+      "step": 2000
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 2.604510545730591,
+      "eval_runtime": 5009.0383,
+      "eval_samples_per_second": 1170.894,
+      "eval_steps_per_second": 73.181,
+      "step": 2000
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.26403701305389404,
+      "learning_rate": 9.999999909099618e-05,
+      "loss": 2.6061,
+      "step": 2100
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.42730650305747986,
+      "learning_rate": 9.999999636398476e-05,
+      "loss": 2.6038,
+      "step": 2200
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.2776353657245636,
+      "learning_rate": 9.999999181896581e-05,
+      "loss": 2.6048,
+      "step": 2300
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.2872585952281952,
+      "learning_rate": 9.999998545593951e-05,
+      "loss": 2.6031,
+      "step": 2400
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.24884726107120514,
+      "learning_rate": 9.999997727490612e-05,
+      "loss": 2.6007,
+      "step": 2500
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.2905128002166748,
+      "learning_rate": 9.99999672758659e-05,
+      "loss": 2.6001,
+      "step": 2600
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.22506840527057648,
+      "learning_rate": 9.999995545881924e-05,
+      "loss": 2.6002,
+      "step": 2700
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.235252246260643,
+      "learning_rate": 9.999994182376653e-05,
+      "loss": 2.5991,
+      "step": 2800
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.3328251838684082,
+      "learning_rate": 9.999992637070832e-05,
+      "loss": 2.5997,
+      "step": 2900
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.23325322568416595,
+      "learning_rate": 9.999990909964513e-05,
+      "loss": 2.5995,
+      "step": 3000
+    },
+    {
+      "epoch": 0.02,
+      "eval_loss": 2.598254919052124,
+      "eval_runtime": 5203.4796,
+      "eval_samples_per_second": 1127.141,
+      "eval_steps_per_second": 70.446,
+      "step": 3000
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.2796269655227661,
+      "learning_rate": 9.99998900105776e-05,
+      "loss": 2.5984,
+      "step": 3100
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.21093028783798218,
+      "learning_rate": 9.999986910350642e-05,
+      "loss": 2.5985,
+      "step": 3200
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.22558662295341492,
+      "learning_rate": 9.999984637843238e-05,
+      "loss": 2.5978,
+      "step": 3300
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.3363204598426819,
+      "learning_rate": 9.999982183535627e-05,
+      "loss": 2.5965,
+      "step": 3400
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.27081742882728577,
+      "learning_rate": 9.9999795474279e-05,
+      "loss": 2.5961,
+      "step": 3500
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.23502503335475922,
+      "learning_rate": 9.999976729520151e-05,
+      "loss": 2.5956,
+      "step": 3600
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.17260690033435822,
+      "learning_rate": 9.999973729812485e-05,
+      "loss": 2.5955,
+      "step": 3700
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.16165785491466522,
+      "learning_rate": 9.999970548305009e-05,
+      "loss": 2.5955,
+      "step": 3800
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.24890035390853882,
+      "learning_rate": 9.99996718499784e-05,
+      "loss": 2.5946,
+      "step": 3900
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.30852416157722473,
+      "learning_rate": 9.999963639891102e-05,
+      "loss": 2.5956,
+      "step": 4000
+    },
+    {
+      "epoch": 0.03,
+      "eval_loss": 2.594769239425659,
+      "eval_runtime": 5243.3493,
+      "eval_samples_per_second": 1118.57,
+      "eval_steps_per_second": 69.911,
+      "step": 4000
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.37085938453674316,
+      "learning_rate": 9.999959912984918e-05,
+      "loss": 2.5947,
+      "step": 4100
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.3395659029483795,
+      "learning_rate": 9.999956004279429e-05,
+      "loss": 2.5955,
+      "step": 4200
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.16178911924362183,
+      "learning_rate": 9.999951913774777e-05,
+      "loss": 2.594,
+      "step": 4300
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.32948923110961914,
+      "learning_rate": 9.999947641471107e-05,
+      "loss": 2.5941,
+      "step": 4400
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.20488527417182922,
+      "learning_rate": 9.999943187368577e-05,
+      "loss": 2.5934,
+      "step": 4500
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.2330751270055771,
+      "learning_rate": 9.999938551467348e-05,
+      "loss": 2.593,
+      "step": 4600
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.2759188711643219,
+      "learning_rate": 9.99993373376759e-05,
+      "loss": 2.5934,
+      "step": 4700
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.20582658052444458,
+      "learning_rate": 9.999928734269477e-05,
+      "loss": 2.5921,
+      "step": 4800
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.27048927545547485,
+      "learning_rate": 9.99992355297319e-05,
+      "loss": 2.5908,
+      "step": 4900
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.23385432362556458,
+      "learning_rate": 9.999918189878918e-05,
+      "loss": 2.5921,
+      "step": 5000
+    },
+    {
+      "epoch": 0.04,
+      "eval_loss": 2.5919315814971924,
+      "eval_runtime": 5496.6841,
+      "eval_samples_per_second": 1067.017,
+      "eval_steps_per_second": 66.689,
+      "step": 5000
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.22557920217514038,
+      "learning_rate": 9.999912644986859e-05,
+      "loss": 2.5931,
+      "step": 5100
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.23983053863048553,
+      "learning_rate": 9.99990691829721e-05,
+      "loss": 2.5915,
+      "step": 5200
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.18743467330932617,
+      "learning_rate": 9.999901009810181e-05,
+      "loss": 2.5913,
+      "step": 5300
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.20421144366264343,
+      "learning_rate": 9.999894919525987e-05,
+      "loss": 2.59,
+      "step": 5400
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.19018307328224182,
+      "learning_rate": 9.999888647444851e-05,
+      "loss": 2.5908,
+      "step": 5500
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.23669780790805817,
+      "learning_rate": 9.999882193566997e-05,
+      "loss": 2.5908,
+      "step": 5600
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.20617301762104034,
+      "learning_rate": 9.999875557892664e-05,
+      "loss": 2.5902,
+      "step": 5700
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.1931978017091751,
+      "learning_rate": 9.999868740422092e-05,
+      "loss": 2.5907,
+      "step": 5800
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.19971776008605957,
+      "learning_rate": 9.999861741155526e-05,
+      "loss": 2.5885,
+      "step": 5900
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.21058019995689392,
+      "learning_rate": 9.999854560093225e-05,
+      "loss": 2.5897,
+      "step": 6000
+    },
+    {
+      "epoch": 0.04,
+      "eval_loss": 2.5891263484954834,
+      "eval_runtime": 5309.84,
+      "eval_samples_per_second": 1104.563,
+      "eval_steps_per_second": 69.035,
+      "step": 6000
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.2691722512245178,
+      "learning_rate": 9.999847197235446e-05,
+      "loss": 2.5882,
+      "step": 6100
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.2411184161901474,
+      "learning_rate": 9.99983965258246e-05,
+      "loss": 2.589,
+      "step": 6200
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.15842053294181824,
+      "learning_rate": 9.99983192613454e-05,
+      "loss": 2.5892,
+      "step": 6300
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.17729991674423218,
+      "learning_rate": 9.999824017891965e-05,
+      "loss": 2.588,
+      "step": 6400
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.22472384572029114,
+      "learning_rate": 9.999815927855027e-05,
+      "loss": 2.5873,
+      "step": 6500
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.2445155680179596,
+      "learning_rate": 9.999807656024016e-05,
+      "loss": 2.5871,
+      "step": 6600
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.2309512048959732,
+      "learning_rate": 9.999799202399236e-05,
+      "loss": 2.5872,
+      "step": 6700
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.24274510145187378,
+      "learning_rate": 9.999790566980991e-05,
+      "loss": 2.5863,
+      "step": 6800
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.2553771436214447,
+      "learning_rate": 9.999781749769597e-05,
+      "loss": 2.5866,
+      "step": 6900
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.2770930230617523,
+      "learning_rate": 9.999772750765375e-05,
+      "loss": 2.5867,
+      "step": 7000
+    },
+    {
+      "epoch": 0.05,
+      "eval_loss": 2.5862584114074707,
+      "eval_runtime": 5428.0291,
+      "eval_samples_per_second": 1080.512,
+      "eval_steps_per_second": 67.532,
+      "step": 7000
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.20760175585746765,
+      "learning_rate": 9.999763569968652e-05,
+      "loss": 2.5864,
+      "step": 7100
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.195896178483963,
+      "learning_rate": 9.999754207379762e-05,
+      "loss": 2.5859,
+      "step": 7200
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.17725147306919098,
+      "learning_rate": 9.999744662999042e-05,
+      "loss": 2.5862,
+      "step": 7300
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.19556549191474915,
+      "learning_rate": 9.999734936826843e-05,
+      "loss": 2.5849,
+      "step": 7400
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.1898818463087082,
+      "learning_rate": 9.999725028863518e-05,
+      "loss": 2.5838,
+      "step": 7500
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.1693861335515976,
+      "learning_rate": 9.999714939109426e-05,
+      "loss": 2.5842,
+      "step": 7600
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.2052980363368988,
+      "learning_rate": 9.999704667564935e-05,
+      "loss": 2.5846,
+      "step": 7700
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.24053046107292175,
+      "learning_rate": 9.999694214230418e-05,
+      "loss": 2.5853,
+      "step": 7800
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.18848931789398193,
+      "learning_rate": 9.999683579106255e-05,
+      "loss": 2.5847,
+      "step": 7900
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.21658702194690704,
+      "learning_rate": 9.999672762192834e-05,
+      "loss": 2.5834,
+      "step": 8000
+    },
+    {
+      "epoch": 0.06,
+      "eval_loss": 2.5834951400756836,
+      "eval_runtime": 5812.5431,
+      "eval_samples_per_second": 1009.034,
+      "eval_steps_per_second": 63.065,
+      "step": 8000
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.2054041624069214,
+      "learning_rate": 9.999661763490544e-05,
+      "loss": 2.5846,
+      "step": 8100
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.19781599938869476,
+      "learning_rate": 9.99965058299979e-05,
+      "loss": 2.5828,
+      "step": 8200
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.25480276346206665,
+      "learning_rate": 9.999639220720978e-05,
+      "loss": 2.5824,
+      "step": 8300
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.24584674835205078,
+      "learning_rate": 9.999627676654517e-05,
+      "loss": 2.5815,
+      "step": 8400
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.21407808363437653,
+      "learning_rate": 9.999615950800832e-05,
+      "loss": 2.5816,
+      "step": 8500
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.19315016269683838,
+      "learning_rate": 9.999604043160346e-05,
+      "loss": 2.582,
+      "step": 8600
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.18959668278694153,
+      "learning_rate": 9.999591953733491e-05,
+      "loss": 2.5823,
+      "step": 8700
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.2016109973192215,
+      "learning_rate": 9.99957968252071e-05,
+      "loss": 2.5817,
+      "step": 8800
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.1687597781419754,
+      "learning_rate": 9.999567229522448e-05,
+      "loss": 2.5808,
+      "step": 8900
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.17627790570259094,
+      "learning_rate": 9.999554594739155e-05,
+      "loss": 2.5809,
+      "step": 9000
+    },
+    {
+      "epoch": 0.07,
+      "eval_loss": 2.5812571048736572,
+      "eval_runtime": 5682.2226,
+      "eval_samples_per_second": 1032.176,
+      "eval_steps_per_second": 64.511,
+      "step": 9000
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.12426480650901794,
+      "learning_rate": 9.999541778171295e-05,
+      "loss": 2.5808,
+      "step": 9100
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.20493678748607635,
+      "learning_rate": 9.999528779819331e-05,
+      "loss": 2.582,
+      "step": 9200
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.2577107548713684,
+      "learning_rate": 9.999515599683736e-05,
+      "loss": 2.5821,
+      "step": 9300
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.17233897745609283,
+      "learning_rate": 9.999502237764991e-05,
+      "loss": 2.5805,
+      "step": 9400
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.17032714188098907,
+      "learning_rate": 9.99948869406358e-05,
+      "loss": 2.5797,
+      "step": 9500
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.21331337094306946,
+      "learning_rate": 9.999474968579994e-05,
+      "loss": 2.5807,
+      "step": 9600
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.1757407933473587,
+      "learning_rate": 9.999461061314734e-05,
+      "loss": 2.5796,
+      "step": 9700
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.20256169140338898,
+      "learning_rate": 9.999446972268308e-05,
+      "loss": 2.5805,
+      "step": 9800
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.16938987374305725,
+      "learning_rate": 9.999432701441223e-05,
+      "loss": 2.5793,
+      "step": 9900
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.15477782487869263,
+      "learning_rate": 9.999418248834002e-05,
+      "loss": 2.5797,
+      "step": 10000
+    },
+    {
+      "epoch": 0.07,
+      "eval_loss": 2.579577684402466,
+      "eval_runtime": 5716.7677,
+      "eval_samples_per_second": 1025.939,
+      "eval_steps_per_second": 64.121,
+      "step": 10000
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.1892194151878357,
+      "learning_rate": 9.99940361444717e-05,
+      "loss": 2.5807,
+      "step": 10100
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.1950315237045288,
+      "learning_rate": 9.999388798281258e-05,
+      "loss": 2.5803,
+      "step": 10200
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.182656928896904,
+      "learning_rate": 9.999373800336806e-05,
+      "loss": 2.579,
+      "step": 10300
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.2610529959201813,
+      "learning_rate": 9.999358620614357e-05,
+      "loss": 2.5798,
+      "step": 10400
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.1903856247663498,
+      "learning_rate": 9.999343259114464e-05,
+      "loss": 2.5782,
+      "step": 10500
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.2288079857826233,
+      "learning_rate": 9.999327715837687e-05,
+      "loss": 2.5798,
+      "step": 10600
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.22872629761695862,
+      "learning_rate": 9.99931199078459e-05,
+      "loss": 2.5777,
+      "step": 10700
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.22784186899662018,
+      "learning_rate": 9.999296083955744e-05,
+      "loss": 2.5783,
+      "step": 10800
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.1781127005815506,
+      "learning_rate": 9.999279995351729e-05,
+      "loss": 2.5784,
+      "step": 10900
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.19801098108291626,
+      "learning_rate": 9.99926372497313e-05,
+      "loss": 2.5786,
+      "step": 11000
+    },
+    {
+      "epoch": 0.08,
+      "eval_loss": 2.5776655673980713,
+      "eval_runtime": 5841.1118,
+      "eval_samples_per_second": 1004.099,
+      "eval_steps_per_second": 62.756,
+      "step": 11000
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.24839451909065247,
+      "learning_rate": 9.999247272820536e-05,
+      "loss": 2.5777,
+      "step": 11100
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.18496295809745789,
+      "learning_rate": 9.999230638894548e-05,
+      "loss": 2.5784,
+      "step": 11200
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.1987435519695282,
+      "learning_rate": 9.99921382319577e-05,
+      "loss": 2.5779,
+      "step": 11300
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.23174594342708588,
+      "learning_rate": 9.999196825724813e-05,
+      "loss": 2.5781,
+      "step": 11400
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.18109597265720367,
+      "learning_rate": 9.999179646482295e-05,
+      "loss": 2.5785,
+      "step": 11500
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.18909253180027008,
+      "learning_rate": 9.999162285468841e-05,
+      "loss": 2.5779,
+      "step": 11600
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.1274472177028656,
+      "learning_rate": 9.999144742685083e-05,
+      "loss": 2.5774,
+      "step": 11700
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.14917853474617004,
+      "learning_rate": 9.999127018131655e-05,
+      "loss": 2.5759,
+      "step": 11800
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.13930270075798035,
+      "learning_rate": 9.999109111809207e-05,
+      "loss": 2.5769,
+      "step": 11900
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.18673798441886902,
+      "learning_rate": 9.999091023718388e-05,
+      "loss": 2.5765,
+      "step": 12000
+    },
+    {
+      "epoch": 0.09,
+      "eval_loss": 2.576554775238037,
+      "eval_runtime": 5806.3953,
+      "eval_samples_per_second": 1010.102,
+      "eval_steps_per_second": 63.131,
+      "step": 12000
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.23524725437164307,
+      "learning_rate": 9.999072753859854e-05,
+      "loss": 2.577,
+      "step": 12100
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.23039047420024872,
+      "learning_rate": 9.99905430223427e-05,
+      "loss": 2.5777,
+      "step": 12200
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.17183104157447815,
+      "learning_rate": 9.99903566884231e-05,
+      "loss": 2.5757,
+      "step": 12300
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.22224171459674835,
+      "learning_rate": 9.999016853684646e-05,
+      "loss": 2.5775,
+      "step": 12400
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.2552145719528198,
+      "learning_rate": 9.998997856761967e-05,
+      "loss": 2.5753,
+      "step": 12500
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.14996840059757233,
+      "learning_rate": 9.998978678074961e-05,
+      "loss": 2.577,
+      "step": 12600
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.22422178089618683,
+      "learning_rate": 9.998959317624325e-05,
+      "loss": 2.576,
+      "step": 12700
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.15495595335960388,
+      "learning_rate": 9.998939775410767e-05,
+      "loss": 2.5773,
+      "step": 12800
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.21027402579784393,
+      "learning_rate": 9.998920051434992e-05,
+      "loss": 2.575,
+      "step": 12900
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.17667286098003387,
+      "learning_rate": 9.99890014569772e-05,
+      "loss": 2.5765,
+      "step": 13000
+    },
+    {
+      "epoch": 0.09,
+      "eval_loss": 2.574887275695801,
+      "eval_runtime": 5869.3511,
+      "eval_samples_per_second": 999.268,
+      "eval_steps_per_second": 62.454,
+      "step": 13000
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.21658362448215485,
+      "learning_rate": 9.998880058199675e-05,
+      "loss": 2.5741,
+      "step": 13100
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.20807647705078125,
+      "learning_rate": 9.998859788941588e-05,
+      "loss": 2.5756,
+      "step": 13200
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.1646522879600525,
+      "learning_rate": 9.998839337924195e-05,
+      "loss": 2.5756,
+      "step": 13300
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.166889950633049,
+      "learning_rate": 9.998818705148238e-05,
+      "loss": 2.5745,
+      "step": 13400
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.25859805941581726,
+      "learning_rate": 9.998797890614469e-05,
+      "loss": 2.5747,
+      "step": 13500
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.20016120374202728,
+      "learning_rate": 9.998776894323645e-05,
+      "loss": 2.5746,
+      "step": 13600
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.16718171536922455,
+      "learning_rate": 9.998755716276528e-05,
+      "loss": 2.574,
+      "step": 13700
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.20205742120742798,
+      "learning_rate": 9.998734356473892e-05,
+      "loss": 2.5754,
+      "step": 13800
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.14121076464653015,
+      "learning_rate": 9.998712814916508e-05,
+      "loss": 2.5746,
+      "step": 13900
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.21463043987751007,
+      "learning_rate": 9.998691091605163e-05,
+      "loss": 2.5748,
+      "step": 14000
+    },
+    {
+      "epoch": 0.1,
+      "eval_loss": 2.573892116546631,
+      "eval_runtime": 5790.9885,
+      "eval_samples_per_second": 1012.79,
+      "eval_steps_per_second": 63.299,
+      "step": 14000
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.21442490816116333,
+      "learning_rate": 9.998669406490967e-05,
+      "loss": 2.5752,
+      "step": 14100
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.1974353939294815,
+      "learning_rate": 9.998647321491594e-05,
+      "loss": 2.5733,
+      "step": 14200
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.17868639528751373,
+      "learning_rate": 9.99862505474064e-05,
+      "loss": 2.5733,
+      "step": 14300
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.18465746939182281,
+      "learning_rate": 9.998602606238913e-05,
+      "loss": 2.5727,
+      "step": 14400
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.22418785095214844,
+      "learning_rate": 9.998579975987234e-05,
+      "loss": 2.5739,
+      "step": 14500
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.20102180540561676,
+      "learning_rate": 9.998557163986423e-05,
+      "loss": 2.5726,
+      "step": 14600
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.1889944076538086,
+      "learning_rate": 9.998534170237307e-05,
+      "loss": 2.5719,
+      "step": 14700
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.1904602348804474,
+      "learning_rate": 9.998510994740727e-05,
+      "loss": 2.5719,
+      "step": 14800
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.20775534212589264,
+      "learning_rate": 9.998487637497522e-05,
+      "loss": 2.5725,
+      "step": 14900
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.203061044216156,
+      "learning_rate": 9.998464334798072e-05,
+      "loss": 2.5733,
+      "step": 15000
+    },
+    {
+      "epoch": 0.11,
+      "eval_loss": 2.5725255012512207,
+      "eval_runtime": 5867.3015,
+      "eval_samples_per_second": 999.617,
+      "eval_steps_per_second": 62.476,
+      "step": 15000
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.1729467511177063,
+      "learning_rate": 9.998440615881618e-05,
+      "loss": 2.5729,
+      "step": 15100
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.1721724420785904,
+      "learning_rate": 9.998416715221101e-05,
+      "loss": 2.5719,
+      "step": 15200
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.170950248837471,
+      "learning_rate": 9.998392632817387e-05,
+      "loss": 2.5725,
+      "step": 15300
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.1992483288049698,
+      "learning_rate": 9.998368368671353e-05,
+      "loss": 2.573,
+      "step": 15400
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.19748467206954956,
+      "learning_rate": 9.998343922783881e-05,
+      "loss": 2.5719,
+      "step": 15500
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.22308781743049622,
+      "learning_rate": 9.99831929515586e-05,
+      "loss": 2.5712,
+      "step": 15600
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.17296306788921356,
+      "learning_rate": 9.998294485788187e-05,
+      "loss": 2.5709,
+      "step": 15700
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.2266322374343872,
+      "learning_rate": 9.99826949468176e-05,
+      "loss": 2.5716,
+      "step": 15800
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.1890050321817398,
+      "learning_rate": 9.998244321837492e-05,
+      "loss": 2.5714,
+      "step": 15900
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.14952068030834198,
+      "learning_rate": 9.998218967256294e-05,
+      "loss": 2.5724,
+      "step": 16000
+    },
+    {
+      "epoch": 0.12,
+      "eval_loss": 2.570789098739624,
+      "eval_runtime": 5730.232,
+      "eval_samples_per_second": 1023.528,
+      "eval_steps_per_second": 63.971,
+      "step": 16000
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.20020237565040588,
+      "learning_rate": 9.998193430939093e-05,
+      "loss": 2.5712,
+      "step": 16100
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.2040109634399414,
+      "learning_rate": 9.998167712886813e-05,
+      "loss": 2.5722,
+      "step": 16200
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.14661578834056854,
+      "learning_rate": 9.998141813100392e-05,
+      "loss": 2.5713,
+      "step": 16300
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.17079943418502808,
+      "learning_rate": 9.998115731580771e-05,
+      "loss": 2.5718,
+      "step": 16400
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.17369835078716278,
+      "learning_rate": 9.998089468328898e-05,
+      "loss": 2.5721,
+      "step": 16500
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.16338428854942322,
+      "learning_rate": 9.998063023345725e-05,
+      "loss": 2.5718,
+      "step": 16600
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.16794905066490173,
+      "learning_rate": 9.99803639663222e-05,
+      "loss": 2.5708,
+      "step": 16700
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.20138411223888397,
+      "learning_rate": 9.998009588189345e-05,
+      "loss": 2.5696,
+      "step": 16800
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.23276585340499878,
+      "learning_rate": 9.99798259801808e-05,
+      "loss": 2.5691,
+      "step": 16900
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.1607973575592041,
+      "learning_rate": 9.997955426119402e-05,
+      "loss": 2.5714,
+      "step": 17000
+    },
+    {
+      "epoch": 0.12,
+      "eval_loss": 2.569423198699951,
+      "eval_runtime": 5840.5846,
+      "eval_samples_per_second": 1004.189,
+      "eval_steps_per_second": 62.762,
+      "step": 17000
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.1984170377254486,
+      "learning_rate": 9.997928072494302e-05,
+      "loss": 2.5703,
+      "step": 17100
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.22356334328651428,
+      "learning_rate": 9.997900537143772e-05,
+      "loss": 2.5711,
+      "step": 17200
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.1815560907125473,
+      "learning_rate": 9.997873098139096e-05,
+      "loss": 2.5697,
+      "step": 17300
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.27280667424201965,
+      "learning_rate": 9.997845201157949e-05,
+      "loss": 2.5697,
+      "step": 17400
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.15276475250720978,
+      "learning_rate": 9.997817122454387e-05,
+      "loss": 2.5675,
+      "step": 17500
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.19267494976520538,
+      "learning_rate": 9.997788862029429e-05,
+      "loss": 2.569,
+      "step": 17600
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.1580013483762741,
+      "learning_rate": 9.997760419884105e-05,
+      "loss": 2.5704,
+      "step": 17700
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.21980580687522888,
+      "learning_rate": 9.997731796019448e-05,
+      "loss": 2.5702,
+      "step": 17800
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.2392682433128357,
+      "learning_rate": 9.997702990436498e-05,
+      "loss": 2.5696,
+      "step": 17900
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.2011999785900116,
+      "learning_rate": 9.997674003136303e-05,
+      "loss": 2.5682,
+      "step": 18000
+    },
+    {
+      "epoch": 0.13,
+      "eval_loss": 2.5681583881378174,
+      "eval_runtime": 5608.808,
+      "eval_samples_per_second": 1045.686,
+      "eval_steps_per_second": 65.355,
+      "step": 18000
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.23138105869293213,
+      "learning_rate": 9.997644834119919e-05,
+      "loss": 2.5685,
+      "step": 18100
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.19521869719028473,
+      "learning_rate": 9.997615483388406e-05,
+      "loss": 2.5685,
+      "step": 18200
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.22132566571235657,
+      "learning_rate": 9.997585950942827e-05,
+      "loss": 2.5687,
+      "step": 18300
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.19450217485427856,
+      "learning_rate": 9.99755623678426e-05,
+      "loss": 2.5666,
+      "step": 18400
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.20810718834400177,
+      "learning_rate": 9.997526340913785e-05,
+      "loss": 2.5681,
+      "step": 18500
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.1711287647485733,
+      "learning_rate": 9.997496263332487e-05,
+      "loss": 2.5666,
+      "step": 18600
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.15737326443195343,
+      "learning_rate": 9.997466004041462e-05,
+      "loss": 2.5663,
+      "step": 18700
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.19085867702960968,
+      "learning_rate": 9.997435563041809e-05,
+      "loss": 2.569,
+      "step": 18800
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.20687063038349152,
+      "learning_rate": 9.997404940334637e-05,
+      "loss": 2.5654,
+      "step": 18900
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.21126438677310944,
+      "learning_rate": 9.997374135921054e-05,
+      "loss": 2.5677,
+      "step": 19000
+    },
+    {
+      "epoch": 0.14,
+      "eval_loss": 2.5664379596710205,
+      "eval_runtime": 5437.5739,
+      "eval_samples_per_second": 1078.616,
+      "eval_steps_per_second": 67.414,
+      "step": 19000
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.1988440304994583,
+      "learning_rate": 9.997343149802186e-05,
+      "loss": 2.5667,
+      "step": 19100
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.2025243192911148,
+      "learning_rate": 9.997311981979155e-05,
+      "loss": 2.5674,
+      "step": 19200
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.15592768788337708,
+      "learning_rate": 9.997280632453097e-05,
+      "loss": 2.5679,
+      "step": 19300
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.18881608545780182,
+      "learning_rate": 9.997249101225153e-05,
+      "loss": 2.5664,
+      "step": 19400
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.2082224190235138,
+      "learning_rate": 9.997217706325169e-05,
+      "loss": 2.5653,
+      "step": 19500
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.20503577589988708,
+      "learning_rate": 9.997185813513884e-05,
+      "loss": 2.5675,
+      "step": 19600
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.157631978392601,
+      "learning_rate": 9.997153739004159e-05,
+      "loss": 2.5655,
+      "step": 19700
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.1750982105731964,
+      "learning_rate": 9.997121482797162e-05,
+      "loss": 2.566,
+      "step": 19800
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.2016943246126175,
+      "learning_rate": 9.997089044894064e-05,
+      "loss": 2.564,
+      "step": 19900
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.23606260120868683,
+      "learning_rate": 9.997056425296043e-05,
+      "loss": 2.5643,
+      "step": 20000
+    },
+    {
+      "epoch": 0.15,
+      "eval_loss": 2.564971446990967,
+      "eval_runtime": 5311.9496,
+      "eval_samples_per_second": 1104.124,
+      "eval_steps_per_second": 69.008,
+      "step": 20000
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.21572504937648773,
+      "learning_rate": 9.997023624004287e-05,
+      "loss": 2.5655,
+      "step": 20100
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.1901276409626007,
+      "learning_rate": 9.996990641019987e-05,
+      "loss": 2.5646,
+      "step": 20200
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.1661011129617691,
+      "learning_rate": 9.996957476344345e-05,
+      "loss": 2.5656,
+      "step": 20300
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.1857927441596985,
+      "learning_rate": 9.996924129978566e-05,
+      "loss": 2.5641,
+      "step": 20400
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.17554591596126556,
+      "learning_rate": 9.996890601923861e-05,
+      "loss": 2.565,
+      "step": 20500
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.19168546795845032,
+      "learning_rate": 9.99685689218145e-05,
+      "loss": 2.5634,
+      "step": 20600
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.24985399842262268,
+      "learning_rate": 9.996823000752557e-05,
+      "loss": 2.5645,
+      "step": 20700
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.17692847549915314,
+      "learning_rate": 9.996788927638418e-05,
+      "loss": 2.5623,
+      "step": 20800
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.19862769544124603,
+      "learning_rate": 9.996754672840269e-05,
+      "loss": 2.5637,
+      "step": 20900
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.15231578052043915,
+      "learning_rate": 9.996720236359356e-05,
+      "loss": 2.5629,
+      "step": 21000
+    },
+    {
+      "epoch": 0.15,
+      "eval_loss": 2.5632758140563965,
+      "eval_runtime": 5223.2614,
+      "eval_samples_per_second": 1122.872,
+      "eval_steps_per_second": 70.18,
+      "step": 21000
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.15703310072422028,
+      "learning_rate": 9.996685618196933e-05,
+      "loss": 2.5647,
+      "step": 21100
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.21033476293087006,
+      "learning_rate": 9.996650818354254e-05,
+      "loss": 2.5616,
+      "step": 21200
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.17880433797836304,
+      "learning_rate": 9.99661583683259e-05,
+      "loss": 2.5632,
+      "step": 21300
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.17895223200321198,
+      "learning_rate": 9.996580673633208e-05,
+      "loss": 2.5612,
+      "step": 21400
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.1816408783197403,
+      "learning_rate": 9.996545683105445e-05,
+      "loss": 2.5627,
+      "step": 21500
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.17461837828159332,
+      "learning_rate": 9.996510158371221e-05,
+      "loss": 2.5622,
+      "step": 21600
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.22741451859474182,
+      "learning_rate": 9.996474451963123e-05,
+      "loss": 2.5626,
+      "step": 21700
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.2556922733783722,
+      "learning_rate": 9.996438563882451e-05,
+      "loss": 2.5627,
+      "step": 21800
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.18055401742458344,
+      "learning_rate": 9.99640249413051e-05,
+      "loss": 2.5623,
+      "step": 21900
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.1370435506105423,
+      "learning_rate": 9.996366242708609e-05,
+      "loss": 2.5629,
+      "step": 22000
+    },
+    {
+      "epoch": 0.16,
+      "eval_loss": 2.5613725185394287,
+      "eval_runtime": 5233.9822,
+      "eval_samples_per_second": 1120.572,
+      "eval_steps_per_second": 70.036,
+      "step": 22000
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.150621235370636,
+      "learning_rate": 9.996329809618068e-05,
+      "loss": 2.5595,
+      "step": 22100
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.23477642238140106,
+      "learning_rate": 9.996293194860211e-05,
+      "loss": 2.561,
+      "step": 22200
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.20428013801574707,
+      "learning_rate": 9.996256398436372e-05,
+      "loss": 2.5589,
+      "step": 22300
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.18161137402057648,
+      "learning_rate": 9.996219420347885e-05,
+      "loss": 2.5618,
+      "step": 22400
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.18925230205059052,
+      "learning_rate": 9.996182260596096e-05,
+      "loss": 2.5579,
+      "step": 22500
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.20608845353126526,
+      "learning_rate": 9.996144919182355e-05,
+      "loss": 2.5592,
+      "step": 22600
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.19250337779521942,
+      "learning_rate": 9.996107396108024e-05,
+      "loss": 2.5596,
+      "step": 22700
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.2164393663406372,
+      "learning_rate": 9.996069691374462e-05,
+      "loss": 2.5596,
+      "step": 22800
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.16476331651210785,
+      "learning_rate": 9.996031804983043e-05,
+      "loss": 2.5606,
+      "step": 22900
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.15099835395812988,
+      "learning_rate": 9.995993736935145e-05,
+      "loss": 2.5612,
+      "step": 23000
+    },
+    {
+      "epoch": 0.17,
+      "eval_loss": 2.5598702430725098,
+      "eval_runtime": 6082.4008,
+      "eval_samples_per_second": 964.266,
+      "eval_steps_per_second": 60.267,
+      "step": 23000
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.1863376796245575,
+      "learning_rate": 9.99595548723215e-05,
+      "loss": 2.5593,
+      "step": 23100
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.19632737338542938,
+      "learning_rate": 9.99591705587545e-05,
+      "loss": 2.5594,
+      "step": 23200
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.18330197036266327,
+      "learning_rate": 9.995878442866442e-05,
+      "loss": 2.5598,
+      "step": 23300
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.23508641123771667,
+      "learning_rate": 9.995839648206531e-05,
+      "loss": 2.5579,
+      "step": 23400
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.2019587904214859,
+      "learning_rate": 9.99580106255938e-05,
+      "loss": 2.5584,
+      "step": 23500
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.20707851648330688,
+      "learning_rate": 9.995761906418372e-05,
+      "loss": 2.5589,
+      "step": 23600
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.16306054592132568,
+      "learning_rate": 9.995722568630698e-05,
+      "loss": 2.5576,
+      "step": 23700
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.21417422592639923,
+      "learning_rate": 9.995683049197788e-05,
+      "loss": 2.5574,
+      "step": 23800
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.21078960597515106,
+      "learning_rate": 9.99564334812108e-05,
+      "loss": 2.5588,
+      "step": 23900
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.17299005389213562,
+      "learning_rate": 9.995603465402014e-05,
+      "loss": 2.5567,
+      "step": 24000
+    },
+    {
+      "epoch": 0.17,
+      "eval_loss": 2.5584349632263184,
+      "eval_runtime": 5391.8597,
+      "eval_samples_per_second": 1087.761,
+      "eval_steps_per_second": 67.985,
+      "step": 24000
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.1550675630569458,
+      "learning_rate": 9.995563401042043e-05,
+      "loss": 2.5576,
+      "step": 24100
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.21063561737537384,
+      "learning_rate": 9.995523155042623e-05,
+      "loss": 2.5585,
+      "step": 24200
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.19711926579475403,
+      "learning_rate": 9.995482727405219e-05,
+      "loss": 2.5589,
+      "step": 24300
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.19466377794742584,
+      "learning_rate": 9.995442118131297e-05,
+      "loss": 2.5579,
+      "step": 24400
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.1623954474925995,
+      "learning_rate": 9.995401327222338e-05,
+      "loss": 2.5577,
+      "step": 24500
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.18352490663528442,
+      "learning_rate": 9.995360354679822e-05,
+      "loss": 2.5583,
+      "step": 24600
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.17624753713607788,
+      "learning_rate": 9.99531920050524e-05,
+      "loss": 2.5577,
+      "step": 24700
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.15336942672729492,
+      "learning_rate": 9.995277864700089e-05,
+      "loss": 2.5578,
+      "step": 24800
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.17434526979923248,
+      "learning_rate": 9.99523634726587e-05,
+      "loss": 2.5569,
+      "step": 24900
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.21592725813388824,
+      "learning_rate": 9.995194648204095e-05,
+      "loss": 2.5543,
+      "step": 25000
+    },
+    {
+      "epoch": 0.18,
+      "eval_loss": 2.5564043521881104,
+      "eval_runtime": 5516.1995,
+      "eval_samples_per_second": 1063.242,
+      "eval_steps_per_second": 66.453,
+      "step": 25000
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.15827420353889465,
+      "learning_rate": 9.99515276751628e-05,
+      "loss": 2.5554,
+      "step": 25100
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.17489619553089142,
+      "learning_rate": 9.995110705203945e-05,
+      "loss": 2.5566,
+      "step": 25200
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.18893638253211975,
+      "learning_rate": 9.995068461268622e-05,
+      "loss": 2.5561,
+      "step": 25300
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.19673244655132294,
+      "learning_rate": 9.995026035711845e-05,
+      "loss": 2.5569,
+      "step": 25400
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.18465718626976013,
+      "learning_rate": 9.994983855505939e-05,
+      "loss": 2.5554,
+      "step": 25500
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.1976306438446045,
+      "learning_rate": 9.994941068527068e-05,
+      "loss": 2.5561,
+      "step": 25600
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.1467328816652298,
+      "learning_rate": 9.994898099931376e-05,
+      "loss": 2.5542,
+      "step": 25700
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.20342576503753662,
+      "learning_rate": 9.994854949720426e-05,
+      "loss": 2.555,
+      "step": 25800
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.20438461005687714,
+      "learning_rate": 9.994811617895786e-05,
+      "loss": 2.5553,
+      "step": 25900
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.16016176342964172,
+      "learning_rate": 9.994768104459032e-05,
+      "loss": 2.5568,
+      "step": 26000
+    },
+    {
+      "epoch": 0.19,
+      "eval_loss": 2.554506778717041,
+      "eval_runtime": 5432.0955,
+      "eval_samples_per_second": 1079.704,
+      "eval_steps_per_second": 67.482,
+      "step": 26000
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.14238539338111877,
+      "learning_rate": 9.994724409411746e-05,
+      "loss": 2.5552,
+      "step": 26100
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.23573274910449982,
+      "learning_rate": 9.994680532755518e-05,
+      "loss": 2.5523,
+      "step": 26200
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.19863677024841309,
+      "learning_rate": 9.994636474491942e-05,
+      "loss": 2.5529,
+      "step": 26300
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.21995621919631958,
+      "learning_rate": 9.994592234622619e-05,
+      "loss": 2.5538,
+      "step": 26400
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.22063715755939484,
+      "learning_rate": 9.99454781314916e-05,
+      "loss": 2.5529,
+      "step": 26500
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.1817554086446762,
+      "learning_rate": 9.99450321007318e-05,
+      "loss": 2.5542,
+      "step": 26600
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.16820061206817627,
+      "learning_rate": 9.994458425396298e-05,
+      "loss": 2.5532,
+      "step": 26700
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.22539284825325012,
+      "learning_rate": 9.994413459120146e-05,
+      "loss": 2.5546,
+      "step": 26800
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.17431499063968658,
+      "learning_rate": 9.994368311246356e-05,
+      "loss": 2.5535,
+      "step": 26900
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.21042422950267792,
+      "learning_rate": 9.99432298177657e-05,
+      "loss": 2.554,
+      "step": 27000
+    },
+    {
+      "epoch": 0.2,
+      "eval_loss": 2.5529098510742188,
+      "eval_runtime": 7939.2552,
+      "eval_samples_per_second": 738.741,
+      "eval_steps_per_second": 46.171,
+      "step": 27000
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.18547575175762177,
+      "learning_rate": 9.994277470712439e-05,
+      "loss": 2.5533,
+      "step": 27100
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.17700788378715515,
+      "learning_rate": 9.994231778055614e-05,
+      "loss": 2.553,
+      "step": 27200
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.2162560373544693,
+      "learning_rate": 9.99418590380776e-05,
+      "loss": 2.5531,
+      "step": 27300
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.17806923389434814,
+      "learning_rate": 9.99413984797054e-05,
+      "loss": 2.5508,
+      "step": 27400
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.1959153264760971,
+      "learning_rate": 9.994094073818738e-05,
+      "loss": 2.5521,
+      "step": 27500
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.2098911553621292,
+      "learning_rate": 9.994047656623675e-05,
+      "loss": 2.5518,
+      "step": 27600
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.17420220375061035,
+      "learning_rate": 9.994001057844278e-05,
+      "loss": 2.5508,
+      "step": 27700
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.1538473218679428,
+      "learning_rate": 9.993954277482238e-05,
+      "loss": 2.5506,
+      "step": 27800
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.1710011512041092,
+      "learning_rate": 9.993907315539257e-05,
+      "loss": 2.5509,
+      "step": 27900
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.14763277769088745,
+      "learning_rate": 9.993860172017044e-05,
+      "loss": 2.5514,
+      "step": 28000
+    },
+    {
+      "epoch": 0.2,
+      "eval_loss": 2.5512099266052246,
+      "eval_runtime": 9244.95,
+      "eval_samples_per_second": 634.406,
+      "eval_steps_per_second": 39.65,
+      "step": 28000
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.24788980185985565,
+      "learning_rate": 9.99381284691731e-05,
+      "loss": 2.5507,
+      "step": 28100
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.16047827899456024,
+      "learning_rate": 9.993765340241779e-05,
+      "loss": 2.5504,
+      "step": 28200
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.2097485363483429,
+      "learning_rate": 9.993717651992176e-05,
+      "loss": 2.5496,
+      "step": 28300
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.15989455580711365,
+      "learning_rate": 9.993669782170236e-05,
+      "loss": 2.5511,
+      "step": 28400
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.1990625262260437,
+      "learning_rate": 9.9936217307777e-05,
+      "loss": 2.5491,
+      "step": 28500
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.17394055426120758,
+      "learning_rate": 9.993573497816314e-05,
+      "loss": 2.5503,
+      "step": 28600
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.17688289284706116,
+      "learning_rate": 9.993525083287832e-05,
+      "loss": 2.5487,
+      "step": 28700
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.20647858083248138,
+      "learning_rate": 9.993476487194015e-05,
+      "loss": 2.5502,
+      "step": 28800
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.17351941764354706,
+      "learning_rate": 9.99342770953663e-05,
+      "loss": 2.5503,
+      "step": 28900
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.16259269416332245,
+      "learning_rate": 9.993379240808367e-05,
+      "loss": 2.5505,
+      "step": 29000
+    },
+    {
+      "epoch": 0.21,
+      "eval_loss": 2.5498733520507812,
+      "eval_runtime": 8094.52,
+      "eval_samples_per_second": 724.571,
+      "eval_steps_per_second": 45.286,
+      "step": 29000
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.152243971824646,
+      "learning_rate": 9.993330101844764e-05,
+      "loss": 2.5494,
+      "step": 29100
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.21322308480739594,
+      "learning_rate": 9.993280781322914e-05,
+      "loss": 2.5495,
+      "step": 29200
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.17743776738643646,
+      "learning_rate": 9.993231279244612e-05,
+      "loss": 2.55,
+      "step": 29300
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.19075080752372742,
+      "learning_rate": 9.993181595611659e-05,
+      "loss": 2.5477,
+      "step": 29400
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.14956378936767578,
+      "learning_rate": 9.993131730425858e-05,
+      "loss": 2.5499,
+      "step": 29500
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.1913817673921585,
+      "learning_rate": 9.993081683689026e-05,
+      "loss": 2.5499,
+      "step": 29600
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.1898573786020279,
+      "learning_rate": 9.99303145540298e-05,
+      "loss": 2.5478,
+      "step": 29700
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.18223215639591217,
+      "learning_rate": 9.992981045569545e-05,
+      "loss": 2.5497,
+      "step": 29800
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.17807921767234802,
+      "learning_rate": 9.992930454190558e-05,
+      "loss": 2.5482,
+      "step": 29900
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.19245785474777222,
+      "learning_rate": 9.992879681267855e-05,
+      "loss": 2.5477,
+      "step": 30000
+    },
+    {
+      "epoch": 0.22,
+      "eval_loss": 2.5480847358703613,
+      "eval_runtime": 5418.7874,
+      "eval_samples_per_second": 1082.355,
+      "eval_steps_per_second": 67.647,
+      "step": 30000
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.16644108295440674,
+      "learning_rate": 9.992828726803284e-05,
+      "loss": 2.5462,
+      "step": 30100
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.21199798583984375,
+      "learning_rate": 9.992777590798698e-05,
+      "loss": 2.5472,
+      "step": 30200
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.16835100948810577,
+      "learning_rate": 9.992726273255957e-05,
+      "loss": 2.5492,
+      "step": 30300
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.24301642179489136,
+      "learning_rate": 9.992674774176924e-05,
+      "loss": 2.547,
+      "step": 30400
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.1855507344007492,
+      "learning_rate": 9.992623093563473e-05,
+      "loss": 2.5482,
+      "step": 30500
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.20825399458408356,
+      "learning_rate": 9.992571231417482e-05,
+      "loss": 2.5472,
+      "step": 30600
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.18983405828475952,
+      "learning_rate": 9.99251918774084e-05,
+      "loss": 2.5456,
+      "step": 30700
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.19868837296962738,
+      "learning_rate": 9.992466962535437e-05,
+      "loss": 2.5472,
+      "step": 30800
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.217344731092453,
+      "learning_rate": 9.99241455580317e-05,
+      "loss": 2.5468,
+      "step": 30900
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.21630564332008362,
+      "learning_rate": 9.992361967545949e-05,
+      "loss": 2.5461,
+      "step": 31000
+    },
+    {
+      "epoch": 0.23,
+      "eval_loss": 2.5460619926452637,
+      "eval_runtime": 5334.7258,
+      "eval_samples_per_second": 1099.41,
+      "eval_steps_per_second": 68.713,
+      "step": 31000
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.20726899802684784,
+      "learning_rate": 9.992309197765681e-05,
+      "loss": 2.5479,
+      "step": 31100
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.20710667967796326,
+      "learning_rate": 9.99225624646429e-05,
+      "loss": 2.5472,
+      "step": 31200
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.23984268307685852,
+      "learning_rate": 9.992203113643699e-05,
+      "loss": 2.5449,
+      "step": 31300
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.1859433650970459,
+      "learning_rate": 9.992149799305838e-05,
+      "loss": 2.5456,
+      "step": 31400
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.19020648300647736,
+      "learning_rate": 9.992096303452647e-05,
+      "loss": 2.5472,
+      "step": 31500
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.18619538843631744,
+      "learning_rate": 9.992042626086073e-05,
+      "loss": 2.5446,
+      "step": 31600
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.15984103083610535,
+      "learning_rate": 9.991988767208065e-05,
+      "loss": 2.5429,
+      "step": 31700
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.1874154657125473,
+      "learning_rate": 9.991934726820583e-05,
+      "loss": 2.5457,
+      "step": 31800
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.21573562920093536,
+      "learning_rate": 9.99188050492559e-05,
+      "loss": 2.5451,
+      "step": 31900
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.20766520500183105,
+      "learning_rate": 9.991826101525059e-05,
+      "loss": 2.5457,
+      "step": 32000
+    },
+    {
+      "epoch": 0.23,
+      "eval_loss": 2.5444440841674805,
+      "eval_runtime": 5469.9575,
+      "eval_samples_per_second": 1072.23,
+      "eval_steps_per_second": 67.014,
+      "step": 32000
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.1947011798620224,
+      "learning_rate": 9.991771516620969e-05,
+      "loss": 2.5439,
+      "step": 32100
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.20974138379096985,
+      "learning_rate": 9.991716750215303e-05,
+      "loss": 2.5455,
+      "step": 32200
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.20095574855804443,
+      "learning_rate": 9.991661802310053e-05,
+      "loss": 2.544,
+      "step": 32300
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.15232567489147186,
+      "learning_rate": 9.991606672907218e-05,
+      "loss": 2.5447,
+      "step": 32400
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.1890016496181488,
+      "learning_rate": 9.99155191601618e-05,
+      "loss": 2.5438,
+      "step": 32500
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.15099911391735077,
+      "learning_rate": 9.991496425439117e-05,
+      "loss": 2.5426,
+      "step": 32600
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.19564999639987946,
+      "learning_rate": 9.991440753370483e-05,
+      "loss": 2.5433,
+      "step": 32700
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.23213982582092285,
+      "learning_rate": 9.991384899812299e-05,
+      "loss": 2.5451,
+      "step": 32800
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.21088984608650208,
+      "learning_rate": 9.991328864766595e-05,
+      "loss": 2.5423,
+      "step": 32900
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.2080707848072052,
+      "learning_rate": 9.991272648235412e-05,
+      "loss": 2.542,
+      "step": 33000
+    },
+    {
+      "epoch": 0.24,
+      "eval_loss": 2.543639898300171,
+      "eval_runtime": 5500.5758,
+      "eval_samples_per_second": 1066.262,
+      "eval_steps_per_second": 66.641,
+      "step": 33000
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.20043937861919403,
+      "learning_rate": 9.991216250220794e-05,
+      "loss": 2.5434,
+      "step": 33100
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.26814088225364685,
+      "learning_rate": 9.991159670724789e-05,
+      "loss": 2.5439,
+      "step": 33200
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.1790783852338791,
+      "learning_rate": 9.991102909749455e-05,
+      "loss": 2.5422,
+      "step": 33300
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.17718929052352905,
+      "learning_rate": 9.991045967296856e-05,
+      "loss": 2.5414,
+      "step": 33400
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.19124653935432434,
+      "learning_rate": 9.990988843369065e-05,
+      "loss": 2.5409,
+      "step": 33500
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.19628183543682098,
+      "learning_rate": 9.990931537968155e-05,
+      "loss": 2.5423,
+      "step": 33600
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.2127145528793335,
+      "learning_rate": 9.990874051096211e-05,
+      "loss": 2.5451,
+      "step": 33700
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.19408264756202698,
+      "learning_rate": 9.990816382755324e-05,
+      "loss": 2.5405,
+      "step": 33800
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.1892174333333969,
+      "learning_rate": 9.99075853294759e-05,
+      "loss": 2.5428,
+      "step": 33900
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.20231932401657104,
+      "learning_rate": 9.990700501675114e-05,
+      "loss": 2.5424,
+      "step": 34000
+    },
+    {
+      "epoch": 0.25,
+      "eval_loss": 2.5417420864105225,
+      "eval_runtime": 5451.1377,
+      "eval_samples_per_second": 1075.932,
+      "eval_steps_per_second": 67.246,
+      "step": 34000
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.17818154394626617,
+      "learning_rate": 9.990642288940005e-05,
+      "loss": 2.5406,
+      "step": 34100
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.20383848249912262,
+      "learning_rate": 9.990583894744378e-05,
+      "loss": 2.5414,
+      "step": 34200
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.19464430212974548,
+      "learning_rate": 9.99052531909036e-05,
+      "loss": 2.5413,
+      "step": 34300
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.17793488502502441,
+      "learning_rate": 9.990466561980076e-05,
+      "loss": 2.5421,
+      "step": 34400
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.2119537591934204,
+      "learning_rate": 9.990407623415668e-05,
+      "loss": 2.54,
+      "step": 34500
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.20085354149341583,
+      "learning_rate": 9.990348503399273e-05,
+      "loss": 2.5395,
+      "step": 34600
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.18408524990081787,
+      "learning_rate": 9.990289201933045e-05,
+      "loss": 2.5395,
+      "step": 34700
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.18326416611671448,
+      "learning_rate": 9.990229719019137e-05,
+      "loss": 2.5404,
+      "step": 34800
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.2630787193775177,
+      "learning_rate": 9.990170054659715e-05,
+      "loss": 2.5404,
+      "step": 34900
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.23082153499126434,
+      "learning_rate": 9.990110208856948e-05,
+      "loss": 2.5402,
+      "step": 35000
+    },
+    {
+      "epoch": 0.25,
+      "eval_loss": 2.540555477142334,
+      "eval_runtime": 5286.1621,
+      "eval_samples_per_second": 1109.511,
+      "eval_steps_per_second": 69.344,
+      "step": 35000
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.2101873904466629,
+      "learning_rate": 9.990050181613012e-05,
+      "loss": 2.5409,
+      "step": 35100
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.1748313456773758,
+      "learning_rate": 9.989989972930086e-05,
+      "loss": 2.5408,
+      "step": 35200
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.21641358733177185,
+      "learning_rate": 9.989929582810362e-05,
+      "loss": 2.5407,
+      "step": 35300
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.20202140510082245,
+      "learning_rate": 9.989869011256037e-05,
+      "loss": 2.5419,
+      "step": 35400
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.1987978219985962,
+      "learning_rate": 9.989808258269311e-05,
+      "loss": 2.54,
+      "step": 35500
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.17482729256153107,
+      "learning_rate": 9.989747323852394e-05,
+      "loss": 2.5398,
+      "step": 35600
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.15725675225257874,
+      "learning_rate": 9.989686208007502e-05,
+      "loss": 2.5384,
+      "step": 35700
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": Infinity,
+      "learning_rate": 9.989625524607613e-05,
+      "loss": 2.54,
+      "step": 35800
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.19513057172298431,
+      "learning_rate": 9.989564047727667e-05,
+      "loss": 2.541,
+      "step": 35900
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.24705880880355835,
+      "learning_rate": 9.989502389426411e-05,
+      "loss": 2.5398,
+      "step": 36000
+    },
+    {
+      "epoch": 0.26,
+      "eval_loss": 2.5393338203430176,
+      "eval_runtime": 5370.7521,
+      "eval_samples_per_second": 1092.036,
+      "eval_steps_per_second": 68.252,
+      "step": 36000
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.17639793455600739,
+      "learning_rate": 9.989440549706085e-05,
+      "loss": 2.5398,
+      "step": 36100
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.23090311884880066,
+      "learning_rate": 9.989378528568935e-05,
+      "loss": 2.5399,
+      "step": 36200
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.21377325057983398,
+      "learning_rate": 9.989316326017221e-05,
+      "loss": 2.5351,
+      "step": 36300
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.21422795951366425,
+      "learning_rate": 9.989253942053204e-05,
+      "loss": 2.5381,
+      "step": 36400
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.20974934101104736,
+      "learning_rate": 9.98919137667915e-05,
+      "loss": 2.5378,
+      "step": 36500
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.19352389872074127,
+      "learning_rate": 9.989128629897335e-05,
+      "loss": 2.5378,
+      "step": 36600
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.21016819775104523,
+      "learning_rate": 9.989065701710041e-05,
+      "loss": 2.5366,
+      "step": 36700
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.1750701367855072,
+      "learning_rate": 9.989002592119554e-05,
+      "loss": 2.5399,
+      "step": 36800
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.18955004215240479,
+      "learning_rate": 9.988939301128171e-05,
+      "loss": 2.5411,
+      "step": 36900
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.22183337807655334,
+      "learning_rate": 9.988875828738192e-05,
+      "loss": 2.5385,
+      "step": 37000
+    },
+    {
+      "epoch": 0.27,
+      "eval_loss": 2.537937879562378,
+      "eval_runtime": 5267.1428,
+      "eval_samples_per_second": 1113.517,
+      "eval_steps_per_second": 69.595,
+      "step": 37000
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.17703290283679962,
+      "learning_rate": 9.988812174951926e-05,
+      "loss": 2.5386,
+      "step": 37100
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.17456910014152527,
+      "learning_rate": 9.988748339771686e-05,
+      "loss": 2.536,
+      "step": 37200
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.1729470044374466,
+      "learning_rate": 9.988684323199795e-05,
+      "loss": 2.5367,
+      "step": 37300
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.14801037311553955,
+      "learning_rate": 9.988620125238578e-05,
+      "loss": 2.5365,
+      "step": 37400
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.19737888872623444,
+      "learning_rate": 9.988555745890371e-05,
+      "loss": 2.5373,
+      "step": 37500
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.2556416392326355,
+      "learning_rate": 9.988491185157514e-05,
+      "loss": 2.5389,
+      "step": 37600
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.22999688982963562,
+      "learning_rate": 9.988426443042357e-05,
+      "loss": 2.5366,
+      "step": 37700
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.20399746298789978,
+      "learning_rate": 9.988361519547252e-05,
+      "loss": 2.5375,
+      "step": 37800
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.1886683702468872,
+      "learning_rate": 9.988296414674556e-05,
+      "loss": 2.5378,
+      "step": 37900
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.15870767831802368,
+      "learning_rate": 9.988231128426643e-05,
+      "loss": 2.5396,
+      "step": 38000
+    },
+    {
+      "epoch": 0.28,
+      "eval_loss": 2.536235809326172,
+      "eval_runtime": 5469.5004,
+      "eval_samples_per_second": 1072.32,
+      "eval_steps_per_second": 67.02,
+      "step": 38000
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.19297446310520172,
+      "learning_rate": 9.988165660805883e-05,
+      "loss": 2.5364,
+      "step": 38100
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.1729346513748169,
+      "learning_rate": 9.988100011814657e-05,
+      "loss": 2.5374,
+      "step": 38200
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.2072724550962448,
+      "learning_rate": 9.988034181455352e-05,
+      "loss": 2.5381,
+      "step": 38300
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.1756744384765625,
+      "learning_rate": 9.987968169730362e-05,
+      "loss": 2.5379,
+      "step": 38400
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.27329882979393005,
+      "learning_rate": 9.987901976642086e-05,
+      "loss": 2.5349,
+      "step": 38500
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.23222842812538147,
+      "learning_rate": 9.987835602192934e-05,
+      "loss": 2.5355,
+      "step": 38600
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.2461015284061432,
+      "learning_rate": 9.987769046385316e-05,
+      "loss": 2.5364,
+      "step": 38700
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.1758180409669876,
+      "learning_rate": 9.987702309221651e-05,
+      "loss": 2.5377,
+      "step": 38800
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.15205176174640656,
+      "learning_rate": 9.987635390704369e-05,
+      "loss": 2.5376,
+      "step": 38900
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.1985970139503479,
+      "learning_rate": 9.987568290835903e-05,
+      "loss": 2.5372,
+      "step": 39000
+    },
+    {
+      "epoch": 0.28,
+      "eval_loss": 2.53525710105896,
+      "eval_runtime": 5498.4571,
+      "eval_samples_per_second": 1066.673,
+      "eval_steps_per_second": 66.667,
+      "step": 39000
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.1800653338432312,
+      "learning_rate": 9.987501009618691e-05,
+      "loss": 2.5349,
+      "step": 39100
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.22484809160232544,
+      "learning_rate": 9.987433547055178e-05,
+      "loss": 2.5364,
+      "step": 39200
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.1974021941423416,
+      "learning_rate": 9.98736590314782e-05,
+      "loss": 2.5333,
+      "step": 39300
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.23238864541053772,
+      "learning_rate": 9.987298077899076e-05,
+      "loss": 2.5371,
+      "step": 39400
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.17493529617786407,
+      "learning_rate": 9.987230071311411e-05,
+      "loss": 2.5356,
+      "step": 39500
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.23257118463516235,
+      "learning_rate": 9.987161883387299e-05,
+      "loss": 2.5354,
+      "step": 39600
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.18204239010810852,
+      "learning_rate": 9.987094198719394e-05,
+      "loss": 2.5348,
+      "step": 39700
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.20585016906261444,
+      "learning_rate": 9.987025649943133e-05,
+      "loss": 2.5347,
+      "step": 39800
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.19951903820037842,
+      "learning_rate": 9.986956919837858e-05,
+      "loss": 2.5355,
+      "step": 39900
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.1689595878124237,
+      "learning_rate": 9.986888008406065e-05,
+      "loss": 2.535,
+      "step": 40000
+    },
+    {
+      "epoch": 0.29,
+      "eval_loss": 2.534259796142578,
+      "eval_runtime": 5327.044,
+      "eval_samples_per_second": 1100.996,
+      "eval_steps_per_second": 68.812,
+      "step": 40000
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.16131243109703064,
+      "learning_rate": 9.986818915650265e-05,
+      "loss": 2.5343,
+      "step": 40100
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.16442734003067017,
+      "learning_rate": 9.986749641572963e-05,
+      "loss": 2.5336,
+      "step": 40200
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.17911262810230255,
+      "learning_rate": 9.986680186176684e-05,
+      "loss": 2.534,
+      "step": 40300
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.19391265511512756,
+      "learning_rate": 9.986610549463952e-05,
+      "loss": 2.5344,
+      "step": 40400
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.21224987506866455,
+      "learning_rate": 9.986540731437298e-05,
+      "loss": 2.5362,
+      "step": 40500
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.21114754676818848,
+      "learning_rate": 9.986470732099258e-05,
+      "loss": 2.5344,
+      "step": 40600
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.17994599044322968,
+      "learning_rate": 9.986400551452382e-05,
+      "loss": 2.5338,
+      "step": 40700
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.20839715003967285,
+      "learning_rate": 9.98633018949922e-05,
+      "loss": 2.5327,
+      "step": 40800
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.1808551549911499,
+      "learning_rate": 9.986259646242329e-05,
+      "loss": 2.5323,
+      "step": 40900
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.20471493899822235,
+      "learning_rate": 9.986188921684276e-05,
+      "loss": 2.5318,
+      "step": 41000
+    },
+    {
+      "epoch": 0.3,
+      "eval_loss": 2.5325772762298584,
+      "eval_runtime": 5462.6174,
+      "eval_samples_per_second": 1073.671,
+      "eval_steps_per_second": 67.104,
+      "step": 41000
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.18955881893634796,
+      "learning_rate": 9.986118015827632e-05,
+      "loss": 2.5328,
+      "step": 41100
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.20864000916481018,
+      "learning_rate": 9.986046928674974e-05,
+      "loss": 2.5303,
+      "step": 41200
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.17366698384284973,
+      "learning_rate": 9.985975660228888e-05,
+      "loss": 2.5314,
+      "step": 41300
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.2348269373178482,
+      "learning_rate": 9.985904210491963e-05,
+      "loss": 2.5336,
+      "step": 41400
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.2203817069530487,
+      "learning_rate": 9.985832579466801e-05,
+      "loss": 2.5307,
+      "step": 41500
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.20355217158794403,
+      "learning_rate": 9.985760767156003e-05,
+      "loss": 2.5329,
+      "step": 41600
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.21894210577011108,
+      "learning_rate": 9.985688773562183e-05,
+      "loss": 2.5325,
+      "step": 41700
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.18119728565216064,
+      "learning_rate": 9.985616598687954e-05,
+      "loss": 2.5316,
+      "step": 41800
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.22831158339977264,
+      "learning_rate": 9.985544242535946e-05,
+      "loss": 2.5316,
+      "step": 41900
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.18714164197444916,
+      "learning_rate": 9.985471705108787e-05,
+      "loss": 2.5315,
+      "step": 42000
+    },
+    {
+      "epoch": 0.31,
+      "eval_loss": 2.531712770462036,
+      "eval_runtime": 5461.1677,
+      "eval_samples_per_second": 1073.956,
+      "eval_steps_per_second": 67.122,
+      "step": 42000
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.19056876003742218,
+      "learning_rate": 9.985398986409115e-05,
+      "loss": 2.5305,
+      "step": 42100
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.19371190667152405,
+      "learning_rate": 9.985326086439573e-05,
+      "loss": 2.5333,
+      "step": 42200
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.19493506848812103,
+      "learning_rate": 9.985253005202813e-05,
+      "loss": 2.5279,
+      "step": 42300
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.17453902959823608,
+      "learning_rate": 9.985179742701491e-05,
+      "loss": 2.5334,
+      "step": 42400
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.19747483730316162,
+      "learning_rate": 9.985106298938272e-05,
+      "loss": 2.5321,
+      "step": 42500
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.18142738938331604,
+      "learning_rate": 9.985032673915826e-05,
+      "loss": 2.5312,
+      "step": 42600
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.21015243232250214,
+      "learning_rate": 9.984958867636828e-05,
+      "loss": 2.5328,
+      "step": 42700
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.20541459321975708,
+      "learning_rate": 9.984884880103966e-05,
+      "loss": 2.5318,
+      "step": 42800
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.24034422636032104,
+      "learning_rate": 9.984810711319927e-05,
+      "loss": 2.5305,
+      "step": 42900
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.22423624992370605,
+      "learning_rate": 9.98473636128741e-05,
+      "loss": 2.5298,
+      "step": 43000
+    },
+    {
+      "epoch": 0.31,
+      "eval_loss": 2.530484437942505,
+      "eval_runtime": 5481.4821,
+      "eval_samples_per_second": 1069.976,
+      "eval_steps_per_second": 66.874,
+      "step": 43000
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.1877780258655548,
+      "learning_rate": 9.984661830009115e-05,
+      "loss": 2.5308,
+      "step": 43100
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.2150457501411438,
+      "learning_rate": 9.984587117487755e-05,
+      "loss": 2.5318,
+      "step": 43200
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.17595185339450836,
+      "learning_rate": 9.984512223726045e-05,
+      "loss": 2.5291,
+      "step": 43300
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.1872701346874237,
+      "learning_rate": 9.984437148726708e-05,
+      "loss": 2.5298,
+      "step": 43400
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.15969082713127136,
+      "learning_rate": 9.984361892492475e-05,
+      "loss": 2.5322,
+      "step": 43500
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.22320838272571564,
+      "learning_rate": 9.984286455026082e-05,
+      "loss": 2.5284,
+      "step": 43600
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.22105048596858978,
+      "learning_rate": 9.984211593414306e-05,
+      "loss": 2.529,
+      "step": 43700
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.18613219261169434,
+      "learning_rate": 9.98413579530408e-05,
+      "loss": 2.5306,
+      "step": 43800
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.21678748726844788,
+      "learning_rate": 9.984059815969915e-05,
+      "loss": 2.5296,
+      "step": 43900
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.19831928610801697,
+      "learning_rate": 9.983983655414574e-05,
+      "loss": 2.5296,
+      "step": 44000
+    },
+    {
+      "epoch": 0.32,
+      "eval_loss": 2.5292155742645264,
+      "eval_runtime": 5508.261,
+      "eval_samples_per_second": 1064.774,
+      "eval_steps_per_second": 66.548,
+      "step": 44000
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": NaN,
+      "learning_rate": 9.983908077955583e-05,
+      "loss": 2.5294,
+      "step": 44100
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.18271589279174805,
+      "learning_rate": 9.983831556778345e-05,
+      "loss": 2.5312,
+      "step": 44200
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.24201270937919617,
+      "learning_rate": 9.98375485438823e-05,
+      "loss": 2.5303,
+      "step": 44300
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.18819460272789001,
+      "learning_rate": 9.983677970788026e-05,
+      "loss": 2.5285,
+      "step": 44400
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.16757667064666748,
+      "learning_rate": 9.983600905980529e-05,
+      "loss": 2.5289,
+      "step": 44500
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.16967040300369263,
+      "learning_rate": 9.983523659968541e-05,
+      "loss": 2.5306,
+      "step": 44600
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.19566090404987335,
+      "learning_rate": 9.983446232754872e-05,
+      "loss": 2.5276,
+      "step": 44700
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.19131731986999512,
+      "learning_rate": 9.983368624342335e-05,
+      "loss": 2.5304,
+      "step": 44800
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.1763111650943756,
+      "learning_rate": 9.983290834733753e-05,
+      "loss": 2.5281,
+      "step": 44900
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.18837742507457733,
+      "learning_rate": 9.983212863931956e-05,
+      "loss": 2.5301,
+      "step": 45000
+    },
+    {
+      "epoch": 0.33,
+      "eval_loss": 2.5283169746398926,
+      "eval_runtime": 5445.3147,
+      "eval_samples_per_second": 1077.082,
+      "eval_steps_per_second": 67.318,
+      "step": 45000
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.1776355654001236,
+      "learning_rate": 9.983134711939777e-05,
+      "loss": 2.5282,
+      "step": 45100
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.22749735414981842,
+      "learning_rate": 9.983056378760059e-05,
+      "loss": 2.5284,
+      "step": 45200
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.19231122732162476,
+      "learning_rate": 9.982977864395649e-05,
+      "loss": 2.5288,
+      "step": 45300
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.18644002079963684,
+      "learning_rate": 9.9828991688494e-05,
+      "loss": 2.5291,
+      "step": 45400
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.1956445276737213,
+      "learning_rate": 9.982820292124177e-05,
+      "loss": 2.5274,
+      "step": 45500
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.23224614560604095,
+      "learning_rate": 9.982741234222848e-05,
+      "loss": 2.5267,
+      "step": 45600
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.18513694405555725,
+      "learning_rate": 9.982661995148284e-05,
+      "loss": 2.5286,
+      "step": 45700
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.21743735671043396,
+      "learning_rate": 9.982582574903369e-05,
+      "loss": 2.5255,
+      "step": 45800
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.1804727017879486,
+      "learning_rate": 9.982502973490989e-05,
+      "loss": 2.5251,
+      "step": 45900
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.20864586532115936,
+      "learning_rate": 9.982423190914041e-05,
+      "loss": 2.5264,
+      "step": 46000
+    },
+    {
+      "epoch": 0.33,
+      "eval_loss": 2.5271406173706055,
+      "eval_runtime": 5458.9299,
+      "eval_samples_per_second": 1074.396,
+      "eval_steps_per_second": 67.15,
+      "step": 46000
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.2307577133178711,
+      "learning_rate": 9.982343227175422e-05,
+      "loss": 2.5278,
+      "step": 46100
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.20710083842277527,
+      "learning_rate": 9.982263082278043e-05,
+      "loss": 2.5277,
+      "step": 46200
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.1700790673494339,
+      "learning_rate": 9.982182756224816e-05,
+      "loss": 2.5279,
+      "step": 46300
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.2546616494655609,
+      "learning_rate": 9.982102249018664e-05,
+      "loss": 2.5257,
+      "step": 46400
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.2152370661497116,
+      "learning_rate": 9.982021560662511e-05,
+      "loss": 2.5245,
+      "step": 46500
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.177117258310318,
+      "learning_rate": 9.981940691159294e-05,
+      "loss": 2.5282,
+      "step": 46600
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.1751713901758194,
+      "learning_rate": 9.98185964051195e-05,
+      "loss": 2.5263,
+      "step": 46700
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.20219507813453674,
+      "learning_rate": 9.981779221937954e-05,
+      "loss": 2.5265,
+      "step": 46800
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.22525231540203094,
+      "learning_rate": 9.981697810822577e-05,
+      "loss": 2.5273,
+      "step": 46900
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.184763565659523,
+      "learning_rate": 9.981616218571906e-05,
+      "loss": 2.5277,
+      "step": 47000
+    },
+    {
+      "epoch": 0.34,
+      "eval_loss": 2.526160955429077,
+      "eval_runtime": 5512.6944,
+      "eval_samples_per_second": 1063.918,
+      "eval_steps_per_second": 66.495,
+      "step": 47000
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.21627645194530487,
+      "learning_rate": 9.981534445188906e-05,
+      "loss": 2.5277,
+      "step": 47100
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.19909755885601044,
+      "learning_rate": 9.981452490676553e-05,
+      "loss": 2.5279,
+      "step": 47200
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.17634861171245575,
+      "learning_rate": 9.981370355037826e-05,
+      "loss": 2.5262,
+      "step": 47300
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.19381621479988098,
+      "learning_rate": 9.981288038275712e-05,
+      "loss": 2.5266,
+      "step": 47400
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.20312373340129852,
+      "learning_rate": 9.981205540393204e-05,
+      "loss": 2.525,
+      "step": 47500
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.2033509761095047,
+      "learning_rate": 9.9811228613933e-05,
+      "loss": 2.5277,
+      "step": 47600
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.1845845878124237,
+      "learning_rate": 9.981040001279007e-05,
+      "loss": 2.5281,
+      "step": 47700
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.22188545763492584,
+      "learning_rate": 9.98095696005334e-05,
+      "loss": 2.5264,
+      "step": 47800
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.18713383376598358,
+      "learning_rate": 9.980873737719315e-05,
+      "loss": 2.5265,
+      "step": 47900
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.1767318844795227,
+      "learning_rate": 9.980790334279959e-05,
+      "loss": 2.5271,
+      "step": 48000
+    },
+    {
+      "epoch": 0.35,
+      "eval_loss": 2.525097131729126,
+      "eval_runtime": 5444.1367,
+      "eval_samples_per_second": 1077.316,
+      "eval_steps_per_second": 67.332,
+      "step": 48000
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.21088416874408722,
+      "learning_rate": 9.980706749738306e-05,
+      "loss": 2.5274,
+      "step": 48100
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.19877323508262634,
+      "learning_rate": 9.980622984097396e-05,
+      "loss": 2.5265,
+      "step": 48200
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.19006992876529694,
+      "learning_rate": 9.98053903736027e-05,
+      "loss": 2.5249,
+      "step": 48300
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.21013252437114716,
+      "learning_rate": 9.980454909529985e-05,
+      "loss": 2.5258,
+      "step": 48400
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.20572130382061005,
+      "learning_rate": 9.980370600609597e-05,
+      "loss": 2.525,
+      "step": 48500
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.1854819506406784,
+      "learning_rate": 9.980286110602174e-05,
+      "loss": 2.5267,
+      "step": 48600
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.18405982851982117,
+      "learning_rate": 9.980201439510786e-05,
+      "loss": 2.5266,
+      "step": 48700
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.1952805072069168,
+      "learning_rate": 9.980116587338512e-05,
+      "loss": 2.5257,
+      "step": 48800
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.19929055869579315,
+      "learning_rate": 9.980031554088438e-05,
+      "loss": 2.5236,
+      "step": 48900
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.19686761498451233,
+      "learning_rate": 9.979946339763656e-05,
+      "loss": 2.5265,
+      "step": 49000
+    },
+    {
+      "epoch": 0.36,
+      "eval_loss": 2.5248045921325684,
+      "eval_runtime": 5428.5128,
+      "eval_samples_per_second": 1080.416,
+      "eval_steps_per_second": 67.526,
+      "step": 49000
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.19359131157398224,
+      "learning_rate": 9.979860944367263e-05,
+      "loss": 2.525,
+      "step": 49100
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.1961933672428131,
+      "learning_rate": 9.979776224563294e-05,
+      "loss": 2.5269,
+      "step": 49200
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.21087850630283356,
+      "learning_rate": 9.97969046884364e-05,
+      "loss": 2.5256,
+      "step": 49300
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.23223061859607697,
+      "learning_rate": 9.979604532061681e-05,
+      "loss": 2.5252,
+      "step": 49400
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.19952614605426788,
+      "learning_rate": 9.97951841422054e-05,
+      "loss": 2.524,
+      "step": 49500
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.20062975585460663,
+      "learning_rate": 9.979432115323348e-05,
+      "loss": 2.5219,
+      "step": 49600
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.20036938786506653,
+      "learning_rate": 9.979345635373243e-05,
+      "loss": 2.5237,
+      "step": 49700
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.1781741976737976,
+      "learning_rate": 9.97925897437337e-05,
+      "loss": 2.5226,
+      "step": 49800
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.2209465205669403,
+      "learning_rate": 9.97917213232688e-05,
+      "loss": 2.5246,
+      "step": 49900
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.20452982187271118,
+      "learning_rate": 9.979085109236929e-05,
+      "loss": 2.526,
+      "step": 50000
+    },
+    {
+      "epoch": 0.36,
+      "eval_loss": 2.5235376358032227,
+      "eval_runtime": 5481.7843,
+      "eval_samples_per_second": 1069.917,
+      "eval_steps_per_second": 66.87,
+      "step": 50000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 412386,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 5000,
+  "total_flos": 2.399847579648e+17,
+  "train_batch_size": 96,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:786f7ba7c87c545a0ad49d253fe302a16355933a913e9933ea0c8f8c92fea23c
+size 4920