Initial commit

Browse files

Files changed (12) hide show

added_tokens.json +3 -0
config.json +41 -0
model.safetensors +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +15 -0
spm.model +3 -0
tokenizer.json +0 -0
tokenizer_config.json +58 -0
trainer_state.json +2585 -0
training_args.bin +3 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "[MASK]": 128000
+}

config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "_name_or_path": "results/checkpoint-25000",
+  "architectures": [
+    "DebertaV2ForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_eps": 1e-07,
+  "max_position_embeddings": 512,
+  "max_relative_positions": -1,
+  "model_type": "deberta-v2",
+  "norm_rel_ebd": "layer_norm",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "pooler_dropout": 0,
+  "pooler_hidden_act": "gelu",
+  "pooler_hidden_size": 768,
+  "pos_att_type": [
+    "p2c",
+    "c2p"
+  ],
+  "position_biased_input": true,
+  "position_buckets": 256,
+  "relative_attention": true,
+  "share_att_key": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.42.3",
+  "type_vocab_size": 0,
+  "vocab_size": 128100
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ed478963b7447a8f3e4bb2990487ccbc13a3b52dbb5b0ddd3a86b6c846c6ad24
+size 739289172

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:40d6e18ae8fbc3711f3788dccb2405cadabf3dfbf24c412b2c10ed2ed6f6e3e1
+size 1478698874

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0dc060a15caa2d6730d0bb07986669c84d6593d43c6d10f8bebbb5cd803f1c3c
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5aa07b3c00281fb18434b93ce2b2a8e51a3da90ed296ea2c862ed48d45752cbf
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "bos_token": "[CLS]",
+  "cls_token": "[CLS]",
+  "eos_token": "[SEP]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

spm.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
+size 2464616

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128000": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "[CLS]",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "eos_token": "[SEP]",
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "sp_model_kwargs": {},
+  "split_by_punct": false,
+  "tokenizer_class": "DebertaV2Tokenizer",
+  "unk_token": "[UNK]",
+  "vocab_type": "spm"
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2585 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 40.0,
+  "eval_steps": 500,
+  "global_step": 160320,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.124750499001996,
+      "grad_norm": 13.176804542541504,
+      "learning_rate": 1.9937624750499e-06,
+      "loss": 0.2137,
+      "step": 500
+    },
+    {
+      "epoch": 0.249500998003992,
+      "grad_norm": 52.68854904174805,
+      "learning_rate": 1.9875249500998005e-06,
+      "loss": 0.2463,
+      "step": 1000
+    },
+    {
+      "epoch": 0.37425149700598803,
+      "grad_norm": 9.197150230407715,
+      "learning_rate": 1.9812874251497004e-06,
+      "loss": 0.2316,
+      "step": 1500
+    },
+    {
+      "epoch": 0.499001996007984,
+      "grad_norm": 23.94010353088379,
+      "learning_rate": 1.9750499001996007e-06,
+      "loss": 0.2095,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6237524950099801,
+      "grad_norm": 25.69223976135254,
+      "learning_rate": 1.968812375249501e-06,
+      "loss": 0.2102,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7485029940119761,
+      "grad_norm": 14.870789527893066,
+      "learning_rate": 1.9625748502994013e-06,
+      "loss": 0.2335,
+      "step": 3000
+    },
+    {
+      "epoch": 0.873253493013972,
+      "grad_norm": 19.752464294433594,
+      "learning_rate": 1.9563373253493016e-06,
+      "loss": 0.2065,
+      "step": 3500
+    },
+    {
+      "epoch": 0.998003992015968,
+      "grad_norm": 7.356762409210205,
+      "learning_rate": 1.9500998003992014e-06,
+      "loss": 0.215,
+      "step": 4000
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.4418031871318817,
+      "eval_runtime": 50.9423,
+      "eval_samples_per_second": 62.934,
+      "eval_steps_per_second": 15.743,
+      "step": 4008
+    },
+    {
+      "epoch": 1.122754491017964,
+      "grad_norm": 1.049210786819458,
+      "learning_rate": 1.9438622754491017e-06,
+      "loss": 0.1786,
+      "step": 4500
+    },
+    {
+      "epoch": 1.24750499001996,
+      "grad_norm": 33.95945358276367,
+      "learning_rate": 1.937624750499002e-06,
+      "loss": 0.2107,
+      "step": 5000
+    },
+    {
+      "epoch": 1.372255489021956,
+      "grad_norm": 3.9420273303985596,
+      "learning_rate": 1.931387225548902e-06,
+      "loss": 0.1877,
+      "step": 5500
+    },
+    {
+      "epoch": 1.4970059880239521,
+      "grad_norm": 15.459404945373535,
+      "learning_rate": 1.925149700598802e-06,
+      "loss": 0.1808,
+      "step": 6000
+    },
+    {
+      "epoch": 1.621756487025948,
+      "grad_norm": 0.35231631994247437,
+      "learning_rate": 1.9189121756487025e-06,
+      "loss": 0.1842,
+      "step": 6500
+    },
+    {
+      "epoch": 1.746506986027944,
+      "grad_norm": 22.17848014831543,
+      "learning_rate": 1.9126746506986028e-06,
+      "loss": 0.2165,
+      "step": 7000
+    },
+    {
+      "epoch": 1.8712574850299402,
+      "grad_norm": 31.21565055847168,
+      "learning_rate": 1.906437125748503e-06,
+      "loss": 0.1879,
+      "step": 7500
+    },
+    {
+      "epoch": 1.996007984031936,
+      "grad_norm": 1.7563763856887817,
+      "learning_rate": 1.9001996007984032e-06,
+      "loss": 0.1913,
+      "step": 8000
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.4956786632537842,
+      "eval_runtime": 48.4339,
+      "eval_samples_per_second": 66.193,
+      "eval_steps_per_second": 16.559,
+      "step": 8016
+    },
+    {
+      "epoch": 2.1207584830339323,
+      "grad_norm": 0.0910625234246254,
+      "learning_rate": 1.8939620758483032e-06,
+      "loss": 0.1712,
+      "step": 8500
+    },
+    {
+      "epoch": 2.245508982035928,
+      "grad_norm": 30.4615421295166,
+      "learning_rate": 1.8877245508982035e-06,
+      "loss": 0.1579,
+      "step": 9000
+    },
+    {
+      "epoch": 2.370259481037924,
+      "grad_norm": 29.169662475585938,
+      "learning_rate": 1.8814870259481036e-06,
+      "loss": 0.1701,
+      "step": 9500
+    },
+    {
+      "epoch": 2.49500998003992,
+      "grad_norm": 0.9950535893440247,
+      "learning_rate": 1.875249500998004e-06,
+      "loss": 0.1717,
+      "step": 10000
+    },
+    {
+      "epoch": 2.6197604790419162,
+      "grad_norm": 0.30978772044181824,
+      "learning_rate": 1.8690119760479042e-06,
+      "loss": 0.1778,
+      "step": 10500
+    },
+    {
+      "epoch": 2.744510978043912,
+      "grad_norm": 1.4617693424224854,
+      "learning_rate": 1.8627744510978043e-06,
+      "loss": 0.1772,
+      "step": 11000
+    },
+    {
+      "epoch": 2.8692614770459084,
+      "grad_norm": 13.257425308227539,
+      "learning_rate": 1.8565369261477044e-06,
+      "loss": 0.1697,
+      "step": 11500
+    },
+    {
+      "epoch": 2.9940119760479043,
+      "grad_norm": 11.522214889526367,
+      "learning_rate": 1.8502994011976047e-06,
+      "loss": 0.1651,
+      "step": 12000
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 0.4591982960700989,
+      "eval_runtime": 48.8032,
+      "eval_samples_per_second": 65.692,
+      "eval_steps_per_second": 16.433,
+      "step": 12024
+    },
+    {
+      "epoch": 3.1187624750499,
+      "grad_norm": 20.58974266052246,
+      "learning_rate": 1.844061876247505e-06,
+      "loss": 0.1607,
+      "step": 12500
+    },
+    {
+      "epoch": 3.243512974051896,
+      "grad_norm": 54.52241516113281,
+      "learning_rate": 1.8378243512974053e-06,
+      "loss": 0.1527,
+      "step": 13000
+    },
+    {
+      "epoch": 3.3682634730538923,
+      "grad_norm": 12.846843719482422,
+      "learning_rate": 1.8315868263473054e-06,
+      "loss": 0.1484,
+      "step": 13500
+    },
+    {
+      "epoch": 3.493013972055888,
+      "grad_norm": 0.6479830145835876,
+      "learning_rate": 1.8253493013972054e-06,
+      "loss": 0.1621,
+      "step": 14000
+    },
+    {
+      "epoch": 3.6177644710578845,
+      "grad_norm": 0.7256312370300293,
+      "learning_rate": 1.8191117764471057e-06,
+      "loss": 0.1428,
+      "step": 14500
+    },
+    {
+      "epoch": 3.7425149700598803,
+      "grad_norm": 12.274479866027832,
+      "learning_rate": 1.8128742514970058e-06,
+      "loss": 0.1433,
+      "step": 15000
+    },
+    {
+      "epoch": 3.867265469061876,
+      "grad_norm": 35.40715408325195,
+      "learning_rate": 1.8066367265469061e-06,
+      "loss": 0.161,
+      "step": 15500
+    },
+    {
+      "epoch": 3.992015968063872,
+      "grad_norm": 0.78450608253479,
+      "learning_rate": 1.8003992015968064e-06,
+      "loss": 0.171,
+      "step": 16000
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 0.4495457410812378,
+      "eval_runtime": 49.2624,
+      "eval_samples_per_second": 65.08,
+      "eval_steps_per_second": 16.28,
+      "step": 16032
+    },
+    {
+      "epoch": 4.116766467065868,
+      "grad_norm": 0.06905636936426163,
+      "learning_rate": 1.7941616766467065e-06,
+      "loss": 0.1475,
+      "step": 16500
+    },
+    {
+      "epoch": 4.241516966067865,
+      "grad_norm": 34.77931213378906,
+      "learning_rate": 1.7879241516966066e-06,
+      "loss": 0.1365,
+      "step": 17000
+    },
+    {
+      "epoch": 4.3662674650698605,
+      "grad_norm": 0.5809102058410645,
+      "learning_rate": 1.7816866267465069e-06,
+      "loss": 0.1366,
+      "step": 17500
+    },
+    {
+      "epoch": 4.491017964071856,
+      "grad_norm": 66.70156860351562,
+      "learning_rate": 1.775449101796407e-06,
+      "loss": 0.1526,
+      "step": 18000
+    },
+    {
+      "epoch": 4.615768463073852,
+      "grad_norm": 29.423938751220703,
+      "learning_rate": 1.7692115768463075e-06,
+      "loss": 0.135,
+      "step": 18500
+    },
+    {
+      "epoch": 4.740518962075848,
+      "grad_norm": 0.48827868700027466,
+      "learning_rate": 1.7629740518962075e-06,
+      "loss": 0.1444,
+      "step": 19000
+    },
+    {
+      "epoch": 4.865269461077844,
+      "grad_norm": 8.966581344604492,
+      "learning_rate": 1.7567365269461076e-06,
+      "loss": 0.1295,
+      "step": 19500
+    },
+    {
+      "epoch": 4.99001996007984,
+      "grad_norm": 3.6332414150238037,
+      "learning_rate": 1.750499001996008e-06,
+      "loss": 0.1407,
+      "step": 20000
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 0.5054113268852234,
+      "eval_runtime": 46.022,
+      "eval_samples_per_second": 69.662,
+      "eval_steps_per_second": 17.426,
+      "step": 20040
+    },
+    {
+      "epoch": 5.114770459081837,
+      "grad_norm": 26.99722671508789,
+      "learning_rate": 1.744261477045908e-06,
+      "loss": 0.1307,
+      "step": 20500
+    },
+    {
+      "epoch": 5.2395209580838324,
+      "grad_norm": 0.7371481657028198,
+      "learning_rate": 1.7380239520958083e-06,
+      "loss": 0.1153,
+      "step": 21000
+    },
+    {
+      "epoch": 5.364271457085828,
+      "grad_norm": 0.3232800364494324,
+      "learning_rate": 1.7317864271457086e-06,
+      "loss": 0.1154,
+      "step": 21500
+    },
+    {
+      "epoch": 5.489021956087824,
+      "grad_norm": 1.8309438228607178,
+      "learning_rate": 1.7255489021956087e-06,
+      "loss": 0.1331,
+      "step": 22000
+    },
+    {
+      "epoch": 5.61377245508982,
+      "grad_norm": 0.4226222038269043,
+      "learning_rate": 1.719311377245509e-06,
+      "loss": 0.1206,
+      "step": 22500
+    },
+    {
+      "epoch": 5.738522954091817,
+      "grad_norm": 1.4337540864944458,
+      "learning_rate": 1.713073852295409e-06,
+      "loss": 0.13,
+      "step": 23000
+    },
+    {
+      "epoch": 5.863273453093813,
+      "grad_norm": 47.5312614440918,
+      "learning_rate": 1.7068363273453091e-06,
+      "loss": 0.1285,
+      "step": 23500
+    },
+    {
+      "epoch": 5.9880239520958085,
+      "grad_norm": 1.092816710472107,
+      "learning_rate": 1.7005988023952097e-06,
+      "loss": 0.1412,
+      "step": 24000
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 0.4939550459384918,
+      "eval_runtime": 45.0298,
+      "eval_samples_per_second": 71.197,
+      "eval_steps_per_second": 17.81,
+      "step": 24048
+    },
+    {
+      "epoch": 6.112774451097804,
+      "grad_norm": 0.03936842083930969,
+      "learning_rate": 1.6943612774451097e-06,
+      "loss": 0.1134,
+      "step": 24500
+    },
+    {
+      "epoch": 6.2375249500998,
+      "grad_norm": 3.047616481781006,
+      "learning_rate": 1.6881237524950098e-06,
+      "loss": 0.1066,
+      "step": 25000
+    },
+    {
+      "epoch": 6.362275449101796,
+      "grad_norm": 16.7564754486084,
+      "learning_rate": 1.6818862275449101e-06,
+      "loss": 0.1615,
+      "step": 25500
+    },
+    {
+      "epoch": 6.487025948103792,
+      "grad_norm": 21.36778450012207,
+      "learning_rate": 1.6756487025948102e-06,
+      "loss": 0.1645,
+      "step": 26000
+    },
+    {
+      "epoch": 6.611776447105789,
+      "grad_norm": 78.45208740234375,
+      "learning_rate": 1.6694111776447105e-06,
+      "loss": 0.1675,
+      "step": 26500
+    },
+    {
+      "epoch": 6.736526946107785,
+      "grad_norm": 7.212148666381836,
+      "learning_rate": 1.6631736526946108e-06,
+      "loss": 0.146,
+      "step": 27000
+    },
+    {
+      "epoch": 6.86127744510978,
+      "grad_norm": 9.503207206726074,
+      "learning_rate": 1.6569361277445109e-06,
+      "loss": 0.1606,
+      "step": 27500
+    },
+    {
+      "epoch": 6.986027944111776,
+      "grad_norm": 0.4464740753173828,
+      "learning_rate": 1.6506986027944112e-06,
+      "loss": 0.1429,
+      "step": 28000
+    },
+    {
+      "epoch": 7.0,
+      "eval_loss": 0.4717544615268707,
+      "eval_runtime": 47.386,
+      "eval_samples_per_second": 67.657,
+      "eval_steps_per_second": 16.925,
+      "step": 28056
+    },
+    {
+      "epoch": 7.110778443113772,
+      "grad_norm": 0.42686018347740173,
+      "learning_rate": 1.6444610778443113e-06,
+      "loss": 0.1207,
+      "step": 28500
+    },
+    {
+      "epoch": 7.235528942115769,
+      "grad_norm": 24.92848014831543,
+      "learning_rate": 1.6382235528942113e-06,
+      "loss": 0.1351,
+      "step": 29000
+    },
+    {
+      "epoch": 7.360279441117765,
+      "grad_norm": 7.397327423095703,
+      "learning_rate": 1.6319860279441118e-06,
+      "loss": 0.1543,
+      "step": 29500
+    },
+    {
+      "epoch": 7.485029940119761,
+      "grad_norm": 0.43539106845855713,
+      "learning_rate": 1.625748502994012e-06,
+      "loss": 0.1494,
+      "step": 30000
+    },
+    {
+      "epoch": 7.6097804391217565,
+      "grad_norm": 14.456055641174316,
+      "learning_rate": 1.619510978043912e-06,
+      "loss": 0.1419,
+      "step": 30500
+    },
+    {
+      "epoch": 7.734530938123752,
+      "grad_norm": 9.563997268676758,
+      "learning_rate": 1.6132734530938123e-06,
+      "loss": 0.1357,
+      "step": 31000
+    },
+    {
+      "epoch": 7.859281437125748,
+      "grad_norm": 1.7568217515945435,
+      "learning_rate": 1.6070359281437124e-06,
+      "loss": 0.1369,
+      "step": 31500
+    },
+    {
+      "epoch": 7.984031936127744,
+      "grad_norm": 2.780186653137207,
+      "learning_rate": 1.600798403193613e-06,
+      "loss": 0.1451,
+      "step": 32000
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 0.45947006344795227,
+      "eval_runtime": 44.2941,
+      "eval_samples_per_second": 72.38,
+      "eval_steps_per_second": 18.106,
+      "step": 32064
+    },
+    {
+      "epoch": 8.10878243512974,
+      "grad_norm": 10.451217651367188,
+      "learning_rate": 1.594560878243513e-06,
+      "loss": 0.1136,
+      "step": 32500
+    },
+    {
+      "epoch": 8.233532934131736,
+      "grad_norm": 0.18200552463531494,
+      "learning_rate": 1.588323353293413e-06,
+      "loss": 0.1259,
+      "step": 33000
+    },
+    {
+      "epoch": 8.358283433133732,
+      "grad_norm": 1.9428528547286987,
+      "learning_rate": 1.5820858283433134e-06,
+      "loss": 0.1279,
+      "step": 33500
+    },
+    {
+      "epoch": 8.48303393213573,
+      "grad_norm": 1.7016535997390747,
+      "learning_rate": 1.5758483033932135e-06,
+      "loss": 0.1231,
+      "step": 34000
+    },
+    {
+      "epoch": 8.607784431137725,
+      "grad_norm": 0.7158037424087524,
+      "learning_rate": 1.5696107784431135e-06,
+      "loss": 0.1446,
+      "step": 34500
+    },
+    {
+      "epoch": 8.732534930139721,
+      "grad_norm": 0.4712078273296356,
+      "learning_rate": 1.563373253493014e-06,
+      "loss": 0.1344,
+      "step": 35000
+    },
+    {
+      "epoch": 8.857285429141717,
+      "grad_norm": 24.5105037689209,
+      "learning_rate": 1.5571357285429141e-06,
+      "loss": 0.1331,
+      "step": 35500
+    },
+    {
+      "epoch": 8.982035928143713,
+      "grad_norm": 27.750621795654297,
+      "learning_rate": 1.5508982035928142e-06,
+      "loss": 0.1296,
+      "step": 36000
+    },
+    {
+      "epoch": 9.0,
+      "eval_loss": 0.47351646423339844,
+      "eval_runtime": 41.3902,
+      "eval_samples_per_second": 77.458,
+      "eval_steps_per_second": 19.377,
+      "step": 36072
+    },
+    {
+      "epoch": 9.106786427145709,
+      "grad_norm": 28.095134735107422,
+      "learning_rate": 1.5446606786427145e-06,
+      "loss": 0.119,
+      "step": 36500
+    },
+    {
+      "epoch": 9.231536926147704,
+      "grad_norm": 0.07204411178827286,
+      "learning_rate": 1.5384231536926146e-06,
+      "loss": 0.1098,
+      "step": 37000
+    },
+    {
+      "epoch": 9.3562874251497,
+      "grad_norm": 0.2767297327518463,
+      "learning_rate": 1.532185628742515e-06,
+      "loss": 0.1224,
+      "step": 37500
+    },
+    {
+      "epoch": 9.481037924151696,
+      "grad_norm": 0.14060889184474945,
+      "learning_rate": 1.5259481037924152e-06,
+      "loss": 0.1247,
+      "step": 38000
+    },
+    {
+      "epoch": 9.605788423153692,
+      "grad_norm": 32.673011779785156,
+      "learning_rate": 1.5197105788423153e-06,
+      "loss": 0.122,
+      "step": 38500
+    },
+    {
+      "epoch": 9.730538922155688,
+      "grad_norm": 0.21247480809688568,
+      "learning_rate": 1.5134730538922156e-06,
+      "loss": 0.1233,
+      "step": 39000
+    },
+    {
+      "epoch": 9.855289421157684,
+      "grad_norm": 0.4861377775669098,
+      "learning_rate": 1.5072355289421156e-06,
+      "loss": 0.1286,
+      "step": 39500
+    },
+    {
+      "epoch": 9.980039920159681,
+      "grad_norm": 11.489697456359863,
+      "learning_rate": 1.5009980039920157e-06,
+      "loss": 0.1203,
+      "step": 40000
+    },
+    {
+      "epoch": 10.0,
+      "eval_loss": 0.44174808263778687,
+      "eval_runtime": 40.4714,
+      "eval_samples_per_second": 79.216,
+      "eval_steps_per_second": 19.816,
+      "step": 40080
+    },
+    {
+      "epoch": 10.104790419161677,
+      "grad_norm": 0.06284382939338684,
+      "learning_rate": 1.4947604790419162e-06,
+      "loss": 0.1176,
+      "step": 40500
+    },
+    {
+      "epoch": 10.229540918163673,
+      "grad_norm": 0.8282334804534912,
+      "learning_rate": 1.4885229540918163e-06,
+      "loss": 0.1133,
+      "step": 41000
+    },
+    {
+      "epoch": 10.354291417165669,
+      "grad_norm": 0.675163984298706,
+      "learning_rate": 1.4822854291417164e-06,
+      "loss": 0.0977,
+      "step": 41500
+    },
+    {
+      "epoch": 10.479041916167665,
+      "grad_norm": 6.970102310180664,
+      "learning_rate": 1.4760479041916167e-06,
+      "loss": 0.1113,
+      "step": 42000
+    },
+    {
+      "epoch": 10.60379241516966,
+      "grad_norm": 8.85517406463623,
+      "learning_rate": 1.4698103792415168e-06,
+      "loss": 0.1164,
+      "step": 42500
+    },
+    {
+      "epoch": 10.728542914171657,
+      "grad_norm": 0.9282238483428955,
+      "learning_rate": 1.4635728542914173e-06,
+      "loss": 0.1167,
+      "step": 43000
+    },
+    {
+      "epoch": 10.853293413173652,
+      "grad_norm": 9.984148979187012,
+      "learning_rate": 1.4573353293413174e-06,
+      "loss": 0.1261,
+      "step": 43500
+    },
+    {
+      "epoch": 10.978043912175648,
+      "grad_norm": 0.20773719251155853,
+      "learning_rate": 1.4510978043912175e-06,
+      "loss": 0.1132,
+      "step": 44000
+    },
+    {
+      "epoch": 11.0,
+      "eval_loss": 0.49900639057159424,
+      "eval_runtime": 43.241,
+      "eval_samples_per_second": 74.143,
+      "eval_steps_per_second": 18.547,
+      "step": 44088
+    },
+    {
+      "epoch": 11.102794411177644,
+      "grad_norm": 12.603593826293945,
+      "learning_rate": 1.4448602794411178e-06,
+      "loss": 0.1061,
+      "step": 44500
+    },
+    {
+      "epoch": 11.22754491017964,
+      "grad_norm": 51.32432174682617,
+      "learning_rate": 1.4386227544910178e-06,
+      "loss": 0.1079,
+      "step": 45000
+    },
+    {
+      "epoch": 11.352295409181636,
+      "grad_norm": 10.22624397277832,
+      "learning_rate": 1.432385229540918e-06,
+      "loss": 0.1166,
+      "step": 45500
+    },
+    {
+      "epoch": 11.477045908183634,
+      "grad_norm": 11.041003227233887,
+      "learning_rate": 1.4261477045908184e-06,
+      "loss": 0.105,
+      "step": 46000
+    },
+    {
+      "epoch": 11.60179640718563,
+      "grad_norm": 35.79409408569336,
+      "learning_rate": 1.4199101796407185e-06,
+      "loss": 0.1124,
+      "step": 46500
+    },
+    {
+      "epoch": 11.726546906187625,
+      "grad_norm": 0.18676696717739105,
+      "learning_rate": 1.4136726546906188e-06,
+      "loss": 0.0928,
+      "step": 47000
+    },
+    {
+      "epoch": 11.851297405189621,
+      "grad_norm": 1.4925884008407593,
+      "learning_rate": 1.4074351297405189e-06,
+      "loss": 0.1098,
+      "step": 47500
+    },
+    {
+      "epoch": 11.976047904191617,
+      "grad_norm": 0.32953181862831116,
+      "learning_rate": 1.401197604790419e-06,
+      "loss": 0.1117,
+      "step": 48000
+    },
+    {
+      "epoch": 12.0,
+      "eval_loss": 0.4872562289237976,
+      "eval_runtime": 41.6872,
+      "eval_samples_per_second": 76.906,
+      "eval_steps_per_second": 19.239,
+      "step": 48096
+    },
+    {
+      "epoch": 12.100798403193613,
+      "grad_norm": 0.027937307953834534,
+      "learning_rate": 1.3949600798403195e-06,
+      "loss": 0.0992,
+      "step": 48500
+    },
+    {
+      "epoch": 12.225548902195609,
+      "grad_norm": 0.29068148136138916,
+      "learning_rate": 1.3887225548902196e-06,
+      "loss": 0.0921,
+      "step": 49000
+    },
+    {
+      "epoch": 12.350299401197605,
+      "grad_norm": 0.127395898103714,
+      "learning_rate": 1.3824850299401197e-06,
+      "loss": 0.0933,
+      "step": 49500
+    },
+    {
+      "epoch": 12.4750499001996,
+      "grad_norm": 0.09435238689184189,
+      "learning_rate": 1.37624750499002e-06,
+      "loss": 0.116,
+      "step": 50000
+    },
+    {
+      "epoch": 12.599800399201596,
+      "grad_norm": 39.19729232788086,
+      "learning_rate": 1.37000998003992e-06,
+      "loss": 0.1052,
+      "step": 50500
+    },
+    {
+      "epoch": 12.724550898203592,
+      "grad_norm": 0.28930047154426575,
+      "learning_rate": 1.3637724550898201e-06,
+      "loss": 0.1038,
+      "step": 51000
+    },
+    {
+      "epoch": 12.849301397205588,
+      "grad_norm": 0.15510033071041107,
+      "learning_rate": 1.3575349301397206e-06,
+      "loss": 0.0983,
+      "step": 51500
+    },
+    {
+      "epoch": 12.974051896207584,
+      "grad_norm": 81.58076477050781,
+      "learning_rate": 1.3512974051896207e-06,
+      "loss": 0.1117,
+      "step": 52000
+    },
+    {
+      "epoch": 13.0,
+      "eval_loss": 0.45387548208236694,
+      "eval_runtime": 43.4012,
+      "eval_samples_per_second": 73.869,
+      "eval_steps_per_second": 18.479,
+      "step": 52104
+    },
+    {
+      "epoch": 13.098802395209582,
+      "grad_norm": 4.060844421386719,
+      "learning_rate": 1.345059880239521e-06,
+      "loss": 0.0983,
+      "step": 52500
+    },
+    {
+      "epoch": 13.223552894211577,
+      "grad_norm": 33.315853118896484,
+      "learning_rate": 1.338822355289421e-06,
+      "loss": 0.0941,
+      "step": 53000
+    },
+    {
+      "epoch": 13.348303393213573,
+      "grad_norm": 0.1183587834239006,
+      "learning_rate": 1.3325848303393212e-06,
+      "loss": 0.0973,
+      "step": 53500
+    },
+    {
+      "epoch": 13.47305389221557,
+      "grad_norm": 40.30908966064453,
+      "learning_rate": 1.3263473053892215e-06,
+      "loss": 0.0871,
+      "step": 54000
+    },
+    {
+      "epoch": 13.597804391217565,
+      "grad_norm": 0.619777262210846,
+      "learning_rate": 1.3201097804391218e-06,
+      "loss": 0.1001,
+      "step": 54500
+    },
+    {
+      "epoch": 13.72255489021956,
+      "grad_norm": 0.2705942392349243,
+      "learning_rate": 1.3138722554890218e-06,
+      "loss": 0.0983,
+      "step": 55000
+    },
+    {
+      "epoch": 13.847305389221557,
+      "grad_norm": 6.151524066925049,
+      "learning_rate": 1.3076347305389221e-06,
+      "loss": 0.0793,
+      "step": 55500
+    },
+    {
+      "epoch": 13.972055888223553,
+      "grad_norm": 2.340573787689209,
+      "learning_rate": 1.3013972055888222e-06,
+      "loss": 0.099,
+      "step": 56000
+    },
+    {
+      "epoch": 14.0,
+      "eval_loss": 0.47363531589508057,
+      "eval_runtime": 42.3279,
+      "eval_samples_per_second": 75.742,
+      "eval_steps_per_second": 18.947,
+      "step": 56112
+    },
+    {
+      "epoch": 14.096806387225548,
+      "grad_norm": 2.052589178085327,
+      "learning_rate": 1.2951596806387225e-06,
+      "loss": 0.0875,
+      "step": 56500
+    },
+    {
+      "epoch": 14.221556886227544,
+      "grad_norm": 1.2925941944122314,
+      "learning_rate": 1.2889221556886228e-06,
+      "loss": 0.0812,
+      "step": 57000
+    },
+    {
+      "epoch": 14.34630738522954,
+      "grad_norm": 0.062304213643074036,
+      "learning_rate": 1.282684630738523e-06,
+      "loss": 0.1017,
+      "step": 57500
+    },
+    {
+      "epoch": 14.471057884231538,
+      "grad_norm": 0.1741693764925003,
+      "learning_rate": 1.2764471057884232e-06,
+      "loss": 0.0836,
+      "step": 58000
+    },
+    {
+      "epoch": 14.595808383233534,
+      "grad_norm": 0.6444254517555237,
+      "learning_rate": 1.2702095808383233e-06,
+      "loss": 0.0804,
+      "step": 58500
+    },
+    {
+      "epoch": 14.72055888223553,
+      "grad_norm": 2.0034759044647217,
+      "learning_rate": 1.2639720558882234e-06,
+      "loss": 0.0953,
+      "step": 59000
+    },
+    {
+      "epoch": 14.845309381237525,
+      "grad_norm": 52.82548522949219,
+      "learning_rate": 1.2577345309381237e-06,
+      "loss": 0.0996,
+      "step": 59500
+    },
+    {
+      "epoch": 14.970059880239521,
+      "grad_norm": 6.955111503601074,
+      "learning_rate": 1.251497005988024e-06,
+      "loss": 0.0857,
+      "step": 60000
+    },
+    {
+      "epoch": 15.0,
+      "eval_loss": 0.45942702889442444,
+      "eval_runtime": 43.409,
+      "eval_samples_per_second": 73.856,
+      "eval_steps_per_second": 18.475,
+      "step": 60120
+    },
+    {
+      "epoch": 15.094810379241517,
+      "grad_norm": 3.2324092388153076,
+      "learning_rate": 1.245259481037924e-06,
+      "loss": 0.0849,
+      "step": 60500
+    },
+    {
+      "epoch": 15.219560878243513,
+      "grad_norm": 61.83153533935547,
+      "learning_rate": 1.2390219560878243e-06,
+      "loss": 0.0798,
+      "step": 61000
+    },
+    {
+      "epoch": 15.344311377245509,
+      "grad_norm": 0.015876924619078636,
+      "learning_rate": 1.2327844311377244e-06,
+      "loss": 0.0785,
+      "step": 61500
+    },
+    {
+      "epoch": 15.469061876247505,
+      "grad_norm": 3.0025134086608887,
+      "learning_rate": 1.2265469061876247e-06,
+      "loss": 0.0881,
+      "step": 62000
+    },
+    {
+      "epoch": 15.5938123752495,
+      "grad_norm": 12.912367820739746,
+      "learning_rate": 1.220309381237525e-06,
+      "loss": 0.0802,
+      "step": 62500
+    },
+    {
+      "epoch": 15.718562874251496,
+      "grad_norm": 0.3600245714187622,
+      "learning_rate": 1.214071856287425e-06,
+      "loss": 0.0849,
+      "step": 63000
+    },
+    {
+      "epoch": 15.843313373253492,
+      "grad_norm": 0.21024100482463837,
+      "learning_rate": 1.2078343313373254e-06,
+      "loss": 0.078,
+      "step": 63500
+    },
+    {
+      "epoch": 15.968063872255488,
+      "grad_norm": 9.392132759094238,
+      "learning_rate": 1.2015968063872255e-06,
+      "loss": 0.0865,
+      "step": 64000
+    },
+    {
+      "epoch": 16.0,
+      "eval_loss": 0.48642057180404663,
+      "eval_runtime": 46.7976,
+      "eval_samples_per_second": 68.508,
+      "eval_steps_per_second": 17.138,
+      "step": 64128
+    },
+    {
+      "epoch": 16.092814371257486,
+      "grad_norm": 0.5227041244506836,
+      "learning_rate": 1.1953592814371256e-06,
+      "loss": 0.0722,
+      "step": 64500
+    },
+    {
+      "epoch": 16.21756487025948,
+      "grad_norm": 25.282564163208008,
+      "learning_rate": 1.1891217564870259e-06,
+      "loss": 0.0981,
+      "step": 65000
+    },
+    {
+      "epoch": 16.342315369261478,
+      "grad_norm": 0.6670591235160828,
+      "learning_rate": 1.1828842315369261e-06,
+      "loss": 0.0787,
+      "step": 65500
+    },
+    {
+      "epoch": 16.46706586826347,
+      "grad_norm": 22.668352127075195,
+      "learning_rate": 1.1766467065868262e-06,
+      "loss": 0.0764,
+      "step": 66000
+    },
+    {
+      "epoch": 16.59181636726547,
+      "grad_norm": 0.22597374022006989,
+      "learning_rate": 1.1704091816367265e-06,
+      "loss": 0.078,
+      "step": 66500
+    },
+    {
+      "epoch": 16.716566866267463,
+      "grad_norm": 21.123409271240234,
+      "learning_rate": 1.1641716566866266e-06,
+      "loss": 0.0766,
+      "step": 67000
+    },
+    {
+      "epoch": 16.84131736526946,
+      "grad_norm": 0.04259370267391205,
+      "learning_rate": 1.157934131736527e-06,
+      "loss": 0.0765,
+      "step": 67500
+    },
+    {
+      "epoch": 16.96606786427146,
+      "grad_norm": 0.021560240536928177,
+      "learning_rate": 1.1516966067864272e-06,
+      "loss": 0.0785,
+      "step": 68000
+    },
+    {
+      "epoch": 17.0,
+      "eval_loss": 0.4793809652328491,
+      "eval_runtime": 45.0906,
+      "eval_samples_per_second": 71.101,
+      "eval_steps_per_second": 17.786,
+      "step": 68136
+    },
+    {
+      "epoch": 17.090818363273453,
+      "grad_norm": 9.094868659973145,
+      "learning_rate": 1.1454590818363273e-06,
+      "loss": 0.0647,
+      "step": 68500
+    },
+    {
+      "epoch": 17.21556886227545,
+      "grad_norm": 0.195833221077919,
+      "learning_rate": 1.1392215568862276e-06,
+      "loss": 0.0698,
+      "step": 69000
+    },
+    {
+      "epoch": 17.340319361277444,
+      "grad_norm": 0.18507197499275208,
+      "learning_rate": 1.1329840319361277e-06,
+      "loss": 0.0712,
+      "step": 69500
+    },
+    {
+      "epoch": 17.465069860279442,
+      "grad_norm": 0.9911601543426514,
+      "learning_rate": 1.1267465069860278e-06,
+      "loss": 0.0752,
+      "step": 70000
+    },
+    {
+      "epoch": 17.589820359281436,
+      "grad_norm": 1.9703953266143799,
+      "learning_rate": 1.120508982035928e-06,
+      "loss": 0.0675,
+      "step": 70500
+    },
+    {
+      "epoch": 17.714570858283434,
+      "grad_norm": 41.10940933227539,
+      "learning_rate": 1.1142714570858283e-06,
+      "loss": 0.0705,
+      "step": 71000
+    },
+    {
+      "epoch": 17.839321357285428,
+      "grad_norm": 15.87336254119873,
+      "learning_rate": 1.1080339321357286e-06,
+      "loss": 0.0763,
+      "step": 71500
+    },
+    {
+      "epoch": 17.964071856287426,
+      "grad_norm": 0.060888275504112244,
+      "learning_rate": 1.1017964071856287e-06,
+      "loss": 0.0784,
+      "step": 72000
+    },
+    {
+      "epoch": 18.0,
+      "eval_loss": 0.4715409278869629,
+      "eval_runtime": 44.1035,
+      "eval_samples_per_second": 72.693,
+      "eval_steps_per_second": 18.184,
+      "step": 72144
+    },
+    {
+      "epoch": 18.08882235528942,
+      "grad_norm": 2.47182035446167,
+      "learning_rate": 1.0955588822355288e-06,
+      "loss": 0.0747,
+      "step": 72500
+    },
+    {
+      "epoch": 18.213572854291417,
+      "grad_norm": 40.5880126953125,
+      "learning_rate": 1.089321357285429e-06,
+      "loss": 0.0678,
+      "step": 73000
+    },
+    {
+      "epoch": 18.338323353293415,
+      "grad_norm": 0.4340246915817261,
+      "learning_rate": 1.0830838323353294e-06,
+      "loss": 0.0713,
+      "step": 73500
+    },
+    {
+      "epoch": 18.46307385229541,
+      "grad_norm": 4.4763312339782715,
+      "learning_rate": 1.0768463073852295e-06,
+      "loss": 0.065,
+      "step": 74000
+    },
+    {
+      "epoch": 18.587824351297407,
+      "grad_norm": 0.1397508829832077,
+      "learning_rate": 1.0706087824351298e-06,
+      "loss": 0.0727,
+      "step": 74500
+    },
+    {
+      "epoch": 18.7125748502994,
+      "grad_norm": 7.134496212005615,
+      "learning_rate": 1.0643712574850299e-06,
+      "loss": 0.0605,
+      "step": 75000
+    },
+    {
+      "epoch": 18.8373253493014,
+      "grad_norm": 0.05227530747652054,
+      "learning_rate": 1.05813373253493e-06,
+      "loss": 0.0764,
+      "step": 75500
+    },
+    {
+      "epoch": 18.962075848303392,
+      "grad_norm": 17.22441864013672,
+      "learning_rate": 1.0518962075848302e-06,
+      "loss": 0.0696,
+      "step": 76000
+    },
+    {
+      "epoch": 19.0,
+      "eval_loss": 0.4802711308002472,
+      "eval_runtime": 45.7109,
+      "eval_samples_per_second": 70.136,
+      "eval_steps_per_second": 17.545,
+      "step": 76152
+    },
+    {
+      "epoch": 19.08682634730539,
+      "grad_norm": 0.02889215387403965,
+      "learning_rate": 1.0456586826347305e-06,
+      "loss": 0.0625,
+      "step": 76500
+    },
+    {
+      "epoch": 19.211576846307384,
+      "grad_norm": 128.0497283935547,
+      "learning_rate": 1.0394211576846308e-06,
+      "loss": 0.0548,
+      "step": 77000
+    },
+    {
+      "epoch": 19.336327345309382,
+      "grad_norm": 0.22108981013298035,
+      "learning_rate": 1.033183632734531e-06,
+      "loss": 0.0695,
+      "step": 77500
+    },
+    {
+      "epoch": 19.461077844311376,
+      "grad_norm": 55.13557815551758,
+      "learning_rate": 1.026946107784431e-06,
+      "loss": 0.0679,
+      "step": 78000
+    },
+    {
+      "epoch": 19.585828343313374,
+      "grad_norm": 3.5990562438964844,
+      "learning_rate": 1.0207085828343313e-06,
+      "loss": 0.0697,
+      "step": 78500
+    },
+    {
+      "epoch": 19.710578842315368,
+      "grad_norm": 3.9640650749206543,
+      "learning_rate": 1.0144710578842316e-06,
+      "loss": 0.0699,
+      "step": 79000
+    },
+    {
+      "epoch": 19.835329341317365,
+      "grad_norm": 0.3529013395309448,
+      "learning_rate": 1.0082335329341317e-06,
+      "loss": 0.0676,
+      "step": 79500
+    },
+    {
+      "epoch": 19.960079840319363,
+      "grad_norm": 1.3875175714492798,
+      "learning_rate": 1.001996007984032e-06,
+      "loss": 0.0683,
+      "step": 80000
+    },
+    {
+      "epoch": 20.0,
+      "eval_loss": 0.5128437280654907,
+      "eval_runtime": 46.0282,
+      "eval_samples_per_second": 69.653,
+      "eval_steps_per_second": 17.424,
+      "step": 80160
+    },
+    {
+      "epoch": 20.084830339321357,
+      "grad_norm": 6.171479225158691,
+      "learning_rate": 9.95758483033932e-07,
+      "loss": 0.0698,
+      "step": 80500
+    },
+    {
+      "epoch": 20.209580838323355,
+      "grad_norm": 0.012239497154951096,
+      "learning_rate": 9.895209580838323e-07,
+      "loss": 0.0532,
+      "step": 81000
+    },
+    {
+      "epoch": 20.33433133732535,
+      "grad_norm": 7.920960426330566,
+      "learning_rate": 9.832834331337324e-07,
+      "loss": 0.0609,
+      "step": 81500
+    },
+    {
+      "epoch": 20.459081836327346,
+      "grad_norm": 59.41933822631836,
+      "learning_rate": 9.770459081836327e-07,
+      "loss": 0.0653,
+      "step": 82000
+    },
+    {
+      "epoch": 20.58383233532934,
+      "grad_norm": 0.10031065344810486,
+      "learning_rate": 9.708083832335328e-07,
+      "loss": 0.0497,
+      "step": 82500
+    },
+    {
+      "epoch": 20.708582834331338,
+      "grad_norm": 5.42900276184082,
+      "learning_rate": 9.645708582834331e-07,
+      "loss": 0.061,
+      "step": 83000
+    },
+    {
+      "epoch": 20.833333333333332,
+      "grad_norm": 20.380285263061523,
+      "learning_rate": 9.583333333333334e-07,
+      "loss": 0.0717,
+      "step": 83500
+    },
+    {
+      "epoch": 20.95808383233533,
+      "grad_norm": 0.10651753097772598,
+      "learning_rate": 9.520958083832335e-07,
+      "loss": 0.0638,
+      "step": 84000
+    },
+    {
+      "epoch": 21.0,
+      "eval_loss": 0.4833807945251465,
+      "eval_runtime": 46.5592,
+      "eval_samples_per_second": 68.859,
+      "eval_steps_per_second": 17.225,
+      "step": 84168
+    },
+    {
+      "epoch": 21.082834331337324,
+      "grad_norm": 0.3842374086380005,
+      "learning_rate": 9.458582834331337e-07,
+      "loss": 0.0603,
+      "step": 84500
+    },
+    {
+      "epoch": 21.20758483033932,
+      "grad_norm": 51.563140869140625,
+      "learning_rate": 9.396207584830339e-07,
+      "loss": 0.06,
+      "step": 85000
+    },
+    {
+      "epoch": 21.33233532934132,
+      "grad_norm": 0.037806153297424316,
+      "learning_rate": 9.333832335329342e-07,
+      "loss": 0.0612,
+      "step": 85500
+    },
+    {
+      "epoch": 21.457085828343313,
+      "grad_norm": 0.11586946994066238,
+      "learning_rate": 9.271457085828342e-07,
+      "loss": 0.0664,
+      "step": 86000
+    },
+    {
+      "epoch": 21.58183632734531,
+      "grad_norm": 0.34262338280677795,
+      "learning_rate": 9.209081836327344e-07,
+      "loss": 0.0602,
+      "step": 86500
+    },
+    {
+      "epoch": 21.706586826347305,
+      "grad_norm": 0.11894870549440384,
+      "learning_rate": 9.146706586826347e-07,
+      "loss": 0.0522,
+      "step": 87000
+    },
+    {
+      "epoch": 21.831337325349303,
+      "grad_norm": 0.1180167868733406,
+      "learning_rate": 9.084331337325349e-07,
+      "loss": 0.0616,
+      "step": 87500
+    },
+    {
+      "epoch": 21.956087824351297,
+      "grad_norm": 0.09437087923288345,
+      "learning_rate": 9.02195608782435e-07,
+      "loss": 0.0607,
+      "step": 88000
+    },
+    {
+      "epoch": 22.0,
+      "eval_loss": 0.4958905279636383,
+      "eval_runtime": 44.4581,
+      "eval_samples_per_second": 72.113,
+      "eval_steps_per_second": 18.039,
+      "step": 88176
+    },
+    {
+      "epoch": 22.080838323353294,
+      "grad_norm": 0.5892271399497986,
+      "learning_rate": 8.959580838323353e-07,
+      "loss": 0.058,
+      "step": 88500
+    },
+    {
+      "epoch": 22.20558882235529,
+      "grad_norm": 1.0569002628326416,
+      "learning_rate": 8.897205588822355e-07,
+      "loss": 0.0559,
+      "step": 89000
+    },
+    {
+      "epoch": 22.330339321357286,
+      "grad_norm": 50.68812561035156,
+      "learning_rate": 8.834830339321357e-07,
+      "loss": 0.05,
+      "step": 89500
+    },
+    {
+      "epoch": 22.45508982035928,
+      "grad_norm": 0.08090469241142273,
+      "learning_rate": 8.772455089820359e-07,
+      "loss": 0.0595,
+      "step": 90000
+    },
+    {
+      "epoch": 22.579840319361278,
+      "grad_norm": 14.62991714477539,
+      "learning_rate": 8.710079840319361e-07,
+      "loss": 0.059,
+      "step": 90500
+    },
+    {
+      "epoch": 22.704590818363272,
+      "grad_norm": 0.2893312871456146,
+      "learning_rate": 8.647704590818364e-07,
+      "loss": 0.0518,
+      "step": 91000
+    },
+    {
+      "epoch": 22.82934131736527,
+      "grad_norm": 22.239938735961914,
+      "learning_rate": 8.585329341317364e-07,
+      "loss": 0.0493,
+      "step": 91500
+    },
+    {
+      "epoch": 22.954091816367267,
+      "grad_norm": 0.09933929890394211,
+      "learning_rate": 8.522954091816366e-07,
+      "loss": 0.0536,
+      "step": 92000
+    },
+    {
+      "epoch": 23.0,
+      "eval_loss": 0.48672357201576233,
+      "eval_runtime": 44.6856,
+      "eval_samples_per_second": 71.746,
+      "eval_steps_per_second": 17.948,
+      "step": 92184
+    },
+    {
+      "epoch": 23.07884231536926,
+      "grad_norm": 0.821902871131897,
+      "learning_rate": 8.460578842315369e-07,
+      "loss": 0.0553,
+      "step": 92500
+    },
+    {
+      "epoch": 23.20359281437126,
+      "grad_norm": 0.2537296414375305,
+      "learning_rate": 8.398203592814371e-07,
+      "loss": 0.046,
+      "step": 93000
+    },
+    {
+      "epoch": 23.328343313373253,
+      "grad_norm": 0.198989599943161,
+      "learning_rate": 8.335828343313372e-07,
+      "loss": 0.0496,
+      "step": 93500
+    },
+    {
+      "epoch": 23.45309381237525,
+      "grad_norm": 14.523540496826172,
+      "learning_rate": 8.273453093812375e-07,
+      "loss": 0.0465,
+      "step": 94000
+    },
+    {
+      "epoch": 23.577844311377245,
+      "grad_norm": 0.3473449945449829,
+      "learning_rate": 8.211077844311377e-07,
+      "loss": 0.048,
+      "step": 94500
+    },
+    {
+      "epoch": 23.702594810379242,
+      "grad_norm": 4.4253129959106445,
+      "learning_rate": 8.14870259481038e-07,
+      "loss": 0.0489,
+      "step": 95000
+    },
+    {
+      "epoch": 23.827345309381236,
+      "grad_norm": 159.51025390625,
+      "learning_rate": 8.086327345309381e-07,
+      "loss": 0.0552,
+      "step": 95500
+    },
+    {
+      "epoch": 23.952095808383234,
+      "grad_norm": 0.31450316309928894,
+      "learning_rate": 8.023952095808383e-07,
+      "loss": 0.0537,
+      "step": 96000
+    },
+    {
+      "epoch": 24.0,
+      "eval_loss": 0.5026536583900452,
+      "eval_runtime": 46.0362,
+      "eval_samples_per_second": 69.641,
+      "eval_steps_per_second": 17.421,
+      "step": 96192
+    },
+    {
+      "epoch": 24.076846307385228,
+      "grad_norm": 1.8670942783355713,
+      "learning_rate": 7.961576846307386e-07,
+      "loss": 0.0556,
+      "step": 96500
+    },
+    {
+      "epoch": 24.201596806387226,
+      "grad_norm": 0.4119631052017212,
+      "learning_rate": 7.899201596806386e-07,
+      "loss": 0.0427,
+      "step": 97000
+    },
+    {
+      "epoch": 24.32634730538922,
+      "grad_norm": 4.47167444229126,
+      "learning_rate": 7.836826347305388e-07,
+      "loss": 0.0579,
+      "step": 97500
+    },
+    {
+      "epoch": 24.451097804391217,
+      "grad_norm": 0.940743625164032,
+      "learning_rate": 7.774451097804391e-07,
+      "loss": 0.0462,
+      "step": 98000
+    },
+    {
+      "epoch": 24.575848303393215,
+      "grad_norm": 4.091241359710693,
+      "learning_rate": 7.712075848303393e-07,
+      "loss": 0.0524,
+      "step": 98500
+    },
+    {
+      "epoch": 24.70059880239521,
+      "grad_norm": 11.099757194519043,
+      "learning_rate": 7.649700598802394e-07,
+      "loss": 0.0549,
+      "step": 99000
+    },
+    {
+      "epoch": 24.825349301397207,
+      "grad_norm": 2.001067876815796,
+      "learning_rate": 7.587325349301397e-07,
+      "loss": 0.0485,
+      "step": 99500
+    },
+    {
+      "epoch": 24.9500998003992,
+      "grad_norm": 0.15496690571308136,
+      "learning_rate": 7.524950099800399e-07,
+      "loss": 0.0537,
+      "step": 100000
+    },
+    {
+      "epoch": 25.0,
+      "eval_loss": 0.48970088362693787,
+      "eval_runtime": 48.0502,
+      "eval_samples_per_second": 66.722,
+      "eval_steps_per_second": 16.691,
+      "step": 100200
+    },
+    {
+      "epoch": 25.0748502994012,
+      "grad_norm": 5.718461513519287,
+      "learning_rate": 7.462574850299402e-07,
+      "loss": 0.0471,
+      "step": 100500
+    },
+    {
+      "epoch": 25.199600798403193,
+      "grad_norm": 53.097293853759766,
+      "learning_rate": 7.400199600798403e-07,
+      "loss": 0.0467,
+      "step": 101000
+    },
+    {
+      "epoch": 25.32435129740519,
+      "grad_norm": 70.51046752929688,
+      "learning_rate": 7.337824351297404e-07,
+      "loss": 0.0464,
+      "step": 101500
+    },
+    {
+      "epoch": 25.449101796407184,
+      "grad_norm": 6.485039234161377,
+      "learning_rate": 7.275449101796407e-07,
+      "loss": 0.0501,
+      "step": 102000
+    },
+    {
+      "epoch": 25.573852295409182,
+      "grad_norm": 0.2076825648546219,
+      "learning_rate": 7.213073852295409e-07,
+      "loss": 0.05,
+      "step": 102500
+    },
+    {
+      "epoch": 25.698602794411176,
+      "grad_norm": 40.60255432128906,
+      "learning_rate": 7.15069860279441e-07,
+      "loss": 0.0374,
+      "step": 103000
+    },
+    {
+      "epoch": 25.823353293413174,
+      "grad_norm": 1.1958940029144287,
+      "learning_rate": 7.088323353293413e-07,
+      "loss": 0.0533,
+      "step": 103500
+    },
+    {
+      "epoch": 25.948103792415168,
+      "grad_norm": 11.201072692871094,
+      "learning_rate": 7.025948103792415e-07,
+      "loss": 0.0388,
+      "step": 104000
+    },
+    {
+      "epoch": 26.0,
+      "eval_loss": 0.48730549216270447,
+      "eval_runtime": 48.7336,
+      "eval_samples_per_second": 65.786,
+      "eval_steps_per_second": 16.457,
+      "step": 104208
+    },
+    {
+      "epoch": 26.072854291417165,
+      "grad_norm": 0.08899884670972824,
+      "learning_rate": 6.963572854291417e-07,
+      "loss": 0.0482,
+      "step": 104500
+    },
+    {
+      "epoch": 26.197604790419163,
+      "grad_norm": 0.08736108243465424,
+      "learning_rate": 6.901197604790419e-07,
+      "loss": 0.042,
+      "step": 105000
+    },
+    {
+      "epoch": 26.322355289421157,
+      "grad_norm": 0.050059039145708084,
+      "learning_rate": 6.838822355289421e-07,
+      "loss": 0.0443,
+      "step": 105500
+    },
+    {
+      "epoch": 26.447105788423155,
+      "grad_norm": 0.3098917603492737,
+      "learning_rate": 6.776447105788423e-07,
+      "loss": 0.0431,
+      "step": 106000
+    },
+    {
+      "epoch": 26.57185628742515,
+      "grad_norm": 0.601845920085907,
+      "learning_rate": 6.714071856287425e-07,
+      "loss": 0.0474,
+      "step": 106500
+    },
+    {
+      "epoch": 26.696606786427147,
+      "grad_norm": 43.90340805053711,
+      "learning_rate": 6.651696606786426e-07,
+      "loss": 0.0546,
+      "step": 107000
+    },
+    {
+      "epoch": 26.82135728542914,
+      "grad_norm": 0.1658441424369812,
+      "learning_rate": 6.589321357285429e-07,
+      "loss": 0.0463,
+      "step": 107500
+    },
+    {
+      "epoch": 26.94610778443114,
+      "grad_norm": 0.7097954154014587,
+      "learning_rate": 6.526946107784431e-07,
+      "loss": 0.0413,
+      "step": 108000
+    },
+    {
+      "epoch": 27.0,
+      "eval_loss": 0.49195966124534607,
+      "eval_runtime": 48.5815,
+      "eval_samples_per_second": 65.992,
+      "eval_steps_per_second": 16.508,
+      "step": 108216
+    },
+    {
+      "epoch": 27.070858283433132,
+      "grad_norm": 0.12945351004600525,
+      "learning_rate": 6.464570858283432e-07,
+      "loss": 0.0514,
+      "step": 108500
+    },
+    {
+      "epoch": 27.19560878243513,
+      "grad_norm": 0.09241262078285217,
+      "learning_rate": 6.402195608782435e-07,
+      "loss": 0.0454,
+      "step": 109000
+    },
+    {
+      "epoch": 27.320359281437124,
+      "grad_norm": 0.07145562022924423,
+      "learning_rate": 6.339820359281437e-07,
+      "loss": 0.0381,
+      "step": 109500
+    },
+    {
+      "epoch": 27.44510978043912,
+      "grad_norm": 0.003607134334743023,
+      "learning_rate": 6.277445109780439e-07,
+      "loss": 0.0476,
+      "step": 110000
+    },
+    {
+      "epoch": 27.56986027944112,
+      "grad_norm": 10.220846176147461,
+      "learning_rate": 6.215069860279441e-07,
+      "loss": 0.0441,
+      "step": 110500
+    },
+    {
+      "epoch": 27.694610778443113,
+      "grad_norm": 0.18386581540107727,
+      "learning_rate": 6.152694610778443e-07,
+      "loss": 0.0461,
+      "step": 111000
+    },
+    {
+      "epoch": 27.81936127744511,
+      "grad_norm": 0.26254481077194214,
+      "learning_rate": 6.090319361277445e-07,
+      "loss": 0.0367,
+      "step": 111500
+    },
+    {
+      "epoch": 27.944111776447105,
+      "grad_norm": 68.7042007446289,
+      "learning_rate": 6.027944111776448e-07,
+      "loss": 0.0471,
+      "step": 112000
+    },
+    {
+      "epoch": 28.0,
+      "eval_loss": 0.4870954751968384,
+      "eval_runtime": 45.0714,
+      "eval_samples_per_second": 71.132,
+      "eval_steps_per_second": 17.794,
+      "step": 112224
+    },
+    {
+      "epoch": 28.068862275449103,
+      "grad_norm": 0.0271464716643095,
+      "learning_rate": 5.965568862275448e-07,
+      "loss": 0.0433,
+      "step": 112500
+    },
+    {
+      "epoch": 28.193612774451097,
+      "grad_norm": 0.0086235161870718,
+      "learning_rate": 5.903193612774451e-07,
+      "loss": 0.0475,
+      "step": 113000
+    },
+    {
+      "epoch": 28.318363273453095,
+      "grad_norm": 0.11506126821041107,
+      "learning_rate": 5.840818363273453e-07,
+      "loss": 0.0353,
+      "step": 113500
+    },
+    {
+      "epoch": 28.44311377245509,
+      "grad_norm": 10.355070114135742,
+      "learning_rate": 5.778443113772454e-07,
+      "loss": 0.0416,
+      "step": 114000
+    },
+    {
+      "epoch": 28.567864271457086,
+      "grad_norm": 0.2200528234243393,
+      "learning_rate": 5.716067864271457e-07,
+      "loss": 0.0325,
+      "step": 114500
+    },
+    {
+      "epoch": 28.69261477045908,
+      "grad_norm": 0.05802537873387337,
+      "learning_rate": 5.653692614770459e-07,
+      "loss": 0.0468,
+      "step": 115000
+    },
+    {
+      "epoch": 28.817365269461078,
+      "grad_norm": 0.10829133540391922,
+      "learning_rate": 5.591317365269461e-07,
+      "loss": 0.042,
+      "step": 115500
+    },
+    {
+      "epoch": 28.942115768463076,
+      "grad_norm": 0.162460595369339,
+      "learning_rate": 5.528942115768463e-07,
+      "loss": 0.049,
+      "step": 116000
+    },
+    {
+      "epoch": 29.0,
+      "eval_loss": 0.4795687198638916,
+      "eval_runtime": 45.1647,
+      "eval_samples_per_second": 70.985,
+      "eval_steps_per_second": 17.757,
+      "step": 116232
+    },
+    {
+      "epoch": 29.06686626746507,
+      "grad_norm": 134.6587677001953,
+      "learning_rate": 5.466566866267465e-07,
+      "loss": 0.0416,
+      "step": 116500
+    },
+    {
+      "epoch": 29.191616766467067,
+      "grad_norm": 0.09312257915735245,
+      "learning_rate": 5.404191616766467e-07,
+      "loss": 0.0287,
+      "step": 117000
+    },
+    {
+      "epoch": 29.31636726546906,
+      "grad_norm": 0.3530866503715515,
+      "learning_rate": 5.341816367265469e-07,
+      "loss": 0.0384,
+      "step": 117500
+    },
+    {
+      "epoch": 29.44111776447106,
+      "grad_norm": 0.033993642777204514,
+      "learning_rate": 5.27944111776447e-07,
+      "loss": 0.043,
+      "step": 118000
+    },
+    {
+      "epoch": 29.565868263473053,
+      "grad_norm": 0.3124711513519287,
+      "learning_rate": 5.217065868263473e-07,
+      "loss": 0.04,
+      "step": 118500
+    },
+    {
+      "epoch": 29.69061876247505,
+      "grad_norm": 10.49288272857666,
+      "learning_rate": 5.154690618762475e-07,
+      "loss": 0.0463,
+      "step": 119000
+    },
+    {
+      "epoch": 29.815369261477045,
+      "grad_norm": 0.024224599823355675,
+      "learning_rate": 5.092315369261477e-07,
+      "loss": 0.0411,
+      "step": 119500
+    },
+    {
+      "epoch": 29.940119760479043,
+      "grad_norm": 3.9215731620788574,
+      "learning_rate": 5.029940119760479e-07,
+      "loss": 0.0408,
+      "step": 120000
+    },
+    {
+      "epoch": 30.0,
+      "eval_loss": 0.492553174495697,
+      "eval_runtime": 46.2042,
+      "eval_samples_per_second": 69.388,
+      "eval_steps_per_second": 17.358,
+      "step": 120240
+    },
+    {
+      "epoch": 30.064870259481037,
+      "grad_norm": 0.021667474880814552,
+      "learning_rate": 4.967564870259481e-07,
+      "loss": 0.0374,
+      "step": 120500
+    },
+    {
+      "epoch": 30.189620758483034,
+      "grad_norm": 0.5888983011245728,
+      "learning_rate": 4.905189620758483e-07,
+      "loss": 0.0463,
+      "step": 121000
+    },
+    {
+      "epoch": 30.31437125748503,
+      "grad_norm": 0.09637131541967392,
+      "learning_rate": 4.842814371257485e-07,
+      "loss": 0.033,
+      "step": 121500
+    },
+    {
+      "epoch": 30.439121756487026,
+      "grad_norm": 0.23179832100868225,
+      "learning_rate": 4.780439121756487e-07,
+      "loss": 0.0402,
+      "step": 122000
+    },
+    {
+      "epoch": 30.563872255489024,
+      "grad_norm": 0.14170564711093903,
+      "learning_rate": 4.718063872255489e-07,
+      "loss": 0.0395,
+      "step": 122500
+    },
+    {
+      "epoch": 30.688622754491018,
+      "grad_norm": 0.006093321368098259,
+      "learning_rate": 4.6556886227544903e-07,
+      "loss": 0.0356,
+      "step": 123000
+    },
+    {
+      "epoch": 30.813373253493015,
+      "grad_norm": 0.1018219068646431,
+      "learning_rate": 4.593313373253493e-07,
+      "loss": 0.0419,
+      "step": 123500
+    },
+    {
+      "epoch": 30.93812375249501,
+      "grad_norm": 2.9131383895874023,
+      "learning_rate": 4.5309381237524947e-07,
+      "loss": 0.0378,
+      "step": 124000
+    },
+    {
+      "epoch": 31.0,
+      "eval_loss": 0.5052226781845093,
+      "eval_runtime": 43.1611,
+      "eval_samples_per_second": 74.28,
+      "eval_steps_per_second": 18.582,
+      "step": 124248
+    },
+    {
+      "epoch": 31.062874251497007,
+      "grad_norm": 11.588695526123047,
+      "learning_rate": 4.468562874251497e-07,
+      "loss": 0.0346,
+      "step": 124500
+    },
+    {
+      "epoch": 31.187624750499,
+      "grad_norm": 0.2488149255514145,
+      "learning_rate": 4.4061876247504985e-07,
+      "loss": 0.0351,
+      "step": 125000
+    },
+    {
+      "epoch": 31.312375249501,
+      "grad_norm": 12.691544532775879,
+      "learning_rate": 4.343812375249501e-07,
+      "loss": 0.0323,
+      "step": 125500
+    },
+    {
+      "epoch": 31.437125748502993,
+      "grad_norm": 0.004168800078332424,
+      "learning_rate": 4.281437125748503e-07,
+      "loss": 0.033,
+      "step": 126000
+    },
+    {
+      "epoch": 31.56187624750499,
+      "grad_norm": 0.042690277099609375,
+      "learning_rate": 4.219061876247505e-07,
+      "loss": 0.039,
+      "step": 126500
+    },
+    {
+      "epoch": 31.686626746506985,
+      "grad_norm": 1.1096973419189453,
+      "learning_rate": 4.1566866267465066e-07,
+      "loss": 0.0349,
+      "step": 127000
+    },
+    {
+      "epoch": 31.811377245508982,
+      "grad_norm": 0.2642970085144043,
+      "learning_rate": 4.094311377245509e-07,
+      "loss": 0.0338,
+      "step": 127500
+    },
+    {
+      "epoch": 31.936127744510976,
+      "grad_norm": 0.21338249742984772,
+      "learning_rate": 4.031936127744511e-07,
+      "loss": 0.0349,
+      "step": 128000
+    },
+    {
+      "epoch": 32.0,
+      "eval_loss": 0.4927305281162262,
+      "eval_runtime": 43.7641,
+      "eval_samples_per_second": 73.256,
+      "eval_steps_per_second": 18.326,
+      "step": 128256
+    },
+    {
+      "epoch": 32.060878243512974,
+      "grad_norm": 0.1497274786233902,
+      "learning_rate": 3.969560878243513e-07,
+      "loss": 0.0403,
+      "step": 128500
+    },
+    {
+      "epoch": 32.18562874251497,
+      "grad_norm": 0.5848351120948792,
+      "learning_rate": 3.9071856287425147e-07,
+      "loss": 0.037,
+      "step": 129000
+    },
+    {
+      "epoch": 32.31037924151697,
+      "grad_norm": 0.11372077465057373,
+      "learning_rate": 3.8448103792415166e-07,
+      "loss": 0.0383,
+      "step": 129500
+    },
+    {
+      "epoch": 32.43512974051896,
+      "grad_norm": 0.1047956719994545,
+      "learning_rate": 3.782435129740519e-07,
+      "loss": 0.0315,
+      "step": 130000
+    },
+    {
+      "epoch": 32.55988023952096,
+      "grad_norm": 0.2975727617740631,
+      "learning_rate": 3.7200598802395204e-07,
+      "loss": 0.0264,
+      "step": 130500
+    },
+    {
+      "epoch": 32.684630738522955,
+      "grad_norm": 0.2123280167579651,
+      "learning_rate": 3.657684630738523e-07,
+      "loss": 0.0341,
+      "step": 131000
+    },
+    {
+      "epoch": 32.80938123752495,
+      "grad_norm": 27.63080596923828,
+      "learning_rate": 3.5953093812375247e-07,
+      "loss": 0.0368,
+      "step": 131500
+    },
+    {
+      "epoch": 32.93413173652694,
+      "grad_norm": 0.034935545176267624,
+      "learning_rate": 3.5329341317365266e-07,
+      "loss": 0.0394,
+      "step": 132000
+    },
+    {
+      "epoch": 33.0,
+      "eval_loss": 0.4937605559825897,
+      "eval_runtime": 39.9355,
+      "eval_samples_per_second": 80.279,
+      "eval_steps_per_second": 20.082,
+      "step": 132264
+    },
+    {
+      "epoch": 33.05888223552894,
+      "grad_norm": 0.003380158683285117,
+      "learning_rate": 3.4705588822355285e-07,
+      "loss": 0.0394,
+      "step": 132500
+    },
+    {
+      "epoch": 33.18363273453094,
+      "grad_norm": 2.721451997756958,
+      "learning_rate": 3.408183632734531e-07,
+      "loss": 0.0365,
+      "step": 133000
+    },
+    {
+      "epoch": 33.308383233532936,
+      "grad_norm": 0.4309988021850586,
+      "learning_rate": 3.345808383233533e-07,
+      "loss": 0.0302,
+      "step": 133500
+    },
+    {
+      "epoch": 33.43313373253493,
+      "grad_norm": 0.24694228172302246,
+      "learning_rate": 3.283433133732535e-07,
+      "loss": 0.037,
+      "step": 134000
+    },
+    {
+      "epoch": 33.557884231536924,
+      "grad_norm": 0.34988418221473694,
+      "learning_rate": 3.2210578842315366e-07,
+      "loss": 0.0258,
+      "step": 134500
+    },
+    {
+      "epoch": 33.68263473053892,
+      "grad_norm": 0.19452495872974396,
+      "learning_rate": 3.158682634730539e-07,
+      "loss": 0.035,
+      "step": 135000
+    },
+    {
+      "epoch": 33.80738522954092,
+      "grad_norm": 0.006651519797742367,
+      "learning_rate": 3.096307385229541e-07,
+      "loss": 0.0368,
+      "step": 135500
+    },
+    {
+      "epoch": 33.93213572854292,
+      "grad_norm": 0.04128989204764366,
+      "learning_rate": 3.033932135728543e-07,
+      "loss": 0.0301,
+      "step": 136000
+    },
+    {
+      "epoch": 34.0,
+      "eval_loss": 0.4872666597366333,
+      "eval_runtime": 37.6035,
+      "eval_samples_per_second": 85.258,
+      "eval_steps_per_second": 21.328,
+      "step": 136272
+    },
+    {
+      "epoch": 34.05688622754491,
+      "grad_norm": 0.05333876982331276,
+      "learning_rate": 2.971556886227545e-07,
+      "loss": 0.0349,
+      "step": 136500
+    },
+    {
+      "epoch": 34.181636726546905,
+      "grad_norm": 1.3579726219177246,
+      "learning_rate": 2.909181636726547e-07,
+      "loss": 0.0285,
+      "step": 137000
+    },
+    {
+      "epoch": 34.3063872255489,
+      "grad_norm": 0.6725994348526001,
+      "learning_rate": 2.8468063872255486e-07,
+      "loss": 0.0361,
+      "step": 137500
+    },
+    {
+      "epoch": 34.4311377245509,
+      "grad_norm": 0.03919246420264244,
+      "learning_rate": 2.7844311377245504e-07,
+      "loss": 0.0274,
+      "step": 138000
+    },
+    {
+      "epoch": 34.55588822355289,
+      "grad_norm": 35.5837287902832,
+      "learning_rate": 2.722055888223553e-07,
+      "loss": 0.0363,
+      "step": 138500
+    },
+    {
+      "epoch": 34.68063872255489,
+      "grad_norm": 0.007728968746960163,
+      "learning_rate": 2.659680638722555e-07,
+      "loss": 0.0391,
+      "step": 139000
+    },
+    {
+      "epoch": 34.80538922155689,
+      "grad_norm": 0.07272203266620636,
+      "learning_rate": 2.5973053892215567e-07,
+      "loss": 0.0268,
+      "step": 139500
+    },
+    {
+      "epoch": 34.930139720558884,
+      "grad_norm": 0.33094656467437744,
+      "learning_rate": 2.5349301397205586e-07,
+      "loss": 0.0365,
+      "step": 140000
+    },
+    {
+      "epoch": 35.0,
+      "eval_loss": 0.4920032024383545,
+      "eval_runtime": 40.4781,
+      "eval_samples_per_second": 79.203,
+      "eval_steps_per_second": 19.813,
+      "step": 140280
+    },
+    {
+      "epoch": 35.054890219560875,
+      "grad_norm": 191.99266052246094,
+      "learning_rate": 2.472554890219561e-07,
+      "loss": 0.0333,
+      "step": 140500
+    },
+    {
+      "epoch": 35.17964071856287,
+      "grad_norm": 0.002573936013504863,
+      "learning_rate": 2.410179640718563e-07,
+      "loss": 0.0327,
+      "step": 141000
+    },
+    {
+      "epoch": 35.30439121756487,
+      "grad_norm": 0.04750495404005051,
+      "learning_rate": 2.3478043912175645e-07,
+      "loss": 0.0345,
+      "step": 141500
+    },
+    {
+      "epoch": 35.42914171656687,
+      "grad_norm": 193.8626251220703,
+      "learning_rate": 2.2854291417165667e-07,
+      "loss": 0.0321,
+      "step": 142000
+    },
+    {
+      "epoch": 35.553892215568865,
+      "grad_norm": 0.0009173236903734505,
+      "learning_rate": 2.2230538922155686e-07,
+      "loss": 0.0359,
+      "step": 142500
+    },
+    {
+      "epoch": 35.678642714570856,
+      "grad_norm": 0.12355954945087433,
+      "learning_rate": 2.1606786427145708e-07,
+      "loss": 0.0347,
+      "step": 143000
+    },
+    {
+      "epoch": 35.80339321357285,
+      "grad_norm": 0.24140344560146332,
+      "learning_rate": 2.0983033932135726e-07,
+      "loss": 0.031,
+      "step": 143500
+    },
+    {
+      "epoch": 35.92814371257485,
+      "grad_norm": 0.007129414472728968,
+      "learning_rate": 2.0359281437125748e-07,
+      "loss": 0.0214,
+      "step": 144000
+    },
+    {
+      "epoch": 36.0,
+      "eval_loss": 0.4941750466823578,
+      "eval_runtime": 38.7085,
+      "eval_samples_per_second": 82.824,
+      "eval_steps_per_second": 20.719,
+      "step": 144288
+    },
+    {
+      "epoch": 36.05289421157685,
+      "grad_norm": 0.27973344922065735,
+      "learning_rate": 1.9735528942115767e-07,
+      "loss": 0.0331,
+      "step": 144500
+    },
+    {
+      "epoch": 36.17764471057884,
+      "grad_norm": 0.05331612005829811,
+      "learning_rate": 1.911177644710579e-07,
+      "loss": 0.0303,
+      "step": 145000
+    },
+    {
+      "epoch": 36.30239520958084,
+      "grad_norm": 1.8135106563568115,
+      "learning_rate": 1.8488023952095808e-07,
+      "loss": 0.0349,
+      "step": 145500
+    },
+    {
+      "epoch": 36.427145708582835,
+      "grad_norm": 0.13009090721607208,
+      "learning_rate": 1.7864271457085827e-07,
+      "loss": 0.0405,
+      "step": 146000
+    },
+    {
+      "epoch": 36.55189620758483,
+      "grad_norm": 0.07144490629434586,
+      "learning_rate": 1.7240518962075848e-07,
+      "loss": 0.0377,
+      "step": 146500
+    },
+    {
+      "epoch": 36.67664670658683,
+      "grad_norm": 74.39689636230469,
+      "learning_rate": 1.6616766467065867e-07,
+      "loss": 0.0278,
+      "step": 147000
+    },
+    {
+      "epoch": 36.80139720558882,
+      "grad_norm": 0.08526595681905746,
+      "learning_rate": 1.599301397205589e-07,
+      "loss": 0.0306,
+      "step": 147500
+    },
+    {
+      "epoch": 36.92614770459082,
+      "grad_norm": 12.262850761413574,
+      "learning_rate": 1.5369261477045908e-07,
+      "loss": 0.0314,
+      "step": 148000
+    },
+    {
+      "epoch": 37.0,
+      "eval_loss": 0.49442577362060547,
+      "eval_runtime": 42.7404,
+      "eval_samples_per_second": 75.011,
+      "eval_steps_per_second": 18.764,
+      "step": 148296
+    },
+    {
+      "epoch": 37.050898203592816,
+      "grad_norm": 0.02493446320295334,
+      "learning_rate": 1.474550898203593e-07,
+      "loss": 0.0262,
+      "step": 148500
+    },
+    {
+      "epoch": 37.17564870259481,
+      "grad_norm": 0.14130648970603943,
+      "learning_rate": 1.4121756487025949e-07,
+      "loss": 0.0281,
+      "step": 149000
+    },
+    {
+      "epoch": 37.300399201596804,
+      "grad_norm": 0.035768117755651474,
+      "learning_rate": 1.3498003992015965e-07,
+      "loss": 0.0255,
+      "step": 149500
+    },
+    {
+      "epoch": 37.4251497005988,
+      "grad_norm": 0.18820720911026,
+      "learning_rate": 1.2874251497005986e-07,
+      "loss": 0.032,
+      "step": 150000
+    },
+    {
+      "epoch": 37.5499001996008,
+      "grad_norm": 0.37001463770866394,
+      "learning_rate": 1.2250499001996008e-07,
+      "loss": 0.0301,
+      "step": 150500
+    },
+    {
+      "epoch": 37.6746506986028,
+      "grad_norm": 0.06626907736063004,
+      "learning_rate": 1.1626746506986028e-07,
+      "loss": 0.0238,
+      "step": 151000
+    },
+    {
+      "epoch": 37.79940119760479,
+      "grad_norm": 19.17169189453125,
+      "learning_rate": 1.1002994011976049e-07,
+      "loss": 0.0385,
+      "step": 151500
+    },
+    {
+      "epoch": 37.924151696606785,
+      "grad_norm": 4.972864627838135,
+      "learning_rate": 1.0379241516966066e-07,
+      "loss": 0.0337,
+      "step": 152000
+    },
+    {
+      "epoch": 38.0,
+      "eval_loss": 0.48605817556381226,
+      "eval_runtime": 40.1954,
+      "eval_samples_per_second": 79.76,
+      "eval_steps_per_second": 19.953,
+      "step": 152304
+    },
+    {
+      "epoch": 38.04890219560878,
+      "grad_norm": 0.002587054157629609,
+      "learning_rate": 9.755489021956087e-08,
+      "loss": 0.0334,
+      "step": 152500
+    },
+    {
+      "epoch": 38.17365269461078,
+      "grad_norm": 70.7108383178711,
+      "learning_rate": 9.131736526946107e-08,
+      "loss": 0.0319,
+      "step": 153000
+    },
+    {
+      "epoch": 38.29840319361278,
+      "grad_norm": 0.5694107413291931,
+      "learning_rate": 8.507984031936127e-08,
+      "loss": 0.0313,
+      "step": 153500
+    },
+    {
+      "epoch": 38.42315369261477,
+      "grad_norm": 0.003176228841766715,
+      "learning_rate": 7.884231536926148e-08,
+      "loss": 0.0298,
+      "step": 154000
+    },
+    {
+      "epoch": 38.547904191616766,
+      "grad_norm": 0.004230498801916838,
+      "learning_rate": 7.260479041916168e-08,
+      "loss": 0.0284,
+      "step": 154500
+    },
+    {
+      "epoch": 38.672654690618764,
+      "grad_norm": 0.13844607770442963,
+      "learning_rate": 6.636726546906188e-08,
+      "loss": 0.0305,
+      "step": 155000
+    },
+    {
+      "epoch": 38.79740518962076,
+      "grad_norm": 0.05394995957612991,
+      "learning_rate": 6.012974051896207e-08,
+      "loss": 0.0269,
+      "step": 155500
+    },
+    {
+      "epoch": 38.92215568862275,
+      "grad_norm": 0.11763022094964981,
+      "learning_rate": 5.3892215568862274e-08,
+      "loss": 0.0279,
+      "step": 156000
+    },
+    {
+      "epoch": 39.0,
+      "eval_loss": 0.4873499870300293,
+      "eval_runtime": 44.0281,
+      "eval_samples_per_second": 72.817,
+      "eval_steps_per_second": 18.216,
+      "step": 156312
+    },
+    {
+      "epoch": 39.04690618762475,
+      "grad_norm": 0.22139760851860046,
+      "learning_rate": 4.765469061876248e-08,
+      "loss": 0.0255,
+      "step": 156500
+    },
+    {
+      "epoch": 39.17165668662675,
+      "grad_norm": 0.002428988926112652,
+      "learning_rate": 4.1417165668662674e-08,
+      "loss": 0.0302,
+      "step": 157000
+    },
+    {
+      "epoch": 39.296407185628745,
+      "grad_norm": 0.07879871129989624,
+      "learning_rate": 3.517964071856287e-08,
+      "loss": 0.027,
+      "step": 157500
+    },
+    {
+      "epoch": 39.421157684630735,
+      "grad_norm": 0.03594490885734558,
+      "learning_rate": 2.8942115768463073e-08,
+      "loss": 0.033,
+      "step": 158000
+    },
+    {
+      "epoch": 39.54590818363273,
+      "grad_norm": 0.12444847822189331,
+      "learning_rate": 2.2704590818363273e-08,
+      "loss": 0.0271,
+      "step": 158500
+    },
+    {
+      "epoch": 39.67065868263473,
+      "grad_norm": 47.82669448852539,
+      "learning_rate": 1.6467065868263473e-08,
+      "loss": 0.0276,
+      "step": 159000
+    },
+    {
+      "epoch": 39.79540918163673,
+      "grad_norm": 0.1385308802127838,
+      "learning_rate": 1.0229540918163672e-08,
+      "loss": 0.03,
+      "step": 159500
+    },
+    {
+      "epoch": 39.920159680638726,
+      "grad_norm": 0.1429419070482254,
+      "learning_rate": 3.992015968063871e-09,
+      "loss": 0.0303,
+      "step": 160000
+    }
+  ],
+  "logging_steps": 500,
+  "max_steps": 160320,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 40,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.21770798769152e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:162afc2cd7810ba9eb3e571711d88bc2d858afcfe980344a5610c4caccebeac1
+size 5112