agentlans committed
Commit 717f1d4 · verified · 1 Parent(s): 26f08b3

Upload 13 files

README.md CHANGED
@@ -1,3 +1,113 @@
- ---
- license: mit
- ---
+ ---
+ language: en
+ license: mit
+ library_name: transformers
+ tags:
+ - generated_from_trainer
+ - text-classification
+ - fill-mask
+ - embeddings
+ metrics:
+ - accuracy
+ model-index:
+ - name: deberta-v3-base-zyda-2
+   results:
+   - task:
+       type: text-classification
+       name: Text Classification
+     dataset:
+       name: Zyphra/Zyda-2 (subset)
+       type: Zyphra/Zyda-2
+     metrics:
+     - type: accuracy
+       value: 0.6191
+       name: Accuracy
+ base_model: microsoft/deberta-v3-base
+ ---
+
+ # DeBERTa-v3-base-Zyda-2
+
+ ## Model Description
+
+ This model is a fine-tuned version of [microsoft/deberta-v3-base](https://huggingface.co/microsoft/deberta-v3-base) trained on a subset of the [Zyphra/Zyda-2](https://huggingface.co/datasets/Zyphra/Zyda-2) dataset. It was trained with the masked language modeling (MLM) objective to improve its general understanding of English.
+
+ ## Performance
+
+ The model achieves the following results on the evaluation set (masked-token prediction):
+ - Loss: 2.1833
+ - Accuracy: 0.6191
+ - Perplexity: 8.88
+
+ ## Intended Uses & Limitations
+
+ This model is intended to be used, and further fine-tuned, for the following tasks:
+ - Text embedding
+ - Text classification
+ - Fill-in-the-blank (fill-mask) tasks
+
+ **Limitations:**
+ - English language only
+ - May be inaccurate for specialized jargon, dialects, slang, code, and LaTeX
+
+ ## Training Data
+
+ The model was trained on the first 300,000 rows of the [Zyphra/Zyda-2](https://huggingface.co/datasets/Zyphra/Zyda-2) dataset, with 5% of that data held out for validation (a rough sketch of this split follows).
+
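+ The preprocessing script is not part of this upload; the snippet below is only a minimal sketch of how the subset and validation split described above could be reproduced with 🤗 Datasets. The exact Zyda-2 subset/config name and the split seed are assumptions.
+
+ ```python
+ from datasets import load_dataset
+
+ # First 300,000 rows of Zyda-2 (a specific subset/config name may be required).
+ dataset = load_dataset("Zyphra/Zyda-2", split="train[:300000]")
+
+ # Hold out 5% of the rows for validation; seed 42 is assumed here.
+ splits = dataset.train_test_split(test_size=0.05, seed=42)
+ train_ds, eval_ds = splits["train"], splits["test"]
+ print(len(train_ds), len(eval_ds))
+ ```
+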
+ ## Training Procedure
+
+ ### Hyperparameters
+
+ The following hyperparameters were used during training (see the sketch after this list):
+ - Learning rate: 5e-05
+ - Train batch size: 8
+ - Eval batch size: 8
+ - Seed: 42
+ - Optimizer: Adam with betas=(0.9, 0.999) and epsilon=1e-08
+ - Learning rate scheduler: linear
+ - Number of epochs: 1.0
+
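+ The original training script is not included in this commit; the sketch below simply maps the hyperparameters above onto 🤗 `TrainingArguments` (the output directory is a placeholder, not part of this release):
+
+ ```python
+ from transformers import TrainingArguments
+
+ # Mirrors the hyperparameters listed above; output_dir is a placeholder.
+ training_args = TrainingArguments(
+     output_dir="deberta-v3-base-zyda-2-mlm",
+     learning_rate=5e-5,
+     per_device_train_batch_size=8,
+     per_device_eval_batch_size=8,
+     seed=42,
+     adam_beta1=0.9,
+     adam_beta2=0.999,
+     adam_epsilon=1e-8,
+     lr_scheduler_type="linear",
+     num_train_epochs=1.0,
+ )
+ ```
+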
+ ### Framework versions
+
+ - Transformers: 4.46.3
+ - PyTorch: 2.5.1+cu124
+ - Datasets: 3.1.0
+ - Tokenizers: 0.20.3
+
+ ## Usage Examples
+
+ ### Masked Language Modeling
+
+ ```python
+ from transformers import pipeline
+
+ unmasker = pipeline('fill-mask', model='agentlans/deberta-v3-base-zyda-2')
+ result = unmasker("[MASK] is the capital of France.")
+ print(result)
+ ```
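+
+ Each entry in `result` is a dictionary with `sequence`, `score`, `token`, and `token_str` fields, so `result[0]["token_str"]` is the model's top-ranked fill for the mask.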
+
+ ### Text Embedding
+
+ ```python
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+
+ model_name = "agentlans/deberta-v3-base-zyda-2"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModel.from_pretrained(model_name)
+
+ text = "Example sentence for embedding."
+ inputs = tokenizer(text, return_tensors='pt')
+ with torch.no_grad():
+     outputs = model(**inputs)
+
+ # Mean-pool the final hidden states to get a single sentence embedding.
+ embeddings = outputs.last_hidden_state.mean(dim=1)
+ print(embeddings)
+ ```
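+
+ Because text classification is listed as an intended downstream use, here is a minimal, hypothetical fine-tuning sketch with `AutoModelForSequenceClassification`; the dataset (SST-2), label count, and training settings are placeholders rather than part of this release:
+
+ ```python
+ from datasets import load_dataset
+ from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
+                           Trainer, TrainingArguments)
+
+ model_name = "agentlans/deberta-v3-base-zyda-2"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
+
+ # Placeholder dataset: any text/label dataset works; SST-2 is used only for illustration.
+ dataset = load_dataset("glue", "sst2")
+ dataset = dataset.map(lambda batch: tokenizer(batch["sentence"], truncation=True), batched=True)
+
+ trainer = Trainer(
+     model=model,
+     args=TrainingArguments(output_dir="deberta-zyda2-sst2", num_train_epochs=1),
+     train_dataset=dataset["train"],
+     eval_dataset=dataset["validation"],
+     tokenizer=tokenizer,  # enables dynamic padding via the default data collator
+ )
+ trainer.train()
+ ```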
+
+ ## Ethical Considerations and Bias
+
+ As this model is trained on a subset of the Zyda-2 dataset, it may inherit biases present in that data. Users should be aware of potential biases and evaluate the model's output critically, especially for sensitive applications.
+
+ ## Additional Information
+
+ For more details about the base model, please refer to [microsoft/deberta-v3-base](https://huggingface.co/microsoft/deberta-v3-base).
added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "[MASK]": 128000
+ }
all_results.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "epoch": 1.0,
+   "eval_accuracy": 0.6191083101642154,
+   "eval_loss": 2.1832997798919678,
+   "eval_runtime": 619.8106,
+   "eval_samples": 36612,
+   "eval_samples_per_second": 59.07,
+   "eval_steps_per_second": 7.385,
+   "perplexity": 8.875545336961899,
+   "total_flos": 1.8427441878551347e+17,
+   "train_loss": 1.5726176189089465,
+   "train_runtime": 27622.4465,
+   "train_samples": 699309,
+   "train_samples_per_second": 25.317,
+   "train_steps_per_second": 6.329
+ }
config.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "_name_or_path": "microsoft/deberta-v3-base",
+   "architectures": [
+     "DebertaV2ForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-07,
+   "max_position_embeddings": 512,
+   "max_relative_positions": -1,
+   "model_type": "deberta-v2",
+   "norm_rel_ebd": "layer_norm",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "pooler_dropout": 0,
+   "pooler_hidden_act": "gelu",
+   "pooler_hidden_size": 768,
+   "pos_att_type": [
+     "p2c",
+     "c2p"
+   ],
+   "position_biased_input": false,
+   "position_buckets": 256,
+   "relative_attention": true,
+   "share_att_key": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.46.3",
+   "type_vocab_size": 0,
+   "vocab_size": 128100
+ }
eval_results.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "epoch": 1.0,
+   "eval_accuracy": 0.6191083101642154,
+   "eval_loss": 2.1832997798919678,
+   "eval_runtime": 619.8106,
+   "eval_samples": 36612,
+   "eval_samples_per_second": 59.07,
+   "eval_steps_per_second": 7.385,
+   "perplexity": 8.875545336961899
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c6027fb50e1e57a250247d1bd078ba866085b473710d84a431d2a6996756fd6
+ size 738231856
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "bos_token": "[CLS]",
+   "cls_token": "[CLS]",
+   "eos_token": "[SEP]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
spm.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
+ size 2464616
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128000": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "[CLS]",
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "eos_token": "[SEP]",
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "sp_model_kwargs": {},
+   "split_by_punct": false,
+   "tokenizer_class": "DebertaV2Tokenizer",
+   "unk_token": "[UNK]",
+   "vocab_type": "spm"
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "epoch": 1.0,
+   "total_flos": 1.8427441878551347e+17,
+   "train_loss": 1.5726176189089465,
+   "train_runtime": 27622.4465,
+   "train_samples": 699309,
+   "train_samples_per_second": 25.317,
+   "train_steps_per_second": 6.329
+ }
trainer_state.json ADDED
@@ -0,0 +1,2485 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 174828,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0028599537831468643,
13
+ "grad_norm": 8.294376373291016,
14
+ "learning_rate": 4.985700231084266e-05,
15
+ "loss": 6.665,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.005719907566293729,
20
+ "grad_norm": 8.308354377746582,
21
+ "learning_rate": 4.971400462168532e-05,
22
+ "loss": 5.1044,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.008579861349440594,
27
+ "grad_norm": 7.017335891723633,
28
+ "learning_rate": 4.9571006932527974e-05,
29
+ "loss": 4.6249,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.011439815132587457,
34
+ "grad_norm": 7.528384685516357,
35
+ "learning_rate": 4.942800924337063e-05,
36
+ "loss": 4.3456,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.014299768915734323,
41
+ "grad_norm": 7.852795600891113,
42
+ "learning_rate": 4.928501155421328e-05,
43
+ "loss": 4.0772,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.017159722698881188,
48
+ "grad_norm": 7.606760025024414,
49
+ "learning_rate": 4.914201386505594e-05,
50
+ "loss": 3.974,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.02001967648202805,
55
+ "grad_norm": 7.45611572265625,
56
+ "learning_rate": 4.8999016175898596e-05,
57
+ "loss": 3.8115,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.022879630265174915,
62
+ "grad_norm": 7.1126861572265625,
63
+ "learning_rate": 4.885601848674126e-05,
64
+ "loss": 3.7529,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.02573958404832178,
69
+ "grad_norm": 7.052072525024414,
70
+ "learning_rate": 4.871302079758392e-05,
71
+ "loss": 3.654,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 0.028599537831468645,
76
+ "grad_norm": 7.367290019989014,
77
+ "learning_rate": 4.8570023108426574e-05,
78
+ "loss": 3.5671,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 0.03145949161461551,
83
+ "grad_norm": 8.534124374389648,
84
+ "learning_rate": 4.842702541926923e-05,
85
+ "loss": 3.5032,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 0.034319445397762376,
90
+ "grad_norm": 7.498807907104492,
91
+ "learning_rate": 4.828402773011189e-05,
92
+ "loss": 3.5244,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 0.037179399180909234,
97
+ "grad_norm": 6.6923298835754395,
98
+ "learning_rate": 4.814103004095454e-05,
99
+ "loss": 3.4814,
100
+ "step": 6500
101
+ },
102
+ {
103
+ "epoch": 0.0400393529640561,
104
+ "grad_norm": 6.853496551513672,
105
+ "learning_rate": 4.7998032351797196e-05,
106
+ "loss": 3.3781,
107
+ "step": 7000
108
+ },
109
+ {
110
+ "epoch": 0.042899306747202964,
111
+ "grad_norm": 7.179901599884033,
112
+ "learning_rate": 4.785503466263985e-05,
113
+ "loss": 3.3461,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 0.04575926053034983,
118
+ "grad_norm": 7.566349506378174,
119
+ "learning_rate": 4.771203697348251e-05,
120
+ "loss": 3.3024,
121
+ "step": 8000
122
+ },
123
+ {
124
+ "epoch": 0.048619214313496695,
125
+ "grad_norm": 7.144839763641357,
126
+ "learning_rate": 4.756903928432517e-05,
127
+ "loss": 3.2582,
128
+ "step": 8500
129
+ },
130
+ {
131
+ "epoch": 0.05147916809664356,
132
+ "grad_norm": 6.929514408111572,
133
+ "learning_rate": 4.7426041595167824e-05,
134
+ "loss": 3.2371,
135
+ "step": 9000
136
+ },
137
+ {
138
+ "epoch": 0.054339121879790425,
139
+ "grad_norm": 6.699374198913574,
140
+ "learning_rate": 4.728304390601048e-05,
141
+ "loss": 3.201,
142
+ "step": 9500
143
+ },
144
+ {
145
+ "epoch": 0.05719907566293729,
146
+ "grad_norm": 5.757383346557617,
147
+ "learning_rate": 4.714004621685314e-05,
148
+ "loss": 3.1877,
149
+ "step": 10000
150
+ },
151
+ {
152
+ "epoch": 0.06005902944608415,
153
+ "grad_norm": 7.66983699798584,
154
+ "learning_rate": 4.6997048527695796e-05,
155
+ "loss": 3.1514,
156
+ "step": 10500
157
+ },
158
+ {
159
+ "epoch": 0.06291898322923102,
160
+ "grad_norm": 7.166614532470703,
161
+ "learning_rate": 4.6854050838538446e-05,
162
+ "loss": 3.1203,
163
+ "step": 11000
164
+ },
165
+ {
166
+ "epoch": 0.06577893701237789,
167
+ "grad_norm": 8.80114459991455,
168
+ "learning_rate": 4.671105314938111e-05,
169
+ "loss": 3.1312,
170
+ "step": 11500
171
+ },
172
+ {
173
+ "epoch": 0.06863889079552475,
174
+ "grad_norm": 29.41587257385254,
175
+ "learning_rate": 4.656805546022377e-05,
176
+ "loss": 3.1019,
177
+ "step": 12000
178
+ },
179
+ {
180
+ "epoch": 0.0714988445786716,
181
+ "grad_norm": 6.1705145835876465,
182
+ "learning_rate": 4.6425057771066424e-05,
183
+ "loss": 3.0878,
184
+ "step": 12500
185
+ },
186
+ {
187
+ "epoch": 0.07435879836181847,
188
+ "grad_norm": 7.218475341796875,
189
+ "learning_rate": 4.628206008190908e-05,
190
+ "loss": 3.0758,
191
+ "step": 13000
192
+ },
193
+ {
194
+ "epoch": 0.07721875214496533,
195
+ "grad_norm": 6.435647964477539,
196
+ "learning_rate": 4.613906239275174e-05,
197
+ "loss": 3.0396,
198
+ "step": 13500
199
+ },
200
+ {
201
+ "epoch": 0.0800787059281122,
202
+ "grad_norm": 7.471750736236572,
203
+ "learning_rate": 4.5996064703594396e-05,
204
+ "loss": 3.0454,
205
+ "step": 14000
206
+ },
207
+ {
208
+ "epoch": 0.08293865971125906,
209
+ "grad_norm": 6.561801910400391,
210
+ "learning_rate": 4.585306701443705e-05,
211
+ "loss": 3.0095,
212
+ "step": 14500
213
+ },
214
+ {
215
+ "epoch": 0.08579861349440593,
216
+ "grad_norm": 7.1273369789123535,
217
+ "learning_rate": 4.57100693252797e-05,
218
+ "loss": 3.0249,
219
+ "step": 15000
220
+ },
221
+ {
222
+ "epoch": 0.0886585672775528,
223
+ "grad_norm": 6.540430545806885,
224
+ "learning_rate": 4.556707163612236e-05,
225
+ "loss": 3.0156,
226
+ "step": 15500
227
+ },
228
+ {
229
+ "epoch": 0.09151852106069966,
230
+ "grad_norm": 6.394286632537842,
231
+ "learning_rate": 4.542407394696502e-05,
232
+ "loss": 2.9634,
233
+ "step": 16000
234
+ },
235
+ {
236
+ "epoch": 0.09437847484384652,
237
+ "grad_norm": 7.856606960296631,
238
+ "learning_rate": 4.5281076257807674e-05,
239
+ "loss": 2.9866,
240
+ "step": 16500
241
+ },
242
+ {
243
+ "epoch": 0.09723842862699339,
244
+ "grad_norm": 7.8352861404418945,
245
+ "learning_rate": 4.513807856865033e-05,
246
+ "loss": 2.9864,
247
+ "step": 17000
248
+ },
249
+ {
250
+ "epoch": 0.10009838241014025,
251
+ "grad_norm": 6.253101348876953,
252
+ "learning_rate": 4.499508087949299e-05,
253
+ "loss": 2.9592,
254
+ "step": 17500
255
+ },
256
+ {
257
+ "epoch": 0.10295833619328712,
258
+ "grad_norm": 6.485994815826416,
259
+ "learning_rate": 4.4852083190335646e-05,
260
+ "loss": 2.9405,
261
+ "step": 18000
262
+ },
263
+ {
264
+ "epoch": 0.10581828997643399,
265
+ "grad_norm": 6.409724712371826,
266
+ "learning_rate": 4.47090855011783e-05,
267
+ "loss": 2.9567,
268
+ "step": 18500
269
+ },
270
+ {
271
+ "epoch": 0.10867824375958085,
272
+ "grad_norm": 7.388598918914795,
273
+ "learning_rate": 4.456608781202096e-05,
274
+ "loss": 2.9123,
275
+ "step": 19000
276
+ },
277
+ {
278
+ "epoch": 0.11153819754272772,
279
+ "grad_norm": 7.503371715545654,
280
+ "learning_rate": 4.442309012286362e-05,
281
+ "loss": 2.922,
282
+ "step": 19500
283
+ },
284
+ {
285
+ "epoch": 0.11439815132587458,
286
+ "grad_norm": 6.702953338623047,
287
+ "learning_rate": 4.4280092433706274e-05,
288
+ "loss": 2.9321,
289
+ "step": 20000
290
+ },
291
+ {
292
+ "epoch": 0.11725810510902143,
293
+ "grad_norm": 7.328106880187988,
294
+ "learning_rate": 4.413709474454893e-05,
295
+ "loss": 2.8965,
296
+ "step": 20500
297
+ },
298
+ {
299
+ "epoch": 0.1201180588921683,
300
+ "grad_norm": 6.787193775177002,
301
+ "learning_rate": 4.399409705539159e-05,
302
+ "loss": 2.9204,
303
+ "step": 21000
304
+ },
305
+ {
306
+ "epoch": 0.12297801267531516,
307
+ "grad_norm": 5.832542896270752,
308
+ "learning_rate": 4.3851099366234246e-05,
309
+ "loss": 2.8724,
310
+ "step": 21500
311
+ },
312
+ {
313
+ "epoch": 0.12583796645846204,
314
+ "grad_norm": 6.784033298492432,
315
+ "learning_rate": 4.37081016770769e-05,
316
+ "loss": 2.889,
317
+ "step": 22000
318
+ },
319
+ {
320
+ "epoch": 0.1286979202416089,
321
+ "grad_norm": 7.457705497741699,
322
+ "learning_rate": 4.356510398791956e-05,
323
+ "loss": 2.8845,
324
+ "step": 22500
325
+ },
326
+ {
327
+ "epoch": 0.13155787402475577,
328
+ "grad_norm": 7.377457141876221,
329
+ "learning_rate": 4.342210629876222e-05,
330
+ "loss": 2.876,
331
+ "step": 23000
332
+ },
333
+ {
334
+ "epoch": 0.13441782780790262,
335
+ "grad_norm": 6.810230731964111,
336
+ "learning_rate": 4.327910860960487e-05,
337
+ "loss": 2.8881,
338
+ "step": 23500
339
+ },
340
+ {
341
+ "epoch": 0.1372777815910495,
342
+ "grad_norm": 6.137091636657715,
343
+ "learning_rate": 4.3136110920447525e-05,
344
+ "loss": 2.8599,
345
+ "step": 24000
346
+ },
347
+ {
348
+ "epoch": 0.14013773537419635,
349
+ "grad_norm": 27.535808563232422,
350
+ "learning_rate": 4.299311323129018e-05,
351
+ "loss": 2.8432,
352
+ "step": 24500
353
+ },
354
+ {
355
+ "epoch": 0.1429976891573432,
356
+ "grad_norm": 6.044827461242676,
357
+ "learning_rate": 4.285011554213284e-05,
358
+ "loss": 2.8644,
359
+ "step": 25000
360
+ },
361
+ {
362
+ "epoch": 0.14585764294049008,
363
+ "grad_norm": 6.300295829772949,
364
+ "learning_rate": 4.2707117852975496e-05,
365
+ "loss": 2.8269,
366
+ "step": 25500
367
+ },
368
+ {
369
+ "epoch": 0.14871759672363694,
370
+ "grad_norm": 5.811293125152588,
371
+ "learning_rate": 4.256412016381815e-05,
372
+ "loss": 2.8308,
373
+ "step": 26000
374
+ },
375
+ {
376
+ "epoch": 0.15157755050678381,
377
+ "grad_norm": 6.52765417098999,
378
+ "learning_rate": 4.242112247466081e-05,
379
+ "loss": 2.8309,
380
+ "step": 26500
381
+ },
382
+ {
383
+ "epoch": 0.15443750428993067,
384
+ "grad_norm": 6.731512546539307,
385
+ "learning_rate": 4.227812478550347e-05,
386
+ "loss": 2.8066,
387
+ "step": 27000
388
+ },
389
+ {
390
+ "epoch": 0.15729745807307755,
391
+ "grad_norm": 6.837157249450684,
392
+ "learning_rate": 4.2135127096346125e-05,
393
+ "loss": 2.8282,
394
+ "step": 27500
395
+ },
396
+ {
397
+ "epoch": 0.1601574118562244,
398
+ "grad_norm": 5.657121181488037,
399
+ "learning_rate": 4.199212940718878e-05,
400
+ "loss": 2.7904,
401
+ "step": 28000
402
+ },
403
+ {
404
+ "epoch": 0.16301736563937128,
405
+ "grad_norm": 8.501928329467773,
406
+ "learning_rate": 4.184913171803144e-05,
407
+ "loss": 2.8224,
408
+ "step": 28500
409
+ },
410
+ {
411
+ "epoch": 0.16587731942251813,
412
+ "grad_norm": 6.447242736816406,
413
+ "learning_rate": 4.1706134028874096e-05,
414
+ "loss": 2.7731,
415
+ "step": 29000
416
+ },
417
+ {
418
+ "epoch": 0.168737273205665,
419
+ "grad_norm": 6.049993991851807,
420
+ "learning_rate": 4.156313633971675e-05,
421
+ "loss": 2.7726,
422
+ "step": 29500
423
+ },
424
+ {
425
+ "epoch": 0.17159722698881186,
426
+ "grad_norm": 5.747082710266113,
427
+ "learning_rate": 4.142013865055941e-05,
428
+ "loss": 2.799,
429
+ "step": 30000
430
+ },
431
+ {
432
+ "epoch": 0.17445718077195874,
433
+ "grad_norm": 6.7925615310668945,
434
+ "learning_rate": 4.127714096140207e-05,
435
+ "loss": 2.7704,
436
+ "step": 30500
437
+ },
438
+ {
439
+ "epoch": 0.1773171345551056,
440
+ "grad_norm": 7.164943218231201,
441
+ "learning_rate": 4.1134143272244725e-05,
442
+ "loss": 2.7611,
443
+ "step": 31000
444
+ },
445
+ {
446
+ "epoch": 0.18017708833825247,
447
+ "grad_norm": 6.813632011413574,
448
+ "learning_rate": 4.099114558308738e-05,
449
+ "loss": 2.7637,
450
+ "step": 31500
451
+ },
452
+ {
453
+ "epoch": 0.18303704212139932,
454
+ "grad_norm": 5.981168270111084,
455
+ "learning_rate": 4.084814789393003e-05,
456
+ "loss": 2.7495,
457
+ "step": 32000
458
+ },
459
+ {
460
+ "epoch": 0.18589699590454617,
461
+ "grad_norm": 6.125492095947266,
462
+ "learning_rate": 4.070515020477269e-05,
463
+ "loss": 2.7592,
464
+ "step": 32500
465
+ },
466
+ {
467
+ "epoch": 0.18875694968769305,
468
+ "grad_norm": 44.21103286743164,
469
+ "learning_rate": 4.0562152515615347e-05,
470
+ "loss": 2.7711,
471
+ "step": 33000
472
+ },
473
+ {
474
+ "epoch": 0.1916169034708399,
475
+ "grad_norm": 5.714451789855957,
476
+ "learning_rate": 4.0419154826458004e-05,
477
+ "loss": 2.7456,
478
+ "step": 33500
479
+ },
480
+ {
481
+ "epoch": 0.19447685725398678,
482
+ "grad_norm": 5.732424736022949,
483
+ "learning_rate": 4.027615713730066e-05,
484
+ "loss": 2.7412,
485
+ "step": 34000
486
+ },
487
+ {
488
+ "epoch": 0.19733681103713363,
489
+ "grad_norm": 7.989277362823486,
490
+ "learning_rate": 4.013315944814332e-05,
491
+ "loss": 2.7429,
492
+ "step": 34500
493
+ },
494
+ {
495
+ "epoch": 0.2001967648202805,
496
+ "grad_norm": 6.200708389282227,
497
+ "learning_rate": 3.9990161758985975e-05,
498
+ "loss": 2.7497,
499
+ "step": 35000
500
+ },
501
+ {
502
+ "epoch": 0.20305671860342736,
503
+ "grad_norm": 6.867748260498047,
504
+ "learning_rate": 3.984716406982863e-05,
505
+ "loss": 2.7387,
506
+ "step": 35500
507
+ },
508
+ {
509
+ "epoch": 0.20591667238657424,
510
+ "grad_norm": 5.795921325683594,
511
+ "learning_rate": 3.970416638067129e-05,
512
+ "loss": 2.7262,
513
+ "step": 36000
514
+ },
515
+ {
516
+ "epoch": 0.2087766261697211,
517
+ "grad_norm": 6.110116958618164,
518
+ "learning_rate": 3.9561168691513947e-05,
519
+ "loss": 2.7416,
520
+ "step": 36500
521
+ },
522
+ {
523
+ "epoch": 0.21163657995286797,
524
+ "grad_norm": 6.253924369812012,
525
+ "learning_rate": 3.9418171002356604e-05,
526
+ "loss": 2.7215,
527
+ "step": 37000
528
+ },
529
+ {
530
+ "epoch": 0.21449653373601482,
531
+ "grad_norm": 6.117007732391357,
532
+ "learning_rate": 3.927517331319926e-05,
533
+ "loss": 2.7231,
534
+ "step": 37500
535
+ },
536
+ {
537
+ "epoch": 0.2173564875191617,
538
+ "grad_norm": 8.227131843566895,
539
+ "learning_rate": 3.913217562404192e-05,
540
+ "loss": 2.71,
541
+ "step": 38000
542
+ },
543
+ {
544
+ "epoch": 0.22021644130230855,
545
+ "grad_norm": 6.146326541900635,
546
+ "learning_rate": 3.8989177934884575e-05,
547
+ "loss": 2.7076,
548
+ "step": 38500
549
+ },
550
+ {
551
+ "epoch": 0.22307639508545543,
552
+ "grad_norm": 6.7277398109436035,
553
+ "learning_rate": 3.884618024572723e-05,
554
+ "loss": 2.7226,
555
+ "step": 39000
556
+ },
557
+ {
558
+ "epoch": 0.22593634886860228,
559
+ "grad_norm": 6.300662994384766,
560
+ "learning_rate": 3.870318255656989e-05,
561
+ "loss": 2.7025,
562
+ "step": 39500
563
+ },
564
+ {
565
+ "epoch": 0.22879630265174916,
566
+ "grad_norm": 5.755123138427734,
567
+ "learning_rate": 3.8560184867412547e-05,
568
+ "loss": 2.7025,
569
+ "step": 40000
570
+ },
571
+ {
572
+ "epoch": 0.231656256434896,
573
+ "grad_norm": 6.393768310546875,
574
+ "learning_rate": 3.8417187178255204e-05,
575
+ "loss": 2.7113,
576
+ "step": 40500
577
+ },
578
+ {
579
+ "epoch": 0.23451621021804286,
580
+ "grad_norm": 5.855433464050293,
581
+ "learning_rate": 3.8274189489097854e-05,
582
+ "loss": 2.721,
583
+ "step": 41000
584
+ },
585
+ {
586
+ "epoch": 0.23737616400118974,
587
+ "grad_norm": 4.719547271728516,
588
+ "learning_rate": 3.813119179994051e-05,
589
+ "loss": 2.6774,
590
+ "step": 41500
591
+ },
592
+ {
593
+ "epoch": 0.2402361177843366,
594
+ "grad_norm": 5.75437068939209,
595
+ "learning_rate": 3.798819411078317e-05,
596
+ "loss": 2.6922,
597
+ "step": 42000
598
+ },
599
+ {
600
+ "epoch": 0.24309607156748347,
601
+ "grad_norm": 6.258277416229248,
602
+ "learning_rate": 3.7845196421625825e-05,
603
+ "loss": 2.701,
604
+ "step": 42500
605
+ },
606
+ {
607
+ "epoch": 0.24595602535063033,
608
+ "grad_norm": 5.8440165519714355,
609
+ "learning_rate": 3.770219873246848e-05,
610
+ "loss": 2.697,
611
+ "step": 43000
612
+ },
613
+ {
614
+ "epoch": 0.2488159791337772,
615
+ "grad_norm": 5.4940009117126465,
616
+ "learning_rate": 3.755920104331114e-05,
617
+ "loss": 2.6826,
618
+ "step": 43500
619
+ },
620
+ {
621
+ "epoch": 0.2516759329169241,
622
+ "grad_norm": 8.00302791595459,
623
+ "learning_rate": 3.7416203354153804e-05,
624
+ "loss": 2.6782,
625
+ "step": 44000
626
+ },
627
+ {
628
+ "epoch": 0.25453588670007093,
629
+ "grad_norm": 6.31597375869751,
630
+ "learning_rate": 3.727320566499646e-05,
631
+ "loss": 2.7113,
632
+ "step": 44500
633
+ },
634
+ {
635
+ "epoch": 0.2573958404832178,
636
+ "grad_norm": 6.734432697296143,
637
+ "learning_rate": 3.713020797583911e-05,
638
+ "loss": 2.6883,
639
+ "step": 45000
640
+ },
641
+ {
642
+ "epoch": 0.26025579426636464,
643
+ "grad_norm": 8.607872009277344,
644
+ "learning_rate": 3.698721028668177e-05,
645
+ "loss": 2.675,
646
+ "step": 45500
647
+ },
648
+ {
649
+ "epoch": 0.26311574804951154,
650
+ "grad_norm": 6.785426139831543,
651
+ "learning_rate": 3.6844212597524425e-05,
652
+ "loss": 2.6662,
653
+ "step": 46000
654
+ },
655
+ {
656
+ "epoch": 0.2659757018326584,
657
+ "grad_norm": 5.7255072593688965,
658
+ "learning_rate": 3.670121490836708e-05,
659
+ "loss": 2.6566,
660
+ "step": 46500
661
+ },
662
+ {
663
+ "epoch": 0.26883565561580525,
664
+ "grad_norm": 5.778408527374268,
665
+ "learning_rate": 3.655821721920974e-05,
666
+ "loss": 2.6869,
667
+ "step": 47000
668
+ },
669
+ {
670
+ "epoch": 0.2716956093989521,
671
+ "grad_norm": 7.3644490242004395,
672
+ "learning_rate": 3.64152195300524e-05,
673
+ "loss": 2.6548,
674
+ "step": 47500
675
+ },
676
+ {
677
+ "epoch": 0.274555563182099,
678
+ "grad_norm": 9.922218322753906,
679
+ "learning_rate": 3.6272221840895054e-05,
680
+ "loss": 2.6548,
681
+ "step": 48000
682
+ },
683
+ {
684
+ "epoch": 0.27741551696524586,
685
+ "grad_norm": 6.6563944816589355,
686
+ "learning_rate": 3.612922415173771e-05,
687
+ "loss": 2.6466,
688
+ "step": 48500
689
+ },
690
+ {
691
+ "epoch": 0.2802754707483927,
692
+ "grad_norm": 5.308610439300537,
693
+ "learning_rate": 3.598622646258037e-05,
694
+ "loss": 2.6744,
695
+ "step": 49000
696
+ },
697
+ {
698
+ "epoch": 0.28313542453153956,
699
+ "grad_norm": 6.213603973388672,
700
+ "learning_rate": 3.584322877342302e-05,
701
+ "loss": 2.6484,
702
+ "step": 49500
703
+ },
704
+ {
705
+ "epoch": 0.2859953783146864,
706
+ "grad_norm": 5.715392589569092,
707
+ "learning_rate": 3.5700231084265676e-05,
708
+ "loss": 2.6573,
709
+ "step": 50000
710
+ },
711
+ {
712
+ "epoch": 0.2888553320978333,
713
+ "grad_norm": 6.067576885223389,
714
+ "learning_rate": 3.555723339510833e-05,
715
+ "loss": 2.6487,
716
+ "step": 50500
717
+ },
718
+ {
719
+ "epoch": 0.29171528588098017,
720
+ "grad_norm": 6.300750255584717,
721
+ "learning_rate": 3.541423570595099e-05,
722
+ "loss": 2.6445,
723
+ "step": 51000
724
+ },
725
+ {
726
+ "epoch": 0.294575239664127,
727
+ "grad_norm": 6.036895275115967,
728
+ "learning_rate": 3.5271238016793654e-05,
729
+ "loss": 2.6756,
730
+ "step": 51500
731
+ },
732
+ {
733
+ "epoch": 0.29743519344727387,
734
+ "grad_norm": 5.856159687042236,
735
+ "learning_rate": 3.512824032763631e-05,
736
+ "loss": 2.6415,
737
+ "step": 52000
738
+ },
739
+ {
740
+ "epoch": 0.3002951472304208,
741
+ "grad_norm": 12.173583984375,
742
+ "learning_rate": 3.498524263847897e-05,
743
+ "loss": 2.6386,
744
+ "step": 52500
745
+ },
746
+ {
747
+ "epoch": 0.30315510101356763,
748
+ "grad_norm": 5.493927478790283,
749
+ "learning_rate": 3.4842244949321625e-05,
750
+ "loss": 2.6515,
751
+ "step": 53000
752
+ },
753
+ {
754
+ "epoch": 0.3060150547967145,
755
+ "grad_norm": 5.786694526672363,
756
+ "learning_rate": 3.4699247260164276e-05,
757
+ "loss": 2.6449,
758
+ "step": 53500
759
+ },
760
+ {
761
+ "epoch": 0.30887500857986133,
762
+ "grad_norm": 5.755667686462402,
763
+ "learning_rate": 3.455624957100693e-05,
764
+ "loss": 2.6357,
765
+ "step": 54000
766
+ },
767
+ {
768
+ "epoch": 0.31173496236300824,
769
+ "grad_norm": 5.9297027587890625,
770
+ "learning_rate": 3.441325188184959e-05,
771
+ "loss": 2.6493,
772
+ "step": 54500
773
+ },
774
+ {
775
+ "epoch": 0.3145949161461551,
776
+ "grad_norm": 6.182466983795166,
777
+ "learning_rate": 3.427025419269225e-05,
778
+ "loss": 2.6298,
779
+ "step": 55000
780
+ },
781
+ {
782
+ "epoch": 0.31745486992930194,
783
+ "grad_norm": 6.565801620483398,
784
+ "learning_rate": 3.4127256503534904e-05,
785
+ "loss": 2.6611,
786
+ "step": 55500
787
+ },
788
+ {
789
+ "epoch": 0.3203148237124488,
790
+ "grad_norm": 5.94129753112793,
791
+ "learning_rate": 3.398425881437756e-05,
792
+ "loss": 2.6201,
793
+ "step": 56000
794
+ },
795
+ {
796
+ "epoch": 0.32317477749559564,
797
+ "grad_norm": 6.72519063949585,
798
+ "learning_rate": 3.384126112522022e-05,
799
+ "loss": 2.5961,
800
+ "step": 56500
801
+ },
802
+ {
803
+ "epoch": 0.32603473127874255,
804
+ "grad_norm": 6.440931797027588,
805
+ "learning_rate": 3.3698263436062876e-05,
806
+ "loss": 2.6322,
807
+ "step": 57000
808
+ },
809
+ {
810
+ "epoch": 0.3288946850618894,
811
+ "grad_norm": 6.059328079223633,
812
+ "learning_rate": 3.355526574690553e-05,
813
+ "loss": 2.615,
814
+ "step": 57500
815
+ },
816
+ {
817
+ "epoch": 0.33175463884503625,
818
+ "grad_norm": 6.007944107055664,
819
+ "learning_rate": 3.341226805774818e-05,
820
+ "loss": 2.5992,
821
+ "step": 58000
822
+ },
823
+ {
824
+ "epoch": 0.3346145926281831,
825
+ "grad_norm": 6.9386982917785645,
826
+ "learning_rate": 3.326927036859084e-05,
827
+ "loss": 2.6317,
828
+ "step": 58500
829
+ },
830
+ {
831
+ "epoch": 0.33747454641133,
832
+ "grad_norm": 5.493308067321777,
833
+ "learning_rate": 3.3126272679433504e-05,
834
+ "loss": 2.5975,
835
+ "step": 59000
836
+ },
837
+ {
838
+ "epoch": 0.34033450019447686,
839
+ "grad_norm": 7.026157855987549,
840
+ "learning_rate": 3.298327499027616e-05,
841
+ "loss": 2.6139,
842
+ "step": 59500
843
+ },
844
+ {
845
+ "epoch": 0.3431944539776237,
846
+ "grad_norm": 5.790646553039551,
847
+ "learning_rate": 3.284027730111882e-05,
848
+ "loss": 2.5916,
849
+ "step": 60000
850
+ },
851
+ {
852
+ "epoch": 0.34605440776077057,
853
+ "grad_norm": 5.980741024017334,
854
+ "learning_rate": 3.2697279611961476e-05,
855
+ "loss": 2.5811,
856
+ "step": 60500
857
+ },
858
+ {
859
+ "epoch": 0.3489143615439175,
860
+ "grad_norm": 6.555883407592773,
861
+ "learning_rate": 3.255428192280413e-05,
862
+ "loss": 2.5909,
863
+ "step": 61000
864
+ },
865
+ {
866
+ "epoch": 0.3517743153270643,
867
+ "grad_norm": 5.8480706214904785,
868
+ "learning_rate": 3.241128423364679e-05,
869
+ "loss": 2.6127,
870
+ "step": 61500
871
+ },
872
+ {
873
+ "epoch": 0.3546342691102112,
874
+ "grad_norm": 6.341095924377441,
875
+ "learning_rate": 3.226828654448944e-05,
876
+ "loss": 2.5959,
877
+ "step": 62000
878
+ },
879
+ {
880
+ "epoch": 0.357494222893358,
881
+ "grad_norm": 5.832342147827148,
882
+ "learning_rate": 3.21252888553321e-05,
883
+ "loss": 2.5946,
884
+ "step": 62500
885
+ },
886
+ {
887
+ "epoch": 0.36035417667650493,
888
+ "grad_norm": 6.495291709899902,
889
+ "learning_rate": 3.1982291166174755e-05,
890
+ "loss": 2.6122,
891
+ "step": 63000
892
+ },
893
+ {
894
+ "epoch": 0.3632141304596518,
895
+ "grad_norm": 6.527446746826172,
896
+ "learning_rate": 3.183929347701741e-05,
897
+ "loss": 2.573,
898
+ "step": 63500
899
+ },
900
+ {
901
+ "epoch": 0.36607408424279864,
902
+ "grad_norm": 6.4324951171875,
903
+ "learning_rate": 3.169629578786007e-05,
904
+ "loss": 2.6119,
905
+ "step": 64000
906
+ },
907
+ {
908
+ "epoch": 0.3689340380259455,
909
+ "grad_norm": 7.166018009185791,
910
+ "learning_rate": 3.1553298098702726e-05,
911
+ "loss": 2.6124,
912
+ "step": 64500
913
+ },
914
+ {
915
+ "epoch": 0.37179399180909234,
916
+ "grad_norm": 6.462119102478027,
917
+ "learning_rate": 3.141030040954538e-05,
918
+ "loss": 2.5552,
919
+ "step": 65000
920
+ },
921
+ {
922
+ "epoch": 0.37465394559223925,
923
+ "grad_norm": 6.0564703941345215,
924
+ "learning_rate": 3.126730272038804e-05,
925
+ "loss": 2.5672,
926
+ "step": 65500
927
+ },
928
+ {
929
+ "epoch": 0.3775138993753861,
930
+ "grad_norm": 5.307662487030029,
931
+ "learning_rate": 3.11243050312307e-05,
932
+ "loss": 2.5611,
933
+ "step": 66000
934
+ },
935
+ {
936
+ "epoch": 0.38037385315853295,
937
+ "grad_norm": 5.18694543838501,
938
+ "learning_rate": 3.0981307342073355e-05,
939
+ "loss": 2.5691,
940
+ "step": 66500
941
+ },
942
+ {
943
+ "epoch": 0.3832338069416798,
944
+ "grad_norm": 5.568657398223877,
945
+ "learning_rate": 3.083830965291601e-05,
946
+ "loss": 2.575,
947
+ "step": 67000
948
+ },
949
+ {
950
+ "epoch": 0.3860937607248267,
951
+ "grad_norm": 10.616528511047363,
952
+ "learning_rate": 3.069531196375867e-05,
953
+ "loss": 2.5886,
954
+ "step": 67500
955
+ },
956
+ {
957
+ "epoch": 0.38895371450797356,
958
+ "grad_norm": 6.7568206787109375,
959
+ "learning_rate": 3.0552314274601326e-05,
960
+ "loss": 2.5822,
961
+ "step": 68000
962
+ },
963
+ {
964
+ "epoch": 0.3918136682911204,
965
+ "grad_norm": 6.087740421295166,
966
+ "learning_rate": 3.040931658544398e-05,
967
+ "loss": 2.5472,
968
+ "step": 68500
969
+ },
970
+ {
971
+ "epoch": 0.39467362207426726,
972
+ "grad_norm": 6.702504634857178,
973
+ "learning_rate": 3.0266318896286637e-05,
974
+ "loss": 2.5897,
975
+ "step": 69000
976
+ },
977
+ {
978
+ "epoch": 0.39753357585741417,
979
+ "grad_norm": 6.2178053855896,
980
+ "learning_rate": 3.0123321207129297e-05,
981
+ "loss": 2.5698,
982
+ "step": 69500
983
+ },
984
+ {
985
+ "epoch": 0.400393529640561,
986
+ "grad_norm": 6.559543609619141,
987
+ "learning_rate": 2.9980323517971955e-05,
988
+ "loss": 2.5725,
989
+ "step": 70000
990
+ },
991
+ {
992
+ "epoch": 0.40325348342370787,
993
+ "grad_norm": 5.918066501617432,
994
+ "learning_rate": 2.9837325828814605e-05,
995
+ "loss": 2.5847,
996
+ "step": 70500
997
+ },
998
+ {
999
+ "epoch": 0.4061134372068547,
1000
+ "grad_norm": 5.602575778961182,
1001
+ "learning_rate": 2.9694328139657262e-05,
1002
+ "loss": 2.583,
1003
+ "step": 71000
1004
+ },
1005
+ {
1006
+ "epoch": 0.40897339099000163,
1007
+ "grad_norm": 5.304308891296387,
1008
+ "learning_rate": 2.955133045049992e-05,
1009
+ "loss": 2.5632,
1010
+ "step": 71500
1011
+ },
1012
+ {
1013
+ "epoch": 0.4118333447731485,
1014
+ "grad_norm": 5.540666103363037,
1015
+ "learning_rate": 2.9408332761342576e-05,
1016
+ "loss": 2.5756,
1017
+ "step": 72000
1018
+ },
1019
+ {
1020
+ "epoch": 0.41469329855629533,
1021
+ "grad_norm": 6.2000861167907715,
1022
+ "learning_rate": 2.9265335072185234e-05,
1023
+ "loss": 2.5357,
1024
+ "step": 72500
1025
+ },
1026
+ {
1027
+ "epoch": 0.4175532523394422,
1028
+ "grad_norm": 5.1564459800720215,
1029
+ "learning_rate": 2.912233738302789e-05,
1030
+ "loss": 2.5516,
1031
+ "step": 73000
1032
+ },
1033
+ {
1034
+ "epoch": 0.42041320612258903,
1035
+ "grad_norm": 6.008329391479492,
1036
+ "learning_rate": 2.897933969387055e-05,
1037
+ "loss": 2.5738,
1038
+ "step": 73500
1039
+ },
1040
+ {
1041
+ "epoch": 0.42327315990573594,
1042
+ "grad_norm": 6.52450704574585,
1043
+ "learning_rate": 2.883634200471321e-05,
1044
+ "loss": 2.578,
1045
+ "step": 74000
1046
+ },
1047
+ {
1048
+ "epoch": 0.4261331136888828,
1049
+ "grad_norm": 5.788220405578613,
1050
+ "learning_rate": 2.8693344315555866e-05,
1051
+ "loss": 2.5578,
1052
+ "step": 74500
1053
+ },
1054
+ {
1055
+ "epoch": 0.42899306747202964,
1056
+ "grad_norm": 5.5810112953186035,
1057
+ "learning_rate": 2.8550346626398516e-05,
1058
+ "loss": 2.5643,
1059
+ "step": 75000
1060
+ },
1061
+ {
1062
+ "epoch": 0.4318530212551765,
1063
+ "grad_norm": 5.334226608276367,
1064
+ "learning_rate": 2.8407348937241173e-05,
1065
+ "loss": 2.5376,
1066
+ "step": 75500
1067
+ },
1068
+ {
1069
+ "epoch": 0.4347129750383234,
1070
+ "grad_norm": 5.804100513458252,
1071
+ "learning_rate": 2.826435124808383e-05,
1072
+ "loss": 2.541,
1073
+ "step": 76000
1074
+ },
1075
+ {
1076
+ "epoch": 0.43757292882147025,
1077
+ "grad_norm": 5.555410385131836,
1078
+ "learning_rate": 2.8121353558926487e-05,
1079
+ "loss": 2.5364,
1080
+ "step": 76500
1081
+ },
1082
+ {
1083
+ "epoch": 0.4404328826046171,
1084
+ "grad_norm": 5.454427719116211,
1085
+ "learning_rate": 2.7978355869769148e-05,
1086
+ "loss": 2.5602,
1087
+ "step": 77000
1088
+ },
1089
+ {
1090
+ "epoch": 0.44329283638776396,
1091
+ "grad_norm": 16.772747039794922,
1092
+ "learning_rate": 2.7835358180611805e-05,
1093
+ "loss": 2.5674,
1094
+ "step": 77500
1095
+ },
1096
+ {
1097
+ "epoch": 0.44615279017091086,
1098
+ "grad_norm": 8.047761917114258,
1099
+ "learning_rate": 2.7692360491454462e-05,
1100
+ "loss": 2.5334,
1101
+ "step": 78000
1102
+ },
1103
+ {
1104
+ "epoch": 0.4490127439540577,
1105
+ "grad_norm": 6.612277507781982,
1106
+ "learning_rate": 2.754936280229712e-05,
1107
+ "loss": 2.5525,
1108
+ "step": 78500
1109
+ },
1110
+ {
1111
+ "epoch": 0.45187269773720457,
1112
+ "grad_norm": 6.439370632171631,
1113
+ "learning_rate": 2.740636511313977e-05,
1114
+ "loss": 2.5349,
1115
+ "step": 79000
1116
+ },
1117
+ {
1118
+ "epoch": 0.4547326515203514,
1119
+ "grad_norm": 6.890873908996582,
1120
+ "learning_rate": 2.7263367423982427e-05,
1121
+ "loss": 2.5145,
1122
+ "step": 79500
1123
+ },
1124
+ {
1125
+ "epoch": 0.4575926053034983,
1126
+ "grad_norm": 5.4768500328063965,
1127
+ "learning_rate": 2.7120369734825084e-05,
1128
+ "loss": 2.5277,
1129
+ "step": 80000
1130
+ },
1131
+ {
1132
+ "epoch": 0.4604525590866452,
1133
+ "grad_norm": 5.825018405914307,
1134
+ "learning_rate": 2.697737204566774e-05,
1135
+ "loss": 2.5505,
1136
+ "step": 80500
1137
+ },
1138
+ {
1139
+ "epoch": 0.463312512869792,
1140
+ "grad_norm": 6.583479881286621,
1141
+ "learning_rate": 2.68343743565104e-05,
1142
+ "loss": 2.5562,
1143
+ "step": 81000
1144
+ },
1145
+ {
1146
+ "epoch": 0.4661724666529389,
1147
+ "grad_norm": 6.420114040374756,
1148
+ "learning_rate": 2.669137666735306e-05,
1149
+ "loss": 2.5094,
1150
+ "step": 81500
1151
+ },
1152
+ {
1153
+ "epoch": 0.46903242043608573,
1154
+ "grad_norm": 6.8168110847473145,
1155
+ "learning_rate": 2.6548378978195716e-05,
1156
+ "loss": 2.5347,
1157
+ "step": 82000
1158
+ },
1159
+ {
1160
+ "epoch": 0.47189237421923264,
1161
+ "grad_norm": 6.224096298217773,
1162
+ "learning_rate": 2.6405381289038373e-05,
1163
+ "loss": 2.5154,
1164
+ "step": 82500
1165
+ },
1166
+ {
1167
+ "epoch": 0.4747523280023795,
1168
+ "grad_norm": 6.240240097045898,
1169
+ "learning_rate": 2.626238359988103e-05,
1170
+ "loss": 2.535,
1171
+ "step": 83000
1172
+ },
1173
+ {
1174
+ "epoch": 0.47761228178552634,
1175
+ "grad_norm": 6.053983211517334,
1176
+ "learning_rate": 2.611938591072368e-05,
1177
+ "loss": 2.5275,
1178
+ "step": 83500
1179
+ },
1180
+ {
1181
+ "epoch": 0.4804722355686732,
1182
+ "grad_norm": 5.546879768371582,
1183
+ "learning_rate": 2.5976388221566338e-05,
1184
+ "loss": 2.5329,
1185
+ "step": 84000
1186
+ },
1187
+ {
1188
+ "epoch": 0.4833321893518201,
1189
+ "grad_norm": 6.190423011779785,
1190
+ "learning_rate": 2.5833390532408995e-05,
1191
+ "loss": 2.5174,
1192
+ "step": 84500
1193
+ },
1194
+ {
1195
+ "epoch": 0.48619214313496695,
1196
+ "grad_norm": 5.437402248382568,
1197
+ "learning_rate": 2.5690392843251655e-05,
1198
+ "loss": 2.49,
1199
+ "step": 85000
1200
+ },
1201
+ {
1202
+ "epoch": 0.4890520969181138,
1203
+ "grad_norm": 6.8163557052612305,
1204
+ "learning_rate": 2.5547395154094312e-05,
1205
+ "loss": 2.524,
1206
+ "step": 85500
1207
+ },
1208
+ {
1209
+ "epoch": 0.49191205070126065,
1210
+ "grad_norm": 6.754604816436768,
1211
+ "learning_rate": 2.540439746493697e-05,
1212
+ "loss": 2.5041,
1213
+ "step": 86000
1214
+ },
1215
+ {
1216
+ "epoch": 0.49477200448440756,
1217
+ "grad_norm": 5.496472358703613,
1218
+ "learning_rate": 2.5261399775779627e-05,
1219
+ "loss": 2.5277,
1220
+ "step": 86500
1221
+ },
1222
+ {
1223
+ "epoch": 0.4976319582675544,
1224
+ "grad_norm": 5.616280555725098,
1225
+ "learning_rate": 2.5118402086622284e-05,
1226
+ "loss": 2.5061,
1227
+ "step": 87000
1228
+ },
1229
+ {
1230
+ "epoch": 0.5004919120507013,
1231
+ "grad_norm": 6.141283988952637,
1232
+ "learning_rate": 2.4975404397464938e-05,
1233
+ "loss": 2.5214,
1234
+ "step": 87500
1235
+ },
1236
+ {
1237
+ "epoch": 0.5033518658338482,
1238
+ "grad_norm": 6.124631404876709,
1239
+ "learning_rate": 2.4832406708307595e-05,
1240
+ "loss": 2.4854,
1241
+ "step": 88000
1242
+ },
1243
+ {
1244
+ "epoch": 0.506211819616995,
1245
+ "grad_norm": 6.740499496459961,
1246
+ "learning_rate": 2.4689409019150252e-05,
1247
+ "loss": 2.5054,
1248
+ "step": 88500
1249
+ },
1250
+ {
1251
+ "epoch": 0.5090717734001419,
1252
+ "grad_norm": 6.040327548980713,
1253
+ "learning_rate": 2.454641132999291e-05,
1254
+ "loss": 2.5042,
1255
+ "step": 89000
1256
+ },
1257
+ {
1258
+ "epoch": 0.5119317271832887,
1259
+ "grad_norm": 5.564330577850342,
1260
+ "learning_rate": 2.4403413640835566e-05,
1261
+ "loss": 2.5021,
1262
+ "step": 89500
1263
+ },
1264
+ {
1265
+ "epoch": 0.5147916809664356,
1266
+ "grad_norm": 6.915059566497803,
1267
+ "learning_rate": 2.4260415951678223e-05,
1268
+ "loss": 2.5227,
1269
+ "step": 90000
1270
+ },
1271
+ {
1272
+ "epoch": 0.5176516347495824,
1273
+ "grad_norm": 6.181910991668701,
1274
+ "learning_rate": 2.411741826252088e-05,
1275
+ "loss": 2.5098,
1276
+ "step": 90500
1277
+ },
1278
+ {
1279
+ "epoch": 0.5205115885327293,
1280
+ "grad_norm": 5.829164505004883,
1281
+ "learning_rate": 2.3974420573363534e-05,
1282
+ "loss": 2.5133,
1283
+ "step": 91000
1284
+ },
1285
+ {
1286
+ "epoch": 0.5233715423158761,
1287
+ "grad_norm": 14.621573448181152,
1288
+ "learning_rate": 2.383142288420619e-05,
1289
+ "loss": 2.503,
1290
+ "step": 91500
1291
+ },
1292
+ {
1293
+ "epoch": 0.5262314960990231,
1294
+ "grad_norm": 6.3930511474609375,
1295
+ "learning_rate": 2.368842519504885e-05,
1296
+ "loss": 2.5124,
1297
+ "step": 92000
1298
+ },
1299
+ {
1300
+ "epoch": 0.5290914498821699,
1301
+ "grad_norm": 5.840575695037842,
1302
+ "learning_rate": 2.3545427505891506e-05,
1303
+ "loss": 2.5177,
1304
+ "step": 92500
1305
+ },
1306
+ {
1307
+ "epoch": 0.5319514036653168,
1308
+ "grad_norm": 6.612518787384033,
1309
+ "learning_rate": 2.3402429816734163e-05,
1310
+ "loss": 2.4881,
1311
+ "step": 93000
1312
+ },
1313
+ {
1314
+ "epoch": 0.5348113574484636,
1315
+ "grad_norm": 6.505732536315918,
1316
+ "learning_rate": 2.325943212757682e-05,
1317
+ "loss": 2.4872,
1318
+ "step": 93500
1319
+ },
1320
+ {
1321
+ "epoch": 0.5376713112316105,
1322
+ "grad_norm": 7.19988489151001,
1323
+ "learning_rate": 2.3116434438419477e-05,
1324
+ "loss": 2.4958,
1325
+ "step": 94000
1326
+ },
1327
+ {
1328
+ "epoch": 0.5405312650147573,
1329
+ "grad_norm": 5.988187789916992,
1330
+ "learning_rate": 2.2973436749262134e-05,
1331
+ "loss": 2.5094,
1332
+ "step": 94500
1333
+ },
1334
+ {
1335
+ "epoch": 0.5433912187979042,
1336
+ "grad_norm": 5.709506511688232,
1337
+ "learning_rate": 2.2830439060104788e-05,
1338
+ "loss": 2.4882,
1339
+ "step": 95000
1340
+ },
1341
+ {
1342
+ "epoch": 0.546251172581051,
1343
+ "grad_norm": 5.567132949829102,
1344
+ "learning_rate": 2.2687441370947445e-05,
1345
+ "loss": 2.4909,
1346
+ "step": 95500
1347
+ },
1348
+ {
1349
+ "epoch": 0.549111126364198,
1350
+ "grad_norm": 11.825920104980469,
1351
+ "learning_rate": 2.2544443681790102e-05,
1352
+ "loss": 2.4944,
1353
+ "step": 96000
1354
+ },
1355
+ {
1356
+ "epoch": 0.5519710801473449,
1357
+ "grad_norm": 5.969587802886963,
1358
+ "learning_rate": 2.240144599263276e-05,
1359
+ "loss": 2.4912,
1360
+ "step": 96500
1361
+ },
1362
+ {
1363
+ "epoch": 0.5548310339304917,
1364
+ "grad_norm": 6.31153678894043,
1365
+ "learning_rate": 2.225844830347542e-05,
1366
+ "loss": 2.4901,
1367
+ "step": 97000
1368
+ },
1369
+ {
1370
+ "epoch": 0.5576909877136386,
1371
+ "grad_norm": 7.130558013916016,
1372
+ "learning_rate": 2.2115450614318074e-05,
1373
+ "loss": 2.4768,
1374
+ "step": 97500
1375
+ },
1376
+ {
1377
+ "epoch": 0.5605509414967854,
1378
+ "grad_norm": 5.947187900543213,
1379
+ "learning_rate": 2.197245292516073e-05,
1380
+ "loss": 2.4971,
1381
+ "step": 98000
1382
+ },
1383
+ {
1384
+ "epoch": 0.5634108952799323,
1385
+ "grad_norm": 6.830575466156006,
1386
+ "learning_rate": 2.1829455236003388e-05,
1387
+ "loss": 2.4901,
1388
+ "step": 98500
1389
+ },
1390
+ {
1391
+ "epoch": 0.5662708490630791,
1392
+ "grad_norm": 5.682921409606934,
1393
+ "learning_rate": 2.1686457546846045e-05,
1394
+ "loss": 2.4946,
1395
+ "step": 99000
1396
+ },
1397
+ {
1398
+ "epoch": 0.569130802846226,
1399
+ "grad_norm": 5.174154758453369,
1400
+ "learning_rate": 2.15434598576887e-05,
1401
+ "loss": 2.4813,
1402
+ "step": 99500
1403
+ },
1404
+ {
1405
+ "epoch": 0.5719907566293728,
1406
+ "grad_norm": 5.400365352630615,
1407
+ "learning_rate": 2.1400462168531356e-05,
1408
+ "loss": 2.4498,
1409
+ "step": 100000
1410
+ },
1411
+ {
1412
+ "epoch": 0.5748507104125198,
1413
+ "grad_norm": 5.433869361877441,
1414
+ "learning_rate": 2.1257464479374013e-05,
1415
+ "loss": 2.523,
1416
+ "step": 100500
1417
+ },
1418
+ {
1419
+ "epoch": 0.5777106641956666,
1420
+ "grad_norm": 6.321377754211426,
1421
+ "learning_rate": 2.1114466790216674e-05,
1422
+ "loss": 2.4731,
1423
+ "step": 101000
1424
+ },
1425
+ {
1426
+ "epoch": 0.5805706179788135,
1427
+ "grad_norm": 6.643988609313965,
1428
+ "learning_rate": 2.0971469101059327e-05,
1429
+ "loss": 2.4837,
1430
+ "step": 101500
1431
+ },
1432
+ {
1433
+ "epoch": 0.5834305717619603,
1434
+ "grad_norm": 6.258885383605957,
1435
+ "learning_rate": 2.0828471411901985e-05,
1436
+ "loss": 2.4735,
1437
+ "step": 102000
1438
+ },
1439
+ {
1440
+ "epoch": 0.5862905255451072,
1441
+ "grad_norm": 5.747689723968506,
1442
+ "learning_rate": 2.068547372274464e-05,
1443
+ "loss": 2.4742,
1444
+ "step": 102500
1445
+ },
1446
+ {
1447
+ "epoch": 0.589150479328254,
1448
+ "grad_norm": 6.016144275665283,
1449
+ "learning_rate": 2.05424760335873e-05,
1450
+ "loss": 2.4633,
1451
+ "step": 103000
1452
+ },
1453
+ {
1454
+ "epoch": 0.5920104331114009,
1455
+ "grad_norm": 5.250337600708008,
1456
+ "learning_rate": 2.0399478344429953e-05,
1457
+ "loss": 2.467,
1458
+ "step": 103500
1459
+ },
1460
+ {
1461
+ "epoch": 0.5948703868945477,
1462
+ "grad_norm": 5.667397975921631,
1463
+ "learning_rate": 2.025648065527261e-05,
1464
+ "loss": 2.4709,
1465
+ "step": 104000
1466
+ },
1467
+ {
1468
+ "epoch": 0.5977303406776946,
1469
+ "grad_norm": 6.414941310882568,
1470
+ "learning_rate": 2.0113482966115267e-05,
1471
+ "loss": 2.4805,
1472
+ "step": 104500
1473
+ },
1474
+ {
1475
+ "epoch": 0.6005902944608416,
1476
+ "grad_norm": 6.118762493133545,
1477
+ "learning_rate": 1.9970485276957927e-05,
1478
+ "loss": 2.46,
1479
+ "step": 105000
1480
+ },
1481
+ {
1482
+ "epoch": 0.6034502482439884,
1483
+ "grad_norm": 7.456865310668945,
1484
+ "learning_rate": 1.9827487587800584e-05,
1485
+ "loss": 2.4863,
1486
+ "step": 105500
1487
+ },
1488
+ {
1489
+ "epoch": 0.6063102020271353,
1490
+ "grad_norm": 7.2666096687316895,
1491
+ "learning_rate": 1.9684489898643238e-05,
1492
+ "loss": 2.431,
1493
+ "step": 106000
1494
+ },
1495
+ {
1496
+ "epoch": 0.6091701558102821,
1497
+ "grad_norm": 6.135725975036621,
1498
+ "learning_rate": 1.9541492209485895e-05,
1499
+ "loss": 2.4833,
1500
+ "step": 106500
1501
+ },
1502
+ {
1503
+ "epoch": 0.612030109593429,
1504
+ "grad_norm": 6.930655002593994,
1505
+ "learning_rate": 1.9398494520328553e-05,
1506
+ "loss": 2.4791,
1507
+ "step": 107000
1508
+ },
1509
+ {
1510
+ "epoch": 0.6148900633765758,
1511
+ "grad_norm": 5.848691940307617,
1512
+ "learning_rate": 1.925549683117121e-05,
1513
+ "loss": 2.4744,
1514
+ "step": 107500
1515
+ },
1516
+ {
1517
+ "epoch": 0.6177500171597227,
1518
+ "grad_norm": 6.593609809875488,
1519
+ "learning_rate": 1.9112499142013863e-05,
1520
+ "loss": 2.4818,
1521
+ "step": 108000
1522
+ },
1523
+ {
1524
+ "epoch": 0.6206099709428695,
1525
+ "grad_norm": 5.148362636566162,
1526
+ "learning_rate": 1.8969501452856524e-05,
1527
+ "loss": 2.4863,
1528
+ "step": 108500
1529
+ },
1530
+ {
1531
+ "epoch": 0.6234699247260165,
1532
+ "grad_norm": 6.264626979827881,
1533
+ "learning_rate": 1.882650376369918e-05,
1534
+ "loss": 2.4896,
1535
+ "step": 109000
1536
+ },
1537
+ {
1538
+ "epoch": 0.6263298785091633,
1539
+ "grad_norm": 7.046905040740967,
1540
+ "learning_rate": 1.8683506074541838e-05,
1541
+ "loss": 2.4746,
1542
+ "step": 109500
1543
+ },
1544
+ {
1545
+ "epoch": 0.6291898322923102,
1546
+ "grad_norm": 6.274538993835449,
1547
+ "learning_rate": 1.8540508385384492e-05,
1548
+ "loss": 2.4395,
1549
+ "step": 110000
1550
+ },
1551
+ {
1552
+ "epoch": 0.632049786075457,
1553
+ "grad_norm": 5.889391899108887,
1554
+ "learning_rate": 1.839751069622715e-05,
1555
+ "loss": 2.4307,
1556
+ "step": 110500
1557
+ },
1558
+ {
1559
+ "epoch": 0.6349097398586039,
1560
+ "grad_norm": 5.6989030838012695,
1561
+ "learning_rate": 1.8254513007069806e-05,
1562
+ "loss": 2.4297,
1563
+ "step": 111000
1564
+ },
1565
+ {
1566
+ "epoch": 0.6377696936417507,
1567
+ "grad_norm": 6.275044918060303,
1568
+ "learning_rate": 1.8111515317912463e-05,
1569
+ "loss": 2.4504,
1570
+ "step": 111500
1571
+ },
1572
+ {
1573
+ "epoch": 0.6406296474248976,
1574
+ "grad_norm": 6.444321155548096,
1575
+ "learning_rate": 1.7968517628755117e-05,
1576
+ "loss": 2.4286,
1577
+ "step": 112000
1578
+ },
1579
+ {
1580
+ "epoch": 0.6434896012080444,
1581
+ "grad_norm": 6.624863147735596,
1582
+ "learning_rate": 1.7825519939597778e-05,
1583
+ "loss": 2.463,
1584
+ "step": 112500
1585
+ },
1586
+ {
1587
+ "epoch": 0.6463495549911913,
1588
+ "grad_norm": 7.994183540344238,
1589
+ "learning_rate": 1.7682522250440435e-05,
1590
+ "loss": 2.4362,
1591
+ "step": 113000
1592
+ },
1593
+ {
1594
+ "epoch": 0.6492095087743383,
1595
+ "grad_norm": 5.6794257164001465,
1596
+ "learning_rate": 1.7539524561283092e-05,
1597
+ "loss": 2.4355,
1598
+ "step": 113500
1599
+ },
1600
+ {
1601
+ "epoch": 0.6520694625574851,
1602
+ "grad_norm": 5.606757164001465,
1603
+ "learning_rate": 1.739652687212575e-05,
1604
+ "loss": 2.4525,
1605
+ "step": 114000
1606
+ },
1607
+ {
1608
+ "epoch": 0.654929416340632,
1609
+ "grad_norm": 6.253554344177246,
1610
+ "learning_rate": 1.7253529182968403e-05,
1611
+ "loss": 2.4511,
1612
+ "step": 114500
1613
+ },
1614
+ {
1615
+ "epoch": 0.6577893701237788,
1616
+ "grad_norm": 6.014497756958008,
1617
+ "learning_rate": 1.711053149381106e-05,
1618
+ "loss": 2.4571,
1619
+ "step": 115000
1620
+ },
1621
+ {
1622
+ "epoch": 0.6606493239069257,
1623
+ "grad_norm": 6.601302146911621,
1624
+ "learning_rate": 1.6967533804653717e-05,
1625
+ "loss": 2.4505,
1626
+ "step": 115500
1627
+ },
1628
+ {
1629
+ "epoch": 0.6635092776900725,
1630
+ "grad_norm": 7.215948104858398,
1631
+ "learning_rate": 1.6824536115496374e-05,
1632
+ "loss": 2.4351,
1633
+ "step": 116000
1634
+ },
1635
+ {
1636
+ "epoch": 0.6663692314732194,
1637
+ "grad_norm": 5.974714279174805,
1638
+ "learning_rate": 1.668153842633903e-05,
1639
+ "loss": 2.4435,
1640
+ "step": 116500
1641
+ },
1642
+ {
1643
+ "epoch": 0.6692291852563662,
1644
+ "grad_norm": 6.903178691864014,
1645
+ "learning_rate": 1.653854073718169e-05,
1646
+ "loss": 2.4388,
1647
+ "step": 117000
1648
+ },
1649
+ {
1650
+ "epoch": 0.6720891390395132,
1651
+ "grad_norm": 6.214517116546631,
1652
+ "learning_rate": 1.6395543048024346e-05,
1653
+ "loss": 2.4405,
1654
+ "step": 117500
1655
+ },
1656
+ {
1657
+ "epoch": 0.67494909282266,
1658
+ "grad_norm": 6.263461589813232,
1659
+ "learning_rate": 1.6252545358867003e-05,
1660
+ "loss": 2.4496,
1661
+ "step": 118000
1662
+ },
1663
+ {
1664
+ "epoch": 0.6778090466058069,
1665
+ "grad_norm": 8.066364288330078,
1666
+ "learning_rate": 1.6109547669709657e-05,
1667
+ "loss": 2.4368,
1668
+ "step": 118500
1669
+ },
1670
+ {
1671
+ "epoch": 0.6806690003889537,
1672
+ "grad_norm": 5.834959506988525,
1673
+ "learning_rate": 1.5966549980552314e-05,
1674
+ "loss": 2.4481,
1675
+ "step": 119000
1676
+ },
1677
+ {
1678
+ "epoch": 0.6835289541721006,
1679
+ "grad_norm": 6.710206031799316,
1680
+ "learning_rate": 1.582355229139497e-05,
1681
+ "loss": 2.4325,
1682
+ "step": 119500
1683
+ },
1684
+ {
1685
+ "epoch": 0.6863889079552474,
1686
+ "grad_norm": 5.984834671020508,
1687
+ "learning_rate": 1.5680554602237628e-05,
1688
+ "loss": 2.4454,
1689
+ "step": 120000
1690
+ },
1691
+ {
1692
+ "epoch": 0.6892488617383943,
1693
+ "grad_norm": 5.370354652404785,
1694
+ "learning_rate": 1.5537556913080285e-05,
1695
+ "loss": 2.4279,
1696
+ "step": 120500
1697
+ },
1698
+ {
1699
+ "epoch": 0.6921088155215411,
1700
+ "grad_norm": 6.09434175491333,
1701
+ "learning_rate": 1.5394559223922942e-05,
1702
+ "loss": 2.4314,
1703
+ "step": 121000
1704
+ },
1705
+ {
1706
+ "epoch": 0.694968769304688,
1707
+ "grad_norm": 6.878710746765137,
1708
+ "learning_rate": 1.52515615347656e-05,
1709
+ "loss": 2.4191,
1710
+ "step": 121500
1711
+ },
1712
+ {
1713
+ "epoch": 0.697828723087835,
1714
+ "grad_norm": 5.660272121429443,
1715
+ "learning_rate": 1.5108563845608257e-05,
1716
+ "loss": 2.433,
1717
+ "step": 122000
1718
+ },
1719
+ {
1720
+ "epoch": 0.7006886768709818,
1721
+ "grad_norm": 6.489835739135742,
1722
+ "learning_rate": 1.4965566156450914e-05,
1723
+ "loss": 2.4491,
1724
+ "step": 122500
1725
+ },
1726
+ {
1727
+ "epoch": 0.7035486306541286,
1728
+ "grad_norm": 5.600217819213867,
1729
+ "learning_rate": 1.4822568467293567e-05,
1730
+ "loss": 2.4235,
1731
+ "step": 123000
1732
+ },
1733
+ {
1734
+ "epoch": 0.7064085844372755,
1735
+ "grad_norm": 5.281232833862305,
1736
+ "learning_rate": 1.4679570778136226e-05,
1737
+ "loss": 2.4219,
1738
+ "step": 123500
1739
+ },
1740
+ {
1741
+ "epoch": 0.7092685382204224,
1742
+ "grad_norm": 5.651204586029053,
1743
+ "learning_rate": 1.4536573088978883e-05,
1744
+ "loss": 2.448,
1745
+ "step": 124000
1746
+ },
1747
+ {
1748
+ "epoch": 0.7121284920035692,
1749
+ "grad_norm": 5.520606994628906,
1750
+ "learning_rate": 1.439357539982154e-05,
1751
+ "loss": 2.4118,
1752
+ "step": 124500
1753
+ },
1754
+ {
1755
+ "epoch": 0.714988445786716,
1756
+ "grad_norm": 6.359561920166016,
1757
+ "learning_rate": 1.4250577710664196e-05,
1758
+ "loss": 2.4502,
1759
+ "step": 125000
1760
+ },
1761
+ {
1762
+ "epoch": 0.7178483995698629,
1763
+ "grad_norm": 6.264361381530762,
1764
+ "learning_rate": 1.4107580021506853e-05,
1765
+ "loss": 2.4214,
1766
+ "step": 125500
1767
+ },
1768
+ {
1769
+ "epoch": 0.7207083533530099,
1770
+ "grad_norm": 15.211498260498047,
1771
+ "learning_rate": 1.396458233234951e-05,
1772
+ "loss": 2.4476,
1773
+ "step": 126000
1774
+ },
1775
+ {
1776
+ "epoch": 0.7235683071361567,
1777
+ "grad_norm": 6.165014266967773,
1778
+ "learning_rate": 1.3821584643192167e-05,
1779
+ "loss": 2.4255,
1780
+ "step": 126500
1781
+ },
1782
+ {
1783
+ "epoch": 0.7264282609193036,
1784
+ "grad_norm": 5.279512882232666,
1785
+ "learning_rate": 1.3678586954034823e-05,
1786
+ "loss": 2.4458,
1787
+ "step": 127000
1788
+ },
1789
+ {
1790
+ "epoch": 0.7292882147024504,
1791
+ "grad_norm": 6.13384485244751,
1792
+ "learning_rate": 1.353558926487748e-05,
1793
+ "loss": 2.4022,
1794
+ "step": 127500
1795
+ },
1796
+ {
1797
+ "epoch": 0.7321481684855973,
1798
+ "grad_norm": 5.577615261077881,
1799
+ "learning_rate": 1.3392591575720137e-05,
1800
+ "loss": 2.4174,
1801
+ "step": 128000
1802
+ },
1803
+ {
1804
+ "epoch": 0.7350081222687441,
1805
+ "grad_norm": 5.860058784484863,
1806
+ "learning_rate": 1.3249593886562794e-05,
1807
+ "loss": 2.4043,
1808
+ "step": 128500
1809
+ },
1810
+ {
1811
+ "epoch": 0.737868076051891,
1812
+ "grad_norm": 6.8798065185546875,
1813
+ "learning_rate": 1.3106596197405451e-05,
1814
+ "loss": 2.3858,
1815
+ "step": 129000
1816
+ },
1817
+ {
1818
+ "epoch": 0.7407280298350378,
1819
+ "grad_norm": 7.996329307556152,
1820
+ "learning_rate": 1.2963598508248107e-05,
1821
+ "loss": 2.3993,
1822
+ "step": 129500
1823
+ },
1824
+ {
1825
+ "epoch": 0.7435879836181847,
1826
+ "grad_norm": 6.488850116729736,
1827
+ "learning_rate": 1.2820600819090764e-05,
1828
+ "loss": 2.4204,
1829
+ "step": 130000
1830
+ },
1831
+ {
1832
+ "epoch": 0.7464479374013316,
1833
+ "grad_norm": 5.177313804626465,
1834
+ "learning_rate": 1.2677603129933421e-05,
1835
+ "loss": 2.433,
1836
+ "step": 130500
1837
+ },
1838
+ {
1839
+ "epoch": 0.7493078911844785,
1840
+ "grad_norm": 6.9536895751953125,
1841
+ "learning_rate": 1.2534605440776078e-05,
1842
+ "loss": 2.4145,
1843
+ "step": 131000
1844
+ },
1845
+ {
1846
+ "epoch": 0.7521678449676253,
1847
+ "grad_norm": 5.639203071594238,
1848
+ "learning_rate": 1.2391607751618735e-05,
1849
+ "loss": 2.3906,
1850
+ "step": 131500
1851
+ },
1852
+ {
1853
+ "epoch": 0.7550277987507722,
1854
+ "grad_norm": 5.76200532913208,
1855
+ "learning_rate": 1.2248610062461391e-05,
1856
+ "loss": 2.4065,
1857
+ "step": 132000
1858
+ },
1859
+ {
1860
+ "epoch": 0.757887752533919,
1861
+ "grad_norm": 7.033239364624023,
1862
+ "learning_rate": 1.2105612373304048e-05,
1863
+ "loss": 2.4045,
1864
+ "step": 132500
1865
+ },
1866
+ {
1867
+ "epoch": 0.7607477063170659,
1868
+ "grad_norm": 6.319807529449463,
1869
+ "learning_rate": 1.1962614684146704e-05,
1870
+ "loss": 2.3646,
1871
+ "step": 133000
1872
+ },
1873
+ {
1874
+ "epoch": 0.7636076601002127,
1875
+ "grad_norm": 6.506091117858887,
1876
+ "learning_rate": 1.1819616994989362e-05,
1877
+ "loss": 2.4247,
1878
+ "step": 133500
1879
+ },
1880
+ {
1881
+ "epoch": 0.7664676138833596,
1882
+ "grad_norm": 6.245853424072266,
1883
+ "learning_rate": 1.1676619305832018e-05,
1884
+ "loss": 2.3998,
1885
+ "step": 134000
1886
+ },
1887
+ {
1888
+ "epoch": 0.7693275676665066,
1889
+ "grad_norm": 6.403684616088867,
1890
+ "learning_rate": 1.1533621616674675e-05,
1891
+ "loss": 2.4072,
1892
+ "step": 134500
1893
+ },
1894
+ {
1895
+ "epoch": 0.7721875214496534,
1896
+ "grad_norm": 6.385560035705566,
1897
+ "learning_rate": 1.1390623927517332e-05,
1898
+ "loss": 2.4078,
1899
+ "step": 135000
1900
+ },
1901
+ {
1902
+ "epoch": 0.7750474752328003,
1903
+ "grad_norm": 6.857175350189209,
1904
+ "learning_rate": 1.124762623835999e-05,
1905
+ "loss": 2.4167,
1906
+ "step": 135500
1907
+ },
1908
+ {
1909
+ "epoch": 0.7779074290159471,
1910
+ "grad_norm": 5.734222888946533,
1911
+ "learning_rate": 1.1104628549202645e-05,
1912
+ "loss": 2.411,
1913
+ "step": 136000
1914
+ },
1915
+ {
1916
+ "epoch": 0.780767382799094,
1917
+ "grad_norm": 6.311659812927246,
1918
+ "learning_rate": 1.0961630860045302e-05,
1919
+ "loss": 2.4232,
1920
+ "step": 136500
1921
+ },
1922
+ {
1923
+ "epoch": 0.7836273365822408,
1924
+ "grad_norm": 6.344162940979004,
1925
+ "learning_rate": 1.0818633170887959e-05,
1926
+ "loss": 2.3997,
1927
+ "step": 137000
1928
+ },
1929
+ {
1930
+ "epoch": 0.7864872903653877,
1931
+ "grad_norm": 5.971358776092529,
1932
+ "learning_rate": 1.0675635481730616e-05,
1933
+ "loss": 2.4181,
1934
+ "step": 137500
1935
+ },
1936
+ {
1937
+ "epoch": 0.7893472441485345,
1938
+ "grad_norm": 5.663905620574951,
1939
+ "learning_rate": 1.0532637792573273e-05,
1940
+ "loss": 2.3939,
1941
+ "step": 138000
1942
+ },
1943
+ {
1944
+ "epoch": 0.7922071979316814,
1945
+ "grad_norm": 5.739428520202637,
1946
+ "learning_rate": 1.0389640103415929e-05,
1947
+ "loss": 2.3803,
1948
+ "step": 138500
1949
+ },
1950
+ {
1951
+ "epoch": 0.7950671517148283,
1952
+ "grad_norm": 6.558109760284424,
1953
+ "learning_rate": 1.0246642414258586e-05,
1954
+ "loss": 2.3794,
1955
+ "step": 139000
1956
+ },
1957
+ {
1958
+ "epoch": 0.7979271054979752,
1959
+ "grad_norm": 7.577678203582764,
1960
+ "learning_rate": 1.0103644725101243e-05,
1961
+ "loss": 2.4035,
1962
+ "step": 139500
1963
+ },
1964
+ {
1965
+ "epoch": 0.800787059281122,
1966
+ "grad_norm": 6.890414237976074,
1967
+ "learning_rate": 9.9606470359439e-06,
1968
+ "loss": 2.3791,
1969
+ "step": 140000
1970
+ },
1971
+ {
1972
+ "epoch": 0.8036470130642689,
1973
+ "grad_norm": 6.212318420410156,
1974
+ "learning_rate": 9.817649346786556e-06,
1975
+ "loss": 2.363,
1976
+ "step": 140500
1977
+ },
1978
+ {
1979
+ "epoch": 0.8065069668474157,
1980
+ "grad_norm": 6.501023292541504,
1981
+ "learning_rate": 9.674651657629213e-06,
1982
+ "loss": 2.3794,
1983
+ "step": 141000
1984
+ },
1985
+ {
1986
+ "epoch": 0.8093669206305626,
1987
+ "grad_norm": 6.136830806732178,
1988
+ "learning_rate": 9.53165396847187e-06,
1989
+ "loss": 2.3835,
1990
+ "step": 141500
1991
+ },
1992
+ {
1993
+ "epoch": 0.8122268744137094,
1994
+ "grad_norm": 6.386491298675537,
1995
+ "learning_rate": 9.388656279314527e-06,
1996
+ "loss": 2.3836,
1997
+ "step": 142000
1998
+ },
1999
+ {
2000
+ "epoch": 0.8150868281968563,
2001
+ "grad_norm": 6.060532093048096,
2002
+ "learning_rate": 9.245658590157182e-06,
2003
+ "loss": 2.3714,
2004
+ "step": 142500
2005
+ },
2006
+ {
2007
+ "epoch": 0.8179467819800033,
2008
+ "grad_norm": 6.481443405151367,
2009
+ "learning_rate": 9.10266090099984e-06,
2010
+ "loss": 2.3842,
2011
+ "step": 143000
2012
+ },
2013
+ {
2014
+ "epoch": 0.8208067357631501,
2015
+ "grad_norm": 6.378634929656982,
2016
+ "learning_rate": 8.959663211842497e-06,
2017
+ "loss": 2.4011,
2018
+ "step": 143500
2019
+ },
2020
+ {
2021
+ "epoch": 0.823666689546297,
2022
+ "grad_norm": 7.321898937225342,
2023
+ "learning_rate": 8.816665522685154e-06,
2024
+ "loss": 2.3874,
2025
+ "step": 144000
2026
+ },
2027
+ {
2028
+ "epoch": 0.8265266433294438,
2029
+ "grad_norm": 5.878232479095459,
2030
+ "learning_rate": 8.673667833527811e-06,
2031
+ "loss": 2.3747,
2032
+ "step": 144500
2033
+ },
2034
+ {
2035
+ "epoch": 0.8293865971125907,
2036
+ "grad_norm": 6.182088375091553,
2037
+ "learning_rate": 8.530670144370468e-06,
2038
+ "loss": 2.3928,
2039
+ "step": 145000
2040
+ },
2041
+ {
2042
+ "epoch": 0.8322465508957375,
2043
+ "grad_norm": 6.2058258056640625,
2044
+ "learning_rate": 8.387672455213125e-06,
2045
+ "loss": 2.3784,
2046
+ "step": 145500
2047
+ },
2048
+ {
2049
+ "epoch": 0.8351065046788844,
2050
+ "grad_norm": 6.231584072113037,
2051
+ "learning_rate": 8.24467476605578e-06,
2052
+ "loss": 2.3715,
2053
+ "step": 146000
2054
+ },
2055
+ {
2056
+ "epoch": 0.8379664584620312,
2057
+ "grad_norm": 6.14652156829834,
2058
+ "learning_rate": 8.101677076898438e-06,
2059
+ "loss": 2.3789,
2060
+ "step": 146500
2061
+ },
2062
+ {
2063
+ "epoch": 0.8408264122451781,
2064
+ "grad_norm": 6.431158065795898,
2065
+ "learning_rate": 7.958679387741095e-06,
2066
+ "loss": 2.3792,
2067
+ "step": 147000
2068
+ },
2069
+ {
2070
+ "epoch": 0.843686366028325,
2071
+ "grad_norm": 5.822235584259033,
2072
+ "learning_rate": 7.815681698583752e-06,
2073
+ "loss": 2.4062,
2074
+ "step": 147500
2075
+ },
2076
+ {
2077
+ "epoch": 0.8465463198114719,
2078
+ "grad_norm": 5.64607048034668,
2079
+ "learning_rate": 7.672684009426408e-06,
2080
+ "loss": 2.368,
2081
+ "step": 148000
2082
+ },
2083
+ {
2084
+ "epoch": 0.8494062735946187,
2085
+ "grad_norm": 6.182931900024414,
2086
+ "learning_rate": 7.5296863202690655e-06,
2087
+ "loss": 2.3877,
2088
+ "step": 148500
2089
+ },
2090
+ {
2091
+ "epoch": 0.8522662273777656,
2092
+ "grad_norm": 6.151760578155518,
2093
+ "learning_rate": 7.386688631111721e-06,
2094
+ "loss": 2.3915,
2095
+ "step": 149000
2096
+ },
2097
+ {
2098
+ "epoch": 0.8551261811609124,
2099
+ "grad_norm": 6.303664684295654,
2100
+ "learning_rate": 7.243690941954379e-06,
2101
+ "loss": 2.3565,
2102
+ "step": 149500
2103
+ },
2104
+ {
2105
+ "epoch": 0.8579861349440593,
2106
+ "grad_norm": 6.381216526031494,
2107
+ "learning_rate": 7.100693252797034e-06,
2108
+ "loss": 2.3697,
2109
+ "step": 150000
2110
+ },
2111
+ {
2112
+ "epoch": 0.8608460887272061,
2113
+ "grad_norm": 5.706302165985107,
2114
+ "learning_rate": 6.957695563639692e-06,
2115
+ "loss": 2.4026,
2116
+ "step": 150500
2117
+ },
2118
+ {
2119
+ "epoch": 0.863706042510353,
2120
+ "grad_norm": 7.22359561920166,
2121
+ "learning_rate": 6.814697874482348e-06,
2122
+ "loss": 2.3759,
2123
+ "step": 151000
2124
+ },
2125
+ {
2126
+ "epoch": 0.8665659962935,
2127
+ "grad_norm": 5.458381652832031,
2128
+ "learning_rate": 6.671700185325006e-06,
2129
+ "loss": 2.3836,
2130
+ "step": 151500
2131
+ },
2132
+ {
2133
+ "epoch": 0.8694259500766468,
2134
+ "grad_norm": 5.785479545593262,
2135
+ "learning_rate": 6.528702496167661e-06,
2136
+ "loss": 2.3655,
2137
+ "step": 152000
2138
+ },
2139
+ {
2140
+ "epoch": 0.8722859038597937,
2141
+ "grad_norm": 5.856048583984375,
2142
+ "learning_rate": 6.385704807010319e-06,
2143
+ "loss": 2.3669,
2144
+ "step": 152500
2145
+ },
2146
+ {
2147
+ "epoch": 0.8751458576429405,
2148
+ "grad_norm": 5.491500377655029,
2149
+ "learning_rate": 6.2427071178529756e-06,
2150
+ "loss": 2.4154,
2151
+ "step": 153000
2152
+ },
2153
+ {
2154
+ "epoch": 0.8780058114260874,
2155
+ "grad_norm": 5.936758518218994,
2156
+ "learning_rate": 6.099709428695633e-06,
2157
+ "loss": 2.3702,
2158
+ "step": 153500
2159
+ },
2160
+ {
2161
+ "epoch": 0.8808657652092342,
2162
+ "grad_norm": 7.138918399810791,
2163
+ "learning_rate": 5.956711739538289e-06,
2164
+ "loss": 2.3582,
2165
+ "step": 154000
2166
+ },
2167
+ {
2168
+ "epoch": 0.8837257189923811,
2169
+ "grad_norm": 6.457569122314453,
2170
+ "learning_rate": 5.813714050380946e-06,
2171
+ "loss": 2.381,
2172
+ "step": 154500
2173
+ },
2174
+ {
2175
+ "epoch": 0.8865856727755279,
2176
+ "grad_norm": 6.026115894317627,
2177
+ "learning_rate": 5.6707163612236024e-06,
2178
+ "loss": 2.385,
2179
+ "step": 155000
2180
+ },
2181
+ {
2182
+ "epoch": 0.8894456265586748,
2183
+ "grad_norm": 6.851065158843994,
2184
+ "learning_rate": 5.52771867206626e-06,
2185
+ "loss": 2.3664,
2186
+ "step": 155500
2187
+ },
2188
+ {
2189
+ "epoch": 0.8923055803418217,
2190
+ "grad_norm": 6.16819953918457,
2191
+ "learning_rate": 5.384720982908916e-06,
2192
+ "loss": 2.3814,
2193
+ "step": 156000
2194
+ },
2195
+ {
2196
+ "epoch": 0.8951655341249686,
2197
+ "grad_norm": 5.917440891265869,
2198
+ "learning_rate": 5.241723293751574e-06,
2199
+ "loss": 2.3701,
2200
+ "step": 156500
2201
+ },
2202
+ {
2203
+ "epoch": 0.8980254879081154,
2204
+ "grad_norm": 10.217552185058594,
2205
+ "learning_rate": 5.09872560459423e-06,
2206
+ "loss": 2.3516,
2207
+ "step": 157000
2208
+ },
2209
+ {
2210
+ "epoch": 0.9008854416912623,
2211
+ "grad_norm": 7.088205814361572,
2212
+ "learning_rate": 4.955727915436887e-06,
2213
+ "loss": 2.3936,
2214
+ "step": 157500
2215
+ },
2216
+ {
2217
+ "epoch": 0.9037453954744091,
2218
+ "grad_norm": 6.357458591461182,
2219
+ "learning_rate": 4.812730226279544e-06,
2220
+ "loss": 2.3672,
2221
+ "step": 158000
2222
+ },
2223
+ {
2224
+ "epoch": 0.906605349257556,
2225
+ "grad_norm": 6.871440887451172,
2226
+ "learning_rate": 4.669732537122201e-06,
2227
+ "loss": 2.3691,
2228
+ "step": 158500
2229
+ },
2230
+ {
2231
+ "epoch": 0.9094653030407028,
2232
+ "grad_norm": 6.192137718200684,
2233
+ "learning_rate": 4.526734847964857e-06,
2234
+ "loss": 2.3608,
2235
+ "step": 159000
2236
+ },
2237
+ {
2238
+ "epoch": 0.9123252568238497,
2239
+ "grad_norm": 6.265544414520264,
2240
+ "learning_rate": 4.383737158807514e-06,
2241
+ "loss": 2.3682,
2242
+ "step": 159500
2243
+ },
2244
+ {
2245
+ "epoch": 0.9151852106069966,
2246
+ "grad_norm": 5.907118320465088,
2247
+ "learning_rate": 4.2407394696501705e-06,
2248
+ "loss": 2.3423,
2249
+ "step": 160000
2250
+ },
2251
+ {
2252
+ "epoch": 0.9180451643901435,
2253
+ "grad_norm": 6.204267501831055,
2254
+ "learning_rate": 4.097741780492828e-06,
2255
+ "loss": 2.3605,
2256
+ "step": 160500
2257
+ },
2258
+ {
2259
+ "epoch": 0.9209051181732903,
2260
+ "grad_norm": 6.978556156158447,
2261
+ "learning_rate": 3.954744091335484e-06,
2262
+ "loss": 2.3594,
2263
+ "step": 161000
2264
+ },
2265
+ {
2266
+ "epoch": 0.9237650719564372,
2267
+ "grad_norm": 6.3842082023620605,
2268
+ "learning_rate": 3.811746402178141e-06,
2269
+ "loss": 2.3677,
2270
+ "step": 161500
2271
+ },
2272
+ {
2273
+ "epoch": 0.926625025739584,
2274
+ "grad_norm": 6.20996618270874,
2275
+ "learning_rate": 3.6687487130207977e-06,
2276
+ "loss": 2.3538,
2277
+ "step": 162000
2278
+ },
2279
+ {
2280
+ "epoch": 0.9294849795227309,
2281
+ "grad_norm": 6.184482574462891,
2282
+ "learning_rate": 3.5257510238634545e-06,
2283
+ "loss": 2.3787,
2284
+ "step": 162500
2285
+ },
2286
+ {
2287
+ "epoch": 0.9323449333058778,
2288
+ "grad_norm": 6.219623565673828,
2289
+ "learning_rate": 3.382753334706111e-06,
2290
+ "loss": 2.3774,
2291
+ "step": 163000
2292
+ },
2293
+ {
2294
+ "epoch": 0.9352048870890246,
2295
+ "grad_norm": 6.634711742401123,
2296
+ "learning_rate": 3.239755645548768e-06,
2297
+ "loss": 2.3671,
2298
+ "step": 163500
2299
+ },
2300
+ {
2301
+ "epoch": 0.9380648408721715,
2302
+ "grad_norm": 7.119485855102539,
2303
+ "learning_rate": 3.096757956391425e-06,
2304
+ "loss": 2.356,
2305
+ "step": 164000
2306
+ },
2307
+ {
2308
+ "epoch": 0.9409247946553184,
2309
+ "grad_norm": 6.833123207092285,
2310
+ "learning_rate": 2.9537602672340818e-06,
2311
+ "loss": 2.3451,
2312
+ "step": 164500
2313
+ },
2314
+ {
2315
+ "epoch": 0.9437847484384653,
2316
+ "grad_norm": 6.631540298461914,
2317
+ "learning_rate": 2.8107625780767385e-06,
2318
+ "loss": 2.3324,
2319
+ "step": 165000
2320
+ },
2321
+ {
2322
+ "epoch": 0.9466447022216121,
2323
+ "grad_norm": 6.187737941741943,
2324
+ "learning_rate": 2.667764888919395e-06,
2325
+ "loss": 2.3573,
2326
+ "step": 165500
2327
+ },
2328
+ {
2329
+ "epoch": 0.949504656004759,
2330
+ "grad_norm": 5.523457050323486,
2331
+ "learning_rate": 2.524767199762052e-06,
2332
+ "loss": 2.3468,
2333
+ "step": 166000
2334
+ },
2335
+ {
2336
+ "epoch": 0.9523646097879058,
2337
+ "grad_norm": 6.898806095123291,
2338
+ "learning_rate": 2.381769510604709e-06,
2339
+ "loss": 2.3534,
2340
+ "step": 166500
2341
+ },
2342
+ {
2343
+ "epoch": 0.9552245635710527,
2344
+ "grad_norm": 6.348108291625977,
2345
+ "learning_rate": 2.2387718214473658e-06,
2346
+ "loss": 2.3588,
2347
+ "step": 167000
2348
+ },
2349
+ {
2350
+ "epoch": 0.9580845173541995,
2351
+ "grad_norm": 6.188412189483643,
2352
+ "learning_rate": 2.0957741322900225e-06,
2353
+ "loss": 2.3607,
2354
+ "step": 167500
2355
+ },
2356
+ {
2357
+ "epoch": 0.9609444711373464,
2358
+ "grad_norm": 6.769163608551025,
2359
+ "learning_rate": 1.952776443132679e-06,
2360
+ "loss": 2.3721,
2361
+ "step": 168000
2362
+ },
2363
+ {
2364
+ "epoch": 0.9638044249204932,
2365
+ "grad_norm": 6.389153957366943,
2366
+ "learning_rate": 1.8097787539753357e-06,
2367
+ "loss": 2.381,
2368
+ "step": 168500
2369
+ },
2370
+ {
2371
+ "epoch": 0.9666643787036402,
2372
+ "grad_norm": 5.625518798828125,
2373
+ "learning_rate": 1.6667810648179926e-06,
2374
+ "loss": 2.3656,
2375
+ "step": 169000
2376
+ },
2377
+ {
2378
+ "epoch": 0.969524332486787,
2379
+ "grad_norm": 6.03477144241333,
2380
+ "learning_rate": 1.5237833756606493e-06,
2381
+ "loss": 2.3796,
2382
+ "step": 169500
2383
+ },
2384
+ {
2385
+ "epoch": 0.9723842862699339,
2386
+ "grad_norm": 6.034476280212402,
2387
+ "learning_rate": 1.3807856865033063e-06,
2388
+ "loss": 2.3407,
2389
+ "step": 170000
2390
+ },
2391
+ {
2392
+ "epoch": 0.9752442400530807,
2393
+ "grad_norm": 6.318973541259766,
2394
+ "learning_rate": 1.237787997345963e-06,
2395
+ "loss": 2.3537,
2396
+ "step": 170500
2397
+ },
2398
+ {
2399
+ "epoch": 0.9781041938362276,
2400
+ "grad_norm": 6.3570237159729,
2401
+ "learning_rate": 1.0947903081886197e-06,
2402
+ "loss": 2.3744,
2403
+ "step": 171000
2404
+ },
2405
+ {
2406
+ "epoch": 0.9809641476193744,
2407
+ "grad_norm": 5.440378189086914,
2408
+ "learning_rate": 9.517926190312765e-07,
2409
+ "loss": 2.3775,
2410
+ "step": 171500
2411
+ },
2412
+ {
2413
+ "epoch": 0.9838241014025213,
2414
+ "grad_norm": 7.5823655128479,
2415
+ "learning_rate": 8.087949298739332e-07,
2416
+ "loss": 2.3301,
2417
+ "step": 172000
2418
+ },
2419
+ {
2420
+ "epoch": 0.9866840551856682,
2421
+ "grad_norm": 6.07295560836792,
2422
+ "learning_rate": 6.6579724071659e-07,
2423
+ "loss": 2.3347,
2424
+ "step": 172500
2425
+ },
2426
+ {
2427
+ "epoch": 0.9895440089688151,
2428
+ "grad_norm": 7.158942222595215,
2429
+ "learning_rate": 5.227995515592468e-07,
2430
+ "loss": 2.3567,
2431
+ "step": 173000
2432
+ },
2433
+ {
2434
+ "epoch": 0.992403962751962,
2435
+ "grad_norm": 6.406834125518799,
2436
+ "learning_rate": 3.798018624019036e-07,
2437
+ "loss": 2.3204,
2438
+ "step": 173500
2439
+ },
2440
+ {
2441
+ "epoch": 0.9952639165351088,
2442
+ "grad_norm": 5.863027572631836,
2443
+ "learning_rate": 2.3680417324456038e-07,
2444
+ "loss": 2.3569,
2445
+ "step": 174000
2446
+ },
2447
+ {
2448
+ "epoch": 0.9981238703182557,
2449
+ "grad_norm": 6.552116394042969,
2450
+ "learning_rate": 9.380648408721716e-08,
2451
+ "loss": 2.3332,
2452
+ "step": 174500
2453
+ },
2454
+ {
2455
+ "epoch": 1.0,
2456
+ "step": 174828,
2457
+ "total_flos": 1.8427441878551347e+17,
2458
+ "train_loss": 1.5726176189089465,
2459
+ "train_runtime": 27622.4465,
2460
+ "train_samples_per_second": 25.317,
2461
+ "train_steps_per_second": 6.329
2462
+ }
2463
+ ],
2464
+ "logging_steps": 500,
2465
+ "max_steps": 174828,
2466
+ "num_input_tokens_seen": 0,
2467
+ "num_train_epochs": 1,
2468
+ "save_steps": 500,
2469
+ "stateful_callbacks": {
2470
+ "TrainerControl": {
2471
+ "args": {
2472
+ "should_epoch_stop": false,
2473
+ "should_evaluate": false,
2474
+ "should_log": false,
2475
+ "should_save": true,
2476
+ "should_training_stop": true
2477
+ },
2478
+ "attributes": {}
2479
+ }
2480
+ },
2481
+ "total_flos": 1.8427441878551347e+17,
2482
+ "train_batch_size": 4,
2483
+ "trial_name": null,
2484
+ "trial_params": null
2485
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47be5dadfdc8b947d61e8f08b37ac38e233b0ac9bf801ba0357a96d49cfb86a5
3
+ size 5368