myownip committed on Mar 13

Commit

7562595

•

1 Parent(s): 6831457

Upload folder using huggingface_hub

Browse files

Files changed (41) hide show

README.md +147 -0
adapter_config.json +33 -0
adapter_model.bin +3 -0
checkpoint-398/README.md +202 -0
checkpoint-398/adapter_config.json +33 -0
checkpoint-398/adapter_model.safetensors +3 -0
checkpoint-398/optimizer.pt +3 -0
checkpoint-398/rng_state.pth +3 -0
checkpoint-398/scheduler.pt +3 -0
checkpoint-398/trainer_state.json +2871 -0
checkpoint-398/training_args.bin +3 -0
checkpoint-431/README.md +202 -0
checkpoint-431/adapter_config.json +33 -0
checkpoint-431/adapter_model.safetensors +3 -0
checkpoint-431/optimizer.pt +3 -0
checkpoint-431/rng_state.pth +3 -0
checkpoint-431/scheduler.pt +3 -0
checkpoint-431/trainer_state.json +3070 -0
checkpoint-431/training_args.bin +3 -0
checkpoint-597/README.md +202 -0
checkpoint-597/adapter_config.json +33 -0
checkpoint-597/adapter_model.safetensors +3 -0
checkpoint-597/optimizer.pt +3 -0
checkpoint-597/rng_state.pth +3 -0
checkpoint-597/scheduler.pt +3 -0
checkpoint-597/trainer_state.json +0 -0
checkpoint-597/training_args.bin +3 -0
checkpoint-796/README.md +202 -0
checkpoint-796/adapter_config.json +33 -0
checkpoint-796/adapter_model.safetensors +3 -0
checkpoint-796/optimizer.pt +3 -0
checkpoint-796/rng_state.pth +3 -0
checkpoint-796/scheduler.pt +3 -0
checkpoint-796/trainer_state.json +0 -0
checkpoint-796/training_args.bin +3 -0
config.json +43 -0
runs/Mar13_21-58-24_8711e78fac20/events.out.tfevents.1710367104.8711e78fac20.40.0 +3 -0
runs/Mar13_22-06-09_8711e78fac20/events.out.tfevents.1710367570.8711e78fac20.172.0 +3 -0
special_tokens_map.json +24 -0
tokenizer.model +3 -0
tokenizer_config.json +43 -0

README.md ADDED Viewed

	@@ -0,0 +1,147 @@

+---
+license: apache-2.0
+library_name: peft
+tags:
+- generated_from_trainer
+base_model: openlm-research/open_llama_3b_v2
+model-index:
+- name: qlora-out
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
+<details><summary>See axolotl config</summary>
+axolotl version: `0.4.0`
+```yaml
+base_model: openlm-research/open_llama_3b_v2
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+push_dataset_to_hub:
+datasets:
+  - path: mhenrichsen/alpaca_2k_test
+    type: alpaca
+dataset_prepared_path:
+val_set_size: 0.05
+adapter: qlora
+lora_model_dir:
+sequence_len: 1024
+sample_packing: true
+lora_r: 8
+lora_alpha: 32
+lora_dropout: 0.05
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+output_dir: ./qlora-out
+gradient_accumulation_steps: 1
+micro_batch_size: 2
+num_epochs: 4
+optimizer: paged_adamw_32bit
+torchdistx_path:
+lr_scheduler: cosine
+learning_rate: 0.0002
+train_on_inputs: false
+group_by_length: false
+bf16: false
+fp16: true
+tf32: false
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+gptq_groupsize:
+gptq_model_v1:
+warmup_steps: 20
+evals_per_epoch: 4
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.1
+fsdp:
+fsdp_config:
+special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
+```
+</details><br>
+# qlora-out
+This model is a fine-tuned version of [openlm-research/open_llama_3b_v2](https://huggingface.co/openlm-research/open_llama_3b_v2) on the [mhenrichsen/alpaca_2k_test](https://huggingface.co/datasets/mhenrichsen/alpaca_2k_test) dataset.
+It achieves the following results on the evaluation set:
+- Loss: 1.4177
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
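+Per the axolotl config above, the model was trained on `mhenrichsen/alpaca_2k_test` (prompt type `alpaca`), with 5% of the data held out as the evaluation set (`val_set_size: 0.05`). Sequences were packed to a length of 1024 tokens (`sample_packing: true`, `sequence_len: 1024`). More information needed on the dataset contents.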
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 2
+- eval_batch_size: 2
+- seed: 42
+- optimizer: paged_adamw_32bit (paged AdamW) with betas=(0.9,0.999), epsilon=1e-08 and weight_decay=0.1
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 20
+- num_epochs: 4
+- mixed_precision_training: Native AMP
+### Training results
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 1.3026        | 0.01  | 1    | 1.3435          |
+| 1.1146        | 0.25  | 50   | 1.1476          |
+| 1.2387        | 0.5   | 100  | 1.1319          |
+| 1.4159        | 0.75  | 150  | 1.1192          |
+| 1.2807        | 1.01  | 200  | 1.1153          |
+| 1.0465        | 1.24  | 250  | 1.1569          |
+| 0.9577        | 1.49  | 300  | 1.1493          |
+| 1.1257        | 1.74  | 350  | 1.1462          |
+| 0.9404        | 1.99  | 400  | 1.1520          |
+| 0.7161        | 2.22  | 450  | 1.2603          |
+| 0.5897        | 2.47  | 500  | 1.2661          |
+| 0.5271        | 2.72  | 550  | 1.2814          |
+| 0.6239        | 2.97  | 600  | 1.2705          |
+| 0.3486        | 3.21  | 650  | 1.3848          |
+| 0.5591        | 3.46  | 700  | 1.4171          |
+| 0.3804        | 3.71  | 750  | 1.4177          |
+### Framework versions
+- PEFT 0.9.0
+- Transformers 4.38.2
+- Pytorch 2.1.2+cu118
+- Datasets 2.18.0
+- Tokenizers 0.15.0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "openlm-research/open_llama_3b_v2",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "q_proj",
+    "down_proj",
+    "up_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ccf1d2eb3fe8099e23e6c42a359689426d253e71d33c399490b017b152b6d6e7
+size 50982842

checkpoint-398/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: openlm-research/open_llama_3b_v2
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.9.0

checkpoint-398/adapter_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "openlm-research/open_llama_3b_v2",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "q_proj",
+    "down_proj",
+    "up_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-398/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b6d7016774d54258741e87e7a11113ba7bfdad918c9560a3585544926ce0ee7
+size 50899792

checkpoint-398/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:58eb8a13f40a6bfa712cb3c9c7e1633673cb857b530d9a99f7237b6c46c703f9
+size 101919290

checkpoint-398/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b89fc1d394f02fc6ad0be146152ad044e5c16ee7b8e840eed13f3689d19910c5
+size 14244

checkpoint-398/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:07f80e9e26e4de9744278d23fb86ee2e353bf61ec1fe7a592c6b05e590e547be
+size 1064

checkpoint-398/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2871 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.979899497487437,
+  "eval_steps": 50,
+  "global_step": 398,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.6762334704399109,
+      "learning_rate": 1e-05,
+      "loss": 1.3026,
+      "step": 1
+    },
+    {
+      "epoch": 0.01,
+      "eval_loss": 1.343465805053711,
+      "eval_runtime": 2.9584,
+      "eval_samples_per_second": 33.802,
+      "eval_steps_per_second": 16.901,
+      "step": 1
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.7722721695899963,
+      "learning_rate": 2e-05,
+      "loss": 1.5419,
+      "step": 2
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.6532348394393921,
+      "learning_rate": 3e-05,
+      "loss": 1.4429,
+      "step": 3
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.8427589535713196,
+      "learning_rate": 4e-05,
+      "loss": 1.4,
+      "step": 4
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.9355791807174683,
+      "learning_rate": 5e-05,
+      "loss": 1.2583,
+      "step": 5
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.6357808113098145,
+      "learning_rate": 6e-05,
+      "loss": 1.2655,
+      "step": 6
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.963829517364502,
+      "learning_rate": 7e-05,
+      "loss": 1.42,
+      "step": 7
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6698102951049805,
+      "learning_rate": 8e-05,
+      "loss": 1.3938,
+      "step": 8
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.5394894480705261,
+      "learning_rate": 9e-05,
+      "loss": 1.2234,
+      "step": 9
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.8773290514945984,
+      "learning_rate": 0.0001,
+      "loss": 1.4257,
+      "step": 10
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.7960235476493835,
+      "learning_rate": 0.00011000000000000002,
+      "loss": 1.4272,
+      "step": 11
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.7909610867500305,
+      "learning_rate": 0.00012,
+      "loss": 1.352,
+      "step": 12
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.8417578339576721,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 1.2048,
+      "step": 13
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.8076886534690857,
+      "learning_rate": 0.00014,
+      "loss": 1.4186,
+      "step": 14
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.7543106079101562,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.0873,
+      "step": 15
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.9430835247039795,
+      "learning_rate": 0.00016,
+      "loss": 1.4061,
+      "step": 16
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.7473496794700623,
+      "learning_rate": 0.00017,
+      "loss": 1.1407,
+      "step": 17
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.8123806715011597,
+      "learning_rate": 0.00018,
+      "loss": 1.4394,
+      "step": 18
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.7778059244155884,
+      "learning_rate": 0.00019,
+      "loss": 1.2752,
+      "step": 19
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.7027471661567688,
+      "learning_rate": 0.0002,
+      "loss": 1.3107,
+      "step": 20
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.8443830609321594,
+      "learning_rate": 0.00019999918050612108,
+      "loss": 1.2204,
+      "step": 21
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.6853266358375549,
+      "learning_rate": 0.00019999672203791565,
+      "loss": 1.2231,
+      "step": 22
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.8757483959197998,
+      "learning_rate": 0.00019999262463567773,
+      "loss": 1.2069,
+      "step": 23
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.7184014320373535,
+      "learning_rate": 0.00019998688836656323,
+      "loss": 1.2124,
+      "step": 24
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.6530072093009949,
+      "learning_rate": 0.0001999795133245889,
+      "loss": 1.1672,
+      "step": 25
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.7211533188819885,
+      "learning_rate": 0.0001999704996306308,
+      "loss": 1.3207,
+      "step": 26
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.7048207521438599,
+      "learning_rate": 0.00019995984743242226,
+      "loss": 1.2003,
+      "step": 27
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.6881248354911804,
+      "learning_rate": 0.00019994755690455152,
+      "loss": 1.117,
+      "step": 28
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.7877801656723022,
+      "learning_rate": 0.00019993362824845875,
+      "loss": 1.0531,
+      "step": 29
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.749905526638031,
+      "learning_rate": 0.000199918061692433,
+      "loss": 1.1462,
+      "step": 30
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.67184978723526,
+      "learning_rate": 0.00019990085749160822,
+      "loss": 1.0939,
+      "step": 31
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.6622844934463501,
+      "learning_rate": 0.0001998820159279591,
+      "loss": 1.1369,
+      "step": 32
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.763306736946106,
+      "learning_rate": 0.00019986153731029656,
+      "loss": 1.3525,
+      "step": 33
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.6171010136604309,
+      "learning_rate": 0.0001998394219742627,
+      "loss": 0.8807,
+      "step": 34
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.7575845718383789,
+      "learning_rate": 0.00019981567028232514,
+      "loss": 1.206,
+      "step": 35
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5694592595100403,
+      "learning_rate": 0.00019979028262377118,
+      "loss": 0.9079,
+      "step": 36
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.7056426405906677,
+      "learning_rate": 0.00019976325941470146,
+      "loss": 1.1133,
+      "step": 37
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.6812122464179993,
+      "learning_rate": 0.00019973460109802305,
+      "loss": 1.2707,
+      "step": 38
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5790569186210632,
+      "learning_rate": 0.0001997043081434423,
+      "loss": 1.0047,
+      "step": 39
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.6529936790466309,
+      "learning_rate": 0.00019967238104745696,
+      "loss": 1.0917,
+      "step": 40
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.6274911165237427,
+      "learning_rate": 0.00019963882033334826,
+      "loss": 1.2586,
+      "step": 41
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.6666668653488159,
+      "learning_rate": 0.00019960362655117218,
+      "loss": 1.1187,
+      "step": 42
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.6239954233169556,
+      "learning_rate": 0.00019956680027775051,
+      "loss": 1.0343,
+      "step": 43
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.6892250180244446,
+      "learning_rate": 0.0001995283421166614,
+      "loss": 1.0254,
+      "step": 44
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.7392664551734924,
+      "learning_rate": 0.00019948825269822934,
+      "loss": 1.0592,
+      "step": 45
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.7541553378105164,
+      "learning_rate": 0.00019944653267951504,
+      "loss": 1.2297,
+      "step": 46
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.685874342918396,
+      "learning_rate": 0.00019940318274430449,
+      "loss": 1.321,
+      "step": 47
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.7901135087013245,
+      "learning_rate": 0.00019935820360309777,
+      "loss": 1.2583,
+      "step": 48
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.6619594693183899,
+      "learning_rate": 0.00019931159599309757,
+      "loss": 0.9762,
+      "step": 49
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.6059371829032898,
+      "learning_rate": 0.00019926336067819684,
+      "loss": 1.1146,
+      "step": 50
+    },
+    {
+      "epoch": 0.25,
+      "eval_loss": 1.1476221084594727,
+      "eval_runtime": 2.9589,
+      "eval_samples_per_second": 33.796,
+      "eval_steps_per_second": 16.898,
+      "step": 50
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.6533025503158569,
+      "learning_rate": 0.00019921349844896654,
+      "loss": 1.2439,
+      "step": 51
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.5473713278770447,
+      "learning_rate": 0.00019916201012264254,
+      "loss": 0.8464,
+      "step": 52
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.6035101413726807,
+      "learning_rate": 0.00019910889654311208,
+      "loss": 1.1297,
+      "step": 53
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.7092946767807007,
+      "learning_rate": 0.00019905415858090036,
+      "loss": 1.0365,
+      "step": 54
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.602556049823761,
+      "learning_rate": 0.00019899779713315575,
+      "loss": 1.1238,
+      "step": 55
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.6566863059997559,
+      "learning_rate": 0.00019893981312363562,
+      "loss": 1.1097,
+      "step": 56
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.6582695245742798,
+      "learning_rate": 0.00019888020750269067,
+      "loss": 1.3681,
+      "step": 57
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.509901225566864,
+      "learning_rate": 0.00019881898124724981,
+      "loss": 0.7163,
+      "step": 58
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.6406445503234863,
+      "learning_rate": 0.0001987561353608038,
+      "loss": 1.1309,
+      "step": 59
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5770175457000732,
+      "learning_rate": 0.00019869167087338907,
+      "loss": 1.1706,
+      "step": 60
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.6582055687904358,
+      "learning_rate": 0.00019862558884157068,
+      "loss": 1.1121,
+      "step": 61
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.7646100521087646,
+      "learning_rate": 0.00019855789034842504,
+      "loss": 1.1313,
+      "step": 62
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.7127470970153809,
+      "learning_rate": 0.00019848857650352214,
+      "loss": 1.258,
+      "step": 63
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5721624493598938,
+      "learning_rate": 0.00019841764844290744,
+      "loss": 1.0163,
+      "step": 64
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.6494898796081543,
+      "learning_rate": 0.00019834510732908315,
+      "loss": 1.1974,
+      "step": 65
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.6703062057495117,
+      "learning_rate": 0.00019827095435098925,
+      "loss": 1.1376,
+      "step": 66
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.696711003780365,
+      "learning_rate": 0.000198195190723984,
+      "loss": 0.9931,
+      "step": 67
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.6563432216644287,
+      "learning_rate": 0.0001981178176898239,
+      "loss": 1.2047,
+      "step": 68
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.7269361019134521,
+      "learning_rate": 0.0001980388365166436,
+      "loss": 1.6113,
+      "step": 69
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.6356198191642761,
+      "learning_rate": 0.0001979582484989348,
+      "loss": 1.3778,
+      "step": 70
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.6009278893470764,
+      "learning_rate": 0.00019787605495752528,
+      "loss": 1.2131,
+      "step": 71
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.49109163880348206,
+      "learning_rate": 0.00019779225723955707,
+      "loss": 0.8246,
+      "step": 72
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5709823966026306,
+      "learning_rate": 0.00019770685671846456,
+      "loss": 1.0578,
+      "step": 73
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5613502860069275,
+      "learning_rate": 0.0001976198547939518,
+      "loss": 0.8883,
+      "step": 74
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.750335156917572,
+      "learning_rate": 0.0001975312528919697,
+      "loss": 1.1836,
+      "step": 75
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.6157568693161011,
+      "learning_rate": 0.00019744105246469263,
+      "loss": 1.0637,
+      "step": 76
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.6417941451072693,
+      "learning_rate": 0.00019734925499049447,
+      "loss": 1.2824,
+      "step": 77
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.8214441537857056,
+      "learning_rate": 0.0001972558619739246,
+      "loss": 1.1942,
+      "step": 78
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.6943228244781494,
+      "learning_rate": 0.00019716087494568317,
+      "loss": 1.3261,
+      "step": 79
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.739622950553894,
+      "learning_rate": 0.00019706429546259593,
+      "loss": 1.2639,
+      "step": 80
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.6374944448471069,
+      "learning_rate": 0.00019696612510758876,
+      "loss": 0.9929,
+      "step": 81
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.7595279812812805,
+      "learning_rate": 0.00019686636548966178,
+      "loss": 1.2859,
+      "step": 82
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.6465960144996643,
+      "learning_rate": 0.00019676501824386294,
+      "loss": 1.0333,
+      "step": 83
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.7063401341438293,
+      "learning_rate": 0.00019666208503126112,
+      "loss": 1.2189,
+      "step": 84
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.631826639175415,
+      "learning_rate": 0.00019655756753891916,
+      "loss": 1.2583,
+      "step": 85
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.6506052017211914,
+      "learning_rate": 0.0001964514674798659,
+      "loss": 1.2019,
+      "step": 86
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.7421661615371704,
+      "learning_rate": 0.00019634378659306832,
+      "loss": 1.2122,
+      "step": 87
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.5749310851097107,
+      "learning_rate": 0.00019623452664340306,
+      "loss": 1.0522,
+      "step": 88
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.6523499488830566,
+      "learning_rate": 0.0001961236894216272,
+      "loss": 1.2135,
+      "step": 89
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.5970554947853088,
+      "learning_rate": 0.00019601127674434928,
+      "loss": 1.0297,
+      "step": 90
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.587348461151123,
+      "learning_rate": 0.00019589729045399934,
+      "loss": 1.0214,
+      "step": 91
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.6518609523773193,
+      "learning_rate": 0.00019578173241879872,
+      "loss": 0.9928,
+      "step": 92
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.7513082027435303,
+      "learning_rate": 0.00019566460453272945,
+      "loss": 1.1204,
+      "step": 93
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.8648024201393127,
+      "learning_rate": 0.0001955459087155033,
+      "loss": 1.3671,
+      "step": 94
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.6207080483436584,
+      "learning_rate": 0.0001954256469125301,
+      "loss": 1.1286,
+      "step": 95
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.6174007058143616,
+      "learning_rate": 0.0001953038210948861,
+      "loss": 1.145,
+      "step": 96
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.6160337328910828,
+      "learning_rate": 0.00019518043325928157,
+      "loss": 1.2688,
+      "step": 97
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.662702202796936,
+      "learning_rate": 0.00019505548542802804,
+      "loss": 1.1212,
+      "step": 98
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.7133952379226685,
+      "learning_rate": 0.00019492897964900512,
+      "loss": 1.0514,
+      "step": 99
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.7767614126205444,
+      "learning_rate": 0.00019480091799562704,
+      "loss": 1.2387,
+      "step": 100
+    },
+    {
+      "epoch": 0.5,
+      "eval_loss": 1.1319388151168823,
+      "eval_runtime": 2.9089,
+      "eval_samples_per_second": 34.377,
+      "eval_steps_per_second": 17.189,
+      "step": 100
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.6398429870605469,
+      "learning_rate": 0.00019467130256680868,
+      "loss": 1.0076,
+      "step": 101
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.6510715484619141,
+      "learning_rate": 0.00019454013548693102,
+      "loss": 1.2372,
+      "step": 102
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.7204650044441223,
+      "learning_rate": 0.00019440741890580643,
+      "loss": 1.0999,
+      "step": 103
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.6531095504760742,
+      "learning_rate": 0.00019427315499864344,
+      "loss": 1.1123,
+      "step": 104
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.5871708989143372,
+      "learning_rate": 0.00019413734596601104,
+      "loss": 1.2162,
+      "step": 105
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.6323477625846863,
+      "learning_rate": 0.00019399999403380266,
+      "loss": 1.1369,
+      "step": 106
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.6977123618125916,
+      "learning_rate": 0.00019386110145319963,
+      "loss": 1.0952,
+      "step": 107
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.6638639569282532,
+      "learning_rate": 0.00019372067050063438,
+      "loss": 1.1125,
+      "step": 108
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.6010698676109314,
+      "learning_rate": 0.000193578703477753,
+      "loss": 1.1715,
+      "step": 109
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.5837023258209229,
+      "learning_rate": 0.00019343520271137763,
+      "loss": 0.8489,
+      "step": 110
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.6870157718658447,
+      "learning_rate": 0.0001932901705534683,
+      "loss": 1.0953,
+      "step": 111
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.5713046789169312,
+      "learning_rate": 0.00019314360938108425,
+      "loss": 1.1113,
+      "step": 112
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.5966447591781616,
+      "learning_rate": 0.00019299552159634517,
+      "loss": 1.2646,
+      "step": 113
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.6116918921470642,
+      "learning_rate": 0.00019284590962639176,
+      "loss": 1.0807,
+      "step": 114
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.5885886549949646,
+      "learning_rate": 0.0001926947759233459,
+      "loss": 0.9551,
+      "step": 115
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.5844876766204834,
+      "learning_rate": 0.00019254212296427044,
+      "loss": 1.0009,
+      "step": 116
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.5967299342155457,
+      "learning_rate": 0.0001923879532511287,
+      "loss": 0.863,
+      "step": 117
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.543732762336731,
+      "learning_rate": 0.0001922322693107434,
+      "loss": 0.8331,
+      "step": 118
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.6925728917121887,
+      "learning_rate": 0.0001920750736947553,
+      "loss": 1.1044,
+      "step": 119
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.5720507502555847,
+      "learning_rate": 0.00019191636897958122,
+      "loss": 1.2173,
+      "step": 120
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.6664772033691406,
+      "learning_rate": 0.0001917561577663721,
+      "loss": 0.9849,
+      "step": 121
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.6026978492736816,
+      "learning_rate": 0.00019159444268097012,
+      "loss": 1.2952,
+      "step": 122
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.6648169755935669,
+      "learning_rate": 0.00019143122637386566,
+      "loss": 0.8417,
+      "step": 123
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.7643215656280518,
+      "learning_rate": 0.00019126651152015403,
+      "loss": 1.1142,
+      "step": 124
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.6389123797416687,
+      "learning_rate": 0.00019110030081949156,
+      "loss": 1.2387,
+      "step": 125
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.7826026678085327,
+      "learning_rate": 0.00019093259699605125,
+      "loss": 1.1407,
+      "step": 126
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.6801394820213318,
+      "learning_rate": 0.0001907634027984782,
+      "loss": 0.932,
+      "step": 127
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.6450052857398987,
+      "learning_rate": 0.0001905927209998447,
+      "loss": 1.3197,
+      "step": 128
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.6216878890991211,
+      "learning_rate": 0.00019042055439760444,
+      "loss": 1.2593,
+      "step": 129
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.6000977158546448,
+      "learning_rate": 0.000190246905813547,
+      "loss": 0.9974,
+      "step": 130
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.5806196928024292,
+      "learning_rate": 0.0001900717780937514,
+      "loss": 1.1792,
+      "step": 131
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.6986164450645447,
+      "learning_rate": 0.00018989517410853955,
+      "loss": 1.252,
+      "step": 132
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.6852320432662964,
+      "learning_rate": 0.0001897170967524291,
+      "loss": 1.098,
+      "step": 133
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.6186272501945496,
+      "learning_rate": 0.00018953754894408616,
+      "loss": 1.1099,
+      "step": 134
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.7196840643882751,
+      "learning_rate": 0.0001893565336262773,
+      "loss": 1.1809,
+      "step": 135
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.6523413062095642,
+      "learning_rate": 0.00018917405376582145,
+      "loss": 1.2383,
+      "step": 136
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.7788291573524475,
+      "learning_rate": 0.00018899011235354115,
+      "loss": 1.023,
+      "step": 137
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.5616946220397949,
+      "learning_rate": 0.00018880471240421365,
+      "loss": 0.8242,
+      "step": 138
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.6670994758605957,
+      "learning_rate": 0.00018861785695652142,
+      "loss": 1.2797,
+      "step": 139
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.6285648345947266,
+      "learning_rate": 0.00018842954907300236,
+      "loss": 1.0959,
+      "step": 140
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.6495100855827332,
+      "learning_rate": 0.00018823979183999964,
+      "loss": 1.1426,
+      "step": 141
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.7513198256492615,
+      "learning_rate": 0.00018804858836761107,
+      "loss": 1.2578,
+      "step": 142
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.5422288775444031,
+      "learning_rate": 0.0001878559417896382,
+      "loss": 0.9833,
+      "step": 143
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.605277419090271,
+      "learning_rate": 0.0001876618552635348,
+      "loss": 1.2323,
+      "step": 144
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.7177323698997498,
+      "learning_rate": 0.00018746633197035527,
+      "loss": 1.2153,
+      "step": 145
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.5417729020118713,
+      "learning_rate": 0.00018726937511470246,
+      "loss": 0.9367,
+      "step": 146
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.6895157098770142,
+      "learning_rate": 0.00018707098792467515,
+      "loss": 1.3363,
+      "step": 147
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.5565975308418274,
+      "learning_rate": 0.00018687117365181512,
+      "loss": 1.0385,
+      "step": 148
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.7168130278587341,
+      "learning_rate": 0.00018666993557105377,
+      "loss": 1.2281,
+      "step": 149
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.839598536491394,
+      "learning_rate": 0.00018646727698065865,
+      "loss": 1.4159,
+      "step": 150
+    },
+    {
+      "epoch": 0.75,
+      "eval_loss": 1.119249939918518,
+      "eval_runtime": 2.9417,
+      "eval_samples_per_second": 33.994,
+      "eval_steps_per_second": 16.997,
+      "step": 150
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.5981218814849854,
+      "learning_rate": 0.00018626320120217923,
+      "loss": 1.0671,
+      "step": 151
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.6944805383682251,
+      "learning_rate": 0.00018605771158039253,
+      "loss": 1.3229,
+      "step": 152
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.6238952875137329,
+      "learning_rate": 0.00018585081148324832,
+      "loss": 1.1578,
+      "step": 153
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.6363958120346069,
+      "learning_rate": 0.00018564250430181387,
+      "loss": 1.3265,
+      "step": 154
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.5761409401893616,
+      "learning_rate": 0.00018543279345021834,
+      "loss": 1.1844,
+      "step": 155
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.810093104839325,
+      "learning_rate": 0.00018522168236559695,
+      "loss": 1.2033,
+      "step": 156
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.7487497329711914,
+      "learning_rate": 0.0001850091745080345,
+      "loss": 1.1043,
+      "step": 157
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.6162795424461365,
+      "learning_rate": 0.00018479527336050878,
+      "loss": 1.2486,
+      "step": 158
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.5720970034599304,
+      "learning_rate": 0.00018457998242883344,
+      "loss": 1.0381,
+      "step": 159
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.6686292886734009,
+      "learning_rate": 0.00018436330524160047,
+      "loss": 1.502,
+      "step": 160
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.5931655764579773,
+      "learning_rate": 0.00018414524535012244,
+      "loss": 1.0813,
+      "step": 161
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.6548634171485901,
+      "learning_rate": 0.00018392580632837423,
+      "loss": 1.3147,
+      "step": 162
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.559681236743927,
+      "learning_rate": 0.00018370499177293464,
+      "loss": 1.1096,
+      "step": 163
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.6365666389465332,
+      "learning_rate": 0.00018348280530292713,
+      "loss": 1.2062,
+      "step": 164
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.616242527961731,
+      "learning_rate": 0.00018325925055996076,
+      "loss": 1.1219,
+      "step": 165
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.6588903069496155,
+      "learning_rate": 0.0001830343312080704,
+      "loss": 1.2697,
+      "step": 166
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.5880855321884155,
+      "learning_rate": 0.00018280805093365672,
+      "loss": 1.1511,
+      "step": 167
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.7549880743026733,
+      "learning_rate": 0.00018258041344542566,
+      "loss": 1.2181,
+      "step": 168
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.6862443089485168,
+      "learning_rate": 0.00018235142247432782,
+      "loss": 1.8496,
+      "step": 169
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.5903118848800659,
+      "learning_rate": 0.0001821210817734972,
+      "loss": 1.2092,
+      "step": 170
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.6936279535293579,
+      "learning_rate": 0.00018188939511818965,
+      "loss": 1.0635,
+      "step": 171
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.6887457370758057,
+      "learning_rate": 0.0001816563663057211,
+      "loss": 0.9387,
+      "step": 172
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.6930254101753235,
+      "learning_rate": 0.00018142199915540527,
+      "loss": 1.1651,
+      "step": 173
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.6529977321624756,
+      "learning_rate": 0.00018118629750849105,
+      "loss": 1.2512,
+      "step": 174
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.705954372882843,
+      "learning_rate": 0.0001809492652280996,
+      "loss": 1.2601,
+      "step": 175
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.6263706088066101,
+      "learning_rate": 0.00018071090619916093,
+      "loss": 1.0446,
+      "step": 176
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.7754440307617188,
+      "learning_rate": 0.00018047122432835038,
+      "loss": 1.2517,
+      "step": 177
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.6904909610748291,
+      "learning_rate": 0.0001802302235440245,
+      "loss": 1.3028,
+      "step": 178
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.6373815536499023,
+      "learning_rate": 0.0001799879077961566,
+      "loss": 0.7538,
+      "step": 179
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.6192349791526794,
+      "learning_rate": 0.00017974428105627208,
+      "loss": 1.1583,
+      "step": 180
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.6500440239906311,
+      "learning_rate": 0.00017949934731738347,
+      "loss": 1.189,
+      "step": 181
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.5701293349266052,
+      "learning_rate": 0.0001792531105939247,
+      "loss": 0.9937,
+      "step": 182
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.6383854150772095,
+      "learning_rate": 0.0001790055749216856,
+      "loss": 1.0381,
+      "step": 183
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.7212352156639099,
+      "learning_rate": 0.00017875674435774547,
+      "loss": 1.2023,
+      "step": 184
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.7195665836334229,
+      "learning_rate": 0.00017850662298040678,
+      "loss": 1.4138,
+      "step": 185
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.6174137592315674,
+      "learning_rate": 0.0001782552148891283,
+      "loss": 0.8007,
+      "step": 186
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.672179102897644,
+      "learning_rate": 0.00017800252420445788,
+      "loss": 1.1403,
+      "step": 187
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.6487817168235779,
+      "learning_rate": 0.00017774855506796496,
+      "loss": 1.169,
+      "step": 188
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.7027740478515625,
+      "learning_rate": 0.0001774933116421725,
+      "loss": 1.2268,
+      "step": 189
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.7178415060043335,
+      "learning_rate": 0.00017723679811048904,
+      "loss": 1.2785,
+      "step": 190
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.682354748249054,
+      "learning_rate": 0.00017697901867713995,
+      "loss": 1.2195,
+      "step": 191
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.7199010252952576,
+      "learning_rate": 0.00017671997756709863,
+      "loss": 1.4132,
+      "step": 192
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.7743118405342102,
+      "learning_rate": 0.0001764596790260171,
+      "loss": 0.9824,
+      "step": 193
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.7540227174758911,
+      "learning_rate": 0.00017619812732015664,
+      "loss": 1.0527,
+      "step": 194
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.6113067269325256,
+      "learning_rate": 0.00017593532673631766,
+      "loss": 1.2446,
+      "step": 195
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.6951828598976135,
+      "learning_rate": 0.00017567128158176953,
+      "loss": 1.3333,
+      "step": 196
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.570866584777832,
+      "learning_rate": 0.00017540599618418007,
+      "loss": 1.0012,
+      "step": 197
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.5432811379432678,
+      "learning_rate": 0.00017513947489154443,
+      "loss": 1.1343,
+      "step": 198
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.6711558103561401,
+      "learning_rate": 0.00017487172207211396,
+      "loss": 1.0945,
+      "step": 199
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.675626814365387,
+      "learning_rate": 0.0001746027421143246,
+      "loss": 1.2807,
+      "step": 200
+    },
+    {
+      "epoch": 1.01,
+      "eval_loss": 1.1153115034103394,
+      "eval_runtime": 3.0007,
+      "eval_samples_per_second": 33.326,
+      "eval_steps_per_second": 16.663,
+      "step": 200
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.6204088926315308,
+      "learning_rate": 0.00017433253942672496,
+      "loss": 1.2167,
+      "step": 201
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.6080848574638367,
+      "learning_rate": 0.000174061118437904,
+      "loss": 0.979,
+      "step": 202
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.8325397372245789,
+      "learning_rate": 0.00017378848359641847,
+      "loss": 0.9095,
+      "step": 203
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.6108893752098083,
+      "learning_rate": 0.00017351463937072004,
+      "loss": 1.0784,
+      "step": 204
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.6140009164810181,
+      "learning_rate": 0.00017323959024908209,
+      "loss": 1.131,
+      "step": 205
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.7503536343574524,
+      "learning_rate": 0.00017296334073952605,
+      "loss": 1.0152,
+      "step": 206
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.6903036236763,
+      "learning_rate": 0.0001726858953697475,
+      "loss": 1.1751,
+      "step": 207
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.6842136979103088,
+      "learning_rate": 0.00017240725868704218,
+      "loss": 0.9362,
+      "step": 208
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.6317443251609802,
+      "learning_rate": 0.00017212743525823112,
+      "loss": 1.0199,
+      "step": 209
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.6331597566604614,
+      "learning_rate": 0.0001718464296695861,
+      "loss": 0.8634,
+      "step": 210
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.7953663468360901,
+      "learning_rate": 0.0001715642465267543,
+      "loss": 1.0635,
+      "step": 211
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.6130046248435974,
+      "learning_rate": 0.00017128089045468294,
+      "loss": 0.8426,
+      "step": 212
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.5984789729118347,
+      "learning_rate": 0.00017099636609754329,
+      "loss": 0.7435,
+      "step": 213
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.8032707571983337,
+      "learning_rate": 0.00017071067811865476,
+      "loss": 0.9271,
+      "step": 214
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.78273606300354,
+      "learning_rate": 0.00017042383120040834,
+      "loss": 0.8695,
+      "step": 215
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.7779294848442078,
+      "learning_rate": 0.00017013583004418993,
+      "loss": 1.085,
+      "step": 216
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.7201984524726868,
+      "learning_rate": 0.00016984667937030318,
+      "loss": 0.8079,
+      "step": 217
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.6246169805526733,
+      "learning_rate": 0.00016955638391789228,
+      "loss": 0.7941,
+      "step": 218
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.7627923488616943,
+      "learning_rate": 0.00016926494844486412,
+      "loss": 0.9281,
+      "step": 219
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.6979169249534607,
+      "learning_rate": 0.00016897237772781044,
+      "loss": 0.8461,
+      "step": 220
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.7872811555862427,
+      "learning_rate": 0.00016867867656192946,
+      "loss": 0.9413,
+      "step": 221
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.7482172846794128,
+      "learning_rate": 0.00016838384976094738,
+      "loss": 0.9107,
+      "step": 222
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.8587368130683899,
+      "learning_rate": 0.00016808790215703935,
+      "loss": 0.9886,
+      "step": 223
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.732606828212738,
+      "learning_rate": 0.00016779083860075033,
+      "loss": 0.6831,
+      "step": 224
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.9272279143333435,
+      "learning_rate": 0.0001674926639609157,
+      "loss": 1.1396,
+      "step": 225
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.6473172307014465,
+      "learning_rate": 0.00016719338312458124,
+      "loss": 0.8299,
+      "step": 226
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.8427954316139221,
+      "learning_rate": 0.00016689300099692332,
+      "loss": 0.9203,
+      "step": 227
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.8205825090408325,
+      "learning_rate": 0.00016659152250116812,
+      "loss": 0.8532,
+      "step": 228
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.7522780299186707,
+      "learning_rate": 0.00016628895257851135,
+      "loss": 0.7687,
+      "step": 229
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.8582683205604553,
+      "learning_rate": 0.000165985296188037,
+      "loss": 0.9217,
+      "step": 230
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.8408709168434143,
+      "learning_rate": 0.0001656805583066361,
+      "loss": 1.0371,
+      "step": 231
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.9851942658424377,
+      "learning_rate": 0.00016537474392892528,
+      "loss": 1.044,
+      "step": 232
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.8463842868804932,
+      "learning_rate": 0.00016506785806716465,
+      "loss": 0.9521,
+      "step": 233
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.825255811214447,
+      "learning_rate": 0.00016475990575117605,
+      "loss": 0.8524,
+      "step": 234
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 1.1519947052001953,
+      "learning_rate": 0.0001644508920282601,
+      "loss": 0.9906,
+      "step": 235
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.8102966547012329,
+      "learning_rate": 0.000164140821963114,
+      "loss": 0.9192,
+      "step": 236
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 1.0159798860549927,
+      "learning_rate": 0.0001638297006377481,
+      "loss": 1.0234,
+      "step": 237
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.0157923698425293,
+      "learning_rate": 0.00016351753315140287,
+      "loss": 0.8921,
+      "step": 238
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.8466264009475708,
+      "learning_rate": 0.00016320432462046516,
+      "loss": 0.7098,
+      "step": 239
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.8298121690750122,
+      "learning_rate": 0.00016289008017838445,
+      "loss": 0.8517,
+      "step": 240
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.2163349390029907,
+      "learning_rate": 0.00016257480497558873,
+      "loss": 1.1172,
+      "step": 241
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.9839556217193604,
+      "learning_rate": 0.0001622585041793999,
+      "loss": 1.1022,
+      "step": 242
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.7986888289451599,
+      "learning_rate": 0.00016194118297394936,
+      "loss": 0.7826,
+      "step": 243
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.9318971037864685,
+      "learning_rate": 0.00016162284656009274,
+      "loss": 0.8899,
+      "step": 244
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 1.0234252214431763,
+      "learning_rate": 0.00016130350015532496,
+      "loss": 0.8831,
+      "step": 245
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.8264230489730835,
+      "learning_rate": 0.00016098314899369446,
+      "loss": 1.1389,
+      "step": 246
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.8845193982124329,
+      "learning_rate": 0.0001606617983257176,
+      "loss": 1.0822,
+      "step": 247
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.9044338464736938,
+      "learning_rate": 0.00016033945341829248,
+      "loss": 1.0556,
+      "step": 248
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.9660309553146362,
+      "learning_rate": 0.00016001611955461265,
+      "loss": 1.0331,
+      "step": 249
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 1.0728851556777954,
+      "learning_rate": 0.0001596918020340805,
+      "loss": 1.0465,
+      "step": 250
+    },
+    {
+      "epoch": 1.24,
+      "eval_loss": 1.1568788290023804,
+      "eval_runtime": 2.9063,
+      "eval_samples_per_second": 34.408,
+      "eval_steps_per_second": 17.204,
+      "step": 250
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.9447798728942871,
+      "learning_rate": 0.00015936650617222063,
+      "loss": 0.9487,
+      "step": 251
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.0429767370224,
+      "learning_rate": 0.00015904023730059228,
+      "loss": 1.006,
+      "step": 252
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.9871753454208374,
+      "learning_rate": 0.00015871300076670234,
+      "loss": 0.9494,
+      "step": 253
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.7644299268722534,
+      "learning_rate": 0.00015838480193391754,
+      "loss": 0.6077,
+      "step": 254
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 1.1654846668243408,
+      "learning_rate": 0.0001580556461813766,
+      "loss": 1.0632,
+      "step": 255
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 1.0508393049240112,
+      "learning_rate": 0.00015772553890390197,
+      "loss": 0.8754,
+      "step": 256
+    },
+    {
+      "epoch": 1.27,
+      "grad_norm": 0.8676743507385254,
+      "learning_rate": 0.0001573944855119115,
+      "loss": 1.007,
+      "step": 257
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.178464412689209,
+      "learning_rate": 0.00015706249143132982,
+      "loss": 1.041,
+      "step": 258
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 1.0226370096206665,
+      "learning_rate": 0.00015672956210349923,
+      "loss": 1.1114,
+      "step": 259
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 0.9840787649154663,
+      "learning_rate": 0.00015639570298509064,
+      "loss": 0.9043,
+      "step": 260
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.0564519166946411,
+      "learning_rate": 0.0001560609195480142,
+      "loss": 0.9696,
+      "step": 261
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.9174713492393494,
+      "learning_rate": 0.00015572521727932935,
+      "loss": 0.9849,
+      "step": 262
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 0.7333153486251831,
+      "learning_rate": 0.00015538860168115527,
+      "loss": 0.7286,
+      "step": 263
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 0.9282216429710388,
+      "learning_rate": 0.00015505107827058036,
+      "loss": 0.8975,
+      "step": 264
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 1.003192663192749,
+      "learning_rate": 0.00015471265257957202,
+      "loss": 1.1836,
+      "step": 265
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.8726491928100586,
+      "learning_rate": 0.00015437333015488587,
+      "loss": 0.9313,
+      "step": 266
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.9721888899803162,
+      "learning_rate": 0.00015403311655797492,
+      "loss": 0.8935,
+      "step": 267
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 1.0440247058868408,
+      "learning_rate": 0.0001536920173648984,
+      "loss": 0.9741,
+      "step": 268
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 0.9814698100090027,
+      "learning_rate": 0.00015335003816623028,
+      "loss": 0.8982,
+      "step": 269
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.904926598072052,
+      "learning_rate": 0.00015300718456696778,
+      "loss": 0.8579,
+      "step": 270
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 1.0483490228652954,
+      "learning_rate": 0.00015266346218643947,
+      "loss": 0.8108,
+      "step": 271
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 0.9156501293182373,
+      "learning_rate": 0.000152318876658213,
+      "loss": 0.9442,
+      "step": 272
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 0.9268532395362854,
+      "learning_rate": 0.00015197343363000307,
+      "loss": 0.8243,
+      "step": 273
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.8675321340560913,
+      "learning_rate": 0.00015162713876357858,
+      "loss": 0.7758,
+      "step": 274
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 0.9131675362586975,
+      "learning_rate": 0.00015127999773467002,
+      "loss": 0.8845,
+      "step": 275
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.0260008573532104,
+      "learning_rate": 0.00015093201623287631,
+      "loss": 0.9032,
+      "step": 276
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 1.044528841972351,
+      "learning_rate": 0.00015058319996157172,
+      "loss": 1.0489,
+      "step": 277
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.9959388375282288,
+      "learning_rate": 0.0001502335546378122,
+      "loss": 0.858,
+      "step": 278
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 0.8414021730422974,
+      "learning_rate": 0.00014988308599224183,
+      "loss": 0.782,
+      "step": 279
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 0.9205671548843384,
+      "learning_rate": 0.00014953179976899878,
+      "loss": 0.8376,
+      "step": 280
+    },
+    {
+      "epoch": 1.39,
+      "grad_norm": 0.9481040239334106,
+      "learning_rate": 0.0001491797017256212,
+      "loss": 0.851,
+      "step": 281
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.8266577124595642,
+      "learning_rate": 0.00014882679763295306,
+      "loss": 0.7228,
+      "step": 282
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.0222742557525635,
+      "learning_rate": 0.0001484730932750491,
+      "loss": 0.7955,
+      "step": 283
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 1.0014468431472778,
+      "learning_rate": 0.00014811859444908052,
+      "loss": 0.9107,
+      "step": 284
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 0.9157910346984863,
+      "learning_rate": 0.00014776330696523963,
+      "loss": 1.0208,
+      "step": 285
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.0565227270126343,
+      "learning_rate": 0.00014740723664664483,
+      "loss": 0.6496,
+      "step": 286
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 1.0323175191879272,
+      "learning_rate": 0.00014705038932924503,
+      "loss": 1.0043,
+      "step": 287
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 1.0063213109970093,
+      "learning_rate": 0.00014669277086172406,
+      "loss": 1.1286,
+      "step": 288
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 0.8602890968322754,
+      "learning_rate": 0.00014633438710540489,
+      "loss": 0.7254,
+      "step": 289
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.9782769083976746,
+      "learning_rate": 0.00014597524393415335,
+      "loss": 0.7086,
+      "step": 290
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.9836515784263611,
+      "learning_rate": 0.00014561534723428205,
+      "loss": 0.8405,
+      "step": 291
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 1.0674123764038086,
+      "learning_rate": 0.00014525470290445392,
+      "loss": 1.0317,
+      "step": 292
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.9632031917572021,
+      "learning_rate": 0.00014489331685558525,
+      "loss": 0.9473,
+      "step": 293
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.0105828046798706,
+      "learning_rate": 0.00014453119501074924,
+      "loss": 0.8199,
+      "step": 294
+    },
+    {
+      "epoch": 1.46,
+      "grad_norm": 1.0012938976287842,
+      "learning_rate": 0.00014416834330507856,
+      "loss": 0.9099,
+      "step": 295
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 1.0367400646209717,
+      "learning_rate": 0.00014380476768566824,
+      "loss": 1.0958,
+      "step": 296
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 0.7329337000846863,
+      "learning_rate": 0.00014344047411147818,
+      "loss": 0.6189,
+      "step": 297
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.9014643430709839,
+      "learning_rate": 0.00014307546855323549,
+      "loss": 0.8168,
+      "step": 298
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.7568360567092896,
+      "learning_rate": 0.00014270975699333654,
+      "loss": 0.7857,
+      "step": 299
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 0.9573479890823364,
+      "learning_rate": 0.00014234334542574906,
+      "loss": 0.9577,
+      "step": 300
+    },
+    {
+      "epoch": 1.49,
+      "eval_loss": 1.149274230003357,
+      "eval_runtime": 2.9222,
+      "eval_samples_per_second": 34.22,
+      "eval_steps_per_second": 17.11,
+      "step": 300
+    },
+    {
+      "epoch": 1.49,
+      "grad_norm": 0.8575048446655273,
+      "learning_rate": 0.00014197623985591373,
+      "loss": 0.8521,
+      "step": 301
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.990980863571167,
+      "learning_rate": 0.00014160844630064595,
+      "loss": 1.0642,
+      "step": 302
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.1145374774932861,
+      "learning_rate": 0.00014123997078803707,
+      "loss": 0.8963,
+      "step": 303
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 0.9661507606506348,
+      "learning_rate": 0.00014087081935735564,
+      "loss": 0.9473,
+      "step": 304
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 1.019618272781372,
+      "learning_rate": 0.00014050099805894837,
+      "loss": 0.9048,
+      "step": 305
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.871661365032196,
+      "learning_rate": 0.00014013051295414108,
+      "loss": 0.6644,
+      "step": 306
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.9834782481193542,
+      "learning_rate": 0.00013975937011513932,
+      "loss": 0.9226,
+      "step": 307
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 0.9938518404960632,
+      "learning_rate": 0.00013938757562492873,
+      "loss": 0.9608,
+      "step": 308
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 1.0692541599273682,
+      "learning_rate": 0.00013901513557717553,
+      "loss": 0.9646,
+      "step": 309
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 1.039904236793518,
+      "learning_rate": 0.00013864205607612648,
+      "loss": 0.7799,
+      "step": 310
+    },
+    {
+      "epoch": 1.54,
+      "grad_norm": 0.9138852953910828,
+      "learning_rate": 0.000138268343236509,
+      "loss": 0.8297,
+      "step": 311
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.01775324344635,
+      "learning_rate": 0.00013789400318343068,
+      "loss": 0.8992,
+      "step": 312
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.0052934885025024,
+      "learning_rate": 0.0001375190420522792,
+      "loss": 0.8212,
+      "step": 313
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 1.0567269325256348,
+      "learning_rate": 0.00013714346598862166,
+      "loss": 1.0402,
+      "step": 314
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.8707680702209473,
+      "learning_rate": 0.00013676728114810367,
+      "loss": 0.8864,
+      "step": 315
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 0.959578812122345,
+      "learning_rate": 0.00013639049369634876,
+      "loss": 0.7048,
+      "step": 316
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 1.0675721168518066,
+      "learning_rate": 0.00013601310980885714,
+      "loss": 1.0025,
+      "step": 317
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.8831722736358643,
+      "learning_rate": 0.0001356351356709045,
+      "loss": 0.8058,
+      "step": 318
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 1.0400885343551636,
+      "learning_rate": 0.00013525657747744072,
+      "loss": 1.0273,
+      "step": 319
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.0046364068984985,
+      "learning_rate": 0.00013487744143298822,
+      "loss": 0.8441,
+      "step": 320
+    },
+    {
+      "epoch": 1.59,
+      "grad_norm": 1.029714822769165,
+      "learning_rate": 0.0001344977337515404,
+      "loss": 0.7771,
+      "step": 321
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.8168841004371643,
+      "learning_rate": 0.0001341174606564596,
+      "loss": 0.8024,
+      "step": 322
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.9833108186721802,
+      "learning_rate": 0.00013373662838037537,
+      "loss": 0.9065,
+      "step": 323
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 0.9366996884346008,
+      "learning_rate": 0.00013335524316508208,
+      "loss": 0.9436,
+      "step": 324
+    },
+    {
+      "epoch": 1.61,
+      "grad_norm": 0.8757138848304749,
+      "learning_rate": 0.00013297331126143667,
+      "loss": 0.8399,
+      "step": 325
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 1.1467972993850708,
+      "learning_rate": 0.00013259083892925633,
+      "loss": 1.1416,
+      "step": 326
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 0.9916189312934875,
+      "learning_rate": 0.00013220783243721572,
+      "loss": 0.9531,
+      "step": 327
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 0.9911974668502808,
+      "learning_rate": 0.0001318242980627444,
+      "loss": 0.9476,
+      "step": 328
+    },
+    {
+      "epoch": 1.63,
+      "grad_norm": 1.0219913721084595,
+      "learning_rate": 0.0001314402420919238,
+      "loss": 0.9288,
+      "step": 329
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 1.0889464616775513,
+      "learning_rate": 0.00013105567081938424,
+      "loss": 0.8025,
+      "step": 330
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 0.8797928690910339,
+      "learning_rate": 0.00013067059054820183,
+      "loss": 0.9002,
+      "step": 331
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.0043346881866455,
+      "learning_rate": 0.00013028500758979506,
+      "loss": 0.8971,
+      "step": 332
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 0.9221352934837341,
+      "learning_rate": 0.00012989892826382145,
+      "loss": 0.8181,
+      "step": 333
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.2053778171539307,
+      "learning_rate": 0.00012951235889807386,
+      "loss": 0.9374,
+      "step": 334
+    },
+    {
+      "epoch": 1.66,
+      "grad_norm": 1.2230528593063354,
+      "learning_rate": 0.00012912530582837682,
+      "loss": 0.9123,
+      "step": 335
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 0.8403642773628235,
+      "learning_rate": 0.00012873777539848283,
+      "loss": 0.9323,
+      "step": 336
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 1.1632657051086426,
+      "learning_rate": 0.00012834977395996818,
+      "loss": 1.1916,
+      "step": 337
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 0.9937611222267151,
+      "learning_rate": 0.0001279613078721289,
+      "loss": 1.141,
+      "step": 338
+    },
+    {
+      "epoch": 1.68,
+      "grad_norm": 0.8973978161811829,
+      "learning_rate": 0.0001275723835018767,
+      "loss": 0.8399,
+      "step": 339
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 1.0466402769088745,
+      "learning_rate": 0.0001271830072236343,
+      "loss": 0.8127,
+      "step": 340
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 0.9691051244735718,
+      "learning_rate": 0.0001267931854192313,
+      "loss": 0.9794,
+      "step": 341
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.925682544708252,
+      "learning_rate": 0.0001264029244777993,
+      "loss": 0.8233,
+      "step": 342
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.9783278703689575,
+      "learning_rate": 0.00012601223079566743,
+      "loss": 0.9542,
+      "step": 343
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 0.9945007562637329,
+      "learning_rate": 0.00012562111077625722,
+      "loss": 1.0757,
+      "step": 344
+    },
+    {
+      "epoch": 1.71,
+      "grad_norm": 1.1597148180007935,
+      "learning_rate": 0.000125229570829978,
+      "loss": 1.1052,
+      "step": 345
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.7987023591995239,
+      "learning_rate": 0.0001248376173741215,
+      "loss": 0.8602,
+      "step": 346
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.8969370126724243,
+      "learning_rate": 0.00012444525683275688,
+      "loss": 1.6019,
+      "step": 347
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 1.0622583627700806,
+      "learning_rate": 0.00012405249563662537,
+      "loss": 1.0735,
+      "step": 348
+    },
+    {
+      "epoch": 1.73,
+      "grad_norm": 1.0987950563430786,
+      "learning_rate": 0.00012365934022303491,
+      "loss": 0.9973,
+      "step": 349
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.9930221438407898,
+      "learning_rate": 0.00012326579703575462,
+      "loss": 1.1257,
+      "step": 350
+    },
+    {
+      "epoch": 1.74,
+      "eval_loss": 1.1461950540542603,
+      "eval_runtime": 2.9343,
+      "eval_samples_per_second": 34.08,
+      "eval_steps_per_second": 17.04,
+      "step": 350
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 1.0799540281295776,
+      "learning_rate": 0.00012287187252490913,
+      "loss": 0.8758,
+      "step": 351
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.0633143186569214,
+      "learning_rate": 0.00012247757314687297,
+      "loss": 1.0396,
+      "step": 352
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 0.9504884481430054,
+      "learning_rate": 0.00012208290536416463,
+      "loss": 0.8192,
+      "step": 353
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.8587303161621094,
+      "learning_rate": 0.00012168787564534078,
+      "loss": 0.748,
+      "step": 354
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 1.3652898073196411,
+      "learning_rate": 0.0001212924904648902,
+      "loss": 1.0768,
+      "step": 355
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.0679266452789307,
+      "learning_rate": 0.00012089675630312754,
+      "loss": 0.9099,
+      "step": 356
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 1.2426522970199585,
+      "learning_rate": 0.00012050067964608724,
+      "loss": 0.9869,
+      "step": 357
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 0.9639490246772766,
+      "learning_rate": 0.00012010426698541728,
+      "loss": 0.6993,
+      "step": 358
+    },
+    {
+      "epoch": 1.78,
+      "grad_norm": 1.1884175539016724,
+      "learning_rate": 0.0001197075248182726,
+      "loss": 0.9868,
+      "step": 359
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 0.9860052466392517,
+      "learning_rate": 0.00011931045964720881,
+      "loss": 0.7148,
+      "step": 360
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 0.8812693357467651,
+      "learning_rate": 0.00011891307798007536,
+      "loss": 0.9295,
+      "step": 361
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.032242774963379,
+      "learning_rate": 0.00011851538632990921,
+      "loss": 1.2292,
+      "step": 362
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.9777809381484985,
+      "learning_rate": 0.00011811739121482777,
+      "loss": 1.0646,
+      "step": 363
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 1.0464228391647339,
+      "learning_rate": 0.0001177190991579223,
+      "loss": 0.9703,
+      "step": 364
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 0.9763212203979492,
+      "learning_rate": 0.00011732051668715081,
+      "loss": 0.7753,
+      "step": 365
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 1.114912748336792,
+      "learning_rate": 0.00011692165033523117,
+      "loss": 0.9979,
+      "step": 366
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 0.8752657771110535,
+      "learning_rate": 0.00011652250663953415,
+      "loss": 0.9964,
+      "step": 367
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 0.9158682823181152,
+      "learning_rate": 0.00011612309214197599,
+      "loss": 0.7576,
+      "step": 368
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 0.8457457423210144,
+      "learning_rate": 0.00011572341338891144,
+      "loss": 0.9144,
+      "step": 369
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 1.0021049976348877,
+      "learning_rate": 0.00011532347693102632,
+      "loss": 0.9226,
+      "step": 370
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 0.9614117741584778,
+      "learning_rate": 0.00011492328932323022,
+      "loss": 1.0214,
+      "step": 371
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 0.9289172291755676,
+      "learning_rate": 0.00011452285712454904,
+      "loss": 0.8793,
+      "step": 372
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.0654929876327515,
+      "learning_rate": 0.00011412218689801748,
+      "loss": 1.1519,
+      "step": 373
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 1.0563515424728394,
+      "learning_rate": 0.00011372128521057155,
+      "loss": 0.9859,
+      "step": 374
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 1.011228322982788,
+      "learning_rate": 0.00011332015863294076,
+      "loss": 0.9138,
+      "step": 375
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 0.942287802696228,
+      "learning_rate": 0.00011291881373954065,
+      "loss": 0.8865,
+      "step": 376
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 0.9734610319137573,
+      "learning_rate": 0.00011251725710836489,
+      "loss": 0.8578,
+      "step": 377
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.184990406036377,
+      "learning_rate": 0.00011211549532087749,
+      "loss": 1.0107,
+      "step": 378
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 1.033831238746643,
+      "learning_rate": 0.00011171353496190498,
+      "loss": 1.0496,
+      "step": 379
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 1.018054485321045,
+      "learning_rate": 0.00011131138261952845,
+      "loss": 0.8782,
+      "step": 380
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 0.9694205522537231,
+      "learning_rate": 0.00011090904488497549,
+      "loss": 0.9928,
+      "step": 381
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.9095280170440674,
+      "learning_rate": 0.0001105065283525124,
+      "loss": 0.9821,
+      "step": 382
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.8029172420501709,
+      "learning_rate": 0.00011010383961933581,
+      "loss": 0.6811,
+      "step": 383
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 0.9388089776039124,
+      "learning_rate": 0.00010970098528546481,
+      "loss": 0.9703,
+      "step": 384
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 0.8639506697654724,
+      "learning_rate": 0.00010929797195363259,
+      "loss": 0.8579,
+      "step": 385
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 1.001845121383667,
+      "learning_rate": 0.0001088948062291783,
+      "loss": 1.038,
+      "step": 386
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.9668776392936707,
+      "learning_rate": 0.00010849149471993882,
+      "loss": 0.9457,
+      "step": 387
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 0.8607358932495117,
+      "learning_rate": 0.00010808804403614043,
+      "loss": 0.8795,
+      "step": 388
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 1.0189685821533203,
+      "learning_rate": 0.00010768446079029044,
+      "loss": 0.9203,
+      "step": 389
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 0.9952776432037354,
+      "learning_rate": 0.0001072807515970688,
+      "loss": 1.0368,
+      "step": 390
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 1.057427167892456,
+      "learning_rate": 0.00010687692307321984,
+      "loss": 1.0568,
+      "step": 391
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.822589099407196,
+      "learning_rate": 0.00010647298183744359,
+      "loss": 0.9598,
+      "step": 392
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.9903733730316162,
+      "learning_rate": 0.00010606893451028743,
+      "loss": 1.0595,
+      "step": 393
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 1.0125857591629028,
+      "learning_rate": 0.00010566478771403763,
+      "loss": 0.9646,
+      "step": 394
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.899347722530365,
+      "learning_rate": 0.00010526054807261067,
+      "loss": 1.0054,
+      "step": 395
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 1.0629827976226807,
+      "learning_rate": 0.00010485622221144484,
+      "loss": 0.9319,
+      "step": 396
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 0.9910023212432861,
+      "learning_rate": 0.00010445181675739144,
+      "loss": 0.9388,
+      "step": 397
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 0.8644474744796753,
+      "learning_rate": 0.00010404733833860639,
+      "loss": 0.8007,
+      "step": 398
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 796,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 199,
+  "total_flos": 1.531569014464512e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-398/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d84e26ce27315b618f94e914bb6b67f0bb5aa37c3903b14adcd26c9fca9f3f82
+size 5624

checkpoint-431/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: openlm-research/open_llama_3b_v2
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.9.0

checkpoint-431/adapter_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "openlm-research/open_llama_3b_v2",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj",
+    "down_proj",
+    "gate_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-431/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:33c327d1f8c2e6bfc45c4c172b410eddab59af41c4d7f45ed14020b5bd6d9c0f
+size 50899792

checkpoint-431/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:76dbbb915b03d456a516c3d532d7833adc4c05e4b9f29e9a2d1c46ff2a6222b7
+size 101919290

checkpoint-431/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e766413e4f3b39c1d0ac620807b3bd3fd4dac79e0a0eed6a4a60c5746642e0a6
+size 14244

checkpoint-431/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:23f73a16ac262980457d80b6b9e4834ebbf9e3ee06ac2280222318fcdf9e15a8
+size 1064

checkpoint-431/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3070 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 108,
+  "global_step": 431,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.8690947890281677,
+      "learning_rate": 1e-05,
+      "loss": 1.2567,
+      "step": 1
+    },
+    {
+      "epoch": 0.0,
+      "eval_loss": 1.3469510078430176,
+      "eval_runtime": 4.9827,
+      "eval_samples_per_second": 20.069,
+      "eval_steps_per_second": 20.069,
+      "step": 1
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.781764566898346,
+      "learning_rate": 2e-05,
+      "loss": 1.3328,
+      "step": 2
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.0551249980926514,
+      "learning_rate": 3e-05,
+      "loss": 1.6572,
+      "step": 3
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.39896821975708,
+      "learning_rate": 4e-05,
+      "loss": 1.5495,
+      "step": 4
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.6837481260299683,
+      "learning_rate": 5e-05,
+      "loss": 1.4339,
+      "step": 5
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.1228013038635254,
+      "learning_rate": 6e-05,
+      "loss": 1.2666,
+      "step": 6
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.9632679224014282,
+      "learning_rate": 7e-05,
+      "loss": 1.4717,
+      "step": 7
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.8218845725059509,
+      "learning_rate": 8e-05,
+      "loss": 1.4052,
+      "step": 8
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.8262380361557007,
+      "learning_rate": 9e-05,
+      "loss": 1.2844,
+      "step": 9
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.7251008749008179,
+      "learning_rate": 0.0001,
+      "loss": 1.1436,
+      "step": 10
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.8107339143753052,
+      "learning_rate": 0.00011000000000000002,
+      "loss": 1.4666,
+      "step": 11
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.8182777166366577,
+      "learning_rate": 0.00012,
+      "loss": 0.9755,
+      "step": 12
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 1.3238070011138916,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 1.5148,
+      "step": 13
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 1.0324606895446777,
+      "learning_rate": 0.00014,
+      "loss": 1.1514,
+      "step": 14
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 2.189394950866699,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.2768,
+      "step": 15
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.4134596586227417,
+      "learning_rate": 0.00016,
+      "loss": 1.3283,
+      "step": 16
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.0440051555633545,
+      "learning_rate": 0.00017,
+      "loss": 1.297,
+      "step": 17
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.0499480962753296,
+      "learning_rate": 0.00018,
+      "loss": 1.321,
+      "step": 18
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.8342623710632324,
+      "learning_rate": 0.00019,
+      "loss": 1.2034,
+      "step": 19
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.6799418926239014,
+      "learning_rate": 0.0002,
+      "loss": 1.1958,
+      "step": 20
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.3825242519378662,
+      "learning_rate": 0.00019999707864731247,
+      "loss": 1.2299,
+      "step": 21
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.3574899435043335,
+      "learning_rate": 0.00019998831475993593,
+      "loss": 1.3222,
+      "step": 22
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.4586331844329834,
+      "learning_rate": 0.00019997370884991842,
+      "loss": 1.5933,
+      "step": 23
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 1.102039098739624,
+      "learning_rate": 0.0001999532617706403,
+      "loss": 1.3109,
+      "step": 24
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.9768686890602112,
+      "learning_rate": 0.00019992697471676413,
+      "loss": 1.3154,
+      "step": 25
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 1.053282380104065,
+      "learning_rate": 0.00019989484922416502,
+      "loss": 1.2525,
+      "step": 26
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.9527302980422974,
+      "learning_rate": 0.0001998568871698409,
+      "loss": 1.1981,
+      "step": 27
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.9766901135444641,
+      "learning_rate": 0.00019981309077180272,
+      "loss": 1.0464,
+      "step": 28
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.9839689135551453,
+      "learning_rate": 0.00019976346258894503,
+      "loss": 1.3556,
+      "step": 29
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 1.3148291110992432,
+      "learning_rate": 0.00019970800552089623,
+      "loss": 1.2592,
+      "step": 30
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.9205895066261292,
+      "learning_rate": 0.00019964672280784954,
+      "loss": 1.1584,
+      "step": 31
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 1.2833977937698364,
+      "learning_rate": 0.00019957961803037326,
+      "loss": 1.1818,
+      "step": 32
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.0097846984863281,
+      "learning_rate": 0.00019950669510920184,
+      "loss": 1.006,
+      "step": 33
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.9942007660865784,
+      "learning_rate": 0.0001994279583050067,
+      "loss": 1.1939,
+      "step": 34
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.3307287693023682,
+      "learning_rate": 0.00019934341221814739,
+      "loss": 1.5808,
+      "step": 35
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.069790005683899,
+      "learning_rate": 0.0001992530617884026,
+      "loss": 0.9687,
+      "step": 36
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 1.0054887533187866,
+      "learning_rate": 0.00019915691229468178,
+      "loss": 1.1882,
+      "step": 37
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 1.6451635360717773,
+      "learning_rate": 0.00019905496935471658,
+      "loss": 1.7297,
+      "step": 38
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.9728072881698608,
+      "learning_rate": 0.0001989472389247326,
+      "loss": 1.5091,
+      "step": 39
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 1.2531479597091675,
+      "learning_rate": 0.00019883372729910152,
+      "loss": 1.1329,
+      "step": 40
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.9028749465942383,
+      "learning_rate": 0.0001987144411099731,
+      "loss": 1.2428,
+      "step": 41
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.9710382223129272,
+      "learning_rate": 0.000198589387326888,
+      "loss": 1.2192,
+      "step": 42
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.8959193825721741,
+      "learning_rate": 0.00019845857325637031,
+      "loss": 1.5278,
+      "step": 43
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.9432021379470825,
+      "learning_rate": 0.00019832200654150076,
+      "loss": 1.1128,
+      "step": 44
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.8419432640075684,
+      "learning_rate": 0.0001981796951614701,
+      "loss": 1.0904,
+      "step": 45
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.1038836240768433,
+      "learning_rate": 0.00019803164743111302,
+      "loss": 1.2347,
+      "step": 46
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.2094361782073975,
+      "learning_rate": 0.00019787787200042223,
+      "loss": 1.3438,
+      "step": 47
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.8724287748336792,
+      "learning_rate": 0.00019771837785404305,
+      "loss": 0.9631,
+      "step": 48
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.0441787242889404,
+      "learning_rate": 0.00019755317431074859,
+      "loss": 1.3894,
+      "step": 49
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.0916489362716675,
+      "learning_rate": 0.0001973822710228951,
+      "loss": 1.019,
+      "step": 50
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.1637184619903564,
+      "learning_rate": 0.00019720567797585817,
+      "loss": 1.3044,
+      "step": 51
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.023861289024353,
+      "learning_rate": 0.0001970234054874493,
+      "loss": 1.1711,
+      "step": 52
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.2109915018081665,
+      "learning_rate": 0.0001968354642073129,
+      "loss": 1.2524,
+      "step": 53
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.138853907585144,
+      "learning_rate": 0.00019664186511630433,
+      "loss": 0.8772,
+      "step": 54
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.777930498123169,
+      "learning_rate": 0.000196442619525848,
+      "loss": 1.2304,
+      "step": 55
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.1599185466766357,
+      "learning_rate": 0.00019623773907727682,
+      "loss": 1.4214,
+      "step": 56
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.1862808465957642,
+      "learning_rate": 0.0001960272357411517,
+      "loss": 1.3264,
+      "step": 57
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.8586333394050598,
+      "learning_rate": 0.0001958111218165624,
+      "loss": 1.1423,
+      "step": 58
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.9254307746887207,
+      "learning_rate": 0.00019558940993040885,
+      "loss": 1.0662,
+      "step": 59
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.8446744084358215,
+      "learning_rate": 0.00019536211303666323,
+      "loss": 1.3621,
+      "step": 60
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.9066444635391235,
+      "learning_rate": 0.00019512924441561348,
+      "loss": 0.5954,
+      "step": 61
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.8354930281639099,
+      "learning_rate": 0.00019489081767308698,
+      "loss": 1.2582,
+      "step": 62
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 1.3045473098754883,
+      "learning_rate": 0.00019464684673965583,
+      "loss": 1.3778,
+      "step": 63
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.7895586490631104,
+      "learning_rate": 0.0001943973458698229,
+      "loss": 0.9473,
+      "step": 64
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.9904553294181824,
+      "learning_rate": 0.00019414232964118892,
+      "loss": 1.114,
+      "step": 65
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 1.0157588720321655,
+      "learning_rate": 0.00019388181295360078,
+      "loss": 1.0409,
+      "step": 66
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.1616543531417847,
+      "learning_rate": 0.00019361581102828095,
+      "loss": 1.0549,
+      "step": 67
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.8383068442344666,
+      "learning_rate": 0.0001933443394069383,
+      "loss": 1.1849,
+      "step": 68
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.9804612994194031,
+      "learning_rate": 0.00019306741395085976,
+      "loss": 1.1631,
+      "step": 69
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.7936605215072632,
+      "learning_rate": 0.0001927850508399839,
+      "loss": 1.1084,
+      "step": 70
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.963039755821228,
+      "learning_rate": 0.00019249726657195532,
+      "loss": 1.3065,
+      "step": 71
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.9112350940704346,
+      "learning_rate": 0.00019220407796116098,
+      "loss": 1.3096,
+      "step": 72
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.536594033241272,
+      "learning_rate": 0.00019190550213774756,
+      "loss": 1.1588,
+      "step": 73
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.8441547751426697,
+      "learning_rate": 0.00019160155654662076,
+      "loss": 0.7042,
+      "step": 74
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.0272456407546997,
+      "learning_rate": 0.00019129225894642593,
+      "loss": 1.0812,
+      "step": 75
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.8152709603309631,
+      "learning_rate": 0.00019097762740851061,
+      "loss": 1.0435,
+      "step": 76
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.0662580728530884,
+      "learning_rate": 0.0001906576803158686,
+      "loss": 1.1471,
+      "step": 77
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.001435399055481,
+      "learning_rate": 0.0001903324363620659,
+      "loss": 1.0687,
+      "step": 78
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5956882834434509,
+      "learning_rate": 0.0001900019145501484,
+      "loss": 0.6305,
+      "step": 79
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.8889302015304565,
+      "learning_rate": 0.0001896661341915318,
+      "loss": 1.4107,
+      "step": 80
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.8901386260986328,
+      "learning_rate": 0.0001893251149048732,
+      "loss": 1.0325,
+      "step": 81
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.9966225624084473,
+      "learning_rate": 0.00018897887661492474,
+      "loss": 1.121,
+      "step": 82
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.066897988319397,
+      "learning_rate": 0.00018862743955136966,
+      "loss": 1.4332,
+      "step": 83
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.7830447554588318,
+      "learning_rate": 0.0001882708242476401,
+      "loss": 1.0578,
+      "step": 84
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.8779483437538147,
+      "learning_rate": 0.00018790905153971758,
+      "loss": 0.8687,
+      "step": 85
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.8768894076347351,
+      "learning_rate": 0.00018754214256491562,
+      "loss": 1.0531,
+      "step": 86
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.729383647441864,
+      "learning_rate": 0.00018717011876064453,
+      "loss": 1.0437,
+      "step": 87
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.8910384178161621,
+      "learning_rate": 0.0001867930018631592,
+      "loss": 1.0108,
+      "step": 88
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.8145522475242615,
+      "learning_rate": 0.00018641081390628877,
+      "loss": 1.1992,
+      "step": 89
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.8675424456596375,
+      "learning_rate": 0.00018602357722014964,
+      "loss": 1.2933,
+      "step": 90
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.7332017421722412,
+      "learning_rate": 0.00018563131442984044,
+      "loss": 1.0419,
+      "step": 91
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.0502578020095825,
+      "learning_rate": 0.00018523404845412027,
+      "loss": 1.1473,
+      "step": 92
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.497140645980835,
+      "learning_rate": 0.0001848318025040697,
+      "loss": 1.3815,
+      "step": 93
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.7718933820724487,
+      "learning_rate": 0.00018442460008173445,
+      "loss": 1.0141,
+      "step": 94
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.8856745958328247,
+      "learning_rate": 0.0001840124649787524,
+      "loss": 1.0661,
+      "step": 95
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.0631150007247925,
+      "learning_rate": 0.0001835954212749632,
+      "loss": 1.1029,
+      "step": 96
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.7661423683166504,
+      "learning_rate": 0.0001831734933370019,
+      "loss": 0.8733,
+      "step": 97
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.8283933997154236,
+      "learning_rate": 0.0001827467058168748,
+      "loss": 0.885,
+      "step": 98
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.1238280534744263,
+      "learning_rate": 0.00018231508365051922,
+      "loss": 1.315,
+      "step": 99
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.8286343812942505,
+      "learning_rate": 0.0001818786520563467,
+      "loss": 1.3218,
+      "step": 100
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.028365969657898,
+      "learning_rate": 0.00018143743653376942,
+      "loss": 1.0926,
+      "step": 101
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.0841856002807617,
+      "learning_rate": 0.0001809914628617105,
+      "loss": 1.5602,
+      "step": 102
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.7237169146537781,
+      "learning_rate": 0.00018054075709709756,
+      "loss": 1.1558,
+      "step": 103
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.2722384929656982,
+      "learning_rate": 0.00018008534557334064,
+      "loss": 1.289,
+      "step": 104
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.8757687211036682,
+      "learning_rate": 0.00017962525489879325,
+      "loss": 1.2181,
+      "step": 105
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.9850521683692932,
+      "learning_rate": 0.00017916051195519797,
+      "loss": 0.9446,
+      "step": 106
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.8803001642227173,
+      "learning_rate": 0.00017869114389611575,
+      "loss": 0.9984,
+      "step": 107
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.8752118945121765,
+      "learning_rate": 0.0001782171781453394,
+      "loss": 1.1776,
+      "step": 108
+    },
+    {
+      "epoch": 0.25,
+      "eval_loss": 1.1369682550430298,
+      "eval_runtime": 5.0569,
+      "eval_samples_per_second": 19.775,
+      "eval_steps_per_second": 19.775,
+      "step": 108
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.8919423222541809,
+      "learning_rate": 0.00017773864239529132,
+      "loss": 1.0642,
+      "step": 109
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.8584320545196533,
+      "learning_rate": 0.0001772555646054055,
+      "loss": 1.0944,
+      "step": 110
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.8071363568305969,
+      "learning_rate": 0.00017676797300049393,
+      "loss": 1.231,
+      "step": 111
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.2383997440338135,
+      "learning_rate": 0.00017627589606909755,
+      "loss": 1.2194,
+      "step": 112
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.557596743106842,
+      "learning_rate": 0.00017577936256182167,
+      "loss": 0.4546,
+      "step": 113
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.9943146109580994,
+      "learning_rate": 0.0001752784014896562,
+      "loss": 1.3126,
+      "step": 114
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.7767719626426697,
+      "learning_rate": 0.00017477304212228057,
+      "loss": 1.0385,
+      "step": 115
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.849905788898468,
+      "learning_rate": 0.0001742633139863538,
+      "loss": 1.1383,
+      "step": 116
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.038552165031433,
+      "learning_rate": 0.00017374924686378905,
+      "loss": 1.2284,
+      "step": 117
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.1603586673736572,
+      "learning_rate": 0.0001732308707900137,
+      "loss": 1.1579,
+      "step": 118
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.8285987973213196,
+      "learning_rate": 0.0001727082160522145,
+      "loss": 1.1173,
+      "step": 119
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.8592681288719177,
+      "learning_rate": 0.0001721813131875679,
+      "loss": 1.0445,
+      "step": 120
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.9642465710639954,
+      "learning_rate": 0.00017165019298145585,
+      "loss": 0.7456,
+      "step": 121
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.8067343235015869,
+      "learning_rate": 0.00017111488646566727,
+      "loss": 1.1104,
+      "step": 122
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.9156467914581299,
+      "learning_rate": 0.00017057542491658468,
+      "loss": 1.321,
+      "step": 123
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.9722065329551697,
+      "learning_rate": 0.000170031839853357,
+      "loss": 1.3756,
+      "step": 124
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.9338364005088806,
+      "learning_rate": 0.00016948416303605795,
+      "loss": 1.4055,
+      "step": 125
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.7224316000938416,
+      "learning_rate": 0.0001689324264638304,
+      "loss": 0.8181,
+      "step": 126
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.6058412194252014,
+      "learning_rate": 0.00016837666237301663,
+      "loss": 0.5958,
+      "step": 127
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.9860162138938904,
+      "learning_rate": 0.00016781690323527511,
+      "loss": 1.1325,
+      "step": 128
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.8655012845993042,
+      "learning_rate": 0.00016725318175568306,
+      "loss": 1.0151,
+      "step": 129
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.7968864440917969,
+      "learning_rate": 0.00016668553087082567,
+      "loss": 1.288,
+      "step": 130
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.8813712000846863,
+      "learning_rate": 0.0001661139837468717,
+      "loss": 1.3115,
+      "step": 131
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.7319151163101196,
+      "learning_rate": 0.00016553857377763566,
+      "loss": 1.0332,
+      "step": 132
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.1282501220703125,
+      "learning_rate": 0.0001649593345826268,
+      "loss": 1.2969,
+      "step": 133
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.799923837184906,
+      "learning_rate": 0.00016437630000508464,
+      "loss": 0.9233,
+      "step": 134
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.0394463539123535,
+      "learning_rate": 0.00016378950411000183,
+      "loss": 1.2451,
+      "step": 135
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.032335877418518,
+      "learning_rate": 0.00016319898118213365,
+      "loss": 1.0193,
+      "step": 136
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.0510966777801514,
+      "learning_rate": 0.00016260476572399496,
+      "loss": 1.4264,
+      "step": 137
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.8531592488288879,
+      "learning_rate": 0.00016200689245384424,
+      "loss": 1.0608,
+      "step": 138
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.2909444570541382,
+      "learning_rate": 0.00016140539630365522,
+      "loss": 1.116,
+      "step": 139
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.6848735213279724,
+      "learning_rate": 0.00016080031241707578,
+      "loss": 0.9243,
+      "step": 140
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.8676778674125671,
+      "learning_rate": 0.0001601916761473747,
+      "loss": 1.1039,
+      "step": 141
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.1020722389221191,
+      "learning_rate": 0.00015957952305537597,
+      "loss": 1.1554,
+      "step": 142
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.883198082447052,
+      "learning_rate": 0.00015896388890738127,
+      "loss": 1.2192,
+      "step": 143
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.8749895691871643,
+      "learning_rate": 0.00015834480967308003,
+      "loss": 1.0946,
+      "step": 144
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.0936459302902222,
+      "learning_rate": 0.00015772232152344795,
+      "loss": 1.2226,
+      "step": 145
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.003266453742981,
+      "learning_rate": 0.0001570964608286336,
+      "loss": 1.0759,
+      "step": 146
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.9020757675170898,
+      "learning_rate": 0.00015646726415583344,
+      "loss": 0.8462,
+      "step": 147
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.9358006119728088,
+      "learning_rate": 0.0001558347682671553,
+      "loss": 1.4268,
+      "step": 148
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.215381383895874,
+      "learning_rate": 0.00015519901011747044,
+      "loss": 1.1037,
+      "step": 149
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.1152362823486328,
+      "learning_rate": 0.00015456002685225448,
+      "loss": 1.3482,
+      "step": 150
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.1836152076721191,
+      "learning_rate": 0.00015391785580541698,
+      "loss": 1.3423,
+      "step": 151
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.8727456331253052,
+      "learning_rate": 0.0001532725344971202,
+      "loss": 1.6155,
+      "step": 152
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.0771983861923218,
+      "learning_rate": 0.0001526241006315869,
+      "loss": 1.4055,
+      "step": 153
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.7963141798973083,
+      "learning_rate": 0.00015197259209489747,
+      "loss": 1.407,
+      "step": 154
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.9456453323364258,
+      "learning_rate": 0.00015131804695277612,
+      "loss": 1.1586,
+      "step": 155
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.9810377359390259,
+      "learning_rate": 0.00015066050344836706,
+      "loss": 1.1881,
+      "step": 156
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.8879066109657288,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.2283,
+      "step": 157
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.6425898671150208,
+      "learning_rate": 0.0001493365751989454,
+      "loss": 0.8447,
+      "step": 158
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.7046767473220825,
+      "learning_rate": 0.0001486702678071598,
+      "loss": 0.8164,
+      "step": 159
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.7500093579292297,
+      "learning_rate": 0.00014800111675502094,
+      "loss": 0.9141,
+      "step": 160
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.8106879591941833,
+      "learning_rate": 0.00014732916113905335,
+      "loss": 1.1849,
+      "step": 161
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.1407498121261597,
+      "learning_rate": 0.0001466544402196439,
+      "loss": 1.0616,
+      "step": 162
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.8494256138801575,
+      "learning_rate": 0.00014597699341874806,
+      "loss": 0.7441,
+      "step": 163
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.8430727124214172,
+      "learning_rate": 0.00014529686031758643,
+      "loss": 0.8714,
+      "step": 164
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.0597652196884155,
+      "learning_rate": 0.00014461408065433227,
+      "loss": 1.372,
+      "step": 165
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.7789810299873352,
+      "learning_rate": 0.00014392869432178971,
+      "loss": 1.0039,
+      "step": 166
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.7892749905586243,
+      "learning_rate": 0.00014324074136506284,
+      "loss": 1.1196,
+      "step": 167
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.3845864534378052,
+      "learning_rate": 0.00014255026197921596,
+      "loss": 1.0634,
+      "step": 168
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.9081052541732788,
+      "learning_rate": 0.00014185729650692533,
+      "loss": 1.2649,
+      "step": 169
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.9540420174598694,
+      "learning_rate": 0.0001411618854361218,
+      "loss": 1.3414,
+      "step": 170
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.3803868293762207,
+      "learning_rate": 0.00014046406939762545,
+      "loss": 1.1241,
+      "step": 171
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.0107030868530273,
+      "learning_rate": 0.0001397638891627714,
+      "loss": 1.2018,
+      "step": 172
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.2234842777252197,
+      "learning_rate": 0.00013906138564102793,
+      "loss": 1.2857,
+      "step": 173
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.0201656818389893,
+      "learning_rate": 0.00013835659987760605,
+      "loss": 1.306,
+      "step": 174
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.9351068735122681,
+      "learning_rate": 0.0001376495730510614,
+      "loss": 1.3483,
+      "step": 175
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.046683430671692,
+      "learning_rate": 0.0001369403464708884,
+      "loss": 1.2294,
+      "step": 176
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.1101959943771362,
+      "learning_rate": 0.00013622896157510658,
+      "loss": 1.3065,
+      "step": 177
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.7936177849769592,
+      "learning_rate": 0.00013551545992783947,
+      "loss": 0.8732,
+      "step": 178
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.8901675939559937,
+      "learning_rate": 0.0001347998832168862,
+      "loss": 1.0109,
+      "step": 179
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.0607932806015015,
+      "learning_rate": 0.0001340822732512857,
+      "loss": 1.459,
+      "step": 180
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.9324591159820557,
+      "learning_rate": 0.00013336267195887398,
+      "loss": 1.1962,
+      "step": 181
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.8911722898483276,
+      "learning_rate": 0.00013264112138383445,
+      "loss": 0.9749,
+      "step": 182
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.7679628133773804,
+      "learning_rate": 0.00013191766368424133,
+      "loss": 1.1086,
+      "step": 183
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.0897005796432495,
+      "learning_rate": 0.00013119234112959655,
+      "loss": 0.9565,
+      "step": 184
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.969784677028656,
+      "learning_rate": 0.00013046519609836,
+      "loss": 1.3534,
+      "step": 185
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.7279093265533447,
+      "learning_rate": 0.00012973627107547346,
+      "loss": 1.261,
+      "step": 186
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.9844076037406921,
+      "learning_rate": 0.0001290056086498785,
+      "loss": 1.2582,
+      "step": 187
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.8580243587493896,
+      "learning_rate": 0.00012827325151202782,
+      "loss": 1.2099,
+      "step": 188
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.9180042147636414,
+      "learning_rate": 0.00012753924245139135,
+      "loss": 1.1773,
+      "step": 189
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.8368516564369202,
+      "learning_rate": 0.00012680362435395595,
+      "loss": 1.1123,
+      "step": 190
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.815647006034851,
+      "learning_rate": 0.00012606644019971968,
+      "loss": 1.4643,
+      "step": 191
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.8074012398719788,
+      "learning_rate": 0.00012532773306018076,
+      "loss": 1.1728,
+      "step": 192
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.0798611640930176,
+      "learning_rate": 0.00012458754609582097,
+      "loss": 1.003,
+      "step": 193
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.7999426126480103,
+      "learning_rate": 0.00012384592255358385,
+      "loss": 1.0413,
+      "step": 194
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.9997307062149048,
+      "learning_rate": 0.00012310290576434795,
+      "loss": 1.4449,
+      "step": 195
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.7481138706207275,
+      "learning_rate": 0.00012235853914039515,
+      "loss": 0.9215,
+      "step": 196
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.967931866645813,
+      "learning_rate": 0.00012161286617287419,
+      "loss": 0.944,
+      "step": 197
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.8793982863426208,
+      "learning_rate": 0.00012086593042925964,
+      "loss": 1.2297,
+      "step": 198
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.6650402545928955,
+      "learning_rate": 0.00012011777555080638,
+      "loss": 0.6958,
+      "step": 199
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.2048183679580688,
+      "learning_rate": 0.00011936844524999966,
+      "loss": 1.3445,
+      "step": 200
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.845542848110199,
+      "learning_rate": 0.00011861798330800125,
+      "loss": 1.0368,
+      "step": 201
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.0240477323532104,
+      "learning_rate": 0.00011786643357209136,
+      "loss": 0.9428,
+      "step": 202
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.2319940328598022,
+      "learning_rate": 0.00011711383995310681,
+      "loss": 1.117,
+      "step": 203
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.8866223692893982,
+      "learning_rate": 0.00011636024642287546,
+      "loss": 1.1306,
+      "step": 204
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.8940457701683044,
+      "learning_rate": 0.00011560569701164697,
+      "loss": 1.3186,
+      "step": 205
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.126744508743286,
+      "learning_rate": 0.00011485023580552039,
+      "loss": 1.4878,
+      "step": 206
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.8328797817230225,
+      "learning_rate": 0.00011409390694386817,
+      "loss": 1.179,
+      "step": 207
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.8258645534515381,
+      "learning_rate": 0.00011333675461675739,
+      "loss": 1.0724,
+      "step": 208
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.9328184723854065,
+      "learning_rate": 0.00011257882306236775,
+      "loss": 1.2991,
+      "step": 209
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.7689169645309448,
+      "learning_rate": 0.00011182015656440692,
+      "loss": 1.0181,
+      "step": 210
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.9105891585350037,
+      "learning_rate": 0.00011106079944952317,
+      "loss": 1.2888,
+      "step": 211
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.0148977041244507,
+      "learning_rate": 0.00011030079608471544,
+      "loss": 0.897,
+      "step": 212
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.0647752285003662,
+      "learning_rate": 0.00010954019087474124,
+      "loss": 1.4844,
+      "step": 213
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.8448371887207031,
+      "learning_rate": 0.00010877902825952197,
+      "loss": 1.0933,
+      "step": 214
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.9057207107543945,
+      "learning_rate": 0.00010801735271154669,
+      "loss": 1.1517,
+      "step": 215
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.9787666201591492,
+      "learning_rate": 0.00010725520873327361,
+      "loss": 1.1082,
+      "step": 216
+    },
+    {
+      "epoch": 0.5,
+      "eval_loss": 1.1230801343917847,
+      "eval_runtime": 5.7682,
+      "eval_samples_per_second": 17.336,
+      "eval_steps_per_second": 17.336,
+      "step": 216
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.8504732251167297,
+      "learning_rate": 0.00010649264085452988,
+      "loss": 0.9788,
+      "step": 217
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.1127643585205078,
+      "learning_rate": 0.00010572969362990998,
+      "loss": 1.2786,
+      "step": 218
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.9895543456077576,
+      "learning_rate": 0.0001049664116361724,
+      "loss": 1.1889,
+      "step": 219
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.016451358795166,
+      "learning_rate": 0.0001042028394696352,
+      "loss": 1.0312,
+      "step": 220
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.0872440338134766,
+      "learning_rate": 0.00010343902174357039,
+      "loss": 1.1398,
+      "step": 221
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.7803768515586853,
+      "learning_rate": 0.00010267500308559732,
+      "loss": 1.1324,
+      "step": 222
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.0301629304885864,
+      "learning_rate": 0.0001019108281350752,
+      "loss": 1.0781,
+      "step": 223
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.812462568283081,
+      "learning_rate": 0.0001011465415404949,
+      "loss": 1.2578,
+      "step": 224
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.8909915089607239,
+      "learning_rate": 0.0001003821879568704,
+      "loss": 0.9458,
+      "step": 225
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.9123939275741577,
+      "learning_rate": 9.96178120431296e-05,
+      "loss": 1.1767,
+      "step": 226
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.8767213821411133,
+      "learning_rate": 9.88534584595051e-05,
+      "loss": 1.0316,
+      "step": 227
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.7890278100967407,
+      "learning_rate": 9.80891718649248e-05,
+      "loss": 0.9678,
+      "step": 228
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.8505096435546875,
+      "learning_rate": 9.732499691440266e-05,
+      "loss": 1.3792,
+      "step": 229
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.7998389601707458,
+      "learning_rate": 9.656097825642961e-05,
+      "loss": 1.2242,
+      "step": 230
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.7925727367401123,
+      "learning_rate": 9.579716053036479e-05,
+      "loss": 0.9716,
+      "step": 231
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.8605330586433411,
+      "learning_rate": 9.503358836382761e-05,
+      "loss": 1.2368,
+      "step": 232
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.971171498298645,
+      "learning_rate": 9.427030637009003e-05,
+      "loss": 1.1605,
+      "step": 233
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.1337116956710815,
+      "learning_rate": 9.35073591454701e-05,
+      "loss": 0.8344,
+      "step": 234
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.8743841648101807,
+      "learning_rate": 9.274479126672641e-05,
+      "loss": 1.109,
+      "step": 235
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.9219100475311279,
+      "learning_rate": 9.198264728845332e-05,
+      "loss": 1.3502,
+      "step": 236
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.800612211227417,
+      "learning_rate": 9.122097174047805e-05,
+      "loss": 1.1531,
+      "step": 237
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.7432851791381836,
+      "learning_rate": 9.045980912525879e-05,
+      "loss": 0.9259,
+      "step": 238
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.7886478304862976,
+      "learning_rate": 8.969920391528458e-05,
+      "loss": 0.8643,
+      "step": 239
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.7922253012657166,
+      "learning_rate": 8.893920055047686e-05,
+      "loss": 1.0508,
+      "step": 240
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.112563967704773,
+      "learning_rate": 8.81798434355931e-05,
+      "loss": 1.1824,
+      "step": 241
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.7471961379051208,
+      "learning_rate": 8.742117693763227e-05,
+      "loss": 1.0301,
+      "step": 242
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.8147581815719604,
+      "learning_rate": 8.666324538324264e-05,
+      "loss": 1.0864,
+      "step": 243
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.8042265772819519,
+      "learning_rate": 8.590609305613184e-05,
+      "loss": 1.2239,
+      "step": 244
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.7160305976867676,
+      "learning_rate": 8.514976419447964e-05,
+      "loss": 1.1087,
+      "step": 245
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.7818716168403625,
+      "learning_rate": 8.439430298835304e-05,
+      "loss": 1.2434,
+      "step": 246
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.8190276622772217,
+      "learning_rate": 8.363975357712457e-05,
+      "loss": 1.2271,
+      "step": 247
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.8305711150169373,
+      "learning_rate": 8.28861600468932e-05,
+      "loss": 0.8105,
+      "step": 248
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.8715688586235046,
+      "learning_rate": 8.213356642790867e-05,
+      "loss": 0.8814,
+      "step": 249
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.8992697596549988,
+      "learning_rate": 8.138201669199879e-05,
+      "loss": 1.1118,
+      "step": 250
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.9572819471359253,
+      "learning_rate": 8.063155475000037e-05,
+      "loss": 1.0346,
+      "step": 251
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.7713218927383423,
+      "learning_rate": 7.988222444919364e-05,
+      "loss": 0.9758,
+      "step": 252
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.0017491579055786,
+      "learning_rate": 7.913406957074037e-05,
+      "loss": 1.2187,
+      "step": 253
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.6401370763778687,
+      "learning_rate": 7.838713382712583e-05,
+      "loss": 0.6044,
+      "step": 254
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.7729225158691406,
+      "learning_rate": 7.76414608596049e-05,
+      "loss": 1.0083,
+      "step": 255
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.7392654418945312,
+      "learning_rate": 7.68970942356521e-05,
+      "loss": 0.6903,
+      "step": 256
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.2603764533996582,
+      "learning_rate": 7.615407744641619e-05,
+      "loss": 1.0388,
+      "step": 257
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.094689130783081,
+      "learning_rate": 7.541245390417906e-05,
+      "loss": 1.1617,
+      "step": 258
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.7427874803543091,
+      "learning_rate": 7.467226693981925e-05,
+      "loss": 0.977,
+      "step": 259
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.2259175777435303,
+      "learning_rate": 7.393355980028039e-05,
+      "loss": 1.3001,
+      "step": 260
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.7839226722717285,
+      "learning_rate": 7.319637564604412e-05,
+      "loss": 1.3195,
+      "step": 261
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.9151708483695984,
+      "learning_rate": 7.246075754860868e-05,
+      "loss": 0.8617,
+      "step": 262
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.9389089345932007,
+      "learning_rate": 7.172674848797219e-05,
+      "loss": 1.1299,
+      "step": 263
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.9104580879211426,
+      "learning_rate": 7.099439135012153e-05,
+      "loss": 1.4089,
+      "step": 264
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.0850249528884888,
+      "learning_rate": 7.026372892452653e-05,
+      "loss": 1.1907,
+      "step": 265
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.6725065112113953,
+      "learning_rate": 6.953480390164e-05,
+      "loss": 0.7875,
+      "step": 266
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.6374472379684448,
+      "learning_rate": 6.880765887040343e-05,
+      "loss": 1.4271,
+      "step": 267
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.691931426525116,
+      "learning_rate": 6.808233631575867e-05,
+      "loss": 0.7463,
+      "step": 268
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.9267165660858154,
+      "learning_rate": 6.735887861616556e-05,
+      "loss": 1.2322,
+      "step": 269
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.8684428930282593,
+      "learning_rate": 6.663732804112603e-05,
+      "loss": 1.1335,
+      "step": 270
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.9493190050125122,
+      "learning_rate": 6.591772674871434e-05,
+      "loss": 1.25,
+      "step": 271
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.8068637847900391,
+      "learning_rate": 6.520011678311382e-05,
+      "loss": 1.2451,
+      "step": 272
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.043681263923645,
+      "learning_rate": 6.448454007216054e-05,
+      "loss": 1.1108,
+      "step": 273
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.0855973958969116,
+      "learning_rate": 6.377103842489343e-05,
+      "loss": 0.8609,
+      "step": 274
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.7895878553390503,
+      "learning_rate": 6.305965352911161e-05,
+      "loss": 0.9859,
+      "step": 275
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.89215087890625,
+      "learning_rate": 6.235042694893862e-05,
+      "loss": 1.3013,
+      "step": 276
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.1240825653076172,
+      "learning_rate": 6.164340012239396e-05,
+      "loss": 1.2481,
+      "step": 277
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.8804110288619995,
+      "learning_rate": 6.093861435897208e-05,
+      "loss": 1.354,
+      "step": 278
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.8492618799209595,
+      "learning_rate": 6.02361108372286e-05,
+      "loss": 1.196,
+      "step": 279
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.7941075563430786,
+      "learning_rate": 5.953593060237457e-05,
+      "loss": 1.1934,
+      "step": 280
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.831951379776001,
+      "learning_rate": 5.883811456387821e-05,
+      "loss": 0.9866,
+      "step": 281
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.8883886337280273,
+      "learning_rate": 5.8142703493074714e-05,
+      "loss": 0.9827,
+      "step": 282
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.8326197862625122,
+      "learning_rate": 5.7449738020784085e-05,
+      "loss": 1.2837,
+      "step": 283
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.8599093556404114,
+      "learning_rate": 5.675925863493721e-05,
+      "loss": 1.1051,
+      "step": 284
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.9380517601966858,
+      "learning_rate": 5.607130567821031e-05,
+      "loss": 1.1518,
+      "step": 285
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.9544126391410828,
+      "learning_rate": 5.5385919345667715e-05,
+      "loss": 1.2313,
+      "step": 286
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.0354762077331543,
+      "learning_rate": 5.4703139682413586e-05,
+      "loss": 1.2945,
+      "step": 287
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.9215995669364929,
+      "learning_rate": 5.402300658125197e-05,
+      "loss": 0.9978,
+      "step": 288
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.7332642674446106,
+      "learning_rate": 5.334555978035609e-05,
+      "loss": 0.9027,
+      "step": 289
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.0011807680130005,
+      "learning_rate": 5.267083886094668e-05,
+      "loss": 1.4776,
+      "step": 290
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.9119071364402771,
+      "learning_rate": 5.199888324497907e-05,
+      "loss": 1.0865,
+      "step": 291
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.905940055847168,
+      "learning_rate": 5.132973219284023e-05,
+      "loss": 1.1807,
+      "step": 292
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.8075955510139465,
+      "learning_rate": 5.0663424801054595e-05,
+      "loss": 1.1457,
+      "step": 293
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.9124309420585632,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 1.0714,
+      "step": 294
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.0044463872909546,
+      "learning_rate": 4.9339496551632944e-05,
+      "loss": 1.4506,
+      "step": 295
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.9177619218826294,
+      "learning_rate": 4.8681953047223914e-05,
+      "loss": 0.8655,
+      "step": 296
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.0393673181533813,
+      "learning_rate": 4.8027407905102585e-05,
+      "loss": 1.1033,
+      "step": 297
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.7895400524139404,
+      "learning_rate": 4.73758993684131e-05,
+      "loss": 0.8526,
+      "step": 298
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.6922112107276917,
+      "learning_rate": 4.672746550287985e-05,
+      "loss": 0.9227,
+      "step": 299
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.066853404045105,
+      "learning_rate": 4.6082144194583056e-05,
+      "loss": 1.4358,
+      "step": 300
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.911865770816803,
+      "learning_rate": 4.543997314774553e-05,
+      "loss": 1.0442,
+      "step": 301
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.8865716457366943,
+      "learning_rate": 4.4800989882529574e-05,
+      "loss": 1.139,
+      "step": 302
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.9678537249565125,
+      "learning_rate": 4.41652317328447e-05,
+      "loss": 1.2122,
+      "step": 303
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.9031918048858643,
+      "learning_rate": 4.3532735844166574e-05,
+      "loss": 1.0144,
+      "step": 304
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.8521971702575684,
+      "learning_rate": 4.2903539171366393e-05,
+      "loss": 1.1825,
+      "step": 305
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.575620174407959,
+      "learning_rate": 4.227767847655205e-05,
+      "loss": 1.2486,
+      "step": 306
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.8732744455337524,
+      "learning_rate": 4.165519032691998e-05,
+      "loss": 1.2195,
+      "step": 307
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.6721953749656677,
+      "learning_rate": 4.1036111092618725e-05,
+      "loss": 0.9281,
+      "step": 308
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.8912865519523621,
+      "learning_rate": 4.042047694462404e-05,
+      "loss": 1.059,
+      "step": 309
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.9858503341674805,
+      "learning_rate": 3.9808323852625316e-05,
+      "loss": 1.1077,
+      "step": 310
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.1012351512908936,
+      "learning_rate": 3.919968758292425e-05,
+      "loss": 1.3365,
+      "step": 311
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.9721686244010925,
+      "learning_rate": 3.859460369634479e-05,
+      "loss": 1.1154,
+      "step": 312
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.9255664348602295,
+      "learning_rate": 3.799310754615578e-05,
+      "loss": 1.3577,
+      "step": 313
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.6955008506774902,
+      "learning_rate": 3.7395234276005087e-05,
+      "loss": 0.8553,
+      "step": 314
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.000977635383606,
+      "learning_rate": 3.6801018817866375e-05,
+      "loss": 1.0546,
+      "step": 315
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.9564784169197083,
+      "learning_rate": 3.62104958899982e-05,
+      "loss": 1.1865,
+      "step": 316
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.024553894996643,
+      "learning_rate": 3.562369999491536e-05,
+      "loss": 1.4169,
+      "step": 317
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.7676483392715454,
+      "learning_rate": 3.504066541737323e-05,
+      "loss": 1.0726,
+      "step": 318
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.7936806082725525,
+      "learning_rate": 3.4461426222364336e-05,
+      "loss": 1.0008,
+      "step": 319
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.8642453551292419,
+      "learning_rate": 3.3886016253128326e-05,
+      "loss": 1.1144,
+      "step": 320
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.9978905320167542,
+      "learning_rate": 3.3314469129174364e-05,
+      "loss": 1.3217,
+      "step": 321
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.955431342124939,
+      "learning_rate": 3.2746818244316956e-05,
+      "loss": 1.0409,
+      "step": 322
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.0944534540176392,
+      "learning_rate": 3.2183096764724915e-05,
+      "loss": 1.4031,
+      "step": 323
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.9666315913200378,
+      "learning_rate": 3.16233376269834e-05,
+      "loss": 1.4093,
+      "step": 324
+    },
+    {
+      "epoch": 0.75,
+      "eval_loss": 1.1115437746047974,
+      "eval_runtime": 5.4226,
+      "eval_samples_per_second": 18.441,
+      "eval_steps_per_second": 18.441,
+      "step": 324
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.7884992957115173,
+      "learning_rate": 3.106757353616966e-05,
+      "loss": 0.8406,
+      "step": 325
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.8696621656417847,
+      "learning_rate": 3.0515836963942056e-05,
+      "loss": 1.2945,
+      "step": 326
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.8080437183380127,
+      "learning_rate": 2.9968160146643022e-05,
+      "loss": 1.1088,
+      "step": 327
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.0544288158416748,
+      "learning_rate": 2.9424575083415362e-05,
+      "loss": 1.5478,
+      "step": 328
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.9855386018753052,
+      "learning_rate": 2.888511353433274e-05,
+      "loss": 1.1224,
+      "step": 329
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.8086793422698975,
+      "learning_rate": 2.8349807018544174e-05,
+      "loss": 1.195,
+      "step": 330
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.7815480828285217,
+      "learning_rate": 2.7818686812432136e-05,
+      "loss": 1.1851,
+      "step": 331
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.0230026245117188,
+      "learning_rate": 2.7291783947785543e-05,
+      "loss": 1.506,
+      "step": 332
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.9805968403816223,
+      "learning_rate": 2.6769129209986322e-05,
+      "loss": 1.4654,
+      "step": 333
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.9589524865150452,
+      "learning_rate": 2.6250753136210983e-05,
+      "loss": 0.8588,
+      "step": 334
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.792204737663269,
+      "learning_rate": 2.5736686013646228e-05,
+      "loss": 1.1536,
+      "step": 335
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.411292314529419,
+      "learning_rate": 2.5226957877719436e-05,
+      "loss": 1.1398,
+      "step": 336
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.2491581439971924,
+      "learning_rate": 2.4721598510343858e-05,
+      "loss": 1.3447,
+      "step": 337
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.9221548438072205,
+      "learning_rate": 2.4220637438178317e-05,
+      "loss": 1.0514,
+      "step": 338
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.1859768629074097,
+      "learning_rate": 2.372410393090243e-05,
+      "loss": 1.1367,
+      "step": 339
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.8900750279426575,
+      "learning_rate": 2.3232026999506062e-05,
+      "loss": 1.2437,
+      "step": 340
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.8685091137886047,
+      "learning_rate": 2.2744435394594497e-05,
+      "loss": 1.2518,
+      "step": 341
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.7511561512947083,
+      "learning_rate": 2.22613576047087e-05,
+      "loss": 0.9799,
+      "step": 342
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.9015387296676636,
+      "learning_rate": 2.1782821854660606e-05,
+      "loss": 1.096,
+      "step": 343
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.0497876405715942,
+      "learning_rate": 2.130885610388428e-05,
+      "loss": 1.335,
+      "step": 344
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.0829570293426514,
+      "learning_rate": 2.0839488044802036e-05,
+      "loss": 1.7663,
+      "step": 345
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.7861917614936829,
+      "learning_rate": 2.037474510120676e-05,
+      "loss": 1.1382,
+      "step": 346
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.9880712628364563,
+      "learning_rate": 1.9914654426659374e-05,
+      "loss": 1.0684,
+      "step": 347
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.882596492767334,
+      "learning_rate": 1.945924290290242e-05,
+      "loss": 1.1136,
+      "step": 348
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.4436990022659302,
+      "learning_rate": 1.9008537138289527e-05,
+      "loss": 1.5471,
+      "step": 349
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.8502001166343689,
+      "learning_rate": 1.8562563466230576e-05,
+      "loss": 1.2033,
+      "step": 350
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.8720386624336243,
+      "learning_rate": 1.8121347943653332e-05,
+      "loss": 1.3097,
+      "step": 351
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.6880800724029541,
+      "learning_rate": 1.7684916349480794e-05,
+      "loss": 0.943,
+      "step": 352
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.0741546154022217,
+      "learning_rate": 1.7253294183125223e-05,
+      "loss": 1.1616,
+      "step": 353
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.966346263885498,
+      "learning_rate": 1.6826506662998097e-05,
+      "loss": 1.353,
+      "step": 354
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.7042524814605713,
+      "learning_rate": 1.64045787250368e-05,
+      "loss": 1.102,
+      "step": 355
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.8945311903953552,
+      "learning_rate": 1.5987535021247667e-05,
+      "loss": 1.0206,
+      "step": 356
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.7530681490898132,
+      "learning_rate": 1.5575399918265542e-05,
+      "loss": 1.1307,
+      "step": 357
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.9247698783874512,
+      "learning_rate": 1.5168197495930315e-05,
+      "loss": 1.4552,
+      "step": 358
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.8581532835960388,
+      "learning_rate": 1.476595154587973e-05,
+      "loss": 0.9923,
+      "step": 359
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.9739102125167847,
+      "learning_rate": 1.436868557015959e-05,
+      "loss": 1.3055,
+      "step": 360
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.1127710342407227,
+      "learning_rate": 1.3976422779850384e-05,
+      "loss": 1.3258,
+      "step": 361
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.7771569490432739,
+      "learning_rate": 1.3589186093711226e-05,
+      "loss": 0.9989,
+      "step": 362
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.9304590821266174,
+      "learning_rate": 1.3206998136840831e-05,
+      "loss": 1.2083,
+      "step": 363
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.4773048162460327,
+      "learning_rate": 1.2829881239355468e-05,
+      "loss": 1.3064,
+      "step": 364
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.212895393371582,
+      "learning_rate": 1.2457857435084408e-05,
+      "loss": 2.7761,
+      "step": 365
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.7761988639831543,
+      "learning_rate": 1.2090948460282414e-05,
+      "loss": 1.1132,
+      "step": 366
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.2847517728805542,
+      "learning_rate": 1.1729175752359922e-05,
+      "loss": 1.3685,
+      "step": 367
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.8102625608444214,
+      "learning_rate": 1.1372560448630376e-05,
+      "loss": 1.1863,
+      "step": 368
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.7838321328163147,
+      "learning_rate": 1.102112338507526e-05,
+      "loss": 1.153,
+      "step": 369
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.1303842067718506,
+      "learning_rate": 1.067488509512683e-05,
+      "loss": 0.9941,
+      "step": 370
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.917951226234436,
+      "learning_rate": 1.0333865808468202e-05,
+      "loss": 1.0509,
+      "step": 371
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.0915648937225342,
+      "learning_rate": 9.998085449851635e-06,
+      "loss": 0.999,
+      "step": 372
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.8339885473251343,
+      "learning_rate": 9.667563637934129e-06,
+      "loss": 0.8986,
+      "step": 373
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.8891353607177734,
+      "learning_rate": 9.342319684131395e-06,
+      "loss": 1.1157,
+      "step": 374
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.030124545097351,
+      "learning_rate": 9.02237259148938e-06,
+      "loss": 1.1778,
+      "step": 375
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.877785325050354,
+      "learning_rate": 8.70774105357407e-06,
+      "loss": 1.2047,
+      "step": 376
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.9380621910095215,
+      "learning_rate": 8.398443453379267e-06,
+      "loss": 1.1252,
+      "step": 377
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.879805326461792,
+      "learning_rate": 8.094497862252471e-06,
+      "loss": 1.3558,
+      "step": 378
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.081350564956665,
+      "learning_rate": 7.795922038839032e-06,
+      "loss": 1.3124,
+      "step": 379
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.8164477348327637,
+      "learning_rate": 7.502733428044683e-06,
+      "loss": 1.0358,
+      "step": 380
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.8706633448600769,
+      "learning_rate": 7.214949160016115e-06,
+      "loss": 1.1296,
+      "step": 381
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.9576455354690552,
+      "learning_rate": 6.932586049140255e-06,
+      "loss": 1.126,
+      "step": 382
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.9922929406166077,
+      "learning_rate": 6.655660593061719e-06,
+      "loss": 1.2583,
+      "step": 383
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.8414103984832764,
+      "learning_rate": 6.384188971719052e-06,
+      "loss": 1.1785,
+      "step": 384
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.0709247589111328,
+      "learning_rate": 6.11818704639926e-06,
+      "loss": 1.4718,
+      "step": 385
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.3863894939422607,
+      "learning_rate": 5.857670358811096e-06,
+      "loss": 1.2066,
+      "step": 386
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.3715444803237915,
+      "learning_rate": 5.6026541301771095e-06,
+      "loss": 1.0306,
+      "step": 387
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.759107768535614,
+      "learning_rate": 5.353153260344179e-06,
+      "loss": 0.4256,
+      "step": 388
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.7659134864807129,
+      "learning_rate": 5.109182326913054e-06,
+      "loss": 1.1735,
+      "step": 389
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.8766858577728271,
+      "learning_rate": 4.870755584386544e-06,
+      "loss": 1.135,
+      "step": 390
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.0143039226531982,
+      "learning_rate": 4.63788696333678e-06,
+      "loss": 1.3126,
+      "step": 391
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.8704203963279724,
+      "learning_rate": 4.410590069591192e-06,
+      "loss": 1.0583,
+      "step": 392
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.024915337562561,
+      "learning_rate": 4.188878183437594e-06,
+      "loss": 0.9828,
+      "step": 393
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.9721056222915649,
+      "learning_rate": 3.972764258848305e-06,
+      "loss": 0.8001,
+      "step": 394
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.8740870356559753,
+      "learning_rate": 3.7622609227231818e-06,
+      "loss": 1.2904,
+      "step": 395
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.0467931032180786,
+      "learning_rate": 3.5573804741519833e-06,
+      "loss": 1.0275,
+      "step": 396
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.8117631673812866,
+      "learning_rate": 3.3581348836956738e-06,
+      "loss": 0.8395,
+      "step": 397
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.8695462942123413,
+      "learning_rate": 3.1645357926870955e-06,
+      "loss": 1.0884,
+      "step": 398
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.1519237756729126,
+      "learning_rate": 2.9765945125507235e-06,
+      "loss": 1.3518,
+      "step": 399
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.028548240661621,
+      "learning_rate": 2.7943220241418377e-06,
+      "loss": 1.6295,
+      "step": 400
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.0098282098770142,
+      "learning_rate": 2.6177289771049274e-06,
+      "loss": 1.2196,
+      "step": 401
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.8729552030563354,
+      "learning_rate": 2.4468256892514417e-06,
+      "loss": 0.8682,
+      "step": 402
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.7072241306304932,
+      "learning_rate": 2.281622145956952e-06,
+      "loss": 0.7565,
+      "step": 403
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.8818898797035217,
+      "learning_rate": 2.122127999577783e-06,
+      "loss": 1.0552,
+      "step": 404
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.243230938911438,
+      "learning_rate": 1.9683525688869773e-06,
+      "loss": 1.2233,
+      "step": 405
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.8495836853981018,
+      "learning_rate": 1.8203048385299181e-06,
+      "loss": 1.1537,
+      "step": 406
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.1537418365478516,
+      "learning_rate": 1.6779934584992718e-06,
+      "loss": 1.2535,
+      "step": 407
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.8221082091331482,
+      "learning_rate": 1.5414267436297037e-06,
+      "loss": 1.1052,
+      "step": 408
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.7960305213928223,
+      "learning_rate": 1.4106126731119996e-06,
+      "loss": 1.2468,
+      "step": 409
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.9470089077949524,
+      "learning_rate": 1.2855588900269056e-06,
+      "loss": 1.193,
+      "step": 410
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.9635655283927917,
+      "learning_rate": 1.1662727008984964e-06,
+      "loss": 1.474,
+      "step": 411
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.8623586297035217,
+      "learning_rate": 1.0527610752673944e-06,
+      "loss": 1.138,
+      "step": 412
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.8060218095779419,
+      "learning_rate": 9.450306452834179e-07,
+      "loss": 1.137,
+      "step": 413
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.009254813194275,
+      "learning_rate": 8.430877053182129e-07,
+      "loss": 1.1293,
+      "step": 414
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.8745806813240051,
+      "learning_rate": 7.469382115974032e-07,
+      "loss": 1.2717,
+      "step": 415
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.8753180503845215,
+      "learning_rate": 6.565877818526245e-07,
+      "loss": 1.6308,
+      "step": 416
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.7818780541419983,
+      "learning_rate": 5.72041694993286e-07,
+      "loss": 0.7243,
+      "step": 417
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.1222724914550781,
+      "learning_rate": 4.933048907981741e-07,
+      "loss": 1.2855,
+      "step": 418
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.7851914763450623,
+      "learning_rate": 4.203819696267486e-07,
+      "loss": 1.1144,
+      "step": 419
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.2824020385742188,
+      "learning_rate": 3.532771921504696e-07,
+      "loss": 0.9624,
+      "step": 420
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.9270046353340149,
+      "learning_rate": 2.919944791037632e-07,
+      "loss": 1.1227,
+      "step": 421
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.9121943712234497,
+      "learning_rate": 2.3653741105499338e-07,
+      "loss": 1.3002,
+      "step": 422
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.8677274584770203,
+      "learning_rate": 1.8690922819727398e-07,
+      "loss": 1.2818,
+      "step": 423
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.9009543657302856,
+      "learning_rate": 1.4311283015910893e-07,
+      "loss": 1.4132,
+      "step": 424
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.7963302135467529,
+      "learning_rate": 1.0515077583498344e-07,
+      "loss": 1.0084,
+      "step": 425
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.7949801087379456,
+      "learning_rate": 7.302528323589464e-08,
+      "loss": 0.8838,
+      "step": 426
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.845206081867218,
+      "learning_rate": 4.6738229359732935e-08,
+      "loss": 0.9944,
+      "step": 427
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.7643120288848877,
+      "learning_rate": 2.6291150081603212e-08,
+      "loss": 1.1573,
+      "step": 428
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.9906445145606995,
+      "learning_rate": 1.168524006410765e-08,
+      "loss": 1.3015,
+      "step": 429
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.216629147529602,
+      "learning_rate": 2.921352687534906e-09,
+      "loss": 0.9889,
+      "step": 430
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.8904082179069519,
+      "learning_rate": 0.0,
+      "loss": 1.1844,
+      "step": 431
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 431,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "total_flos": 7839152706846720.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-431/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:20ae85fcf3e62c3900e9e6048e9302b4db3cc1e6fbd50b029bc0739c47ef99b5
+size 5624

checkpoint-597/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: openlm-research/open_llama_3b_v2
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.9.0

checkpoint-597/adapter_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "openlm-research/open_llama_3b_v2",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "q_proj",
+    "down_proj",
+    "up_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-597/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d0f1f41f014b771db793fe3989264831c3ef8df78bac770780ea2ed6a5da31fe
+size 50899792

checkpoint-597/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5d8f2bb2165989a836390f9898996deca00393348e405a4d33a1e049a58a4a6e
+size 101919290

checkpoint-597/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cc2f1b74cde4a4420e9e40dc897108fc36ea3a012b622f2e0123ed439bcd1152
+size 14244

checkpoint-597/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2462c7ed82882387b0c2e4a03b3e21bb60ef12f7c00dbb318874547acc66bc62
+size 1064

checkpoint-597/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-597/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d84e26ce27315b618f94e914bb6b67f0bb5aa37c3903b14adcd26c9fca9f3f82
+size 5624

checkpoint-796/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: openlm-research/open_llama_3b_v2
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.9.0

checkpoint-796/adapter_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "openlm-research/open_llama_3b_v2",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "q_proj",
+    "down_proj",
+    "up_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-796/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:842013bff659a1aeece0bd337debda2899c1b7f9c9fce662fc2a102cd6d462ed
+size 50899792

checkpoint-796/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5c789c934200ad012c91419cf47b9e24f7c2811b9c98f262f5c606103d4d1b13
+size 101919290

checkpoint-796/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8d963dde01b390675a90cd7d31c489e6ee115fa4d8a03b6f3619b526cbbfa719
+size 14244

checkpoint-796/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61fd7e6c894e16612a66cbd972048c812ca7c4e2a3781a69ab7e52fea2402ea2
+size 1064

checkpoint-796/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-796/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d84e26ce27315b618f94e914bb6b67f0bb5aa37c3903b14adcd26c9fca9f3f82
+size 5624

config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "_name_or_path": "openlm-research/open_llama_3b_v2",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 3200,
+  "initializer_range": 0.02,
+  "intermediate_size": 8640,
+  "max_position_embeddings": 2048,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 26,
+  "num_key_value_heads": 32,
+  "pad_token_id": 0,
+  "pretraining_tp": 1,
+  "quantization_config": {
+    "_load_in_4bit": true,
+    "_load_in_8bit": false,
+    "bnb_4bit_compute_dtype": "float16",
+    "bnb_4bit_quant_type": "nf4",
+    "bnb_4bit_use_double_quant": true,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "llm_int8_skip_modules": null,
+    "llm_int8_threshold": 6.0,
+    "load_in_4bit": true,
+    "load_in_8bit": false,
+    "quant_method": "bitsandbytes"
+  },
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.38.2",
+  "use_cache": false,
+  "vocab_size": 32000
+}

runs/Mar13_21-58-24_8711e78fac20/events.out.tfevents.1710367104.8711e78fac20.40.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7631a30f73d64fe416f3d56cc639c398b6f148c0744264cb60d4cc07ce49a72e
+size 97683

runs/Mar13_22-06-09_8711e78fac20/events.out.tfevents.1710367570.8711e78fac20.172.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac5f6f47b09946d9d1ec181e9edd7640acb54188e8ce1f21a9384bd741b8b978
+size 177950

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91b289e85fa20fd375d8b33dc12f77616f18abc6359804471d1fafcb425fecb8
+size 511574

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 2048,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}